src/jalview/io/MegaFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.0b1)
   3  * Copyright (C) 2014 The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  10  *
  11  * Jalview is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty
  13  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  14  * PURPOSE.  See the GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  17  * The Jalview Authors are detailed in the 'AUTHORS' file.
  18  */
  19 package jalview.io;
  20
  21 import jalview.datamodel.AlignmentI;
  22 import jalview.datamodel.Sequence;
  23 import jalview.datamodel.SequenceFeature;
  24 import jalview.datamodel.SequenceI;
  25
  26 import java.io.IOException;
  27 import java.util.ArrayList;
  28 import java.util.HashMap;
  29 import java.util.Iterator;
  30 import java.util.LinkedHashMap;
  31 import java.util.List;
  32 import java.util.Map;
  33 import java.util.Map.Entry;
  34 import java.util.Set;
  35
  36 /**
  37  * A parser for input or output of MEGA format files. <br>
  38  * <br>
  39  * Tamura K, Stecher G, Peterson D, Filipski A, and Kumar S (2013) MEGA6:
  40  * Molecular Evolutionary Genetics Analysis Version 6.0. Molecular Biology and
  41  * Evolution 30: 2725-2729. <br>
  42  * <br>
  43  *
  44  * MEGA file format is supported as described in
  45  * http://www.megasoftware.net/manual.pdf <br>
  46  * Limitations:
  47  * <ul>
  48  * <li>nested comments (marked by [ ]) are accepted but not preserved</li>
  49  * <li>to be completed</li>
  50  * </ul>
  51  *
  52  * @see http://www.megasoftware.net/
  53  */
  54 public class MegaFile extends AlignFile
  55 {
  56   private static final int DEFAULT_LINE_LENGTH = 60;
  57
  58   private static final String INDENT = "    ";
  59
  60   private static final String N_SITES = "NSites";
  61
  62   private static final String N_SEQS = "NSeqs";
  63
  64   private static final String MISSING = "Missing";
  65
  66   private static final String IDENTICAL = "Identical";
  67
  68   private static final String INDEL = "Indel";
  69
  70   private static final String CODETABLE = "CodeTable";
  71
  72   private static final String PROTEIN = "Protein";
  73
  74   private static final String NUCLEOTIDE = "Nucleotide";
  75
  76   private static final String DATATYPE = "DataType";
  77
  78   private static final char COMMENT_START = '[';
  79
  80   private static final char COMMENT_END = ']';
  81
  82   private static final String HASHSIGN = "#";
  83
  84   private static final String SEMICOLON = ";";
  85
  86   private static final String BANG = "!";
  87
  88   private static final String EQUALS = "=";
  89
  90   private static final String MEGA_ID = HASHSIGN + "MEGA";
  91
  92   private static final String TITLE = "Title";
  93
  94   private static final String FORMAT = "Format";
  95
  96   private static final String DESCRIPTION = "Description";
  97
  98   private static final String GENE = "Gene";
  99
 100   private static final String DOMAIN = "Domain";
 101
 102   /*
 103    * names of properties to save to the alignment (may affect eventual output
 104    * format)
 105    */
 106   static final String PROP_TITLE = "MEGA_TITLE";
 107
 108   static final String PROP_INTERLEAVED = "MEGA_INTERLEAVED";
 109
 110   static final String PROP_DESCRIPTION = "MEGA_DESCRIPTION";
 111
 112   static final String PROP_CODETABLE = "MEGA_CODETABLE";
 113
 114   static final String PROP_IDENTITY = "MEGA_IDENTITY";
 115
 116   static final String PROP_MISSING = "MEGA_MISSING";
 117
 118   static final String PROP_DATATYPE = "MEGA_DATATYPE";
 119
 120   // number of bases per line of file (value is inferred)
 121   static final String PROP_LINELENGTH = "MEGA_LINELENGTH";
 122
 123   // TODO: need a controlled name for Gene as a feature if we want to be able to
 124   // output the MEGA file with !Gene headers
 125   // WTF do we do if the sequences get realigned?
 126
 127   // initial size for sequence data buffer
 128   private static final int SEQBUFFERSIZE = 256;
 129
 130   private static final String SPACE = " ";
 131
 132   /*
 133    * number of sequence positions output per line
 134    */
 135   private int positionsPerLine;
 136
 137   private String title;
 138
 139   // gap character may be explicitly declared, default is -
 140   private char gapCharacter = '-';
 141
 142   // identity character if declared
 143   private char identityCharacter = 0;
 144
 145   // this can be True, False or null (meaning not asserted in file)
 146   private Boolean nucleotide;
 147
 148   // set once we have seen one block of interleaved data
 149   private boolean firstDataBlockRead = false;
 150
 151   // this can be True, False or null (meaning we don't know yet)
 152   private Boolean interleaved;
 153
 154   // write end of line positions as a comment
 155   private boolean writePositionNumbers = true;
 156
 157   // id of sequence being processed
 158   private String currentSequenceId;
 159
 160   /*
 161    * Temporary store of {sequenceId, positionData} while parsing interleaved
 162    * sequences; sequences are maintained in the order in which they are added
 163    * i.e. read in the file
 164    */
 165   Map<String, StringBuilder> seqData;
 166
 167   // number of residues read (so far) per sequence
 168   Map<String, Integer> residuesRead;
 169
 170   // start residue (base 1) per sequence of current feature
 171   Map<String, Integer> featureStart;
 172
 173   // feature (Gene/Domain) if any we are parsing
 174   private String currentFeature;
 175
 176   // feature type (Gene/Domain) if any we are parsing
 177   private String currentFeatureType;
 178
 179   // map of SequenceFeature's by sequence id
 180   Map<String, List<SequenceFeature>> sequenceFeatures;
 181
 182   public MegaFile()
 183   {
 184   }
 185
 186   public MegaFile(String inFile, String type) throws IOException
 187   {
 188     super(inFile, type);
 189   }
 190
 191   public MegaFile(FileParse source) throws IOException
 192   {
 193     super(source);
 194   }
 195
 196   /**
 197    * Parse the input stream.
 198    */
 199   @Override
 200   public void parse() throws IOException
 201   {
 202     gapCharacter = '-';
 203     sequenceFeatures = new HashMap<String, List<SequenceFeature>>();
 204     featureStart = new HashMap<String, Integer>();
 205     residuesRead = new HashMap<String, Integer>();
 206
 207     /*
 208      * Read and process MEGA and Title/Format/Description headers if present.
 209      * Returns the first data line following the headers.
 210      */
 211     String dataLine = parseHeaderLines();
 212
 213     /*
 214      * order-preserving map to hold sequences by id as they are built up during
 215      * parsing
 216      */
 217     seqData = new LinkedHashMap<String, StringBuilder>();
 218
 219     /*
 220      * The id of the sequence being read (for non-interleaved)
 221      */
 222     currentSequenceId = "";
 223
 224     while (dataLine != null)
 225     {
 226       dataLine = dataLine.trim();
 227       if (dataLine.length() > 0)
 228       {
 229         if (dataLine.startsWith(BANG + GENE))
 230         {
 231           parseFeature(GENE, dataLine);
 232         }
 233         else if (dataLine.startsWith(BANG + DOMAIN))
 234         {
 235           parseFeature(DOMAIN, dataLine);
 236         }
 237         else
 238         {
 239           currentSequenceId = parseDataLine(dataLine);
 240         }
 241       }
 242       else if (!seqData.isEmpty())
 243       {
 244         /*
 245          * Blank line after processing some data...
 246          */
 247         endOfDataBlock();
 248       }
 249       dataLine = nextNonCommentLine();
 250     }
 251
 252     // remember the (longest) line length read in, so we can output the same
 253     setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
 254
 255     deriveSequences();
 256   }
 257
 258   /**
 259    * Post-processing after reading one block of interleaved data
 260    */
 261   protected void endOfDataBlock()
 262   {
 263     this.firstDataBlockRead = true;
 264     // TODO:
 265     // (initialise and) populate arrays of sequence length so far (excluding
 266     // gaps)
 267     // On change or end of a denoted Gene or Domain, add sequence features for
 268     // it
 269   }
 270
 271   /**
 272    * Parse a !Gene or !Domain command line
 273    *
 274    * @param featureType
 275    * @param dataLine
 276    */
 277   protected void parseFeature(String featureType, String dataLine)
 278   {
 279     String featureName = getValue(dataLine);
 280     // TODO parse !Gene=xyx Property=end; ???
 281     if (this.currentFeature != null)
 282     {
 283       endSequenceFeature();
 284     }
 285     startSequenceFeature(featureName, featureType);
 286   }
 287
 288   /**
 289    * Start processing a new feature
 290    *
 291    * @param featureName
 292    */
 293   protected void startSequenceFeature(String featureName, String featureType)
 294   {
 295     currentFeature = featureName;
 296     currentFeatureType = featureType;
 297
 298     /*
 299      * If the feature name precedes all sequences, we will know in
 300      * endSequenceFeature that it starts with residue 1; otherwise note now
 301      * where it starts in each sequence
 302      */
 303     if (!residuesRead.isEmpty())
 304     {
 305       for (Entry<String, Integer> entry : residuesRead.entrySet())
 306       {
 307         String seqId = entry.getKey();
 308         Integer nextResidue = entry.getValue() + 1;
 309         featureStart.put(seqId, nextResidue);
 310       }
 311     }
 312   }
 313
 314   /**
 315    * Add a SequenceFeature for the current feature to each sequence, using the
 316    * current feature start/end values per sequence
 317    */
 318   protected void endSequenceFeature()
 319   {
 320     Iterator<String> seqids = this.seqData.keySet().iterator();
 321     while (seqids.hasNext())
 322     {
 323       String seqid = seqids.next();
 324       Integer startAt = featureStart.get(seqid);
 325       int sfstart = startAt == null ? 1 : startAt.intValue();
 326       int sfend = residuesRead.get(seqid);
 327       if (sfend >= sfstart)
 328       {
 329         /*
 330          * don't add feature if entirely gapped in the sequence
 331          */
 332         SequenceFeature sf = new SequenceFeature(currentFeature,
 333                 currentFeatureType, sfstart, sfend, 0f, null);
 334         sequenceFeatures.get(seqid).add(sf);
 335       }
 336     }
 337   }
 338
 339   /**
 340    * Parse a !Domain command line
 341    *
 342    * @param dataLine
 343    */
 344   private void parseDomain(String dataLine)
 345   {
 346   }
 347
 348   /**
 349    * Returns the next line that is not a comment, or null at end of file.
 350    * Comments in MEGA are within [ ] brackets, and may be nested.
 351    *
 352    * @return
 353    * @throws IOException
 354    */
 355   protected String nextNonCommentLine() throws IOException
 356   {
 357     return nextNonCommentLine(0);
 358   }
 359
 360   /**
 361    * Returns the next non-comment line (or part line), or null at end of file.
 362    * Comments in MEGA are within [ ] brackets, and may be nested. They may occur
 363    * anywhere within a line (for example at the end with position numbers); this
 364    * method returns the line with any comments removed.
 365    *
 366    * @param depth
 367    *          current depth of nesting of comments while parsing
 368    * @return
 369    * @throws IOException
 370    */
 371   protected String nextNonCommentLine(final int depth) throws IOException
 372   {
 373     String data = null;
 374     data = nextLine();
 375     if (data == null)
 376     {
 377       if (depth > 0)
 378       {
 379         System.err.println("Warning: unterminated comment in data file");
 380       }
 381       return data;
 382     }
 383
 384     /*
 385      * If we are in a (possibly nested) comment after parsing this line, keep
 386      * reading recursively until the comment has unwound
 387      */
 388     int newDepth = commentDepth(data, depth);
 389     if (newDepth > 0)
 390     {
 391       return nextNonCommentLine(newDepth);
 392     }
 393     else
 394     {
 395       /*
 396        * not in a comment by end of this line; return what is left
 397        */
 398       String nonCommentPart = getNonCommentContent(data, depth);
 399       return nonCommentPart;
 400     }
 401   }
 402
 403   /**
 404    * Returns what is left of the input data after removing any comments, whether
 405    * 'in progress' from preceding lines, or embedded in the current line
 406    *
 407    * @param data
 408    *          input data
 409    * @param depth
 410    *          nested depth of comments pending termination
 411    * @return
 412    * @throws FileFormatException
 413    */
 414   protected static String getNonCommentContent(String data, int depth)
 415           throws FileFormatException
 416   {
 417     int len = data.length();
 418     StringBuilder result = new StringBuilder(len);
 419     for (int i = 0; i < len; i++)
 420     {
 421       char c = data.charAt(i);
 422       switch (c)
 423       {
 424       case COMMENT_START:
 425         depth++;
 426         break;
 427
 428       case COMMENT_END:
 429         if (depth > 0)
 430         {
 431           depth--;
 432         }
 433         else
 434         {
 435           result.append(c);
 436         }
 437         break;
 438
 439       default:
 440         if (depth == 0)
 441         {
 442           result.append(c);
 443         }
 444       }
 445     }
 446     return result.toString();
 447   }
 448
 449   /**
 450    * Calculates new depth of comment after parsing an input line i.e. the excess
 451    * of opening '[' over closing ']' characters. Any excess ']' are ignored (not
 452    * treated as comment delimiters).
 453    *
 454    * @param data
 455    *          input line
 456    * @param depth
 457    *          current comment nested depth before parsing the line
 458    * @return new depth after parsing the line
 459    */
 460   protected static int commentDepth(CharSequence data, int depth)
 461   {
 462     int newDepth = depth;
 463     int len = data.length();
 464     for (int i = 0; i < len; i++)
 465     {
 466       char c = data.charAt(i);
 467       if (c == COMMENT_START)
 468       {
 469         newDepth++;
 470       }
 471       else if (c == COMMENT_END && newDepth > 0)
 472       {
 473         newDepth--;
 474       }
 475     }
 476     return newDepth;
 477   }
 478
 479   /**
 480    * Convert the parsed sequence strings to objects and store them in the model.
 481    */
 482   protected void deriveSequences()
 483   {
 484     Set<Entry<String, StringBuilder>> datasets = seqData.entrySet();
 485
 486     for (Entry<String, StringBuilder> dataset : datasets)
 487     {
 488       String sequenceId = dataset.getKey();
 489       StringBuilder characters = dataset.getValue();
 490       SequenceI s = new Sequence(sequenceId, new String(characters));
 491       this.seqs.addElement(s);
 492
 493       /*
 494        * and add any derived sequence features to the sequence
 495        */
 496       for (SequenceFeature sf : sequenceFeatures.get(sequenceId))
 497       {
 498         s.addSequenceFeature(sf);
 499       }
 500     }
 501   }
 502
 503   /**
 504    * Process one line of sequence data. If it has no sequence identifier, append
 505    * to the current id's sequence. Else parse out the sequence id and append the
 506    * data (if any) to that id's sequence. Returns the sequence id (implicit or
 507    * explicit) for this line.
 508    *
 509    * @param dataLine
 510    * @return
 511    * @throws IOException
 512    */
 513   protected String parseDataLine(String dataLine)
 514           throws IOException
 515   {
 516     String seqId = getSequenceId(dataLine);
 517     if (seqId == null)
 518     {
 519       /*
 520        * Just character data
 521        */
 522       parseNoninterleavedDataLine(dataLine);
 523       return currentSequenceId;
 524     }
 525     else if ((HASHSIGN + seqId).trim().equals(dataLine.trim()))
 526     {
 527       /*
 528        * Sequence id only - header line for noninterleaved data
 529        */
 530       return seqId;
 531     }
 532     else
 533     {
 534       /*
 535        * Sequence id followed by data
 536        */
 537       parseInterleavedDataLine(dataLine, seqId);
 538       return seqId;
 539     }
 540   }
 541
 542   /**
 543    * Add a line of sequence data to the buffer for the given sequence id. Start
 544    * a new one if we haven't seen it before.
 545    *
 546    * @param dataLine
 547    * @throws IOException
 548    */
 549   protected void parseNoninterleavedDataLine(String dataLine)
 550           throws IOException
 551   {
 552     if (currentSequenceId == null)
 553     {
 554       /*
 555        * Oops. Data but no sequence id context.
 556        */
 557       throw new IOException("No sequence id context at: " + dataLine);
 558     }
 559
 560     assertInterleaved(false, dataLine);
 561
 562     dataLine = addSequenceData(currentSequenceId, dataLine);
 563
 564     setPositionsPerLine(Math.max(positionsPerLine, dataLine.length()));
 565   }
 566
 567   /**
 568    * Get the sequence data for this sequence id, starting a new one if
 569    * necessary.
 570    *
 571    * @param currentId
 572    * @return
 573    */
 574   protected StringBuilder getSequenceDataBuffer(String currentId)
 575   {
 576     StringBuilder sb = seqData.get(currentId);
 577     if (sb == null)
 578     {
 579       // first data met for this sequence id, start a new buffer
 580       sb = new StringBuilder(SEQBUFFERSIZE);
 581       seqData.put(currentId, sb);
 582
 583       // and a placeholder for any SequenceFeature found
 584       sequenceFeatures.put(currentId, new ArrayList<SequenceFeature>());
 585     }
 586     return sb;
 587   }
 588
 589   /**
 590    * Parse one line of interleaved data e.g.
 591    *
 592    * <pre>
 593    * #TheSeqId CGATCGCATGCA
 594    * </pre>
 595    *
 596    * @param dataLine
 597    * @param seqId
 598    * @throws FileFormatException
 599    */
 600   protected void parseInterleavedDataLine(String dataLine, String seqId)
 601           throws FileFormatException
 602   {
 603     /*
 604      * New sequence found in second or later data block - error.
 605      */
 606     if (this.firstDataBlockRead && !seqData.containsKey(seqId))
 607     {
 608       throw new FileFormatException(
 609               "Parse error: misplaced new sequence starting at " + dataLine);
 610     }
 611
 612     String data = dataLine.substring(seqId.length() + 1).trim();
 613
 614     /*
 615      * Do nothing if this line is _only_ a sequence id with no data following.
 616      */
 617     if (data != null && data.length() > 0)
 618     {
 619       data = addSequenceData(seqId, data);
 620       setPositionsPerLine(Math.max(positionsPerLine, data.length()));
 621       assertInterleaved(true, dataLine);
 622     }
 623   }
 624
 625   /**
 626    * Remove spaces, and replace identity symbol, before appending the sequence
 627    * data to the buffer for the sequence id. Returns the reformatted added data.
 628    * Also updates a count of residues read for the sequence.
 629    *
 630    * @param seqId
 631    * @param data
 632    * @return
 633    */
 634   protected String addSequenceData(String seqId, String data)
 635   {
 636     StringBuilder sb = getSequenceDataBuffer(seqId);
 637     int len = sb.length();
 638     String formatted = data.replace(SPACE, "");
 639
 640     /*
 641      * If sequence contains '.' or other identity symbol; replace these with the
 642      * same position from the first (reference) sequence
 643      */
 644     int nonGapped = 0;
 645     StringBuilder referenceSequence = seqData.values().iterator().next();
 646     StringBuilder sb1 = new StringBuilder(formatted.length());
 647     for (int i = 0; i < formatted.length(); i++)
 648     {
 649       char nextChar = formatted.charAt(i);
 650       if (nextChar != gapCharacter)
 651       {
 652         nonGapped++;
 653       }
 654       if (nextChar == identityCharacter
 655               && len + i < referenceSequence.length())
 656       {
 657         sb1.append(referenceSequence.charAt(len + i));
 658       }
 659       else
 660       {
 661         sb1.append(nextChar);
 662       }
 663     }
 664     formatted = sb1.toString();
 665
 666     data = formatted;
 667     sb.append(data);
 668
 669     /*
 670      * increment residue count for the sequence
 671      */
 672     if (nonGapped > 0)
 673     {
 674       Integer residueCount = residuesRead.get(seqId);
 675       residuesRead.put(seqId, nonGapped
 676               + (residueCount == null ? 0 : residueCount));
 677     }
 678
 679     return data;
 680   }
 681
 682   /**
 683    * If the line begins with (e.g.) "#abcde " then returns "abcde" as the
 684    * identifier. Else returns null.
 685    *
 686    * @param dataLine
 687    * @return
 688    */
 689   public static String getSequenceId(String dataLine)
 690   {
 691     // TODO refactor to a StringUtils type class
 692     if (dataLine != null)
 693     {
 694       if (dataLine.startsWith(HASHSIGN))
 695       {
 696         int spacePos = dataLine.indexOf(" ");
 697         return (spacePos == -1 ? dataLine.substring(1) : dataLine
 698                 .substring(1, spacePos));
 699       }
 700     }
 701     return null;
 702   }
 703
 704   /**
 705    * Read the #MEGA and Title/Format/Description header lines (if present).
 706    *
 707    * Save as alignment properties in case useful.
 708    *
 709    * @return the next non-blank line following the header lines.
 710    * @throws IOException
 711    */
 712   protected String parseHeaderLines() throws IOException
 713   {
 714     String inputLine = null;
 715     while ((inputLine = nextNonCommentLine()) != null)
 716     {
 717       inputLine = inputLine.trim();
 718
 719       /*
 720        * skip blank lines
 721        */
 722       if (inputLine.length() == 0)
 723       {
 724         continue;
 725       }
 726
 727       if (inputLine.toUpperCase().startsWith(MEGA_ID))
 728       {
 729         continue;
 730       }
 731
 732       if (isTitle(inputLine))
 733       {
 734         this.title = getValue(inputLine);
 735         setAlignmentProperty(PROP_TITLE, title);
 736       }
 737       else if (inputLine.startsWith(BANG + DESCRIPTION))
 738       {
 739         parseDescription(inputLine);
 740       }
 741
 742       else if (inputLine.startsWith(BANG + FORMAT))
 743       {
 744         parseFormat(inputLine);
 745       }
 746       else if (!inputLine.toUpperCase().startsWith(MEGA_ID))
 747       {
 748
 749         /*
 750          * Return the first 'data line' i.e. one that is not blank, #MEGA or
 751          * TITLE:
 752          */
 753         break;
 754       }
 755     }
 756     return inputLine;
 757   }
 758
 759   /**
 760    * Parse a !Format statement. This may be multiline, and is ended by a
 761    * semicolon.
 762    *
 763    * @param inputLine
 764    * @throws IOException
 765    */
 766   protected void parseFormat(String inputLine) throws IOException
 767   {
 768     while (inputLine != null)
 769     {
 770       parseFormatLine(inputLine);
 771       if (inputLine.endsWith(SEMICOLON))
 772       {
 773         break;
 774       }
 775       inputLine = nextNonCommentLine();
 776     }
 777   }
 778
 779   /**
 780    * Parse one line of a !Format statement. This may contain one or more
 781    * keyword=value pairs.
 782    *
 783    * @param inputLine
 784    * @throws FileFormatException
 785    */
 786   protected void parseFormatLine(String inputLine)
 787           throws FileFormatException
 788   {
 789     if (inputLine.startsWith(BANG + FORMAT))
 790     {
 791       inputLine = inputLine.substring((BANG + FORMAT).length());
 792     }
 793     if (inputLine.endsWith(SEMICOLON))
 794     {
 795       inputLine = inputLine.substring(0, inputLine.length() - 1);
 796     }
 797     if (inputLine.length() == 0)
 798     {
 799       return;
 800     }
 801     String[] tokens = inputLine.trim().split("\\s"); // any whitespace
 802     for (String token : tokens)
 803     {
 804       parseFormatKeyword(token);
 805     }
 806   }
 807
 808   /**
 809    * Parse a Keyword=Value token. Possible keywords are
 810    * <ul>
 811    * <li>DataType= DNA, RNA, Nucleotide, Protein</li>
 812    * <li>DataFormat= Interleaved, ?</li>
 813    * <li>NSeqs= number of sequences (synonym NTaxa)</li>
 814    * <li>NSites= number of bases / residues</li>
 815    * <li>Property= Exon (or Coding), Intron (or Noncoding), End (of domain)</li>
 816    * <li>Indel= gap character</li>
 817    * <li>Identical= identity character (to first sequence) (synonym MatchChar)</li>
 818    * <li>Missing= missing data character</li>
 819    * <li>CodeTable= Standard, other (MEGA supports various)</li>
 820    * </ul>
 821    *
 822    * @param token
 823    * @throws FileFormatException
 824    *           if an unrecognised keyword or value is encountered
 825    */
 826   protected void parseFormatKeyword(String token)
 827           throws FileFormatException
 828   {
 829     String msg = "Unrecognised Format command: " + token;
 830     String[] bits = token.split(EQUALS);
 831     if (bits.length != 2)
 832     {
 833       throw new FileFormatException(msg);
 834     }
 835     String keyword = bits[0];
 836     String value = bits[1];
 837
 838     /*
 839      * Jalview will work out whether nucleotide or not anyway
 840      */
 841     if (keyword.equalsIgnoreCase(DATATYPE))
 842     {
 843       if (value.equalsIgnoreCase("DNA") || value.equalsIgnoreCase("RNA")
 844               || value.equalsIgnoreCase("Nucleotide"))
 845       {
 846         this.nucleotide = true;
 847         // alignment computes whether or not it is nucleotide when created
 848       }
 849       else if (value.equalsIgnoreCase(PROTEIN))
 850       {
 851         this.nucleotide = false;
 852       }
 853       else
 854       {
 855         throw new FileFormatException(msg);
 856       }
 857       setAlignmentProperty(PROP_DATATYPE, value);
 858     }
 859
 860     /*
 861      * accept non-Standard code table but save in case we want to disable
 862      * 'translate as cDNA'
 863      */
 864     else if (keyword.equalsIgnoreCase(CODETABLE))
 865     {
 866       setAlignmentProperty(PROP_CODETABLE, value);
 867     }
 868
 869     /*
 870      * save gap char to set later on alignment once created
 871      */
 872     else if (keyword.equalsIgnoreCase(INDEL))
 873     {
 874       this.gapCharacter = value.charAt(0);
 875     }
 876
 877     else if (keyword.equalsIgnoreCase(IDENTICAL)
 878             || keyword.equalsIgnoreCase("MatchChar"))
 879     {
 880       setAlignmentProperty(PROP_IDENTITY, value);
 881       this.identityCharacter = value.charAt(0);
 882       if (!".".equals(value))
 883       {
 884         System.err.println("Warning: " + token
 885                 + " not supported, Jalview uses '.' for identity");
 886       }
 887     }
 888
 889     else if (keyword.equalsIgnoreCase(MISSING))
 890     {
 891       setAlignmentProperty(PROP_MISSING, value);
 892       System.err.println("Warning: " + token + " not supported");
 893     }
 894
 895     else if (keyword.equalsIgnoreCase("Property"))
 896     {
 897       // TODO: figure out what to do with this
 898       // can it appear more than once in a file?
 899       setAlignmentProperty(PROP_MISSING, value);
 900     }
 901
 902     else if (!keyword.equalsIgnoreCase(N_SEQS)
 903             && !keyword.equalsIgnoreCase(N_SITES))
 904     {
 905       System.err.println("Warning: " + msg);
 906     }
 907   }
 908
 909   /**
 910    * Returns the trimmed data on the line following either whitespace or '=',
 911    * with any trailing semi-colon removed<br>
 912    * So
 913    * <ul>
 914    * <li>Hello World</li>
 915    * <li>!Hello: \tWorld;</li>
 916    * <li>!Hello=World</li>
 917    * <ul>
 918    * should all return "World"
 919    *
 920    * @param inputLine
 921    * @return
 922    */
 923   protected static String getValue(String inputLine)
 924   {
 925     if (inputLine == null)
 926     {
 927       return null;
 928     }
 929     String value = null;
 930     String s = inputLine.replaceAll("\t", " ").trim();
 931
 932     /*
 933      * KEYWORD = VALUE should return VALUE
 934      */
 935     int equalsPos = s.indexOf("=");
 936     if (equalsPos >= 0)
 937     {
 938       value = s.substring(equalsPos + 1);
 939     }
 940     else
 941     {
 942       int spacePos = s.indexOf(' ');
 943       value = spacePos == -1 ? "" : s.substring(spacePos + 1);
 944     }
 945     value = value.trim();
 946     if (value.endsWith(SEMICOLON))
 947     {
 948       value = value.substring(0, value.length() - 1).trim();
 949     }
 950     return value;
 951   }
 952
 953   /**
 954    * Returns true if the input line starts with "TITLE" or "!TITLE" (not case
 955    * sensitive). The latter is the official format, some older data file
 956    * examples have it without the !.
 957    *
 958    * @param inputLine
 959    * @return
 960    */
 961   protected static boolean isTitle(String inputLine)
 962   {
 963     if (inputLine == null)
 964     {
 965       return false;
 966     }
 967     String upper = inputLine.toUpperCase();
 968     return (upper.startsWith(TITLE.toUpperCase()) || upper.startsWith(BANG
 969             + TITLE.toUpperCase()));
 970   }
 971
 972   /**
 973    * Reads lines until terminated by semicolon, appending each to the
 974    * Description property value.
 975    *
 976    * @throws IOException
 977    */
 978   protected void parseDescription(String firstDescriptionLine)
 979           throws IOException
 980   {
 981     StringBuilder desc = new StringBuilder(256);
 982     desc.append(getValue(firstDescriptionLine));
 983     if (!firstDescriptionLine.endsWith(SEMICOLON))
 984     {
 985       String line = nextNonCommentLine();
 986       while (line != null)
 987       {
 988         if (line.endsWith(SEMICOLON))
 989         {
 990           desc.append(line.substring(0, line.length() - 1));
 991           break;
 992         }
 993         else if (line.length() > 0)
 994         {
 995           desc.append(line).append(newline);
 996         }
 997         line = nextNonCommentLine();
 998       }
 999     }
1000     setAlignmentProperty(PROP_DESCRIPTION, desc.toString());
1001   }
1002
1003   /**
1004    * Returns the alignment sequences in Mega format.
1005    */
1006   @Override
1007   public String print()
1008   {
1009     return MEGA_ID + newline + print(getSeqsAsArray());
1010   }
1011
1012   /**
1013    * Write out the alignment sequences in Mega format - interleaved unless
1014    * explicitly noninterleaved.
1015    */
1016   protected String print(SequenceI[] s)
1017   {
1018     String result;
1019     if (this.interleaved != null && !this.interleaved)
1020     {
1021       result = printNonInterleaved(s);
1022     }
1023     else
1024     {
1025       result = printInterleaved(s);
1026     }
1027     return result;
1028   }
1029
1030   /**
1031    * Print to string in Interleaved format - blocks of next N characters of each
1032    * sequence in turn.
1033    *
1034    * @param s
1035    */
1036   protected String printInterleaved(SequenceI[] s)
1037   {
1038     int maxIdLength = getMaxIdLength(s);
1039     int maxSequenceLength = getMaxSequenceLength(s);
1040     int numLines = maxSequenceLength / positionsPerLine + 3; // approx
1041
1042     int numDataBlocks = (maxSequenceLength - 1) / positionsPerLine + 1;
1043     int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
1044     int chunksPerLine = (positionsPerLine + spaceEvery - 1) / spaceEvery;
1045
1046     /*
1047      * Roughly size a buffer to hold the whole output
1048      */
1049     StringBuilder sb = new StringBuilder(numLines
1050             * (maxIdLength + positionsPerLine + chunksPerLine + 10));
1051
1052     /*
1053      * Output as: #Seqid CGT AGC ACT ... or blocks of 10 for peptide
1054      */
1055     int from = 0;
1056     for (int i = 0; i < numDataBlocks; i++)
1057     {
1058       sb.append(newline);
1059       boolean first = true;
1060       int advancedBy = 0;
1061       for (SequenceI seq : s)
1062       {
1063         int seqFrom = from;
1064         String seqId = String.format("#%-" + maxIdLength + "s",
1065                 seq.getName());
1066
1067         /*
1068          * output next line for this sequence
1069          */
1070         sb.append(seqId);
1071         int lastPos = seqFrom + positionsPerLine; // exclusive
1072         for (int j = 0; j < chunksPerLine; j++)
1073         {
1074           char[] subSequence = seq.getSequence(seqFrom,
1075                   Math.min(lastPos, seqFrom + spaceEvery));
1076           if (subSequence.length > 0)
1077           {
1078             sb.append(SPACE).append(subSequence);
1079           }
1080           seqFrom += subSequence.length;
1081           if (first)
1082           {
1083             // all sequences should be the same length in MEGA
1084             advancedBy += subSequence.length;
1085           }
1086         }
1087         // write last position as a comment
1088         if (writePositionNumbers)
1089         {
1090           sb.append(SPACE).append(COMMENT_START).append(from + advancedBy)
1091                   .append(COMMENT_END);
1092         }
1093         sb.append(newline);
1094         first = false;
1095       }
1096       from += advancedBy;
1097     }
1098
1099     return new String(sb);
1100   }
1101
1102   /**
1103    * Outputs to string the MEGA header and any other known and relevant
1104    * alignment properties
1105    *
1106    * @param al
1107    */
1108   protected String printHeaders(AlignmentI al)
1109   {
1110     StringBuilder sb = new StringBuilder(128);
1111     sb.append(MEGA_ID).append(newline);
1112     String propertyValue = (String) al.getProperty(PROP_TITLE);
1113     if (propertyValue != null)
1114     {
1115       sb.append(BANG).append(TITLE).append(SPACE).append(propertyValue)
1116               .append(SEMICOLON).append(newline);
1117     }
1118     propertyValue = (String) al.getProperty(PROP_DESCRIPTION);
1119     if (propertyValue != null)
1120     {
1121       sb.append(BANG).append(DESCRIPTION).append(newline)
1122               .append(propertyValue).append(SEMICOLON)
1123               .append(newline);
1124     }
1125
1126     /*
1127      * !Format DataType CodeTable
1128      */
1129     sb.append(BANG).append(FORMAT).append(newline);
1130     String dataType = (String) al.getProperty(PROP_DATATYPE);
1131     if (dataType == null)
1132     {
1133       dataType = al.isNucleotide() ? NUCLEOTIDE : PROTEIN;
1134     }
1135     sb.append(INDENT).append(DATATYPE).append(EQUALS).append(dataType);
1136     String codeTable = (String) al.getProperty(PROP_CODETABLE);
1137     sb.append(SPACE).append(CODETABLE).append(EQUALS)
1138             .append(codeTable == null ? "Standard" : codeTable)
1139             .append(newline);
1140
1141     /*
1142      * !Format NSeqs NSites (the length of sequences - they should all be the
1143      * same - including gaps)
1144      */
1145     sb.append(INDENT).append(N_SEQS).append(EQUALS).append(al.getHeight());
1146     sb.append(SPACE).append(N_SITES).append(EQUALS)
1147             .append(String.valueOf(al.getWidth()));
1148     sb.append(newline);
1149
1150     /*
1151      * !Format Indel Identical Missing
1152      */
1153     sb.append(INDENT);
1154     sb.append(INDEL).append(EQUALS).append(al.getGapCharacter());
1155     String identity = (String) al.getProperty(PROP_IDENTITY);
1156     if (identity != null)
1157     {
1158       sb.append(SPACE).append(IDENTICAL).append(EQUALS).append(identity);
1159     }
1160     String missing = (String) al.getProperty(PROP_MISSING);
1161     if (missing != null)
1162     {
1163       sb.append(SPACE).append(MISSING).append(EQUALS).append(missing);
1164     }
1165     sb.append(SEMICOLON).append(newline);
1166
1167     return sb.toString();
1168   }
1169
1170   /**
1171    * Get the longest sequence id (to allow aligned printout).
1172    *
1173    * @param s
1174    * @return
1175    */
1176   protected static int getMaxIdLength(SequenceI[] s)
1177   {
1178     // TODO pull up for reuse
1179     int maxLength = 0;
1180     for (SequenceI seq : s)
1181     {
1182       int len = seq.getName().length();
1183       if (len > maxLength)
1184       {
1185         maxLength = len;
1186       }
1187     }
1188     return maxLength;
1189   }
1190
1191   /**
1192    * Get the longest sequence length
1193    *
1194    * @param s
1195    * @return
1196    */
1197   protected static int getMaxSequenceLength(SequenceI[] s)
1198   {
1199     // TODO pull up for reuse
1200     int maxLength = 0;
1201     for (SequenceI seq : s)
1202     {
1203       int len = seq.getLength();
1204       if (len > maxLength)
1205       {
1206         maxLength = len;
1207       }
1208     }
1209     return maxLength;
1210   }
1211
1212   /**
1213    * Print to string in noninterleaved format - all of each sequence in turn, in
1214    * blocks of 50 characters.
1215    *
1216    * @param s
1217    * @return
1218    */
1219   protected String printNonInterleaved(SequenceI[] s)
1220   {
1221     int maxSequenceLength = getMaxSequenceLength(s);
1222     // approx
1223     int numLines = maxSequenceLength / positionsPerLine + 2 + s.length;
1224
1225     /*
1226      * Roughly size a buffer to hold the whole output
1227      */
1228     StringBuilder sb = new StringBuilder(numLines * positionsPerLine);
1229
1230     int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
1231     int chunksPerLine = positionsPerLine / spaceEvery;
1232     for (SequenceI seq : s)
1233     {
1234       sb.append(newline);
1235       sb.append(HASHSIGN + seq.getName()).append(newline);
1236       int startPos = 0;
1237       while (startPos < seq.getLength())
1238       {
1239         boolean firstChunk = true;
1240         /*
1241          * print next line for this sequence
1242          */
1243         int lastPos = startPos + positionsPerLine; // exclusive
1244         for (int j = 0; j < chunksPerLine; j++)
1245         {
1246           char[] subSequence = seq.getSequence(startPos,
1247                   Math.min(lastPos, startPos + positionsPerLine));
1248           if (subSequence.length > 0)
1249           {
1250             if (!firstChunk)
1251             {
1252               sb.append(SPACE);
1253             }
1254             sb.append(subSequence);
1255             firstChunk = false;
1256           }
1257           startPos += subSequence.length;
1258         }
1259         sb.append(newline);
1260       }
1261     }
1262
1263     return new String(sb);
1264   }
1265
1266   /**
1267    * Flag this file as interleaved or not, based on data format. Throws an
1268    * exception if has previously been determined to be otherwise.
1269    *
1270    * @param isIt
1271    * @param dataLine
1272    * @throws IOException
1273    */
1274   protected void assertInterleaved(boolean isIt, String dataLine)
1275           throws FileFormatException
1276   {
1277     if (this.interleaved != null && isIt != this.interleaved.booleanValue())
1278     {
1279       throw new FileFormatException(
1280               "Parse error: mix of interleaved and noninterleaved detected, at line: "
1281                       + dataLine);
1282     }
1283     this.interleaved = new Boolean(isIt);
1284     setAlignmentProperty(PROP_INTERLEAVED, interleaved.toString());
1285   }
1286
1287   public boolean isInterleaved()
1288   {
1289     return this.interleaved == null ? false : this.interleaved
1290             .booleanValue();
1291   }
1292
1293   /**
1294    * Adds saved parsed values either as alignment properties, or (in some cases)
1295    * as specific member fields of the alignment
1296    */
1297   @Override
1298   public void addProperties(AlignmentI al)
1299   {
1300     super.addProperties(al);
1301     al.setGapCharacter(gapCharacter);
1302
1303     /*
1304      * warn if e.g. DataType=DNA but data is protein (or vice versa)
1305      */
1306     if (this.nucleotide != null && this.nucleotide != al.isNucleotide()) {
1307       System.err.println("Warning: " + this.title + " declared "
1308               + (nucleotide ? "" : " not ") + "nucleotide but it is"
1309               + (nucleotide ? " not" : ""));
1310     }
1311   }
1312
1313   /**
1314    * Print the given alignment in MEGA format. If the alignment was created by
1315    * parsing a MEGA file, it should have properties set (e.g. Title) which can
1316    * influence the output.
1317    */
1318   @Override
1319   public String print(AlignmentI al)
1320   {
1321     this.nucleotide = al.isNucleotide();
1322
1323     String lineLength = (String) al.getProperty(PROP_LINELENGTH);
1324     this.positionsPerLine = lineLength == null ? DEFAULT_LINE_LENGTH : Integer
1325             .parseInt(lineLength);
1326
1327     /*
1328      * round down to a multiple of 3 positions per line for nucleotide
1329      */
1330     if (nucleotide)
1331     {
1332       positionsPerLine = positionsPerLine - (positionsPerLine % 3);
1333     }
1334
1335     String interleave = (String) al.getProperty(PROP_INTERLEAVED);
1336     if (interleave != null)
1337     {
1338       this.interleaved = Boolean.valueOf(interleave);
1339     }
1340
1341     String headers = printHeaders(al);
1342     return headers + print(al.getSequencesArray());
1343   }
1344
1345   /**
1346    * Returns the number of sequence positions output per line
1347    *
1348    * @return
1349    */
1350   public int getPositionsPerLine()
1351   {
1352     return positionsPerLine;
1353   }
1354
1355   /**
1356    * Sets the number of sequence positions output per line. Note these will be
1357    * formatted in blocks of 3 (nucleotide) or 10 (peptide).
1358    *
1359    * @param p
1360    */
1361   public void setPositionsPerLine(int p)
1362   {
1363     this.positionsPerLine = p;
1364   }
1365 }