src/jalview/io/MegaFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.0b1)
   3  * Copyright (C) 2014 The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  10  *
  11  * Jalview is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty
  13  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  14  * PURPOSE.  See the GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  17  * The Jalview Authors are detailed in the 'AUTHORS' file.
  18  */
  19 package jalview.io;
  20
  21 import jalview.datamodel.AlignmentI;
  22 import jalview.datamodel.Sequence;
  23 import jalview.datamodel.SequenceI;
  24
  25 import java.io.IOException;
  26 import java.util.LinkedHashMap;
  27 import java.util.Map;
  28 import java.util.Map.Entry;
  29 import java.util.Set;
  30
  31 /**
  32  * A parser for input or output of MEGA format files. <br>
  33  * <br>
  34  * Tamura K, Stecher G, Peterson D, Filipski A, and Kumar S (2013) MEGA6:
  35  * Molecular Evolutionary Genetics Analysis Version 6.0. Molecular Biology and
  36  * Evolution 30: 2725-2729. <br>
  37  * <br>
  38  *
  39  * MEGA file format is supported as described in
  40  * http://www.megasoftware.net/manual.pdf <br>
  41  * Limitations:
  42  * <ul>
  43  * <li>nested comments (marked by [ ]) are accepted but not preserved</li>
  44  * <li>to be completed</li>
  45  * </ul>
  46  *
  47  * @see http://www.megasoftware.net/
  48  */
  49 public class MegaFile extends AlignFile
  50 {
  51   private static final int DEFAULT_LINE_LENGTH = 60;
  52
  53   private static final String INDENT = "    ";
  54
  55   private static final String N_SITES = "NSites";
  56
  57   private static final String N_SEQS = "NSeqs";
  58
  59   private static final String MISSING = "Missing";
  60
  61   private static final String IDENTICAL = "Identical";
  62
  63   private static final String INDEL = "Indel";
  64
  65   private static final String CODETABLE = "CodeTable";
  66
  67   private static final String PROTEIN = "Protein";
  68
  69   private static final String NUCLEOTIDE = "Nucleotide";
  70
  71   private static final String DATATYPE = "DataType";
  72
  73   private static final char COMMENT_START = '[';
  74
  75   private static final char COMMENT_END = ']';
  76
  77   private static final String HASHSIGN = "#";
  78
  79   private static final String SEMICOLON = ";";
  80
  81   private static final String BANG = "!";
  82
  83   private static final String EQUALS = "=";
  84
  85   private static final String MEGA_ID = HASHSIGN + "MEGA";
  86
  87   private static final String TITLE = "Title";
  88
  89   private static final String FORMAT = "Format";
  90
  91   private static final String DESCRIPTION = "Description";
  92
  93   private static final String GENE = "Gene";
  94
  95   private static final String DOMAIN = "Domain";
  96
  97   /*
  98    * names of properties to save to the alignment (may affect eventual output
  99    * format)
 100    */
 101   static final String PROP_TITLE = "MEGA_TITLE";
 102
 103   static final String PROP_INTERLEAVED = "MEGA_INTERLEAVED";
 104
 105   static final String PROP_DESCRIPTION = "MEGA_DESCRIPTION";
 106
 107   static final String PROP_CODETABLE = "MEGA_CODETABLE";
 108
 109   static final String PROP_IDENTITY = "MEGA_IDENTITY";
 110
 111   static final String PROP_MISSING = "MEGA_MISSING";
 112
 113   static final String PROP_DATATYPE = "MEGA_DATATYPE";
 114
 115   // number of bases per line of file (value is inferred)
 116   static final String PROP_LINELENGTH = "MEGA_LINELENGTH";
 117
 118   // TODO: need a controlled name for Gene as a feature if we want to be able to
 119   // output the MEGA file with !Gene headers
 120   // WTF do we do if the sequences get realigned?
 121
 122   // initial size for sequence data buffer
 123   private static final int SEQBUFFERSIZE = 256;
 124
 125   private static final String SPACE = " ";
 126
 127   /*
 128    * number of sequence positions output per line
 129    */
 130   private int positionsPerLine;
 131
 132   private String title;
 133
 134   // gap character may be explicitly declared, if not we infer it
 135   private Character gapCharacter;
 136
 137   // this can be True, False or null (meaning not asserted in file)
 138   private Boolean nucleotide;
 139
 140   // set once we have seen one block of interleaved data
 141   private boolean firstDataBlockRead = false;
 142
 143   // this can be True, False or null (meaning we don't know yet)
 144   private Boolean interleaved;
 145
 146   // write end of line positions as a comment
 147   private boolean writePositionNumbers = true;
 148
 149   public MegaFile()
 150   {
 151   }
 152
 153   public MegaFile(String inFile, String type) throws IOException
 154   {
 155     super(inFile, type);
 156   }
 157
 158   public MegaFile(FileParse source) throws IOException
 159   {
 160     super(source);
 161   }
 162
 163   /**
 164    * Parse the input stream.
 165    */
 166   @Override
 167   public void parse() throws IOException
 168   {
 169     /*
 170      * Read and process MEGA and Title/Format/Description headers if present.
 171      * Returns the first data line following the headers.
 172      */
 173     String dataLine = parseHeaderLines();
 174
 175     /*
 176      * Temporary store of {sequenceId, positionData} while parsing interleaved
 177      * sequences; sequences are maintained in the order in which they are added
 178      * i.e. read in the file
 179      */
 180     Map<String, StringBuilder> seqData = new LinkedHashMap<String, StringBuilder>();
 181
 182     /*
 183      * The id of the sequence being read (for non-interleaved)
 184      */
 185     String currentId = "";
 186
 187     while (dataLine != null)
 188     {
 189       dataLine = dataLine.trim();
 190       if (dataLine.length() > 0)
 191       {
 192         if (dataLine.startsWith(BANG + GENE))
 193         {
 194           parseGene(dataLine);
 195         }
 196         else if (dataLine.startsWith(BANG + DOMAIN))
 197         {
 198           parseDomain(dataLine);
 199         }
 200         else
 201         {
 202           currentId = parseDataLine(dataLine, seqData, currentId);
 203         }
 204       }
 205       else if (!seqData.isEmpty())
 206       {
 207         /*
 208          * Blank line after processing some data...
 209          */
 210         this.firstDataBlockRead = true;
 211       }
 212       dataLine = nextNonCommentLine();
 213     }
 214
 215     // remember the (longest) line length read in, so we can output the same
 216     setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
 217
 218     setSequences(seqData);
 219   }
 220
 221   /**
 222    * Parse a !Gene command line
 223    *
 224    * @param dataLine
 225    */
 226   protected void parseGene(String dataLine)
 227   {
 228   }
 229
 230   /**
 231    * Parse a !Domain command line
 232    *
 233    * @param dataLine
 234    */
 235   private void parseDomain(String dataLine)
 236   {
 237   }
 238
 239   /**
 240    * Returns the next line that is not a comment, or null at end of file.
 241    * Comments in MEGA are within [ ] brackets, and may be nested.
 242    *
 243    * @return
 244    * @throws IOException
 245    */
 246   protected String nextNonCommentLine() throws IOException
 247   {
 248     return nextNonCommentLine(0);
 249   }
 250
 251   /**
 252    * Returns the next line that is not a comment, or null at end of file.
 253    * Comments in MEGA are within [ ] brackets, and may be nested.
 254    *
 255    * @param depth
 256    *          current depth of nesting of comments while parsing
 257    * @return
 258    * @throws IOException
 259    */
 260   protected String nextNonCommentLine(final int depth) throws IOException
 261   {
 262     String data = null;
 263     data = nextLine();
 264     if (data == null)
 265     {
 266       if (depth > 0)
 267       {
 268         System.err.println("Warning: unterminated comment in data file");
 269       }
 270       return data;
 271     }
 272     int leftBracket = data.indexOf(COMMENT_START);
 273
 274     /*
 275      * reject unnested comment following data on the same line
 276      */
 277     if (depth == 0 && leftBracket > 0)
 278     {
 279       throw new FileFormatException(
 280               "Can't parse comment following data at " + data);
 281     }
 282
 283     /*
 284      * If we are in a (possibly nested) comment after parsing this line, keep
 285      * reading recursively until the comment has unwound
 286      */
 287     int newDepth = commentDepth(data, depth);
 288     if (newDepth > 0)
 289     {
 290       return nextNonCommentLine(newDepth);
 291     }
 292     else
 293     {
 294       /*
 295        * not in a comment by end of this line; return what is left (or the next
 296        * line if that is empty)
 297        */
 298       String nonCommentPart = getNonCommentContent(data, depth);
 299       // if (nonCommentPart.length() > 0)
 300       // {
 301         return nonCommentPart;
 302       // }
 303       // return nextNonCommentLine(0);
 304     }
 305   }
 306
 307   /**
 308    * Returns what is left of the input data after removing any comments, whether
 309    * 'in progress' from preceding lines, or embedded in the current line
 310    *
 311    * @param data
 312    *          input data
 313    * @param depth
 314    *          nested depth of comments pending termination
 315    * @return
 316    * @throws FileFormatException
 317    */
 318   protected static String getNonCommentContent(String data, int depth)
 319           throws FileFormatException
 320   {
 321     int len = data.length();
 322     StringBuilder result = new StringBuilder(len);
 323     for (int i = 0; i < len; i++)
 324     {
 325       char c = data.charAt(i);
 326       switch (c)
 327       {
 328       case COMMENT_START:
 329         depth++;
 330         break;
 331
 332       case COMMENT_END:
 333         if (depth > 0)
 334         {
 335           depth--;
 336         }
 337         else
 338         {
 339           result.append(c);
 340         }
 341         break;
 342
 343       default:
 344         if (depth == 0)
 345         {
 346           result.append(c);
 347         }
 348       }
 349     }
 350     return result.toString();
 351   }
 352
 353   /**
 354    * Calculates new depth of comment after parsing an input line i.e. the excess
 355    * of opening '[' over closing ']' characters. Any excess ']' are ignored (not
 356    * treated as comment delimiters).
 357    *
 358    * @param data
 359    *          input line
 360    * @param depth
 361    *          current comment nested depth before parsing the line
 362    * @return new depth after parsing the line
 363    */
 364   protected static int commentDepth(CharSequence data, int depth)
 365   {
 366     int newDepth = depth;
 367     int len = data.length();
 368     for (int i = 0; i < len; i++)
 369     {
 370       char c = data.charAt(i);
 371       if (c == COMMENT_START)
 372       {
 373         newDepth++;
 374       }
 375       else if (c == COMMENT_END && newDepth > 0)
 376       {
 377         newDepth--;
 378       }
 379     }
 380     return newDepth;
 381   }
 382
 383   /**
 384    * Convert the parsed sequence strings to objects and store them in the model.
 385    *
 386    * @param seqData
 387    */
 388   protected void setSequences(Map<String, StringBuilder> seqData)
 389   {
 390     Set<Entry<String, StringBuilder>> datasets = seqData.entrySet();
 391
 392     for (Entry<String, StringBuilder> dataset : datasets)
 393     {
 394       String sequenceId = dataset.getKey();
 395       StringBuilder characters = dataset.getValue();
 396       SequenceI s = new Sequence(sequenceId, new String(characters));
 397       this.seqs.addElement(s);
 398     }
 399   }
 400
 401   /**
 402    * Process one line of sequence data. If it has no sequence identifier, append
 403    * to the current id's sequence. Else parse out the sequence id and append the
 404    * data (if any) to that id's sequence. Returns the sequence id (implicit or
 405    * explicit) for this line.
 406    *
 407    * @param dataLine
 408    * @param seqData
 409    * @param currentid
 410    * @return
 411    * @throws IOException
 412    */
 413   protected String parseDataLine(String dataLine,
 414           Map<String, StringBuilder> seqData, String currentId)
 415           throws IOException
 416   {
 417     String seqId = getSequenceId(dataLine);
 418     if (seqId == null)
 419     {
 420       /*
 421        * Just character data
 422        */
 423       parseNoninterleavedDataLine(dataLine, seqData, currentId);
 424       return currentId;
 425     }
 426     else if ((HASHSIGN + seqId).trim().equals(dataLine.trim()))
 427     {
 428       /*
 429        * Sequence id only - header line for noninterleaved data
 430        */
 431       return seqId;
 432     }
 433     else
 434     {
 435       /*
 436        * Sequence id followed by data
 437        */
 438       parseInterleavedDataLine(dataLine, seqData, seqId);
 439       return seqId;
 440     }
 441   }
 442
 443   /**
 444    * Add a line of sequence data to the buffer for the given sequence id. Start
 445    * a new one if we haven't seen it before.
 446    *
 447    * @param dataLine
 448    * @param seqData
 449    * @param currentId
 450    * @throws IOException
 451    */
 452   protected void parseNoninterleavedDataLine(String dataLine,
 453           Map<String, StringBuilder> seqData, String currentId)
 454           throws IOException
 455   {
 456     if (currentId == null)
 457     {
 458       /*
 459        * Oops. Data but no sequence id context.
 460        */
 461       throw new IOException("No sequence id context at: " + dataLine);
 462     }
 463
 464     assertInterleaved(false, dataLine);
 465
 466     StringBuilder sb = getSequenceDataBuffer(seqData, currentId);
 467
 468     /*
 469      * Add the current line of data to the sequence.
 470      */
 471     sb.append(dataLine);
 472
 473     setPositionsPerLine(Math.max(positionsPerLine, dataLine.length()));
 474   }
 475
 476   /**
 477    * Get the sequence data for this sequence id, starting a new one if
 478    * necessary.
 479    *
 480    * @param seqData
 481    * @param currentId
 482    * @return
 483    */
 484   protected StringBuilder getSequenceDataBuffer(
 485           Map<String, StringBuilder> seqData, String currentId)
 486   {
 487     StringBuilder sb = seqData.get(currentId);
 488     if (sb == null)
 489     {
 490       // first data met for this sequence id, start a new buffer
 491       sb = new StringBuilder(SEQBUFFERSIZE);
 492       seqData.put(currentId, sb);
 493     }
 494     return sb;
 495   }
 496
 497   /**
 498    * Parse one line of interleaved data e.g.
 499    *
 500    * <pre>
 501    * #TheSeqId CGATCGCATGCA
 502    * </pre>
 503    *
 504    * @param dataLine
 505    * @param seqData
 506    * @param seqId
 507    * @throws IOException
 508    */
 509   protected void parseInterleavedDataLine(String dataLine,
 510           Map<String, StringBuilder> seqData, String seqId)
 511           throws IOException
 512   {
 513     /*
 514      * New sequence found in second or later data block - error.
 515      */
 516     if (this.firstDataBlockRead && !seqData.containsKey(seqId))
 517     {
 518       throw new IOException(
 519               "Parse error: misplaced new sequence starting at " + dataLine);
 520     }
 521
 522     StringBuilder sb = getSequenceDataBuffer(seqData, seqId);
 523     String data = dataLine.substring(seqId.length() + 1).trim();
 524
 525     /*
 526      * Do nothing if this line is _only_ a sequence id with no data following.
 527      *
 528      * Remove any internal spaces
 529      */
 530     if (data != null && data.length() > 0)
 531     {
 532       if (data.indexOf(SPACE) != -1)
 533       {
 534         data = data.replace(SPACE, "");
 535       }
 536       sb.append(data);
 537       setPositionsPerLine(Math.max(positionsPerLine, data.length()));
 538       assertInterleaved(true, dataLine);
 539     }
 540   }
 541
 542   /**
 543    * If the line begins with (e.g.) "#abcde " then returns "abcde" as the
 544    * identifier. Else returns null.
 545    *
 546    * @param dataLine
 547    * @return
 548    */
 549   public static String getSequenceId(String dataLine)
 550   {
 551     // TODO refactor to a StringUtils type class
 552     if (dataLine != null)
 553     {
 554       if (dataLine.startsWith(HASHSIGN))
 555       {
 556         int spacePos = dataLine.indexOf(" ");
 557         return (spacePos == -1 ? dataLine.substring(1) : dataLine
 558                 .substring(1, spacePos));
 559       }
 560     }
 561     return null;
 562   }
 563
 564   /**
 565    * Read the #MEGA and Title/Format/Description header lines (if present).
 566    *
 567    * Save as alignment properties in case useful.
 568    *
 569    * @return the next non-blank line following the header lines.
 570    * @throws IOException
 571    */
 572   protected String parseHeaderLines() throws IOException
 573   {
 574     String inputLine = null;
 575     while ((inputLine = nextNonCommentLine()) != null)
 576     {
 577       inputLine = inputLine.trim();
 578
 579       /*
 580        * skip blank lines
 581        */
 582       if (inputLine.length() == 0)
 583       {
 584         continue;
 585       }
 586
 587       if (inputLine.toUpperCase().startsWith(MEGA_ID))
 588       {
 589         continue;
 590       }
 591
 592       if (isTitle(inputLine))
 593       {
 594         this.title = getValue(inputLine);
 595         setAlignmentProperty(PROP_TITLE, title);
 596       }
 597       else if (inputLine.startsWith(BANG + DESCRIPTION))
 598       {
 599         parseDescription(inputLine);
 600       }
 601
 602       else if (inputLine.startsWith(BANG + FORMAT))
 603       {
 604         parseFormat(inputLine);
 605       }
 606       else if (!inputLine.toUpperCase().startsWith(MEGA_ID))
 607       {
 608
 609         /*
 610          * Return the first 'data line' i.e. one that is not blank, #MEGA or
 611          * TITLE:
 612          */
 613         break;
 614       }
 615     }
 616     return inputLine;
 617   }
 618
 619   /**
 620    * Parse a !Format statement. This may be multiline, and is ended by a
 621    * semicolon.
 622    *
 623    * @param inputLine
 624    * @throws IOException
 625    */
 626   protected void parseFormat(String inputLine) throws IOException
 627   {
 628     while (inputLine != null)
 629     {
 630       parseFormatLine(inputLine);
 631       if (inputLine.endsWith(SEMICOLON))
 632       {
 633         break;
 634       }
 635       inputLine = nextNonCommentLine();
 636     }
 637   }
 638
 639   /**
 640    * Parse one line of a !Format statement. This may contain one or more
 641    * keyword=value pairs.
 642    *
 643    * @param inputLine
 644    * @throws FileFormatException
 645    */
 646   protected void parseFormatLine(String inputLine)
 647           throws FileFormatException
 648   {
 649     if (inputLine.startsWith(BANG + FORMAT))
 650     {
 651       inputLine = inputLine.substring((BANG + FORMAT).length());
 652     }
 653     if (inputLine.endsWith(SEMICOLON))
 654     {
 655       inputLine = inputLine.substring(0, inputLine.length() - 1);
 656     }
 657     if (inputLine.length() == 0)
 658     {
 659       return;
 660     }
 661     String[] tokens = inputLine.trim().split("\\s"); // any whitespace
 662     for (String token : tokens)
 663     {
 664       parseFormatKeyword(token);
 665     }
 666   }
 667
 668   /**
 669    * Parse a Keyword=Value token. Possible keywords are
 670    * <ul>
 671    * <li>DataType= DNA, RNA, Nucleotide, Protein</li>
 672    * <li>DataFormat= Interleaved, ?</li>
 673    * <li>NSeqs= number of sequences (synonym NTaxa)</li>
 674    * <li>NSites= number of bases / residues</li>
 675    * <li>Property= Exon (or Coding), Intron (or Noncoding), End (of domain)</li>
 676    * <li>Indel= gap character</li>
 677    * <li>Identical= identity character (to first sequence) (synonym MatchChar)</li>
 678    * <li>Missing= missing data character</li>
 679    * <li>CodeTable= Standard, other (MEGA supports various)</li>
 680    * </ul>
 681    *
 682    * @param token
 683    * @throws FileFormatException
 684    *           if an unrecognised keyword or value is encountered
 685    */
 686   protected void parseFormatKeyword(String token)
 687           throws FileFormatException
 688   {
 689     String msg = "Unrecognised Format command: " + token;
 690     String[] bits = token.split(EQUALS);
 691     if (bits.length != 2)
 692     {
 693       throw new FileFormatException(msg);
 694     }
 695     String keyword = bits[0];
 696     String value = bits[1];
 697
 698     /*
 699      * Jalview will work out whether nucleotide or not anyway
 700      */
 701     if (keyword.equalsIgnoreCase(DATATYPE))
 702     {
 703       if (value.equalsIgnoreCase("DNA") || value.equalsIgnoreCase("RNA")
 704               || value.equalsIgnoreCase("Nucleotide"))
 705       {
 706         this.nucleotide = true;
 707         // alignment computes whether or not it is nucleotide when created
 708       }
 709       else if (value.equalsIgnoreCase(PROTEIN))
 710       {
 711         this.nucleotide = false;
 712       }
 713       else
 714       {
 715         throw new FileFormatException(msg);
 716       }
 717       setAlignmentProperty(PROP_DATATYPE, value);
 718     }
 719
 720     /*
 721      * accept non-Standard code table but save in case we want to disable
 722      * 'translate as cDNA'
 723      */
 724     else if (keyword.equalsIgnoreCase(CODETABLE))
 725     {
 726       setAlignmentProperty(PROP_CODETABLE, value);
 727     }
 728
 729     /*
 730      * save gap char to set later on alignment once created
 731      */
 732     else if (keyword.equalsIgnoreCase(INDEL))
 733     {
 734       this.gapCharacter = value.charAt(0);
 735     }
 736
 737     else if (keyword.equalsIgnoreCase(IDENTICAL)
 738             || keyword.equalsIgnoreCase("MatchChar"))
 739     {
 740       setAlignmentProperty(PROP_IDENTITY, value);
 741       if (!".".equals(value))
 742       {
 743         System.err.println("Warning: " + token
 744                 + " not supported, Jalview uses '.' for identity");
 745       }
 746     }
 747
 748     else if (keyword.equalsIgnoreCase(MISSING))
 749     {
 750       setAlignmentProperty(PROP_MISSING, value);
 751       System.err.println("Warning: " + token + " not supported");
 752     }
 753
 754     else if (keyword.equalsIgnoreCase("Property"))
 755     {
 756       // TODO: figure out what to do with this
 757       // can it appear more than once in a file?
 758       setAlignmentProperty(PROP_MISSING, value);
 759     }
 760
 761     else if (!keyword.equalsIgnoreCase(N_SEQS)
 762             && !keyword.equalsIgnoreCase(N_SITES))
 763     {
 764       System.err.println("Warning: " + msg);
 765     }
 766   }
 767
 768   /**
 769    * Returns the trimmed data on the line following either whitespace or '=',
 770    * with any trailing semi-colon removed<br>
 771    * So
 772    * <ul>
 773    * <li>Hello World</li>
 774    * <li>!Hello: \tWorld;</li>
 775    * <li>!Hello=World</li>
 776    * <ul>
 777    * should all return "World"
 778    *
 779    * @param inputLine
 780    * @return
 781    */
 782   protected static String getValue(String inputLine)
 783   {
 784     if (inputLine == null)
 785     {
 786       return null;
 787     }
 788     String value = null;
 789     String s = inputLine.replaceAll("\t", " ").trim();
 790
 791     /*
 792      * KEYWORD = VALUE should return VALUE
 793      */
 794     int equalsPos = s.indexOf("=");
 795     if (equalsPos >= 0)
 796     {
 797       value = s.substring(equalsPos + 1);
 798     }
 799     else
 800     {
 801       int spacePos = s.indexOf(' ');
 802       value = spacePos == -1 ? "" : s.substring(spacePos + 1);
 803     }
 804     value = value.trim();
 805     if (value.endsWith(SEMICOLON))
 806     {
 807       value = value.substring(0, value.length() - 1).trim();
 808     }
 809     return value;
 810   }
 811
 812   /**
 813    * Returns true if the input line starts with "TITLE" or "!TITLE" (not case
 814    * sensitive). The latter is the official format, some older data file
 815    * examples have it without the !.
 816    *
 817    * @param inputLine
 818    * @return
 819    */
 820   protected static boolean isTitle(String inputLine)
 821   {
 822     if (inputLine == null)
 823     {
 824       return false;
 825     }
 826     String upper = inputLine.toUpperCase();
 827     return (upper.startsWith(TITLE.toUpperCase()) || upper.startsWith(BANG
 828             + TITLE.toUpperCase()));
 829   }
 830
 831   /**
 832    * Reads lines until terminated by semicolon, appending each to the
 833    * Description property value.
 834    *
 835    * @throws IOException
 836    */
 837   protected void parseDescription(String firstDescriptionLine)
 838           throws IOException
 839   {
 840     StringBuilder desc = new StringBuilder(256);
 841     String line = getValue(firstDescriptionLine);
 842     while (line != null)
 843     {
 844       if (line.endsWith(SEMICOLON))
 845       {
 846         desc.append(line.substring(0, line.length() - 1));
 847         break;
 848       }
 849       else if (line.length() > 0)
 850       {
 851         desc.append(line).append(newline);
 852       }
 853       line = nextNonCommentLine();
 854     }
 855     setAlignmentProperty(PROP_DESCRIPTION, desc.toString());
 856   }
 857
 858   /**
 859    * Returns the alignment sequences in Mega format.
 860    */
 861   @Override
 862   public String print()
 863   {
 864     return MEGA_ID + newline + print(getSeqsAsArray());
 865   }
 866
 867   /**
 868    * Write out the alignment sequences in Mega format - interleaved unless
 869    * explicitly noninterleaved.
 870    */
 871   protected String print(SequenceI[] s)
 872   {
 873     String result;
 874     if (this.interleaved != null && !this.interleaved)
 875     {
 876       result = printNonInterleaved(s);
 877     }
 878     else
 879     {
 880       result = printInterleaved(s);
 881     }
 882     return result;
 883   }
 884
 885   /**
 886    * Print to string in Interleaved format - blocks of next N characters of each
 887    * sequence in turn.
 888    *
 889    * @param s
 890    */
 891   protected String printInterleaved(SequenceI[] s)
 892   {
 893     int maxIdLength = getMaxIdLength(s);
 894     int maxSequenceLength = getMaxSequenceLength(s);
 895     int numLines = maxSequenceLength / positionsPerLine + 3; // approx
 896
 897     int numDataBlocks = (maxSequenceLength - 1) / positionsPerLine + 1;
 898     int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
 899     int chunksPerLine = (positionsPerLine + spaceEvery - 1) / spaceEvery;
 900
 901     /*
 902      * Roughly size a buffer to hold the whole output
 903      */
 904     StringBuilder sb = new StringBuilder(numLines
 905             * (maxIdLength + positionsPerLine + chunksPerLine + 10));
 906
 907     /*
 908      * Output as: #Seqid CGT AGC ACT ... or blocks of 10 for peptide
 909      */
 910     int from = 0;
 911     for (int i = 0; i < numDataBlocks; i++)
 912     {
 913       sb.append(newline);
 914       boolean first = true;
 915       int advancedBy = 0;
 916       for (SequenceI seq : s)
 917       {
 918         int seqFrom = from;
 919         String seqId = String.format("#%-" + maxIdLength + "s",
 920                 seq.getName());
 921
 922         /*
 923          * output next line for this sequence
 924          */
 925         sb.append(seqId);
 926         int lastPos = seqFrom + positionsPerLine; // exclusive
 927         for (int j = 0; j < chunksPerLine; j++)
 928         {
 929           char[] subSequence = seq.getSequence(seqFrom,
 930                   Math.min(lastPos, seqFrom + spaceEvery));
 931           if (subSequence.length > 0)
 932           {
 933             sb.append(SPACE).append(subSequence);
 934           }
 935           seqFrom += subSequence.length;
 936           if (first)
 937           {
 938             // all sequences should be the same length in MEGA
 939             advancedBy += subSequence.length;
 940           }
 941         }
 942         // write last position as a comment
 943         if (writePositionNumbers)
 944         {
 945           sb.append(SPACE).append(COMMENT_START).append(from + advancedBy)
 946                   .append(COMMENT_END);
 947         }
 948         sb.append(newline);
 949         first = false;
 950       }
 951       from += advancedBy;
 952     }
 953
 954     return new String(sb);
 955   }
 956
 957   /**
 958    * Outputs to string the MEGA header and any other known and relevant
 959    * alignment properties
 960    *
 961    * @param al
 962    */
 963   protected String printHeaders(AlignmentI al)
 964   {
 965     StringBuilder sb = new StringBuilder(128);
 966     sb.append(MEGA_ID).append(newline);
 967     String propertyValue = (String) al.getProperty(PROP_TITLE);
 968     if (propertyValue != null)
 969     {
 970       sb.append(BANG).append(TITLE).append(SPACE)
 971 .append(propertyValue)
 972               .append(SEMICOLON)
 973               .append(newline);
 974     }
 975     propertyValue = (String) al.getProperty(PROP_DESCRIPTION);
 976     if (propertyValue != null)
 977     {
 978       sb.append(BANG).append(DESCRIPTION).append(newline)
 979               .append(propertyValue).append(SEMICOLON)
 980               .append(newline);
 981     }
 982
 983     /*
 984      * !Format DataType CodeTable
 985      */
 986     sb.append(BANG).append(FORMAT).append(newline);
 987     String dataType = (String) al.getProperty(PROP_DATATYPE);
 988     if (dataType == null)
 989     {
 990       dataType = al.isNucleotide() ? NUCLEOTIDE : PROTEIN;
 991     }
 992     sb.append(INDENT).append(DATATYPE).append(EQUALS).append(dataType);
 993     String codeTable = (String) al.getProperty(PROP_CODETABLE);
 994     sb.append(SPACE).append(CODETABLE).append(EQUALS)
 995             .append(codeTable == null ? "Standard" : codeTable)
 996             .append(newline);
 997
 998     /*
 999      * !Format NSeqs NSites
1000      * NSites the length of any sequence (they should all be the same), excluding
1001      * gaps?!?
1002      */
1003     sb.append(INDENT).append(N_SEQS).append(EQUALS).append(al.getHeight());
1004     SequenceI seq = al.getSequenceAt(0);
1005     sb.append(SPACE).append(N_SITES).append(EQUALS)
1006             .append(seq.getEnd() - seq.getStart() + 1);
1007     sb.append(newline);
1008
1009     /*
1010      * !Format Indel Identical Missing
1011      */
1012     sb.append(INDENT);
1013     sb.append(INDEL).append(EQUALS).append(al.getGapCharacter());
1014     String identity = (String) al.getProperty(PROP_IDENTITY);
1015     if (identity != null)
1016     {
1017       sb.append(SPACE).append(IDENTICAL).append(EQUALS).append(identity);
1018     }
1019     String missing = (String) al.getProperty(PROP_MISSING);
1020     if (missing != null)
1021     {
1022       sb.append(SPACE).append(MISSING).append(EQUALS).append(missing);
1023     }
1024     sb.append(SEMICOLON).append(newline);
1025
1026     return sb.toString();
1027   }
1028
1029   /**
1030    * Get the longest sequence id (to allow aligned printout).
1031    *
1032    * @param s
1033    * @return
1034    */
1035   protected static int getMaxIdLength(SequenceI[] s)
1036   {
1037     // TODO pull up for reuse
1038     int maxLength = 0;
1039     for (SequenceI seq : s)
1040     {
1041       int len = seq.getName().length();
1042       if (len > maxLength)
1043       {
1044         maxLength = len;
1045       }
1046     }
1047     return maxLength;
1048   }
1049
1050   /**
1051    * Get the longest sequence length
1052    *
1053    * @param s
1054    * @return
1055    */
1056   protected static int getMaxSequenceLength(SequenceI[] s)
1057   {
1058     // TODO pull up for reuse
1059     int maxLength = 0;
1060     for (SequenceI seq : s)
1061     {
1062       int len = seq.getLength();
1063       if (len > maxLength)
1064       {
1065         maxLength = len;
1066       }
1067     }
1068     return maxLength;
1069   }
1070
1071   /**
1072    * Print to string in noninterleaved format - all of each sequence in turn, in
1073    * blocks of 50 characters.
1074    *
1075    * @param s
1076    * @return
1077    */
1078   protected String printNonInterleaved(SequenceI[] s)
1079   {
1080     int maxSequenceLength = getMaxSequenceLength(s);
1081     // approx
1082     int numLines = maxSequenceLength / positionsPerLine + 2 + s.length;
1083
1084     /*
1085      * Roughly size a buffer to hold the whole output
1086      */
1087     StringBuilder sb = new StringBuilder(numLines * positionsPerLine);
1088
1089     int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
1090     int chunksPerLine = positionsPerLine / spaceEvery;
1091     for (SequenceI seq : s)
1092     {
1093       sb.append(newline);
1094       sb.append(HASHSIGN + seq.getName()).append(newline);
1095       int startPos = 0;
1096       while (startPos < seq.getLength())
1097       {
1098         boolean firstChunk = true;
1099         /*
1100          * print next line for this sequence
1101          */
1102         int lastPos = startPos + positionsPerLine; // exclusive
1103         for (int j = 0; j < chunksPerLine; j++)
1104         {
1105           char[] subSequence = seq.getSequence(startPos,
1106                   Math.min(lastPos, startPos + positionsPerLine));
1107           if (subSequence.length > 0)
1108           {
1109             if (!firstChunk)
1110             {
1111               sb.append(SPACE);
1112             }
1113             sb.append(subSequence);
1114             firstChunk = false;
1115           }
1116           startPos += subSequence.length;
1117         }
1118         sb.append(newline);
1119       }
1120     }
1121
1122     return new String(sb);
1123   }
1124
1125   /**
1126    * Flag this file as interleaved or not, based on data format. Throws an
1127    * exception if has previously been determined to be otherwise.
1128    *
1129    * @param isIt
1130    * @param dataLine
1131    * @throws IOException
1132    */
1133   protected void assertInterleaved(boolean isIt, String dataLine)
1134           throws FileFormatException
1135   {
1136     if (this.interleaved != null && isIt != this.interleaved.booleanValue())
1137     {
1138       throw new FileFormatException(
1139               "Parse error: mix of interleaved and noninterleaved detected, at line: "
1140                       + dataLine);
1141     }
1142     this.interleaved = new Boolean(isIt);
1143     setAlignmentProperty(PROP_INTERLEAVED, interleaved.toString());
1144   }
1145
1146   public boolean isInterleaved()
1147   {
1148     return this.interleaved == null ? false : this.interleaved
1149             .booleanValue();
1150   }
1151
1152   /**
1153    * Adds saved parsed values either as alignment properties, or (in some cases)
1154    * as specific member fields of the alignment
1155    */
1156   @Override
1157   public void addProperties(AlignmentI al)
1158   {
1159     super.addProperties(al);
1160     if (this.gapCharacter != null)
1161     {
1162       al.setGapCharacter(gapCharacter);
1163     }
1164
1165     /*
1166      * warn if e.g. DataType=DNA but data is protein (or vice versa)
1167      */
1168     if (this.nucleotide != null && this.nucleotide != al.isNucleotide()) {
1169       System.err.println("Warning: " + this.title + " declared "
1170               + (nucleotide ? "" : " not ") + "nucleotide but it is"
1171               + (nucleotide ? " not" : ""));
1172     }
1173   }
1174
1175   /**
1176    * Print the given alignment in MEGA format. If the alignment was created by
1177    * parsing a MEGA file, it should have properties set (e.g. Title) which can
1178    * influence the output.
1179    */
1180   @Override
1181   public String print(AlignmentI al)
1182   {
1183     this.nucleotide = al.isNucleotide();
1184     String lineLength = (String) al.getProperty(PROP_LINELENGTH);
1185     this.positionsPerLine = lineLength == null ? DEFAULT_LINE_LENGTH : Integer
1186             .parseInt(lineLength);
1187     return printHeaders(al) + print(al.getSequencesArray());
1188   }
1189
1190   /**
1191    * Returns the number of sequence positions output per line
1192    *
1193    * @return
1194    */
1195   public int getPositionsPerLine()
1196   {
1197     return positionsPerLine;
1198   }
1199
1200   /**
1201    * Sets the number of sequence positions output per line. Note these will be
1202    * formatted in blocks of 3 (nucleotide) or 10 (peptide).
1203    *
1204    * @param p
1205    */
1206   public void setPositionsPerLine(int p)
1207   {
1208     this.positionsPerLine = p;
1209   }
1210 }