src/jalview/io/MegaFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.0b1)
   3  * Copyright (C) 2014 The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  10  *
  11  * Jalview is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty
  13  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  14  * PURPOSE.  See the GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  17  * The Jalview Authors are detailed in the 'AUTHORS' file.
  18  */
  19 package jalview.io;
  20
  21 import jalview.datamodel.AlignmentI;
  22 import jalview.datamodel.Sequence;
  23 import jalview.datamodel.SequenceI;
  24
  25 import java.io.IOException;
  26 import java.util.LinkedHashMap;
  27 import java.util.Map;
  28 import java.util.Map.Entry;
  29 import java.util.Set;
  30
  31 /**
  32  * A parser for input or output of MEGA format files. <br>
  33  * <br>
  34  * Tamura K, Stecher G, Peterson D, Filipski A, and Kumar S (2013) MEGA6:
  35  * Molecular Evolutionary Genetics Analysis Version 6.0. Molecular Biology and
  36  * Evolution 30: 2725-2729. <br>
  37  * <br>
  38  *
  39  * MEGA file format is supported as described in
  40  * http://www.megasoftware.net/manual.pdf <br>
  41  * Limitations:
  42  * <ul>
  43  * <li>nested comments (marked by [ ]) are accepted but not preserved</li>
  44  * <li>to be completed</li>
  45  * </ul>
  46  *
  47  * @see http://www.megasoftware.net/
  48  */
  49 public class MegaFile extends AlignFile
  50 {
  51   private static final int DEFAULT_LINE_LENGTH = 60;
  52
  53   private static final String INDENT = "    ";
  54
  55   private static final String N_SITES = "NSites";
  56
  57   private static final String N_SEQS = "NSeqs";
  58
  59   private static final String MISSING = "Missing";
  60
  61   private static final String IDENTICAL = "Identical";
  62
  63   private static final String INDEL = "Indel";
  64
  65   private static final String CODETABLE = "CodeTable";
  66
  67   private static final String PROTEIN = "Protein";
  68
  69   private static final String NUCLEOTIDE = "Nucleotide";
  70
  71   private static final String DATATYPE = "DataType";
  72
  73   private static final char COMMENT_START = '[';
  74
  75   private static final char COMMENT_END = ']';
  76
  77   private static final String HASHSIGN = "#";
  78
  79   private static final String SEMICOLON = ";";
  80
  81   private static final String BANG = "!";
  82
  83   private static final String EQUALS = "=";
  84
  85   private static final String MEGA_ID = HASHSIGN + "MEGA";
  86
  87   private static final String TITLE = "Title";
  88
  89   private static final String FORMAT = "Format";
  90
  91   private static final String DESCRIPTION = "Description";
  92
  93   private static final String GENE = "Gene";
  94
  95   private static final String DOMAIN = "Domain";
  96
  97   /*
  98    * names of properties to save to the alignment (may affect eventual output
  99    * format)
 100    */
 101   static final String PROP_TITLE = "MEGA_TITLE";
 102
 103   static final String PROP_INTERLEAVED = "MEGA_INTERLEAVED";
 104
 105   static final String PROP_DESCRIPTION = "MEGA_DESCRIPTION";
 106
 107   static final String PROP_CODETABLE = "MEGA_CODETABLE";
 108
 109   static final String PROP_IDENTITY = "MEGA_IDENTITY";
 110
 111   static final String PROP_MISSING = "MEGA_MISSING";
 112
 113   static final String PROP_DATATYPE = "MEGA_DATATYPE";
 114
 115   // number of bases per line of file (value is inferred)
 116   static final String PROP_LINELENGTH = "MEGA_LINELENGTH";
 117
 118   // TODO: need a controlled name for Gene as a feature if we want to be able to
 119   // output the MEGA file with !Gene headers
 120   // WTF do we do if the sequences get realigned?
 121
 122   // initial size for sequence data buffer
 123   private static final int SEQBUFFERSIZE = 256;
 124
 125   private static final String SPACE = " ";
 126
 127   /*
 128    * number of sequence positions output per line
 129    */
 130   private int positionsPerLine;
 131
 132   private String title;
 133
 134   // gap character may be explicitly declared, if not we infer it
 135   private Character gapCharacter;
 136
 137   // identity character if declared
 138   private char identityCharacter = 0;
 139
 140   // this can be True, False or null (meaning not asserted in file)
 141   private Boolean nucleotide;
 142
 143   // set once we have seen one block of interleaved data
 144   private boolean firstDataBlockRead = false;
 145
 146   // this can be True, False or null (meaning we don't know yet)
 147   private Boolean interleaved;
 148
 149   // write end of line positions as a comment
 150   private boolean writePositionNumbers = true;
 151
 152   public MegaFile()
 153   {
 154   }
 155
 156   public MegaFile(String inFile, String type) throws IOException
 157   {
 158     super(inFile, type);
 159   }
 160
 161   public MegaFile(FileParse source) throws IOException
 162   {
 163     super(source);
 164   }
 165
 166   /**
 167    * Parse the input stream.
 168    */
 169   @Override
 170   public void parse() throws IOException
 171   {
 172     /*
 173      * Read and process MEGA and Title/Format/Description headers if present.
 174      * Returns the first data line following the headers.
 175      */
 176     String dataLine = parseHeaderLines();
 177
 178     /*
 179      * Temporary store of {sequenceId, positionData} while parsing interleaved
 180      * sequences; sequences are maintained in the order in which they are added
 181      * i.e. read in the file
 182      */
 183     Map<String, StringBuilder> seqData = new LinkedHashMap<String, StringBuilder>();
 184
 185     /*
 186      * The id of the sequence being read (for non-interleaved)
 187      */
 188     String currentId = "";
 189
 190     while (dataLine != null)
 191     {
 192       dataLine = dataLine.trim();
 193       if (dataLine.length() > 0)
 194       {
 195         if (dataLine.startsWith(BANG + GENE))
 196         {
 197           parseGene(dataLine);
 198         }
 199         else if (dataLine.startsWith(BANG + DOMAIN))
 200         {
 201           parseDomain(dataLine);
 202         }
 203         else
 204         {
 205           currentId = parseDataLine(dataLine, seqData, currentId);
 206         }
 207       }
 208       else if (!seqData.isEmpty())
 209       {
 210         /*
 211          * Blank line after processing some data...
 212          */
 213         this.firstDataBlockRead = true;
 214       }
 215       dataLine = nextNonCommentLine();
 216     }
 217
 218     // remember the (longest) line length read in, so we can output the same
 219     setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
 220
 221     setSequences(seqData);
 222   }
 223
 224   /**
 225    * Parse a !Gene command line
 226    *
 227    * @param dataLine
 228    */
 229   protected void parseGene(String dataLine)
 230   {
 231   }
 232
 233   /**
 234    * Parse a !Domain command line
 235    *
 236    * @param dataLine
 237    */
 238   private void parseDomain(String dataLine)
 239   {
 240   }
 241
 242   /**
 243    * Returns the next line that is not a comment, or null at end of file.
 244    * Comments in MEGA are within [ ] brackets, and may be nested.
 245    *
 246    * @return
 247    * @throws IOException
 248    */
 249   protected String nextNonCommentLine() throws IOException
 250   {
 251     return nextNonCommentLine(0);
 252   }
 253
 254   /**
 255    * Returns the next line that is not a comment, or null at end of file.
 256    * Comments in MEGA are within [ ] brackets, and may be nested.
 257    *
 258    * @param depth
 259    *          current depth of nesting of comments while parsing
 260    * @return
 261    * @throws IOException
 262    */
 263   protected String nextNonCommentLine(final int depth) throws IOException
 264   {
 265     String data = null;
 266     data = nextLine();
 267     if (data == null)
 268     {
 269       if (depth > 0)
 270       {
 271         System.err.println("Warning: unterminated comment in data file");
 272       }
 273       return data;
 274     }
 275     int leftBracket = data.indexOf(COMMENT_START);
 276
 277     /*
 278      * reject unnested comment following data on the same line
 279      */
 280     if (depth == 0 && leftBracket > 0)
 281     {
 282       throw new FileFormatException(
 283               "Can't parse comment following data at " + data);
 284     }
 285
 286     /*
 287      * If we are in a (possibly nested) comment after parsing this line, keep
 288      * reading recursively until the comment has unwound
 289      */
 290     int newDepth = commentDepth(data, depth);
 291     if (newDepth > 0)
 292     {
 293       return nextNonCommentLine(newDepth);
 294     }
 295     else
 296     {
 297       /*
 298        * not in a comment by end of this line; return what is left (or the next
 299        * line if that is empty)
 300        */
 301       String nonCommentPart = getNonCommentContent(data, depth);
 302       // if (nonCommentPart.length() > 0)
 303       // {
 304         return nonCommentPart;
 305       // }
 306       // return nextNonCommentLine(0);
 307     }
 308   }
 309
 310   /**
 311    * Returns what is left of the input data after removing any comments, whether
 312    * 'in progress' from preceding lines, or embedded in the current line
 313    *
 314    * @param data
 315    *          input data
 316    * @param depth
 317    *          nested depth of comments pending termination
 318    * @return
 319    * @throws FileFormatException
 320    */
 321   protected static String getNonCommentContent(String data, int depth)
 322           throws FileFormatException
 323   {
 324     int len = data.length();
 325     StringBuilder result = new StringBuilder(len);
 326     for (int i = 0; i < len; i++)
 327     {
 328       char c = data.charAt(i);
 329       switch (c)
 330       {
 331       case COMMENT_START:
 332         depth++;
 333         break;
 334
 335       case COMMENT_END:
 336         if (depth > 0)
 337         {
 338           depth--;
 339         }
 340         else
 341         {
 342           result.append(c);
 343         }
 344         break;
 345
 346       default:
 347         if (depth == 0)
 348         {
 349           result.append(c);
 350         }
 351       }
 352     }
 353     return result.toString();
 354   }
 355
 356   /**
 357    * Calculates new depth of comment after parsing an input line i.e. the excess
 358    * of opening '[' over closing ']' characters. Any excess ']' are ignored (not
 359    * treated as comment delimiters).
 360    *
 361    * @param data
 362    *          input line
 363    * @param depth
 364    *          current comment nested depth before parsing the line
 365    * @return new depth after parsing the line
 366    */
 367   protected static int commentDepth(CharSequence data, int depth)
 368   {
 369     int newDepth = depth;
 370     int len = data.length();
 371     for (int i = 0; i < len; i++)
 372     {
 373       char c = data.charAt(i);
 374       if (c == COMMENT_START)
 375       {
 376         newDepth++;
 377       }
 378       else if (c == COMMENT_END && newDepth > 0)
 379       {
 380         newDepth--;
 381       }
 382     }
 383     return newDepth;
 384   }
 385
 386   /**
 387    * Convert the parsed sequence strings to objects and store them in the model.
 388    *
 389    * @param seqData
 390    */
 391   protected void setSequences(Map<String, StringBuilder> seqData)
 392   {
 393     Set<Entry<String, StringBuilder>> datasets = seqData.entrySet();
 394
 395     for (Entry<String, StringBuilder> dataset : datasets)
 396     {
 397       String sequenceId = dataset.getKey();
 398       StringBuilder characters = dataset.getValue();
 399       SequenceI s = new Sequence(sequenceId, new String(characters));
 400       this.seqs.addElement(s);
 401     }
 402   }
 403
 404   /**
 405    * Process one line of sequence data. If it has no sequence identifier, append
 406    * to the current id's sequence. Else parse out the sequence id and append the
 407    * data (if any) to that id's sequence. Returns the sequence id (implicit or
 408    * explicit) for this line.
 409    *
 410    * @param dataLine
 411    * @param seqData
 412    * @param currentid
 413    * @return
 414    * @throws IOException
 415    */
 416   protected String parseDataLine(String dataLine,
 417           Map<String, StringBuilder> seqData, String currentId)
 418           throws IOException
 419   {
 420     String seqId = getSequenceId(dataLine);
 421     if (seqId == null)
 422     {
 423       /*
 424        * Just character data
 425        */
 426       parseNoninterleavedDataLine(dataLine, seqData, currentId);
 427       return currentId;
 428     }
 429     else if ((HASHSIGN + seqId).trim().equals(dataLine.trim()))
 430     {
 431       /*
 432        * Sequence id only - header line for noninterleaved data
 433        */
 434       return seqId;
 435     }
 436     else
 437     {
 438       /*
 439        * Sequence id followed by data
 440        */
 441       parseInterleavedDataLine(dataLine, seqData, seqId);
 442       return seqId;
 443     }
 444   }
 445
 446   /**
 447    * Add a line of sequence data to the buffer for the given sequence id. Start
 448    * a new one if we haven't seen it before.
 449    *
 450    * @param dataLine
 451    * @param seqData
 452    * @param currentId
 453    * @throws IOException
 454    */
 455   protected void parseNoninterleavedDataLine(String dataLine,
 456           Map<String, StringBuilder> seqData, String currentId)
 457           throws IOException
 458   {
 459     if (currentId == null)
 460     {
 461       /*
 462        * Oops. Data but no sequence id context.
 463        */
 464       throw new IOException("No sequence id context at: " + dataLine);
 465     }
 466
 467     assertInterleaved(false, dataLine);
 468
 469     StringBuilder sb = getSequenceDataBuffer(seqData, currentId);
 470
 471     dataLine = reformatSequenceData(dataLine, sb.length(), seqData);
 472     sb.append(dataLine);
 473
 474     setPositionsPerLine(Math.max(positionsPerLine, dataLine.length()));
 475   }
 476
 477   /**
 478    * Get the sequence data for this sequence id, starting a new one if
 479    * necessary.
 480    *
 481    * @param seqData
 482    * @param currentId
 483    * @return
 484    */
 485   protected StringBuilder getSequenceDataBuffer(
 486           Map<String, StringBuilder> seqData, String currentId)
 487   {
 488     StringBuilder sb = seqData.get(currentId);
 489     if (sb == null)
 490     {
 491       // first data met for this sequence id, start a new buffer
 492       sb = new StringBuilder(SEQBUFFERSIZE);
 493       seqData.put(currentId, sb);
 494     }
 495     return sb;
 496   }
 497
 498   /**
 499    * Parse one line of interleaved data e.g.
 500    *
 501    * <pre>
 502    * #TheSeqId CGATCGCATGCA
 503    * </pre>
 504    *
 505    * @param dataLine
 506    * @param seqData
 507    * @param seqId
 508    * @throws IOException
 509    */
 510   protected void parseInterleavedDataLine(String dataLine,
 511           Map<String, StringBuilder> seqData, String seqId)
 512           throws IOException
 513   {
 514     /*
 515      * New sequence found in second or later data block - error.
 516      */
 517     if (this.firstDataBlockRead && !seqData.containsKey(seqId))
 518     {
 519       throw new IOException(
 520               "Parse error: misplaced new sequence starting at " + dataLine);
 521     }
 522
 523     StringBuilder sb = getSequenceDataBuffer(seqData, seqId);
 524     String data = dataLine.substring(seqId.length() + 1).trim();
 525
 526     /*
 527      * Do nothing if this line is _only_ a sequence id with no data following.
 528      *
 529      * Remove any internal spaces
 530      */
 531     if (data != null && data.length() > 0)
 532     {
 533       data = reformatSequenceData(data, sb.length(), seqData);
 534       sb.append(data);
 535       setPositionsPerLine(Math.max(positionsPerLine, data.length()));
 536       assertInterleaved(true, dataLine);
 537     }
 538   }
 539
 540   /**
 541    * Reformat input sequence data by removing any internal formatting spaces,
 542    * and converting any 'identity' characters to the corresponding position in
 543    * the first sequence.
 544    *
 545    * @param data
 546    * @param startPos
 547    *          the sequence position (base 0) of the start of the data
 548    * @param seqData
 549    * @return
 550    */
 551   protected String reformatSequenceData(String data, int startPos, Map<String, StringBuilder> seqData)
 552   {
 553     String formatted = data.replace(SPACE, "");
 554     if (formatted.indexOf(identityCharacter) > -1)
 555     {
 556       /*
 557        * sequence contains '.' or other identity symbol; replace these with the
 558        * same position from the first (reference) sequence
 559        */
 560       StringBuilder referenceSequence = seqData.values().iterator().next();
 561       StringBuilder sb = new StringBuilder(formatted.length());
 562       for (int i = 0 ; i < formatted.length() ; i++) {
 563         char nextChar = formatted.charAt(i);
 564         if (nextChar != identityCharacter) {
 565           sb.append(nextChar);
 566         } else {
 567           sb.append(referenceSequence.charAt(startPos + i));
 568         }
 569       }
 570       formatted = sb.toString();
 571     }
 572     return formatted;
 573   }
 574
 575   /**
 576    * If the line begins with (e.g.) "#abcde " then returns "abcde" as the
 577    * identifier. Else returns null.
 578    *
 579    * @param dataLine
 580    * @return
 581    */
 582   public static String getSequenceId(String dataLine)
 583   {
 584     // TODO refactor to a StringUtils type class
 585     if (dataLine != null)
 586     {
 587       if (dataLine.startsWith(HASHSIGN))
 588       {
 589         int spacePos = dataLine.indexOf(" ");
 590         return (spacePos == -1 ? dataLine.substring(1) : dataLine
 591                 .substring(1, spacePos));
 592       }
 593     }
 594     return null;
 595   }
 596
 597   /**
 598    * Read the #MEGA and Title/Format/Description header lines (if present).
 599    *
 600    * Save as alignment properties in case useful.
 601    *
 602    * @return the next non-blank line following the header lines.
 603    * @throws IOException
 604    */
 605   protected String parseHeaderLines() throws IOException
 606   {
 607     String inputLine = null;
 608     while ((inputLine = nextNonCommentLine()) != null)
 609     {
 610       inputLine = inputLine.trim();
 611
 612       /*
 613        * skip blank lines
 614        */
 615       if (inputLine.length() == 0)
 616       {
 617         continue;
 618       }
 619
 620       if (inputLine.toUpperCase().startsWith(MEGA_ID))
 621       {
 622         continue;
 623       }
 624
 625       if (isTitle(inputLine))
 626       {
 627         this.title = getValue(inputLine);
 628         setAlignmentProperty(PROP_TITLE, title);
 629       }
 630       else if (inputLine.startsWith(BANG + DESCRIPTION))
 631       {
 632         parseDescription(inputLine);
 633       }
 634
 635       else if (inputLine.startsWith(BANG + FORMAT))
 636       {
 637         parseFormat(inputLine);
 638       }
 639       else if (!inputLine.toUpperCase().startsWith(MEGA_ID))
 640       {
 641
 642         /*
 643          * Return the first 'data line' i.e. one that is not blank, #MEGA or
 644          * TITLE:
 645          */
 646         break;
 647       }
 648     }
 649     return inputLine;
 650   }
 651
 652   /**
 653    * Parse a !Format statement. This may be multiline, and is ended by a
 654    * semicolon.
 655    *
 656    * @param inputLine
 657    * @throws IOException
 658    */
 659   protected void parseFormat(String inputLine) throws IOException
 660   {
 661     while (inputLine != null)
 662     {
 663       parseFormatLine(inputLine);
 664       if (inputLine.endsWith(SEMICOLON))
 665       {
 666         break;
 667       }
 668       inputLine = nextNonCommentLine();
 669     }
 670   }
 671
 672   /**
 673    * Parse one line of a !Format statement. This may contain one or more
 674    * keyword=value pairs.
 675    *
 676    * @param inputLine
 677    * @throws FileFormatException
 678    */
 679   protected void parseFormatLine(String inputLine)
 680           throws FileFormatException
 681   {
 682     if (inputLine.startsWith(BANG + FORMAT))
 683     {
 684       inputLine = inputLine.substring((BANG + FORMAT).length());
 685     }
 686     if (inputLine.endsWith(SEMICOLON))
 687     {
 688       inputLine = inputLine.substring(0, inputLine.length() - 1);
 689     }
 690     if (inputLine.length() == 0)
 691     {
 692       return;
 693     }
 694     String[] tokens = inputLine.trim().split("\\s"); // any whitespace
 695     for (String token : tokens)
 696     {
 697       parseFormatKeyword(token);
 698     }
 699   }
 700
 701   /**
 702    * Parse a Keyword=Value token. Possible keywords are
 703    * <ul>
 704    * <li>DataType= DNA, RNA, Nucleotide, Protein</li>
 705    * <li>DataFormat= Interleaved, ?</li>
 706    * <li>NSeqs= number of sequences (synonym NTaxa)</li>
 707    * <li>NSites= number of bases / residues</li>
 708    * <li>Property= Exon (or Coding), Intron (or Noncoding), End (of domain)</li>
 709    * <li>Indel= gap character</li>
 710    * <li>Identical= identity character (to first sequence) (synonym MatchChar)</li>
 711    * <li>Missing= missing data character</li>
 712    * <li>CodeTable= Standard, other (MEGA supports various)</li>
 713    * </ul>
 714    *
 715    * @param token
 716    * @throws FileFormatException
 717    *           if an unrecognised keyword or value is encountered
 718    */
 719   protected void parseFormatKeyword(String token)
 720           throws FileFormatException
 721   {
 722     String msg = "Unrecognised Format command: " + token;
 723     String[] bits = token.split(EQUALS);
 724     if (bits.length != 2)
 725     {
 726       throw new FileFormatException(msg);
 727     }
 728     String keyword = bits[0];
 729     String value = bits[1];
 730
 731     /*
 732      * Jalview will work out whether nucleotide or not anyway
 733      */
 734     if (keyword.equalsIgnoreCase(DATATYPE))
 735     {
 736       if (value.equalsIgnoreCase("DNA") || value.equalsIgnoreCase("RNA")
 737               || value.equalsIgnoreCase("Nucleotide"))
 738       {
 739         this.nucleotide = true;
 740         // alignment computes whether or not it is nucleotide when created
 741       }
 742       else if (value.equalsIgnoreCase(PROTEIN))
 743       {
 744         this.nucleotide = false;
 745       }
 746       else
 747       {
 748         throw new FileFormatException(msg);
 749       }
 750       setAlignmentProperty(PROP_DATATYPE, value);
 751     }
 752
 753     /*
 754      * accept non-Standard code table but save in case we want to disable
 755      * 'translate as cDNA'
 756      */
 757     else if (keyword.equalsIgnoreCase(CODETABLE))
 758     {
 759       setAlignmentProperty(PROP_CODETABLE, value);
 760     }
 761
 762     /*
 763      * save gap char to set later on alignment once created
 764      */
 765     else if (keyword.equalsIgnoreCase(INDEL))
 766     {
 767       this.gapCharacter = value.charAt(0);
 768     }
 769
 770     else if (keyword.equalsIgnoreCase(IDENTICAL)
 771             || keyword.equalsIgnoreCase("MatchChar"))
 772     {
 773       setAlignmentProperty(PROP_IDENTITY, value);
 774       this.identityCharacter = value.charAt(0);
 775       if (!".".equals(value))
 776       {
 777         System.err.println("Warning: " + token
 778                 + " not supported, Jalview uses '.' for identity");
 779       }
 780     }
 781
 782     else if (keyword.equalsIgnoreCase(MISSING))
 783     {
 784       setAlignmentProperty(PROP_MISSING, value);
 785       System.err.println("Warning: " + token + " not supported");
 786     }
 787
 788     else if (keyword.equalsIgnoreCase("Property"))
 789     {
 790       // TODO: figure out what to do with this
 791       // can it appear more than once in a file?
 792       setAlignmentProperty(PROP_MISSING, value);
 793     }
 794
 795     else if (!keyword.equalsIgnoreCase(N_SEQS)
 796             && !keyword.equalsIgnoreCase(N_SITES))
 797     {
 798       System.err.println("Warning: " + msg);
 799     }
 800   }
 801
 802   /**
 803    * Returns the trimmed data on the line following either whitespace or '=',
 804    * with any trailing semi-colon removed<br>
 805    * So
 806    * <ul>
 807    * <li>Hello World</li>
 808    * <li>!Hello: \tWorld;</li>
 809    * <li>!Hello=World</li>
 810    * <ul>
 811    * should all return "World"
 812    *
 813    * @param inputLine
 814    * @return
 815    */
 816   protected static String getValue(String inputLine)
 817   {
 818     if (inputLine == null)
 819     {
 820       return null;
 821     }
 822     String value = null;
 823     String s = inputLine.replaceAll("\t", " ").trim();
 824
 825     /*
 826      * KEYWORD = VALUE should return VALUE
 827      */
 828     int equalsPos = s.indexOf("=");
 829     if (equalsPos >= 0)
 830     {
 831       value = s.substring(equalsPos + 1);
 832     }
 833     else
 834     {
 835       int spacePos = s.indexOf(' ');
 836       value = spacePos == -1 ? "" : s.substring(spacePos + 1);
 837     }
 838     value = value.trim();
 839     if (value.endsWith(SEMICOLON))
 840     {
 841       value = value.substring(0, value.length() - 1).trim();
 842     }
 843     return value;
 844   }
 845
 846   /**
 847    * Returns true if the input line starts with "TITLE" or "!TITLE" (not case
 848    * sensitive). The latter is the official format, some older data file
 849    * examples have it without the !.
 850    *
 851    * @param inputLine
 852    * @return
 853    */
 854   protected static boolean isTitle(String inputLine)
 855   {
 856     if (inputLine == null)
 857     {
 858       return false;
 859     }
 860     String upper = inputLine.toUpperCase();
 861     return (upper.startsWith(TITLE.toUpperCase()) || upper.startsWith(BANG
 862             + TITLE.toUpperCase()));
 863   }
 864
 865   /**
 866    * Reads lines until terminated by semicolon, appending each to the
 867    * Description property value.
 868    *
 869    * @throws IOException
 870    */
 871   protected void parseDescription(String firstDescriptionLine)
 872           throws IOException
 873   {
 874     StringBuilder desc = new StringBuilder(256);
 875     String line = getValue(firstDescriptionLine);
 876     while (line != null)
 877     {
 878       if (line.endsWith(SEMICOLON))
 879       {
 880         desc.append(line.substring(0, line.length() - 1));
 881         break;
 882       }
 883       else if (line.length() > 0)
 884       {
 885         desc.append(line).append(newline);
 886       }
 887       line = nextNonCommentLine();
 888     }
 889     setAlignmentProperty(PROP_DESCRIPTION, desc.toString());
 890   }
 891
 892   /**
 893    * Returns the alignment sequences in Mega format.
 894    */
 895   @Override
 896   public String print()
 897   {
 898     return MEGA_ID + newline + print(getSeqsAsArray());
 899   }
 900
 901   /**
 902    * Write out the alignment sequences in Mega format - interleaved unless
 903    * explicitly noninterleaved.
 904    */
 905   protected String print(SequenceI[] s)
 906   {
 907     String result;
 908     if (this.interleaved != null && !this.interleaved)
 909     {
 910       result = printNonInterleaved(s);
 911     }
 912     else
 913     {
 914       result = printInterleaved(s);
 915     }
 916     return result;
 917   }
 918
 919   /**
 920    * Print to string in Interleaved format - blocks of next N characters of each
 921    * sequence in turn.
 922    *
 923    * @param s
 924    */
 925   protected String printInterleaved(SequenceI[] s)
 926   {
 927     int maxIdLength = getMaxIdLength(s);
 928     int maxSequenceLength = getMaxSequenceLength(s);
 929     int numLines = maxSequenceLength / positionsPerLine + 3; // approx
 930
 931     int numDataBlocks = (maxSequenceLength - 1) / positionsPerLine + 1;
 932     int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
 933     int chunksPerLine = (positionsPerLine + spaceEvery - 1) / spaceEvery;
 934
 935     /*
 936      * Roughly size a buffer to hold the whole output
 937      */
 938     StringBuilder sb = new StringBuilder(numLines
 939             * (maxIdLength + positionsPerLine + chunksPerLine + 10));
 940
 941     /*
 942      * Output as: #Seqid CGT AGC ACT ... or blocks of 10 for peptide
 943      */
 944     int from = 0;
 945     for (int i = 0; i < numDataBlocks; i++)
 946     {
 947       sb.append(newline);
 948       boolean first = true;
 949       int advancedBy = 0;
 950       for (SequenceI seq : s)
 951       {
 952         int seqFrom = from;
 953         String seqId = String.format("#%-" + maxIdLength + "s",
 954                 seq.getName());
 955
 956         /*
 957          * output next line for this sequence
 958          */
 959         sb.append(seqId);
 960         int lastPos = seqFrom + positionsPerLine; // exclusive
 961         for (int j = 0; j < chunksPerLine; j++)
 962         {
 963           char[] subSequence = seq.getSequence(seqFrom,
 964                   Math.min(lastPos, seqFrom + spaceEvery));
 965           if (subSequence.length > 0)
 966           {
 967             sb.append(SPACE).append(subSequence);
 968           }
 969           seqFrom += subSequence.length;
 970           if (first)
 971           {
 972             // all sequences should be the same length in MEGA
 973             advancedBy += subSequence.length;
 974           }
 975         }
 976         // write last position as a comment
 977         if (writePositionNumbers)
 978         {
 979           sb.append(SPACE).append(COMMENT_START).append(from + advancedBy)
 980                   .append(COMMENT_END);
 981         }
 982         sb.append(newline);
 983         first = false;
 984       }
 985       from += advancedBy;
 986     }
 987
 988     return new String(sb);
 989   }
 990
 991   /**
 992    * Outputs to string the MEGA header and any other known and relevant
 993    * alignment properties
 994    *
 995    * @param al
 996    */
 997   protected String printHeaders(AlignmentI al)
 998   {
 999     StringBuilder sb = new StringBuilder(128);
1000     sb.append(MEGA_ID).append(newline);
1001     String propertyValue = (String) al.getProperty(PROP_TITLE);
1002     if (propertyValue != null)
1003     {
1004       sb.append(BANG).append(TITLE).append(SPACE)
1005 .append(propertyValue)
1006               .append(SEMICOLON)
1007               .append(newline);
1008     }
1009     propertyValue = (String) al.getProperty(PROP_DESCRIPTION);
1010     if (propertyValue != null)
1011     {
1012       sb.append(BANG).append(DESCRIPTION).append(newline)
1013               .append(propertyValue).append(SEMICOLON)
1014               .append(newline);
1015     }
1016
1017     /*
1018      * !Format DataType CodeTable
1019      */
1020     sb.append(BANG).append(FORMAT).append(newline);
1021     String dataType = (String) al.getProperty(PROP_DATATYPE);
1022     if (dataType == null)
1023     {
1024       dataType = al.isNucleotide() ? NUCLEOTIDE : PROTEIN;
1025     }
1026     sb.append(INDENT).append(DATATYPE).append(EQUALS).append(dataType);
1027     String codeTable = (String) al.getProperty(PROP_CODETABLE);
1028     sb.append(SPACE).append(CODETABLE).append(EQUALS)
1029             .append(codeTable == null ? "Standard" : codeTable)
1030             .append(newline);
1031
1032     /*
1033      * !Format NSeqs NSites
1034      * NSites the length of any sequence (they should all be the same), excluding
1035      * gaps?!?
1036      */
1037     sb.append(INDENT).append(N_SEQS).append(EQUALS).append(al.getHeight());
1038     SequenceI seq = al.getSequenceAt(0);
1039     sb.append(SPACE).append(N_SITES).append(EQUALS)
1040             .append(seq.getEnd() - seq.getStart() + 1);
1041     sb.append(newline);
1042
1043     /*
1044      * !Format Indel Identical Missing
1045      */
1046     sb.append(INDENT);
1047     sb.append(INDEL).append(EQUALS).append(al.getGapCharacter());
1048     String identity = (String) al.getProperty(PROP_IDENTITY);
1049     if (identity != null)
1050     {
1051       sb.append(SPACE).append(IDENTICAL).append(EQUALS).append(identity);
1052     }
1053     String missing = (String) al.getProperty(PROP_MISSING);
1054     if (missing != null)
1055     {
1056       sb.append(SPACE).append(MISSING).append(EQUALS).append(missing);
1057     }
1058     sb.append(SEMICOLON).append(newline);
1059
1060     return sb.toString();
1061   }
1062
1063   /**
1064    * Get the longest sequence id (to allow aligned printout).
1065    *
1066    * @param s
1067    * @return
1068    */
1069   protected static int getMaxIdLength(SequenceI[] s)
1070   {
1071     // TODO pull up for reuse
1072     int maxLength = 0;
1073     for (SequenceI seq : s)
1074     {
1075       int len = seq.getName().length();
1076       if (len > maxLength)
1077       {
1078         maxLength = len;
1079       }
1080     }
1081     return maxLength;
1082   }
1083
1084   /**
1085    * Get the longest sequence length
1086    *
1087    * @param s
1088    * @return
1089    */
1090   protected static int getMaxSequenceLength(SequenceI[] s)
1091   {
1092     // TODO pull up for reuse
1093     int maxLength = 0;
1094     for (SequenceI seq : s)
1095     {
1096       int len = seq.getLength();
1097       if (len > maxLength)
1098       {
1099         maxLength = len;
1100       }
1101     }
1102     return maxLength;
1103   }
1104
1105   /**
1106    * Print to string in noninterleaved format - all of each sequence in turn, in
1107    * blocks of 50 characters.
1108    *
1109    * @param s
1110    * @return
1111    */
1112   protected String printNonInterleaved(SequenceI[] s)
1113   {
1114     int maxSequenceLength = getMaxSequenceLength(s);
1115     // approx
1116     int numLines = maxSequenceLength / positionsPerLine + 2 + s.length;
1117
1118     /*
1119      * Roughly size a buffer to hold the whole output
1120      */
1121     StringBuilder sb = new StringBuilder(numLines * positionsPerLine);
1122
1123     int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
1124     int chunksPerLine = positionsPerLine / spaceEvery;
1125     for (SequenceI seq : s)
1126     {
1127       sb.append(newline);
1128       sb.append(HASHSIGN + seq.getName()).append(newline);
1129       int startPos = 0;
1130       while (startPos < seq.getLength())
1131       {
1132         boolean firstChunk = true;
1133         /*
1134          * print next line for this sequence
1135          */
1136         int lastPos = startPos + positionsPerLine; // exclusive
1137         for (int j = 0; j < chunksPerLine; j++)
1138         {
1139           char[] subSequence = seq.getSequence(startPos,
1140                   Math.min(lastPos, startPos + positionsPerLine));
1141           if (subSequence.length > 0)
1142           {
1143             if (!firstChunk)
1144             {
1145               sb.append(SPACE);
1146             }
1147             sb.append(subSequence);
1148             firstChunk = false;
1149           }
1150           startPos += subSequence.length;
1151         }
1152         sb.append(newline);
1153       }
1154     }
1155
1156     return new String(sb);
1157   }
1158
1159   /**
1160    * Flag this file as interleaved or not, based on data format. Throws an
1161    * exception if has previously been determined to be otherwise.
1162    *
1163    * @param isIt
1164    * @param dataLine
1165    * @throws IOException
1166    */
1167   protected void assertInterleaved(boolean isIt, String dataLine)
1168           throws FileFormatException
1169   {
1170     if (this.interleaved != null && isIt != this.interleaved.booleanValue())
1171     {
1172       throw new FileFormatException(
1173               "Parse error: mix of interleaved and noninterleaved detected, at line: "
1174                       + dataLine);
1175     }
1176     this.interleaved = new Boolean(isIt);
1177     setAlignmentProperty(PROP_INTERLEAVED, interleaved.toString());
1178   }
1179
1180   public boolean isInterleaved()
1181   {
1182     return this.interleaved == null ? false : this.interleaved
1183             .booleanValue();
1184   }
1185
1186   /**
1187    * Adds saved parsed values either as alignment properties, or (in some cases)
1188    * as specific member fields of the alignment
1189    */
1190   @Override
1191   public void addProperties(AlignmentI al)
1192   {
1193     super.addProperties(al);
1194     if (this.gapCharacter != null)
1195     {
1196       al.setGapCharacter(gapCharacter);
1197     }
1198
1199     /*
1200      * warn if e.g. DataType=DNA but data is protein (or vice versa)
1201      */
1202     if (this.nucleotide != null && this.nucleotide != al.isNucleotide()) {
1203       System.err.println("Warning: " + this.title + " declared "
1204               + (nucleotide ? "" : " not ") + "nucleotide but it is"
1205               + (nucleotide ? " not" : ""));
1206     }
1207   }
1208
1209   /**
1210    * Print the given alignment in MEGA format. If the alignment was created by
1211    * parsing a MEGA file, it should have properties set (e.g. Title) which can
1212    * influence the output.
1213    */
1214   @Override
1215   public String print(AlignmentI al)
1216   {
1217     this.nucleotide = al.isNucleotide();
1218     String lineLength = (String) al.getProperty(PROP_LINELENGTH);
1219     this.positionsPerLine = lineLength == null ? DEFAULT_LINE_LENGTH : Integer
1220             .parseInt(lineLength);
1221     return printHeaders(al) + print(al.getSequencesArray());
1222   }
1223
1224   /**
1225    * Returns the number of sequence positions output per line
1226    *
1227    * @return
1228    */
1229   public int getPositionsPerLine()
1230   {
1231     return positionsPerLine;
1232   }
1233
1234   /**
1235    * Sets the number of sequence positions output per line. Note these will be
1236    * formatted in blocks of 3 (nucleotide) or 10 (peptide).
1237    *
1238    * @param p
1239    */
1240   public void setPositionsPerLine(int p)
1241   {
1242     this.positionsPerLine = p;
1243   }
1244 }