src/jalview/io/MegaFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.0b1)
   3  * Copyright (C) 2014 The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  10  *
  11  * Jalview is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty
  13  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  14  * PURPOSE.  See the GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  17  * The Jalview Authors are detailed in the 'AUTHORS' file.
  18  */
  19 package jalview.io;
  20
  21 import jalview.datamodel.AlignmentI;
  22 import jalview.datamodel.Sequence;
  23 import jalview.datamodel.SequenceI;
  24
  25 import java.io.IOException;
  26 import java.util.LinkedHashMap;
  27 import java.util.Map;
  28 import java.util.Map.Entry;
  29 import java.util.Set;
  30
  31 /**
  32  * A parser for input or output of MEGA format files. <br>
  33  * <br>
  34  * Tamura K, Stecher G, Peterson D, Filipski A, and Kumar S (2013) MEGA6:
  35  * Molecular Evolutionary Genetics Analysis Version 6.0. Molecular Biology and
  36  * Evolution 30: 2725-2729. <br>
  37  * <br>
  38  *
  39  * MEGA file format is supported as described in
  40  * http://www.megasoftware.net/manual.pdf <br>
  41  * Limitations:
  42  * <ul>
  43  * <li>nested comments (marked by [ ]) are accepted but not preserved</li>
  44  * <li>to be completed</li>
  45  * </ul>
  46  *
  47  * @see http://www.megasoftware.net/
  48  */
  49 public class MegaFile extends AlignFile
  50 {
  51   private static final int DEFAULT_LINE_LENGTH = 60;
  52
  53   private static final String INDENT = "    ";
  54
  55   private static final String N_SITES = "NSites";
  56
  57   private static final String N_SEQS = "NSeqs";
  58
  59   private static final String MISSING = "Missing";
  60
  61   private static final String IDENTICAL = "Identical";
  62
  63   private static final String INDEL = "Indel";
  64
  65   private static final String CODETABLE = "CodeTable";
  66
  67   private static final String PROTEIN = "Protein";
  68
  69   private static final String NUCLEOTIDE = "Nucleotide";
  70
  71   private static final String DATATYPE = "DataType";
  72
  73   private static final char COMMENT_START = '[';
  74
  75   private static final char COMMENT_END = ']';
  76
  77   private static final String HASHSIGN = "#";
  78
  79   private static final String SEMICOLON = ";";
  80
  81   private static final String BANG = "!";
  82
  83   private static final String EQUALS = "=";
  84
  85   private static final String MEGA_ID = HASHSIGN + "MEGA";
  86
  87   private static final String TITLE = "Title";
  88
  89   private static final String FORMAT = "Format";
  90
  91   private static final String DESCRIPTION = "Description";
  92
  93   private static final String GENE = "Gene";
  94
  95   private static final String DOMAIN = "Domain";
  96
  97   /*
  98    * names of properties to save to the alignment (may affect eventual output
  99    * format)
 100    */
 101   static final String PROP_TITLE = "MEGA_TITLE";
 102
 103   static final String PROP_INTERLEAVED = "MEGA_INTERLEAVED";
 104
 105   static final String PROP_DESCRIPTION = "MEGA_DESCRIPTION";
 106
 107   static final String PROP_CODETABLE = "MEGA_CODETABLE";
 108
 109   static final String PROP_IDENTITY = "MEGA_IDENTITY";
 110
 111   static final String PROP_MISSING = "MEGA_MISSING";
 112
 113   static final String PROP_DATATYPE = "MEGA_DATATYPE";
 114
 115   // number of bases per line of file (value is inferred)
 116   static final String PROP_LINELENGTH = "MEGA_LINELENGTH";
 117
 118   // TODO: need a controlled name for Gene as a feature if we want to be able to
 119   // output the MEGA file with !Gene headers
 120   // WTF do we do if the sequences get realigned?
 121
 122   // initial size for sequence data buffer
 123   private static final int SEQBUFFERSIZE = 256;
 124
 125   private static final String SPACE = " ";
 126
 127   /*
 128    * number of sequence positions output per line
 129    */
 130   private int positionsPerLine;
 131
 132   private String title;
 133
 134   // gap character may be explicitly declared, if not we infer it
 135   private Character gapCharacter;
 136
 137   // identity character if declared
 138   private char identityCharacter = 0;
 139
 140   // this can be True, False or null (meaning not asserted in file)
 141   private Boolean nucleotide;
 142
 143   // set once we have seen one block of interleaved data
 144   private boolean firstDataBlockRead = false;
 145
 146   // this can be True, False or null (meaning we don't know yet)
 147   private Boolean interleaved;
 148
 149   // write end of line positions as a comment
 150   private boolean writePositionNumbers = true;
 151
 152   public MegaFile()
 153   {
 154   }
 155
 156   public MegaFile(String inFile, String type) throws IOException
 157   {
 158     super(inFile, type);
 159   }
 160
 161   public MegaFile(FileParse source) throws IOException
 162   {
 163     super(source);
 164   }
 165
 166   /**
 167    * Parse the input stream.
 168    */
 169   @Override
 170   public void parse() throws IOException
 171   {
 172     /*
 173      * Read and process MEGA and Title/Format/Description headers if present.
 174      * Returns the first data line following the headers.
 175      */
 176     String dataLine = parseHeaderLines();
 177
 178     /*
 179      * Temporary store of {sequenceId, positionData} while parsing interleaved
 180      * sequences; sequences are maintained in the order in which they are added
 181      * i.e. read in the file
 182      */
 183     Map<String, StringBuilder> seqData = new LinkedHashMap<String, StringBuilder>();
 184
 185     /*
 186      * The id of the sequence being read (for non-interleaved)
 187      */
 188     String currentId = "";
 189
 190     while (dataLine != null)
 191     {
 192       dataLine = dataLine.trim();
 193       if (dataLine.length() > 0)
 194       {
 195         if (dataLine.startsWith(BANG + GENE))
 196         {
 197           parseGene(dataLine);
 198         }
 199         else if (dataLine.startsWith(BANG + DOMAIN))
 200         {
 201           parseDomain(dataLine);
 202         }
 203         else
 204         {
 205           currentId = parseDataLine(dataLine, seqData, currentId);
 206         }
 207       }
 208       else if (!seqData.isEmpty())
 209       {
 210         /*
 211          * Blank line after processing some data...
 212          */
 213         this.firstDataBlockRead = true;
 214       }
 215       dataLine = nextNonCommentLine();
 216     }
 217
 218     // remember the (longest) line length read in, so we can output the same
 219     setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
 220
 221     setSequences(seqData);
 222   }
 223
 224   /**
 225    * Parse a !Gene command line
 226    *
 227    * @param dataLine
 228    */
 229   protected void parseGene(String dataLine)
 230   {
 231   }
 232
 233   /**
 234    * Parse a !Domain command line
 235    *
 236    * @param dataLine
 237    */
 238   private void parseDomain(String dataLine)
 239   {
 240   }
 241
 242   /**
 243    * Returns the next line that is not a comment, or null at end of file.
 244    * Comments in MEGA are within [ ] brackets, and may be nested.
 245    *
 246    * @return
 247    * @throws IOException
 248    */
 249   protected String nextNonCommentLine() throws IOException
 250   {
 251     return nextNonCommentLine(0);
 252   }
 253
 254   /**
 255    * Returns the next non-comment line (or part line), or null at end of file.
 256    * Comments in MEGA are within [ ] brackets, and may be nested. They may occur
 257    * anywhere within a line (for example at the end with position numbers); this
 258    * method returns the line with any comments removed.
 259    *
 260    * @param depth
 261    *          current depth of nesting of comments while parsing
 262    * @return
 263    * @throws IOException
 264    */
 265   protected String nextNonCommentLine(final int depth) throws IOException
 266   {
 267     String data = null;
 268     data = nextLine();
 269     if (data == null)
 270     {
 271       if (depth > 0)
 272       {
 273         System.err.println("Warning: unterminated comment in data file");
 274       }
 275       return data;
 276     }
 277
 278     /*
 279      * If we are in a (possibly nested) comment after parsing this line, keep
 280      * reading recursively until the comment has unwound
 281      */
 282     int newDepth = commentDepth(data, depth);
 283     if (newDepth > 0)
 284     {
 285       return nextNonCommentLine(newDepth);
 286     }
 287     else
 288     {
 289       /*
 290        * not in a comment by end of this line; return what is left
 291        */
 292       String nonCommentPart = getNonCommentContent(data, depth);
 293       return nonCommentPart;
 294     }
 295   }
 296
 297   /**
 298    * Returns what is left of the input data after removing any comments, whether
 299    * 'in progress' from preceding lines, or embedded in the current line
 300    *
 301    * @param data
 302    *          input data
 303    * @param depth
 304    *          nested depth of comments pending termination
 305    * @return
 306    * @throws FileFormatException
 307    */
 308   protected static String getNonCommentContent(String data, int depth)
 309           throws FileFormatException
 310   {
 311     int len = data.length();
 312     StringBuilder result = new StringBuilder(len);
 313     for (int i = 0; i < len; i++)
 314     {
 315       char c = data.charAt(i);
 316       switch (c)
 317       {
 318       case COMMENT_START:
 319         depth++;
 320         break;
 321
 322       case COMMENT_END:
 323         if (depth > 0)
 324         {
 325           depth--;
 326         }
 327         else
 328         {
 329           result.append(c);
 330         }
 331         break;
 332
 333       default:
 334         if (depth == 0)
 335         {
 336           result.append(c);
 337         }
 338       }
 339     }
 340     return result.toString();
 341   }
 342
 343   /**
 344    * Calculates new depth of comment after parsing an input line i.e. the excess
 345    * of opening '[' over closing ']' characters. Any excess ']' are ignored (not
 346    * treated as comment delimiters).
 347    *
 348    * @param data
 349    *          input line
 350    * @param depth
 351    *          current comment nested depth before parsing the line
 352    * @return new depth after parsing the line
 353    */
 354   protected static int commentDepth(CharSequence data, int depth)
 355   {
 356     int newDepth = depth;
 357     int len = data.length();
 358     for (int i = 0; i < len; i++)
 359     {
 360       char c = data.charAt(i);
 361       if (c == COMMENT_START)
 362       {
 363         newDepth++;
 364       }
 365       else if (c == COMMENT_END && newDepth > 0)
 366       {
 367         newDepth--;
 368       }
 369     }
 370     return newDepth;
 371   }
 372
 373   /**
 374    * Convert the parsed sequence strings to objects and store them in the model.
 375    *
 376    * @param seqData
 377    */
 378   protected void setSequences(Map<String, StringBuilder> seqData)
 379   {
 380     Set<Entry<String, StringBuilder>> datasets = seqData.entrySet();
 381
 382     for (Entry<String, StringBuilder> dataset : datasets)
 383     {
 384       String sequenceId = dataset.getKey();
 385       StringBuilder characters = dataset.getValue();
 386       SequenceI s = new Sequence(sequenceId, new String(characters));
 387       this.seqs.addElement(s);
 388     }
 389   }
 390
 391   /**
 392    * Process one line of sequence data. If it has no sequence identifier, append
 393    * to the current id's sequence. Else parse out the sequence id and append the
 394    * data (if any) to that id's sequence. Returns the sequence id (implicit or
 395    * explicit) for this line.
 396    *
 397    * @param dataLine
 398    * @param seqData
 399    * @param currentid
 400    * @return
 401    * @throws IOException
 402    */
 403   protected String parseDataLine(String dataLine,
 404           Map<String, StringBuilder> seqData, String currentId)
 405           throws IOException
 406   {
 407     String seqId = getSequenceId(dataLine);
 408     if (seqId == null)
 409     {
 410       /*
 411        * Just character data
 412        */
 413       parseNoninterleavedDataLine(dataLine, seqData, currentId);
 414       return currentId;
 415     }
 416     else if ((HASHSIGN + seqId).trim().equals(dataLine.trim()))
 417     {
 418       /*
 419        * Sequence id only - header line for noninterleaved data
 420        */
 421       return seqId;
 422     }
 423     else
 424     {
 425       /*
 426        * Sequence id followed by data
 427        */
 428       parseInterleavedDataLine(dataLine, seqData, seqId);
 429       return seqId;
 430     }
 431   }
 432
 433   /**
 434    * Add a line of sequence data to the buffer for the given sequence id. Start
 435    * a new one if we haven't seen it before.
 436    *
 437    * @param dataLine
 438    * @param seqData
 439    * @param currentId
 440    * @throws IOException
 441    */
 442   protected void parseNoninterleavedDataLine(String dataLine,
 443           Map<String, StringBuilder> seqData, String currentId)
 444           throws IOException
 445   {
 446     if (currentId == null)
 447     {
 448       /*
 449        * Oops. Data but no sequence id context.
 450        */
 451       throw new IOException("No sequence id context at: " + dataLine);
 452     }
 453
 454     assertInterleaved(false, dataLine);
 455
 456     StringBuilder sb = getSequenceDataBuffer(seqData, currentId);
 457
 458     dataLine = reformatSequenceData(dataLine, sb.length(), seqData);
 459     sb.append(dataLine);
 460
 461     setPositionsPerLine(Math.max(positionsPerLine, dataLine.length()));
 462   }
 463
 464   /**
 465    * Get the sequence data for this sequence id, starting a new one if
 466    * necessary.
 467    *
 468    * @param seqData
 469    * @param currentId
 470    * @return
 471    */
 472   protected StringBuilder getSequenceDataBuffer(
 473           Map<String, StringBuilder> seqData, String currentId)
 474   {
 475     StringBuilder sb = seqData.get(currentId);
 476     if (sb == null)
 477     {
 478       // first data met for this sequence id, start a new buffer
 479       sb = new StringBuilder(SEQBUFFERSIZE);
 480       seqData.put(currentId, sb);
 481     }
 482     return sb;
 483   }
 484
 485   /**
 486    * Parse one line of interleaved data e.g.
 487    *
 488    * <pre>
 489    * #TheSeqId CGATCGCATGCA
 490    * </pre>
 491    *
 492    * @param dataLine
 493    * @param seqData
 494    * @param seqId
 495    * @throws IOException
 496    */
 497   protected void parseInterleavedDataLine(String dataLine,
 498           Map<String, StringBuilder> seqData, String seqId)
 499           throws IOException
 500   {
 501     /*
 502      * New sequence found in second or later data block - error.
 503      */
 504     if (this.firstDataBlockRead && !seqData.containsKey(seqId))
 505     {
 506       throw new IOException(
 507               "Parse error: misplaced new sequence starting at " + dataLine);
 508     }
 509
 510     StringBuilder sb = getSequenceDataBuffer(seqData, seqId);
 511     String data = dataLine.substring(seqId.length() + 1).trim();
 512
 513     /*
 514      * Do nothing if this line is _only_ a sequence id with no data following.
 515      *
 516      * Remove any internal spaces
 517      */
 518     if (data != null && data.length() > 0)
 519     {
 520       data = reformatSequenceData(data, sb.length(), seqData);
 521       sb.append(data);
 522       setPositionsPerLine(Math.max(positionsPerLine, data.length()));
 523       assertInterleaved(true, dataLine);
 524     }
 525   }
 526
 527   /**
 528    * Reformat input sequence data by removing any internal formatting spaces,
 529    * and converting any 'identity' characters to the corresponding position in
 530    * the first sequence.
 531    *
 532    * @param data
 533    * @param startPos
 534    *          the sequence position (base 0) of the start of the data
 535    * @param seqData
 536    * @return
 537    */
 538   protected String reformatSequenceData(String data, int startPos, Map<String, StringBuilder> seqData)
 539   {
 540     String formatted = data.replace(SPACE, "");
 541     if (formatted.indexOf(identityCharacter) > -1)
 542     {
 543       /*
 544        * sequence contains '.' or other identity symbol; replace these with the
 545        * same position from the first (reference) sequence
 546        */
 547       StringBuilder referenceSequence = seqData.values().iterator().next();
 548       StringBuilder sb = new StringBuilder(formatted.length());
 549       for (int i = 0 ; i < formatted.length() ; i++) {
 550         char nextChar = formatted.charAt(i);
 551         if (nextChar != identityCharacter) {
 552           sb.append(nextChar);
 553         } else {
 554           sb.append(referenceSequence.charAt(startPos + i));
 555         }
 556       }
 557       formatted = sb.toString();
 558     }
 559     return formatted;
 560   }
 561
 562   /**
 563    * If the line begins with (e.g.) "#abcde " then returns "abcde" as the
 564    * identifier. Else returns null.
 565    *
 566    * @param dataLine
 567    * @return
 568    */
 569   public static String getSequenceId(String dataLine)
 570   {
 571     // TODO refactor to a StringUtils type class
 572     if (dataLine != null)
 573     {
 574       if (dataLine.startsWith(HASHSIGN))
 575       {
 576         int spacePos = dataLine.indexOf(" ");
 577         return (spacePos == -1 ? dataLine.substring(1) : dataLine
 578                 .substring(1, spacePos));
 579       }
 580     }
 581     return null;
 582   }
 583
 584   /**
 585    * Read the #MEGA and Title/Format/Description header lines (if present).
 586    *
 587    * Save as alignment properties in case useful.
 588    *
 589    * @return the next non-blank line following the header lines.
 590    * @throws IOException
 591    */
 592   protected String parseHeaderLines() throws IOException
 593   {
 594     String inputLine = null;
 595     while ((inputLine = nextNonCommentLine()) != null)
 596     {
 597       inputLine = inputLine.trim();
 598
 599       /*
 600        * skip blank lines
 601        */
 602       if (inputLine.length() == 0)
 603       {
 604         continue;
 605       }
 606
 607       if (inputLine.toUpperCase().startsWith(MEGA_ID))
 608       {
 609         continue;
 610       }
 611
 612       if (isTitle(inputLine))
 613       {
 614         this.title = getValue(inputLine);
 615         setAlignmentProperty(PROP_TITLE, title);
 616       }
 617       else if (inputLine.startsWith(BANG + DESCRIPTION))
 618       {
 619         parseDescription(inputLine);
 620       }
 621
 622       else if (inputLine.startsWith(BANG + FORMAT))
 623       {
 624         parseFormat(inputLine);
 625       }
 626       else if (!inputLine.toUpperCase().startsWith(MEGA_ID))
 627       {
 628
 629         /*
 630          * Return the first 'data line' i.e. one that is not blank, #MEGA or
 631          * TITLE:
 632          */
 633         break;
 634       }
 635     }
 636     return inputLine;
 637   }
 638
 639   /**
 640    * Parse a !Format statement. This may be multiline, and is ended by a
 641    * semicolon.
 642    *
 643    * @param inputLine
 644    * @throws IOException
 645    */
 646   protected void parseFormat(String inputLine) throws IOException
 647   {
 648     while (inputLine != null)
 649     {
 650       parseFormatLine(inputLine);
 651       if (inputLine.endsWith(SEMICOLON))
 652       {
 653         break;
 654       }
 655       inputLine = nextNonCommentLine();
 656     }
 657   }
 658
 659   /**
 660    * Parse one line of a !Format statement. This may contain one or more
 661    * keyword=value pairs.
 662    *
 663    * @param inputLine
 664    * @throws FileFormatException
 665    */
 666   protected void parseFormatLine(String inputLine)
 667           throws FileFormatException
 668   {
 669     if (inputLine.startsWith(BANG + FORMAT))
 670     {
 671       inputLine = inputLine.substring((BANG + FORMAT).length());
 672     }
 673     if (inputLine.endsWith(SEMICOLON))
 674     {
 675       inputLine = inputLine.substring(0, inputLine.length() - 1);
 676     }
 677     if (inputLine.length() == 0)
 678     {
 679       return;
 680     }
 681     String[] tokens = inputLine.trim().split("\\s"); // any whitespace
 682     for (String token : tokens)
 683     {
 684       parseFormatKeyword(token);
 685     }
 686   }
 687
 688   /**
 689    * Parse a Keyword=Value token. Possible keywords are
 690    * <ul>
 691    * <li>DataType= DNA, RNA, Nucleotide, Protein</li>
 692    * <li>DataFormat= Interleaved, ?</li>
 693    * <li>NSeqs= number of sequences (synonym NTaxa)</li>
 694    * <li>NSites= number of bases / residues</li>
 695    * <li>Property= Exon (or Coding), Intron (or Noncoding), End (of domain)</li>
 696    * <li>Indel= gap character</li>
 697    * <li>Identical= identity character (to first sequence) (synonym MatchChar)</li>
 698    * <li>Missing= missing data character</li>
 699    * <li>CodeTable= Standard, other (MEGA supports various)</li>
 700    * </ul>
 701    *
 702    * @param token
 703    * @throws FileFormatException
 704    *           if an unrecognised keyword or value is encountered
 705    */
 706   protected void parseFormatKeyword(String token)
 707           throws FileFormatException
 708   {
 709     String msg = "Unrecognised Format command: " + token;
 710     String[] bits = token.split(EQUALS);
 711     if (bits.length != 2)
 712     {
 713       throw new FileFormatException(msg);
 714     }
 715     String keyword = bits[0];
 716     String value = bits[1];
 717
 718     /*
 719      * Jalview will work out whether nucleotide or not anyway
 720      */
 721     if (keyword.equalsIgnoreCase(DATATYPE))
 722     {
 723       if (value.equalsIgnoreCase("DNA") || value.equalsIgnoreCase("RNA")
 724               || value.equalsIgnoreCase("Nucleotide"))
 725       {
 726         this.nucleotide = true;
 727         // alignment computes whether or not it is nucleotide when created
 728       }
 729       else if (value.equalsIgnoreCase(PROTEIN))
 730       {
 731         this.nucleotide = false;
 732       }
 733       else
 734       {
 735         throw new FileFormatException(msg);
 736       }
 737       setAlignmentProperty(PROP_DATATYPE, value);
 738     }
 739
 740     /*
 741      * accept non-Standard code table but save in case we want to disable
 742      * 'translate as cDNA'
 743      */
 744     else if (keyword.equalsIgnoreCase(CODETABLE))
 745     {
 746       setAlignmentProperty(PROP_CODETABLE, value);
 747     }
 748
 749     /*
 750      * save gap char to set later on alignment once created
 751      */
 752     else if (keyword.equalsIgnoreCase(INDEL))
 753     {
 754       this.gapCharacter = value.charAt(0);
 755     }
 756
 757     else if (keyword.equalsIgnoreCase(IDENTICAL)
 758             || keyword.equalsIgnoreCase("MatchChar"))
 759     {
 760       setAlignmentProperty(PROP_IDENTITY, value);
 761       this.identityCharacter = value.charAt(0);
 762       if (!".".equals(value))
 763       {
 764         System.err.println("Warning: " + token
 765                 + " not supported, Jalview uses '.' for identity");
 766       }
 767     }
 768
 769     else if (keyword.equalsIgnoreCase(MISSING))
 770     {
 771       setAlignmentProperty(PROP_MISSING, value);
 772       System.err.println("Warning: " + token + " not supported");
 773     }
 774
 775     else if (keyword.equalsIgnoreCase("Property"))
 776     {
 777       // TODO: figure out what to do with this
 778       // can it appear more than once in a file?
 779       setAlignmentProperty(PROP_MISSING, value);
 780     }
 781
 782     else if (!keyword.equalsIgnoreCase(N_SEQS)
 783             && !keyword.equalsIgnoreCase(N_SITES))
 784     {
 785       System.err.println("Warning: " + msg);
 786     }
 787   }
 788
 789   /**
 790    * Returns the trimmed data on the line following either whitespace or '=',
 791    * with any trailing semi-colon removed<br>
 792    * So
 793    * <ul>
 794    * <li>Hello World</li>
 795    * <li>!Hello: \tWorld;</li>
 796    * <li>!Hello=World</li>
 797    * <ul>
 798    * should all return "World"
 799    *
 800    * @param inputLine
 801    * @return
 802    */
 803   protected static String getValue(String inputLine)
 804   {
 805     if (inputLine == null)
 806     {
 807       return null;
 808     }
 809     String value = null;
 810     String s = inputLine.replaceAll("\t", " ").trim();
 811
 812     /*
 813      * KEYWORD = VALUE should return VALUE
 814      */
 815     int equalsPos = s.indexOf("=");
 816     if (equalsPos >= 0)
 817     {
 818       value = s.substring(equalsPos + 1);
 819     }
 820     else
 821     {
 822       int spacePos = s.indexOf(' ');
 823       value = spacePos == -1 ? "" : s.substring(spacePos + 1);
 824     }
 825     value = value.trim();
 826     if (value.endsWith(SEMICOLON))
 827     {
 828       value = value.substring(0, value.length() - 1).trim();
 829     }
 830     return value;
 831   }
 832
 833   /**
 834    * Returns true if the input line starts with "TITLE" or "!TITLE" (not case
 835    * sensitive). The latter is the official format, some older data file
 836    * examples have it without the !.
 837    *
 838    * @param inputLine
 839    * @return
 840    */
 841   protected static boolean isTitle(String inputLine)
 842   {
 843     if (inputLine == null)
 844     {
 845       return false;
 846     }
 847     String upper = inputLine.toUpperCase();
 848     return (upper.startsWith(TITLE.toUpperCase()) || upper.startsWith(BANG
 849             + TITLE.toUpperCase()));
 850   }
 851
 852   /**
 853    * Reads lines until terminated by semicolon, appending each to the
 854    * Description property value.
 855    *
 856    * @throws IOException
 857    */
 858   protected void parseDescription(String firstDescriptionLine)
 859           throws IOException
 860   {
 861     StringBuilder desc = new StringBuilder(256);
 862     desc.append(getValue(firstDescriptionLine));
 863     if (!firstDescriptionLine.endsWith(SEMICOLON))
 864     {
 865       String line = nextNonCommentLine();
 866       while (line != null)
 867       {
 868         if (line.endsWith(SEMICOLON))
 869         {
 870           desc.append(line.substring(0, line.length() - 1));
 871           break;
 872         }
 873         else if (line.length() > 0)
 874         {
 875           desc.append(line).append(newline);
 876         }
 877         line = nextNonCommentLine();
 878       }
 879     }
 880     setAlignmentProperty(PROP_DESCRIPTION, desc.toString());
 881   }
 882
 883   /**
 884    * Returns the alignment sequences in Mega format.
 885    */
 886   @Override
 887   public String print()
 888   {
 889     return MEGA_ID + newline + print(getSeqsAsArray());
 890   }
 891
 892   /**
 893    * Write out the alignment sequences in Mega format - interleaved unless
 894    * explicitly noninterleaved.
 895    */
 896   protected String print(SequenceI[] s)
 897   {
 898     String result;
 899     if (this.interleaved != null && !this.interleaved)
 900     {
 901       result = printNonInterleaved(s);
 902     }
 903     else
 904     {
 905       result = printInterleaved(s);
 906     }
 907     return result;
 908   }
 909
 910   /**
 911    * Print to string in Interleaved format - blocks of next N characters of each
 912    * sequence in turn.
 913    *
 914    * @param s
 915    */
 916   protected String printInterleaved(SequenceI[] s)
 917   {
 918     int maxIdLength = getMaxIdLength(s);
 919     int maxSequenceLength = getMaxSequenceLength(s);
 920     int numLines = maxSequenceLength / positionsPerLine + 3; // approx
 921
 922     int numDataBlocks = (maxSequenceLength - 1) / positionsPerLine + 1;
 923     int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
 924     int chunksPerLine = (positionsPerLine + spaceEvery - 1) / spaceEvery;
 925
 926     /*
 927      * Roughly size a buffer to hold the whole output
 928      */
 929     StringBuilder sb = new StringBuilder(numLines
 930             * (maxIdLength + positionsPerLine + chunksPerLine + 10));
 931
 932     /*
 933      * Output as: #Seqid CGT AGC ACT ... or blocks of 10 for peptide
 934      */
 935     int from = 0;
 936     for (int i = 0; i < numDataBlocks; i++)
 937     {
 938       sb.append(newline);
 939       boolean first = true;
 940       int advancedBy = 0;
 941       for (SequenceI seq : s)
 942       {
 943         int seqFrom = from;
 944         String seqId = String.format("#%-" + maxIdLength + "s",
 945                 seq.getName());
 946
 947         /*
 948          * output next line for this sequence
 949          */
 950         sb.append(seqId);
 951         int lastPos = seqFrom + positionsPerLine; // exclusive
 952         for (int j = 0; j < chunksPerLine; j++)
 953         {
 954           char[] subSequence = seq.getSequence(seqFrom,
 955                   Math.min(lastPos, seqFrom + spaceEvery));
 956           if (subSequence.length > 0)
 957           {
 958             sb.append(SPACE).append(subSequence);
 959           }
 960           seqFrom += subSequence.length;
 961           if (first)
 962           {
 963             // all sequences should be the same length in MEGA
 964             advancedBy += subSequence.length;
 965           }
 966         }
 967         // write last position as a comment
 968         if (writePositionNumbers)
 969         {
 970           sb.append(SPACE).append(COMMENT_START).append(from + advancedBy)
 971                   .append(COMMENT_END);
 972         }
 973         sb.append(newline);
 974         first = false;
 975       }
 976       from += advancedBy;
 977     }
 978
 979     return new String(sb);
 980   }
 981
 982   /**
 983    * Outputs to string the MEGA header and any other known and relevant
 984    * alignment properties
 985    *
 986    * @param al
 987    */
 988   protected String printHeaders(AlignmentI al)
 989   {
 990     StringBuilder sb = new StringBuilder(128);
 991     sb.append(MEGA_ID).append(newline);
 992     String propertyValue = (String) al.getProperty(PROP_TITLE);
 993     if (propertyValue != null)
 994     {
 995       sb.append(BANG).append(TITLE).append(SPACE).append(propertyValue)
 996               .append(SEMICOLON).append(newline);
 997     }
 998     propertyValue = (String) al.getProperty(PROP_DESCRIPTION);
 999     if (propertyValue != null)
1000     {
1001       sb.append(BANG).append(DESCRIPTION).append(newline)
1002               .append(propertyValue).append(SEMICOLON)
1003               .append(newline);
1004     }
1005
1006     /*
1007      * !Format DataType CodeTable
1008      */
1009     sb.append(BANG).append(FORMAT).append(newline);
1010     String dataType = (String) al.getProperty(PROP_DATATYPE);
1011     if (dataType == null)
1012     {
1013       dataType = al.isNucleotide() ? NUCLEOTIDE : PROTEIN;
1014     }
1015     sb.append(INDENT).append(DATATYPE).append(EQUALS).append(dataType);
1016     String codeTable = (String) al.getProperty(PROP_CODETABLE);
1017     sb.append(SPACE).append(CODETABLE).append(EQUALS)
1018             .append(codeTable == null ? "Standard" : codeTable)
1019             .append(newline);
1020
1021     /*
1022      * !Format NSeqs NSites (the length of sequences - they should all be the
1023      * same - including gaps)
1024      */
1025     sb.append(INDENT).append(N_SEQS).append(EQUALS).append(al.getHeight());
1026     sb.append(SPACE).append(N_SITES).append(EQUALS)
1027             .append(String.valueOf(al.getWidth()));
1028     sb.append(newline);
1029
1030     /*
1031      * !Format Indel Identical Missing
1032      */
1033     sb.append(INDENT);
1034     sb.append(INDEL).append(EQUALS).append(al.getGapCharacter());
1035     String identity = (String) al.getProperty(PROP_IDENTITY);
1036     if (identity != null)
1037     {
1038       sb.append(SPACE).append(IDENTICAL).append(EQUALS).append(identity);
1039     }
1040     String missing = (String) al.getProperty(PROP_MISSING);
1041     if (missing != null)
1042     {
1043       sb.append(SPACE).append(MISSING).append(EQUALS).append(missing);
1044     }
1045     sb.append(SEMICOLON).append(newline);
1046
1047     return sb.toString();
1048   }
1049
1050   /**
1051    * Get the longest sequence id (to allow aligned printout).
1052    *
1053    * @param s
1054    * @return
1055    */
1056   protected static int getMaxIdLength(SequenceI[] s)
1057   {
1058     // TODO pull up for reuse
1059     int maxLength = 0;
1060     for (SequenceI seq : s)
1061     {
1062       int len = seq.getName().length();
1063       if (len > maxLength)
1064       {
1065         maxLength = len;
1066       }
1067     }
1068     return maxLength;
1069   }
1070
1071   /**
1072    * Get the longest sequence length
1073    *
1074    * @param s
1075    * @return
1076    */
1077   protected static int getMaxSequenceLength(SequenceI[] s)
1078   {
1079     // TODO pull up for reuse
1080     int maxLength = 0;
1081     for (SequenceI seq : s)
1082     {
1083       int len = seq.getLength();
1084       if (len > maxLength)
1085       {
1086         maxLength = len;
1087       }
1088     }
1089     return maxLength;
1090   }
1091
1092   /**
1093    * Print to string in noninterleaved format - all of each sequence in turn, in
1094    * blocks of 50 characters.
1095    *
1096    * @param s
1097    * @return
1098    */
1099   protected String printNonInterleaved(SequenceI[] s)
1100   {
1101     int maxSequenceLength = getMaxSequenceLength(s);
1102     // approx
1103     int numLines = maxSequenceLength / positionsPerLine + 2 + s.length;
1104
1105     /*
1106      * Roughly size a buffer to hold the whole output
1107      */
1108     StringBuilder sb = new StringBuilder(numLines * positionsPerLine);
1109
1110     int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
1111     int chunksPerLine = positionsPerLine / spaceEvery;
1112     for (SequenceI seq : s)
1113     {
1114       sb.append(newline);
1115       sb.append(HASHSIGN + seq.getName()).append(newline);
1116       int startPos = 0;
1117       while (startPos < seq.getLength())
1118       {
1119         boolean firstChunk = true;
1120         /*
1121          * print next line for this sequence
1122          */
1123         int lastPos = startPos + positionsPerLine; // exclusive
1124         for (int j = 0; j < chunksPerLine; j++)
1125         {
1126           char[] subSequence = seq.getSequence(startPos,
1127                   Math.min(lastPos, startPos + positionsPerLine));
1128           if (subSequence.length > 0)
1129           {
1130             if (!firstChunk)
1131             {
1132               sb.append(SPACE);
1133             }
1134             sb.append(subSequence);
1135             firstChunk = false;
1136           }
1137           startPos += subSequence.length;
1138         }
1139         sb.append(newline);
1140       }
1141     }
1142
1143     return new String(sb);
1144   }
1145
1146   /**
1147    * Flag this file as interleaved or not, based on data format. Throws an
1148    * exception if has previously been determined to be otherwise.
1149    *
1150    * @param isIt
1151    * @param dataLine
1152    * @throws IOException
1153    */
1154   protected void assertInterleaved(boolean isIt, String dataLine)
1155           throws FileFormatException
1156   {
1157     if (this.interleaved != null && isIt != this.interleaved.booleanValue())
1158     {
1159       throw new FileFormatException(
1160               "Parse error: mix of interleaved and noninterleaved detected, at line: "
1161                       + dataLine);
1162     }
1163     this.interleaved = new Boolean(isIt);
1164     setAlignmentProperty(PROP_INTERLEAVED, interleaved.toString());
1165   }
1166
1167   public boolean isInterleaved()
1168   {
1169     return this.interleaved == null ? false : this.interleaved
1170             .booleanValue();
1171   }
1172
1173   /**
1174    * Adds saved parsed values either as alignment properties, or (in some cases)
1175    * as specific member fields of the alignment
1176    */
1177   @Override
1178   public void addProperties(AlignmentI al)
1179   {
1180     super.addProperties(al);
1181     if (this.gapCharacter != null)
1182     {
1183       al.setGapCharacter(gapCharacter);
1184     }
1185
1186     /*
1187      * warn if e.g. DataType=DNA but data is protein (or vice versa)
1188      */
1189     if (this.nucleotide != null && this.nucleotide != al.isNucleotide()) {
1190       System.err.println("Warning: " + this.title + " declared "
1191               + (nucleotide ? "" : " not ") + "nucleotide but it is"
1192               + (nucleotide ? " not" : ""));
1193     }
1194   }
1195
1196   /**
1197    * Print the given alignment in MEGA format. If the alignment was created by
1198    * parsing a MEGA file, it should have properties set (e.g. Title) which can
1199    * influence the output.
1200    */
1201   @Override
1202   public String print(AlignmentI al)
1203   {
1204     this.nucleotide = al.isNucleotide();
1205
1206     String lineLength = (String) al.getProperty(PROP_LINELENGTH);
1207     this.positionsPerLine = lineLength == null ? DEFAULT_LINE_LENGTH : Integer
1208             .parseInt(lineLength);
1209
1210     String interleave = (String) al.getProperty(PROP_INTERLEAVED);
1211     if (interleave != null)
1212     {
1213       this.interleaved = Boolean.valueOf(interleave);
1214     }
1215
1216     String headers = printHeaders(al);
1217     return headers + print(al.getSequencesArray());
1218   }
1219
1220   /**
1221    * Returns the number of sequence positions output per line
1222    *
1223    * @return
1224    */
1225   public int getPositionsPerLine()
1226   {
1227     return positionsPerLine;
1228   }
1229
1230   /**
1231    * Sets the number of sequence positions output per line. Note these will be
1232    * formatted in blocks of 3 (nucleotide) or 10 (peptide).
1233    *
1234    * @param p
1235    */
1236   public void setPositionsPerLine(int p)
1237   {
1238     this.positionsPerLine = p;
1239   }
1240 }