src/jalview/io/MegaFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.0b1)
   3  * Copyright (C) 2014 The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  10  *
  11  * Jalview is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty
  13  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  14  * PURPOSE.  See the GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  17  * The Jalview Authors are detailed in the 'AUTHORS' file.
  18  */
  19 package jalview.io;
  20
  21 import jalview.datamodel.AlignmentI;
  22 import jalview.datamodel.Sequence;
  23 import jalview.datamodel.SequenceI;
  24
  25 import java.io.IOException;
  26 import java.util.LinkedHashMap;
  27 import java.util.Map;
  28 import java.util.Map.Entry;
  29 import java.util.Set;
  30
  31 /**
  32  * A parser for input or output of MEGA format files. <br>
  33  * <br>
  34  * Tamura K, Stecher G, Peterson D, Filipski A, and Kumar S (2013) MEGA6:
  35  * Molecular Evolutionary Genetics Analysis Version 6.0. Molecular Biology and
  36  * Evolution 30: 2725-2729. <br>
  37  * <br>
  38  *
  39  * MEGA file format is supported as described in
  40  * http://www.megasoftware.net/manual.pdf <br>
  41  * Limitations:
  42  * <ul>
  43  * <li>nested comments (marked by [ ]) are accepted but not preserved</li>
  44  * <li>to be completed</li>
  45  * </ul>
  46  *
 * @see <a href="http://www.megasoftware.net/">MEGA software home page</a>
  48  */
  49 public class MegaFile extends AlignFile
  50 {
  // line length (sequence positions per line) to assume when none is known
  // NOTE(review): confirm where this default is applied
  private static final int DEFAULT_LINE_LENGTH = 60;

  // indentation written before the keyword lines of the !Format statement
  private static final String INDENT = "    ";

  /*
   * keywords recognised (case-insensitively) within a !Format statement
   */
  private static final String N_SITES = "NSites";

  private static final String N_SEQS = "NSeqs";

  private static final String MISSING = "Missing";

  private static final String IDENTICAL = "Identical";

  private static final String INDEL = "Indel";

  private static final String CODETABLE = "CodeTable";

  /*
   * DataType values written on output when the alignment carries none
   */
  private static final String PROTEIN = "Protein";

  private static final String NUCLEOTIDE = "Nucleotide";

  private static final String DATATYPE = "DataType";

  /*
   * comment delimiters; comments may be nested
   */
  private static final char COMMENT_START = '[';

  private static final char COMMENT_END = ']';

  /*
   * punctuation: '#' starts the file identifier and each sequence id, ';' ends
   * a command statement, '!' starts a command keyword
   */
  private static final String HASHSIGN = "#";

  private static final String SEMICOLON = ";";

  private static final String BANG = "!";

  private static final String EQUALS = "=";

  // file identifier line; matched case-insensitively when reading
  private static final String MEGA_ID = HASHSIGN + "MEGA";

  /*
   * command keywords (each is preceded by '!' in the file)
   */
  private static final String TITLE = "Title";

  private static final String FORMAT = "Format";

  private static final String DESCRIPTION = "Description";

  private static final String GENE = "Gene";

  private static final String DOMAIN = "Domain";

  /*
   * names of properties to save to the alignment (may affect eventual output
   * format)
   */
  static final String PROP_TITLE = "MEGA_TITLE";

  static final String PROP_INTERLEAVED = "MEGA_INTERLEAVED";

  static final String PROP_DESCRIPTION = "MEGA_DESCRIPTION";

  static final String PROP_CODETABLE = "MEGA_CODETABLE";

  static final String PROP_IDENTITY = "MEGA_IDENTITY";

  static final String PROP_MISSING = "MEGA_MISSING";

  static final String PROP_DATATYPE = "MEGA_DATATYPE";

  // number of bases per line of file (value is inferred)
  static final String PROP_LINELENGTH = "MEGA_LINELENGTH";

  // TODO: need a controlled name for Gene as a feature if we want to be able to
  // output the MEGA file with !Gene headers
  // Open question: how should Gene/Domain ranges be handled if the sequences
  // are subsequently realigned?

  // initial size for sequence data buffer
  private static final int SEQBUFFERSIZE = 256;

  // separator between sequence id and data, and between chunks of data
  private static final String SPACE = " ";

  /*
   * number of sequence positions output per line
   */
  private int positionsPerLine;

  // value of the Title header, if one was read
  private String title;

  // gap character may be explicitly declared, if not we infer it
  private Character gapCharacter;

  // this can be True, False or null (meaning not asserted in file)
  private Boolean nucleotide;

  // set once we have seen one block of interleaved data
  private boolean firstDataBlockRead = false;

  // this can be True, False or null (meaning we don't know yet)
  private Boolean interleaved;
 145
  /**
   * Default constructor. Performs no initialisation of its own.
   */
  public MegaFile()
  {
  }
 149
  /**
   * Constructor that passes both arguments unchanged to the superclass
   * constructor.
   * 
   * @param inFile
   *          NOTE(review): presumably the file name or data to read - confirm
   *          against AlignFile
   * @param type
   *          NOTE(review): presumably the access protocol for inFile - confirm
   *          against AlignFile
   * @throws IOException
   */
  public MegaFile(String inFile, String type) throws IOException
  {
    super(inFile, type);
  }
 154
  /**
   * Constructor that passes the given source unchanged to the superclass
   * constructor.
   * 
   * @param source
   *          an existing FileParse to read from
   * @throws IOException
   */
  public MegaFile(FileParse source) throws IOException
  {
    super(source);
  }
 159
 160   /**
 161    * Parse the input stream.
 162    */
 163   @Override
 164   public void parse() throws IOException
 165   {
 166     /*
 167      * Read and process MEGA and Title/Format/Description headers if present.
 168      * Returns the first data line following the headers.
 169      */
 170     String dataLine = parseHeaderLines();
 171
 172     /*
 173      * Temporary store of {sequenceId, positionData} while parsing interleaved
 174      * sequences; sequences are maintained in the order in which they are added
 175      * i.e. read in the file
 176      */
 177     Map<String, StringBuilder> seqData = new LinkedHashMap<String, StringBuilder>();
 178
 179     /*
 180      * The id of the sequence being read (for non-interleaved)
 181      */
 182     String currentId = "";
 183
 184     while (dataLine != null)
 185     {
 186       dataLine = dataLine.trim();
 187       if (dataLine.length() > 0)
 188       {
 189         if (dataLine.startsWith(BANG + GENE))
 190         {
 191           parseGene(dataLine);
 192         }
 193         else if (dataLine.startsWith(BANG + DOMAIN))
 194         {
 195           parseDomain(dataLine);
 196         }
 197         else
 198         {
 199           currentId = parseDataLine(dataLine, seqData, currentId);
 200         }
 201       }
 202       else if (!seqData.isEmpty())
 203       {
 204         /*
 205          * Blank line after processing some data...
 206          */
 207         this.firstDataBlockRead = true;
 208       }
 209       dataLine = nextNonCommentLine();
 210     }
 211
 212     // remember the (longest) line length read in, so we can output the same
 213     setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
 214
 215     setSequences(seqData);
 216   }
 217
  /**
   * Parse a !Gene command line. Not yet implemented: the line is currently
   * ignored.
   * 
   * @param dataLine
   *          the (trimmed) line starting with !Gene
   */
  protected void parseGene(String dataLine)
  {
    // TODO not yet implemented - the command is silently skipped
  }
 226
  /**
   * Parse a !Domain command line. Not yet implemented: the line is currently
   * ignored.
   * 
   * @param dataLine
   *          the (trimmed) line starting with !Domain
   */
  private void parseDomain(String dataLine)
  {
    // TODO not yet implemented - the command is silently skipped
  }
 235
 236   /**
 237    * Returns the next line that is not a comment, or null at end of file.
 238    * Comments in MEGA are within [ ] brackets, and may be nested.
 239    *
 240    * @return
 241    * @throws IOException
 242    */
 243   protected String nextNonCommentLine() throws IOException
 244   {
 245     return nextNonCommentLine(0);
 246   }
 247
 248   /**
 249    * Returns the next line that is not a comment, or null at end of file.
 250    * Comments in MEGA are within [ ] brackets, and may be nested.
 251    *
 252    * @param depth
 253    *          current depth of nesting of comments while parsing
 254    * @return
 255    * @throws IOException
 256    */
 257   protected String nextNonCommentLine(final int depth) throws IOException
 258   {
 259     String data = null;
 260     data = nextLine();
 261     if (data == null)
 262     {
 263       if (depth > 0)
 264       {
 265         System.err.println("Warning: unterminated comment in data file");
 266       }
 267       return data;
 268     }
 269     int leftBracket = data.indexOf(COMMENT_START);
 270
 271     /*
 272      * reject unnested comment following data on the same line
 273      */
 274     if (depth == 0 && leftBracket > 0)
 275     {
 276       throw new FileFormatException(
 277               "Can't parse comment following data at " + data);
 278     }
 279
 280     /*
 281      * If we are in a (possibly nested) comment after parsing this line, keep
 282      * reading recursively until the comment has unwound
 283      */
 284     int newDepth = commentDepth(data, depth);
 285     if (newDepth > 0)
 286     {
 287       return nextNonCommentLine(newDepth);
 288     }
 289     else
 290     {
 291       /*
 292        * not in a comment by end of this line; return what is left (or the next
 293        * line if that is empty)
 294        */
 295       String nonCommentPart = getNonCommentContent(data, depth);
 296       // if (nonCommentPart.length() > 0)
 297       // {
 298         return nonCommentPart;
 299       // }
 300       // return nextNonCommentLine(0);
 301     }
 302   }
 303
 304   /**
 305    * Returns what is left of the input data after removing any comments, whether
 306    * 'in progress' from preceding lines, or embedded in the current line
 307    *
 308    * @param data
 309    *          input data
 310    * @param depth
 311    *          nested depth of comments pending termination
 312    * @return
 313    * @throws FileFormatException
 314    */
 315   protected static String getNonCommentContent(String data, int depth)
 316           throws FileFormatException
 317   {
 318     int len = data.length();
 319     StringBuilder result = new StringBuilder(len);
 320     for (int i = 0; i < len; i++)
 321     {
 322       char c = data.charAt(i);
 323       switch (c)
 324       {
 325       case COMMENT_START:
 326         depth++;
 327         break;
 328
 329       case COMMENT_END:
 330         if (depth > 0)
 331         {
 332           depth--;
 333         }
 334         else
 335         {
 336           result.append(c);
 337         }
 338         break;
 339
 340       default:
 341         if (depth == 0)
 342         {
 343           result.append(c);
 344         }
 345       }
 346     }
 347     return result.toString();
 348   }
 349
 350   /**
 351    * Calculates new depth of comment after parsing an input line i.e. the excess
 352    * of opening '[' over closing ']' characters. Any excess ']' are ignored (not
 353    * treated as comment delimiters).
 354    *
 355    * @param data
 356    *          input line
 357    * @param depth
 358    *          current comment nested depth before parsing the line
 359    * @return new depth after parsing the line
 360    */
 361   protected static int commentDepth(CharSequence data, int depth)
 362   {
 363     int newDepth = depth;
 364     int len = data.length();
 365     for (int i = 0; i < len; i++)
 366     {
 367       char c = data.charAt(i);
 368       if (c == COMMENT_START)
 369       {
 370         newDepth++;
 371       }
 372       else if (c == COMMENT_END && newDepth > 0)
 373       {
 374         newDepth--;
 375       }
 376     }
 377     return newDepth;
 378   }
 379
 380   /**
 381    * Convert the parsed sequence strings to objects and store them in the model.
 382    *
 383    * @param seqData
 384    */
 385   protected void setSequences(Map<String, StringBuilder> seqData)
 386   {
 387     Set<Entry<String, StringBuilder>> datasets = seqData.entrySet();
 388
 389     for (Entry<String, StringBuilder> dataset : datasets)
 390     {
 391       String sequenceId = dataset.getKey();
 392       StringBuilder characters = dataset.getValue();
 393       SequenceI s = new Sequence(sequenceId, new String(characters));
 394       this.seqs.addElement(s);
 395     }
 396   }
 397
 398   /**
 399    * Process one line of sequence data. If it has no sequence identifier, append
 400    * to the current id's sequence. Else parse out the sequence id and append the
 401    * data (if any) to that id's sequence. Returns the sequence id (implicit or
 402    * explicit) for this line.
 403    *
 404    * @param dataLine
 405    * @param seqData
 406    * @param currentid
 407    * @return
 408    * @throws IOException
 409    */
 410   protected String parseDataLine(String dataLine,
 411           Map<String, StringBuilder> seqData, String currentId)
 412           throws IOException
 413   {
 414     String seqId = getSequenceId(dataLine);
 415     if (seqId == null)
 416     {
 417       /*
 418        * Just character data
 419        */
 420       parseNoninterleavedDataLine(dataLine, seqData, currentId);
 421       return currentId;
 422     }
 423     else if ((HASHSIGN + seqId).trim().equals(dataLine.trim()))
 424     {
 425       /*
 426        * Sequence id only - header line for noninterleaved data
 427        */
 428       return seqId;
 429     }
 430     else
 431     {
 432       /*
 433        * Sequence id followed by data
 434        */
 435       parseInterleavedDataLine(dataLine, seqData, seqId);
 436       return seqId;
 437     }
 438   }
 439
 440   /**
 441    * Add a line of sequence data to the buffer for the given sequence id. Start
 442    * a new one if we haven't seen it before.
 443    *
 444    * @param dataLine
 445    * @param seqData
 446    * @param currentId
 447    * @throws IOException
 448    */
 449   protected void parseNoninterleavedDataLine(String dataLine,
 450           Map<String, StringBuilder> seqData, String currentId)
 451           throws IOException
 452   {
 453     if (currentId == null)
 454     {
 455       /*
 456        * Oops. Data but no sequence id context.
 457        */
 458       throw new IOException("No sequence id context at: " + dataLine);
 459     }
 460
 461     assertInterleaved(false, dataLine);
 462
 463     StringBuilder sb = getSequenceDataBuffer(seqData, currentId);
 464
 465     /*
 466      * Add the current line of data to the sequence.
 467      */
 468     sb.append(dataLine);
 469
 470     setPositionsPerLine(Math.max(positionsPerLine, dataLine.length()));
 471   }
 472
 473   /**
 474    * Get the sequence data for this sequence id, starting a new one if
 475    * necessary.
 476    *
 477    * @param seqData
 478    * @param currentId
 479    * @return
 480    */
 481   protected StringBuilder getSequenceDataBuffer(
 482           Map<String, StringBuilder> seqData, String currentId)
 483   {
 484     StringBuilder sb = seqData.get(currentId);
 485     if (sb == null)
 486     {
 487       // first data met for this sequence id, start a new buffer
 488       sb = new StringBuilder(SEQBUFFERSIZE);
 489       seqData.put(currentId, sb);
 490     }
 491     return sb;
 492   }
 493
 494   /**
 495    * Parse one line of interleaved data e.g.
 496    *
 497    * <pre>
 498    * #TheSeqId CGATCGCATGCA
 499    * </pre>
 500    *
 501    * @param dataLine
 502    * @param seqData
 503    * @param seqId
 504    * @throws IOException
 505    */
 506   protected void parseInterleavedDataLine(String dataLine,
 507           Map<String, StringBuilder> seqData, String seqId)
 508           throws IOException
 509   {
 510     /*
 511      * New sequence found in second or later data block - error.
 512      */
 513     if (this.firstDataBlockRead && !seqData.containsKey(seqId))
 514     {
 515       throw new IOException(
 516               "Parse error: misplaced new sequence starting at " + dataLine);
 517     }
 518
 519     StringBuilder sb = getSequenceDataBuffer(seqData, seqId);
 520     String data = dataLine.substring(seqId.length() + 1).trim();
 521
 522     /*
 523      * Do nothing if this line is _only_ a sequence id with no data following.
 524      *
 525      * Remove any internal spaces
 526      */
 527     if (data != null && data.length() > 0)
 528     {
 529       if (data.indexOf(SPACE) != -1)
 530       {
 531         data = data.replace(SPACE, "");
 532       }
 533       sb.append(data);
 534       setPositionsPerLine(Math.max(positionsPerLine, data.length()));
 535       assertInterleaved(true, dataLine);
 536     }
 537   }
 538
 539   /**
 540    * If the line begins with (e.g.) "#abcde " then returns "abcde" as the
 541    * identifier. Else returns null.
 542    *
 543    * @param dataLine
 544    * @return
 545    */
 546   public static String getSequenceId(String dataLine)
 547   {
 548     // TODO refactor to a StringUtils type class
 549     if (dataLine != null)
 550     {
 551       if (dataLine.startsWith(HASHSIGN))
 552       {
 553         int spacePos = dataLine.indexOf(" ");
 554         return (spacePos == -1 ? dataLine.substring(1) : dataLine
 555                 .substring(1, spacePos));
 556       }
 557     }
 558     return null;
 559   }
 560
 561   /**
 562    * Read the #MEGA and Title/Format/Description header lines (if present).
 563    *
 564    * Save as alignment properties in case useful.
 565    *
 566    * @return the next non-blank line following the header lines.
 567    * @throws IOException
 568    */
 569   protected String parseHeaderLines() throws IOException
 570   {
 571     String inputLine = null;
 572     while ((inputLine = nextNonCommentLine()) != null)
 573     {
 574       inputLine = inputLine.trim();
 575
 576       /*
 577        * skip blank lines
 578        */
 579       if (inputLine.length() == 0)
 580       {
 581         continue;
 582       }
 583
 584       if (inputLine.toUpperCase().startsWith(MEGA_ID))
 585       {
 586         continue;
 587       }
 588
 589       if (isTitle(inputLine))
 590       {
 591         this.title = getValue(inputLine);
 592         setAlignmentProperty(PROP_TITLE, title);
 593       }
 594       else if (inputLine.startsWith(BANG + DESCRIPTION))
 595       {
 596         parseDescription(inputLine);
 597       }
 598
 599       else if (inputLine.startsWith(BANG + FORMAT))
 600       {
 601         parseFormat(inputLine);
 602       }
 603       else if (!inputLine.toUpperCase().startsWith(MEGA_ID))
 604       {
 605
 606         /*
 607          * Return the first 'data line' i.e. one that is not blank, #MEGA or
 608          * TITLE:
 609          */
 610         break;
 611       }
 612     }
 613     return inputLine;
 614   }
 615
 616   /**
 617    * Parse a !Format statement. This may be multiline, and is ended by a
 618    * semicolon.
 619    *
 620    * @param inputLine
 621    * @throws IOException
 622    */
 623   protected void parseFormat(String inputLine) throws IOException
 624   {
 625     while (inputLine != null)
 626     {
 627       parseFormatLine(inputLine);
 628       if (inputLine.endsWith(SEMICOLON))
 629       {
 630         break;
 631       }
 632       inputLine = nextNonCommentLine();
 633     }
 634   }
 635
 636   /**
 637    * Parse one line of a !Format statement. This may contain one or more
 638    * keyword=value pairs.
 639    *
 640    * @param inputLine
 641    * @throws FileFormatException
 642    */
 643   protected void parseFormatLine(String inputLine)
 644           throws FileFormatException
 645   {
 646     if (inputLine.startsWith(BANG + FORMAT))
 647     {
 648       inputLine = inputLine.substring((BANG + FORMAT).length());
 649     }
 650     if (inputLine.endsWith(SEMICOLON))
 651     {
 652       inputLine = inputLine.substring(0, inputLine.length() - 1);
 653     }
 654     if (inputLine.length() == 0)
 655     {
 656       return;
 657     }
 658     String[] tokens = inputLine.trim().split("\\s"); // any whitespace
 659     for (String token : tokens)
 660     {
 661       parseFormatKeyword(token);
 662     }
 663   }
 664
 665   /**
 666    * Parse a Keyword=Value token. Possible keywords are
 667    * <ul>
 668    * <li>DataType= DNA, RNA, Nucleotide, Protein</li>
 669    * <li>DataFormat= Interleaved, ?</li>
 670    * <li>NSeqs= number of sequences (synonym NTaxa)</li>
 671    * <li>NSites= number of bases / residues</li>
 672    * <li>Property= Exon (or Coding), Intron (or Noncoding), End (of domain)</li>
 673    * <li>Indel= gap character</li>
 674    * <li>Identical= identity character (to first sequence) (synonym MatchChar)</li>
 675    * <li>Missing= missing data character</li>
 676    * <li>CodeTable= Standard, other (MEGA supports various)</li>
 677    * </ul>
 678    *
 679    * @param token
 680    * @throws FileFormatException
 681    *           if an unrecognised keyword or value is encountered
 682    */
 683   protected void parseFormatKeyword(String token)
 684           throws FileFormatException
 685   {
 686     String msg = "Unrecognised Format command: " + token;
 687     String[] bits = token.split(EQUALS);
 688     if (bits.length != 2)
 689     {
 690       throw new FileFormatException(msg);
 691     }
 692     String keyword = bits[0];
 693     String value = bits[1];
 694
 695     /*
 696      * Jalview will work out whether nucleotide or not anyway
 697      */
 698     if (keyword.equalsIgnoreCase(DATATYPE))
 699     {
 700       if (value.equalsIgnoreCase("DNA") || value.equalsIgnoreCase("RNA")
 701               || value.equalsIgnoreCase("Nucleotide"))
 702       {
 703         this.nucleotide = true;
 704         // alignment computes whether or not it is nucleotide when created
 705       }
 706       else if (value.equalsIgnoreCase(PROTEIN))
 707       {
 708         this.nucleotide = false;
 709       }
 710       else
 711       {
 712         throw new FileFormatException(msg);
 713       }
 714       setAlignmentProperty(PROP_DATATYPE, value);
 715     }
 716
 717     /*
 718      * accept non-Standard code table but save in case we want to disable
 719      * 'translate as cDNA'
 720      */
 721     else if (keyword.equalsIgnoreCase(CODETABLE))
 722     {
 723       setAlignmentProperty(PROP_CODETABLE, value);
 724     }
 725
 726     /*
 727      * save gap char to set later on alignment once created
 728      */
 729     else if (keyword.equalsIgnoreCase(INDEL))
 730     {
 731       this.gapCharacter = value.charAt(0);
 732     }
 733
 734     else if (keyword.equalsIgnoreCase(IDENTICAL)
 735             || keyword.equalsIgnoreCase("MatchChar"))
 736     {
 737       setAlignmentProperty(PROP_IDENTITY, value);
 738       if (!".".equals(value))
 739       {
 740         System.err.println("Warning: " + token
 741                 + " not supported, Jalview uses '.' for identity");
 742       }
 743     }
 744
 745     else if (keyword.equalsIgnoreCase(MISSING))
 746     {
 747       setAlignmentProperty(PROP_MISSING, value);
 748       System.err.println("Warning: " + token + " not supported");
 749     }
 750
 751     else if (keyword.equalsIgnoreCase("Property"))
 752     {
 753       // TODO: figure out what to do with this
 754       // can it appear more than once in a file?
 755       setAlignmentProperty(PROP_MISSING, value);
 756     }
 757
 758     else if (!keyword.equalsIgnoreCase(N_SEQS)
 759             && !keyword.equalsIgnoreCase(N_SITES))
 760     {
 761       System.err.println("Warning: " + msg);
 762     }
 763   }
 764
 765   /**
 766    * Returns the trimmed data on the line following either whitespace or '=',
 767    * with any trailing semi-colon removed<br>
 768    * So
 769    * <ul>
 770    * <li>Hello World</li>
 771    * <li>!Hello: \tWorld;</li>
 772    * <li>!Hello=World</li>
 773    * <ul>
 774    * should all return "World"
 775    *
 776    * @param inputLine
 777    * @return
 778    */
 779   protected static String getValue(String inputLine)
 780   {
 781     if (inputLine == null)
 782     {
 783       return null;
 784     }
 785     String value = null;
 786     String s = inputLine.replaceAll("\t", " ").trim();
 787
 788     /*
 789      * KEYWORD = VALUE should return VALUE
 790      */
 791     int equalsPos = s.indexOf("=");
 792     if (equalsPos >= 0)
 793     {
 794       value = s.substring(equalsPos + 1);
 795     }
 796     else
 797     {
 798       int spacePos = s.indexOf(' ');
 799       value = spacePos == -1 ? "" : s.substring(spacePos + 1);
 800     }
 801     value = value.trim();
 802     if (value.endsWith(SEMICOLON))
 803     {
 804       value = value.substring(0, value.length() - 1).trim();
 805     }
 806     return value;
 807   }
 808
 809   /**
 810    * Returns true if the input line starts with "TITLE" or "!TITLE" (not case
 811    * sensitive). The latter is the official format, some older data file
 812    * examples have it without the !.
 813    *
 814    * @param inputLine
 815    * @return
 816    */
 817   protected static boolean isTitle(String inputLine)
 818   {
 819     if (inputLine == null)
 820     {
 821       return false;
 822     }
 823     String upper = inputLine.toUpperCase();
 824     return (upper.startsWith(TITLE.toUpperCase()) || upper.startsWith(BANG
 825             + TITLE.toUpperCase()));
 826   }
 827
 828   /**
 829    * Reads lines until terminated by semicolon, appending each to the
 830    * Description property value.
 831    *
 832    * @throws IOException
 833    */
 834   protected void parseDescription(String firstDescriptionLine)
 835           throws IOException
 836   {
 837     StringBuilder desc = new StringBuilder(256);
 838     String line = getValue(firstDescriptionLine);
 839     while (line != null)
 840     {
 841       if (line.endsWith(SEMICOLON))
 842       {
 843         desc.append(line.substring(0, line.length() - 1));
 844         break;
 845       }
 846       else if (line.length() > 0)
 847       {
 848         desc.append(line).append(newline);
 849       }
 850       line = nextNonCommentLine();
 851     }
 852     setAlignmentProperty(PROP_DESCRIPTION, desc.toString());
 853   }
 854
 855   /**
 856    * Returns the alignment sequences in Mega format.
 857    */
 858   @Override
 859   public String print()
 860   {
 861     return MEGA_ID + newline + print(getSeqsAsArray());
 862   }
 863
 864   /**
 865    * Write out the alignment sequences in Mega format - interleaved unless
 866    * explicitly noninterleaved.
 867    */
 868   protected String print(SequenceI[] s)
 869   {
 870     String result;
 871     if (this.interleaved != null && !this.interleaved)
 872     {
 873       result = printNonInterleaved(s);
 874     }
 875     else
 876     {
 877       result = printInterleaved(s);
 878     }
 879     return result;
 880   }
 881
 882   /**
 883    * Print to string in Interleaved format - blocks of next N characters of each
 884    * sequence in turn.
 885    *
 886    * @param s
 887    */
 888   protected String printInterleaved(SequenceI[] s)
 889   {
 890     int maxIdLength = getMaxIdLength(s);
 891     int maxSequenceLength = getMaxSequenceLength(s);
 892     int numLines = maxSequenceLength / positionsPerLine + 3; // approx
 893
 894     /*
 895      * Size a buffer to hold the whole output
 896      */
 897     StringBuilder sb = new StringBuilder(numLines
 898             * (maxIdLength + 2 + positionsPerLine));
 899
 900     int numDataBlocks = (maxSequenceLength - 1) / positionsPerLine + 1;
 901     int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
 902     int chunksPerLine = (positionsPerLine + spaceEvery - 1) / spaceEvery;
 903
 904     /*
 905      * Output as: #Seqid CGT AGC ACT ... or blocks of 10 for peptide
 906      */
 907     int from = 0;
 908     for (int i = 0; i < numDataBlocks; i++)
 909     {
 910       sb.append(newline);
 911       boolean first = true;
 912       int advancedBy = 0;
 913       for (SequenceI seq : s)
 914       {
 915         int seqFrom = from;
 916         String seqId = String.format("#%-" + maxIdLength + "s",
 917                 seq.getName());
 918
 919         /*
 920          * output next line for this sequence
 921          */
 922         sb.append(seqId);
 923         int lastPos = seqFrom + positionsPerLine; // exclusive
 924         for (int j = 0; j < chunksPerLine; j++)
 925         {
 926           char[] subSequence = seq.getSequence(seqFrom,
 927                   Math.min(lastPos, seqFrom + spaceEvery));
 928           if (subSequence.length > 0)
 929           {
 930             sb.append(SPACE).append(subSequence);
 931           }
 932           seqFrom += subSequence.length;
 933           if (first)
 934           {
 935             // all sequences should be the same length in MEGA
 936             advancedBy += subSequence.length;
 937           }
 938         }
 939         sb.append(newline);
 940         first = false;
 941       }
 942       from += advancedBy;
 943     }
 944
 945     return new String(sb);
 946   }
 947
 948   /**
 949    * Outputs to string the MEGA header and any other known and relevant
 950    * alignment properties
 951    *
 952    * @param al
 953    */
 954   protected String printHeaders(AlignmentI al)
 955   {
 956     StringBuilder sb = new StringBuilder(128);
 957     sb.append(MEGA_ID).append(newline);
 958     printProperty(al, sb, PROP_TITLE, TITLE);
 959     printProperty(al, sb, PROP_DESCRIPTION, DESCRIPTION);
 960
 961     /*
 962      * !Format DataType CodeTable
 963      */
 964     sb.append(BANG).append(FORMAT).append(newline);
 965     String dataType = (String) al.getProperty(PROP_DATATYPE);
 966     if (dataType == null)
 967     {
 968       dataType = al.isNucleotide() ? NUCLEOTIDE : PROTEIN;
 969     }
 970     sb.append(INDENT).append(DATATYPE).append(EQUALS).append(dataType);
 971     String codeTable = (String) al.getProperty(PROP_CODETABLE);
 972     sb.append(SPACE).append(CODETABLE).append(EQUALS)
 973             .append(codeTable == null ? "Standard" : codeTable)
 974             .append(newline);
 975
 976     /*
 977      * !Format NSeqs NSites
 978      * NSites the length of any sequence (they should all be the same), excluding
 979      * gaps?!?
 980      */
 981     sb.append(INDENT).append(N_SEQS).append(EQUALS).append(al.getHeight());
 982     SequenceI seq = al.getSequenceAt(0);
 983     sb.append(SPACE).append(N_SITES).append(EQUALS)
 984             .append(seq.getEnd() - seq.getStart() + 1);
 985     sb.append(newline);
 986
 987     /*
 988      * !Format Indel Identical Missing
 989      */
 990     sb.append(INDENT);
 991     sb.append(INDEL).append(EQUALS).append(al.getGapCharacter());
 992     String identity = (String) al.getProperty(PROP_IDENTITY);
 993     if (identity != null)
 994     {
 995       sb.append(SPACE).append(IDENTICAL).append(EQUALS).append(identity);
 996     }
 997     String missing = (String) al.getProperty(PROP_MISSING);
 998     if (missing != null)
 999     {
1000       sb.append(SPACE).append(MISSING).append(EQUALS).append(missing);
1001     }
1002     sb.append(SEMICOLON).append(newline);
1003
1004     return sb.toString();
1005   }
1006
1007   /**
1008    * Get the longest sequence id (to allow aligned printout).
1009    *
1010    * @param s
1011    * @return
1012    */
1013   protected static int getMaxIdLength(SequenceI[] s)
1014   {
1015     // TODO pull up for reuse
1016     int maxLength = 0;
1017     for (SequenceI seq : s)
1018     {
1019       int len = seq.getName().length();
1020       if (len > maxLength)
1021       {
1022         maxLength = len;
1023       }
1024     }
1025     return maxLength;
1026   }
1027
1028   /**
1029    * Get the longest sequence length
1030    *
1031    * @param s
1032    * @return
1033    */
1034   protected static int getMaxSequenceLength(SequenceI[] s)
1035   {
1036     // TODO pull up for reuse
1037     int maxLength = 0;
1038     for (SequenceI seq : s)
1039     {
1040       int len = seq.getLength();
1041       if (len > maxLength)
1042       {
1043         maxLength = len;
1044       }
1045     }
1046     return maxLength;
1047   }
1048
1049   /**
1050    * Print to string in noninterleaved format - all of each sequence in turn, in
1051    * blocks of 50 characters.
1052    *
1053    * @param s
1054    * @return
1055    */
1056   protected String printNonInterleaved(SequenceI[] s)
1057   {
1058     int maxSequenceLength = getMaxSequenceLength(s);
1059     // approx
1060     int numLines = maxSequenceLength / positionsPerLine + 2 + s.length;
1061
1062     /*
1063      * Roughly size a buffer to hold the whole output
1064      */
1065     StringBuilder sb = new StringBuilder(numLines * positionsPerLine);
1066
1067     int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
1068     int chunksPerLine = positionsPerLine / spaceEvery;
1069     for (SequenceI seq : s)
1070     {
1071       sb.append(newline);
1072       sb.append(HASHSIGN + seq.getName()).append(newline);
1073       int startPos = 0;
1074       while (startPos < seq.getLength())
1075       {
1076         boolean firstChunk = true;
1077         /*
1078          * print next line for this sequence
1079          */
1080         int lastPos = startPos + positionsPerLine; // exclusive
1081         for (int j = 0; j < chunksPerLine; j++)
1082         {
1083           char[] subSequence = seq.getSequence(startPos,
1084                   Math.min(lastPos, startPos + positionsPerLine));
1085           if (subSequence.length > 0)
1086           {
1087             if (!firstChunk)
1088             {
1089               sb.append(SPACE);
1090             }
1091             sb.append(subSequence);
1092             firstChunk = false;
1093           }
1094           startPos += subSequence.length;
1095         }
1096         sb.append(newline);
1097       }
1098     }
1099
1100     return new String(sb);
1101   }
1102
1103   /**
1104    * Flag this file as interleaved or not, based on data format. Throws an
1105    * exception if has previously been determined to be otherwise.
1106    *
1107    * @param isIt
1108    * @param dataLine
1109    * @throws IOException
1110    */
1111   protected void assertInterleaved(boolean isIt, String dataLine)
1112           throws FileFormatException
1113   {
1114     if (this.interleaved != null && isIt != this.interleaved.booleanValue())
1115     {
1116       throw new FileFormatException(
1117               "Parse error: mix of interleaved and noninterleaved detected, at line: "
1118                       + dataLine);
1119     }
1120     this.interleaved = new Boolean(isIt);
1121     setAlignmentProperty(PROP_INTERLEAVED, interleaved.toString());
1122   }
1123
1124   public boolean isInterleaved()
1125   {
1126     return this.interleaved == null ? false : this.interleaved
1127             .booleanValue();
1128   }
1129
1130   /**
1131    * Adds saved parsed values either as alignment properties, or (in some cases)
1132    * as specific member fields of the alignment
1133    */
1134   @Override
1135   public void addProperties(AlignmentI al)
1136   {
1137     super.addProperties(al);
1138     if (this.gapCharacter != null)
1139     {
1140       al.setGapCharacter(gapCharacter);
1141     }
1142
1143     /*
1144      * warn if e.g. DataType=DNA but data is protein (or vice versa)
1145      */
1146     if (this.nucleotide != null && this.nucleotide != al.isNucleotide()) {
1147       System.err.println("Warning: " + this.title + " declared "
1148               + (nucleotide ? "" : " not ") + "nucleotide but it is"
1149               + (nucleotide ? " not" : ""));
1150     }
1151   }
1152
1153   /**
1154    * Print the given alignment in MEGA format. If the alignment was created by
1155    * parsing a MEGA file, it should have properties set (e.g. Title) which can
1156    * influence the output.
1157    */
1158   @Override
1159   public String print(AlignmentI al)
1160   {
1161     this.nucleotide = al.isNucleotide();
1162     String lineLength = (String) al.getProperty(PROP_LINELENGTH);
1163     this.positionsPerLine = lineLength == null ? DEFAULT_LINE_LENGTH : Integer
1164             .parseInt(lineLength);
1165     return printHeaders(al) + print(al.getSequencesArray());
1166   }
1167
1168   /**
1169    * Helper method to append a property e.g. !Title to the output buffer, if the
1170    * property is set on the alignment.
1171    *
1172    * @param al
1173    * @param headers
1174    * @param propertyName
1175    * @param propertyKeyword
1176    */
1177   protected void printProperty(AlignmentI al, StringBuilder headers,
1178           String propertyName, String propertyKeyword)
1179   {
1180     String propertyValue = (String) al.getProperty(propertyName);
1181     if (propertyValue != null)
1182     {
1183       headers.append(BANG).append(propertyKeyword).append(SPACE)
1184               .append(propertyValue).append(SEMICOLON)
1185               .append(newline);
1186     }
1187   }
1188
1189   /**
1190    * Returns the number of sequence positions output per line
1191    *
1192    * @return
1193    */
1194   public int getPositionsPerLine()
1195   {
1196     return positionsPerLine;
1197   }
1198
1199   /**
1200    * Sets the number of sequence positions output per line. Note these will be
1201    * formatted in blocks of 3 (nucleotide) or 10 (peptide).
1202    *
1203    * @param p
1204    */
1205   public void setPositionsPerLine(int p)
1206   {
1207     this.positionsPerLine = p;
1208   }
1209 }