src/jalview/io/MegaFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.0b1)
   3  * Copyright (C) 2014 The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  10  *
  11  * Jalview is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty
  13  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  14  * PURPOSE.  See the GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  17  * The Jalview Authors are detailed in the 'AUTHORS' file.
  18  */
  19 package jalview.io;
  20
  21 import jalview.datamodel.AlignmentI;
  22 import jalview.datamodel.Sequence;
  23 import jalview.datamodel.SequenceFeature;
  24 import jalview.datamodel.SequenceI;
  25
  26 import java.io.IOException;
  27 import java.util.ArrayList;
  28 import java.util.HashMap;
  29 import java.util.Iterator;
  30 import java.util.LinkedHashMap;
  31 import java.util.List;
  32 import java.util.Map;
  33 import java.util.Map.Entry;
  34 import java.util.Set;
  35
  36 /**
  37  * A parser for input or output of MEGA format files. <br>
  38  * <br>
  39  * Tamura K, Stecher G, Peterson D, Filipski A, and Kumar S (2013) MEGA6:
  40  * Molecular Evolutionary Genetics Analysis Version 6.0. Molecular Biology and
  41  * Evolution 30: 2725-2729. <br>
  42  * <br>
  43  *
  44  * MEGA file format is supported as described in
  45  * http://www.megasoftware.net/manual.pdf <br>
  46  * Limitations:
  47  * <ul>
  48  * <li>nested comments (marked by [ ]) are accepted but not preserved</li>
  49  * <li>to be completed</li>
  50  * </ul>
  51  *
  52  * @see http://www.megasoftware.net/
  53  */
  54 public class MegaFile extends AlignFile
  55 {
  56   private static final String WHITESPACE = "\\s+";
  57
  58   private static final int DEFAULT_LINE_LENGTH = 60;
  59
  60   private static final String INDENT = "    ";
  61
  62   private static final String N_SITES = "NSites";
  63
  64   private static final String N_SEQS = "NSeqs";
  65
  66   private static final String MISSING = "Missing";
  67
  68   private static final String IDENTICAL = "Identical";
  69
  70   private static final String INDEL = "Indel";
  71
  72   private static final String CODETABLE = "CodeTable";
  73
  74   private static final String PROTEIN = "Protein";
  75
  76   private static final String NUCLEOTIDE = "Nucleotide";
  77
  78   private static final String DATATYPE = "DataType";
  79
  80   private static final char COMMENT_START = '[';
  81
  82   private static final char COMMENT_END = ']';
  83
  84   private static final String HASHSIGN = "#";
  85
  86   private static final String SEMICOLON = ";";
  87
  88   private static final String BANG = "!";
  89
  90   private static final String EQUALS = "=";
  91
  92   private static final String MEGA_ID = HASHSIGN + "MEGA";
  93
  94   private static final String TITLE = "Title";
  95
  96   private static final String FORMAT = "Format";
  97
  98   private static final String DESCRIPTION = "Description";
  99
 100   private static final String GENE = "Gene";
 101
 102   private static final String DOMAIN = "Domain";
 103
 104   private static final String PROPERTY = "Property";
 105
 106   private static final String CODONSTART = "CodonStart";
 107
 108   private static final String LABEL = "Label";
 109
 110   /*
 111    * names of properties to save to the alignment (may affect eventual output
 112    * format)
 113    */
 114   static final String PROP_TITLE = "MEGA_TITLE";
 115
 116   static final String PROP_INTERLEAVED = "MEGA_INTERLEAVED";
 117
 118   static final String PROP_DESCRIPTION = "MEGA_DESCRIPTION";
 119
 120   static final String PROP_CODETABLE = "MEGA_CODETABLE";
 121
 122   static final String PROP_IDENTITY = "MEGA_IDENTITY";
 123
 124   static final String PROP_MISSING = "MEGA_MISSING";
 125
 126   static final String PROP_DATATYPE = "MEGA_DATATYPE";
 127
 128   // number of bases per line of file (value is inferred)
 129   static final String PROP_LINELENGTH = "MEGA_LINELENGTH";
 130
 131   // TODO: need a controlled name for Gene as a feature if we want to be able to
 132   // output the MEGA file with !Gene headers
 133   // WTF do we do if the sequences get realigned?
 134
 135   // initial size for sequence data buffer
 136   private static final int SEQBUFFERSIZE = 256;
 137
 138   private static final String SPACE = " ";
 139
 140   /*
 141    * number of sequence positions output per line
 142    */
 143   private int positionsPerLine;
 144
 145   private String title;
 146
 147   // gap character may be explicitly declared, default is -
 148   private char gapCharacter = '-';
 149
 150   // identity character if declared
 151   private char identityCharacter = 0;
 152
 153   // this can be True, False or null (meaning not asserted in file)
 154   private Boolean nucleotide;
 155
 156   // set once we have seen one block of interleaved data
 157   private boolean firstDataBlockRead = false;
 158
 159   // this can be True, False or null (meaning we don't know yet)
 160   private Boolean interleaved;
 161
 162   // write end of line positions as a comment
 163   private boolean writePositionNumbers = true;
 164
 165   // id of sequence being processed
 166   private String currentSequenceId;
 167
 168   /*
 169    * Temporary store of {sequenceId, positionData} while parsing interleaved
 170    * sequences; sequences are maintained in the order in which they are added
 171    * i.e. read in the file
 172    */
 173   Map<String, StringBuilder> seqData;
 174
 175   // number of residues read (so far) per sequence
 176   Map<String, Integer> residuesRead;
 177
 178   // current Gene if any we are parsing
 179   private String currentGene;
 180
 181   // start residue (base 1) per sequence of current gene
 182   Map<String, Integer> geneStart;
 183
 184   // current Domain if any we are parsing
 185   private String currentDomain;
 186
 187   // start residue (base 1) per sequence of current domain
 188   Map<String, Integer> domainStart;
 189
 190   // map of SequenceFeature's by sequence id
 191   Map<String, List<SequenceFeature>> sequenceFeatures;
 192
 193   public MegaFile()
 194   {
 195   }
 196
 197   public MegaFile(String inFile, String type) throws IOException
 198   {
 199     super(inFile, type);
 200   }
 201
 202   public MegaFile(FileParse source) throws IOException
 203   {
 204     super(source);
 205   }
 206
 207   /**
 208    * Parse the input stream.
 209    */
 210   @Override
 211   public void parse() throws IOException
 212   {
 213     gapCharacter = '-';
 214     sequenceFeatures = new HashMap<String, List<SequenceFeature>>();
 215     geneStart = new HashMap<String, Integer>();
 216     domainStart = new HashMap<String, Integer>();
 217     residuesRead = new HashMap<String, Integer>();
 218
 219     /*
 220      * Read and process MEGA and Title/Format/Description headers if present.
 221      * Returns the first data line following the headers.
 222      */
 223     String dataLine = parseHeaderLines();
 224
 225     /*
 226      * order-preserving map to hold sequences by id as they are built up during
 227      * parsing
 228      */
 229     seqData = new LinkedHashMap<String, StringBuilder>();
 230
 231     /*
 232      * The id of the sequence being read (for non-interleaved)
 233      */
 234     currentSequenceId = "";
 235
 236     while (dataLine != null)
 237     {
 238       dataLine = dataLine.trim();
 239       if (dataLine.length() > 0)
 240       {
 241         if (dataLine.startsWith(BANG + GENE)
 242                 || dataLine.startsWith(BANG + DOMAIN))
 243         {
 244           parseGeneOrDomain(dataLine);
 245         }
 246         else if (dataLine.startsWith(BANG + LABEL))
 247         {
 248           parseLabel(dataLine);
 249         }
 250         else
 251         {
 252           currentSequenceId = parseDataLine(dataLine);
 253         }
 254       }
 255       else if (!seqData.isEmpty())
 256       {
 257         /*
 258          * Blank line after processing some data...
 259          */
 260         endOfDataBlock();
 261       }
 262       dataLine = nextNonCommentLine();
 263     }
 264
 265     /*
 266      * close off any features currently being parsed
 267      */
 268     createFeature(GENE, currentGene, geneStart);
 269     createFeature(DOMAIN, currentDomain, domainStart);
 270
 271     // remember the (longest) line length read in, so we can output the same
 272     setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
 273
 274     deriveSequences();
 275   }
 276
 277   /**
 278    * Parse a !Label
 279    *
 280    * @param dataLine
 281    */
 282   protected void parseLabel(String dataLine)
 283   {
 284     // TODO Auto-generated method stub
 285
 286   }
 287
 288   /**
 289    * Post-processing after reading one block of interleaved data
 290    */
 291   protected void endOfDataBlock()
 292   {
 293     this.firstDataBlockRead = true;
 294     // TODO:
 295     // (initialise and) populate arrays of sequence length so far (excluding
 296     // gaps)
 297     // On change or end of a denoted Gene or Domain, add sequence features for
 298     // it
 299   }
 300
 301   /**
 302    * Parse a !Gene or !Domain command line. MEGA accepts
 303    * <ul>
 304    * <li>!Gene=name;</li>
 305    * <li>!Gene=name Property=Coding/Noncoding CodonStart=1/2/3;</li>
 306    * <li>!Gene=genename Domain=domainname Property= etc</li>
 307    * <li>!Domain=domainname Gene=genename Property= etc</li>
 308    * <li>!Domain=domainname Property= etc</li>
 309    * <li>!domain=domainname property=domainend</li>
 310    * </ul>
 311    * Properly, a Gene should be composed of Domain segments, but MEGA accepts
 312    * without. Note that keywords don't seem to be case sensitive.
 313    *
 314    * @param dataLine
 315    * @throws FileFormatException
 316    */
 317   protected void parseGeneOrDomain(String dataLine)
 318           throws FileFormatException
 319   {
 320     String domain = null;
 321     String gene = null;
 322     String property = null;
 323     String codonStart = null;
 324     String errorMsg = "Unrecognized format: " + dataLine;
 325
 326     if (!dataLine.startsWith(BANG) || !dataLine.endsWith(SEMICOLON))
 327     {
 328       throw new FileFormatException(errorMsg);
 329     }
 330     String trimmed = dataLine.substring(1, dataLine.length() - 1).trim();
 331     String[] tokens = trimmed.split(WHITESPACE);
 332     for (String token : tokens)
 333     {
 334       String[] keyValue = token.split("=");
 335       if (keyValue.length != 2)
 336       {
 337         throw new FileFormatException(errorMsg);
 338       }
 339       String key = keyValue[0];
 340       if (GENE.equalsIgnoreCase(key))
 341       {
 342         gene = keyValue[1];
 343       }
 344       else if (DOMAIN.equalsIgnoreCase(key))
 345       {
 346         domain = keyValue[1];
 347       }
 348       else if (PROPERTY.equalsIgnoreCase(key))
 349       {
 350         property = keyValue[1];
 351       }
 352       else if (CODONSTART.equalsIgnoreCase(key))
 353       {
 354         codonStart = keyValue[1];
 355       }
 356       else
 357       {
 358         System.err.println("Unrecognised token: '" + key + "; in "
 359                 + dataLine);
 360       }
 361     }
 362
 363     processGeneOrDomain(gene, domain, property, codonStart);
 364   }
 365
 366   /**
 367    * Process a statement containing one or both of Gene and Domain, and
 368    * optionally Property or CodonStart commands.
 369    *
 370    * @param gene
 371    *          the Gene name if specified, else null
 372    * @param domain
 373    *          the Domain name if specified, else null
 374    * @param property
 375    *          the Property value if specified, else null
 376    * @param codonStart
 377    *          the CodonStart value if specified, else null
 378    */
 379   protected void processGeneOrDomain(String gene, String domain,
 380           String property, String codonStart)
 381   {
 382     /*
 383      * the order of processing below ensures that we correctly capture where a
 384      * domain is in the context of an enclosing gene
 385      */
 386     processDomainEnd(domain, property);
 387
 388     processGeneEnd(gene);
 389
 390     processGeneStart(gene);
 391
 392     processDomainStart(domain, property);
 393
 394     // TODO save codonStart if we plan to involve it in 'translate as cDNA'
 395   }
 396
 397   /**
 398    * If we have declared a domain, and it is not continuing, start a sequence
 399    * feature for it
 400    *
 401    * @param domain
 402    * @param property
 403    */
 404   protected void processDomainStart(String domain, String property)
 405   {
 406     if ("domainend".equalsIgnoreCase(property))
 407     {
 408       currentDomain = null;
 409       return;
 410     }
 411
 412     if (domain != null && !domain.equals(currentDomain))
 413     {
 414       String verboseDomain = makeVerboseDomainName(domain, property);
 415       startSequenceFeature(domainStart);
 416
 417       currentDomain = verboseDomain;
 418     }
 419   }
 420
 421   /**
 422    * If we have declared a gene, and it is not continuing, start a sequence
 423    * feature for it
 424    *
 425    * @param gene
 426    */
 427   protected void processGeneStart(String gene)
 428   {
 429     if (gene != null && !gene.equals(currentGene))
 430     {
 431       startSequenceFeature(geneStart);
 432     }
 433     currentGene = gene;
 434   }
 435
 436   /**
 437    * If we have been processing a domain, and it is not being continued, then
 438    * make a sequence feature for the domain just ended
 439    *
 440    * @param domain
 441    * @param property
 442    * @return true if a feature is created, else false
 443    */
 444   protected boolean processDomainEnd(String domain, String property)
 445   {
 446     String verboseDomain = makeVerboseDomainName(domain, property);
 447     if (this.currentDomain != null)
 448     {
 449       boolean domainEnded = "domainend".equalsIgnoreCase(property);
 450       if (!this.currentDomain.equals(verboseDomain) || domainEnded)
 451       {
 452         createFeature(DOMAIN, currentDomain, domainStart);
 453         return true;
 454       }
 455     }
 456     return false;
 457   }
 458
 459   /**
 460    * If we have been processing a gene, and it is not being continued, then make
 461    * a sequence feature for the gene just ended
 462    *
 463    * @param gene
 464    * @return true if a feature is created, else false
 465    */
 466   protected boolean processGeneEnd(String gene)
 467   {
 468     boolean created = false;
 469     /*
 470      * If we were processing a gene and now have either another, or none, create
 471      * a sequence feature for that gene
 472      */
 473     if (this.currentGene != null && !this.currentGene.equals(gene))
 474     {
 475       createFeature(GENE, currentGene, geneStart);
 476       created = true;
 477     }
 478
 479     return created;
 480   }
 481
 482   /**
 483    * Makes an expanded descriptive name for Domain if possible e.g.
 484    * "Intron1 (Adh Coding)". Currently incorporates the current gene name (if
 485    * any) and the Coding/Noncoding property value (if given).
 486    *
 487    * @param domain
 488    * @param property
 489    * @return
 490    */
 491   protected String makeVerboseDomainName(String domain, String property)
 492   {
 493     String verboseDomain = domain;
 494     if (domain != null)
 495     {
 496       String coding = "";
 497       if ("Exon".equalsIgnoreCase(property)
 498               || "Coding".equalsIgnoreCase(property))
 499       {
 500         coding = " Coding";
 501       }
 502       else if ("Intron".equalsIgnoreCase(property)
 503               || "Noncoding".equalsIgnoreCase(property))
 504       {
 505         coding = " Noncoding";
 506       }
 507       verboseDomain = domain
 508               + (currentGene == null ? "" : " (" + currentGene + coding
 509                       + ")");
 510     }
 511     return verboseDomain;
 512   }
 513
 514   /**
 515    * Start processing a new feature
 516    *
 517    * @param startPositions
 518    */
 519   protected void startSequenceFeature(Map<String, Integer> startPositions)
 520   {
 521     /*
 522      * If the feature declaration precedes all sequences, we will know in
 523      * createFeature that it started with residue 1; otherwise note now where it
 524      * starts in each sequence
 525      */
 526     if (!residuesRead.isEmpty())
 527     {
 528       for (Entry<String, Integer> entry : residuesRead.entrySet())
 529       {
 530         String seqId = entry.getKey();
 531         Integer nextResidue = entry.getValue() + 1;
 532         startPositions.put(seqId, nextResidue);
 533       }
 534     }
 535   }
 536
 537   /**
 538    * Add a SequenceFeature to each sequence, using the given start/end values
 539    * per sequence
 540    *
 541    * @param featureType
 542    * @param featureValue
 543    * @param featureStartResidues
 544    */
 545   protected void createFeature(String featureType, String featureValue,
 546           Map<String, Integer> featureStartResidues)
 547   {
 548     if (featureValue == null)
 549     {
 550       return;
 551     }
 552
 553     Iterator<String> seqids = this.seqData.keySet().iterator();
 554     while (seqids.hasNext())
 555     {
 556       String seqid = seqids.next();
 557       Integer startAt = featureStartResidues.get(seqid);
 558       int sfstart = startAt == null ? 1 : startAt.intValue();
 559       int sfend = residuesRead.get(seqid);
 560       if (sfend >= sfstart)
 561       {
 562         /*
 563          * don't add feature if entirely gapped in the sequence
 564          */
 565         // TODO: type="Gene" (but then all coloured the same) or
 566         // type="GeneName"?
 567         SequenceFeature sf = new SequenceFeature(featureValue, featureType,
 568                 sfstart, sfend, 0f, null);
 569         sequenceFeatures.get(seqid).add(sf);
 570       }
 571     }
 572   }
 573
 574   /**
 575    * Returns the next line that is not a comment, or null at end of file.
 576    * Comments in MEGA are within [ ] brackets, and may be nested.
 577    *
 578    * @return
 579    * @throws IOException
 580    */
 581   protected String nextNonCommentLine() throws IOException
 582   {
 583     return nextNonCommentLine(0);
 584   }
 585
 586   /**
 587    * Returns the next non-comment line (or part line), or null at end of file.
 588    * Comments in MEGA are within [ ] brackets, and may be nested. They may occur
 589    * anywhere within a line (for example at the end with position numbers); this
 590    * method returns the line with any comments removed.
 591    *
 592    * @param depth
 593    *          current depth of nesting of comments while parsing
 594    * @return
 595    * @throws IOException
 596    */
 597   protected String nextNonCommentLine(final int depth) throws IOException
 598   {
 599     String data = null;
 600     data = nextLine();
 601     if (data == null)
 602     {
 603       if (depth > 0)
 604       {
 605         System.err.println("Warning: unterminated comment in data file");
 606       }
 607       return data;
 608     }
 609
 610     /*
 611      * If we are in a (possibly nested) comment after parsing this line, keep
 612      * reading recursively until the comment has unwound
 613      */
 614     int newDepth = commentDepth(data, depth);
 615     if (newDepth > 0)
 616     {
 617       return nextNonCommentLine(newDepth);
 618     }
 619     else
 620     {
 621       /*
 622        * not in a comment by end of this line; return what is left
 623        */
 624       String nonCommentPart = getNonCommentContent(data, depth);
 625       return nonCommentPart;
 626     }
 627   }
 628
 629   /**
 630    * Returns what is left of the input data after removing any comments, whether
 631    * 'in progress' from preceding lines, or embedded in the current line
 632    *
 633    * @param data
 634    *          input data
 635    * @param depth
 636    *          nested depth of comments pending termination
 637    * @return
 638    * @throws FileFormatException
 639    */
 640   protected static String getNonCommentContent(String data, int depth)
 641           throws FileFormatException
 642   {
 643     int len = data.length();
 644     StringBuilder result = new StringBuilder(len);
 645     for (int i = 0; i < len; i++)
 646     {
 647       char c = data.charAt(i);
 648       switch (c)
 649       {
 650       case COMMENT_START:
 651         depth++;
 652         break;
 653
 654       case COMMENT_END:
 655         if (depth > 0)
 656         {
 657           depth--;
 658         }
 659         else
 660         {
 661           result.append(c);
 662         }
 663         break;
 664
 665       default:
 666         if (depth == 0)
 667         {
 668           result.append(c);
 669         }
 670       }
 671     }
 672     return result.toString();
 673   }
 674
 675   /**
 676    * Calculates new depth of comment after parsing an input line i.e. the excess
 677    * of opening '[' over closing ']' characters. Any excess ']' are ignored (not
 678    * treated as comment delimiters).
 679    *
 680    * @param data
 681    *          input line
 682    * @param depth
 683    *          current comment nested depth before parsing the line
 684    * @return new depth after parsing the line
 685    */
 686   protected static int commentDepth(CharSequence data, int depth)
 687   {
 688     int newDepth = depth;
 689     int len = data.length();
 690     for (int i = 0; i < len; i++)
 691     {
 692       char c = data.charAt(i);
 693       if (c == COMMENT_START)
 694       {
 695         newDepth++;
 696       }
 697       else if (c == COMMENT_END && newDepth > 0)
 698       {
 699         newDepth--;
 700       }
 701     }
 702     return newDepth;
 703   }
 704
 705   /**
 706    * Convert the parsed sequence strings to objects and store them in the model.
 707    */
 708   protected void deriveSequences()
 709   {
 710     Set<Entry<String, StringBuilder>> datasets = seqData.entrySet();
 711
 712     for (Entry<String, StringBuilder> dataset : datasets)
 713     {
 714       String sequenceId = dataset.getKey();
 715       StringBuilder characters = dataset.getValue();
 716       SequenceI s = new Sequence(sequenceId, new String(characters));
 717       this.seqs.addElement(s);
 718
 719       /*
 720        * and add any derived sequence features to the sequence
 721        */
 722       for (SequenceFeature sf : sequenceFeatures.get(sequenceId))
 723       {
 724         s.addSequenceFeature(sf);
 725       }
 726     }
 727   }
 728
 729   /**
 730    * Process one line of sequence data. If it has no sequence identifier, append
 731    * to the current id's sequence. Else parse out the sequence id and append the
 732    * data (if any) to that id's sequence. Returns the sequence id (implicit or
 733    * explicit) for this line.
 734    *
 735    * @param dataLine
 736    * @return
 737    * @throws IOException
 738    */
 739   protected String parseDataLine(String dataLine)
 740           throws IOException
 741   {
 742     String seqId = getSequenceId(dataLine);
 743     if (seqId == null)
 744     {
 745       /*
 746        * Just character data
 747        */
 748       parseNoninterleavedDataLine(dataLine);
 749       return currentSequenceId;
 750     }
 751     else if ((HASHSIGN + seqId).trim().equals(dataLine.trim()))
 752     {
 753       /*
 754        * Sequence id only - header line for noninterleaved data
 755        */
 756       return seqId;
 757     }
 758     else
 759     {
 760       /*
 761        * Sequence id followed by data
 762        */
 763       parseInterleavedDataLine(dataLine, seqId);
 764       return seqId;
 765     }
 766   }
 767
 768   /**
 769    * Add a line of sequence data to the buffer for the given sequence id. Start
 770    * a new one if we haven't seen it before.
 771    *
 772    * @param dataLine
 773    * @throws IOException
 774    */
 775   protected void parseNoninterleavedDataLine(String dataLine)
 776           throws IOException
 777   {
 778     if (currentSequenceId == null)
 779     {
 780       /*
 781        * Oops. Data but no sequence id context.
 782        */
 783       throw new IOException("No sequence id context at: " + dataLine);
 784     }
 785
 786     assertInterleaved(false, dataLine);
 787
 788     dataLine = addSequenceData(currentSequenceId, dataLine);
 789
 790     setPositionsPerLine(Math.max(positionsPerLine, dataLine.length()));
 791   }
 792
 793   /**
 794    * Get the sequence data for this sequence id, starting a new one if
 795    * necessary.
 796    *
 797    * @param currentId
 798    * @return
 799    */
 800   protected StringBuilder getSequenceDataBuffer(String currentId)
 801   {
 802     StringBuilder sb = seqData.get(currentId);
 803     if (sb == null)
 804     {
 805       // first data met for this sequence id, start a new buffer
 806       sb = new StringBuilder(SEQBUFFERSIZE);
 807       seqData.put(currentId, sb);
 808
 809       // and a placeholder for any SequenceFeature found
 810       sequenceFeatures.put(currentId, new ArrayList<SequenceFeature>());
 811     }
 812     return sb;
 813   }
 814
 815   /**
 816    * Parse one line of interleaved data e.g.
 817    *
 818    * <pre>
 819    * #TheSeqId CGATCGCATGCA
 820    * </pre>
 821    *
 822    * @param dataLine
 823    * @param seqId
 824    * @throws FileFormatException
 825    */
 826   protected void parseInterleavedDataLine(String dataLine, String seqId)
 827           throws FileFormatException
 828   {
 829     /*
 830      * New sequence found in second or later data block - error.
 831      */
 832     if (this.firstDataBlockRead && !seqData.containsKey(seqId))
 833     {
 834       throw new FileFormatException(
 835               "Parse error: misplaced new sequence starting at " + dataLine);
 836     }
 837
 838     String data = dataLine.substring(seqId.length() + 1).trim();
 839
 840     /*
 841      * Do nothing if this line is _only_ a sequence id with no data following.
 842      */
 843     if (data != null && data.length() > 0)
 844     {
 845       data = addSequenceData(seqId, data);
 846       setPositionsPerLine(Math.max(positionsPerLine, data.length()));
 847       assertInterleaved(true, dataLine);
 848     }
 849   }
 850
 851   /**
 852    * Remove spaces, and replace identity symbol, before appending the sequence
 853    * data to the buffer for the sequence id. Returns the reformatted added data.
 854    * Also updates a count of residues read for the sequence.
 855    *
 856    * @param seqId
 857    * @param data
 858    * @return
 859    */
 860   protected String addSequenceData(String seqId, String data)
 861   {
 862     StringBuilder sb = getSequenceDataBuffer(seqId);
 863     int len = sb.length();
 864     String formatted = data.replace(SPACE, "");
 865
 866     /*
 867      * If sequence contains '.' or other identity symbol; replace these with the
 868      * same position from the first (reference) sequence
 869      */
 870     int nonGapped = 0;
 871     StringBuilder referenceSequence = seqData.values().iterator().next();
 872     StringBuilder sb1 = new StringBuilder(formatted.length());
 873     for (int i = 0; i < formatted.length(); i++)
 874     {
 875       char nextChar = formatted.charAt(i);
 876       if (nextChar != gapCharacter)
 877       {
 878         nonGapped++;
 879       }
 880       if (nextChar == identityCharacter
 881               && len + i < referenceSequence.length())
 882       {
 883         sb1.append(referenceSequence.charAt(len + i));
 884       }
 885       else
 886       {
 887         sb1.append(nextChar);
 888       }
 889     }
 890     formatted = sb1.toString();
 891
 892     data = formatted;
 893     sb.append(data);
 894
 895     /*
 896      * increment residue count for the sequence
 897      */
 898     if (nonGapped > 0)
 899     {
 900       Integer residueCount = residuesRead.get(seqId);
 901       residuesRead.put(seqId, nonGapped
 902               + (residueCount == null ? 0 : residueCount));
 903     }
 904
 905     return data;
 906   }
 907
 908   /**
 909    * If the line begins with (e.g.) "#abcde " then returns "abcde" as the
 910    * identifier. Else returns null.
 911    *
 912    * @param dataLine
 913    * @return
 914    */
 915   public static String getSequenceId(String dataLine)
 916   {
 917     // TODO refactor to a StringUtils type class
 918     if (dataLine != null)
 919     {
 920       if (dataLine.startsWith(HASHSIGN))
 921       {
 922         int spacePos = dataLine.indexOf(" ");
 923         return (spacePos == -1 ? dataLine.substring(1) : dataLine
 924                 .substring(1, spacePos));
 925       }
 926     }
 927     return null;
 928   }
 929
 930   /**
 931    * Read the #MEGA and Title/Format/Description header lines (if present).
 932    *
 933    * Save as alignment properties in case useful.
 934    *
 935    * @return the next non-blank line following the header lines.
 936    * @throws IOException
 937    */
 938   protected String parseHeaderLines() throws IOException
 939   {
 940     String inputLine = null;
 941     while ((inputLine = nextNonCommentLine()) != null)
 942     {
 943       inputLine = inputLine.trim();
 944
 945       /*
 946        * skip blank lines
 947        */
 948       if (inputLine.length() == 0)
 949       {
 950         continue;
 951       }
 952
 953       if (inputLine.toUpperCase().startsWith(MEGA_ID))
 954       {
 955         continue;
 956       }
 957
 958       if (isTitle(inputLine))
 959       {
 960         this.title = getValue(inputLine);
 961         setAlignmentProperty(PROP_TITLE, title);
 962       }
 963       else if (inputLine.startsWith(BANG + DESCRIPTION))
 964       {
 965         parseDescription(inputLine);
 966       }
 967
 968       else if (inputLine.startsWith(BANG + FORMAT))
 969       {
 970         parseFormat(inputLine);
 971       }
 972       else if (!inputLine.toUpperCase().startsWith(MEGA_ID))
 973       {
 974
 975         /*
 976          * Return the first 'data line' i.e. one that is not blank, #MEGA or
 977          * TITLE:
 978          */
 979         break;
 980       }
 981     }
 982     return inputLine;
 983   }
 984
 985   /**
 986    * Parse a !Format statement. This may be multiline, and is ended by a
 987    * semicolon.
 988    *
 989    * @param inputLine
 990    * @throws IOException
 991    */
 992   protected void parseFormat(String inputLine) throws IOException
 993   {
 994     while (inputLine != null)
 995     {
 996       parseFormatLine(inputLine);
 997       if (inputLine.endsWith(SEMICOLON))
 998       {
 999         break;
1000       }
1001       inputLine = nextNonCommentLine();
1002     }
1003   }
1004
1005   /**
1006    * Parse one line of a !Format statement. This may contain one or more
1007    * keyword=value pairs.
1008    *
1009    * @param inputLine
1010    * @throws FileFormatException
1011    */
1012   protected void parseFormatLine(String inputLine)
1013           throws FileFormatException
1014   {
1015     if (inputLine.startsWith(BANG + FORMAT))
1016     {
1017       inputLine = inputLine.substring((BANG + FORMAT).length());
1018     }
1019     if (inputLine.endsWith(SEMICOLON))
1020     {
1021       inputLine = inputLine.substring(0, inputLine.length() - 1);
1022     }
1023     if (inputLine.length() == 0)
1024     {
1025       return;
1026     }
1027     String[] tokens = inputLine.trim().split(WHITESPACE);
1028     for (String token : tokens)
1029     {
1030       parseFormatKeyword(token);
1031     }
1032   }
1033
1034   /**
1035    * Parse a Keyword=Value token. Possible keywords are
1036    * <ul>
1037    * <li>DataType= DNA, RNA, Nucleotide, Protein</li>
1038    * <li>DataFormat= Interleaved, ?</li>
1039    * <li>NSeqs= number of sequences (synonym NTaxa)</li>
1040    * <li>NSites= number of bases / residues</li>
1041    * <li>Property= Exon (or Coding), Intron (or Noncoding), End (of domain)</li>
1042    * <li>Indel= gap character</li>
1043    * <li>Identical= identity character (to first sequence) (synonym MatchChar)</li>
1044    * <li>Missing= missing data character</li>
1045    * <li>CodeTable= Standard, other (MEGA supports various)</li>
1046    * </ul>
1047    *
1048    * @param token
1049    * @throws FileFormatException
1050    *           if an unrecognised keyword or value is encountered
1051    */
1052   protected void parseFormatKeyword(String token)
1053           throws FileFormatException
1054   {
1055     String msg = "Unrecognised Format command: " + token;
1056     String[] bits = token.split(EQUALS);
1057     if (bits.length != 2)
1058     {
1059       throw new FileFormatException(msg);
1060     }
1061     String keyword = bits[0];
1062     String value = bits[1];
1063
1064     /*
1065      * Jalview will work out whether nucleotide or not anyway
1066      */
1067     if (keyword.equalsIgnoreCase(DATATYPE))
1068     {
1069       if (value.equalsIgnoreCase("DNA") || value.equalsIgnoreCase("RNA")
1070               || value.equalsIgnoreCase("Nucleotide"))
1071       {
1072         this.nucleotide = true;
1073         // alignment computes whether or not it is nucleotide when created
1074       }
1075       else if (value.equalsIgnoreCase(PROTEIN))
1076       {
1077         this.nucleotide = false;
1078       }
1079       else
1080       {
1081         throw new FileFormatException(msg);
1082       }
1083       setAlignmentProperty(PROP_DATATYPE, value);
1084     }
1085
1086     /*
1087      * accept non-Standard code table but save in case we want to disable
1088      * 'translate as cDNA'
1089      */
1090     else if (keyword.equalsIgnoreCase(CODETABLE))
1091     {
1092       setAlignmentProperty(PROP_CODETABLE, value);
1093     }
1094
1095     /*
1096      * save gap char to set later on alignment once created
1097      */
1098     else if (keyword.equalsIgnoreCase(INDEL))
1099     {
1100       this.gapCharacter = value.charAt(0);
1101     }
1102
1103     else if (keyword.equalsIgnoreCase(IDENTICAL)
1104             || keyword.equalsIgnoreCase("MatchChar"))
1105     {
1106       setAlignmentProperty(PROP_IDENTITY, value);
1107       this.identityCharacter = value.charAt(0);
1108       if (!".".equals(value))
1109       {
1110         System.err.println("Warning: " + token
1111                 + " not supported, Jalview uses '.' for identity");
1112       }
1113     }
1114
1115     else if (keyword.equalsIgnoreCase(MISSING))
1116     {
1117       setAlignmentProperty(PROP_MISSING, value);
1118       System.err.println("Warning: " + token + " not supported");
1119     }
1120
1121     else if (keyword.equalsIgnoreCase(PROPERTY))
1122     {
1123       // TODO: can Property appear in a Format command?
1124       // suspect this is a mistake in the manual
1125     }
1126
1127     else if (!keyword.equalsIgnoreCase(N_SEQS)
1128             && !keyword.equalsIgnoreCase("NTaxa")
1129             && !keyword.equalsIgnoreCase(N_SITES))
1130     {
1131       System.err.println("Warning: " + msg);
1132     }
1133   }
1134
1135   /**
1136    * Returns the trimmed data on the line following either whitespace or '=',
1137    * with any trailing semi-colon removed<br>
1138    * So
1139    * <ul>
1140    * <li>Hello World</li>
1141    * <li>!Hello: \tWorld;</li>
1142    * <li>!Hello=World</li>
1143    * <ul>
1144    * should all return "World"
1145    *
1146    * @param inputLine
1147    * @return
1148    */
1149   protected static String getValue(String inputLine)
1150   {
1151     if (inputLine == null)
1152     {
1153       return null;
1154     }
1155     String value = null;
1156     String s = inputLine.replaceAll("\t", " ").trim();
1157
1158     /*
1159      * KEYWORD = VALUE should return VALUE
1160      */
1161     int equalsPos = s.indexOf("=");
1162     if (equalsPos >= 0)
1163     {
1164       value = s.substring(equalsPos + 1);
1165     }
1166     else
1167     {
1168       int spacePos = s.indexOf(' ');
1169       value = spacePos == -1 ? "" : s.substring(spacePos + 1);
1170     }
1171     value = value.trim();
1172     if (value.endsWith(SEMICOLON))
1173     {
1174       value = value.substring(0, value.length() - 1).trim();
1175     }
1176     return value;
1177   }
1178
1179   /**
1180    * Returns true if the input line starts with "TITLE" or "!TITLE" (not case
1181    * sensitive). The latter is the official format, some older data file
1182    * examples have it without the !.
1183    *
1184    * @param inputLine
1185    * @return
1186    */
1187   protected static boolean isTitle(String inputLine)
1188   {
1189     if (inputLine == null)
1190     {
1191       return false;
1192     }
1193     String upper = inputLine.toUpperCase();
1194     return (upper.startsWith(TITLE.toUpperCase()) || upper.startsWith(BANG
1195             + TITLE.toUpperCase()));
1196   }
1197
1198   /**
1199    * Reads lines until terminated by semicolon, appending each to the
1200    * Description property value.
1201    *
1202    * @throws IOException
1203    */
1204   protected void parseDescription(String firstDescriptionLine)
1205           throws IOException
1206   {
1207     StringBuilder desc = new StringBuilder(256);
1208     desc.append(getValue(firstDescriptionLine));
1209     if (!firstDescriptionLine.endsWith(SEMICOLON))
1210     {
1211       String line = nextNonCommentLine();
1212       while (line != null)
1213       {
1214         if (line.endsWith(SEMICOLON))
1215         {
1216           desc.append(line.substring(0, line.length() - 1));
1217           break;
1218         }
1219         else if (line.length() > 0)
1220         {
1221           desc.append(line).append(newline);
1222         }
1223         line = nextNonCommentLine();
1224       }
1225     }
1226     setAlignmentProperty(PROP_DESCRIPTION, desc.toString());
1227   }
1228
1229   /**
1230    * Returns the alignment sequences in Mega format.
1231    */
1232   @Override
1233   public String print()
1234   {
1235     return MEGA_ID + newline + print(getSeqsAsArray());
1236   }
1237
1238   /**
1239    * Write out the alignment sequences in Mega format - interleaved unless
1240    * explicitly noninterleaved.
1241    */
1242   protected String print(SequenceI[] s)
1243   {
1244     String result;
1245     if (this.interleaved != null && !this.interleaved)
1246     {
1247       result = printNonInterleaved(s);
1248     }
1249     else
1250     {
1251       result = printInterleaved(s);
1252     }
1253     return result;
1254   }
1255
1256   /**
1257    * Print to string in Interleaved format - blocks of next N characters of each
1258    * sequence in turn.
1259    *
1260    * @param s
1261    */
1262   protected String printInterleaved(SequenceI[] s)
1263   {
1264     int maxIdLength = getMaxIdLength(s);
1265     int maxSequenceLength = getMaxSequenceLength(s);
1266     int numLines = maxSequenceLength / positionsPerLine + 3; // approx
1267
1268     int numDataBlocks = (maxSequenceLength - 1) / positionsPerLine + 1;
1269     int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
1270     int chunksPerLine = (positionsPerLine + spaceEvery - 1) / spaceEvery;
1271
1272     /*
1273      * Roughly size a buffer to hold the whole output
1274      */
1275     StringBuilder sb = new StringBuilder(numLines
1276             * (maxIdLength + positionsPerLine + chunksPerLine + 10));
1277
1278     /*
1279      * Output as: #Seqid CGT AGC ACT ... or blocks of 10 for peptide
1280      */
1281     int from = 0;
1282     for (int i = 0; i < numDataBlocks; i++)
1283     {
1284       sb.append(newline);
1285       boolean first = true;
1286       int advancedBy = 0;
1287       for (SequenceI seq : s)
1288       {
1289         int seqFrom = from;
1290         String seqId = String.format("#%-" + maxIdLength + "s",
1291                 seq.getName());
1292
1293         /*
1294          * output next line for this sequence
1295          */
1296         sb.append(seqId);
1297         int lastPos = seqFrom + positionsPerLine; // exclusive
1298         for (int j = 0; j < chunksPerLine; j++)
1299         {
1300           char[] subSequence = seq.getSequence(seqFrom,
1301                   Math.min(lastPos, seqFrom + spaceEvery));
1302           if (subSequence.length > 0)
1303           {
1304             sb.append(SPACE).append(subSequence);
1305           }
1306           seqFrom += subSequence.length;
1307           if (first)
1308           {
1309             // all sequences should be the same length in MEGA
1310             advancedBy += subSequence.length;
1311           }
1312         }
1313         // write last position as a comment
1314         if (writePositionNumbers)
1315         {
1316           sb.append(SPACE).append(COMMENT_START).append(from + advancedBy)
1317                   .append(COMMENT_END);
1318         }
1319         sb.append(newline);
1320         first = false;
1321       }
1322       from += advancedBy;
1323     }
1324
1325     return new String(sb);
1326   }
1327
1328   /**
1329    * Outputs to string the MEGA header and any other known and relevant
1330    * alignment properties
1331    *
1332    * @param al
1333    */
1334   protected String printHeaders(AlignmentI al)
1335   {
1336     StringBuilder sb = new StringBuilder(128);
1337     sb.append(MEGA_ID).append(newline);
1338     String propertyValue = (String) al.getProperty(PROP_TITLE);
1339     if (propertyValue != null)
1340     {
1341       sb.append(BANG).append(TITLE).append(SPACE).append(propertyValue)
1342               .append(SEMICOLON).append(newline);
1343     }
1344     propertyValue = (String) al.getProperty(PROP_DESCRIPTION);
1345     if (propertyValue != null)
1346     {
1347       sb.append(BANG).append(DESCRIPTION).append(newline)
1348               .append(propertyValue).append(SEMICOLON)
1349               .append(newline);
1350     }
1351
1352     /*
1353      * !Format DataType CodeTable
1354      */
1355     sb.append(BANG).append(FORMAT).append(newline);
1356     String dataType = (String) al.getProperty(PROP_DATATYPE);
1357     if (dataType == null)
1358     {
1359       dataType = al.isNucleotide() ? NUCLEOTIDE : PROTEIN;
1360     }
1361     sb.append(INDENT).append(DATATYPE).append(EQUALS).append(dataType);
1362     String codeTable = (String) al.getProperty(PROP_CODETABLE);
1363     sb.append(SPACE).append(CODETABLE).append(EQUALS)
1364             .append(codeTable == null ? "Standard" : codeTable)
1365             .append(newline);
1366
1367     /*
1368      * !Format NSeqs NSites (the length of sequences - they should all be the
1369      * same - including gaps)
1370      */
1371     sb.append(INDENT).append(N_SEQS).append(EQUALS).append(al.getHeight());
1372     sb.append(SPACE).append(N_SITES).append(EQUALS)
1373             .append(String.valueOf(al.getWidth()));
1374     sb.append(newline);
1375
1376     /*
1377      * !Format Indel Identical Missing
1378      */
1379     sb.append(INDENT);
1380     sb.append(INDEL).append(EQUALS).append(al.getGapCharacter());
1381     String identity = (String) al.getProperty(PROP_IDENTITY);
1382     if (identity != null)
1383     {
1384       sb.append(SPACE).append(IDENTICAL).append(EQUALS).append(identity);
1385     }
1386     String missing = (String) al.getProperty(PROP_MISSING);
1387     if (missing != null)
1388     {
1389       sb.append(SPACE).append(MISSING).append(EQUALS).append(missing);
1390     }
1391     sb.append(SEMICOLON).append(newline);
1392
1393     return sb.toString();
1394   }
1395
1396   /**
1397    * Get the longest sequence id (to allow aligned printout).
1398    *
1399    * @param s
1400    * @return
1401    */
1402   protected static int getMaxIdLength(SequenceI[] s)
1403   {
1404     // TODO pull up for reuse
1405     int maxLength = 0;
1406     for (SequenceI seq : s)
1407     {
1408       int len = seq.getName().length();
1409       if (len > maxLength)
1410       {
1411         maxLength = len;
1412       }
1413     }
1414     return maxLength;
1415   }
1416
1417   /**
1418    * Get the longest sequence length
1419    *
1420    * @param s
1421    * @return
1422    */
1423   protected static int getMaxSequenceLength(SequenceI[] s)
1424   {
1425     // TODO pull up for reuse
1426     int maxLength = 0;
1427     for (SequenceI seq : s)
1428     {
1429       int len = seq.getLength();
1430       if (len > maxLength)
1431       {
1432         maxLength = len;
1433       }
1434     }
1435     return maxLength;
1436   }
1437
1438   /**
1439    * Print to string in noninterleaved format - all of each sequence in turn, in
1440    * blocks of 50 characters.
1441    *
1442    * @param s
1443    * @return
1444    */
1445   protected String printNonInterleaved(SequenceI[] s)
1446   {
1447     int maxSequenceLength = getMaxSequenceLength(s);
1448     // approx
1449     int numLines = maxSequenceLength / positionsPerLine + 2 + s.length;
1450
1451     /*
1452      * Roughly size a buffer to hold the whole output
1453      */
1454     StringBuilder sb = new StringBuilder(numLines * positionsPerLine);
1455
1456     int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
1457     int chunksPerLine = positionsPerLine / spaceEvery;
1458     for (SequenceI seq : s)
1459     {
1460       sb.append(newline);
1461       sb.append(HASHSIGN + seq.getName()).append(newline);
1462       int startPos = 0;
1463       while (startPos < seq.getLength())
1464       {
1465         boolean firstChunk = true;
1466         /*
1467          * print next line for this sequence
1468          */
1469         int lastPos = startPos + positionsPerLine; // exclusive
1470         for (int j = 0; j < chunksPerLine; j++)
1471         {
1472           char[] subSequence = seq.getSequence(startPos,
1473                   Math.min(lastPos, startPos + positionsPerLine));
1474           if (subSequence.length > 0)
1475           {
1476             if (!firstChunk)
1477             {
1478               sb.append(SPACE);
1479             }
1480             sb.append(subSequence);
1481             firstChunk = false;
1482           }
1483           startPos += subSequence.length;
1484         }
1485         sb.append(newline);
1486       }
1487     }
1488
1489     return new String(sb);
1490   }
1491
1492   /**
1493    * Flag this file as interleaved or not, based on data format. Throws an
1494    * exception if has previously been determined to be otherwise.
1495    *
1496    * @param isIt
1497    * @param dataLine
1498    * @throws IOException
1499    */
1500   protected void assertInterleaved(boolean isIt, String dataLine)
1501           throws FileFormatException
1502   {
1503     if (this.interleaved != null && isIt != this.interleaved.booleanValue())
1504     {
1505       throw new FileFormatException("Parse error: interleaved was " + !isIt
1506               + " but now seems to be " + isIt + ", at line: " + dataLine);
1507     }
1508     this.interleaved = new Boolean(isIt);
1509     setAlignmentProperty(PROP_INTERLEAVED, interleaved.toString());
1510   }
1511
1512   public boolean isInterleaved()
1513   {
1514     return this.interleaved == null ? false : this.interleaved
1515             .booleanValue();
1516   }
1517
1518   /**
1519    * Adds saved parsed values either as alignment properties, or (in some cases)
1520    * as specific member fields of the alignment
1521    */
1522   @Override
1523   public void addProperties(AlignmentI al)
1524   {
1525     super.addProperties(al);
1526     al.setGapCharacter(gapCharacter);
1527
1528     /*
1529      * warn if e.g. DataType=DNA but data is protein (or vice versa)
1530      */
1531     if (this.nucleotide != null && this.nucleotide != al.isNucleotide()) {
1532       System.err.println("Warning: " + this.title + " declared "
1533               + (nucleotide ? "" : " not ") + "nucleotide but it is"
1534               + (nucleotide ? " not" : ""));
1535     }
1536   }
1537
1538   /**
1539    * Print the given alignment in MEGA format. If the alignment was created by
1540    * parsing a MEGA file, it should have properties set (e.g. Title) which can
1541    * influence the output.
1542    */
1543   @Override
1544   public String print(AlignmentI al)
1545   {
1546     this.nucleotide = al.isNucleotide();
1547
1548     String lineLength = (String) al.getProperty(PROP_LINELENGTH);
1549     this.positionsPerLine = lineLength == null ? DEFAULT_LINE_LENGTH : Integer
1550             .parseInt(lineLength);
1551
1552     /*
1553      * round down to a multiple of 3 positions per line for nucleotide
1554      */
1555     if (nucleotide)
1556     {
1557       positionsPerLine = positionsPerLine - (positionsPerLine % 3);
1558     }
1559
1560     String interleave = (String) al.getProperty(PROP_INTERLEAVED);
1561     if (interleave != null)
1562     {
1563       this.interleaved = Boolean.valueOf(interleave);
1564     }
1565
1566     String headers = printHeaders(al);
1567     return headers + print(al.getSequencesArray());
1568   }
1569
1570   /**
1571    * Returns the number of sequence positions output per line
1572    *
1573    * @return
1574    */
1575   public int getPositionsPerLine()
1576   {
1577     return positionsPerLine;
1578   }
1579
1580   /**
1581    * Sets the number of sequence positions output per line. Note these will be
1582    * formatted in blocks of 3 (nucleotide) or 10 (peptide).
1583    *
1584    * @param p
1585    */
1586   public void setPositionsPerLine(int p)
1587   {
1588     this.positionsPerLine = p;
1589   }
1590 }