src/jalview/io/MegaFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.0b1)
   3  * Copyright (C) 2014 The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  10  *
  11  * Jalview is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty
  13  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  14  * PURPOSE.  See the GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  17  * The Jalview Authors are detailed in the 'AUTHORS' file.
  18  */
  19 package jalview.io;
  20
  21 import jalview.datamodel.AlignmentAnnotation;
  22 import jalview.datamodel.AlignmentI;
  23 import jalview.datamodel.Annotation;
  24 import jalview.datamodel.Sequence;
  25 import jalview.datamodel.SequenceFeature;
  26 import jalview.datamodel.SequenceI;
  27 import jalview.schemes.UserColourScheme;
  28 import jalview.util.Comparison;
  29
  30 import java.awt.Color;
  31 import java.io.IOException;
  32 import java.util.ArrayList;
  33 import java.util.HashMap;
  34 import java.util.Iterator;
  35 import java.util.LinkedHashMap;
  36 import java.util.List;
  37 import java.util.Map;
  38 import java.util.Map.Entry;
  39 import java.util.Set;
  40 import java.util.Vector;
  41
  42 /**
  43  * A parser for input or output of MEGA format files. <br>
  44  * <br>
  45  * Tamura K, Stecher G, Peterson D, Filipski A, and Kumar S (2013) MEGA6:
  46  * Molecular Evolutionary Genetics Analysis Version 6.0. Molecular Biology and
  47  * Evolution 30: 2725-2729. <br>
  48  * <br>
  49  *
  50  * MEGA file format is supported as described in
  51  * http://www.megasoftware.net/manual.pdf <br>
  52  * Limitations:
  53  * <ul>
  54  * <li>any comments (delimited by [ ]) are ignored and not preserved</li>
  55  * </ul>
  56  *
  57  * @see http://www.megasoftware.net/
  58  */
  59 public class MegaFile extends AlignFile
  60 {
  61   private static final String MEGA = "MEGA";
  62
  63   private static final String MEGA_ANNOTATION_LABEL = MEGA + " Label";
  64
  65   private static final String MEGA_ANNOTATION_GENE = MEGA + " Gene";
  66
  67   private static final String MEGA_ANNOTATION_DOMAIN = MEGA + " Domain";
  68
  69   private static final char UNDERSCORE = '_';
  70
  71   private static final String WHITESPACE = "\\s+";
  72
  73   private static final int DEFAULT_LINE_LENGTH = 60;
  74
  75   private static final String INDENT = "    ";
  76
  77   private static final String N_SITES = "NSites";
  78
  79   private static final String N_SEQS = "NSeqs";
  80
  81   private static final String MISSING = "Missing";
  82
  83   private static final String IDENTICAL = "Identical";
  84
  85   private static final String INDEL = "Indel";
  86
  87   private static final String CODETABLE = "CodeTable";
  88
  89   private static final String PROTEIN = "Protein";
  90
  91   private static final String NUCLEOTIDE = "Nucleotide";
  92
  93   private static final String DATATYPE = "DataType";
  94
  95   private static final char COMMENT_START = '[';
  96
  97   private static final char COMMENT_END = ']';
  98
  99   private static final String HASHSIGN = "#";
 100
 101   private static final String SEMICOLON = ";";
 102
 103   private static final String BANG = "!";
 104
 105   private static final String EQUALS = "=";
 106
 107   private static final String MEGA_ID = HASHSIGN + MEGA;
 108
 109   private static final String TITLE = "Title";
 110
 111   private static final String FORMAT = "Format";
 112
 113   private static final String DESCRIPTION = "Description";
 114
 115   private static final String GENE = "Gene";
 116
 117   private static final String DOMAIN = "Domain";
 118
 119   private static final String PROPERTY = "Property";
 120
 121   private static final String CODONSTART = "CodonStart";
 122
 123   private static final String DOMAINEND = "domainend";
 124
 125   private static final String LABEL = "Label";
 126
 127   /*
 128    * names of properties to save to the alignment (may affect eventual output
 129    * format)
 130    */
 131   static final String PROP_TITLE = "MEGA_TITLE";
 132
 133   static final String PROP_INTERLEAVED = "MEGA_INTERLEAVED";
 134
 135   static final String PROP_DESCRIPTION = "MEGA_DESCRIPTION";
 136
 137   static final String PROP_CODETABLE = "MEGA_CODETABLE";
 138
 139   static final String PROP_IDENTITY = "MEGA_IDENTITY";
 140
 141   static final String PROP_MISSING = "MEGA_MISSING";
 142
 143   static final String PROP_DATATYPE = "MEGA_DATATYPE";
 144
 145   // number of bases per line of file (value is inferred)
 146   static final String PROP_LINELENGTH = "MEGA_LINELENGTH";
 147
 148   // TODO: need a controlled name for Gene as a feature if we want to be able to
 149   // output the MEGA file with !Gene headers
 150   // WTF do we do if the sequences get realigned?
 151
 152   // initial size for sequence data buffer
 153   private static final int SEQBUFFERSIZE = 256;
 154
 155   private static final String SPACE = " ";
 156
 157   private static final String TAB = "\t";
 158
 159   private static final char DEFAULT_GAP = '-';
 160
 161   /*
 162    * number of sequence positions output per line
 163    */
 164   private int positionsPerLine;
 165
 166   private String title;
 167
 168   // gap character may be explicitly declared, default is -
 169   private char gapCharacter = DEFAULT_GAP;
 170
 171   // identity character if declared
 172   private char identityCharacter = 0;
 173
 174   // this can be True, False or null (meaning not asserted in file)
 175   private Boolean nucleotide;
 176
 177   // set once we have seen one block of interleaved data
 178   private boolean seenAllSequences = false;
 179
 180   // this can be True, False or null (meaning we don't know yet)
 181   private Boolean interleaved;
 182
 183   // write end of line positions as a comment
 184   private boolean writePositionNumbers = true;
 185
 186   // id of sequence being processed
 187   private String currentSequenceId;
 188
 189   /*
 190    * Temporary store of {sequenceId, positionData} while parsing interleaved
 191    * sequences; sequences are maintained in the order in which they are added
 192    * i.e. read in the file
 193    */
 194   Map<String, StringBuilder> seqData;
 195
 196   // number of residues read (so far) per sequence
 197   Map<String, Integer> residuesRead;
 198
 199   // current Gene if any we are parsing
 200   private String currentGene;
 201
 202   // start position in alignment (base 0) of current Gene
 203   private int currentGeneStartCol;
 204
 205   // start residue (base 1) per sequence of current Gene
 206   private Map<String, Integer> geneStart;
 207
 208   // current Domain if any we are parsing
 209   private String currentDomain;
 210
 211   // start position in alignment (base 0) of current Domain
 212   private int currentDomainStartCol;
 213
 214   // start residue (base 1) per sequence of current Domain
 215   private Map<String, Integer> domainStart;
 216
 217   // map of SequenceFeature's by sequence id
 218   private Map<String, List<SequenceFeature>> sequenceFeatures;
 219
 220   // each !Label line character becomes an Annotation (except underscores)
 221   List<Annotation> labelAnnotations;
 222
 223   // records any declared Gene positions (including null values)
 224   List<Annotation> geneAnnotations;
 225
 226   // records any declared Domain positions (including null values)
 227   List<Annotation> domainAnnotations;
 228
 229   public MegaFile()
 230   {
 231   }
 232
 233   public MegaFile(String inFile, String type) throws IOException
 234   {
 235     super(inFile, type);
 236   }
 237
 238   public MegaFile(FileParse source) throws IOException
 239   {
 240     super(source);
 241   }
 242
 243   /**
 244    * Parse the input stream.
 245    */
 246   @Override
 247   public void parse() throws IOException
 248   {
 249     gapCharacter = DEFAULT_GAP;
 250     sequenceFeatures = new HashMap<String, List<SequenceFeature>>();
 251     geneStart = new HashMap<String, Integer>();
 252     domainStart = new HashMap<String, Integer>();
 253     residuesRead = new HashMap<String, Integer>();
 254     labelAnnotations = new ArrayList<Annotation>();
 255     geneAnnotations = new ArrayList<Annotation>();
 256     domainAnnotations = new ArrayList<Annotation>();
 257     currentDomainStartCol = -1;
 258     currentGeneStartCol = -1;
 259
 260     /*
 261      * Read and process MEGA and Title/Format/Description headers if present.
 262      * Returns the first data line following the headers.
 263      */
 264     String dataLine = parseHeaderLines();
 265
 266     /*
 267      * order-preserving map to hold sequences by id as they are built up during
 268      * parsing
 269      */
 270     seqData = new LinkedHashMap<String, StringBuilder>();
 271
 272     /*
 273      * The id of the sequence being read (for non-interleaved)
 274      */
 275     currentSequenceId = "";
 276
 277     while (dataLine != null)
 278     {
 279       dataLine = dataLine.trim();
 280       if (dataLine.length() > 0)
 281       {
 282         dataLine = dataLine.replace(TAB, SPACE);
 283         String upperCased = dataLine.toUpperCase();
 284         if (upperCased.startsWith(BANG + GENE.toUpperCase())
 285                 || upperCased.startsWith(BANG + DOMAIN.toUpperCase()))
 286         {
 287           endDataBlock();
 288           parseGeneOrDomain(dataLine);
 289         }
 290         else if (upperCased.startsWith(BANG + LABEL.toUpperCase()))
 291         {
 292           parseLabel(dataLine);
 293           endDataBlock();
 294         }
 295         else
 296         {
 297           currentSequenceId = parseDataLine(dataLine);
 298         }
 299       }
 300       else if (!seqData.isEmpty())
 301       {
 302         /*
 303          * Blank line after processing some data...
 304          */
 305         endDataBlock();
 306       }
 307       dataLine = nextNonCommentLine();
 308     }
 309
 310     /*
 311      * close off any features currently being parsed
 312      */
 313     createFeature(GENE, currentGene, geneStart);
 314     createFeature(DOMAIN, currentDomain, domainStart);
 315
 316     extendAnnotation(geneAnnotations, currentGene, currentGeneStartCol);
 317     extendAnnotation(domainAnnotations, currentDomain,
 318             currentDomainStartCol);
 319
 320     // remember the (longest) line length read in, so we can output the same
 321     setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
 322
 323     deriveSequencesAndFeatures();
 324
 325     deriveAnnotations();
 326   }
 327
 328   /**
 329    * Create AlignmentAnnotation for Label, Gene and Domain (provided at least
 330    * one non-null annotation is present)
 331    */
 332   protected void deriveAnnotations()
 333   {
 334     deriveAnnotation(this.labelAnnotations, MEGA_ANNOTATION_LABEL);
 335     deriveAnnotation(this.geneAnnotations, MEGA_ANNOTATION_GENE);
 336     deriveAnnotation(this.domainAnnotations, MEGA_ANNOTATION_DOMAIN);
 337   }
 338
 339   /**
 340    * Create and ad an AlignmentAnnotation (provided at least one non-null
 341    * annotation is present)
 342    *
 343    * @param anns
 344    * @param label
 345    */
 346   protected void deriveAnnotation(List<Annotation> anns, String label)
 347   {
 348     if (anns.size() > 0 && hasNonNullEntry(anns))
 349     {
 350       Annotation[] annotationArray = anns.toArray(new Annotation[anns
 351               .size()]);
 352       AlignmentAnnotation aa = new AlignmentAnnotation(label, "",
 353               annotationArray);
 354       this.annotations.add(aa);
 355     }
 356   }
 357
 358   protected static boolean hasNonNullEntry(List<? extends Object> l)
 359   {
 360     for (Object o : l)
 361     {
 362       if (o != null)
 363       {
 364         return true;
 365       }
 366     }
 367     return false;
 368   }
 369
 370   /**
 371    * Parse a !Label line. This contains a single character per position (column)
 372    * of the alignment block above. An underscore character represents no label.
 373    * Labels are assembled into an AlignmentAnnotation object.
 374    *
 375    * @param dataLine
 376    * @return true if any non-null annotation was created
 377    * @throws FileFormatException
 378    */
 379   protected boolean parseLabel(String dataLine) throws FileFormatException
 380   {
 381     // strip off leading !Label and following spaces
 382     dataLine = dataLine.substring(LABEL.length() + 1).trim();
 383
 384     // remove internal spacing and any leading tab
 385     String labels = dataLine.replace(SPACE, "");
 386     if (labels.endsWith(SEMICOLON))
 387     {
 388       labels = labels.substring(0, labels.length() - 1);
 389     }
 390     else
 391     {
 392       System.err.println("Warning: '" + dataLine
 393               + "' should end with semi-colon");
 394     }
 395     boolean added = false;
 396     for (char c : labels.toCharArray())
 397     {
 398       if (c == UNDERSCORE)
 399       {
 400         this.labelAnnotations.add(null);
 401       }
 402       else
 403       {
 404         this.labelAnnotations.add(new Annotation(String.valueOf(c), "",
 405                 ' ', 0f));
 406         added = true;
 407       }
 408     }
 409     return added;
 410   }
 411
 412   /**
 413    * Post-processing after reading one block of interleaved data
 414    */
 415   protected void endDataBlock()
 416   {
 417     padAnnotations(labelAnnotations);
 418   }
 419
 420   /**
 421    * Append null annotations to keep the annotations list the same length as the
 422    * sequences. This ensures that when the list is converted to an array it is
 423    * correctly aligned with the alignment columns. It is needed when there are
 424    * gaps in declared 'annotations' in a MEGA file, such as lines with no !Label
 425    * statement, or regions between marked genes or domains.
 426    *
 427    * @param anns
 428    */
 429   protected void padAnnotations(List<Annotation> anns)
 430   {
 431     addNullAnnotations(anns, getAlignmentWidth());
 432   }
 433
 434   /**
 435    * Append null annotations for positions up to (and excluding) the given end
 436    * column (base 0)
 437    *
 438    * @param anns
 439    * @param upTo
 440    */
 441   protected void addNullAnnotations(List<Annotation> anns, int upTo)
 442   {
 443     int annotationsToAdd = upTo - anns.size();
 444     for (int i = 0; i < annotationsToAdd; i++)
 445     {
 446       anns.add(null);
 447     }
 448   }
 449
 450   /**
 451    * Parse a !Gene or !Domain command line. MEGA accepts
 452    * <ul>
 453    * <li>!Gene=name;</li>
 454    * <li>!Gene=name Property=Coding/Noncoding CodonStart=1/2/3;</li>
 455    * <li>!Gene=genename Domain=domainname Property= etc</li>
 456    * <li>!Domain=domainname Gene=genename Property= etc</li>
 457    * <li>!Domain=domainname Property= etc</li>
 458    * <li>!domain=domainname property=domainend</li>
 459    * </ul>
 460    * Properly, a Gene should be composed of Domain segments, but MEGA accepts
 461    * without. Note that keywords don't seem to be case sensitive.
 462    *
 463    * @param dataLine
 464    * @throws FileFormatException
 465    */
 466   protected void parseGeneOrDomain(String dataLine)
 467           throws FileFormatException
 468   {
 469     String domain = null;
 470     String gene = null;
 471     String property = null;
 472     String codonStart = null;
 473     String errorMsg = "Unrecognized format: " + dataLine;
 474
 475     if (!dataLine.startsWith(BANG) || !dataLine.endsWith(SEMICOLON))
 476     {
 477       throw new FileFormatException(errorMsg);
 478     }
 479     String trimmed = dataLine.substring(1, dataLine.length() - 1).trim();
 480     String[] tokens = trimmed.split(WHITESPACE);
 481     for (String token : tokens)
 482     {
 483       String[] keyValue = token.split("=");
 484       if (keyValue.length != 2)
 485       {
 486         throw new FileFormatException(errorMsg);
 487       }
 488       String key = keyValue[0];
 489       if (GENE.equalsIgnoreCase(key))
 490       {
 491         gene = keyValue[1];
 492       }
 493       else if (DOMAIN.equalsIgnoreCase(key))
 494       {
 495         domain = keyValue[1];
 496       }
 497       else if (PROPERTY.equalsIgnoreCase(key))
 498       {
 499         property = keyValue[1];
 500       }
 501       else if (CODONSTART.equalsIgnoreCase(key))
 502       {
 503         codonStart = keyValue[1];
 504       }
 505       else
 506       {
 507         System.err.println("Unrecognised token: '" + key + "; in "
 508                 + dataLine);
 509       }
 510     }
 511
 512     processGeneOrDomain(gene, domain, property, codonStart);
 513   }
 514
 515   /**
 516    * Process a statement containing one or both of Gene and Domain, and
 517    * optionally Property or CodonStart commands.
 518    *
 519    * @param gene
 520    *          the Gene name if specified, else null
 521    * @param domain
 522    *          the Domain name if specified, else null
 523    * @param property
 524    *          the Property value if specified, else null
 525    * @param codonStart
 526    *          the CodonStart value if specified, else null
 527    */
 528   protected void processGeneOrDomain(String gene, String domain,
 529           String property, String codonStart)
 530   {
 531     /*
 532      * the order of processing below ensures that we correctly handle a domain
 533      * in the context of an enclosing gene
 534      */
 535     processDomainEnd(domain, gene, property);
 536
 537     processGeneEnd(gene);
 538
 539     processGeneStart(gene);
 540
 541     processDomainStart(domain, property);
 542
 543     // TODO save codonStart if we plan to involve it in 'translate as cDNA'
 544   }
 545
 546   /**
 547    * If we have declared a domain, and it is not continuing, start a sequence
 548    * feature for it
 549    *
 550    * @param domain
 551    * @param property
 552    */
 553   protected void processDomainStart(String domain, String property)
 554   {
 555     if (DOMAINEND.equalsIgnoreCase(property))
 556     {
 557       currentDomain = null;
 558       return;
 559     }
 560
 561     if (domain != null && !domain.equals(currentDomain))
 562     {
 563       String verboseDomain = makeVerboseDomainName(domain, property);
 564       startSequenceFeature(domainStart);
 565       currentDomainStartCol = getAlignmentWidth();
 566
 567       currentDomain = verboseDomain;
 568     }
 569   }
 570
 571   /**
 572    * Returns the width of alignment parsed so far. Note we assume (as does MEGA)
 573    * that all sequences are the same length, so we can just take the length of
 574    * the first one.
 575    *
 576    * @return
 577    */
 578   protected int getAlignmentWidth()
 579   {
 580     return seqData.isEmpty() ? 0 : seqData.values()
 581             .iterator().next().length();
 582   }
 583
 584   /**
 585    * If we have declared a gene, and it is not continuing, start a sequence
 586    * feature for it
 587    *
 588    * @param gene
 589    */
 590   protected void processGeneStart(String gene)
 591   {
 592     if (gene != null && !gene.equals(currentGene))
 593     {
 594       startSequenceFeature(geneStart);
 595       currentGeneStartCol = getAlignmentWidth();
 596     }
 597     currentGene = gene;
 598   }
 599
 600   /**
 601    * If we have been processing a domain, and it is not being continued, then
 602    * make a sequence feature for the domain just ended. Criteria for the domain
 603    * not being continued are either an explicit new domain or gene name, or a
 604    * 'Property=domainend' statement
 605    *
 606    * @param domain
 607    * @param gene
 608    * @param property
 609    * @return true if a feature is created, else false
 610    */
 611   protected boolean processDomainEnd(String domain, String gene,
 612           String property)
 613   {
 614     boolean newGene = (gene != null && !gene.equals(currentGene));
 615
 616     String verboseDomain = makeVerboseDomainName(domain, property);
 617
 618     if (this.currentDomain != null)
 619     {
 620       boolean newDomain = !this.currentDomain.equals(verboseDomain);
 621       boolean domainEnded = DOMAINEND.equalsIgnoreCase(property);
 622       if (newDomain || newGene || domainEnded)
 623       {
 624         createFeature(DOMAIN, currentDomain, domainStart);
 625         // and/or... create annnotations for domain
 626         extendAnnotation(domainAnnotations, currentDomain,
 627                 currentDomainStartCol);
 628         currentDomain = null;
 629         currentDomainStartCol = -1;
 630         return true;
 631       }
 632     }
 633     return false;
 634   }
 635
 636   /**
 637    * If we have been processing a gene, and it is not being continued, then make
 638    * a sequence feature for the gene just ended
 639    *
 640    * @param gene
 641    * @return true if a feature is created, else false
 642    */
 643   protected boolean processGeneEnd(String gene)
 644   {
 645     boolean created = false;
 646     /*
 647      * If we were processing a gene and now have either another, or none, create
 648      * a sequence feature for that gene
 649      */
 650     if (this.currentGene != null && !this.currentGene.equals(gene))
 651     {
 652       createFeature(GENE, currentGene, geneStart);
 653       // and/or... add annotations for Gene
 654       extendAnnotation(geneAnnotations, currentGene, currentGeneStartCol);
 655       currentGene = null;
 656       currentGeneStartCol = -1;
 657       created = true;
 658     }
 659
 660     return created;
 661   }
 662
 663   /**
 664    * Helper method to add Annotation elements, with the given description and
 665    * starting at the given start column, up to the end of the sequence length
 666    * parsed so far. Null elements are inserted for any skipped columns since the
 667    * last annotation (if any), i.e. positions with no annotation of this type.
 668    *
 669    * @param anns
 670    * @param description
 671    * @param startColumn
 672    *          the start column of the annotations to add, or -1 if nothing to
 673    *          add
 674    */
 675   protected void extendAnnotation(List<Annotation> anns,
 676           String description, int startColumn)
 677   {
 678     int alignmentWidth = getAlignmentWidth();
 679     addNullAnnotations(anns, startColumn == -1 ? alignmentWidth
 680             : startColumn);
 681
 682     int numberToAdd = alignmentWidth - anns.size();
 683     if (numberToAdd > 0)
 684     {
 685       Color col = description == null ? Color.black : UserColourScheme
 686               .createColourFromName(description);
 687       for (int i = 0; i < numberToAdd; i++)
 688       {
 689         anns.add(new Annotation("X", description, ' ', 0f, col));
 690       }
 691     }
 692   }
 693
 694   /**
 695    * Makes an expanded descriptive name for Domain if possible e.g.
 696    * "Intron1 (Adh Coding)". Currently incorporates the current gene name (if
 697    * any) and the Coding/Noncoding property value (if given).
 698    *
 699    * @param domain
 700    * @param property
 701    * @return
 702    */
 703   protected String makeVerboseDomainName(String domain, String property)
 704   {
 705     String verboseDomain = domain;
 706     if (domain != null)
 707     {
 708       String coding = "";
 709       if ("Exon".equalsIgnoreCase(property)
 710               || "Coding".equalsIgnoreCase(property))
 711       {
 712         coding = " Coding";
 713       }
 714       else if ("Intron".equalsIgnoreCase(property)
 715               || "Noncoding".equalsIgnoreCase(property))
 716       {
 717         coding = " Noncoding";
 718       }
 719       verboseDomain = domain
 720               + (currentGene == null ? "" : " (" + currentGene + coding
 721                       + ")");
 722     }
 723     return verboseDomain;
 724   }
 725
 726   /**
 727    * Start processing a new feature
 728    *
 729    * @param startPositions
 730    */
 731   protected void startSequenceFeature(Map<String, Integer> startPositions)
 732   {
 733     /*
 734      * If the feature declaration precedes all sequences, we will know in
 735      * createFeature that it started with residue 1; otherwise note now where it
 736      * starts in each sequence
 737      */
 738     if (!residuesRead.isEmpty())
 739     {
 740       for (Entry<String, Integer> entry : residuesRead.entrySet())
 741       {
 742         String seqId = entry.getKey();
 743         Integer nextResidue = entry.getValue() + 1;
 744         startPositions.put(seqId, nextResidue);
 745       }
 746     }
 747   }
 748
 749   /**
 750    * Add a SequenceFeature to each sequence, using the given start/end values
 751    * per sequence
 752    *
 753    * @param featureType
 754    * @param featureValue
 755    * @param featureStartResidues
 756    */
 757   protected void createFeature(String featureType, String featureValue,
 758           Map<String, Integer> featureStartResidues)
 759   {
 760     if (featureValue == null)
 761     {
 762       return;
 763     }
 764
 765     Iterator<String> seqids = this.seqData.keySet().iterator();
 766     while (seqids.hasNext())
 767     {
 768       String seqid = seqids.next();
 769       Integer startAt = featureStartResidues.get(seqid);
 770       int sfstart = startAt == null ? 1 : startAt.intValue();
 771       int sfend = residuesRead.get(seqid);
 772       if (sfend >= sfstart)
 773       {
 774         /*
 775          * don't add feature if entirely gapped in the sequence
 776          */
 777         // NB use 'colour feature by label' to show up distinct instances of
 778         // feature type 'Gene' or 'Domain' on the alignment
 779         SequenceFeature sf = new SequenceFeature(featureType, featureValue,
 780                 sfstart, sfend, 0f, null);
 781         sequenceFeatures.get(seqid).add(sf);
 782       }
 783     }
 784   }
 785
 786   /**
 787    * Returns the next line that is not a comment, or null at end of file.
 788    * Comments in MEGA are within [ ] brackets, and may be nested.
 789    *
 790    * @return
 791    * @throws FileFormatException
 792    */
 793   protected String nextNonCommentLine() throws FileFormatException
 794   {
 795     return nextNonCommentLine(0);
 796   }
 797
 798   /**
 799    * Returns the next non-comment line (or part line), or null at end of file.
 800    * Comments in MEGA are within [ ] brackets, and may be nested. They may occur
 801    * anywhere within a line (for example at the end with position numbers); this
 802    * method returns the line with any comments removed.
 803    *
 804    * @param depth
 805    *          current depth of nesting of comments while parsing
 806    * @return
 807    * @throws FileFormatException
 808    */
 809   protected String nextNonCommentLine(final int depth)
 810           throws FileFormatException
 811   {
 812     String data = null;
 813     try
 814     {
 815       data = nextLine();
 816     } catch (IOException e)
 817     {
 818       throw new FileFormatException(e.getMessage());
 819     }
 820     if (data == null)
 821     {
 822       if (depth > 0)
 823       {
 824         System.err.println("Warning: unterminated comment in data file");
 825       }
 826       return data;
 827     }
 828
 829     /*
 830      * If we are in a (possibly nested) comment after parsing this line, keep
 831      * reading recursively until the comment has unwound
 832      */
 833     int newDepth = commentDepth(data, depth);
 834     if (newDepth > 0)
 835     {
 836       return nextNonCommentLine(newDepth);
 837     }
 838     else
 839     {
 840       /*
 841        * not in a comment by end of this line; return what is left
 842        */
 843       String nonCommentPart = getNonCommentContent(data, depth);
 844       return nonCommentPart;
 845     }
 846   }
 847
 848   /**
 849    * Returns what is left of the input data after removing any comments, whether
 850    * 'in progress' from preceding lines, or embedded in the current line
 851    *
 852    * @param data
 853    *          input data
 854    * @param depth
 855    *          nested depth of comments pending termination
 856    * @return
 857    * @throws FileFormatException
 858    */
 859   protected static String getNonCommentContent(String data, int depth)
 860           throws FileFormatException
 861   {
 862     int len = data.length();
 863     StringBuilder result = new StringBuilder(len);
 864     for (int i = 0; i < len; i++)
 865     {
 866       char c = data.charAt(i);
 867       switch (c)
 868       {
 869       case COMMENT_START:
 870         depth++;
 871         break;
 872
 873       case COMMENT_END:
 874         if (depth > 0)
 875         {
 876           depth--;
 877         }
 878         else
 879         {
 880           result.append(c);
 881         }
 882         break;
 883
 884       default:
 885         if (depth == 0)
 886         {
 887           result.append(c);
 888         }
 889       }
 890     }
 891     return result.toString();
 892   }
 893
 894   /**
 895    * Calculates new depth of comment after parsing an input line i.e. the excess
 896    * of opening '[' over closing ']' characters. Any excess ']' are ignored (not
 897    * treated as comment delimiters).
 898    *
 899    * @param data
 900    *          input line
 901    * @param depth
 902    *          current comment nested depth before parsing the line
 903    * @return new depth after parsing the line
 904    */
 905   protected static int commentDepth(CharSequence data, int depth)
 906   {
 907     int newDepth = depth;
 908     int len = data.length();
 909     for (int i = 0; i < len; i++)
 910     {
 911       char c = data.charAt(i);
 912       if (c == COMMENT_START)
 913       {
 914         newDepth++;
 915       }
 916       else if (c == COMMENT_END && newDepth > 0)
 917       {
 918         newDepth--;
 919       }
 920     }
 921     return newDepth;
 922   }
 923
 924   /**
 925    * Convert the parsed sequence strings to objects and store them in the model.
 926    */
 927   protected void deriveSequencesAndFeatures()
 928   {
 929     Set<Entry<String, StringBuilder>> datasets = seqData.entrySet();
 930
 931     for (Entry<String, StringBuilder> dataset : datasets)
 932     {
 933       String sequenceId = dataset.getKey();
 934       StringBuilder characters = dataset.getValue();
 935       SequenceI s = new Sequence(sequenceId, new String(characters));
 936       this.seqs.addElement(s);
 937
 938       /*
 939        * and add any derived sequence features to the sequence
 940        */
 941       for (SequenceFeature sf : sequenceFeatures.get(sequenceId))
 942       {
 943         s.addSequenceFeature(sf);
 944       }
 945     }
 946   }
 947
 948   /**
 949    * Process one line of sequence data. If it has no sequence identifier, append
 950    * to the current id's sequence. Else parse out the sequence id and append the
 951    * data (if any) to that id's sequence. Returns the sequence id (implicit or
 952    * explicit) for this line.
 953    *
 954    * @param dataLine
 955    * @return
 956    * @throws FileFormatException
 957    */
 958   protected String parseDataLine(String dataLine)
 959           throws FileFormatException
 960   {
 961     String seqId = getSequenceId(dataLine);
 962     if (seqId == null)
 963     {
 964       /*
 965        * Just character data
 966        */
 967       parseNoninterleavedDataLine(dataLine);
 968       return currentSequenceId;
 969     }
 970     else if ((HASHSIGN + seqId).trim().equals(dataLine.trim()))
 971     {
 972       /*
 973        * Sequence id only - header line for noninterleaved data
 974        */
 975       return seqId;
 976     }
 977     else
 978     {
 979       /*
 980        * Sequence id followed by data
 981        */
 982       parseInterleavedDataLine(dataLine, seqId);
 983       return seqId;
 984     }
 985   }
 986
 987   /**
 988    * Add a line of sequence data to the buffer for the given sequence id. Start
 989    * a new one if we haven't seen it before.
 990    *
 991    * @param dataLine
 992    * @throws FileFormatException
 993    */
 994   protected void parseNoninterleavedDataLine(String dataLine)
 995           throws FileFormatException
 996   {
 997     if (currentSequenceId == null)
 998     {
 999       /*
1000        * Oops. Data but no sequence id context.
1001        */
1002       throw new FileFormatException("No sequence id context at: "
1003               + dataLine);
1004     }
1005
1006     assertInterleaved(false, dataLine);
1007
1008     dataLine = addSequenceData(currentSequenceId, dataLine);
1009
1010     setPositionsPerLine(Math.max(positionsPerLine, dataLine.length()));
1011   }
1012
1013   /**
1014    * Get the sequence data for this sequence id, starting a new one if
1015    * necessary.
1016    *
1017    * @param currentId
1018    * @return
1019    */
1020   protected StringBuilder getSequenceDataBuffer(String currentId)
1021   {
1022     StringBuilder sb = seqData.get(currentId);
1023     if (sb == null)
1024     {
1025       // first data met for this sequence id, start a new buffer
1026       sb = new StringBuilder(SEQBUFFERSIZE);
1027       seqData.put(currentId, sb);
1028
1029       // and a placeholder for any SequenceFeature found
1030       sequenceFeatures.put(currentId, new ArrayList<SequenceFeature>());
1031     }
1032     else
1033     {
1034       /*
1035        * we are appending to a previously seen sequence; flag that we have seen
1036        * all sequences
1037        */
1038       this.seenAllSequences = true;
1039     }
1040     return sb;
1041   }
1042
1043   /**
1044    * Parse one line of interleaved data e.g.
1045    *
1046    * <pre>
1047    * #TheSeqId CGATCGCATGCA
1048    * </pre>
1049    *
1050    * @param dataLine
1051    * @param seqId
1052    * @throws FileFormatException
1053    */
1054   protected void parseInterleavedDataLine(String dataLine, String seqId)
1055           throws FileFormatException
1056   {
1057     /*
1058      * New sequence found in second or later data block - error.
1059      */
1060     if (this.seenAllSequences && !seqData.containsKey(seqId))
1061     {
1062       throw new FileFormatException(
1063               "Parse error: misplaced new sequence starting at " + dataLine);
1064     }
1065
1066     String data = dataLine.substring(seqId.length() + 1).trim();
1067
1068     /*
1069      * Do nothing if this line is _only_ a sequence id with no data following.
1070      */
1071     if (data != null && data.length() > 0)
1072     {
1073       data = addSequenceData(seqId, data);
1074       setPositionsPerLine(Math.max(positionsPerLine, data.length()));
1075       assertInterleaved(true, dataLine);
1076     }
1077   }
1078
1079   /**
1080    * Remove spaces, and replace identity symbol, before appending the sequence
1081    * data to the buffer for the sequence id. Returns the reformatted added data.
1082    * Also updates a count of residues read for the sequence.
1083    *
1084    * @param seqId
1085    * @param data
1086    * @return
1087    */
1088   protected String addSequenceData(String seqId, String data)
1089   {
1090     StringBuilder sb = getSequenceDataBuffer(seqId);
1091     int len = sb.length();
1092     String formatted = data.replace(SPACE, "");
1093
1094     /*
1095      * If sequence contains '.' or other identity symbol; replace these with the
1096      * same position from the first (reference) sequence
1097      */
1098     int nonGapped = 0;
1099     StringBuilder referenceSequence = seqData.values().iterator().next();
1100     StringBuilder sb1 = new StringBuilder(formatted.length());
1101     for (int i = 0; i < formatted.length(); i++)
1102     {
1103       char nextChar = formatted.charAt(i);
1104       if (nextChar == gapCharacter)
1105       {
1106         sb1.append(Comparison.isGap(nextChar) ? nextChar : DEFAULT_GAP);
1107       }
1108       else
1109       {
1110         nonGapped++;
1111         if (nextChar == identityCharacter
1112                 && len + i < referenceSequence.length())
1113         {
1114           sb1.append(referenceSequence.charAt(len + i));
1115         }
1116         else
1117         {
1118           sb1.append(nextChar);
1119         }
1120       }
1121     }
1122     formatted = sb1.toString();
1123
1124     data = formatted;
1125     sb.append(data);
1126
1127     /*
1128      * increment residue count for the sequence
1129      */
1130     if (nonGapped > 0)
1131     {
1132       Integer residueCount = residuesRead.get(seqId);
1133       residuesRead.put(seqId, nonGapped
1134               + (residueCount == null ? 0 : residueCount));
1135     }
1136
1137     return data;
1138   }
1139
1140   /**
1141    * If the line begins with (e.g.) "#abcde " then returns "abcde" as the
1142    * identifier. Else returns null.
1143    *
1144    * @param dataLine
1145    * @return
1146    */
1147   public static String getSequenceId(String dataLine)
1148   {
1149     // TODO refactor to a StringUtils type class
1150     if (dataLine != null)
1151     {
1152       if (dataLine.startsWith(HASHSIGN))
1153       {
1154         int spacePos = dataLine.indexOf(" ");
1155         return (spacePos == -1 ? dataLine.substring(1) : dataLine
1156                 .substring(1, spacePos));
1157       }
1158     }
1159     return null;
1160   }
1161
1162   /**
1163    * Read the #MEGA and Title/Format/Description header lines (if present).
1164    *
1165    * Save as alignment properties in case useful.
1166    *
1167    * @return the next non-blank line following the header lines.
1168    * @throws FileFormatException
1169    */
1170   protected String parseHeaderLines() throws FileFormatException
1171   {
1172     String inputLine = null;
1173     while ((inputLine = nextNonCommentLine()) != null)
1174     {
1175       inputLine = inputLine.trim();
1176
1177       /*
1178        * skip blank lines
1179        */
1180       if (inputLine.length() == 0)
1181       {
1182         continue;
1183       }
1184
1185       if (inputLine.toUpperCase().startsWith(MEGA_ID))
1186       {
1187         continue;
1188       }
1189
1190       if (isTitle(inputLine))
1191       {
1192         this.title = getValue(inputLine);
1193         setAlignmentProperty(PROP_TITLE, title);
1194       }
1195       else if (inputLine.startsWith(BANG + DESCRIPTION))
1196       {
1197         parseDescription(inputLine);
1198       }
1199
1200       else if (inputLine.startsWith(BANG + FORMAT))
1201       {
1202         parseFormat(inputLine);
1203       }
1204       else if (!inputLine.toUpperCase().startsWith(MEGA_ID))
1205       {
1206
1207         /*
1208          * Return the first 'data line' i.e. one that is not blank, #MEGA or
1209          * TITLE:
1210          */
1211         break;
1212       }
1213     }
1214     return inputLine;
1215   }
1216
1217   /**
1218    * Parse a !Format statement. This may be multiline, and is ended by a
1219    * semicolon.
1220    *
1221    * @param inputLine
1222    * @throws FileFormatException
1223    */
1224   protected void parseFormat(String inputLine) throws FileFormatException
1225   {
1226     while (inputLine != null)
1227     {
1228       parseFormatLine(inputLine);
1229       if (inputLine.endsWith(SEMICOLON))
1230       {
1231         break;
1232       }
1233       inputLine = nextNonCommentLine();
1234     }
1235   }
1236
1237   /**
1238    * Parse one line of a !Format statement. This may contain one or more
1239    * keyword=value pairs.
1240    *
1241    * @param inputLine
1242    * @throws FileFormatException
1243    */
1244   protected void parseFormatLine(String inputLine)
1245           throws FileFormatException
1246   {
1247     if (inputLine.startsWith(BANG + FORMAT))
1248     {
1249       inputLine = inputLine.substring((BANG + FORMAT).length());
1250     }
1251     if (inputLine.endsWith(SEMICOLON))
1252     {
1253       inputLine = inputLine.substring(0, inputLine.length() - 1);
1254     }
1255     if (inputLine.length() == 0)
1256     {
1257       return;
1258     }
1259     String[] tokens = inputLine.trim().split(WHITESPACE);
1260     for (String token : tokens)
1261     {
1262       parseFormatKeyword(token);
1263     }
1264   }
1265
1266   /**
1267    * Parse a Keyword=Value token. Possible keywords are
1268    * <ul>
1269    * <li>DataType= DNA, RNA, Nucleotide, Protein</li>
1270    * <li>DataFormat= Interleaved, ?</li>
1271    * <li>NSeqs= number of sequences (synonym NTaxa)</li>
1272    * <li>NSites= number of bases / residues</li>
1273    * <li>Property= Exon (or Coding), Intron (or Noncoding), End (of domain)</li>
1274    * <li>Indel= gap character</li>
1275    * <li>Identical= identity character (to first sequence) (synonym MatchChar)</li>
1276    * <li>Missing= missing data character</li>
1277    * <li>CodeTable= Standard, other (MEGA supports various)</li>
1278    * </ul>
1279    *
1280    * @param token
1281    * @throws FileFormatException
1282    *           if an unrecognised keyword or value is encountered
1283    */
1284   protected void parseFormatKeyword(String token)
1285           throws FileFormatException
1286   {
1287     String msg = "Unrecognised Format command: " + token;
1288     String[] bits = token.split(EQUALS);
1289     if (bits.length != 2)
1290     {
1291       throw new FileFormatException(msg);
1292     }
1293     String keyword = bits[0];
1294     String value = bits[1];
1295
1296     /*
1297      * Jalview will work out whether nucleotide or not anyway
1298      */
1299     if (keyword.equalsIgnoreCase(DATATYPE))
1300     {
1301       if (value.equalsIgnoreCase("DNA") || value.equalsIgnoreCase("RNA")
1302               || value.equalsIgnoreCase("Nucleotide"))
1303       {
1304         this.nucleotide = true;
1305         // alignment computes whether or not it is nucleotide when created
1306       }
1307       else if (value.equalsIgnoreCase(PROTEIN))
1308       {
1309         this.nucleotide = false;
1310       }
1311       else
1312       {
1313         throw new FileFormatException(msg);
1314       }
1315       setAlignmentProperty(PROP_DATATYPE, value);
1316     }
1317
1318     /*
1319      * accept non-Standard code table but save in case we want to disable
1320      * 'translate as cDNA'
1321      */
1322     else if (keyword.equalsIgnoreCase(CODETABLE))
1323     {
1324       setAlignmentProperty(PROP_CODETABLE, value);
1325     }
1326
1327     /*
1328      * save gap char to set later on alignment once created
1329      */
1330     else if (keyword.equalsIgnoreCase(INDEL))
1331     {
1332       this.gapCharacter = value.charAt(0);
1333       if (!Comparison.isGap(gapCharacter))
1334       {
1335         System.err.println("Jalview doesn't support '" + gapCharacter
1336                 + "' for gaps, will be converted to '" + DEFAULT_GAP + "'");
1337       }
1338     }
1339
1340     else if (keyword.equalsIgnoreCase(IDENTICAL)
1341             || keyword.equalsIgnoreCase("MatchChar"))
1342     {
1343       setAlignmentProperty(PROP_IDENTITY, value);
1344       this.identityCharacter = value.charAt(0);
1345       if (!".".equals(value))
1346       {
1347         System.err.println("Warning: " + token
1348                 + " not supported, Jalview uses '.' for identity");
1349       }
1350     }
1351
1352     else if (keyword.equalsIgnoreCase(MISSING))
1353     {
1354       setAlignmentProperty(PROP_MISSING, value);
1355       System.err.println("Warning: " + token + " not supported");
1356     }
1357
1358     else if (keyword.equalsIgnoreCase(PROPERTY))
1359     {
1360       // TODO: can Property appear in a Format command?
1361       // suspect this is a mistake in the manual
1362     }
1363
1364     else if (!keyword.equalsIgnoreCase(N_SEQS)
1365             && !keyword.equalsIgnoreCase("NTaxa")
1366             && !keyword.equalsIgnoreCase(N_SITES))
1367     {
1368       System.err.println("Warning: " + msg);
1369     }
1370   }
1371
1372   /**
1373    * Returns the trimmed data on the line following either whitespace or '=',
1374    * with any trailing semi-colon removed<br>
1375    * So
1376    * <ul>
1377    * <li>Hello World</li>
1378    * <li>!Hello: \tWorld;</li>
1379    * <li>!Hello=World</li>
1380    * <ul>
1381    * should all return "World"
1382    *
1383    * @param inputLine
1384    * @return
1385    */
1386   protected static String getValue(String inputLine)
1387   {
1388     if (inputLine == null)
1389     {
1390       return null;
1391     }
1392     String value = null;
1393     String s = inputLine.replaceAll("\t", " ").trim();
1394
1395     /*
1396      * KEYWORD = VALUE should return VALUE
1397      */
1398     int equalsPos = s.indexOf("=");
1399     if (equalsPos >= 0)
1400     {
1401       value = s.substring(equalsPos + 1);
1402     }
1403     else
1404     {
1405       int spacePos = s.indexOf(' ');
1406       value = spacePos == -1 ? "" : s.substring(spacePos + 1);
1407     }
1408     value = value.trim();
1409     if (value.endsWith(SEMICOLON))
1410     {
1411       value = value.substring(0, value.length() - 1).trim();
1412     }
1413     return value;
1414   }
1415
1416   /**
1417    * Returns true if the input line starts with "TITLE" or "!TITLE" (not case
1418    * sensitive). The latter is the official format, some older data file
1419    * examples have it without the !.
1420    *
1421    * @param inputLine
1422    * @return
1423    */
1424   protected static boolean isTitle(String inputLine)
1425   {
1426     if (inputLine == null)
1427     {
1428       return false;
1429     }
1430     String upper = inputLine.toUpperCase();
1431     return (upper.startsWith(TITLE.toUpperCase()) || upper.startsWith(BANG
1432             + TITLE.toUpperCase()));
1433   }
1434
1435   /**
1436    * Reads lines until terminated by semicolon, appending each to the
1437    * Description property value.
1438    *
1439    * @throws FileFormatException
1440    */
1441   protected void parseDescription(String firstDescriptionLine)
1442           throws FileFormatException
1443   {
1444     StringBuilder desc = new StringBuilder(256);
1445     desc.append(getValue(firstDescriptionLine));
1446     if (!firstDescriptionLine.endsWith(SEMICOLON))
1447     {
1448       String line = nextNonCommentLine();
1449       while (line != null)
1450       {
1451         if (line.endsWith(SEMICOLON))
1452         {
1453           desc.append(line.substring(0, line.length() - 1));
1454           break;
1455         }
1456         else if (line.length() > 0)
1457         {
1458           desc.append(line).append(newline);
1459         }
1460         line = nextNonCommentLine();
1461       }
1462     }
1463     setAlignmentProperty(PROP_DESCRIPTION, desc.toString());
1464   }
1465
1466   /**
1467    * Returns the alignment sequences in Mega format.
1468    */
1469   @Override
1470   public String print()
1471   {
1472     return MEGA_ID + newline + print(getSeqsAsArray());
1473   }
1474
1475   /**
1476    * Write out the alignment sequences in Mega format - interleaved unless
1477    * explicitly noninterleaved.
1478    */
1479   protected String print(SequenceI[] s)
1480   {
1481     String result;
1482     if (this.interleaved != null && !this.interleaved)
1483     {
1484       result = printNonInterleaved(s);
1485     }
1486     else
1487     {
1488       result = printInterleaved(s);
1489     }
1490     return result;
1491   }
1492
1493   /**
1494    * Print to string in Interleaved format - blocks of next N characters of each
1495    * sequence in turn.
1496    *
1497    * @param s
1498    */
1499   protected String printInterleaved(SequenceI[] s)
1500   {
1501     int maxIdLength = getMaxIdLength(s);
1502     int maxSequenceLength = getMaxSequenceLength(s);
1503
1504     int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
1505     int chunksPerLine = (positionsPerLine + spaceEvery - 1) / spaceEvery;
1506
1507     /*
1508      * Roughly size a buffer to hold the whole output
1509      */
1510     int numLines = maxSequenceLength / positionsPerLine + 3; // approx
1511     StringBuilder sb = new StringBuilder(numLines
1512             * (maxIdLength + positionsPerLine + chunksPerLine + 10));
1513
1514     /*
1515      * Output as: #Seqid CGT AGC ACT ... or blocks of 10 for peptide
1516      */
1517     AlignmentAnnotation geneAnnotation = findAnnotation(MEGA_ANNOTATION_GENE);
1518     AlignmentAnnotation domainAnnotation = findAnnotation(MEGA_ANNOTATION_DOMAIN);
1519     currentGene = null;
1520     currentDomain = null;
1521     int from = 0;
1522
1523     while (from < maxSequenceLength)
1524     {
1525       printGeneOrDomainHeader(sb, from, geneAnnotation, domainAnnotation);
1526       int maxCol = from + positionsPerLine; // exclusive
1527       for (int col = from; col <= maxCol; col++)
1528       {
1529         if (geneOrDomainChanged(col, geneAnnotation, domainAnnotation)
1530                 || col == maxCol)
1531         {
1532           /*
1533            * print a block of sequences up to [col-1]
1534            */
1535           sb.append(newline);
1536           boolean first = true;
1537           int advancedBy = 0;
1538           for (SequenceI seq : s)
1539           {
1540             int seqFrom = from;
1541             String seqId = String.format("#%-" + maxIdLength + "s",
1542                     seq.getName());
1543
1544             /*
1545              * output next line for this sequence
1546              */
1547             sb.append(seqId);
1548             for (int j = 0; j < chunksPerLine; j++)
1549             {
1550               char[] subSequence = seq.getSequence(seqFrom,
1551                       Math.min(col, seqFrom + spaceEvery));
1552               if (subSequence.length > 0)
1553               {
1554                 sb.append(SPACE).append(subSequence);
1555               }
1556               seqFrom += subSequence.length;
1557               if (first)
1558               {
1559                 // all sequences should be the same length in MEGA
1560                 advancedBy += subSequence.length;
1561               }
1562             }
1563             // write last position as a comment
1564             if (writePositionNumbers)
1565             {
1566               sb.append(SPACE).append(COMMENT_START)
1567                       .append(String.valueOf(from + advancedBy))
1568                       .append(COMMENT_END);
1569             }
1570             sb.append(newline);
1571             first = false;
1572           }
1573           sb.append(printLabel(from, advancedBy, maxIdLength));
1574           from += advancedBy;
1575           break;
1576         }
1577       }
1578     }
1579
1580     return new String(sb);
1581   }
1582
1583   /**
1584    * Returns true if we detect a change of gene or domain at the given column
1585    * position. Currently done by inspecting any "MEGA Gene" or "MEGA Domain"
1586    * annotation for the column.
1587    *
1588    * @param column
1589    * @param geneAnnotation
1590    * @param domainAnnotation
1591    * @return
1592    */
1593   protected boolean geneOrDomainChanged(int column,
1594           AlignmentAnnotation geneAnnotation,
1595           AlignmentAnnotation domainAnnotation)
1596   {
1597     String gene = getGeneFromAnnotation(column, geneAnnotation);
1598     String domain = getDomainFromAnnotation(column, domainAnnotation);
1599     boolean domainEnd = domain != null
1600             && domain.toUpperCase().indexOf(DOMAINEND.toUpperCase()) != -1;
1601
1602     if ((domainEnd || gene == null) && currentGene != null)
1603     {
1604       return true;
1605     }
1606     else if (gene != null && !gene.equals(currentGene))
1607     {
1608       return true;
1609     }
1610     // todo: Domain
1611     return false;
1612   }
1613
1614   /**
1615    * Extracts the name of a domain from MEGA Domain annotation at the given
1616    * column position, if any
1617    *
1618    * @param column
1619    * @param domainAnnotation
1620    * @return
1621    */
1622   protected static String getDomainFromAnnotation(int column,
1623           AlignmentAnnotation domainAnnotation)
1624   {
1625     Annotation domainAnn = domainAnnotation == null ? null
1626             : (column >= domainAnnotation.annotations.length ? null
1627                     : domainAnnotation.annotations[column]);
1628     String domain = domainAnn == null ? null : domainAnn.description;
1629     if (domain != null && domain.indexOf("(") > 0)
1630     {
1631       domain = domain.substring(0, domain.indexOf("(")).trim();
1632     }
1633     return domain;
1634   }
1635
1636   /**
1637    * Extracts the name of a gene from MEGA Gene annotation at the given column
1638    * position, if any
1639    *
1640    * @param column
1641    * @param geneAnnotation
1642    * @return
1643    */
1644   protected static String getGeneFromAnnotation(int column,
1645           AlignmentAnnotation geneAnnotation)
1646   {
1647     Annotation geneAnn = geneAnnotation == null ? null
1648             : (column >= geneAnnotation.annotations.length ? null
1649                     : geneAnnotation.annotations[column]);
1650     String gene = geneAnn == null ? null : geneAnn.description;
1651     return gene;
1652   }
1653
1654   /**
1655    * Appends a !Domain header on change of Domain (or Domain/Gene) annotation.
1656    * If it changes to a null value, appends a !Domain Property=domainend;
1657    * statement
1658    *
1659    * @param sb
1660    *          buffer to append to
1661    * @param column
1662    *          of alignment being output
1663    * @param geneAnnotation
1664    *          "MEGA Gene" annotations
1665    * @param domainAnnotation
1666    *          "MEGA Domain" annotations
1667    */
1668   protected void printGeneOrDomainHeader(StringBuilder sb, int column,
1669           AlignmentAnnotation geneAnnotation,
1670           AlignmentAnnotation domainAnnotation)
1671   {
1672     String gene = getGeneFromAnnotation(column, geneAnnotation);
1673     String domain = getDomainFromAnnotation(column, domainAnnotation);
1674     String property = getPropertyFromAnnotation(column, domainAnnotation);
1675
1676     if ((gene == null && currentGene != null)
1677             || (gene == null && domain == null && currentDomain != null))
1678     {
1679       /*
1680        * end of Gene or Domain annotation
1681        */
1682       sb.append(newline).append(BANG).append(DOMAIN).append(EQUALS)
1683               .append(currentDomain).append(SPACE).append(PROPERTY)
1684               .append(EQUALS).append(DOMAINEND).append(SEMICOLON);
1685     }
1686     else if (gene != null && !gene.equals(currentGene))
1687     {
1688       /*
1689        * start of a new Gene; output as "!Domain Gene=..." if we have domain
1690        * annotation
1691        */
1692       if (domain != null)
1693       {
1694         sb.append(newline).append(BANG).append(DOMAIN).append(EQUALS)
1695                 .append(domain).append(SPACE).append(GENE).append(EQUALS)
1696                 .append(gene);
1697         if (property != null)
1698         {
1699           sb.append(SPACE).append(PROPERTY).append(EQUALS).append(property);
1700         }
1701         sb.append(SEMICOLON);
1702       }
1703       else
1704       {
1705         sb.append(newline).append(BANG).append(GENE).append(EQUALS)
1706               .append(gene).append(SEMICOLON);
1707       }
1708     }
1709     else if (domain != null && !domain.equals(currentGene))
1710     {
1711       /*
1712        * change of domain within same (or no) gene
1713        */
1714       sb.append(newline).append(BANG).append(DOMAIN).append(EQUALS)
1715               .append(domain);
1716       if (currentGene != null)
1717       {
1718         sb.append(SPACE).append(GENE).append(EQUALS).append(currentGene);
1719       }
1720       if (property != null)
1721       {
1722         sb.append(SPACE).append(PROPERTY).append(EQUALS).append(property);
1723       }
1724       sb.append(SEMICOLON);
1725     }
1726     currentGene = gene;
1727     currentDomain = domain;
1728   }
1729
1730   /**
1731    * Outputs to string the MEGA header and any other known and relevant
1732    * alignment properties
1733    *
1734    * @param al
1735    */
1736   protected String printHeaders(AlignmentI al)
1737   {
1738     StringBuilder sb = new StringBuilder(128);
1739     sb.append(MEGA_ID).append(newline);
1740     String propertyValue = (String) al.getProperty(PROP_TITLE);
1741     if (propertyValue != null)
1742     {
1743       sb.append(BANG).append(TITLE).append(SPACE).append(propertyValue)
1744               .append(SEMICOLON).append(newline);
1745     }
1746     propertyValue = (String) al.getProperty(PROP_DESCRIPTION);
1747     if (propertyValue != null)
1748     {
1749       sb.append(BANG).append(DESCRIPTION).append(newline)
1750               .append(propertyValue).append(SEMICOLON)
1751               .append(newline);
1752     }
1753
1754     /*
1755      * !Format DataType CodeTable
1756      */
1757     sb.append(BANG).append(FORMAT).append(newline);
1758     String dataType = (String) al.getProperty(PROP_DATATYPE);
1759     if (dataType == null)
1760     {
1761       dataType = al.isNucleotide() ? NUCLEOTIDE : PROTEIN;
1762     }
1763     sb.append(INDENT).append(DATATYPE).append(EQUALS).append(dataType);
1764     String codeTable = (String) al.getProperty(PROP_CODETABLE);
1765     sb.append(SPACE).append(CODETABLE).append(EQUALS)
1766             .append(codeTable == null ? "Standard" : codeTable)
1767             .append(newline);
1768
1769     /*
1770      * !Format NSeqs NSites (the length of sequences - they should all be the
1771      * same - including gaps)
1772      */
1773     sb.append(INDENT).append(N_SEQS).append(EQUALS).append(al.getHeight());
1774     sb.append(SPACE).append(N_SITES).append(EQUALS)
1775             .append(String.valueOf(al.getWidth()));
1776     sb.append(newline);
1777
1778     /*
1779      * !Format Indel Identical Missing
1780      */
1781     sb.append(INDENT);
1782     sb.append(INDEL).append(EQUALS).append(al.getGapCharacter());
1783     String identity = (String) al.getProperty(PROP_IDENTITY);
1784     if (identity != null)
1785     {
1786       sb.append(SPACE).append(IDENTICAL).append(EQUALS).append(identity);
1787     }
1788     String missing = (String) al.getProperty(PROP_MISSING);
1789     if (missing != null)
1790     {
1791       sb.append(SPACE).append(MISSING).append(EQUALS).append(missing);
1792     }
1793     sb.append(SEMICOLON).append(newline);
1794
1795     return sb.toString();
1796   }
1797
1798   /**
1799    * Get the longest sequence id (to allow aligned printout).
1800    *
1801    * @param s
1802    * @return
1803    */
1804   protected static int getMaxIdLength(SequenceI[] s)
1805   {
1806     // TODO pull up for reuse
1807     int maxLength = 0;
1808     for (SequenceI seq : s)
1809     {
1810       int len = seq.getName().length();
1811       if (len > maxLength)
1812       {
1813         maxLength = len;
1814       }
1815     }
1816     return maxLength;
1817   }
1818
1819   /**
1820    * Get the longest sequence length (including gaps)
1821    *
1822    * @param s
1823    * @return
1824    */
1825   protected static int getMaxSequenceLength(SequenceI[] s)
1826   {
1827     // TODO pull up for reuse
1828     int maxLength = 0;
1829     for (SequenceI seq : s)
1830     {
1831       int len = seq.getLength();
1832       if (len > maxLength)
1833       {
1834         maxLength = len;
1835       }
1836     }
1837     return maxLength;
1838   }
1839
1840   /**
1841    * Print to string in noninterleaved format - all of each sequence in turn, in
1842    * blocks of 50 characters.
1843    *
1844    * @param s
1845    * @return
1846    */
1847   protected String printNonInterleaved(SequenceI[] s)
1848   {
1849     int maxSequenceLength = getMaxSequenceLength(s);
1850     // approx
1851     int numLines = maxSequenceLength / positionsPerLine + 2 + s.length;
1852
1853     /*
1854      * Roughly size a buffer to hold the whole output
1855      */
1856     StringBuilder sb = new StringBuilder(numLines * positionsPerLine);
1857
1858     for (SequenceI seq : s)
1859     {
1860       printSequence(sb, seq);
1861     }
1862
1863     return new String(sb);
1864   }
1865
1866   /**
1867    * Append a formatted complete sequence to the string buffer
1868    *
1869    * @param sb
1870    * @param seq
1871    */
1872   protected void printSequence(StringBuilder sb, SequenceI seq)
1873   {
1874     int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
1875     // round down to output a whole number of spaced blocks
1876     int chunksPerLine = positionsPerLine / spaceEvery;
1877
1878     sb.append(newline);
1879     sb.append(HASHSIGN + seq.getName()).append(newline);
1880     int startPos = 0;
1881     while (startPos < seq.getLength())
1882     {
1883       /*
1884        * print next line for this sequence
1885        */
1886       boolean firstChunk = true;
1887       int lastPos = startPos + positionsPerLine; // exclusive
1888       for (int j = 0; j < chunksPerLine; j++)
1889       {
1890         char[] subSequence = seq.getSequence(startPos,
1891                 Math.min(lastPos, startPos + spaceEvery));
1892         if (subSequence.length > 0)
1893         {
1894           if (!firstChunk)
1895           {
1896             sb.append(SPACE);
1897           }
1898           sb.append(subSequence);
1899           firstChunk = false;
1900         }
1901         startPos += subSequence.length;
1902       }
1903       // line end position (base 1) as a comment
1904       sb.append(SPACE).append(COMMENT_START).append(startPos)
1905               .append(COMMENT_END);
1906       sb.append(newline);
1907     }
1908   }
1909
1910   /**
1911    * Returns a formatted string like <br>
1912    * !Label aa_b_ ab_b_ <br>
1913    * where underscore represents no annotation, any other character a MEGA label
1914    * character <br>
1915    * Returns an empty string if there is no non-null annotation in the given
1916    * alignment range
1917    *
1918    * @param fromPos
1919    *          start column of the alignment (base 0)
1920    * @param positions
1921    *          number of positions to output
1922    * @param labelWidth
1923    *          padded width of !Label statement to output
1924    * @return
1925    */
1926   protected String printLabel(int fromPos, int positions, int labelWidth)
1927   {
1928     int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
1929     String none = "";
1930     AlignmentAnnotation ann = findAnnotation(MEGA_ANNOTATION_LABEL);
1931     if (ann == null)
1932     {
1933       return none;
1934     }
1935
1936     StringBuilder sb = new StringBuilder(positions + 20);
1937     sb.append(String.format("%-" + labelWidth + "s ", BANG + LABEL));
1938     Annotation[] anns = annotations.get(0).annotations;
1939     int blockCharCount = 0;
1940     boolean annotationFound = false;
1941
1942     for (int i = fromPos; i < fromPos + positions; i++)
1943     {
1944       String label = String.valueOf(UNDERSCORE);
1945       if (i < anns.length && anns[i] != null)
1946       {
1947         label = anns[i].displayCharacter;
1948       }
1949       sb.append(label);
1950       if (label.charAt(0) != UNDERSCORE)
1951       {
1952         annotationFound = true;
1953       }
1954       // add a space after each block except the last
1955       if (++blockCharCount % spaceEvery == 0
1956               && (i < fromPos + positions - 1))
1957       {
1958         sb.append(SPACE);
1959       }
1960     }
1961     sb.append(SEMICOLON).append(newline);
1962
1963     return annotationFound ? sb.toString() : none;
1964   }
1965
1966   /**
1967    * Returns the first stored annotation found with the given label, or null
1968    *
1969    * @param annotationLabel
1970    * @return
1971    */
1972   protected AlignmentAnnotation findAnnotation(String annotationLabel)
1973   {
1974     if (annotations != null)
1975     {
1976       for (AlignmentAnnotation ann : annotations)
1977       {
1978         if (annotationLabel.equals(ann.label))
1979         {
1980           return ann;
1981         }
1982       }
1983     }
1984     return null;
1985   }
1986
1987   /**
1988    * Flag this file as interleaved or not, based on data format. Throws an
1989    * exception if has previously been determined to be otherwise.
1990    *
1991    * @param isIt
1992    * @param dataLine
1993    * @throws FileFormatException
1994    */
1995   protected void assertInterleaved(boolean isIt, String dataLine)
1996           throws FileFormatException
1997   {
1998     if (this.interleaved != null && isIt != this.interleaved.booleanValue())
1999     {
2000       throw new FileFormatException("Parse error: interleaved was " + !isIt
2001               + " but now seems to be " + isIt + ", at line: " + dataLine);
2002     }
2003     this.interleaved = new Boolean(isIt);
2004     setAlignmentProperty(PROP_INTERLEAVED, interleaved.toString());
2005   }
2006
2007   public boolean isInterleaved()
2008   {
2009     return this.interleaved == null ? false : this.interleaved
2010             .booleanValue();
2011   }
2012
2013   /**
2014    * Adds saved parsed values either as alignment properties, or (in some cases)
2015    * as specific member fields of the alignment
2016    */
2017   @Override
2018   public void addProperties(AlignmentI al)
2019   {
2020     super.addProperties(al);
2021
2022     /*
2023      * record gap character specified, but convert to '-' if not one we support
2024      */
2025     al.setGapCharacter(Comparison.isGap(gapCharacter) ? gapCharacter
2026             : DEFAULT_GAP);
2027
2028     /*
2029      * warn if e.g. DataType=DNA but data is protein (or vice versa)
2030      */
2031     if (this.nucleotide != null && this.nucleotide != al.isNucleotide()) {
2032       System.err.println("Warning: " + this.title + " declared "
2033               + (nucleotide ? "" : " not ") + "nucleotide but it is"
2034               + (nucleotide ? " not" : ""));
2035     }
2036   }
2037
2038   /**
2039    * Print the given alignment in MEGA format. If the alignment was created by
2040    * parsing a MEGA file, it should have properties set (e.g. Title) which can
2041    * surface in the output.
2042    */
2043   @Override
2044   public String print(AlignmentI al)
2045   {
2046     this.nucleotide = al.isNucleotide();
2047
2048     /*
2049      * "MEGA Gene", "MEGA Domain" or "MEGA Label" annotations can be output
2050      */
2051     AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
2052     if (anns != null)
2053     {
2054       this.annotations = new Vector<AlignmentAnnotation>();
2055       for (AlignmentAnnotation ann : anns)
2056       {
2057         annotations.add(ann);
2058       }
2059     }
2060
2061     String lineLength = (String) al.getProperty(PROP_LINELENGTH);
2062     this.positionsPerLine = lineLength == null ? DEFAULT_LINE_LENGTH : Integer
2063             .parseInt(lineLength);
2064
2065     /*
2066      * round down to a multiple of 3 positions per line for nucleotide
2067      */
2068     if (nucleotide)
2069     {
2070       positionsPerLine = positionsPerLine - (positionsPerLine % 3);
2071     }
2072
2073     String interleave = (String) al.getProperty(PROP_INTERLEAVED);
2074     if (interleave != null)
2075     {
2076       this.interleaved = Boolean.valueOf(interleave);
2077     }
2078
2079     String headers = printHeaders(al);
2080     return headers + print(al.getSequencesArray());
2081   }
2082
2083   /**
2084    * Returns the number of sequence positions output per line
2085    *
2086    * @return
2087    */
2088   public int getPositionsPerLine()
2089   {
2090     return positionsPerLine;
2091   }
2092
2093   /**
2094    * Sets the number of sequence positions output per line. Note these will be
2095    * formatted in blocks of 3 (nucleotide) or 10 (peptide).
2096    *
2097    * @param p
2098    */
2099   public void setPositionsPerLine(int p)
2100   {
2101     this.positionsPerLine = p;
2102   }
2103
2104   /**
2105    * Extracts the Coding / Noncoding property of a domain from MEGA Domain
2106    * annotation at the given column position, if present
2107    *
2108    * @param column
2109    * @param domainAnnotation
2110    * @return
2111    */
2112   protected static String getPropertyFromAnnotation(int column,
2113           AlignmentAnnotation domainAnnotation)
2114   {
2115     /*
2116      * currently depends on parsing "Exon1 (Aspx Coding)" or similar
2117      */
2118     String property = null;
2119     Annotation domainAnn = domainAnnotation == null ? null
2120             : (column >= domainAnnotation.annotations.length ? null
2121                     : domainAnnotation.annotations[column]);
2122     String domain = domainAnn == null ? null : domainAnn.description;
2123     if (domain != null && domain.indexOf("(") > 0)
2124     {
2125       domain = domain.substring(domain.indexOf("(") +1);
2126       if (domain.indexOf(SPACE) > -1 && domain.endsWith(")")) {
2127         property = domain.substring(domain.indexOf(SPACE) + 1,
2128                 domain.length() - 1);
2129       }
2130     }
2131     return property;
2132   }
2133 }