src/jalview/io/MegaFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.0b1)
   3  * Copyright (C) 2014 The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  10  *
  11  * Jalview is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty
  13  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  14  * PURPOSE.  See the GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  17  * The Jalview Authors are detailed in the 'AUTHORS' file.
  18  */
  19 package jalview.io;
  20
  21 import jalview.datamodel.AlignmentAnnotation;
  22 import jalview.datamodel.AlignmentI;
  23 import jalview.datamodel.Annotation;
  24 import jalview.datamodel.Sequence;
  25 import jalview.datamodel.SequenceFeature;
  26 import jalview.datamodel.SequenceI;
  27
  28 import java.io.IOException;
  29 import java.util.ArrayList;
  30 import java.util.HashMap;
  31 import java.util.Iterator;
  32 import java.util.LinkedHashMap;
  33 import java.util.List;
  34 import java.util.Map;
  35 import java.util.Map.Entry;
  36 import java.util.Set;
  37
  38 /**
  39  * A parser for input or output of MEGA format files. <br>
  40  * <br>
  41  * Tamura K, Stecher G, Peterson D, Filipski A, and Kumar S (2013) MEGA6:
  42  * Molecular Evolutionary Genetics Analysis Version 6.0. Molecular Biology and
  43  * Evolution 30: 2725-2729. <br>
  44  * <br>
  45  *
  46  * MEGA file format is supported as described in
  47  * http://www.megasoftware.net/manual.pdf <br>
  48  * Limitations:
  49  * <ul>
  50  * <li>nested comments (marked by [ ]) are accepted but not preserved</li>
  51  * <li>to be completed</li>
  52  * </ul>
  53  *
  54  * @see http://www.megasoftware.net/
  55  */
  56 public class MegaFile extends AlignFile
  57 {
  58   private static final char UNDERSCORE = '_';
  59
  60   private static final String WHITESPACE = "\\s+";
  61
  62   private static final int DEFAULT_LINE_LENGTH = 60;
  63
  64   private static final String INDENT = "    ";
  65
  66   private static final String N_SITES = "NSites";
  67
  68   private static final String N_SEQS = "NSeqs";
  69
  70   private static final String MISSING = "Missing";
  71
  72   private static final String IDENTICAL = "Identical";
  73
  74   private static final String INDEL = "Indel";
  75
  76   private static final String CODETABLE = "CodeTable";
  77
  78   private static final String PROTEIN = "Protein";
  79
  80   private static final String NUCLEOTIDE = "Nucleotide";
  81
  82   private static final String DATATYPE = "DataType";
  83
  84   private static final char COMMENT_START = '[';
  85
  86   private static final char COMMENT_END = ']';
  87
  88   private static final String HASHSIGN = "#";
  89
  90   private static final String SEMICOLON = ";";
  91
  92   private static final String BANG = "!";
  93
  94   private static final String EQUALS = "=";
  95
  96   private static final String MEGA_ID = HASHSIGN + "MEGA";
  97
  98   private static final String TITLE = "Title";
  99
 100   private static final String FORMAT = "Format";
 101
 102   private static final String DESCRIPTION = "Description";
 103
 104   private static final String GENE = "Gene";
 105
 106   private static final String DOMAIN = "Domain";
 107
 108   private static final String PROPERTY = "Property";
 109
 110   private static final String CODONSTART = "CodonStart";
 111
 112   private static final String LABEL = "Label";
 113
 114   /*
 115    * names of properties to save to the alignment (may affect eventual output
 116    * format)
 117    */
 118   static final String PROP_TITLE = "MEGA_TITLE";
 119
 120   static final String PROP_INTERLEAVED = "MEGA_INTERLEAVED";
 121
 122   static final String PROP_DESCRIPTION = "MEGA_DESCRIPTION";
 123
 124   static final String PROP_CODETABLE = "MEGA_CODETABLE";
 125
 126   static final String PROP_IDENTITY = "MEGA_IDENTITY";
 127
 128   static final String PROP_MISSING = "MEGA_MISSING";
 129
 130   static final String PROP_DATATYPE = "MEGA_DATATYPE";
 131
 132   // number of bases per line of file (value is inferred)
 133   static final String PROP_LINELENGTH = "MEGA_LINELENGTH";
 134
  // TODO: need a controlled name for Gene as a feature if we want to be able to
  // output the MEGA file with !Gene headers
  // TODO: decide how Gene/Domain features should be handled if the sequences
  // are subsequently realigned
 138
 139   // initial size for sequence data buffer
 140   private static final int SEQBUFFERSIZE = 256;
 141
 142   private static final String SPACE = " ";
 143
 144   private static final String TAB = "\t";
 145
 146   /*
 147    * number of sequence positions output per line
 148    */
 149   private int positionsPerLine;
 150
 151   private String title;
 152
 153   // gap character may be explicitly declared, default is -
 154   private char gapCharacter = '-';
 155
 156   // identity character if declared
 157   private char identityCharacter = 0;
 158
 159   // this can be True, False or null (meaning not asserted in file)
 160   private Boolean nucleotide;
 161
 162   // set once we have seen one block of interleaved data
 163   private boolean firstDataBlockRead = false;
 164
 165   // this can be True, False or null (meaning we don't know yet)
 166   private Boolean interleaved;
 167
 168   // write end of line positions as a comment
 169   private boolean writePositionNumbers = true;
 170
 171   // id of sequence being processed
 172   private String currentSequenceId;
 173
 174   /*
 175    * Temporary store of {sequenceId, positionData} while parsing interleaved
 176    * sequences; sequences are maintained in the order in which they are added
 177    * i.e. read in the file
 178    */
 179   Map<String, StringBuilder> seqData;
 180
 181   // number of residues read (so far) per sequence
 182   Map<String, Integer> residuesRead;
 183
 184   // current Gene if any we are parsing
 185   private String currentGene;
 186
 187   // start residue (base 1) per sequence of current gene
 188   Map<String, Integer> geneStart;
 189
 190   // current Domain if any we are parsing
 191   private String currentDomain;
 192
 193   // start residue (base 1) per sequence of current domain
 194   Map<String, Integer> domainStart;
 195
 196   // map of SequenceFeature's by sequence id
 197   Map<String, List<SequenceFeature>> sequenceFeatures;
 198
 199   // each !Label line character becomes an Annotation (except underscores)
 200   List<Annotation> labelAnnotations;
 201
 202   public MegaFile()
 203   {
 204   }
 205
 206   public MegaFile(String inFile, String type) throws IOException
 207   {
 208     super(inFile, type);
 209   }
 210
 211   public MegaFile(FileParse source) throws IOException
 212   {
 213     super(source);
 214   }
 215
 216   /**
 217    * Parse the input stream.
 218    */
 219   @Override
 220   public void parse() throws IOException
 221   {
 222     gapCharacter = '-';
 223     sequenceFeatures = new HashMap<String, List<SequenceFeature>>();
 224     geneStart = new HashMap<String, Integer>();
 225     domainStart = new HashMap<String, Integer>();
 226     residuesRead = new HashMap<String, Integer>();
 227     labelAnnotations = new ArrayList<Annotation>();
 228
 229     /*
 230      * Read and process MEGA and Title/Format/Description headers if present.
 231      * Returns the first data line following the headers.
 232      */
 233     String dataLine = parseHeaderLines();
 234
 235     /*
 236      * order-preserving map to hold sequences by id as they are built up during
 237      * parsing
 238      */
 239     seqData = new LinkedHashMap<String, StringBuilder>();
 240
 241     /*
 242      * The id of the sequence being read (for non-interleaved)
 243      */
 244     currentSequenceId = "";
 245
 246     while (dataLine != null)
 247     {
 248       dataLine = dataLine.trim();
 249       if (dataLine.length() > 0)
 250       {
 251         dataLine = dataLine.replace(TAB, SPACE);
 252         String upperCased = dataLine.toUpperCase();
 253         if (upperCased.startsWith(BANG + GENE.toUpperCase())
 254                 || upperCased.startsWith(BANG + DOMAIN.toUpperCase()))
 255         {
 256           parseGeneOrDomain(dataLine);
 257         }
 258         else if (upperCased.startsWith(BANG + LABEL.toUpperCase()))
 259         {
 260           parseLabel(dataLine);
 261         }
 262         else
 263         {
 264           currentSequenceId = parseDataLine(dataLine);
 265         }
 266       }
 267       else if (!seqData.isEmpty())
 268       {
 269         /*
 270          * Blank line after processing some data...
 271          */
 272         endOfDataBlock();
 273       }
 274       dataLine = nextNonCommentLine();
 275     }
 276
 277     /*
 278      * close off any features currently being parsed
 279      */
 280     createFeature(GENE, currentGene, geneStart);
 281     createFeature(DOMAIN, currentDomain, domainStart);
 282
 283     // remember the (longest) line length read in, so we can output the same
 284     setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
 285
 286     deriveSequencesAndFeatures();
 287
 288     deriveAnnotations();
 289   }
 290
 291   /**
 292    * If we parsed !Label statements into a list of Annotation objects, create an
 293    * AlignmentAnnotation
 294    */
 295   protected void deriveAnnotations()
 296   {
 297     if (this.labelAnnotations.size() > 0)
 298     {
 299       Annotation[] anns = labelAnnotations
 300               .toArray(new Annotation[labelAnnotations.size()]);
 301       AlignmentAnnotation aa = new AlignmentAnnotation("MEGA", "Label",
 302               anns);
 303       this.annotations.add(aa);
 304     }
 305   }
 306
 307   /**
 308    * Parse a !Label line. This contains a single character per position (column)
 309    * of the alignment block above. An underscore character represents no label.
 310    * Labels are assembled into an AlignmentAnnotation object.
 311    *
 312    * @param dataLine
 313    * @throws FileFormatException
 314    */
 315   protected void parseLabel(String dataLine) throws FileFormatException
 316   {
 317     // strip off leading !Label and following spaces
 318     dataLine = dataLine.substring(LABEL.length() + 1).trim();
 319
 320     // remove internal spacing and any leading tab
 321     String labels = dataLine.replace(SPACE, "");
 322     if (labels.endsWith(SEMICOLON))
 323     {
 324       labels = labels.substring(0, labels.length() - 1);
 325     }
 326     else
 327     {
 328       System.err.println("Warning: '" + dataLine
 329               + "' should end with semi-colon");
 330     }
 331     for (char c : labels.toCharArray())
 332     {
 333       if (c == UNDERSCORE)
 334       {
 335         this.labelAnnotations.add(null);
 336       }
 337       else
 338       {
 339         this.labelAnnotations.add(new Annotation(String.valueOf(c), "",
 340                 ' ', 0f));
 341       }
 342     }
 343
 344     /*
 345      * sanity check - the number of labels added should exactly match the
 346      * sequence length so far
 347      */
 348     int sequenceLength = seqData.isEmpty() ? 0 : seqData.values()
 349             .iterator().next().length();
 350     if (labelAnnotations.size() != sequenceLength)
 351     {
 352       System.err.println("Warning: file inconsistent - "
 353               + labelAnnotations.size() + " labels for " + sequenceLength
 354               + " positions after " + dataLine);
 355     }
 356   }
 357
 358   /**
 359    * Post-processing after reading one block of interleaved data
 360    */
 361   protected void endOfDataBlock()
 362   {
 363     this.firstDataBlockRead = true;
 364     // TODO:
 365     // (initialise and) populate arrays of sequence length so far (excluding
 366     // gaps)
 367     // On change or end of a denoted Gene or Domain, add sequence features for
 368     // it
 369   }
 370
 371   /**
 372    * Parse a !Gene or !Domain command line. MEGA accepts
 373    * <ul>
 374    * <li>!Gene=name;</li>
 375    * <li>!Gene=name Property=Coding/Noncoding CodonStart=1/2/3;</li>
 376    * <li>!Gene=genename Domain=domainname Property= etc</li>
 377    * <li>!Domain=domainname Gene=genename Property= etc</li>
 378    * <li>!Domain=domainname Property= etc</li>
 379    * <li>!domain=domainname property=domainend</li>
 380    * </ul>
 381    * Properly, a Gene should be composed of Domain segments, but MEGA accepts
 382    * without. Note that keywords don't seem to be case sensitive.
 383    *
 384    * @param dataLine
 385    * @throws FileFormatException
 386    */
 387   protected void parseGeneOrDomain(String dataLine)
 388           throws FileFormatException
 389   {
 390     String domain = null;
 391     String gene = null;
 392     String property = null;
 393     String codonStart = null;
 394     String errorMsg = "Unrecognized format: " + dataLine;
 395
 396     if (!dataLine.startsWith(BANG) || !dataLine.endsWith(SEMICOLON))
 397     {
 398       throw new FileFormatException(errorMsg);
 399     }
 400     String trimmed = dataLine.substring(1, dataLine.length() - 1).trim();
 401     String[] tokens = trimmed.split(WHITESPACE);
 402     for (String token : tokens)
 403     {
 404       String[] keyValue = token.split("=");
 405       if (keyValue.length != 2)
 406       {
 407         throw new FileFormatException(errorMsg);
 408       }
 409       String key = keyValue[0];
 410       if (GENE.equalsIgnoreCase(key))
 411       {
 412         gene = keyValue[1];
 413       }
 414       else if (DOMAIN.equalsIgnoreCase(key))
 415       {
 416         domain = keyValue[1];
 417       }
 418       else if (PROPERTY.equalsIgnoreCase(key))
 419       {
 420         property = keyValue[1];
 421       }
 422       else if (CODONSTART.equalsIgnoreCase(key))
 423       {
 424         codonStart = keyValue[1];
 425       }
 426       else
 427       {
 428         System.err.println("Unrecognised token: '" + key + "; in "
 429                 + dataLine);
 430       }
 431     }
 432
 433     processGeneOrDomain(gene, domain, property, codonStart);
 434   }
 435
 436   /**
 437    * Process a statement containing one or both of Gene and Domain, and
 438    * optionally Property or CodonStart commands.
 439    *
 440    * @param gene
 441    *          the Gene name if specified, else null
 442    * @param domain
 443    *          the Domain name if specified, else null
 444    * @param property
 445    *          the Property value if specified, else null
 446    * @param codonStart
 447    *          the CodonStart value if specified, else null
 448    */
 449   protected void processGeneOrDomain(String gene, String domain,
 450           String property, String codonStart)
 451   {
 452     /*
 453      * the order of processing below ensures that we correctly capture where a
 454      * domain is in the context of an enclosing gene
 455      */
 456     processDomainEnd(domain, gene, property);
 457
 458     processGeneEnd(gene);
 459
 460     processGeneStart(gene);
 461
 462     processDomainStart(domain, property);
 463
 464     // TODO save codonStart if we plan to involve it in 'translate as cDNA'
 465   }
 466
 467   /**
 468    * If we have declared a domain, and it is not continuing, start a sequence
 469    * feature for it
 470    *
 471    * @param domain
 472    * @param property
 473    */
 474   protected void processDomainStart(String domain, String property)
 475   {
 476     if ("domainend".equalsIgnoreCase(property))
 477     {
 478       currentDomain = null;
 479       return;
 480     }
 481
 482     if (domain != null && !domain.equals(currentDomain))
 483     {
 484       String verboseDomain = makeVerboseDomainName(domain, property);
 485       startSequenceFeature(domainStart);
 486
 487       currentDomain = verboseDomain;
 488     }
 489   }
 490
 491   /**
 492    * If we have declared a gene, and it is not continuing, start a sequence
 493    * feature for it
 494    *
 495    * @param gene
 496    */
 497   protected void processGeneStart(String gene)
 498   {
 499     if (gene != null && !gene.equals(currentGene))
 500     {
 501       startSequenceFeature(geneStart);
 502     }
 503     currentGene = gene;
 504   }
 505
 506   /**
 507    * If we have been processing a domain, and it is not being continued, then
 508    * make a sequence feature for the domain just ended. Criteria for the domain
 509    * not being continued are either an explicit new domain or gene name, or a
 510    * 'Property=domainend' statement
 511    *
 512    * @param domain
 513    * @param gene
 514    * @param property
 515    * @return true if a feature is created, else false
 516    */
 517   protected boolean processDomainEnd(String domain, String gene,
 518           String property)
 519   {
 520     boolean newGene = (gene != null && !gene.equals(currentGene));
 521
 522     String verboseDomain = makeVerboseDomainName(domain, property);
 523
 524     if (this.currentDomain != null)
 525     {
 526       boolean newDomain = !this.currentDomain.equals(verboseDomain);
 527       boolean domainEnded = "domainend".equalsIgnoreCase(property);
 528       if (newDomain || newGene || domainEnded)
 529       {
 530         createFeature(DOMAIN, currentDomain, domainStart);
 531         currentDomain = null;
 532         return true;
 533       }
 534     }
 535     return false;
 536   }
 537
 538   /**
 539    * If we have been processing a gene, and it is not being continued, then make
 540    * a sequence feature for the gene just ended
 541    *
 542    * @param gene
 543    * @return true if a feature is created, else false
 544    */
 545   protected boolean processGeneEnd(String gene)
 546   {
 547     boolean created = false;
 548     /*
 549      * If we were processing a gene and now have either another, or none, create
 550      * a sequence feature for that gene
 551      */
 552     if (this.currentGene != null && !this.currentGene.equals(gene))
 553     {
 554       createFeature(GENE, currentGene, geneStart);
 555       currentGene = null;
 556       created = true;
 557     }
 558
 559     return created;
 560   }
 561
 562   /**
 563    * Makes an expanded descriptive name for Domain if possible e.g.
 564    * "Intron1 (Adh Coding)". Currently incorporates the current gene name (if
 565    * any) and the Coding/Noncoding property value (if given).
 566    *
 567    * @param domain
 568    * @param property
 569    * @return
 570    */
 571   protected String makeVerboseDomainName(String domain, String property)
 572   {
 573     String verboseDomain = domain;
 574     if (domain != null)
 575     {
 576       String coding = "";
 577       if ("Exon".equalsIgnoreCase(property)
 578               || "Coding".equalsIgnoreCase(property))
 579       {
 580         coding = " Coding";
 581       }
 582       else if ("Intron".equalsIgnoreCase(property)
 583               || "Noncoding".equalsIgnoreCase(property))
 584       {
 585         coding = " Noncoding";
 586       }
 587       verboseDomain = domain
 588               + (currentGene == null ? "" : " (" + currentGene + coding
 589                       + ")");
 590     }
 591     return verboseDomain;
 592   }
 593
 594   /**
 595    * Start processing a new feature
 596    *
 597    * @param startPositions
 598    */
 599   protected void startSequenceFeature(Map<String, Integer> startPositions)
 600   {
 601     /*
 602      * If the feature declaration precedes all sequences, we will know in
 603      * createFeature that it started with residue 1; otherwise note now where it
 604      * starts in each sequence
 605      */
 606     if (!residuesRead.isEmpty())
 607     {
 608       for (Entry<String, Integer> entry : residuesRead.entrySet())
 609       {
 610         String seqId = entry.getKey();
 611         Integer nextResidue = entry.getValue() + 1;
 612         startPositions.put(seqId, nextResidue);
 613       }
 614     }
 615   }
 616
 617   /**
 618    * Add a SequenceFeature to each sequence, using the given start/end values
 619    * per sequence
 620    *
 621    * @param featureType
 622    * @param featureValue
 623    * @param featureStartResidues
 624    */
 625   protected void createFeature(String featureType, String featureValue,
 626           Map<String, Integer> featureStartResidues)
 627   {
 628     if (featureValue == null)
 629     {
 630       return;
 631     }
 632
 633     Iterator<String> seqids = this.seqData.keySet().iterator();
 634     while (seqids.hasNext())
 635     {
 636       String seqid = seqids.next();
 637       Integer startAt = featureStartResidues.get(seqid);
 638       int sfstart = startAt == null ? 1 : startAt.intValue();
 639       int sfend = residuesRead.get(seqid);
 640       if (sfend >= sfstart)
 641       {
 642         /*
 643          * don't add feature if entirely gapped in the sequence
 644          */
 645         // TODO: type="Gene" (but then all coloured the same) or
 646         // type="GeneName"?
 647         SequenceFeature sf = new SequenceFeature(featureValue, featureType,
 648                 sfstart, sfend, 0f, null);
 649         sequenceFeatures.get(seqid).add(sf);
 650       }
 651     }
 652   }
 653
 654   /**
 655    * Returns the next line that is not a comment, or null at end of file.
 656    * Comments in MEGA are within [ ] brackets, and may be nested.
 657    *
 658    * @return
 659    * @throws IOException
 660    */
 661   protected String nextNonCommentLine() throws IOException
 662   {
 663     return nextNonCommentLine(0);
 664   }
 665
 666   /**
 667    * Returns the next non-comment line (or part line), or null at end of file.
 668    * Comments in MEGA are within [ ] brackets, and may be nested. They may occur
 669    * anywhere within a line (for example at the end with position numbers); this
 670    * method returns the line with any comments removed.
 671    *
 672    * @param depth
 673    *          current depth of nesting of comments while parsing
 674    * @return
 675    * @throws IOException
 676    */
 677   protected String nextNonCommentLine(final int depth) throws IOException
 678   {
 679     String data = null;
 680     data = nextLine();
 681     if (data == null)
 682     {
 683       if (depth > 0)
 684       {
 685         System.err.println("Warning: unterminated comment in data file");
 686       }
 687       return data;
 688     }
 689
 690     /*
 691      * If we are in a (possibly nested) comment after parsing this line, keep
 692      * reading recursively until the comment has unwound
 693      */
 694     int newDepth = commentDepth(data, depth);
 695     if (newDepth > 0)
 696     {
 697       return nextNonCommentLine(newDepth);
 698     }
 699     else
 700     {
 701       /*
 702        * not in a comment by end of this line; return what is left
 703        */
 704       String nonCommentPart = getNonCommentContent(data, depth);
 705       return nonCommentPart;
 706     }
 707   }
 708
 709   /**
 710    * Returns what is left of the input data after removing any comments, whether
 711    * 'in progress' from preceding lines, or embedded in the current line
 712    *
 713    * @param data
 714    *          input data
 715    * @param depth
 716    *          nested depth of comments pending termination
 717    * @return
 718    * @throws FileFormatException
 719    */
 720   protected static String getNonCommentContent(String data, int depth)
 721           throws FileFormatException
 722   {
 723     int len = data.length();
 724     StringBuilder result = new StringBuilder(len);
 725     for (int i = 0; i < len; i++)
 726     {
 727       char c = data.charAt(i);
 728       switch (c)
 729       {
 730       case COMMENT_START:
 731         depth++;
 732         break;
 733
 734       case COMMENT_END:
 735         if (depth > 0)
 736         {
 737           depth--;
 738         }
 739         else
 740         {
 741           result.append(c);
 742         }
 743         break;
 744
 745       default:
 746         if (depth == 0)
 747         {
 748           result.append(c);
 749         }
 750       }
 751     }
 752     return result.toString();
 753   }
 754
 755   /**
 756    * Calculates new depth of comment after parsing an input line i.e. the excess
 757    * of opening '[' over closing ']' characters. Any excess ']' are ignored (not
 758    * treated as comment delimiters).
 759    *
 760    * @param data
 761    *          input line
 762    * @param depth
 763    *          current comment nested depth before parsing the line
 764    * @return new depth after parsing the line
 765    */
 766   protected static int commentDepth(CharSequence data, int depth)
 767   {
 768     int newDepth = depth;
 769     int len = data.length();
 770     for (int i = 0; i < len; i++)
 771     {
 772       char c = data.charAt(i);
 773       if (c == COMMENT_START)
 774       {
 775         newDepth++;
 776       }
 777       else if (c == COMMENT_END && newDepth > 0)
 778       {
 779         newDepth--;
 780       }
 781     }
 782     return newDepth;
 783   }
 784
 785   /**
 786    * Convert the parsed sequence strings to objects and store them in the model.
 787    */
 788   protected void deriveSequencesAndFeatures()
 789   {
 790     Set<Entry<String, StringBuilder>> datasets = seqData.entrySet();
 791
 792     for (Entry<String, StringBuilder> dataset : datasets)
 793     {
 794       String sequenceId = dataset.getKey();
 795       StringBuilder characters = dataset.getValue();
 796       SequenceI s = new Sequence(sequenceId, new String(characters));
 797       this.seqs.addElement(s);
 798
 799       /*
 800        * and add any derived sequence features to the sequence
 801        */
 802       for (SequenceFeature sf : sequenceFeatures.get(sequenceId))
 803       {
 804         s.addSequenceFeature(sf);
 805       }
 806     }
 807   }
 808
 809   /**
 810    * Process one line of sequence data. If it has no sequence identifier, append
 811    * to the current id's sequence. Else parse out the sequence id and append the
 812    * data (if any) to that id's sequence. Returns the sequence id (implicit or
 813    * explicit) for this line.
 814    *
 815    * @param dataLine
 816    * @return
 817    * @throws IOException
 818    */
 819   protected String parseDataLine(String dataLine)
 820           throws IOException
 821   {
 822     String seqId = getSequenceId(dataLine);
 823     if (seqId == null)
 824     {
 825       /*
 826        * Just character data
 827        */
 828       parseNoninterleavedDataLine(dataLine);
 829       return currentSequenceId;
 830     }
 831     else if ((HASHSIGN + seqId).trim().equals(dataLine.trim()))
 832     {
 833       /*
 834        * Sequence id only - header line for noninterleaved data
 835        */
 836       return seqId;
 837     }
 838     else
 839     {
 840       /*
 841        * Sequence id followed by data
 842        */
 843       parseInterleavedDataLine(dataLine, seqId);
 844       return seqId;
 845     }
 846   }
 847
 848   /**
 849    * Add a line of sequence data to the buffer for the given sequence id. Start
 850    * a new one if we haven't seen it before.
 851    *
 852    * @param dataLine
 853    * @throws IOException
 854    */
 855   protected void parseNoninterleavedDataLine(String dataLine)
 856           throws FileFormatException
 857   {
 858     if (currentSequenceId == null)
 859     {
 860       /*
 861        * Oops. Data but no sequence id context.
 862        */
 863       throw new FileFormatException("No sequence id context at: "
 864               + dataLine);
 865     }
 866
 867     assertInterleaved(false, dataLine);
 868
 869     dataLine = addSequenceData(currentSequenceId, dataLine);
 870
 871     setPositionsPerLine(Math.max(positionsPerLine, dataLine.length()));
 872   }
 873
 874   /**
 875    * Get the sequence data for this sequence id, starting a new one if
 876    * necessary.
 877    *
 878    * @param currentId
 879    * @return
 880    */
 881   protected StringBuilder getSequenceDataBuffer(String currentId)
 882   {
 883     StringBuilder sb = seqData.get(currentId);
 884     if (sb == null)
 885     {
 886       // first data met for this sequence id, start a new buffer
 887       sb = new StringBuilder(SEQBUFFERSIZE);
 888       seqData.put(currentId, sb);
 889
 890       // and a placeholder for any SequenceFeature found
 891       sequenceFeatures.put(currentId, new ArrayList<SequenceFeature>());
 892     }
 893     return sb;
 894   }
 895
 896   /**
 897    * Parse one line of interleaved data e.g.
 898    *
 899    * <pre>
 900    * #TheSeqId CGATCGCATGCA
 901    * </pre>
 902    *
 903    * @param dataLine
 904    * @param seqId
 905    * @throws FileFormatException
 906    */
 907   protected void parseInterleavedDataLine(String dataLine, String seqId)
 908           throws FileFormatException
 909   {
 910     /*
 911      * New sequence found in second or later data block - error.
 912      */
 913     if (this.firstDataBlockRead && !seqData.containsKey(seqId))
 914     {
 915       throw new FileFormatException(
 916               "Parse error: misplaced new sequence starting at " + dataLine);
 917     }
 918
 919     String data = dataLine.substring(seqId.length() + 1).trim();
 920
 921     /*
 922      * Do nothing if this line is _only_ a sequence id with no data following.
 923      */
 924     if (data != null && data.length() > 0)
 925     {
 926       data = addSequenceData(seqId, data);
 927       setPositionsPerLine(Math.max(positionsPerLine, data.length()));
 928       assertInterleaved(true, dataLine);
 929     }
 930   }
 931
 932   /**
 933    * Remove spaces, and replace identity symbol, before appending the sequence
 934    * data to the buffer for the sequence id. Returns the reformatted added data.
 935    * Also updates a count of residues read for the sequence.
 936    *
 937    * @param seqId
 938    * @param data
 939    * @return
 940    */
 941   protected String addSequenceData(String seqId, String data)
 942   {
 943     StringBuilder sb = getSequenceDataBuffer(seqId);
 944     int len = sb.length();
 945     String formatted = data.replace(SPACE, "");
 946
 947     /*
 948      * If sequence contains '.' or other identity symbol; replace these with the
 949      * same position from the first (reference) sequence
 950      */
 951     int nonGapped = 0;
 952     StringBuilder referenceSequence = seqData.values().iterator().next();
 953     StringBuilder sb1 = new StringBuilder(formatted.length());
 954     for (int i = 0; i < formatted.length(); i++)
 955     {
 956       char nextChar = formatted.charAt(i);
 957       if (nextChar != gapCharacter)
 958       {
 959         nonGapped++;
 960       }
 961       if (nextChar == identityCharacter
 962               && len + i < referenceSequence.length())
 963       {
 964         sb1.append(referenceSequence.charAt(len + i));
 965       }
 966       else
 967       {
 968         sb1.append(nextChar);
 969       }
 970     }
 971     formatted = sb1.toString();
 972
 973     data = formatted;
 974     sb.append(data);
 975
 976     /*
 977      * increment residue count for the sequence
 978      */
 979     if (nonGapped > 0)
 980     {
 981       Integer residueCount = residuesRead.get(seqId);
 982       residuesRead.put(seqId, nonGapped
 983               + (residueCount == null ? 0 : residueCount));
 984     }
 985
 986     return data;
 987   }
 988
 989   /**
 990    * If the line begins with (e.g.) "#abcde " then returns "abcde" as the
 991    * identifier. Else returns null.
 992    *
 993    * @param dataLine
 994    * @return
 995    */
 996   public static String getSequenceId(String dataLine)
 997   {
 998     // TODO refactor to a StringUtils type class
 999     if (dataLine != null)
1000     {
1001       if (dataLine.startsWith(HASHSIGN))
1002       {
1003         int spacePos = dataLine.indexOf(" ");
1004         return (spacePos == -1 ? dataLine.substring(1) : dataLine
1005                 .substring(1, spacePos));
1006       }
1007     }
1008     return null;
1009   }
1010
1011   /**
1012    * Read the #MEGA and Title/Format/Description header lines (if present).
1013    *
1014    * Save as alignment properties in case useful.
1015    *
1016    * @return the next non-blank line following the header lines.
1017    * @throws IOException
1018    */
1019   protected String parseHeaderLines() throws IOException
1020   {
1021     String inputLine = null;
1022     while ((inputLine = nextNonCommentLine()) != null)
1023     {
1024       inputLine = inputLine.trim();
1025
1026       /*
1027        * skip blank lines
1028        */
1029       if (inputLine.length() == 0)
1030       {
1031         continue;
1032       }
1033
1034       if (inputLine.toUpperCase().startsWith(MEGA_ID))
1035       {
1036         continue;
1037       }
1038
1039       if (isTitle(inputLine))
1040       {
1041         this.title = getValue(inputLine);
1042         setAlignmentProperty(PROP_TITLE, title);
1043       }
1044       else if (inputLine.startsWith(BANG + DESCRIPTION))
1045       {
1046         parseDescription(inputLine);
1047       }
1048
1049       else if (inputLine.startsWith(BANG + FORMAT))
1050       {
1051         parseFormat(inputLine);
1052       }
1053       else if (!inputLine.toUpperCase().startsWith(MEGA_ID))
1054       {
1055
1056         /*
1057          * Return the first 'data line' i.e. one that is not blank, #MEGA or
1058          * TITLE:
1059          */
1060         break;
1061       }
1062     }
1063     return inputLine;
1064   }
1065
1066   /**
1067    * Parse a !Format statement. This may be multiline, and is ended by a
1068    * semicolon.
1069    *
1070    * @param inputLine
1071    * @throws IOException
1072    */
1073   protected void parseFormat(String inputLine) throws IOException
1074   {
1075     while (inputLine != null)
1076     {
1077       parseFormatLine(inputLine);
1078       if (inputLine.endsWith(SEMICOLON))
1079       {
1080         break;
1081       }
1082       inputLine = nextNonCommentLine();
1083     }
1084   }
1085
1086   /**
1087    * Parse one line of a !Format statement. This may contain one or more
1088    * keyword=value pairs.
1089    *
1090    * @param inputLine
1091    * @throws FileFormatException
1092    */
1093   protected void parseFormatLine(String inputLine)
1094           throws FileFormatException
1095   {
1096     if (inputLine.startsWith(BANG + FORMAT))
1097     {
1098       inputLine = inputLine.substring((BANG + FORMAT).length());
1099     }
1100     if (inputLine.endsWith(SEMICOLON))
1101     {
1102       inputLine = inputLine.substring(0, inputLine.length() - 1);
1103     }
1104     if (inputLine.length() == 0)
1105     {
1106       return;
1107     }
1108     String[] tokens = inputLine.trim().split(WHITESPACE);
1109     for (String token : tokens)
1110     {
1111       parseFormatKeyword(token);
1112     }
1113   }
1114
1115   /**
1116    * Parse a Keyword=Value token. Possible keywords are
1117    * <ul>
1118    * <li>DataType= DNA, RNA, Nucleotide, Protein</li>
1119    * <li>DataFormat= Interleaved, ?</li>
1120    * <li>NSeqs= number of sequences (synonym NTaxa)</li>
1121    * <li>NSites= number of bases / residues</li>
1122    * <li>Property= Exon (or Coding), Intron (or Noncoding), End (of domain)</li>
1123    * <li>Indel= gap character</li>
1124    * <li>Identical= identity character (to first sequence) (synonym MatchChar)</li>
1125    * <li>Missing= missing data character</li>
1126    * <li>CodeTable= Standard, other (MEGA supports various)</li>
1127    * </ul>
1128    *
1129    * @param token
1130    * @throws FileFormatException
1131    *           if an unrecognised keyword or value is encountered
1132    */
1133   protected void parseFormatKeyword(String token)
1134           throws FileFormatException
1135   {
1136     String msg = "Unrecognised Format command: " + token;
1137     String[] bits = token.split(EQUALS);
1138     if (bits.length != 2)
1139     {
1140       throw new FileFormatException(msg);
1141     }
1142     String keyword = bits[0];
1143     String value = bits[1];
1144
1145     /*
1146      * Jalview will work out whether nucleotide or not anyway
1147      */
1148     if (keyword.equalsIgnoreCase(DATATYPE))
1149     {
1150       if (value.equalsIgnoreCase("DNA") || value.equalsIgnoreCase("RNA")
1151               || value.equalsIgnoreCase("Nucleotide"))
1152       {
1153         this.nucleotide = true;
1154         // alignment computes whether or not it is nucleotide when created
1155       }
1156       else if (value.equalsIgnoreCase(PROTEIN))
1157       {
1158         this.nucleotide = false;
1159       }
1160       else
1161       {
1162         throw new FileFormatException(msg);
1163       }
1164       setAlignmentProperty(PROP_DATATYPE, value);
1165     }
1166
1167     /*
1168      * accept non-Standard code table but save in case we want to disable
1169      * 'translate as cDNA'
1170      */
1171     else if (keyword.equalsIgnoreCase(CODETABLE))
1172     {
1173       setAlignmentProperty(PROP_CODETABLE, value);
1174     }
1175
1176     /*
1177      * save gap char to set later on alignment once created
1178      */
1179     else if (keyword.equalsIgnoreCase(INDEL))
1180     {
1181       this.gapCharacter = value.charAt(0);
1182     }
1183
1184     else if (keyword.equalsIgnoreCase(IDENTICAL)
1185             || keyword.equalsIgnoreCase("MatchChar"))
1186     {
1187       setAlignmentProperty(PROP_IDENTITY, value);
1188       this.identityCharacter = value.charAt(0);
1189       if (!".".equals(value))
1190       {
1191         System.err.println("Warning: " + token
1192                 + " not supported, Jalview uses '.' for identity");
1193       }
1194     }
1195
1196     else if (keyword.equalsIgnoreCase(MISSING))
1197     {
1198       setAlignmentProperty(PROP_MISSING, value);
1199       System.err.println("Warning: " + token + " not supported");
1200     }
1201
1202     else if (keyword.equalsIgnoreCase(PROPERTY))
1203     {
1204       // TODO: can Property appear in a Format command?
1205       // suspect this is a mistake in the manual
1206     }
1207
1208     else if (!keyword.equalsIgnoreCase(N_SEQS)
1209             && !keyword.equalsIgnoreCase("NTaxa")
1210             && !keyword.equalsIgnoreCase(N_SITES))
1211     {
1212       System.err.println("Warning: " + msg);
1213     }
1214   }
1215
1216   /**
1217    * Returns the trimmed data on the line following either whitespace or '=',
1218    * with any trailing semi-colon removed<br>
1219    * So
1220    * <ul>
1221    * <li>Hello World</li>
1222    * <li>!Hello: \tWorld;</li>
1223    * <li>!Hello=World</li>
1224    * <ul>
1225    * should all return "World"
1226    *
1227    * @param inputLine
1228    * @return
1229    */
1230   protected static String getValue(String inputLine)
1231   {
1232     if (inputLine == null)
1233     {
1234       return null;
1235     }
1236     String value = null;
1237     String s = inputLine.replaceAll("\t", " ").trim();
1238
1239     /*
1240      * KEYWORD = VALUE should return VALUE
1241      */
1242     int equalsPos = s.indexOf("=");
1243     if (equalsPos >= 0)
1244     {
1245       value = s.substring(equalsPos + 1);
1246     }
1247     else
1248     {
1249       int spacePos = s.indexOf(' ');
1250       value = spacePos == -1 ? "" : s.substring(spacePos + 1);
1251     }
1252     value = value.trim();
1253     if (value.endsWith(SEMICOLON))
1254     {
1255       value = value.substring(0, value.length() - 1).trim();
1256     }
1257     return value;
1258   }
1259
1260   /**
1261    * Returns true if the input line starts with "TITLE" or "!TITLE" (not case
1262    * sensitive). The latter is the official format, some older data file
1263    * examples have it without the !.
1264    *
1265    * @param inputLine
1266    * @return
1267    */
1268   protected static boolean isTitle(String inputLine)
1269   {
1270     if (inputLine == null)
1271     {
1272       return false;
1273     }
1274     String upper = inputLine.toUpperCase();
1275     return (upper.startsWith(TITLE.toUpperCase()) || upper.startsWith(BANG
1276             + TITLE.toUpperCase()));
1277   }
1278
1279   /**
1280    * Reads lines until terminated by semicolon, appending each to the
1281    * Description property value.
1282    *
1283    * @throws IOException
1284    */
1285   protected void parseDescription(String firstDescriptionLine)
1286           throws IOException
1287   {
1288     StringBuilder desc = new StringBuilder(256);
1289     desc.append(getValue(firstDescriptionLine));
1290     if (!firstDescriptionLine.endsWith(SEMICOLON))
1291     {
1292       String line = nextNonCommentLine();
1293       while (line != null)
1294       {
1295         if (line.endsWith(SEMICOLON))
1296         {
1297           desc.append(line.substring(0, line.length() - 1));
1298           break;
1299         }
1300         else if (line.length() > 0)
1301         {
1302           desc.append(line).append(newline);
1303         }
1304         line = nextNonCommentLine();
1305       }
1306     }
1307     setAlignmentProperty(PROP_DESCRIPTION, desc.toString());
1308   }
1309
1310   /**
1311    * Returns the alignment sequences in Mega format.
1312    */
1313   @Override
1314   public String print()
1315   {
1316     return MEGA_ID + newline + print(getSeqsAsArray());
1317   }
1318
1319   /**
1320    * Write out the alignment sequences in Mega format - interleaved unless
1321    * explicitly noninterleaved.
1322    */
1323   protected String print(SequenceI[] s)
1324   {
1325     String result;
1326     if (this.interleaved != null && !this.interleaved)
1327     {
1328       result = printNonInterleaved(s);
1329     }
1330     else
1331     {
1332       result = printInterleaved(s);
1333     }
1334     return result;
1335   }
1336
1337   /**
1338    * Print to string in Interleaved format - blocks of next N characters of each
1339    * sequence in turn.
1340    *
1341    * @param s
1342    */
1343   protected String printInterleaved(SequenceI[] s)
1344   {
1345     int maxIdLength = getMaxIdLength(s);
1346     int maxSequenceLength = getMaxSequenceLength(s);
1347     int numLines = maxSequenceLength / positionsPerLine + 3; // approx
1348
1349     int numDataBlocks = (maxSequenceLength - 1) / positionsPerLine + 1;
1350     int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
1351     int chunksPerLine = (positionsPerLine + spaceEvery - 1) / spaceEvery;
1352
1353     /*
1354      * Roughly size a buffer to hold the whole output
1355      */
1356     StringBuilder sb = new StringBuilder(numLines
1357             * (maxIdLength + positionsPerLine + chunksPerLine + 10));
1358
1359     /*
1360      * Output as: #Seqid CGT AGC ACT ... or blocks of 10 for peptide
1361      */
1362     int from = 0;
1363     for (int i = 0; i < numDataBlocks; i++)
1364     {
1365       sb.append(newline);
1366       boolean first = true;
1367       int advancedBy = 0;
1368       for (SequenceI seq : s)
1369       {
1370         int seqFrom = from;
1371         String seqId = String.format("#%-" + maxIdLength + "s",
1372                 seq.getName());
1373
1374         /*
1375          * output next line for this sequence
1376          */
1377         sb.append(seqId);
1378         int lastPos = seqFrom + positionsPerLine; // exclusive
1379         for (int j = 0; j < chunksPerLine; j++)
1380         {
1381           char[] subSequence = seq.getSequence(seqFrom,
1382                   Math.min(lastPos, seqFrom + spaceEvery));
1383           if (subSequence.length > 0)
1384           {
1385             sb.append(SPACE).append(subSequence);
1386           }
1387           seqFrom += subSequence.length;
1388           if (first)
1389           {
1390             // all sequences should be the same length in MEGA
1391             advancedBy += subSequence.length;
1392           }
1393         }
1394         // write last position as a comment
1395         if (writePositionNumbers)
1396         {
1397           sb.append(SPACE).append(COMMENT_START).append(from + advancedBy)
1398                   .append(COMMENT_END);
1399         }
1400         sb.append(newline);
1401         first = false;
1402       }
1403       from += advancedBy;
1404     }
1405
1406     return new String(sb);
1407   }
1408
1409   /**
1410    * Outputs to string the MEGA header and any other known and relevant
1411    * alignment properties
1412    *
1413    * @param al
1414    */
1415   protected String printHeaders(AlignmentI al)
1416   {
1417     StringBuilder sb = new StringBuilder(128);
1418     sb.append(MEGA_ID).append(newline);
1419     String propertyValue = (String) al.getProperty(PROP_TITLE);
1420     if (propertyValue != null)
1421     {
1422       sb.append(BANG).append(TITLE).append(SPACE).append(propertyValue)
1423               .append(SEMICOLON).append(newline);
1424     }
1425     propertyValue = (String) al.getProperty(PROP_DESCRIPTION);
1426     if (propertyValue != null)
1427     {
1428       sb.append(BANG).append(DESCRIPTION).append(newline)
1429               .append(propertyValue).append(SEMICOLON)
1430               .append(newline);
1431     }
1432
1433     /*
1434      * !Format DataType CodeTable
1435      */
1436     sb.append(BANG).append(FORMAT).append(newline);
1437     String dataType = (String) al.getProperty(PROP_DATATYPE);
1438     if (dataType == null)
1439     {
1440       dataType = al.isNucleotide() ? NUCLEOTIDE : PROTEIN;
1441     }
1442     sb.append(INDENT).append(DATATYPE).append(EQUALS).append(dataType);
1443     String codeTable = (String) al.getProperty(PROP_CODETABLE);
1444     sb.append(SPACE).append(CODETABLE).append(EQUALS)
1445             .append(codeTable == null ? "Standard" : codeTable)
1446             .append(newline);
1447
1448     /*
1449      * !Format NSeqs NSites (the length of sequences - they should all be the
1450      * same - including gaps)
1451      */
1452     sb.append(INDENT).append(N_SEQS).append(EQUALS).append(al.getHeight());
1453     sb.append(SPACE).append(N_SITES).append(EQUALS)
1454             .append(String.valueOf(al.getWidth()));
1455     sb.append(newline);
1456
1457     /*
1458      * !Format Indel Identical Missing
1459      */
1460     sb.append(INDENT);
1461     sb.append(INDEL).append(EQUALS).append(al.getGapCharacter());
1462     String identity = (String) al.getProperty(PROP_IDENTITY);
1463     if (identity != null)
1464     {
1465       sb.append(SPACE).append(IDENTICAL).append(EQUALS).append(identity);
1466     }
1467     String missing = (String) al.getProperty(PROP_MISSING);
1468     if (missing != null)
1469     {
1470       sb.append(SPACE).append(MISSING).append(EQUALS).append(missing);
1471     }
1472     sb.append(SEMICOLON).append(newline);
1473
1474     return sb.toString();
1475   }
1476
1477   /**
1478    * Get the longest sequence id (to allow aligned printout).
1479    *
1480    * @param s
1481    * @return
1482    */
1483   protected static int getMaxIdLength(SequenceI[] s)
1484   {
1485     // TODO pull up for reuse
1486     int maxLength = 0;
1487     for (SequenceI seq : s)
1488     {
1489       int len = seq.getName().length();
1490       if (len > maxLength)
1491       {
1492         maxLength = len;
1493       }
1494     }
1495     return maxLength;
1496   }
1497
1498   /**
1499    * Get the longest sequence length
1500    *
1501    * @param s
1502    * @return
1503    */
1504   protected static int getMaxSequenceLength(SequenceI[] s)
1505   {
1506     // TODO pull up for reuse
1507     int maxLength = 0;
1508     for (SequenceI seq : s)
1509     {
1510       int len = seq.getLength();
1511       if (len > maxLength)
1512       {
1513         maxLength = len;
1514       }
1515     }
1516     return maxLength;
1517   }
1518
1519   /**
1520    * Print to string in noninterleaved format - all of each sequence in turn, in
1521    * blocks of 50 characters.
1522    *
1523    * @param s
1524    * @return
1525    */
1526   protected String printNonInterleaved(SequenceI[] s)
1527   {
1528     int maxSequenceLength = getMaxSequenceLength(s);
1529     // approx
1530     int numLines = maxSequenceLength / positionsPerLine + 2 + s.length;
1531
1532     /*
1533      * Roughly size a buffer to hold the whole output
1534      */
1535     StringBuilder sb = new StringBuilder(numLines * positionsPerLine);
1536
1537     int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
1538     int chunksPerLine = positionsPerLine / spaceEvery;
1539     for (SequenceI seq : s)
1540     {
1541       sb.append(newline);
1542       sb.append(HASHSIGN + seq.getName()).append(newline);
1543       int startPos = 0;
1544       while (startPos < seq.getLength())
1545       {
1546         boolean firstChunk = true;
1547         /*
1548          * print next line for this sequence
1549          */
1550         int lastPos = startPos + positionsPerLine; // exclusive
1551         for (int j = 0; j < chunksPerLine; j++)
1552         {
1553           char[] subSequence = seq.getSequence(startPos,
1554                   Math.min(lastPos, startPos + positionsPerLine));
1555           if (subSequence.length > 0)
1556           {
1557             if (!firstChunk)
1558             {
1559               sb.append(SPACE);
1560             }
1561             sb.append(subSequence);
1562             firstChunk = false;
1563           }
1564           startPos += subSequence.length;
1565         }
1566         sb.append(newline);
1567       }
1568     }
1569
1570     return new String(sb);
1571   }
1572
1573   /**
1574    * Flag this file as interleaved or not, based on data format. Throws an
1575    * exception if has previously been determined to be otherwise.
1576    *
1577    * @param isIt
1578    * @param dataLine
1579    * @throws IOException
1580    */
1581   protected void assertInterleaved(boolean isIt, String dataLine)
1582           throws FileFormatException
1583   {
1584     if (this.interleaved != null && isIt != this.interleaved.booleanValue())
1585     {
1586       throw new FileFormatException("Parse error: interleaved was " + !isIt
1587               + " but now seems to be " + isIt + ", at line: " + dataLine);
1588     }
1589     this.interleaved = new Boolean(isIt);
1590     setAlignmentProperty(PROP_INTERLEAVED, interleaved.toString());
1591   }
1592
1593   public boolean isInterleaved()
1594   {
1595     return this.interleaved == null ? false : this.interleaved
1596             .booleanValue();
1597   }
1598
1599   /**
1600    * Adds saved parsed values either as alignment properties, or (in some cases)
1601    * as specific member fields of the alignment
1602    */
1603   @Override
1604   public void addProperties(AlignmentI al)
1605   {
1606     super.addProperties(al);
1607     al.setGapCharacter(gapCharacter);
1608
1609     /*
1610      * warn if e.g. DataType=DNA but data is protein (or vice versa)
1611      */
1612     if (this.nucleotide != null && this.nucleotide != al.isNucleotide()) {
1613       System.err.println("Warning: " + this.title + " declared "
1614               + (nucleotide ? "" : " not ") + "nucleotide but it is"
1615               + (nucleotide ? " not" : ""));
1616     }
1617   }
1618
1619   /**
1620    * Print the given alignment in MEGA format. If the alignment was created by
1621    * parsing a MEGA file, it should have properties set (e.g. Title) which can
1622    * influence the output.
1623    */
1624   @Override
1625   public String print(AlignmentI al)
1626   {
1627     this.nucleotide = al.isNucleotide();
1628
1629     String lineLength = (String) al.getProperty(PROP_LINELENGTH);
1630     this.positionsPerLine = lineLength == null ? DEFAULT_LINE_LENGTH : Integer
1631             .parseInt(lineLength);
1632
1633     /*
1634      * round down to a multiple of 3 positions per line for nucleotide
1635      */
1636     if (nucleotide)
1637     {
1638       positionsPerLine = positionsPerLine - (positionsPerLine % 3);
1639     }
1640
1641     String interleave = (String) al.getProperty(PROP_INTERLEAVED);
1642     if (interleave != null)
1643     {
1644       this.interleaved = Boolean.valueOf(interleave);
1645     }
1646
1647     String headers = printHeaders(al);
1648     return headers + print(al.getSequencesArray());
1649   }
1650
1651   /**
1652    * Returns the number of sequence positions output per line
1653    *
1654    * @return
1655    */
1656   public int getPositionsPerLine()
1657   {
1658     return positionsPerLine;
1659   }
1660
1661   /**
1662    * Sets the number of sequence positions output per line. Note these will be
1663    * formatted in blocks of 3 (nucleotide) or 10 (peptide).
1664    *
1665    * @param p
1666    */
1667   public void setPositionsPerLine(int p)
1668   {
1669     this.positionsPerLine = p;
1670   }
1671 }