/*
 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.0b1)
 * Copyright (C) 2014 The Jalview Authors
 * 
 * This file is part of Jalview.
 * 
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License 
 * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
 *  
 * Jalview is distributed in the hope that it will be useful, but 
 * WITHOUT ANY WARRANTY; without even the implied warranty 
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR 
 * PURPOSE.  See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
 * The Jalview Authors are detailed in the 'AUTHORS' file.
 */
package jalview.io;

import jalview.datamodel.AlignmentI;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

/**
 * A parser for input or output of MEGA format files. <br>
 * <br>
 * Tamura K, Stecher G, Peterson D, Filipski A, and Kumar S (2013) MEGA6:
 * Molecular Evolutionary Genetics Analysis Version 6.0. Molecular Biology and
 * Evolution 30: 2725-2729. <br>
 * <br>
 * 
 * MEGA file format is supported as described in
 * http://www.megasoftware.net/manual.pdf <br>
 * Limitations:
 * <ul>
 * <li>nested comments (marked by [ ]) are accepted but not preserved</li>
 * <li>to be completed</li>
 * </ul>
 * 
 * @see http://www.megasoftware.net/
 */
public class MegaFile extends AlignFile
{
  // regular expression for a run of whitespace; used to split statements into
  // keyword=value tokens
  private static final String WHITESPACE = "\\s+";

  // NOTE(review): not referenced in the visible part of this file; presumably
  // the fallback number of positions per output line - confirm
  private static final int DEFAULT_LINE_LENGTH = 60;

  // indentation written before the keywords of the !Format statement on output
  private static final String INDENT = "    ";

  /*
   * Keywords that may appear in a !Format statement. NSites and NSeqs are
   * recognised but their values are not used by the parser.
   */
  private static final String N_SITES = "NSites";

  private static final String N_SEQS = "NSeqs";

  private static final String MISSING = "Missing";

  private static final String IDENTICAL = "Identical";

  private static final String INDEL = "Indel";

  private static final String CODETABLE = "CodeTable";

  // DataType values
  private static final String PROTEIN = "Protein";

  private static final String NUCLEOTIDE = "Nucleotide";

  private static final String DATATYPE = "DataType";

  // comments are enclosed in [ ] and may be nested
  private static final char COMMENT_START = '[';

  private static final char COMMENT_END = ']';

  // prefix of the #MEGA file identifier and of each sequence id
  private static final String HASHSIGN = "#";

  // terminator of a command statement
  private static final String SEMICOLON = ";";

  // prefix of a command statement e.g. !Title
  private static final String BANG = "!";

  // separator between a keyword and its value
  private static final String EQUALS = "=";

  // file identifier expected on the first line
  private static final String MEGA_ID = HASHSIGN + "MEGA";

  /*
   * Names of the header statements (each written with a leading '!')
   */
  private static final String TITLE = "Title";

  private static final String FORMAT = "Format";

  private static final String DESCRIPTION = "Description";

  /*
   * Keywords of a !Gene or !Domain statement embedded in the sequence data
   */
  private static final String GENE = "Gene";

  private static final String DOMAIN = "Domain";

  private static final String PROPERTY = "Property";

  private static final String CODONSTART = "CodonStart";

  /*
   * names of properties to save to the alignment (may affect eventual output
   * format)
   */
  static final String PROP_TITLE = "MEGA_TITLE";

  static final String PROP_INTERLEAVED = "MEGA_INTERLEAVED";

  static final String PROP_DESCRIPTION = "MEGA_DESCRIPTION";

  static final String PROP_CODETABLE = "MEGA_CODETABLE";

  static final String PROP_IDENTITY = "MEGA_IDENTITY";

  static final String PROP_MISSING = "MEGA_MISSING";

  static final String PROP_DATATYPE = "MEGA_DATATYPE";

  // number of bases per line of file (value is inferred)
  static final String PROP_LINELENGTH = "MEGA_LINELENGTH";

  // TODO: a controlled feature name is needed for Gene if we want to be able
  // to write the MEGA file back out with !Gene headers.
  // Open question: how should Gene/Domain features be handled if the sequences
  // are subsequently realigned?

  // initial size for sequence data buffer
  private static final int SEQBUFFERSIZE = 256;

  private static final String SPACE = " ";

  /*
   * number of sequence positions per line: set to the longest data line seen
   * while parsing, and used as the width of each line on output
   */
  private int positionsPerLine;

  // value of the Title header statement, if one was read
  private String title;

  // gap character may be explicitly declared (Indel=), default is -
  private char gapCharacter = '-';

  // identity character if declared (Identical= or MatchChar=), else 0
  private char identityCharacter = 0;

  // this can be True, False or null (meaning not asserted in file)
  private Boolean nucleotide;

  // set once we have seen one block of interleaved data (i.e. a blank line
  // after some sequence data)
  private boolean firstDataBlockRead = false;

  // this can be True, False or null (meaning we don't know yet)
  private Boolean interleaved;

  // write end of line positions as a comment
  private boolean writePositionNumbers = true;

  // id of sequence being processed
  private String currentSequenceId;

  /*
   * Temporary store of {sequenceId, positionData} while parsing (both
   * interleaved and non-interleaved data are accumulated here); sequences are
   * maintained in the order in which they are added i.e. read in the file
   */
  Map<String, StringBuilder> seqData;

  // number of residues (non-gap characters) read so far per sequence; a
  // sequence has no entry until its first residue has been read
  Map<String, Integer> residuesRead;

  // current Gene if any we are parsing
  private String currentGene;

  // start residue (base 1) per sequence of current gene
  Map<String, Integer> geneStart;

  // current Domain if any we are parsing
  private String currentDomain;

  // start residue (base 1) per sequence of current domain
  Map<String, Integer> domainStart;

  // sequence features derived from Gene/Domain statements, by sequence id
  Map<String, List<SequenceFeature>> sequenceFeatures;

  /**
   * Default constructor; performs no initialisation of its own.
   */
  public MegaFile()
  {
  }

  /**
   * Constructor that passes both arguments unchanged to the superclass
   * constructor.
   * 
   * @param inFile
   *          NOTE(review): presumably the file name or data to read - confirm
   *          against AlignFile
   * @param type
   *          NOTE(review): presumably the kind of source (file, URL, pasted
   *          text) - confirm against AlignFile
   * @throws IOException
   *           if thrown by the superclass constructor
   */
  public MegaFile(String inFile, String type) throws IOException
  {
    super(inFile, type);
  }

  /**
   * Constructor that passes an existing data source unchanged to the
   * superclass constructor.
   * 
   * @param source
   *          the data source to read from
   * @throws IOException
   *           if thrown by the superclass constructor
   */
  public MegaFile(FileParse source) throws IOException
  {
    super(source);
  }

  /**
   * Parses the input stream. Header statements (#MEGA, Title, Description,
   * Format) are processed first; each following non-blank line is then either
   * a !Gene / !Domain statement or a line of sequence data. A blank line after
   * some data marks the end of a data block. Finally any Gene or Domain still
   * open is converted to sequence features, the longest line length read is
   * saved as an alignment property, and the sequence objects are created.
   * <p>
   * The !Gene and !Domain prefixes are matched without regard to case, in line
   * with the case-insensitive keyword matching done when the statement itself
   * is parsed.
   * 
   * @throws IOException
   *           if the data cannot be read or is not in a recognised format
   */
  @Override
  public void parse() throws IOException
  {
    gapCharacter = '-';
    sequenceFeatures = new HashMap<String, List<SequenceFeature>>();
    geneStart = new HashMap<String, Integer>();
    domainStart = new HashMap<String, Integer>();
    residuesRead = new HashMap<String, Integer>();

    /*
     * Read and process MEGA and Title/Format/Description headers if present.
     * Returns the first data line following the headers.
     */
    String dataLine = parseHeaderLines();

    /*
     * order-preserving map to hold sequences by id as they are built up during
     * parsing
     */
    seqData = new LinkedHashMap<String, StringBuilder>();

    /*
     * The id of the sequence being read (for non-interleaved)
     */
    currentSequenceId = "";

    final String geneCommand = BANG + GENE;
    final String domainCommand = BANG + DOMAIN;

    while (dataLine != null)
    {
      dataLine = dataLine.trim();
      if (dataLine.length() > 0)
      {
        /*
         * regionMatches with ignoreCase=true is a locale-independent,
         * case-insensitive 'startsWith'
         */
        boolean geneOrDomain = dataLine.regionMatches(true, 0, geneCommand,
                0, geneCommand.length())
                || dataLine.regionMatches(true, 0, domainCommand, 0,
                        domainCommand.length());
        if (geneOrDomain)
        {
          parseGeneOrDomain(dataLine);
        }
        else
        {
          currentSequenceId = parseDataLine(dataLine);
        }
      }
      else if (!seqData.isEmpty())
      {
        /*
         * Blank line after processing some data...
         */
        endOfDataBlock();
      }
      dataLine = nextNonCommentLine();
    }

    /*
     * close off any features currently being parsed
     */
    createFeature(GENE, currentGene, geneStart);
    createFeature(DOMAIN, currentDomain, domainStart);

    // remember the (longest) line length read in, so we can output the same
    setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));

    deriveSequences();
  }

  /**
   * Called when a blank line is met after some sequence data has been read,
   * i.e. at the end of a block of data. Records that at least one data block
   * has now been read.
   */
  protected void endOfDataBlock()
  {
    // TODO: (initialise and) populate arrays of sequence length so far
    // (excluding gaps); on change or end of a denoted Gene or Domain, add
    // sequence features for it
    firstDataBlockRead = true;
  }

  /**
   * Parse a !Gene or !Domain command line. MEGA accepts
   * <ul>
   * <li>!Gene=name;</li>
   * <li>!Gene=name Property=Coding/Noncoding CodonStart=1/2/3;</li>
   * <li>!Gene=genename Domain=domainname Property= etc</li>
   * <li>!Domain=domainname Gene=genename Property= etc</li>
   * <li>!Domain=domainname Property= etc</li>
   * <li>!domain=domainname property=domainend</li>
   * </ul>
   * Properly, a Gene should be composed of Domain segments, but MEGA accepts
   * without. Keywords are matched without regard to case. A keyword that is
   * not one of Gene, Domain, Property or CodonStart is reported on stderr and
   * otherwise ignored.
   * 
   * @param dataLine
   *          the (trimmed) statement, starting with '!' and ending with ';'
   * @throws FileFormatException
   *           if the line does not start with '!' and end with ';', or
   *           contains a token that is not of the form keyword=value
   */
  protected void parseGeneOrDomain(String dataLine)
          throws FileFormatException
  {
    String domain = null;
    String gene = null;
    String property = null;
    String codonStart = null;
    String errorMsg = "Unrecognized format: " + dataLine;

    if (!dataLine.startsWith(BANG) || !dataLine.endsWith(SEMICOLON))
    {
      throw new FileFormatException(errorMsg);
    }

    /*
     * drop the leading '!' and trailing ';' then split into keyword=value
     * tokens
     */
    String trimmed = dataLine.substring(1, dataLine.length() - 1).trim();
    String[] tokens = trimmed.split(WHITESPACE);
    for (String token : tokens)
    {
      String[] keyValue = token.split(EQUALS);
      if (keyValue.length != 2)
      {
        throw new FileFormatException(errorMsg);
      }
      String key = keyValue[0];
      if (GENE.equalsIgnoreCase(key))
      {
        gene = keyValue[1];
      }
      else if (DOMAIN.equalsIgnoreCase(key))
      {
        domain = keyValue[1];
      }
      else if (PROPERTY.equalsIgnoreCase(key))
      {
        property = keyValue[1];
      }
      else if (CODONSTART.equalsIgnoreCase(key))
      {
        codonStart = keyValue[1];
      }
      else
      {
        System.err.println("Unrecognised token: '" + key + "' in "
                + dataLine);
      }
    }

    processGeneOrDomain(gene, domain, property, codonStart);
  }

  /**
   * Acts on a parsed statement that names a Gene and/or a Domain, optionally
   * with Property and CodonStart values. Any Domain or Gene currently being
   * tracked that is not continued by this statement is closed off as sequence
   * features; any newly named Gene or Domain has its start positions recorded.
   * 
   * @param gene
   *          the Gene name if specified, else null
   * @param domain
   *          the Domain name if specified, else null
   * @param property
   *          the Property value if specified, else null
   * @param codonStart
   *          the CodonStart value if specified, else null (not currently used)
   */
  protected void processGeneOrDomain(String gene, String domain,
          String property, String codonStart)
  {
    final boolean endsDomain = "domainend".equalsIgnoreCase(property);

    /*
     * Work out what is ending and what is starting before changing anything
     */
    final boolean closeDomain = currentDomain != null
            && (endsDomain || !currentDomain.equals(domain));
    final boolean closeGene = currentGene != null
            && !currentGene.equals(gene);
    final boolean openGene = gene != null && !gene.equals(currentGene);
    final boolean openDomain = domain != null
            && !domain.equals(currentDomain);

    /*
     * Close the domain before the gene, since the domain is described in the
     * context of its enclosing gene (if any)
     */
    if (closeDomain)
    {
      String label = currentDomain;
      if (currentGene != null)
      {
        label = currentDomain + " (" + currentGene + ")";
      }
      createFeature(DOMAIN, label, domainStart);
    }
    if (closeGene)
    {
      createFeature(GENE, currentGene, geneStart);
    }

    /*
     * Note the start positions, per sequence, of anything newly declared
     */
    if (openGene)
    {
      startSequenceFeature(geneStart);
    }
    if (openDomain)
    {
      startSequenceFeature(domainStart);
    }

    currentGene = gene;
    if (endsDomain)
    {
      currentDomain = null;
    }
    else
    {
      currentDomain = domain;
    }
  }

  /**
   * Notes, for every sequence that has had residues read so far, the position
   * of its next residue as the start of a new feature.
   * 
   * @param startPositions
   *          map of start residue by sequence id, updated by this method
   */
  protected void startSequenceFeature(Map<String, Integer> startPositions)
  {
    /*
     * Nothing is recorded if no residues have been read yet (the feature was
     * declared ahead of all sequence data); createFeature then defaults the
     * start to residue 1.
     */
    Iterator<Entry<String, Integer>> counts = residuesRead.entrySet()
            .iterator();
    while (counts.hasNext())
    {
      Entry<String, Integer> count = counts.next();
      startPositions.put(count.getKey(), count.getValue() + 1);
    }
  }

  /**
   * Adds a SequenceFeature to each sequence read so far. The feature starts at
   * the residue recorded for the sequence in the given map (residue 1 if none
   * was recorded) and ends at the last residue read for the sequence. No
   * feature is added to a sequence that has no residues in that range, which
   * includes a sequence for which only gaps have been read so far.
   * 
   * @param featureType
   *          the kind of feature (e.g. Gene or Domain)
   * @param featureValue
   *          the name of the gene or domain; if null nothing is done
   * @param featureStartResidues
   *          start residue (base 1) of the feature, by sequence id
   */
  protected void createFeature(String featureType, String featureValue,
          Map<String, Integer> featureStartResidues)
  {
    if (featureValue == null)
    {
      return;
    }

    for (String seqid : this.seqData.keySet())
    {
      Integer startAt = featureStartResidues.get(seqid);
      int sfstart = startAt == null ? 1 : startAt.intValue();

      /*
       * a sequence has no residue count until its first non-gap character has
       * been read; treat that as zero rather than unboxing null
       */
      Integer residueCount = residuesRead.get(seqid);
      int sfend = residueCount == null ? 0 : residueCount.intValue();

      if (sfend >= sfstart)
      {
        /*
         * don't add feature if entirely gapped in the sequence
         */
        // TODO: type="Gene" (but then all coloured the same) or
        // type="GeneName"?
        SequenceFeature sf = new SequenceFeature(featureValue, featureType,
                sfstart, sfend, 0f, null);
        sequenceFeatures.get(seqid).add(sf);
      }
    }
  }

  /**
   * Returns the next line with any comments removed, or null at end of file.
   * Comments in MEGA are within [ ] brackets, and may be nested. Reading
   * starts outside any comment (nesting depth zero).
   * 
   * @return the next line stripped of comments, or null at end of file
   * @throws IOException
   *           if the data cannot be read
   */
  protected String nextNonCommentLine() throws IOException
  {
    return nextNonCommentLine(0);
  }

  /**
   * Returns the next non-comment line (or part line), or null at end of file.
   * Comments in MEGA are within [ ] brackets, and may be nested. They may occur
   * anywhere within a line (for example at the end with position numbers); this
   * method returns the line with any comments removed.
   * <p>
   * If a comment is still open at the end of a line, further lines are read
   * until it is closed. Any content found outside the comment on those lines,
   * including content that precedes the start of the comment, is kept and
   * returned as a single line.
   * 
   * @param depth
   *          current depth of nesting of comments while parsing
   * @return the non-comment content read, or null if end of file is reached
   *         without finding any
   * @throws IOException
   *           if the data cannot be read
   */
  protected String nextNonCommentLine(final int depth) throws IOException
  {
    StringBuilder content = null;
    int currentDepth = depth;

    do
    {
      String data = nextLine();
      if (data == null)
      {
        if (currentDepth > 0)
        {
          System.err.println("Warning: unterminated comment in data file");
        }
        /*
         * end of file: hand back anything salvaged from before the
         * unterminated comment, otherwise signal end of data
         */
        return (content == null || content.length() == 0) ? null : content
                .toString();
      }

      String nonCommentPart = getNonCommentContent(data, currentDepth);
      currentDepth = commentDepth(data, currentDepth);

      if (content == null)
      {
        if (currentDepth == 0)
        {
          /*
           * usual case: not in a comment by end of this line
           */
          return nonCommentPart;
        }
        content = new StringBuilder(nonCommentPart);
      }
      else
      {
        content.append(nonCommentPart);
      }
    } while (currentDepth > 0);

    return content.toString();
  }

  /**
   * Strips comments from the input data, returning only the characters that
   * lie outside any [ ] comment. A comment may already be open on entry (from
   * preceding lines), as indicated by the depth argument. A ']' met when no
   * comment is open is kept as ordinary content.
   * 
   * @param data
   *          input data
   * @param depth
   *          nested depth of comments pending termination
   * @return the input with comment text and comment delimiters removed
   * @throws FileFormatException
   *           declared but not thrown by this implementation
   */
  protected static String getNonCommentContent(String data, int depth)
          throws FileFormatException
  {
    int level = depth;
    StringBuilder kept = new StringBuilder(data.length());
    for (char ch : data.toCharArray())
    {
      if (ch == COMMENT_START)
      {
        level++;
      }
      else if (ch == COMMENT_END)
      {
        if (level > 0)
        {
          level--;
        }
        else
        {
          kept.append(ch);
        }
      }
      else if (level == 0)
      {
        kept.append(ch);
      }
    }
    return kept.toString();
  }

  /**
   * Works out how deeply nested in comments we are after scanning an input
   * line: each '[' increases the depth by one, each ']' decreases it by one
   * unless the depth is already zero, in which case the ']' is ignored (it is
   * not a comment delimiter).
   * 
   * @param data
   *          input line
   * @param depth
   *          current comment nested depth before parsing the line
   * @return new depth after parsing the line
   */
  protected static int commentDepth(CharSequence data, int depth)
  {
    int level = depth;
    final int end = data.length();
    int pos = 0;
    while (pos < end)
    {
      switch (data.charAt(pos++))
      {
      case COMMENT_START:
        level++;
        break;

      case COMMENT_END:
        if (level > 0)
        {
          level--;
        }
        break;

      default:
        break;
      }
    }
    return level;
  }

  /**
   * Builds a Sequence object from the character data accumulated for each
   * sequence id (in the order the ids were first read), adds it to the parsed
   * sequences, and attaches any features derived for that id.
   */
  protected void deriveSequences()
  {
    for (String sequenceId : seqData.keySet())
    {
      String characters = seqData.get(sequenceId).toString();
      SequenceI sequence = new Sequence(sequenceId, characters);
      this.seqs.addElement(sequence);

      /*
       * attach the features (if any) found for this sequence id
       */
      Iterator<SequenceFeature> features = sequenceFeatures.get(sequenceId)
              .iterator();
      while (features.hasNext())
      {
        sequence.addSequenceFeature(features.next());
      }
    }
  }

  /**
   * Processes one line of sequence data, which may be
   * <ul>
   * <li>character data only: appended to the current sequence</li>
   * <li>a sequence id only: the header line for non-interleaved data</li>
   * <li>a sequence id followed by data: the data is appended to that id's
   * sequence</li>
   * </ul>
   * 
   * @param dataLine
   *          the line to process
   * @return the sequence id (implicit or explicit) for this line
   * @throws IOException
   *           if the line cannot be processed
   */
  protected String parseDataLine(String dataLine)
          throws IOException
  {
    final String seqId = getSequenceId(dataLine);
    if (seqId == null)
    {
      // no id on this line, so it continues the current sequence
      parseNoninterleavedDataLine(dataLine);
      return currentSequenceId;
    }

    boolean idOnly = (HASHSIGN + seqId).trim().equals(dataLine.trim());
    if (!idOnly)
    {
      // id with data following it
      parseInterleavedDataLine(dataLine, seqId);
    }
    return seqId;
  }

  /**
   * Appends a line consisting only of sequence data to the buffer for the
   * current sequence id (starting a new buffer if this is the first data for
   * that id), and updates the longest line length seen.
   * 
   * @param dataLine
   *          the sequence data
   * @throws IOException
   *           if there is no current sequence id
   */
  protected void parseNoninterleavedDataLine(String dataLine)
          throws IOException
  {
    // NOTE(review): parse() initialises currentSequenceId to "" so this guard
    // does not fire for data met before any sequence id during a normal parse
    // - confirm whether an empty id should be rejected as well
    if (currentSequenceId == null)
    {
      throw new IOException("No sequence id context at: " + dataLine);
    }

    assertInterleaved(false, dataLine);

    String added = addSequenceData(currentSequenceId, dataLine);
    int longest = Math.max(positionsPerLine, added.length());
    setPositionsPerLine(longest);
  }

  /**
   * Returns the buffer holding the data read so far for the given sequence id.
   * If the id has not been seen before, an empty buffer is created and stored
   * for it, together with an empty list to hold its sequence features.
   * 
   * @param currentId
   *          the sequence id
   * @return the (possibly new) data buffer for the id
   */
  protected StringBuilder getSequenceDataBuffer(String currentId)
  {
    StringBuilder buffer = seqData.get(currentId);
    if (buffer != null)
    {
      return buffer;
    }

    // first data met for this sequence id
    buffer = new StringBuilder(SEQBUFFERSIZE);
    seqData.put(currentId, buffer);
    sequenceFeatures.put(currentId, new ArrayList<SequenceFeature>());
    return buffer;
  }

  /**
   * Parses a line holding a sequence id followed by data, e.g.
   * 
   * <pre>
   * #TheSeqId CGATCGCATGCA
   * </pre>
   * 
   * The data is appended to the sequence with that id. Nothing is done if no
   * data follows the id.
   * 
   * @param dataLine
   *          the whole line
   * @param seqId
   *          the sequence id already extracted from the line
   * @throws FileFormatException
   *           if a sequence id not seen before is met after the first block of
   *           data has been read
   */
  protected void parseInterleavedDataLine(String dataLine, String seqId)
          throws FileFormatException
  {
    if (firstDataBlockRead && !seqData.containsKey(seqId))
    {
      // all sequences must be introduced in the first data block
      throw new FileFormatException(
              "Parse error: misplaced new sequence starting at " + dataLine);
    }

    // skip over the '#' and the id to get to the data
    final String data = dataLine.substring(seqId.length() + 1).trim();
    if (data.length() == 0)
    {
      return;
    }

    final String added = addSequenceData(seqId, data);
    setPositionsPerLine(Math.max(positionsPerLine, added.length()));
    assertInterleaved(true, dataLine);
  }

  /**
   * Removes spaces from the data, and replaces any identity symbol with the
   * character at the same position of the first (reference) sequence, before
   * appending the data to the buffer for the sequence id. Also updates the
   * count of residues read for the sequence. Residues are counted after
   * identity symbols have been resolved, so that an identity symbol standing
   * for a gap in the reference sequence is not counted as a residue.
   * 
   * @param seqId
   *          the id of the sequence to add data to
   * @param data
   *          the data as read from the file
   * @return the reformatted data that was added
   */
  protected String addSequenceData(String seqId, String data)
  {
    StringBuilder sb = getSequenceDataBuffer(seqId);

    // position in the sequence at which the new data starts
    final int offset = sb.length();
    final String spaceless = data.replace(SPACE, "");

    /*
     * the first sequence read is the reference for any identity symbols; if we
     * are adding to the reference sequence itself there is nothing at these
     * positions yet, so its characters are left unchanged
     */
    StringBuilder referenceSequence = seqData.values().iterator().next();
    StringBuilder resolved = new StringBuilder(spaceless.length());
    int nonGapped = 0;
    for (int i = 0; i < spaceless.length(); i++)
    {
      char nextChar = spaceless.charAt(i);
      int position = offset + i;
      if (nextChar == identityCharacter
              && position < referenceSequence.length())
      {
        nextChar = referenceSequence.charAt(position);
      }
      if (nextChar != gapCharacter)
      {
        nonGapped++;
      }
      resolved.append(nextChar);
    }

    String formatted = resolved.toString();
    sb.append(formatted);

    /*
     * increment residue count for the sequence
     */
    if (nonGapped > 0)
    {
      Integer residueCount = residuesRead.get(seqId);
      residuesRead.put(seqId, nonGapped
              + (residueCount == null ? 0 : residueCount));
    }

    return formatted;
  }

  /**
   * Extracts the sequence identifier from a line that starts with '#': the
   * identifier is the text between the '#' and the first space, or the end of
   * the line if there is no space. For example "#abcde CGAT" gives "abcde".
   * 
   * @param dataLine
   *          the line to inspect (may be null)
   * @return the identifier, or null if the line is null or does not start with
   *         '#'
   */
  public static String getSequenceId(String dataLine)
  {
    // TODO refactor to a StringUtils type class
    if (dataLine == null || !dataLine.startsWith(HASHSIGN))
    {
      return null;
    }
    int spacePos = dataLine.indexOf(' ');
    int idEnd = spacePos == -1 ? dataLine.length() : spacePos;
    return dataLine.substring(1, idEnd);
  }

  /**
   * Reads the header section of the file: blank lines and the #MEGA identifier
   * are skipped, and any Title, Description and Format statements are parsed
   * (Title and Description being saved as alignment properties). Reading stops
   * at the first line that is none of these.
   * 
   * @return the first (trimmed) line following the headers, or null if end of
   *         file was reached
   * @throws IOException
   *           if the data cannot be read or a header is not valid
   */
  protected String parseHeaderLines() throws IOException
  {
    String inputLine = nextNonCommentLine();
    while (inputLine != null)
    {
      inputLine = inputLine.trim();

      boolean skip = inputLine.length() == 0
              || inputLine.toUpperCase().startsWith(MEGA_ID);
      if (!skip)
      {
        if (isTitle(inputLine))
        {
          this.title = getValue(inputLine);
          setAlignmentProperty(PROP_TITLE, title);
        }
        else if (inputLine.startsWith(BANG + DESCRIPTION))
        {
          parseDescription(inputLine);
        }
        else if (inputLine.startsWith(BANG + FORMAT))
        {
          parseFormat(inputLine);
        }
        else
        {
          /*
           * not blank, #MEGA or a header statement, so this is the first
           * line of data
           */
          return inputLine;
        }
      }
      inputLine = nextNonCommentLine();
    }
    return null;
  }

  /**
   * Parse a !Format statement. This may be multiline, and is ended by a
   * semicolon. Each line is trimmed before it is parsed, so that a line of
   * whitespace is treated as blank, and a semicolon followed by trailing
   * whitespace is still recognised as the end of the statement.
   * 
   * @param inputLine
   *          the first line of the statement
   * @throws IOException
   *           if the data cannot be read or the statement is not valid
   */
  protected void parseFormat(String inputLine) throws IOException
  {
    while (inputLine != null)
    {
      String statementLine = inputLine.trim();
      parseFormatLine(statementLine);
      if (statementLine.endsWith(SEMICOLON))
      {
        break;
      }
      inputLine = nextNonCommentLine();
    }
  }

  /**
   * Parse one line of a !Format statement. This may contain one or more
   * keyword=value pairs, optionally preceded by !Format and optionally
   * followed by a semicolon. Surrounding whitespace is ignored; a line that
   * holds nothing else (for example "!Format ;" or a line of spaces) is
   * accepted and has no effect.
   * 
   * @param inputLine
   *          one line of the statement
   * @throws FileFormatException
   *           if a token on the line is not a valid keyword=value pair
   */
  protected void parseFormatLine(String inputLine)
          throws FileFormatException
  {
    String statement = inputLine.trim();
    if (statement.startsWith(BANG + FORMAT))
    {
      statement = statement.substring((BANG + FORMAT).length());
    }
    if (statement.endsWith(SEMICOLON))
    {
      statement = statement.substring(0, statement.length() - 1);
    }

    /*
     * trim again, as removing the prefix and terminator may leave only
     * whitespace, which would otherwise be split into one empty token
     */
    statement = statement.trim();
    if (statement.length() == 0)
    {
      return;
    }
    for (String token : statement.split(WHITESPACE))
    {
      parseFormatKeyword(token);
    }
  }

  /**
   * Parses a single Keyword=Value token of a !Format statement. Keywords (not
   * case sensitive) are handled as follows:
   * <ul>
   * <li>DataType= DNA, RNA, Nucleotide or Protein; saved as a property</li>
   * <li>CodeTable= Standard or other; saved as a property</li>
   * <li>Indel= the gap character</li>
   * <li>Identical= (synonym MatchChar) the character meaning 'same as the
   * first sequence'; saved as a property</li>
   * <li>Missing= missing data character; saved as a property, with a warning
   * that it is not supported</li>
   * <li>Property= ignored</li>
   * <li>NSeqs= (synonym NTaxa) and NSites= ignored</li>
   * </ul>
   * Any other keyword results in a warning on stderr.
   * 
   * @param token
   *          the Keyword=Value text
   * @throws FileFormatException
   *           if the token is not of the form Keyword=Value, or the DataType
   *           value is not recognised
   */
  protected void parseFormatKeyword(String token)
          throws FileFormatException
  {
    final String msg = "Unrecognised Format command: " + token;
    final String[] parts = token.split(EQUALS);
    if (parts.length != 2)
    {
      throw new FileFormatException(msg);
    }
    final String keyword = parts[0];
    final String value = parts[1];

    if (DATATYPE.equalsIgnoreCase(keyword))
    {
      /*
       * Jalview will work out whether nucleotide or not anyway when the
       * alignment is created
       */
      boolean isNucleotide = "DNA".equalsIgnoreCase(value)
              || "RNA".equalsIgnoreCase(value)
              || NUCLEOTIDE.equalsIgnoreCase(value);
      if (!isNucleotide && !PROTEIN.equalsIgnoreCase(value))
      {
        throw new FileFormatException(msg);
      }
      this.nucleotide = isNucleotide;
      setAlignmentProperty(PROP_DATATYPE, value);
      return;
    }

    if (CODETABLE.equalsIgnoreCase(keyword))
    {
      /*
       * accept non-Standard code table but save in case we want to disable
       * 'translate as cDNA'
       */
      setAlignmentProperty(PROP_CODETABLE, value);
      return;
    }

    if (INDEL.equalsIgnoreCase(keyword))
    {
      /*
       * save gap char to set later on alignment once created
       */
      this.gapCharacter = value.charAt(0);
      return;
    }

    if (IDENTICAL.equalsIgnoreCase(keyword)
            || "MatchChar".equalsIgnoreCase(keyword))
    {
      setAlignmentProperty(PROP_IDENTITY, value);
      this.identityCharacter = value.charAt(0);
      if (!".".equals(value))
      {
        System.err.println("Warning: " + token
                + " not supported, Jalview uses '.' for identity");
      }
      return;
    }

    if (MISSING.equalsIgnoreCase(keyword))
    {
      setAlignmentProperty(PROP_MISSING, value);
      System.err.println("Warning: " + token + " not supported");
      return;
    }

    if (PROPERTY.equalsIgnoreCase(keyword))
    {
      // TODO: can Property appear in a Format command?
      // suspect this is a mistake in the manual
      return;
    }

    boolean ignorable = N_SEQS.equalsIgnoreCase(keyword)
            || "NTaxa".equalsIgnoreCase(keyword)
            || N_SITES.equalsIgnoreCase(keyword);
    if (!ignorable)
    {
      System.err.println("Warning: " + msg);
    }
  }

  /**
   * Extracts the value part of a line: the text following the first '=' if
   * there is one, otherwise the text following the first space (tabs being
   * treated as spaces). The value is trimmed and a trailing semi-colon is
   * removed.<br>
   * So
   * <ul>
   * <li>Hello World</li>
   * <li>!Hello: \tWorld;</li>
   * <li>!Hello=World</li>
   * </ul>
   * should all return "World"
   * 
   * @param inputLine
   *          the line to inspect (may be null)
   * @return the value, an empty string if the line has neither '=' nor a
   *         space, or null if the input is null
   */
  protected static String getValue(String inputLine)
  {
    if (inputLine == null)
    {
      return null;
    }
    final String s = inputLine.replace('\t', ' ').trim();

    /*
     * KEYWORD = VALUE should return VALUE
     */
    String value;
    final int equalsPos = s.indexOf('=');
    if (equalsPos >= 0)
    {
      value = s.substring(equalsPos + 1);
    }
    else
    {
      final int spacePos = s.indexOf(' ');
      value = spacePos < 0 ? "" : s.substring(spacePos + 1);
    }

    value = value.trim();
    return value.endsWith(SEMICOLON) ? value.substring(0,
            value.length() - 1).trim() : value;
  }

  /**
   * Returns true if the input line starts with "TITLE" or "!TITLE" (not case
   * sensitive). The latter is the official format, some older data file
   * examples have it without the !.
   * <p>
   * The comparison does not depend on the default locale (converting to upper
   * case in, for example, a Turkish locale would change 'i' to a dotted
   * capital and so fail to match "title").
   * 
   * @param inputLine
   *          the line to test (may be null)
   * @return true if the line is a title statement, false if not or if the
   *         input is null
   */
  protected static boolean isTitle(String inputLine)
  {
    if (inputLine == null)
    {
      return false;
    }
    final String bangTitle = BANG + TITLE;
    return inputLine.regionMatches(true, 0, TITLE, 0, TITLE.length())
            || inputLine.regionMatches(true, 0, bangTitle, 0,
                    bangTitle.length());
  }

  /**
   * Parses a !Description statement, which may span several lines and is
   * terminated by a semicolon, and saves its text as the Description alignment
   * property. Lines are joined with the newline separator. The terminating
   * semicolon is recognised even if followed by trailing whitespace, and any
   * text on the !Description line itself is separated by a newline from the
   * text of the following line.
   * 
   * @param firstDescriptionLine
   *          the line on which the statement starts
   * @throws IOException
   *           if the data cannot be read
   */
  protected void parseDescription(String firstDescriptionLine)
          throws IOException
  {
    StringBuilder desc = new StringBuilder(256);
    desc.append(getValue(firstDescriptionLine));
    if (!firstDescriptionLine.trim().endsWith(SEMICOLON))
    {
      /*
       * text found on the first line must not run into the next line
       */
      boolean separatorPending = desc.length() > 0;

      String line = nextNonCommentLine();
      while (line != null)
      {
        boolean lastLine = line.trim().endsWith(SEMICOLON);

        /*
         * on the last line, drop the terminating semicolon (and any
         * whitespace that follows it)
         */
        String text = lastLine ? line.substring(0,
                line.lastIndexOf(SEMICOLON)) : line;
        if (text.length() > 0)
        {
          if (separatorPending)
          {
            desc.append(newline);
            separatorPending = false;
          }
          desc.append(text);
          if (!lastLine)
          {
            desc.append(newline);
          }
        }
        if (lastLine)
        {
          break;
        }
        line = nextNonCommentLine();
      }
    }
    setAlignmentProperty(PROP_DESCRIPTION, desc.toString());
  }

  /**
   * Returns the sequences held by this object in MEGA format: the #MEGA
   * identifier line followed by the formatted sequence data.
   */
  @Override
  public String print()
  {
    StringBuilder output = new StringBuilder(MEGA_ID);
    output.append(newline).append(print(getSeqsAsArray()));
    return output.toString();
  }

  /**
   * Formats the given sequences as MEGA data. Interleaved layout is used
   * unless the data is known to be non-interleaved.
   * 
   * @param s
   *          the sequences to write
   * @return the formatted sequence data
   */
  protected String print(SequenceI[] s)
  {
    boolean nonInterleaved = this.interleaved != null
            && !this.interleaved.booleanValue();
    return nonInterleaved ? printNonInterleaved(s) : printInterleaved(s);
  }

  /**
   * Print to string in Interleaved format - blocks of next N characters of each
   * sequence in turn. Within a line, a space is written after every 3
   * positions if the data is known to be nucleotide, otherwise after every 10.
   * If enabled, the position reached is written as a comment at the end of
   * each line.
   * <p>
   * N is the number of positions per line if that has been set to a positive
   * value, otherwise the default line length is used (this avoids a division
   * by zero when no line length has been set).
   * 
   * @param s
   *          the sequences to write
   * @return the formatted sequence data
   */
  protected String printInterleaved(SequenceI[] s)
  {
    final int lineLength = positionsPerLine > 0 ? positionsPerLine
            : DEFAULT_LINE_LENGTH;

    int maxIdLength = getMaxIdLength(s);
    int maxSequenceLength = getMaxSequenceLength(s);
    int numLines = maxSequenceLength / lineLength + 3; // approx

    int numDataBlocks = (maxSequenceLength - 1) / lineLength + 1;
    int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
    int chunksPerLine = (lineLength + spaceEvery - 1) / spaceEvery;

    /*
     * Roughly size a buffer to hold the whole output
     */
    StringBuilder sb = new StringBuilder(numLines
            * (maxIdLength + lineLength + chunksPerLine + 10));

    /*
     * Output as: #Seqid CGT AGC ACT ... or blocks of 10 for peptide
     */
    int from = 0;
    for (int i = 0; i < numDataBlocks; i++)
    {
      sb.append(newline);
      boolean first = true;
      int advancedBy = 0;
      for (SequenceI seq : s)
      {
        int seqFrom = from;

        // sequence id, left-justified and padded to the longest id
        String seqId = String.format("#%-" + maxIdLength + "s",
                seq.getName());

        /*
         * output next line for this sequence
         */
        sb.append(seqId);
        int lastPos = seqFrom + lineLength; // exclusive
        for (int j = 0; j < chunksPerLine; j++)
        {
          char[] subSequence = seq.getSequence(seqFrom,
                  Math.min(lastPos, seqFrom + spaceEvery));
          if (subSequence.length > 0)
          {
            sb.append(SPACE).append(subSequence);
          }
          seqFrom += subSequence.length;
          if (first)
          {
            // all sequences should be the same length in MEGA, so the
            // first one determines how far each block advances
            advancedBy += subSequence.length;
          }
        }
        // write last position as a comment
        if (writePositionNumbers)
        {
          sb.append(SPACE).append(COMMENT_START).append(from + advancedBy)
                  .append(COMMENT_END);
        }
        sb.append(newline);
        first = false;
      }
      from += advancedBy;
    }

    return new String(sb);
  }

  /**
   * Builds the header section of the MEGA output for the given alignment: the
   * MEGA identifier, then Title and Description statements (for whichever of
   * these properties are set on the alignment), then a Format statement
   * holding data type, code table, sequence count, site count, the gap (indel)
   * character and - where set as alignment properties - the identical and
   * missing symbols.
   * 
   * @param al
   *          the alignment whose properties are written out
   * @return the header text
   */
  protected String printHeaders(AlignmentI al)
  {
    StringBuilder header = new StringBuilder(128);
    header.append(MEGA_ID);
    header.append(newline);

    /*
     * !Title (only if known)
     */
    String titleValue = (String) al.getProperty(PROP_TITLE);
    if (titleValue != null)
    {
      header.append(BANG).append(TITLE).append(SPACE);
      header.append(titleValue);
      header.append(SEMICOLON).append(newline);
    }

    /*
     * !Description (only if known) - the text starts on a new line
     */
    String descriptionValue = (String) al.getProperty(PROP_DESCRIPTION);
    if (descriptionValue != null)
    {
      header.append(BANG).append(DESCRIPTION).append(newline);
      header.append(descriptionValue);
      header.append(SEMICOLON).append(newline);
    }

    /*
     * !Format DataType CodeTable
     */
    header.append(BANG).append(FORMAT).append(newline);
    String dataType = (String) al.getProperty(PROP_DATATYPE);
    if (dataType == null)
    {
      // not declared, so derive from the alignment content
      if (al.isNucleotide())
      {
        dataType = NUCLEOTIDE;
      }
      else
      {
        dataType = PROTEIN;
      }
    }
    header.append(INDENT).append(DATATYPE).append(EQUALS);
    header.append(dataType);

    String codeTable = (String) al.getProperty(PROP_CODETABLE);
    if (codeTable == null)
    {
      codeTable = "Standard";
    }
    header.append(SPACE).append(CODETABLE).append(EQUALS);
    header.append(codeTable);
    header.append(newline);

    /*
     * !Format NSeqs NSites (the length of sequences - they should all be the
     * same - including gaps)
     */
    header.append(INDENT).append(N_SEQS).append(EQUALS);
    header.append(al.getHeight());
    header.append(SPACE).append(N_SITES).append(EQUALS);
    header.append(al.getWidth());
    header.append(newline);

    /*
     * !Format Indel Identical Missing
     */
    header.append(INDENT);
    header.append(INDEL).append(EQUALS);
    header.append(al.getGapCharacter());

    String identicalSymbol = (String) al.getProperty(PROP_IDENTITY);
    if (identicalSymbol != null)
    {
      header.append(SPACE).append(IDENTICAL).append(EQUALS);
      header.append(identicalSymbol);
    }

    String missingSymbol = (String) al.getProperty(PROP_MISSING);
    if (missingSymbol != null)
    {
      header.append(SPACE).append(MISSING).append(EQUALS);
      header.append(missingSymbol);
    }
    header.append(SEMICOLON).append(newline);

    return header.toString();
  }

  /**
   * Answers the length of the longest sequence name in the given array (used
   * to pad ids so that the printout is aligned), or zero if the array is
   * empty.
   * 
   * @param s
   *          the sequences to inspect
   * @return the greatest name length found
   */
  protected static int getMaxIdLength(SequenceI[] s)
  {
    // TODO pull up for reuse
    int longest = 0;
    for (int i = 0; i < s.length; i++)
    {
      longest = Math.max(longest, s[i].getName().length());
    }
    return longest;
  }

  /**
   * Answers the length of the longest sequence in the given array, or zero if
   * the array is empty.
   * 
   * @param s
   *          the sequences to inspect
   * @return the greatest sequence length found
   */
  protected static int getMaxSequenceLength(SequenceI[] s)
  {
    // TODO pull up for reuse
    int longest = 0;
    for (int i = 0; i < s.length; i++)
    {
      longest = Math.max(longest, s[i].getLength());
    }
    return longest;
  }

  /**
   * Print to string in noninterleaved format - all of each sequence in turn,
   * preceded by a line holding its name. Sequence data is wrapped at the
   * configured number of positions per line, and each line is split by spaces
   * into groups of 3 (nucleotide) or 10 (otherwise) residues, matching the
   * grouping used for interleaved output.
   * 
   * @param s
   *          the sequences to output
   * @return the formatted sequence data
   */
  protected String printNonInterleaved(SequenceI[] s)
  {
    int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;

    /*
     * Guard against a zero or negative line length (which would cause a
     * division by zero, or a loop that never advances) by falling back to
     * one group per line
     */
    int lineLength = positionsPerLine > 0 ? positionsPerLine : spaceEvery;

    int maxSequenceLength = getMaxSequenceLength(s);
    // approx
    int numLines = maxSequenceLength / lineLength + 2 + s.length;

    /*
     * Roughly size a buffer to hold the whole output
     */
    StringBuilder sb = new StringBuilder(numLines * lineLength);

    /*
     * Round up, so a trailing partial group is still output and there is
     * always at least one group per line
     */
    int chunksPerLine = (lineLength + spaceEvery - 1) / spaceEvery;
    for (SequenceI seq : s)
    {
      sb.append(newline);
      sb.append(HASHSIGN + seq.getName()).append(newline);
      int seqLength = seq.getLength();
      int startPos = 0;
      while (startPos < seqLength)
      {
        /*
         * print next line for this sequence
         */
        int lineStart = startPos;
        int lastPos = startPos + lineLength; // exclusive
        for (int j = 0; j < chunksPerLine; j++)
        {
          // next group, truncated at the end of the line (or the sequence)
          char[] subSequence = seq.getSequence(startPos,
                  Math.min(lastPos, startPos + spaceEvery));
          if (subSequence.length == 0)
          {
            break;
          }
          if (startPos > lineStart)
          {
            sb.append(SPACE);
          }
          sb.append(subSequence);
          startPos += subSequence.length;
        }
        if (startPos == lineStart)
        {
          // nothing could be read; stop rather than loop forever
          break;
        }
        sb.append(newline);
      }
    }

    return sb.toString();
  }

  /**
   * Flag this file as interleaved or not, based on data format, and record the
   * flag as an alignment property. Throws an exception if the file has
   * previously been determined to be otherwise.
   * 
   * @param isIt
   *          true if the data just parsed is in interleaved layout
   * @param dataLine
   *          the line being parsed (reported in the error message)
   * @throws FileFormatException
   *           if the flag was previously set to the opposite value
   */
  protected void assertInterleaved(boolean isIt, String dataLine)
          throws FileFormatException
  {
    if (this.interleaved != null && isIt != this.interleaved.booleanValue())
    {
      throw new FileFormatException(
              "Parse error: mix of interleaved and noninterleaved detected, at line: "
                      + dataLine);
    }
    // Boolean.valueOf reuses the cached instances (the constructor is
    // deprecated)
    this.interleaved = Boolean.valueOf(isIt);
    setAlignmentProperty(PROP_INTERLEAVED, interleaved.toString());
  }

  /**
   * Answers true if this file has been flagged as interleaved, or false if it
   * has been flagged as not interleaved or has not been flagged at all.
   * 
   * @return the interleaved flag, with unset treated as false
   */
  public boolean isInterleaved()
  {
    return Boolean.TRUE.equals(this.interleaved);
  }

  /**
   * Adds saved parsed values either as alignment properties, or (in some cases)
   * as specific member fields of the alignment. Also writes a warning to
   * standard error if the declared data type (nucleotide or not) disagrees
   * with what the alignment reports.
   * 
   * @param al
   *          the alignment to update
   */
  @Override
  public void addProperties(AlignmentI al)
  {
    super.addProperties(al);
    al.setGapCharacter(gapCharacter);

    /*
     * warn if e.g. DataType=DNA but data is protein (or vice versa)
     */
    if (this.nucleotide != null
            && this.nucleotide.booleanValue() != al.isNucleotide())
    {
      // "X declared nucleotide but it is not", or
      // "X declared not nucleotide but it is"
      System.err.println("Warning: " + this.title + " declared "
              + (nucleotide ? "" : "not ") + "nucleotide but it is"
              + (nucleotide ? " not" : ""));
    }
  }

  /**
   * Print the given alignment in MEGA format. If the alignment was created by
   * parsing a MEGA file, it should have properties set (e.g. Title) which can
   * influence the output.
   * <p>
   * The line length property, if present, sets the number of positions per
   * line; a value that is not a positive integer is ignored (with a warning)
   * and the default line length used instead. For nucleotide data the line
   * length is rounded down to a multiple of 3, but never below 3. The
   * interleaved property, if present, selects the output layout.
   * 
   * @param al
   *          the alignment to output
   * @return the headers followed by the formatted sequence data
   */
  @Override
  public String print(AlignmentI al)
  {
    this.nucleotide = al.isNucleotide();

    this.positionsPerLine = DEFAULT_LINE_LENGTH;
    String lineLength = (String) al.getProperty(PROP_LINELENGTH);
    if (lineLength != null)
    {
      int requested = 0;
      try
      {
        requested = Integer.parseInt(lineLength.trim());
      } catch (NumberFormatException e)
      {
        // a bad layout hint should not prevent output; reported below
        requested = 0;
      }
      if (requested > 0)
      {
        this.positionsPerLine = requested;
      }
      else
      {
        System.err.println("Warning: ignoring invalid line length '"
                + lineLength + "', using " + DEFAULT_LINE_LENGTH);
      }
    }

    /*
     * round down to a multiple of 3 positions per line for nucleotide, but
     * keep at least one codon per line so that output always advances
     */
    if (nucleotide)
    {
      positionsPerLine = Math.max(3, positionsPerLine
              - (positionsPerLine % 3));
    }

    String interleave = (String) al.getProperty(PROP_INTERLEAVED);
    if (interleave != null)
    {
      this.interleaved = Boolean.valueOf(interleave);
    }

    String headers = printHeaders(al);
    return headers + print(al.getSequencesArray());
  }

  /**
   * Answers the number of sequence positions written on each line of output.
   * 
   * @return the current positions-per-line setting
   */
  public int getPositionsPerLine()
  {
    return this.positionsPerLine;
  }

  /**
   * Stores the number of sequence positions to write on each line of output.
   * Note that positions within a line are formatted in blocks of 3
   * (nucleotide) or 10 (peptide).
   * 
   * @param p
   *          the number of positions per line
   */
  public void setPositionsPerLine(int p)
  {
    positionsPerLine = p;
  }
}