JAL-1499 initial tests working, can export to / import from textbox ok

[jalview.git] / src / jalview / io / MegaFile.java
diff --git a/src/jalview/io/MegaFile.java b/src/jalview/io/MegaFile.java

index 90693f1..238061a 100644 (file)
--- a/src/jalview/io/MegaFile.java
+++ b/src/jalview/io/MegaFile.java
@@ -48,6 +48,28 @@ import java.util.Set;
   */
  public class MegaFile extends AlignFile
  {
+  private static final int DEFAULT_LINE_LENGTH = 60;
+
+  private static final String INDENT = "    ";
+
+  private static final String N_SITES = "NSites";
+
+  private static final String N_SEQS = "NSeqs";
+
+  private static final String MISSING = "Missing";
+
+  private static final String IDENTICAL = "Identical";
+
+  private static final String INDEL = "Indel";
+
+  private static final String CODETABLE = "CodeTable";
+
+  private static final String PROTEIN = "Protein";
+
+  private static final String NUCLEOTIDE = "Nucleotide";
+
+  private static final String DATATYPE = "DataType";
+
    private static final char COMMENT_START = '[';
  
    private static final char COMMENT_END = ']';
@@ -62,7 +84,7 @@ public class MegaFile extends AlignFile
  
    private static final String MEGA_ID = HASHSIGN + "MEGA";
  
-  private static final String TITLE = "TITLE";
+  private static final String TITLE = "Title";
  
    private static final String FORMAT = "Format";
  
@@ -72,8 +94,6 @@ public class MegaFile extends AlignFile
  
    private static final String DOMAIN = "Domain";
  
-  private static final String INTERLEAVED = "Interleaved";
-
    /*
     * names of properties to save to the alignment (may affect eventual output
     * format)
@@ -90,6 +110,11 @@ public class MegaFile extends AlignFile
  
    static final String PROP_MISSING = "MEGA_MISSING";
  
+  static final String PROP_DATATYPE = "MEGA_DATATYPE";
+
+  // sequence positions per line of file (inferred from the longest data line read)
+  static final String PROP_LINELENGTH = "MEGA_LINELENGTH";
+
    // TODO: need a controlled name for Gene as a feature if we want to be able to
    // output the MEGA file with !Gene headers
    // WTF do we do if the sequences get realigned?
@@ -99,7 +124,10 @@ public class MegaFile extends AlignFile
  
    private static final String SPACE = " ";
  
-  private static final int POSITIONS_PER_LINE = 50;
+  /*
+   * number of sequence positions output per line
+   */
+  private int positionsPerLine;
  
    private String title;
  
@@ -181,6 +209,9 @@ public class MegaFile extends AlignFile
        dataLine = nextNonCommentLine();
      }
  
+    // remember the (longest) line length read in, so we can output the same
+    setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
+
      setSequences(seqData);
    }
  
@@ -435,6 +466,8 @@ public class MegaFile extends AlignFile
       * Add the current line of data to the sequence.
       */
      sb.append(dataLine);
+
+    setPositionsPerLine(Math.max(positionsPerLine, dataLine.length()));
    }
  
    /**
@@ -489,7 +522,7 @@ public class MegaFile extends AlignFile
      /*
       * Do nothing if this line is _only_ a sequence id with no data following.
       * 
-     * Remove any internal spaces (present in the 'fancy' file format)
+     * Remove any internal spaces
       */
      if (data != null && data.length() > 0)
      {
@@ -498,6 +531,7 @@ public class MegaFile extends AlignFile
          data = data.replace(SPACE, "");
        }
        sb.append(data);
+      setPositionsPerLine(Math.max(positionsPerLine, data.length()));
        assertInterleaved(true, dataLine);
      }
    }
@@ -554,7 +588,8 @@ public class MegaFile extends AlignFile
  
        if (isTitle(inputLine))
        {
-        setAlignmentProperty(PROP_TITLE, getValue(inputLine));
+        this.title = getValue(inputLine);
+        setAlignmentProperty(PROP_TITLE, title);
        }
        else if (inputLine.startsWith(BANG + DESCRIPTION))
        {
@@ -616,6 +651,10 @@ public class MegaFile extends AlignFile
      {
        inputLine = inputLine.substring(0, inputLine.length() - 1);
      }
+    if (inputLine.length() == 0)
+    {
+      return;
+    }
      String[] tokens = inputLine.trim().split("\\s"); // any whitespace
      for (String token : tokens)
      {
@@ -656,7 +695,7 @@ public class MegaFile extends AlignFile
      /*
       * Jalview will work out whether nucleotide or not anyway
       */
-    if (keyword.equalsIgnoreCase("DataType"))
+    if (keyword.equalsIgnoreCase(DATATYPE))
      {
        if (value.equalsIgnoreCase("DNA") || value.equalsIgnoreCase("RNA")
                || value.equalsIgnoreCase("Nucleotide"))
@@ -664,7 +703,7 @@ public class MegaFile extends AlignFile
          this.nucleotide = true;
          // alignment computes whether or not it is nucleotide when created
        }
-      else if (value.equalsIgnoreCase("Protein"))
+      else if (value.equalsIgnoreCase(PROTEIN))
        {
          this.nucleotide = false;
        }
@@ -672,13 +711,14 @@ public class MegaFile extends AlignFile
        {
          throw new FileFormatException(msg);
        }
+      setAlignmentProperty(PROP_DATATYPE, value);
      }
  
      /*
       * accept non-Standard code table but save in case we want to disable
       * 'translate as cDNA'
       */
-    else if (keyword.equalsIgnoreCase("CodeTable"))
+    else if (keyword.equalsIgnoreCase(CODETABLE))
      {
        setAlignmentProperty(PROP_CODETABLE, value);
      }
@@ -686,23 +726,23 @@ public class MegaFile extends AlignFile
      /*
       * save gap char to set later on alignment once created
       */
-    else if (keyword.equalsIgnoreCase("Indel"))
+    else if (keyword.equalsIgnoreCase(INDEL))
      {
        this.gapCharacter = value.charAt(0);
      }
  
-    else if (keyword.equalsIgnoreCase("Identical")
+    else if (keyword.equalsIgnoreCase(IDENTICAL)
              || keyword.equalsIgnoreCase("MatchChar"))
      {
+      setAlignmentProperty(PROP_IDENTITY, value);
        if (!".".equals(value))
        {
-        setAlignmentProperty(PROP_IDENTITY, value);
          System.err.println("Warning: " + token
                  + " not supported, Jalview uses '.' for identity");
        }
      }
  
-    else if (keyword.equalsIgnoreCase("Missing"))
+    else if (keyword.equalsIgnoreCase(MISSING))
      {
        setAlignmentProperty(PROP_MISSING, value);
        System.err.println("Warning: " + token + " not supported");
@@ -715,8 +755,8 @@ public class MegaFile extends AlignFile
        setAlignmentProperty(PROP_MISSING, value);
      }
  
-    else if (!keyword.equalsIgnoreCase("NSeqs")
-            && !keyword.equalsIgnoreCase("NSites"))
+    else if (!keyword.equalsIgnoreCase(N_SEQS)
+            && !keyword.equalsIgnoreCase(N_SITES))
      {
        System.err.println("Warning: " + msg);
      }
@@ -781,7 +821,8 @@ public class MegaFile extends AlignFile
        return false;
      }
      String upper = inputLine.toUpperCase();
-    return (upper.startsWith(TITLE) || upper.startsWith(BANG + TITLE));
+    return (upper.startsWith(TITLE.toUpperCase()) || upper.startsWith(BANG
+            + TITLE.toUpperCase()));
    }
  
    /**
@@ -799,7 +840,7 @@ public class MegaFile extends AlignFile
      {
        if (line.endsWith(SEMICOLON))
        {
-        desc.append(line.substring(0, line.length() - 1)).append(newline);
+        desc.append(line.substring(0, line.length() - 1));
          break;
        }
        else if (line.length() > 0)
@@ -812,24 +853,21 @@ public class MegaFile extends AlignFile
    }
  
    /**
-   * Write out the alignment sequences in Mega format.
+   * Returns the alignment sequences in Mega format.
     */
    @Override
    public String print()
    {
-    return print(getSeqsAsArray());
+    return MEGA_ID + newline + print(getSeqsAsArray());
    }
  
    /**
     * Write out the alignment sequences in Mega format - interleaved unless
     * explicitly noninterleaved.
     */
-  public String print(SequenceI[] s)
+  protected String print(SequenceI[] s)
    {
-    // TODO: is there a way to preserve the 'interleaved' property so it can
-    // affect output?
-
-    String result = null;
+    String result;
      if (this.interleaved != null && !this.interleaved)
      {
        result = printNonInterleaved(s);
@@ -842,21 +880,8 @@ public class MegaFile extends AlignFile
    }
  
    /**
-   * Print the sequences in interleaved format, each row 15 space-separated
-   * triplets.
-   * 
-   * @param s
-   * @return
-   */
-  protected String printInterleavedCodons(SequenceI[] s)
-  {
-    // TODO not coded yet - defaulting to the 'simple' format output
-    return printInterleaved(s);
-  }
-
-  /**
-   * Print to string in Interleaved format - blocks of next 50 characters of
-   * each sequence in turn.
+   * Print to string in Interleaved format - blocks of the next positionsPerLine
+   * characters of each sequence in turn.
     * 
     * @param s
     */
@@ -864,58 +889,119 @@ public class MegaFile extends AlignFile
    {
      int maxIdLength = getMaxIdLength(s);
      int maxSequenceLength = getMaxSequenceLength(s);
-    int numLines = maxSequenceLength / POSITIONS_PER_LINE + 3; // approx
+    int numLines = maxSequenceLength / positionsPerLine + 3; // approx
  
      /*
       * Size a buffer to hold the whole output
       */
      StringBuilder sb = new StringBuilder(numLines
-            * (maxIdLength + 2 + POSITIONS_PER_LINE));
-    printHeaders(sb);
+            * (maxIdLength + 2 + positionsPerLine));
+
+    int numDataBlocks = (maxSequenceLength - 1) / positionsPerLine + 1;
+    int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
+    int chunksPerLine = (positionsPerLine + spaceEvery - 1) / spaceEvery;
  
-    int numDataBlocks = (maxSequenceLength - 1) / POSITIONS_PER_LINE + 1;
+    /*
+     * Output as: #Seqid CGT AGC ACT ... or blocks of 10 for peptide
+     */
+    int from = 0;
      for (int i = 0; i < numDataBlocks; i++)
      {
        sb.append(newline);
+      boolean first = true;
+      int advancedBy = 0;
        for (SequenceI seq : s)
        {
-
-        String seqId = String.format("#%-" + maxIdLength + "s ",
+        int seqFrom = from;
+        String seqId = String.format("#%-" + maxIdLength + "s",
                  seq.getName());
-        char[] subSequence = seq.getSequence(i * POSITIONS_PER_LINE,
-                (i + 1) * POSITIONS_PER_LINE);
+
+        /*
+         * output next line for this sequence
+         */
          sb.append(seqId);
-        sb.append(subSequence);
+        int lastPos = seqFrom + positionsPerLine; // exclusive
+        for (int j = 0; j < chunksPerLine; j++)
+        {
+          char[] subSequence = seq.getSequence(seqFrom,
+                  Math.min(lastPos, seqFrom + spaceEvery));
+          if (subSequence.length > 0)
+          {
+            sb.append(SPACE).append(subSequence);
+          }
+          seqFrom += subSequence.length;
+          if (first)
+          {
+            // all sequences should be the same length in MEGA
+            advancedBy += subSequence.length;
+          }
+        }
          sb.append(newline);
+        first = false;
        }
+      from += advancedBy;
      }
  
      return new String(sb);
    }
  
    /**
-   * Append the MEGA header and any other known properties
+   * Outputs to string the MEGA header and any other known and relevant
+   * alignment properties
     * 
-   * @param sb
+   * @param al the alignment whose properties and dimensions are written out
     */
-  private void printHeaders(StringBuilder sb)
+  protected String printHeaders(AlignmentI al)
    {
-    sb.append(MEGA_ID);
-    sb.append(newline);
+    StringBuilder sb = new StringBuilder(128);
+    sb.append(MEGA_ID).append(newline);
+    printProperty(al, sb, PROP_TITLE, TITLE);
+    printProperty(al, sb, PROP_DESCRIPTION, DESCRIPTION);
  
-    String ttle = getAlignmentProperty(PROP_TITLE);
-    if (ttle != null)
+    /*
+     * !Format DataType CodeTable
+     */
+    sb.append(BANG).append(FORMAT).append(newline);
+    String dataType = (String) al.getProperty(PROP_DATATYPE);
+    if (dataType == null)
      {
-      sb.append(BANG).append(TITLE).append(SPACE).append(ttle)
-              .append(SEMICOLON).append(newline);
+      dataType = al.isNucleotide() ? NUCLEOTIDE : PROTEIN;
      }
+    sb.append(INDENT).append(DATATYPE).append(EQUALS).append(dataType);
+    String codeTable = (String) al.getProperty(PROP_CODETABLE);
+    sb.append(SPACE).append(CODETABLE).append(EQUALS)
+            .append(codeTable == null ? "Standard" : codeTable)
+            .append(newline);
+    
+    /*
+     * !Format NSeqs NSites
+     * NSites is output as (end - start + 1) of the first sequence (all sequences
+     * should be the same length). TODO: confirm whether gaps should be excluded.
+     */
+    sb.append(INDENT).append(N_SEQS).append(EQUALS).append(al.getHeight());
+    SequenceI seq = al.getSequenceAt(0);
+    sb.append(SPACE).append(N_SITES).append(EQUALS)
+            .append(seq.getEnd() - seq.getStart() + 1);
+    sb.append(newline);
  
-    String desc = getAlignmentProperty(PROP_DESCRIPTION);
-    if (desc != null)
+    /*
+     * !Format Indel Identical Missing
+     */
+    sb.append(INDENT);
+    sb.append(INDEL).append(EQUALS).append(al.getGapCharacter());
+    String identity = (String) al.getProperty(PROP_IDENTITY);
+    if (identity != null)
+    {
+      sb.append(SPACE).append(IDENTICAL).append(EQUALS).append(identity);
+    }
+    String missing = (String) al.getProperty(PROP_MISSING);
+    if (missing != null)
      {
-      sb.append(BANG).append(DESCRIPTION).append(SPACE).append(desc)
-              .append(SEMICOLON).append(newline);
+      sb.append(SPACE).append(MISSING).append(EQUALS).append(missing);
      }
+    sb.append(SEMICOLON).append(newline);
+
+    return sb.toString();
    }
  
    /**
@@ -971,26 +1057,43 @@ public class MegaFile extends AlignFile
    {
      int maxSequenceLength = getMaxSequenceLength(s);
      // approx
-    int numLines = maxSequenceLength / POSITIONS_PER_LINE + 2 + s.length;
+    int numLines = maxSequenceLength / positionsPerLine + 2 + s.length;
  
      /*
       * Roughly size a buffer to hold the whole output
       */
-    StringBuilder sb = new StringBuilder(numLines * POSITIONS_PER_LINE);
-    printHeaders(sb);
+    StringBuilder sb = new StringBuilder(numLines * positionsPerLine);
  
+    int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
+    int chunksPerLine = positionsPerLine / spaceEvery;
      for (SequenceI seq : s)
      {
        sb.append(newline);
        sb.append(HASHSIGN + seq.getName()).append(newline);
        int startPos = 0;
-      while (startPos <= seq.getLength())
+      while (startPos < seq.getLength())
        {
-        char[] subSequence = seq.getSequence(startPos, startPos
-                + POSITIONS_PER_LINE);
-        sb.append(subSequence);
+        boolean firstChunk = true;
+        /*
+         * print next line for this sequence
+         */
+        int lastPos = startPos + positionsPerLine; // exclusive
+        for (int j = 0; j < chunksPerLine; j++)
+        {
+          char[] subSequence = seq.getSequence(startPos,
+                  Math.min(lastPos, startPos + positionsPerLine));
+          if (subSequence.length > 0)
+          {
+            if (!firstChunk)
+            {
+              sb.append(SPACE);
+            }
+            sb.append(subSequence);
+            firstChunk = false;
+          }
+          startPos += subSequence.length;
+        }
          sb.append(newline);
-        startPos += POSITIONS_PER_LINE;
        }
      }
  
@@ -1006,15 +1109,16 @@ public class MegaFile extends AlignFile
     * @throws IOException
     */
    protected void assertInterleaved(boolean isIt, String dataLine)
-          throws IOException
+          throws FileFormatException
    {
      if (this.interleaved != null && isIt != this.interleaved.booleanValue())
      {
-      throw new IOException(
+      throw new FileFormatException(
                "Parse error: mix of interleaved and noninterleaved detected, at line: "
                        + dataLine);
      }
      this.interleaved = new Boolean(isIt);
+    setAlignmentProperty(PROP_INTERLEAVED, interleaved.toString());
    }
  
    public boolean isInterleaved()
@@ -1045,4 +1149,61 @@ public class MegaFile extends AlignFile
                + (nucleotide ? " not" : ""));
      }
    }
+
+  /**
+   * Print the given alignment in MEGA format. If the alignment was created by
+   * parsing a MEGA file, it should have properties set (e.g. Title) which can
+   * influence the output.
+   */
+  @Override
+  public String print(AlignmentI al)
+  {
+    this.nucleotide = al.isNucleotide();
+    String lineLength = (String) al.getProperty(PROP_LINELENGTH);
+    this.positionsPerLine = lineLength == null ? DEFAULT_LINE_LENGTH : Integer
+            .parseInt(lineLength);
+    return printHeaders(al) + print(al.getSequencesArray());
+  }
+
+  /**
+   * Helper method to append a property e.g. !Title to the output buffer, if the
+   * property is set on the alignment.
+   * 
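+   * @param al the alignment whose property is looked up
+   * @param headers the buffer the header line is appended to
+   * @param propertyName the alignment property key, e.g. PROP_TITLE
+   * @param propertyKeyword the MEGA keyword written after '!', e.g. Title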
+   */
+  protected void printProperty(AlignmentI al, StringBuilder headers,
+          String propertyName, String propertyKeyword)
+  {
+    String propertyValue = (String) al.getProperty(propertyName);
+    if (propertyValue != null)
+    {
+      headers.append(BANG).append(propertyKeyword).append(SPACE)
+              .append(propertyValue).append(SEMICOLON)
+              .append(newline);
+    }
+  }
+
+  /**
+   * Returns the number of sequence positions output per line
+   * 
+   * @return the number of sequence positions output per line
+   */
+  public int getPositionsPerLine()
+  {
+    return positionsPerLine;
+  }
+
+  /**
+   * Sets the number of sequence positions output per line. Note these will be
+   * formatted in blocks of 3 (nucleotide) or 10 (peptide).
+   * 
+   * @param p the number of sequence positions to output per line
+   */
+  public void setPositionsPerLine(int p)
+  {
+    this.positionsPerLine = p;
+  }
  }