JAL-1499 output !Label lines for "MEGA Label" alignment annotation

author gmungoc <g.m.carstairs@dundee.ac.uk>

Fri, 9 Oct 2015 09:15:14 +0000 (10:15 +0100)

committer gmungoc <g.m.carstairs@dundee.ac.uk>

Fri, 9 Oct 2015 09:15:14 +0000 (10:15 +0100)
author gmungoc <g.m.carstairs@dundee.ac.uk>
Fri, 9 Oct 2015 09:15:14 +0000 (10:15 +0100)
committer gmungoc <g.m.carstairs@dundee.ac.uk>
Fri, 9 Oct 2015 09:15:14 +0000 (10:15 +0100)
diff --git a/src/jalview/io/MegaFile.java b/src/jalview/io/MegaFile.java

index fcb7a93..69e7435 100644 (file)
--- a/src/jalview/io/MegaFile.java
+++ b/src/jalview/io/MegaFile.java
@@ -35,6 +35,7 @@ import java.util.List;
  import java.util.Map;
  import java.util.Map.Entry;
  import java.util.Set;
+import java.util.Vector;
  
  /**
   * A parser for input or output of MEGA format files. <br>
@@ -56,6 +57,8 @@ import java.util.Set;
   */
  public class MegaFile extends AlignFile
  {
+  private static final String MEGA_ANNOTATION_LABEL = "MEGA Label";
+
    private static final char UNDERSCORE = '_';
  
    private static final String WHITESPACE = "\\s+";
@@ -246,6 +249,7 @@ public class MegaFile extends AlignFile
       */
      currentSequenceId = "";
  
+    boolean annotationAdded = false;
      while (dataLine != null)
      {
        dataLine = dataLine.trim();
@@ -260,7 +264,7 @@ public class MegaFile extends AlignFile
          }
          else if (upperCased.startsWith(BANG + LABEL.toUpperCase()))
          {
-          parseLabel(dataLine);
+          annotationAdded |= parseLabel(dataLine);
          }
          else
          {
@@ -288,7 +292,10 @@ public class MegaFile extends AlignFile
  
      deriveSequencesAndFeatures();
  
-    deriveAnnotations();
+    if (annotationAdded)
+    {
+      deriveAnnotations();
+    }
    }
  
    /**
@@ -301,7 +308,7 @@ public class MegaFile extends AlignFile
      {
        Annotation[] anns = labelAnnotations
                .toArray(new Annotation[labelAnnotations.size()]);
-      AlignmentAnnotation aa = new AlignmentAnnotation("MEGA", "Label",
+      AlignmentAnnotation aa = new AlignmentAnnotation(MEGA_ANNOTATION_LABEL, "",
                anns);
        this.annotations.add(aa);
      }
@@ -313,9 +320,10 @@ public class MegaFile extends AlignFile
     * Labels are assembled into an AlignmentAnnotation object.
     * 
     * @param dataLine
+   * @return true if any non-null annotation was created
     * @throws FileFormatException
     */
-  protected void parseLabel(String dataLine) throws FileFormatException
+  protected boolean parseLabel(String dataLine) throws FileFormatException
    {
      // strip off leading !Label and following spaces
      dataLine = dataLine.substring(LABEL.length() + 1).trim();
@@ -331,6 +339,7 @@ public class MegaFile extends AlignFile
        System.err.println("Warning: '" + dataLine
                + "' should end with semi-colon");
      }
+    boolean added = false;
      for (char c : labels.toCharArray())
      {
        if (c == UNDERSCORE)
@@ -341,21 +350,10 @@ public class MegaFile extends AlignFile
        {
          this.labelAnnotations.add(new Annotation(String.valueOf(c), "",
                  ' ', 0f));
+        added = true;
        }
      }
-
-    /*
-     * sanity check - the number of labels added should exactly match the
-     * sequence length so far
-     */
-    int sequenceLength = seqData.isEmpty() ? 0 : seqData.values()
-            .iterator().next().length();
-    if (labelAnnotations.size() != sequenceLength)
-    {
-      System.err.println("Warning: file inconsistent - "
-              + labelAnnotations.size() + " labels for " + sequenceLength
-              + " positions after " + dataLine);
-    }
+    return added;
    }
  
    /**
@@ -364,11 +362,19 @@ public class MegaFile extends AlignFile
    protected void endOfDataBlock()
    {
      this.firstDataBlockRead = true;
-    // TODO:
-    // (initialise and) populate arrays of sequence length so far (excluding
-    // gaps)
-    // On change or end of a denoted Gene or Domain, add sequence features for
-    // it
+
+    /*
+     * pad with null annotations up to the length of the first sequence held in
+     * seqData (in case some blocks have !Label lines and some don't)
+     */
+
+    int sequenceLength = seqData.isEmpty() ? 0 : seqData.values()
+            .iterator().next().length();
+    int annotationsToAdd = sequenceLength - labelAnnotations.size();
+    for (int i = 0; i < annotationsToAdd; i++)
+    {
+      labelAnnotations.add(null);
+    }
    }
  
    /**
@@ -1419,6 +1425,7 @@ public class MegaFile extends AlignFile
          sb.append(newline);
          first = false;
        }
+      sb.append(printLabel(from, advancedBy, maxIdLength));
        from += advancedBy;
      }
  
@@ -1553,40 +1560,112 @@ public class MegaFile extends AlignFile
       */
      StringBuilder sb = new StringBuilder(numLines * positionsPerLine);
  
+    for (SequenceI seq : s)
+    {
+      printSequence(sb, seq);
+    }
+
+    return new String(sb);
+  }
+
+  /**
+   * Append one complete sequence to the buffer: a blank line, its #name line,
+   * then rows of spaced blocks, each row ending with a position comment
+   * @param sb the string buffer to append to
+   * @param seq the sequence to output
+   */
+  protected void printSequence(StringBuilder sb, SequenceI seq)
+  {
    int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
+    // round down to output a whole number of spaced blocks
    int chunksPerLine = positionsPerLine / spaceEvery;
-    for (SequenceI seq : s)
+
+    sb.append(newline);
+    sb.append(HASHSIGN + seq.getName()).append(newline);
+    int startPos = 0;
+    while (startPos < seq.getLength())
      {
-      sb.append(newline);
-      sb.append(HASHSIGN + seq.getName()).append(newline);
-      int startPos = 0;
-      while (startPos < seq.getLength())
+      /*
+       * print next row: at most chunksPerLine blocks of spaceEvery positions
+       */
+      boolean firstChunk = true;
+      int lastPos = startPos + positionsPerLine; // exclusive
+      for (int j = 0; j < chunksPerLine; j++)
        {
-        boolean firstChunk = true;
-        /*
-         * print next line for this sequence
-         */
-        int lastPos = startPos + positionsPerLine; // exclusive
-        for (int j = 0; j < chunksPerLine; j++)
+        char[] subSequence = seq.getSequence(startPos,
+                Math.min(lastPos, startPos + spaceEvery));
+        if (subSequence.length > 0)
          {
-          char[] subSequence = seq.getSequence(startPos,
-                  Math.min(lastPos, startPos + positionsPerLine));
-          if (subSequence.length > 0)
+          if (!firstChunk)
            {
-            if (!firstChunk)
-            {
-              sb.append(SPACE);
-            }
-            sb.append(subSequence);
-            firstChunk = false;
+            sb.append(SPACE);
            }
-          startPos += subSequence.length;
+          sb.append(subSequence);
+          firstChunk = false;
          }
-        sb.append(newline);
+        startPos += subSequence.length;
        }
+      // positions output so far (= base 1 position of line end) as a comment
+      sb.append(SPACE).append(COMMENT_START).append(startPos)
+              .append(COMMENT_END);
+      sb.append(newline);
      }
+  }
  
-    return new String(sb);
+  /**
+   * Returns a formatted string like <br>
+   * !Label aa_b_ ab_b_; <br>
+   * (followed by a newline) where underscore represents no annotation, and any
+   * other character is a MEGA label character. Returns an empty string if the
+   * first annotation row is not "MEGA Label", or shows no label in the range.
+   * NOTE(review): assumes displayCharacter is never null or empty - confirm
+   * 
+   * @param fromPos
+   *          start column of the alignment (base 0)
+   * @param positions
+   *          number of positions to output
+   * @param labelWidth
+   *          width to which the !Label keyword is left-justified and padded
+   * @return the formatted !Label line, or an empty string if nothing to output
+   */
+  protected String printLabel(int fromPos, int positions, int labelWidth)
+  {
+    int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
+    String none = "";
+    if (annotations == null || annotations.isEmpty()
+            || !MEGA_ANNOTATION_LABEL.equals(annotations.get(0).label))
+    {
+      return none;
+    }
+
+    StringBuilder sb = new StringBuilder(positions + 20);
+    sb.append(String.format("%-" + labelWidth + "s ", BANG + LABEL));
+    Annotation[] anns = annotations.get(0).annotations;
+    int blockCharCount = 0;
+    boolean annotationFound = false;
+
+    for (int i = fromPos; i < fromPos + positions; i++)
+    {
+      String label = String.valueOf(UNDERSCORE);
+      if (i < anns.length && anns[i] != null)
+      {
+        label = anns[i].displayCharacter;
+      }
+      sb.append(label);
+      if (label.charAt(0) != UNDERSCORE)
+      {
+        annotationFound = true;
+      }
+      // add a space after each block of spaceEvery positions except the last
+      if (++blockCharCount % spaceEvery == 0
+              && (i < fromPos + positions - 1))
+      {
+        sb.append(SPACE);
+      }
+    }
+    sb.append(SEMICOLON).append(newline);
+
+    return annotationFound ? sb.toString() : none;
    }
  
    /**
@@ -1643,13 +1722,31 @@ public class MegaFile extends AlignFile
    /**
     * Print the given alignment in MEGA format. If the alignment was created by
     * parsing a MEGA file, it should have properties set (e.g. Title) which can
-   * influence the output.
+   * surface in the output.
     */
    @Override
    public String print(AlignmentI al)
    {
      this.nucleotide = al.isNucleotide();
  
+    /*
+     * if the alignment has a "MEGA Label" annotation, we'll output its values
+     * as !Label statements; MEGA only supports one of these
+     */
+    AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
+    if (anns != null)
+    {
+      for (AlignmentAnnotation ann : anns)
+      {
+        if (MEGA_ANNOTATION_LABEL.equals(ann.label))
+        {
+          this.annotations = new Vector<AlignmentAnnotation>();
+          annotations.add(ann);
+          break;
+        }
+      }
+    }
+
      String lineLength = (String) al.getProperty(PROP_LINELENGTH);
      this.positionsPerLine = lineLength == null ? DEFAULT_LINE_LENGTH : Integer
              .parseInt(lineLength);
diff --git a/test/jalview/io/MegaFileTest.java b/test/jalview/io/MegaFileTest.java

index f7d83c0..bdae11a 100644 (file)
--- a/test/jalview/io/MegaFileTest.java
+++ b/test/jalview/io/MegaFileTest.java
@@ -279,9 +279,9 @@ public class MegaFileTest
      System.out.println(printed);
      // normally output should match input
      // we cheated here with a number of short input lines
-    String expected = "#MEGA\n\n"
- + "#U455\n" + "ABCFEDHIJM\nNOPQR\n\n"
-            + "#CPZANT\n" + "KLMNOPWXYZ\nCGATC\n";
+    String expected = "#MEGA\n\n" + "#U455\n"
+            + "ABCFEDHIJM [10]\nNOPQR [15]\n\n" + "#CPZANT\n"
+            + "KLMNOPWXYZ [10]\nCGATC [15]\n";
      assertEquals("Print format wrong", expected, printed);
    }
  
@@ -304,7 +304,6 @@ public class MegaFileTest
      String printed = testee.print();
      System.out.println(printed);
      //@formatter:off
-    //0123456789klmnopqrstABCDEFGHIJ9876543210abcdefghij
      String expected = 
              "#MEGA\n\n" + 
              "#U455   0123456789 klmnopqrst [20]\n" + // first 20
@@ -333,10 +332,14 @@ public class MegaFileTest
      assertEquals(30, testee.getPositionsPerLine());
      testee.setPositionsPerLine(25);
      String printed = testee.print();
-    // 60 character sequence should be output as 50 on first line then 10 more
+
+    /*
+     * 25 positions per line is rounded down to 20 (two blocks of 10)
+     */
      String expected = "#MEGA\n\n" + "#SIXTY\n"
-            + "0123456789klmnopqrstABCDE\n" + "FGHIJ9876543210abcdefghij\n"
-            + "9993332221\n";
+            + "0123456789 klmnopqrst [20]\n"
+            + "ABCDEFGHIJ 9876543210 [40]\n"
+            + "abcdefghij 9993332221 [60]\n";
      assertEquals("Print format wrong", expected, printed);
    }
  
@@ -718,24 +721,31 @@ public class MegaFileTest
      "TITLE: Interleaved sequence data\n\n" + 
      "#U455   ABC DEF\n" + 
      "#CPZANT MNO PQR\n" +
-    "!Label  +-_ 23_\n" +
+    "!Label  +-_ 23_\n\n" +
+    // a row with no labels = null annotation
+    "#U455   abc def\n" + 
+    "#CPZANT mno pqr\n\n" +
      "#U455   KLM NOP\n" + 
      "#CPZANT WXY ZGC\n" +
      "!label  __3 +X_\n", AppletFormatAdapter.PASTE);
      //@formatter:on
      Vector<SequenceI> seqs = testee.getSeqs();
      assertEquals("Expected two sequences", 2, seqs.size());
-    assertEquals("First sequence data wrong", "ABCDEFKLMNOP", seqs.get(0)
+    assertEquals("First sequence data wrong", "ABCDEFabcdefKLMNOP", seqs
+            .get(0)
              .getSequenceAsString());
-    assertEquals("Second sequence data wrong", "MNOPQRWXYZGC", seqs.get(1)
+    assertEquals("Second sequence data wrong", "MNOPQRmnopqrWXYZGC", seqs
+            .get(1)
              .getSequenceAsString());
  
      // check AlignmentAnnotation added with expected values
      assertEquals(1, testee.annotations.size());
      AlignmentAnnotation aa = testee.annotations.get(0);
      assertNull(aa.sequenceRef);
-    assertEquals(12, aa.annotations.length);
-    assertEquals("+, -, , 2, 3, , , , 3, +, X, , ", aa.toString());
+    assertEquals("MEGA Label", aa.label);
+    assertEquals(18, aa.annotations.length);
+    assertEquals("+, -, , 2, 3, , , , , , , , , , 3, +, X, , ",
+            aa.toString());
    }
  
    //@formatter:on
@@ -809,4 +819,50 @@ public class MegaFileTest
              .getSequenceAsString());
      assertEquals('-', al.getGapCharacter());
    }
+
+  /**
+   * Test reading a MEGA file to an alignment then writing it out in MEGA
+   * format. Includes !Label statements which should be converted to
+   * AlignmentAnnotation and back again.
+   * 
+   * @throws IOException
+   */
+  @Test(groups = "Functional")
+  public void testRoundTrip_withLabels() throws IOException
+  {
+    AppletFormatAdapter fa = new AppletFormatAdapter();
+
+    //@formatter:off
+    String data = "#MEGA\n"
+    + "#U455   C-- GTA\n" 
+    + "#CPZANT ATC -G-\n"
+    + "!Label F__E_H\n\n"
+    + "#U455   CGA --T\n" 
+    + "#CPZANT CA- -GC\n"
+    + "!Label FFH__E\n";
+    AlignmentI al = fa.readFile(data,
+            AppletFormatAdapter.PASTE, "MEGA");
+    AlignmentAnnotation aa = al.getAlignmentAnnotation()[0];
+    assertEquals("MEGA Label", aa.label);
+    assertEquals("F, , , E, , H, F, F, H, , , E, ",
+            aa.toString());
+
+    MegaFile output = new MegaFile();
+    String formatted = output.print(al);
+    String expected = 
+        "#MEGA\n" +
+        "!Format\n" +
+        "    DataType=Nucleotide CodeTable=Standard\n" +
+        "    NSeqs=2 NSites=12\n" +
+        "    Indel=-;\n\n" +
+        "#U455   C-- GTA [6]\n" +
+        "#CPZANT ATC -G- [6]\n" +
+        "!Label F__ E_H;\n\n" +  
+        "#U455   CGA --T [12]\n" +
+        "#CPZANT CA- -GC [12]\n" +
+        "!Label FFH __E;\n";
+    //@formatter:on
+    assertEquals("Roundtrip didn't match", expected,
+            formatted);
+  }
  }
author	gmungoc <g.m.carstairs@dundee.ac.uk>
	Fri, 9 Oct 2015 09:15:14 +0000 (10:15 +0100)
committer	gmungoc <g.m.carstairs@dundee.ac.uk>
	Fri, 9 Oct 2015 09:15:14 +0000 (10:15 +0100)
src/jalview/io/MegaFile.java		patch \| blob \| history
test/jalview/io/MegaFileTest.java		patch \| blob \| history