JAL-1499 parsing !Label statements to AlignmentAnnotation

author gmungoc <g.m.carstairs@dundee.ac.uk>

Wed, 7 Oct 2015 15:47:13 +0000 (16:47 +0100)

committer gmungoc <g.m.carstairs@dundee.ac.uk>

Wed, 7 Oct 2015 15:47:13 +0000 (16:47 +0100)
author gmungoc <g.m.carstairs@dundee.ac.uk>
Wed, 7 Oct 2015 15:47:13 +0000 (16:47 +0100)
committer gmungoc <g.m.carstairs@dundee.ac.uk>
Wed, 7 Oct 2015 15:47:13 +0000 (16:47 +0100)
diff --git a/src/jalview/io/MegaFile.java b/src/jalview/io/MegaFile.java

index eb9868b..3096b60 100644 (file)
--- a/src/jalview/io/MegaFile.java
+++ b/src/jalview/io/MegaFile.java
@@ -18,7 +18,9 @@
   */
  package jalview.io;
  
+import jalview.datamodel.AlignmentAnnotation;
  import jalview.datamodel.AlignmentI;
+import jalview.datamodel.Annotation;
  import jalview.datamodel.Sequence;
  import jalview.datamodel.SequenceFeature;
  import jalview.datamodel.SequenceI;
@@ -53,6 +55,8 @@ import java.util.Set;
   */
  public class MegaFile extends AlignFile
  {
+  private static final char UNDERSCORE = '_';
+
    private static final String WHITESPACE = "\\s+";
  
    private static final int DEFAULT_LINE_LENGTH = 60;
@@ -137,6 +141,8 @@ public class MegaFile extends AlignFile
  
    private static final String SPACE = " ";
  
+  private static final String TAB = "\t";
+
    /*
     * number of sequence positions output per line
     */
@@ -190,6 +196,9 @@ public class MegaFile extends AlignFile
    // map of SequenceFeature's by sequence id
    Map<String, List<SequenceFeature>> sequenceFeatures;
  
+  // one entry per !Label character: an Annotation, or null for an underscore
+  List<Annotation> labelAnnotations;
+
    public MegaFile()
    {
    }
@@ -215,6 +224,7 @@ public class MegaFile extends AlignFile
      geneStart = new HashMap<String, Integer>();
      domainStart = new HashMap<String, Integer>();
      residuesRead = new HashMap<String, Integer>();
+    labelAnnotations = new ArrayList<Annotation>();
  
      /*
       * Read and process MEGA and Title/Format/Description headers if present.
@@ -238,12 +248,14 @@ public class MegaFile extends AlignFile
        dataLine = dataLine.trim();
        if (dataLine.length() > 0)
        {
-        if (dataLine.startsWith(BANG + GENE)
-                || dataLine.startsWith(BANG + DOMAIN))
+        dataLine = dataLine.replace(TAB, SPACE);
+        String upperCased = dataLine.toUpperCase();
+        if (upperCased.startsWith(BANG + GENE.toUpperCase())
+                || upperCased.startsWith(BANG + DOMAIN.toUpperCase()))
          {
            parseGeneOrDomain(dataLine);
          }
-        else if (dataLine.startsWith(BANG + LABEL))
+        else if (upperCased.startsWith(BANG + LABEL.toUpperCase()))
          {
            parseLabel(dataLine);
          }
@@ -271,18 +283,76 @@ public class MegaFile extends AlignFile
      // remember the (longest) line length read in, so we can output the same
      setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
  
-    deriveSequences();
+    deriveSequencesAndFeatures();
+
+    deriveAnnotations();
    }
  
    /**
-   * Parse a !Label
+   * If we parsed !Label statements into a list of Annotation objects, create an
+   * AlignmentAnnotation
+   */
+  protected void deriveAnnotations()
+  {
+    if (this.labelAnnotations.size() > 0)
+    {
+      Annotation[] anns = labelAnnotations
+              .toArray(new Annotation[labelAnnotations.size()]);
+      AlignmentAnnotation aa = new AlignmentAnnotation("MEGA", "Label",
+              anns);
+      this.annotations.add(aa);
+    }
+  }
+
+  /**
+   * Parse a !Label line. This contains a single character per position (column)
+   * of the alignment block above. An underscore character represents no label.
+   * Labels are assembled into an AlignmentAnnotation object.
     * 
     * @param dataLine
+   * @throws FileFormatException
     */
-  protected void parseLabel(String dataLine)
+  protected void parseLabel(String dataLine) throws FileFormatException
    {
-    // TODO Auto-generated method stub
+    // strip off leading !Label and following spaces
+    dataLine = dataLine.substring(LABEL.length() + 1).trim();
+
+    // remove internal spacing; tabs were replaced by spaces before this call
+    String labels = dataLine.replace(SPACE, "");
+    if (labels.endsWith(SEMICOLON))
+    {
+      labels = labels.substring(0, labels.length() - 1);
+    }
+    else
+    {
+      System.err.println("Warning: '" + dataLine
+              + "' should end with semi-colon");
+    }
+    for (char c : labels.toCharArray())
+    {
+      if (c == UNDERSCORE)
+      {
+        this.labelAnnotations.add(null);
+      }
+      else
+      {
+        this.labelAnnotations.add(new Annotation(String.valueOf(c), "",
+                ' ', 0f));
+      }
+    }
  
+    /*
+     * sanity check - the number of labels added should exactly match the
+     * sequence length so far
+     */
+    int sequenceLength = seqData.isEmpty() ? 0 : seqData.values()
+            .iterator().next().length();
+    if (labelAnnotations.size() != sequenceLength)
+    {
+      System.err.println("Warning: file inconsistent - "
+              + labelAnnotations.size() + " labels for " + sequenceLength
+              + " positions after " + dataLine);
+    }
    }
  
    /**
@@ -383,7 +453,7 @@ public class MegaFile extends AlignFile
       * the order of processing below ensures that we correctly capture where a
       * domain is in the context of an enclosing gene
       */
-    processDomainEnd(domain, property);
+    processDomainEnd(domain, gene, property);
  
      processGeneEnd(gene);
  
@@ -435,21 +505,30 @@ public class MegaFile extends AlignFile
  
    /**
     * If we have been processing a domain, and it is not being continued, then
-   * make a sequence feature for the domain just ended
+   * make a sequence feature for the domain just ended. Criteria for the domain
+   * not being continued are either an explicit new domain or gene name, or a
+   * 'Property=domainend' statement
     * 
     * @param domain
+   * @param gene
     * @param property
     * @return true if a feature is created, else false
     */
-  protected boolean processDomainEnd(String domain, String property)
+  protected boolean processDomainEnd(String domain, String gene,
+          String property)
    {
+    boolean newGene = (gene != null && !gene.equals(currentGene));
+
      String verboseDomain = makeVerboseDomainName(domain, property);
+
      if (this.currentDomain != null)
      {
+      boolean newDomain = !this.currentDomain.equals(verboseDomain);
        boolean domainEnded = "domainend".equalsIgnoreCase(property);
-      if (!this.currentDomain.equals(verboseDomain) || domainEnded)
+      if (newDomain || newGene || domainEnded)
        {
          createFeature(DOMAIN, currentDomain, domainStart);
+        currentDomain = null;
          return true;
        }
      }
@@ -473,6 +552,7 @@ public class MegaFile extends AlignFile
      if (this.currentGene != null && !this.currentGene.equals(gene))
      {
        createFeature(GENE, currentGene, geneStart);
+      currentGene = null;
        created = true;
      }
  
@@ -705,7 +785,7 @@ public class MegaFile extends AlignFile
    /**
     * Convert the parsed sequence strings to objects and store them in the model.
     */
-  protected void deriveSequences()
+  protected void deriveSequencesAndFeatures()
    {
      Set<Entry<String, StringBuilder>> datasets = seqData.entrySet();
  
@@ -773,14 +853,15 @@ public class MegaFile extends AlignFile
     * @throws IOException
     */
    protected void parseNoninterleavedDataLine(String dataLine)
-          throws IOException
+          throws FileFormatException
    {
      if (currentSequenceId == null)
      {
        /*
         * Oops. Data but no sequence id context.
         */
-      throw new IOException("No sequence id context at: " + dataLine);
+      throw new FileFormatException("No sequence id context at: "
+              + dataLine);
      }
  
      assertInterleaved(false, dataLine);
diff --git a/test/jalview/io/MegaFileTest.java b/test/jalview/io/MegaFileTest.java

index 881c47b..2b2422f 100644 (file)
--- a/test/jalview/io/MegaFileTest.java
+++ b/test/jalview/io/MegaFileTest.java
@@ -6,6 +6,7 @@ import static org.testng.AssertJUnit.assertNull;
  import static org.testng.AssertJUnit.assertTrue;
  import static org.testng.AssertJUnit.fail;
  
+import jalview.datamodel.AlignmentAnnotation;
  import jalview.datamodel.AlignmentI;
  import jalview.datamodel.Sequence;
  import jalview.datamodel.SequenceFeature;
@@ -502,16 +503,17 @@ public class MegaFileTest
     * @throws IOException
     */
    @Test(groups = { "Functional" })
-  public void testParse_interleavedWithIdentity() throws IOException
+  public void testParse_interleavedWithIdentityAndTabs() throws IOException
    {
      //@formatter:off
+    // uses tab instead of space separators to check robustness
      MegaFile testee = new MegaFile("#MEGA\n"+ 
-    "!TITLE Interleaved sequence data;\n" +
-    "!Format Identical=.;\n\n" +
-    "#U455   ABCDEF\n" + 
-    "#CPZANT  M..P.R\n\n" + 
-    "#U455   KLMNOP\n" + 
-    "#CPZANT ..YZ..", AppletFormatAdapter.PASTE);
+    "!TITLE\tInterleaved sequence data;\n" +
+    "!Format\tIdentical=.;\n\n" +
+    "#U455\tABCDEF\n" + 
+    "#CPZANT\tM..P.R\n\n" + 
+    "#U455\t\tKLMNOP\n" +
+    "#CPZANT\t..YZ..", AppletFormatAdapter.PASTE);
      //@formatter:on
      assertEquals("Title not as expected", "Interleaved sequence data",
              testee.getAlignmentProperty(MegaFile.PROP_TITLE));
@@ -623,7 +625,7 @@ public class MegaFileTest
      "!Domain=Exon1 Gene=Adh Property=Coding CodonStart=1;\n" +
      "#U455   GGGGGG\n" + 
      "#CPZANT AAAAAA\n\n" +
-    "!Domain=Intron1 Property=Intron Gene=Adh;\n" +
+    "!domain=Intron1 Property=Intron Gene=Adh;\n" +
      "#U455   tttttt\n" + 
      "#CPZANT cccccc\n\n" +
      "!Domain=Exon2 Gene=Adh Property=Exon CodonStart=1;\n" +
@@ -698,4 +700,81 @@ public class MegaFileTest
      assertEquals(begin, sf.begin);
      assertEquals(end, sf.end);
    }
+
+  //@formatter:on
+  
+  /**
+   * Test parse of data including !Label statements. An underscore means no
+   * label, other characters are treated as alignment annotation.
+   * 
+   * @throws IOException
+   */
+  @Test(groups = { "Functional" })
+  public void testParse_withLabels() throws IOException
+  {
+    //@formatter:off
+    MegaFile testee = new MegaFile("#MEGA\n"+ 
+    "TITLE: Interleaved sequence data\n\n" + 
+    "#U455   ABC DEF\n" + 
+    "#CPZANT MNO PQR\n" +
+    "!Label  +-_ 23_\n" +
+    "#U455   KLM NOP\n" + 
+    "#CPZANT WXY ZGC\n" +
+    "!label  __3 +X_\n", AppletFormatAdapter.PASTE);
+    //@formatter:on
+    Vector<SequenceI> seqs = testee.getSeqs();
+    assertEquals("Expected two sequences", 2, seqs.size());
+    assertEquals("First sequence data wrong", "ABCDEFKLMNOP", seqs.get(0)
+            .getSequenceAsString());
+    assertEquals("Second sequence data wrong", "MNOPQRWXYZGC", seqs.get(1)
+            .getSequenceAsString());
+
+    // check AlignmentAnnotation added with expected values
+    assertEquals(1, testee.annotations.size());
+    AlignmentAnnotation aa = testee.annotations.get(0);
+    assertNull(aa.sequenceRef);
+    assertEquals(12, aa.annotations.length);
+    assertEquals("+, -, , 2, 3, , , , 3, +, X, , ", aa.toString());
+  }
+
+  //@formatter:on
+  
+  /**
+   * Test case where a domain is implicitly terminated by starting a new gene
+   * 
+   * @throws IOException
+   */
+  @Test(groups = { "Functional" })
+  public void testParse_changeOfGeneEndsDomain() throws IOException
+  {
+    //@formatter:off
+    // '!Gene=gene2' names no domain, so it should implicitly end domain Exon1
+    MegaFile testee = new MegaFile("#MEGA\n"+ 
+    "!TITLE Interleaved sequence data;\n" +
+    "!Format Identical=.;\n\n" +
+    "!Gene=gene1 Domain=Exon1 Property=Coding;\n" +
+    "#U455 ABCDEF\n" + 
+    "#CPZANT M..P.R\n\n" + 
+    "!Gene=gene2;\n" +
+    "#U455 KLMNOP\n" +
+    "#CPZANT ..YZ..", AppletFormatAdapter.PASTE);
+    //@formatter:on
+    Vector<SequenceI> seqs = testee.getSeqs();
+    assertEquals("Expected two sequences", 2, seqs.size());
+    assertEquals("First sequence data wrong", "ABCDEFKLMNOP", seqs.get(0)
+            .getSequenceAsString());
+    assertEquals("Second sequence data wrong", "MBCPERKLYZOP", seqs.get(1)
+            .getSequenceAsString());
+    assertTrue("File format is not flagged as interleaved",
+            testee.isInterleaved());
+
+    for (SequenceI seq : seqs)
+    {
+      SequenceFeature[] sfs = seq.getSequenceFeatures();
+      assertEquals(3, sfs.length);
+      verifySequenceFeature(sfs[0], "Exon1 (gene1 Coding)", "Domain", 1, 6);
+      verifySequenceFeature(sfs[1], "gene1", "Gene", 1, 6);
+      verifySequenceFeature(sfs[2], "gene2", "Gene", 7, 12);
+    }
+  }
  }
author	gmungoc <g.m.carstairs@dundee.ac.uk>
	Wed, 7 Oct 2015 15:47:13 +0000 (16:47 +0100)
committer	gmungoc <g.m.carstairs@dundee.ac.uk>
	Wed, 7 Oct 2015 15:47:13 +0000 (16:47 +0100)
src/jalview/io/MegaFile.java		patch \| blob \| history
test/jalview/io/MegaFileTest.java		patch \| blob \| history