JAL-1499 parsing !Label statements to AlignmentAnnotation

[jalview.git] / src / jalview / io / MegaFile.java
diff --git a/src/jalview/io/MegaFile.java b/src/jalview/io/MegaFile.java

index eb9868b..3096b60 100644 (file)
--- a/src/jalview/io/MegaFile.java
+++ b/src/jalview/io/MegaFile.java
@@ -18,7 +18,9 @@
   */
  package jalview.io;
  
+import jalview.datamodel.AlignmentAnnotation;
  import jalview.datamodel.AlignmentI;
+import jalview.datamodel.Annotation;
  import jalview.datamodel.Sequence;
  import jalview.datamodel.SequenceFeature;
  import jalview.datamodel.SequenceI;
@@ -53,6 +55,8 @@ import java.util.Set;
   */
  public class MegaFile extends AlignFile
  {
+  private static final char UNDERSCORE = '_';
+
    private static final String WHITESPACE = "\\s+";
  
    private static final int DEFAULT_LINE_LENGTH = 60;
@@ -137,6 +141,8 @@ public class MegaFile extends AlignFile
  
    private static final String SPACE = " ";
  
+  private static final String TAB = "\t";
+
    /*
     * number of sequence positions output per line
     */
@@ -190,6 +196,9 @@ public class MegaFile extends AlignFile
    // map of SequenceFeature's by sequence id
    Map<String, List<SequenceFeature>> sequenceFeatures;
  
+  // one entry per !Label line character; an underscore is held as null
+  List<Annotation> labelAnnotations;
+
    public MegaFile()
    {
    }
@@ -215,6 +224,7 @@ public class MegaFile extends AlignFile
      geneStart = new HashMap<String, Integer>();
      domainStart = new HashMap<String, Integer>();
      residuesRead = new HashMap<String, Integer>();
+    labelAnnotations = new ArrayList<Annotation>();
  
      /*
       * Read and process MEGA and Title/Format/Description headers if present.
@@ -238,12 +248,14 @@ public class MegaFile extends AlignFile
        dataLine = dataLine.trim();
        if (dataLine.length() > 0)
        {
-        if (dataLine.startsWith(BANG + GENE)
-                || dataLine.startsWith(BANG + DOMAIN))
+        dataLine = dataLine.replace(TAB, SPACE);
+        String upperCased = dataLine.toUpperCase();
+        if (upperCased.startsWith(BANG + GENE.toUpperCase())
+                || upperCased.startsWith(BANG + DOMAIN.toUpperCase()))
          {
            parseGeneOrDomain(dataLine);
          }
-        else if (dataLine.startsWith(BANG + LABEL))
+        else if (upperCased.startsWith(BANG + LABEL.toUpperCase()))
          {
            parseLabel(dataLine);
          }
@@ -271,18 +283,76 @@ public class MegaFile extends AlignFile
      // remember the (longest) line length read in, so we can output the same
      setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
  
-    deriveSequences();
+    deriveSequencesAndFeatures();
+
+    deriveAnnotations();
    }
  
    /**
-   * Parse a !Label
+   * If we parsed !Label statements into a list of Annotation objects, create an
+   * AlignmentAnnotation
+   */
+  protected void deriveAnnotations()
+  {
+    if (labelAnnotations.isEmpty())
+    {
+      return; // no !Label characters were collected, nothing to add
+    }
+    // copy the collected labels (null entries included) into an array
+    Annotation[] labels = labelAnnotations
+            .toArray(new Annotation[labelAnnotations.size()]);
+    this.annotations.add(new AlignmentAnnotation("MEGA", "Label", labels));
+  }
+
+  /**
+   * Parse a !Label line. This contains a single character per position (column)
+   * of the alignment block above. An underscore character represents no label.
+   * Labels are assembled into an AlignmentAnnotation object.
     * 
     * @param dataLine
+   * @throws FileFormatException
     */
-  protected void parseLabel(String dataLine)
+  protected void parseLabel(String dataLine) throws FileFormatException
   {
-    // TODO Auto-generated method stub
+    // drop the leading marker + LABEL keyword, then trim what remains
+    dataLine = dataLine.substring(LABEL.length() + 1).trim();
+
+    // remove all space characters so only label characters (and ';') remain
+    String labels = dataLine.replace(SPACE, "");
+    if (labels.endsWith(SEMICOLON))
+    {
+      labels = labels.substring(0, labels.length() - 1);
+    }
+    else
+    {
+      System.err.println("Warning: '" + dataLine
+              + "' should end with semi-colon");
+    }
+    for (char c : labels.toCharArray())
+    {
+      if (c == UNDERSCORE)
+      {
+        this.labelAnnotations.add(null); // no label at this column
+      }
+      else
+      {
+        this.labelAnnotations.add(new Annotation(String.valueOf(c), "",
+                ' ', 0f));
+      }
+    }
  
+    /*
+     * sanity check - total labels so far should equal the length so far of
+     * the first sequence in seqData (0 if none); a mismatch only warns
+     */
+    int sequenceLength = seqData.isEmpty() ? 0 : seqData.values()
+            .iterator().next().length();
+    if (labelAnnotations.size() != sequenceLength)
+    {
+      System.err.println("Warning: file inconsistent - "
+              + labelAnnotations.size() + " labels for " + sequenceLength
+              + " positions after " + dataLine);
+    }
   }
  
    /**
@@ -383,7 +453,7 @@ public class MegaFile extends AlignFile
       * the order of processing below ensures that we correctly capture where a
       * domain is in the context of an enclosing gene
       */
-    processDomainEnd(domain, property);
+    processDomainEnd(domain, gene, property);
  
      processGeneEnd(gene);
  
@@ -435,21 +505,30 @@ public class MegaFile extends AlignFile
  
    /**
     * If we have been processing a domain, and it is not being continued, then
-   * make a sequence feature for the domain just ended
+   * make a sequence feature for the domain just ended. Criteria for the domain
+   * not being continued are either an explicit new domain or gene name, or a
+   * 'Property=domainend' statement
     * 
     * @param domain
+   * @param gene
     * @param property
     * @return true if a feature is created, else false
     */
-  protected boolean processDomainEnd(String domain, String property)
+  protected boolean processDomainEnd(String domain, String gene,
+          String property)
    {
+    boolean newGene = (gene != null && !gene.equals(currentGene));
+
      String verboseDomain = makeVerboseDomainName(domain, property);
+
      if (this.currentDomain != null)
      {
+      boolean newDomain = !this.currentDomain.equals(verboseDomain);
        boolean domainEnded = "domainend".equalsIgnoreCase(property);
-      if (!this.currentDomain.equals(verboseDomain) || domainEnded)
+      if (newDomain || newGene || domainEnded)
        {
          createFeature(DOMAIN, currentDomain, domainStart);
+        currentDomain = null;
          return true;
        }
      }
@@ -473,6 +552,7 @@ public class MegaFile extends AlignFile
      if (this.currentGene != null && !this.currentGene.equals(gene))
      {
        createFeature(GENE, currentGene, geneStart);
+      currentGene = null;
        created = true;
      }
  
@@ -705,7 +785,7 @@ public class MegaFile extends AlignFile
    /**
     * Convert the parsed sequence strings to objects and store them in the model.
     */
-  protected void deriveSequences()
+  protected void deriveSequencesAndFeatures()
    {
      Set<Entry<String, StringBuilder>> datasets = seqData.entrySet();
  
@@ -773,14 +853,15 @@ public class MegaFile extends AlignFile
     * @throws IOException
     */
    protected void parseNoninterleavedDataLine(String dataLine)
-          throws IOException
+          throws FileFormatException
    {
      if (currentSequenceId == null)
      {
        /*
         * Oops. Data but no sequence id context.
         */
-      throw new IOException("No sequence id context at: " + dataLine);
+      throw new FileFormatException("No sequence id context at: "
+              + dataLine);
      }
  
      assertInterleaved(false, dataLine);