JAL-1499 Gene and Domain parsed to AlignmentAnnotation (currently as

author gmungoc <g.m.carstairs@dundee.ac.uk>

Fri, 9 Oct 2015 14:54:38 +0000 (15:54 +0100)

committer gmungoc <g.m.carstairs@dundee.ac.uk>

Fri, 9 Oct 2015 14:54:38 +0000 (15:54 +0100)
author gmungoc <g.m.carstairs@dundee.ac.uk>
Fri, 9 Oct 2015 14:54:38 +0000 (15:54 +0100)
committer gmungoc <g.m.carstairs@dundee.ac.uk>
Fri, 9 Oct 2015 14:54:38 +0000 (15:54 +0100)
diff --git a/src/jalview/io/MegaFile.java b/src/jalview/io/MegaFile.java

index 69e7435..b9cc546 100644 (file)
--- a/src/jalview/io/MegaFile.java
+++ b/src/jalview/io/MegaFile.java
@@ -24,8 +24,10 @@ import jalview.datamodel.Annotation;
  import jalview.datamodel.Sequence;
  import jalview.datamodel.SequenceFeature;
  import jalview.datamodel.SequenceI;
+import jalview.schemes.UserColourScheme;
  import jalview.util.Comparison;
  
+import java.awt.Color;
  import java.io.IOException;
  import java.util.ArrayList;
  import java.util.HashMap;
@@ -49,15 +51,20 @@ import java.util.Vector;
   * http://www.megasoftware.net/manual.pdf <br>
   * Limitations:
   * <ul>
- * <li>nested comments (marked by [ ]) are accepted but not preserved</li>
- * <li>to be completed</li>
+ * <li>any comments (delimited by [ ]) are ignored and not preserved</li>
   * </ul>
   * 
   * @see http://www.megasoftware.net/
   */
  public class MegaFile extends AlignFile
  {
-  private static final String MEGA_ANNOTATION_LABEL = "MEGA Label";
+  private static final String MEGA = "MEGA";
+
+  private static final String MEGA_ANNOTATION_LABEL = MEGA + " Label";
+
+  private static final String MEGA_ANNOTATION_GENE = MEGA + " Gene";
+
+  private static final String MEGA_ANNOTATION_DOMAIN = MEGA + " Domain";
  
    private static final char UNDERSCORE = '_';
  
@@ -97,7 +104,7 @@ public class MegaFile extends AlignFile
  
    private static final String EQUALS = "=";
  
-  private static final String MEGA_ID = HASHSIGN + "MEGA";
+  private static final String MEGA_ID = HASHSIGN + MEGA;
  
    private static final String TITLE = "Title";
  
@@ -113,6 +120,8 @@ public class MegaFile extends AlignFile
  
    private static final String CODONSTART = "CodonStart";
  
+  private static final String DOMAINEND = "domainend";
+
    private static final String LABEL = "Label";
  
    /*
@@ -190,21 +199,33 @@ public class MegaFile extends AlignFile
    // current Gene if any we are parsing
    private String currentGene;
  
-  // start residue (base 1) per sequence of current gene
-  Map<String, Integer> geneStart;
+  // start position in alignment (base 0) of current Gene
+  private int currentGeneStartCol;
+
+  // start residue (base 1) per sequence of current Gene
+  private Map<String, Integer> geneStart;
  
    // current Domain if any we are parsing
    private String currentDomain;
  
-  // start residue (base 1) per sequence of current domain
-  Map<String, Integer> domainStart;
+  // start position in alignment (base 0) of current Domain
+  private int currentDomainStartCol;
+
+  // start residue (base 1) per sequence of current Domain
+  private Map<String, Integer> domainStart;
  
    // map of SequenceFeature's by sequence id
-  Map<String, List<SequenceFeature>> sequenceFeatures;
+  private Map<String, List<SequenceFeature>> sequenceFeatures;
  
    // each !Label line character becomes an Annotation (except underscores)
    List<Annotation> labelAnnotations;
  
+  // records any declared Gene positions (including null values)
+  List<Annotation> geneAnnotations;
+
+  // records any declared Domain positions (including null values)
+  List<Annotation> domainAnnotations;
+
    public MegaFile()
    {
    }
@@ -231,6 +252,10 @@ public class MegaFile extends AlignFile
      domainStart = new HashMap<String, Integer>();
      residuesRead = new HashMap<String, Integer>();
      labelAnnotations = new ArrayList<Annotation>();
+    geneAnnotations = new ArrayList<Annotation>();
+    domainAnnotations = new ArrayList<Annotation>();
+    currentDomainStartCol = -1;
+    currentGeneStartCol = -1;
  
      /*
       * Read and process MEGA and Title/Format/Description headers if present.
@@ -249,7 +274,6 @@ public class MegaFile extends AlignFile
       */
      currentSequenceId = "";
  
-    boolean annotationAdded = false;
      while (dataLine != null)
      {
        dataLine = dataLine.trim();
@@ -264,7 +288,7 @@ public class MegaFile extends AlignFile
          }
          else if (upperCased.startsWith(BANG + LABEL.toUpperCase()))
          {
-          annotationAdded |= parseLabel(dataLine);
+          parseLabel(dataLine);
          }
          else
          {
@@ -287,33 +311,60 @@ public class MegaFile extends AlignFile
      createFeature(GENE, currentGene, geneStart);
      createFeature(DOMAIN, currentDomain, domainStart);
  
+    extendAnnotation(geneAnnotations, currentGene, currentGeneStartCol);
+    extendAnnotation(domainAnnotations, currentDomain,
+            currentDomainStartCol);
+
      // remember the (longest) line length read in, so we can output the same
      setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
  
      deriveSequencesAndFeatures();
  
-    if (annotationAdded)
-    {
-      deriveAnnotations();
-    }
+    deriveAnnotations();
    }
  
    /**
-   * If we parsed !Label statements into a list of Annotation objects, create an
-   * AlignmentAnnotation
+   * Create AlignmentAnnotation for Label, Gene and Domain (provided at least
+   * one non-null annotation is present)
     */
    protected void deriveAnnotations()
    {
-    if (this.labelAnnotations.size() > 0)
+    deriveAnnotation(this.labelAnnotations, MEGA_ANNOTATION_LABEL);
+    deriveAnnotation(this.geneAnnotations, MEGA_ANNOTATION_GENE);
+    deriveAnnotation(this.domainAnnotations, MEGA_ANNOTATION_DOMAIN);
+  }
+
+  /**
+   * Create and add an AlignmentAnnotation (provided at least one non-null
+   * annotation is present)
+   * 
+   * @param anns
+   * @param label
+   */
+  protected void deriveAnnotation(List<Annotation> anns, String label)
+  {
+    if (anns.size() > 0 && hasNonNullEntry(anns))
      {
-      Annotation[] anns = labelAnnotations
-              .toArray(new Annotation[labelAnnotations.size()]);
-      AlignmentAnnotation aa = new AlignmentAnnotation(MEGA_ANNOTATION_LABEL, "",
-              anns);
+      Annotation[] annotationArray = anns.toArray(new Annotation[anns
+              .size()]);
+      AlignmentAnnotation aa = new AlignmentAnnotation(label, "",
+              annotationArray);
        this.annotations.add(aa);
      }
    }
  
+  protected static boolean hasNonNullEntry(List<? extends Object> l)
+  {
+    for (Object o : l)
+    {
+      if (o != null)
+      {
+        return true;
+      }
+    }
+    return false;
+  }
+
    /**
     * Parse a !Label line. This contains a single character per position (column)
     * of the alignment block above. An underscore character represents no label.
@@ -363,17 +414,36 @@ public class MegaFile extends AlignFile
    {
      this.firstDataBlockRead = true;
  
-    /*
-     * append null annotations to keep the annotations the same length as the
-     * sequences (in case some blocks have !Label lines and some don't)
-     */
+    padAnnotations(labelAnnotations);
+  }
  
-    int sequenceLength = seqData.isEmpty() ? 0 : seqData.values()
-            .iterator().next().length();
-    int annotationsToAdd = sequenceLength - labelAnnotations.size();
+  /**
+   * Append null annotations to keep the annotations list the same length as the
+   * sequences. This ensures that when the list is converted to an array it is
+   * correctly aligned with the alignment columns. It is needed when there are
+   * gaps in declared 'annotations' in a MEGA file, such as lines with no !Label
+   * statement, or regions between marked genes or domains.
+   * 
+   * @param anns
+   */
+  protected void padAnnotations(List<Annotation> anns)
+  {
+    addNullAnnotations(anns, getAlignmentWidth());
+  }
+
+  /**
+   * Append null annotations for positions up to (and excluding) the given end
+   * column (base 0)
+   * 
+   * @param anns
+   * @param upTo
+   */
+  protected void addNullAnnotations(List<Annotation> anns, int upTo)
+  {
+    int annotationsToAdd = upTo - anns.size();
      for (int i = 0; i < annotationsToAdd; i++)
      {
-      labelAnnotations.add(null);
+      anns.add(null);
      }
    }
  
@@ -459,8 +529,8 @@ public class MegaFile extends AlignFile
            String property, String codonStart)
    {
      /*
-     * the order of processing below ensures that we correctly capture where a
-     * domain is in the context of an enclosing gene
+     * the order of processing below ensures that we correctly handle a domain
+     * in the context of an enclosing gene
       */
      processDomainEnd(domain, gene, property);
  
@@ -482,7 +552,7 @@ public class MegaFile extends AlignFile
     */
    protected void processDomainStart(String domain, String property)
    {
-    if ("domainend".equalsIgnoreCase(property))
+    if (DOMAINEND.equalsIgnoreCase(property))
      {
        currentDomain = null;
        return;
@@ -492,12 +562,26 @@ public class MegaFile extends AlignFile
      {
        String verboseDomain = makeVerboseDomainName(domain, property);
        startSequenceFeature(domainStart);
+      currentDomainStartCol = getAlignmentWidth();
  
        currentDomain = verboseDomain;
      }
    }
  
    /**
+   * Returns the width of alignment parsed so far. Note we assume (as does MEGA)
+   * that all sequences are the same length, so we can just take the length of
+   * the first one.
+   * 
+   * @return the length of the first sequence read so far, or 0 if none
+   */
+  protected int getAlignmentWidth()
+  {
+    return seqData.isEmpty() ? 0 : seqData.values()
+            .iterator().next().length();
+  }
+
+  /**
     * If we have declared a gene, and it is not continuing, start a sequence
     * feature for it
     * 
@@ -508,6 +592,7 @@ public class MegaFile extends AlignFile
      if (gene != null && !gene.equals(currentGene))
      {
        startSequenceFeature(geneStart);
+      currentGeneStartCol = getAlignmentWidth();
      }
      currentGene = gene;
    }
@@ -533,11 +618,15 @@ public class MegaFile extends AlignFile
      if (this.currentDomain != null)
      {
        boolean newDomain = !this.currentDomain.equals(verboseDomain);
-      boolean domainEnded = "domainend".equalsIgnoreCase(property);
+      boolean domainEnded = DOMAINEND.equalsIgnoreCase(property);
        if (newDomain || newGene || domainEnded)
        {
          createFeature(DOMAIN, currentDomain, domainStart);
+        // and/or... create annotations for domain
+        extendAnnotation(domainAnnotations, currentDomain,
+                currentDomainStartCol);
          currentDomain = null;
+        currentDomainStartCol = -1;
          return true;
        }
      }
@@ -561,7 +650,10 @@ public class MegaFile extends AlignFile
      if (this.currentGene != null && !this.currentGene.equals(gene))
      {
        createFeature(GENE, currentGene, geneStart);
+      // and/or... add annotations for Gene
+      extendAnnotation(geneAnnotations, currentGene, currentGeneStartCol);
        currentGene = null;
+      currentGeneStartCol = -1;
        created = true;
      }
  
@@ -569,6 +661,37 @@ public class MegaFile extends AlignFile
    }
  
    /**
+   * Helper method to add Annotation elements, with the given description and
+   * starting at the given start column, up to the end of the sequence length
+   * parsed so far. Null elements are inserted for any skipped columns since the
+   * last annotation (if any), i.e. positions with no annotation of this type.
+   * 
+   * @param anns
+   * @param description
+   * @param startColumn
+   *          the start column of the annotations to add, or -1 if nothing to
+   *          add
+   */
+  protected void extendAnnotation(List<Annotation> anns,
+          String description, int startColumn)
+  {
+    int alignmentWidth = getAlignmentWidth();
+    addNullAnnotations(anns, startColumn == -1 ? alignmentWidth
+            : startColumn);
+
+    int numberToAdd = alignmentWidth - anns.size();
+    if (numberToAdd > 0)
+    {
+      Color col = description == null ? Color.black : UserColourScheme
+              .createColourFromName(description);
+      for (int i = 0; i < numberToAdd; i++)
+      {
+        anns.add(new Annotation("X", description, ' ', 0f, col));
+      }
+    }
+  }
+
+  /**
     * Makes an expanded descriptive name for Domain if possible e.g.
     * "Intron1 (Adh Coding)". Currently incorporates the current gene name (if
     * any) and the Coding/Noncoding property value (if given).
@@ -1632,8 +1755,8 @@ public class MegaFile extends AlignFile
    {
      int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
      String none = "";
-    if (annotations == null || annotations.isEmpty()
-            || !MEGA_ANNOTATION_LABEL.equals(annotations.get(0).label))
+    AlignmentAnnotation ann = findAnnotation(MEGA_ANNOTATION_LABEL);
+    if (ann == null)
      {
        return none;
      }
@@ -1669,6 +1792,27 @@ public class MegaFile extends AlignFile
    }
  
    /**
+   * Returns the first stored annotation found with the given label, or null
+   * 
+   * @param annotationLabel
+   * @return
+   */
+  protected AlignmentAnnotation findAnnotation(String annotationLabel)
+  {
+    if (annotations != null)
+    {
+      for (AlignmentAnnotation ann : annotations)
+      {
+        if (annotationLabel.equals(ann.label))
+        {
+          return ann;
+        }
+      }
+    }
+    return null;
+  }
+
+  /**
     * Flag this file as interleaved or not, based on data format. Throws an
     * exception if has previously been determined to be otherwise.
     * 
diff --git a/src/jalview/schemes/UserColourScheme.java b/src/jalview/schemes/UserColourScheme.java

index 92989fb..7ca211c 100755 (executable)
--- a/src/jalview/schemes/UserColourScheme.java
+++ b/src/jalview/schemes/UserColourScheme.java
@@ -136,7 +136,7 @@ public class UserColourScheme extends ResidueColourScheme
  
    }
  
-  public Color createColourFromName(String name)
+  public static Color createColourFromName(String name)
    {
      int r, g, b;
  
diff --git a/test/jalview/io/MegaFileTest.java b/test/jalview/io/MegaFileTest.java

index bdae11a..40a7c6e 100644 (file)
--- a/test/jalview/io/MegaFileTest.java
+++ b/test/jalview/io/MegaFileTest.java
@@ -685,6 +685,60 @@ public class MegaFileTest
        verifySequenceFeature(sfs[7], "MEF2A", "Gene", 31, 36);
        verifySequenceFeature(sfs[8], "BindingSite", "Domain", 37, 42);
      }
+
+    /*
+     * verify gene and domain alignment annotations
+     */
+    assertEquals(2, testee.annotations.size());
+    AlignmentAnnotation ann = testee.annotations.get(0);
+    assertEquals("MEGA Gene", ann.label);
+    assertEquals(42, ann.annotations.length);
+    verifyAnnotation(ann, 0, 6, null);
+    verifyAnnotation(ann, 6, 24, "Adh");
+    verifyAnnotation(ann, 24, 30, "Opsin");
+    verifyAnnotation(ann, 30, 36, "MEF2A");
+    verifyAnnotation(ann, 37, 42, null);
+
+    ann = testee.annotations.get(1);
+    assertEquals("MEGA Domain", ann.label);
+    assertEquals(42, ann.annotations.length);
+    verifyAnnotation(ann, 0, 6, null);
+    verifyAnnotation(ann, 6, 12, "Exon1 (Adh Coding)");
+    verifyAnnotation(ann, 12, 18, "Intron1 (Adh Noncoding)");
+    verifyAnnotation(ann, 19, 24, "Exon2 (Adh Coding)");
+    verifyAnnotation(ann, 25, 30, "Intron1 (Opsin Noncoding)");
+    verifyAnnotation(ann, 31, 36, "Exon1 (MEF2A Coding)");
+    verifyAnnotation(ann, 37, 42, "BindingSite");
+
+  }
+
+  /**
+   * Helper method to verify a range of annotation positions all have the given
+   * description
+   * 
+   * @param ann
+   *          array of annotations to check
+   * @param from
+   *          start index to check
+   * @param to
+   *          end index to check (exclusive)
+   * @param description
+   *          value to assert
+   */
+  protected void verifyAnnotation(AlignmentAnnotation ann, int from,
+          int to, String description)
+  {
+    for (int pos = from; pos < to; pos++)
+    {
+      if (description == null)
+      {
+        assertNull(ann.annotations[pos]);
+      }
+      else
+      {
+        assertEquals(description, ann.annotations[pos].description);
+      }
+    }
    }
  
    /**
author	gmungoc <g.m.carstairs@dundee.ac.uk>
	Fri, 9 Oct 2015 14:54:38 +0000 (15:54 +0100)
committer	gmungoc <g.m.carstairs@dundee.ac.uk>
	Fri, 9 Oct 2015 14:54:38 +0000 (15:54 +0100)
src/jalview/io/MegaFile.java		patch \| blob \| history
src/jalview/schemes/UserColourScheme.java		patch \| blob \| history
test/jalview/io/MegaFileTest.java		patch \| blob \| history