From e710810610e029b1519d31c6d8a9de26833f59b9 Mon Sep 17 00:00:00 2001
From: gmungoc <g.m.carstairs@dundee.ac.uk>
Date: Tue, 6 Oct 2015 16:41:18 +0100
Subject: [PATCH] JAL-1499 improved parsing of !Domain and !Gene including
 Property=domainend

---
 src/jalview/io/MegaFile.java      |  213 ++++++++++++++++++++++++++++---------
 test/jalview/io/MegaFileTest.java |   76 ++++++++++++-
 2 files changed, 237 insertions(+), 52 deletions(-)
diff --git a/src/jalview/io/MegaFile.java b/src/jalview/io/MegaFile.java
index e4079c0..58961f8 100644
--- a/src/jalview/io/MegaFile.java
+++ b/src/jalview/io/MegaFile.java
@@ -53,6 +53,8 @@ import java.util.Set;
  */
 public class MegaFile extends AlignFile
 {
+  private static final String WHITESPACE = "\\s+";
+
   private static final int DEFAULT_LINE_LENGTH = 60;
 
   private static final String INDENT = "    ";
@@ -99,6 +101,10 @@ public class MegaFile extends AlignFile
 
   private static final String DOMAIN = "Domain";
 
+  private static final String PROPERTY = "Property";
+
+  private static final String CODONSTART = "CodonStart";
+
   /*
    * names of properties to save to the alignment (may affect eventual output
    * format)
@@ -167,14 +173,17 @@ public class MegaFile extends AlignFile
   // number of residues read (so far) per sequence
   Map<String, Integer> residuesRead;
   
-  // start residue (base 1) per sequence of current feature
-  Map<String, Integer> featureStart;
-  
-  // feature (Gene/Domain) if any we are parsing
-  private String currentFeature;
+  // current Gene if any we are parsing
+  private String currentGene;
+
+  // start residue (base 1) per sequence of current gene
+  Map<String, Integer> geneStart;
 
-  // feature type (Gene/Domain) if any we are parsing
-  private String currentFeatureType;
+  // current Domain if any we are parsing
+  private String currentDomain;
+
+  // start residue (base 1) per sequence of current domain
+  Map<String, Integer> domainStart;
 
   // map of SequenceFeature's by sequence id
   Map<String, List<SequenceFeature>> sequenceFeatures;
@@ -201,7 +210,8 @@ public class MegaFile extends AlignFile
   {
     gapCharacter = '-';
     sequenceFeatures = new HashMap<String, List<SequenceFeature>>();
-    featureStart = new HashMap<String, Integer>();
+    geneStart = new HashMap<String, Integer>();
+    domainStart = new HashMap<String, Integer>();
     residuesRead = new HashMap<String, Integer>();
 
     /*
@@ -226,13 +236,10 @@ public class MegaFile extends AlignFile
       dataLine = dataLine.trim();
       if (dataLine.length() > 0)
       {
-        if (dataLine.startsWith(BANG + GENE))
-        {
-          parseFeature(GENE, dataLine);
-        }
-        else if (dataLine.startsWith(BANG + DOMAIN))
+        if (dataLine.startsWith(BANG + GENE)
+                || dataLine.startsWith(BANG + DOMAIN))
         {
-          parseFeature(DOMAIN, dataLine);
+          parseGeneOrDomain(dataLine);
         }
         else
         {
@@ -249,6 +256,12 @@ public class MegaFile extends AlignFile
       dataLine = nextNonCommentLine();
     }
 
+    /*
+     * close off any features currently being parsed
+     */
+    createFeature(GENE, currentGene, geneStart);
+    createFeature(DOMAIN, currentDomain, domainStart);
+
     // remember the (longest) line length read in, so we can output the same
     setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
 
@@ -269,36 +282,131 @@ public class MegaFile extends AlignFile
   }
 
   /**
-   * Parse a !Gene or !Domain command line
+   * Parse a !Gene or !Domain command line. MEGA accepts
+   * <ul>
+   * <li>!Gene=name;</li>
+   * <li>!Gene=name Property=Coding/Noncoding CodonStart=1/2/3;</li>
+   * <li>!Gene=genename Domain=domainname Property= etc</li>
+   * <li>!Domain=domainname Gene=genename Property= etc</li>
+   * <li>!Domain=domainname Property= etc</li>
+   * <li>!domain=domainname property=domainend</li>
+   * </ul>
+   * Properly, a Gene should be composed of Domain segments, but MEGA accepts
+   * without. Note that keywords don't seem to be case sensitive.
    * 
-   * @param featureType
    * @param dataLine
+   * @throws FileFormatException if the line is not !key=value [...];
    */
-  protected void parseFeature(String featureType, String dataLine)
+  protected void parseGeneOrDomain(String dataLine)
+          throws FileFormatException
   {
-    String featureName = getValue(dataLine);
-    // TODO parse !Gene=xyx Property=end; ???
-    if (this.currentFeature != null)
+    String domain = null;
+    String gene = null;
+    String property = null;
+    String codonStart = null;
+    String errorMsg = "Unrecognized format: " + dataLine;
+
+    if (!dataLine.startsWith(BANG) || !dataLine.endsWith(SEMICOLON))
+    {
+      throw new FileFormatException(errorMsg);
+    }
+    String trimmed = dataLine.substring(1, dataLine.length() - 1).trim();
+    String[] tokens = trimmed.split(WHITESPACE);
+    for (String token : tokens)
     {
-      endSequenceFeature();
+      String[] keyValue = token.split("=");
+      if (keyValue.length != 2)
+      {
+        throw new FileFormatException(errorMsg);
+      }
+      String key = keyValue[0];
+      if (GENE.equalsIgnoreCase(key))
+      {
+        gene = keyValue[1];
+      }
+      else if (DOMAIN.equalsIgnoreCase(key))
+      {
+        domain = keyValue[1];
+      }
+      else if (PROPERTY.equalsIgnoreCase(key))
+      {
+        property = keyValue[1];
+      }
+      else if (CODONSTART.equalsIgnoreCase(key))
+      {
+        codonStart = keyValue[1];
+      }
+      else
+      {
+        System.err.println("Unrecognised token: '" + key + "' in "
+                + dataLine);
+      }
     }
-    startSequenceFeature(featureName, featureType);
+
+    processGeneOrDomain(gene, domain, property, codonStart);
   }
 
   /**
-   * Start processing a new feature
+   * Process a statement containing one or both of Gene and Domain, and
+   * optionally Property or CodonStart commands.
    * 
-   * @param featureName
+   * @param gene
+   *          the Gene name if specified, else null
+   * @param domain
+   *          the Domain name if specified, else null
+   * @param property
+   *          the Property value, else null; "domainend" ends the Domain
+   * @param codonStart
+   *          the CodonStart value if specified, else null
    */
-  protected void startSequenceFeature(String featureName, String featureType)
+  protected void processGeneOrDomain(String gene, String domain,
+          String property, String codonStart)
   {
-    currentFeature = featureName;
-    currentFeatureType = featureType;
+    /*
+     * Close off (generate sequence features for) any current Gene or Domain
+     * not continued here; Property=domainend also closes and clears the Domain
+     */
+    if (this.currentGene != null && !this.currentGene.equals(gene))
+    {
+      createFeature(GENE, currentGene, geneStart);
+    }
+    if (this.currentDomain != null)
+    {
+      if (!this.currentDomain.equals(domain)
+              || "domainend".equalsIgnoreCase(property))
+      {
+        createFeature(DOMAIN, currentDomain, domainStart);
+      }
+    }
 
     /*
-     * If the feature name precedes all sequences, we will know in
-     * endSequenceFeature that it starts with residue 1; otherwise note now
-     * where it starts in each sequence
+     * and if we have declared a Gene or Domain which does not continue the
+     * current one, then record its start positions per sequence
+     */
+    if (gene != null && !gene.equals(currentGene))
+    {
+      startSequenceFeature(geneStart);
+    }
+    if (domain != null && !domain.equals(currentDomain))
+    {
+      startSequenceFeature(domainStart);
+    }
+
+    currentGene = gene;
+    currentDomain = "domainend".equalsIgnoreCase(property) ? null : domain;
+  }
+
+  /**
+   * Start processing a new feature
+   * 
+   * @param startPositions
+   */
+  protected void startSequenceFeature(Map<String, Integer> startPositions)
+  {
+    /*
+     * If the feature declaration precedes all sequences, we will know in
+     * createFeature that it started with residue 1; otherwise note now where it
+     * starts in each sequence
      */
     if (!residuesRead.isEmpty())
     {
@@ -306,22 +414,32 @@ public class MegaFile extends AlignFile
       {
         String seqId = entry.getKey();
         Integer nextResidue = entry.getValue() + 1;
-        featureStart.put(seqId, nextResidue);
+        startPositions.put(seqId, nextResidue);
       }
     }
   }
 
   /**
-   * Add a SequenceFeature for the current feature to each sequence, using the
-   * current feature start/end values per sequence
+   * Add a SequenceFeature to each sequence, from its recorded start residue
+   * (default 1) to the last residue read. No-op if featureValue is null.
+   * 
+   * @param featureType the kind of feature (GENE or DOMAIN)
+   * @param featureValue the gene or domain name; if null, nothing is added
+   * @param featureStartResidues start residue (base 1) by sequence id
    */
-  protected void endSequenceFeature()
+  protected void createFeature(String featureType, String featureValue,
+          Map<String, Integer> featureStartResidues)
   {
+    if (featureValue == null)
+    {
+      return;
+    }
+
     Iterator<String> seqids = this.seqData.keySet().iterator();
     while (seqids.hasNext())
     {
       String seqid = seqids.next();
-      Integer startAt = featureStart.get(seqid);
+      Integer startAt = featureStartResidues.get(seqid);
       int sfstart = startAt == null ? 1 : startAt.intValue();
       int sfend = residuesRead.get(seqid);
       if (sfend >= sfstart)
@@ -329,23 +447,16 @@ public class MegaFile extends AlignFile
         /*
          * don't add feature if entirely gapped in the sequence
          */
-        SequenceFeature sf = new SequenceFeature(currentFeature,
-                currentFeatureType, sfstart, sfend, 0f, null);
+        // TODO: type="Gene" (but then all coloured the same) or
+        // type="GeneName"?
+        SequenceFeature sf = new SequenceFeature(featureValue, featureType,
+                sfstart, sfend, 0f, null);
         sequenceFeatures.get(seqid).add(sf);
       }
     }
   }
 
   /**
-   * Parse a !Domain command line
-   * 
-   * @param dataLine
-   */
-  private void parseDomain(String dataLine)
-  {
-  }
-
-  /**
    * Returns the next line that is not a comment, or null at end of file.
    * Comments in MEGA are within [ ] brackets, and may be nested.
    * 
@@ -798,7 +909,7 @@ public class MegaFile extends AlignFile
     {
       return;
     }
-    String[] tokens = inputLine.trim().split("\\s"); // any whitespace
+    String[] tokens = inputLine.trim().split(WHITESPACE);
     for (String token : tokens)
     {
       parseFormatKeyword(token);
@@ -892,14 +1003,14 @@ public class MegaFile extends AlignFile
       System.err.println("Warning: " + token + " not supported");
     }
 
-    else if (keyword.equalsIgnoreCase("Property"))
+    else if (keyword.equalsIgnoreCase(PROPERTY))
     {
-      // TODO: figure out what to do with this
-      // can it appear more than once in a file?
-      setAlignmentProperty(PROP_MISSING, value);
+      // TODO: can Property appear in a Format command?
+      // suspect this is a mistake in the manual
     }
 
     else if (!keyword.equalsIgnoreCase(N_SEQS)
+            && !keyword.equalsIgnoreCase("NTaxa")
             && !keyword.equalsIgnoreCase(N_SITES))
     {
       System.err.println("Warning: " + msg);
diff --git a/test/jalview/io/MegaFileTest.java b/test/jalview/io/MegaFileTest.java
index d868dcb..14cb27e 100644
--- a/test/jalview/io/MegaFileTest.java
+++ b/test/jalview/io/MegaFileTest.java
@@ -8,6 +8,7 @@ import static org.testng.AssertJUnit.fail;
 
 import jalview.datamodel.AlignmentI;
 import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceFeature;
 import jalview.datamodel.SequenceI;
 
 import java.io.IOException;
@@ -76,7 +77,7 @@ public class MegaFileTest
   private static final String INTERLEAVED_WITH_DESCRIPTION = 
           "#MEGA\n"
           + "!Title Data with description;\n"
-          + "!Format DataType=DNA indel=- CodeTable=Standard Missing=? MatchChar=.;\n\n"
+          + "!Format DataType=DNA  indel=-\tCodeTable=Standard Missing=? MatchChar=.;\n\n"
           + "!Description\n" 
           + "    Line one of description\n"
           + "    Line two of description;\n\n"
@@ -605,5 +606,78 @@ public class MegaFileTest
   }
 
   //@formatter:on
+  
+  /**
+   * Tests that !Gene and !Domain statements are parsed to sequence features.
+   * 
+   * @throws IOException
+   */
+  @Test(groups = { "Functional" })
+  public void testParse_geneDomains() throws IOException
+  {
+    //@formatter:off
+    String data = "#MEGA\n"+ 
+    "TITLE: Interleaved sequence data\n\n" + 
+    "#U455   CCCCCC\n" + 
+    "#CPZANT  TTTTTT\n\n" +
+    "!Domain=Exon1 Gene=Adh Property=Coding CodonStart=1;\n" +
+    "#U455   GGGGGG\n" + 
+    "#CPZANT AAAAAA\n\n" +
+    "!Domain=Intron1 Gene=Adh;\n" +
+    "#U455   tttttt\n" + 
+    "#CPZANT cccccc\n\n" +
+    "!Domain=Exon2 Gene=Adh Property=Coding CodonStart=1;\n" +
+    "#U455   aaaaaa\n" + 
+    "#CPZANT gggggg\n\n" +
+    // explicit end of Exon2, implicit end of Adh (no Gene= on this line):
+    "!Domain=Exon2 Property=domainend;\n" +
+    "!Domain=BindingSite;\n" +
+    "#U455   CCCCCC\n" + 
+    "#CPZANT TTTTTT\n\n";
+    //@formatter:on
+    MegaFile testee = new MegaFile(data, AppletFormatAdapter.PASTE);
 
+    Vector<SequenceI> seqs = testee.getSeqs();
+    // should be 2 sequences
+    assertEquals("Expected two sequences", 2, seqs.size());
+    // check the interleaved blocks were concatenated per sequence
+    assertEquals("First sequence data wrong",
+            "CCCCCCGGGGGGttttttaaaaaaCCCCCC", seqs.get(0)
+            .getSequenceAsString());
+    assertEquals("Second sequence data wrong",
+            "TTTTTTAAAAAAccccccggggggTTTTTT", seqs.get(1)
+            .getSequenceAsString());
+
+    /*
+     * each sequence should have features for Gene=Adh 7-24, Exon1 7-12,
+     * Intron1 13-18, Exon2 19-24, BindingSite 25-30
+     */
+    for (SequenceI seq : seqs) {
+      SequenceFeature[] sfs = seq.getSequenceFeatures();
+      // features are added in the order in which their end is found
+      // (Gene before Domain when they end together)
+      assertEquals(5, sfs.length);
+      // TODO settle which way round type/description go!
+      assertEquals("Exon1", sfs[0].type);
+      assertEquals("Domain", sfs[0].description);
+      assertEquals(7, sfs[0].begin);
+      assertEquals(12, sfs[0].end);
+      assertEquals("Intron1", sfs[1].type);
+      assertEquals("Domain", sfs[1].description);
+      assertEquals(13, sfs[1].begin);
+      assertEquals(18, sfs[1].end);
+      assertEquals("Adh", sfs[2].type);
+      assertEquals("Gene", sfs[2].description);
+      assertEquals(7, sfs[2].begin);
+      assertEquals(24, sfs[2].end);
+      assertEquals("Exon2", sfs[3].type);
+      assertEquals("Domain", sfs[3].description);
+      assertEquals(19, sfs[3].begin);
+      assertEquals(24, sfs[3].end);
+      assertEquals("BindingSite", sfs[4].type);
+      assertEquals("Domain", sfs[4].description);
+      assertEquals(25, sfs[4].begin);
+      assertEquals(30, sfs[4].end);
+    }
+  }
 }
-- 
1.7.10.2