JAL-1705 JAL-1191 SequenceOntologyLite added as hard-coded alternative
author gmungoc <g.m.carstairs@dundee.ac.uk>
Mon, 1 Feb 2016 10:17:49 +0000 (10:17 +0000)
committer gmungoc <g.m.carstairs@dundee.ac.uk>
Mon, 1 Feb 2016 10:17:49 +0000 (10:17 +0000)
15 files changed:
src/jalview/analysis/AlignmentUtils.java
src/jalview/ext/ensembl/EnsemblCdna.java
src/jalview/ext/ensembl/EnsemblCds.java
src/jalview/ext/ensembl/EnsemblGene.java
src/jalview/ext/ensembl/EnsemblOverlap.java
src/jalview/ext/ensembl/EnsemblRestClient.java
src/jalview/ext/ensembl/EnsemblSeqProxy.java
src/jalview/io/gff/Gff3Helper.java
src/jalview/io/gff/InterProScanHelper.java
src/jalview/io/gff/SequenceOntology.java
src/jalview/io/gff/SequenceOntologyFactory.java [new file with mode: 0644]
src/jalview/io/gff/SequenceOntologyI.java [new file with mode: 0644]
src/jalview/io/gff/SequenceOntologyLite.java [new file with mode: 0644]
test/jalview/ext/ensembl/EnsemblSeqProxyTest.java
test/jalview/io/gff/SequenceOntologyTest.java

index 41538eb..34eaa60 100644 (file)
@@ -34,7 +34,8 @@ import jalview.datamodel.Sequence;
 import jalview.datamodel.SequenceFeature;
 import jalview.datamodel.SequenceGroup;
 import jalview.datamodel.SequenceI;
-import jalview.io.gff.SequenceOntology;
+import jalview.io.gff.SequenceOntologyFactory;
+import jalview.io.gff.SequenceOntologyI;
 import jalview.schemes.ResidueProperties;
 import jalview.util.DBRefUtils;
 import jalview.util.MapList;
@@ -1435,7 +1436,7 @@ public class AlignmentUtils
       copyTo = copyTo.getDatasetSequence();
     }
 
-    SequenceOntology so = SequenceOntology.getInstance();
+    SequenceOntologyI so = SequenceOntologyFactory.getInstance();
     int count = 0;
     SequenceFeature[] sfs = fromSeq.getSequenceFeatures();
     if (sfs != null)
index 139e44f..373286f 100644 (file)
@@ -1,7 +1,8 @@
 package jalview.ext.ensembl;
 
 import jalview.datamodel.SequenceFeature;
-import jalview.io.gff.SequenceOntology;
+import jalview.io.gff.SequenceOntologyFactory;
+import jalview.io.gff.SequenceOntologyI;
 
 import java.util.List;
 
@@ -68,8 +69,8 @@ public class EnsemblCdna extends EnsemblSeqProxy
   @Override
   protected boolean identifiesSequence(SequenceFeature sf, String accId)
   {
-    if (SequenceOntology.getInstance().isA(sf.getType(),
-            SequenceOntology.EXON))
+    if (SequenceOntologyFactory.getInstance().isA(sf.getType(),
+            SequenceOntologyI.EXON))
     {
       String parentFeature = (String) sf.getValue(PARENT);
       if (("transcript:" + accId).equals(parentFeature))
index 22c0a06..ec5780f 100644 (file)
@@ -2,7 +2,8 @@ package jalview.ext.ensembl;
 
 import jalview.datamodel.SequenceFeature;
 import jalview.datamodel.SequenceI;
-import jalview.io.gff.SequenceOntology;
+import jalview.io.gff.SequenceOntologyFactory;
+import jalview.io.gff.SequenceOntologyI;
 
 import java.util.List;
 
@@ -51,8 +52,8 @@ public class EnsemblCds extends EnsemblSeqProxy
   @Override
   protected boolean retainFeature(SequenceFeature sf, String accessionId)
   {
-    if (SequenceOntology.getInstance().isA(sf.getType(),
-            SequenceOntology.CDS))
+    if (SequenceOntologyFactory.getInstance().isA(sf.getType(),
+            SequenceOntologyI.CDS))
     {
       return false;
     }
@@ -67,8 +68,8 @@ public class EnsemblCds extends EnsemblSeqProxy
   @Override
   protected boolean identifiesSequence(SequenceFeature sf, String accId)
   {
-    if (SequenceOntology.getInstance().isA(sf.getType(),
-            SequenceOntology.CDS))
+    if (SequenceOntologyFactory.getInstance().isA(sf.getType(),
+            SequenceOntologyI.CDS))
     {
       String parentFeature = (String) sf.getValue(PARENT);
       if (("transcript:" + accId).equals(parentFeature))
index 1325bec..df246f8 100644 (file)
@@ -4,7 +4,8 @@ import jalview.datamodel.AlignmentI;
 import jalview.datamodel.Sequence;
 import jalview.datamodel.SequenceFeature;
 import jalview.datamodel.SequenceI;
-import jalview.io.gff.SequenceOntology;
+import jalview.io.gff.SequenceOntologyFactory;
+import jalview.io.gff.SequenceOntologyI;
 import jalview.util.MapList;
 
 import java.util.ArrayList;
@@ -132,10 +133,10 @@ public class EnsemblGene extends EnsemblSeqProxy
      */
     String parentId = "transcript:" + accId;
     List<SequenceFeature> splices = findFeatures(gene,
-            SequenceOntology.EXON, parentId);
+            SequenceOntologyI.EXON, parentId);
     if (splices.isEmpty())
     {
-      splices = findFeatures(gene, SequenceOntology.CDS, parentId);
+      splices = findFeatures(gene, SequenceOntologyI.CDS, parentId);
     }
 
     int transcriptLength = 0;
@@ -176,7 +177,7 @@ public class EnsemblGene extends EnsemblSeqProxy
     /*
      * and finally fetch the protein product and save as a cross-reference
      */
-    addProteinProduct(transcript);
+    new EnsemblCdna().addProteinProduct(transcript);
 
     return transcript;
   }
@@ -240,8 +241,8 @@ public class EnsemblGene extends EnsemblSeqProxy
   @Override
   protected boolean identifiesSequence(SequenceFeature sf, String accId)
   {
-    if (SequenceOntology.getInstance().isA(sf.getType(),
-            SequenceOntology.GENE))
+    if (SequenceOntologyFactory.getInstance().isA(sf.getType(),
+            SequenceOntologyI.GENE))
     {
       String id = (String) sf.getValue(ID);
       if (("gene:" + accId).equals(id))
@@ -262,8 +263,8 @@ public class EnsemblGene extends EnsemblSeqProxy
   @Override
   protected boolean retainFeature(SequenceFeature sf, String accessionId)
   {
-    if (SequenceOntology.getInstance().isA(sf.getType(),
-            SequenceOntology.GENE))
+    if (SequenceOntologyFactory.getInstance().isA(sf.getType(),
+            SequenceOntologyI.GENE))
     {
       return false;
     }
@@ -299,4 +300,13 @@ public class EnsemblGene extends EnsemblSeqProxy
     return super.getCrossReferenceDatabases();
   }
 
+  /**
+   * Override to do nothing as Ensembl doesn't return a protein sequence for a
+   * gene identifier
+   */
+  @Override
+  protected void addProteinProduct(SequenceI querySeq)
+  {
+  }
+
 }
index b1514d8..507b6f8 100644 (file)
@@ -42,14 +42,11 @@ public class EnsemblOverlap extends EnsemblRestClient
   @Override
   public AlignmentI getSequenceRecords(String query) throws IOException
   {
-    long now = System.currentTimeMillis();
     // TODO: use a vararg String... for getSequenceRecords instead?
     List<String> queries = new ArrayList<String>();
     queries.add(query);
     FileParse fp = getSequenceReader(queries);
     FeaturesFile fr = new FeaturesFile(fp);
-    System.out.println(getClass().getName() + " took "
-            + (System.currentTimeMillis() - now) + "ms to fetch");
     return new Alignment(fr.getSeqsAsArray());
   }
 
index 2fd7fa3..dc4cc88 100644 (file)
@@ -140,6 +140,7 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher
   protected BufferedReader getHttpResponse(URL url, List<String> ids)
           throws IOException
   {
+    long now = System.currentTimeMillis();
     HttpURLConnection connection = (HttpURLConnection) url.openConnection();
   
     /*
@@ -175,6 +176,8 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher
               "Response code was not 200. Detected response was "
                       + responseCode);
     }
+    System.out.println(getClass().getName() + " took "
+            + (System.currentTimeMillis() - now) + "ms to fetch");
   
     BufferedReader reader = null;
     reader = new BufferedReader(new InputStreamReader(response, "UTF-8"));
index 0bfeda1..744aa49 100644 (file)
@@ -11,7 +11,8 @@ import jalview.datamodel.SequenceI;
 import jalview.exceptions.JalviewException;
 import jalview.io.FastaFile;
 import jalview.io.FileParse;
-import jalview.io.gff.SequenceOntology;
+import jalview.io.gff.SequenceOntologyFactory;
+import jalview.io.gff.SequenceOntologyI;
 import jalview.schemes.ResidueProperties;
 import jalview.util.DBRefUtils;
 import jalview.util.MapList;
@@ -127,7 +128,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
   @Override
   public AlignmentI getSequenceRecords(String query) throws Exception
   {
-    long now = System.currentTimeMillis();
     // TODO use a String... query vararg instead?
 
     // danger: accession separator used as a regex here, a string elsewhere
@@ -156,14 +156,15 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
                 + " chunks. Unexpected problem (" + r.getLocalizedMessage()
                 + ")";
         System.err.println(msg);
-        if (alignment != null)
-        {
-          break; // return what we got
-        }
-        else
-        {
-          throw new JalviewException(msg, r);
-        }
+        break;
+        // if (alignment != null)
+        // {
+        // break; // return what we got
+        // }
+        // else
+        // {
+        // throw new JalviewException(msg, r);
+        // }
       }
     }
 
@@ -181,8 +182,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
       getCrossReferences(seq);
     }
 
-    System.out.println(getClass().getName() + " took "
-            + (System.currentTimeMillis() - now) + "ms to fetch");
     return alignment;
   }
 
@@ -368,11 +367,12 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
   }
 
   /**
-   * Adds CDS ranges to the ranges list, and returns the total length mapped.
+   * Adds CDS ranges to the ranges list, and returns the total length mapped
+   * from.
    * 
-   * No need to worry about reverse strand dna here since the retrieved sequence
-   * is as transcribed (reverse complement for reverse strand), i.e in the same
-   * sense as the peptide.
+   * No need to worry about reverse strand dna here, since the retrieved
+   * sequence is as transcribed (reverse complement for reverse strand), i.e in
+   * the same sense as the peptide.
    * 
    * @param dnaSeq
    * @param ranges
@@ -391,7 +391,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
       /*
        * process a CDS feature (or a sub-type of CDS)
        */
-      if (SequenceOntology.getInstance().isA(sf.getType(), SequenceOntology.CDS))
+      if (SequenceOntologyFactory.getInstance().isA(sf.getType(),
+              SequenceOntologyI.CDS))
       {
         int phase = 0;
         try {
@@ -579,7 +580,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
    *          the start position of the sequence we are mapping to
    * @return
    */
-  protected MapList getGenomicRanges(SequenceI sourceSequence,
+  protected MapList getGenomicRangesFromFeatures(SequenceI sourceSequence,
           String accId, int start)
   {
     SequenceFeature[] sfs = sourceSequence.getSequenceFeatures();
@@ -605,11 +606,12 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
        */
       if (identifiesSequence(sf, accId))
       {
-          int strand = sf.getStrand();
-  
-          if (directionSet && strand != direction)
-          {
-            // abort - mix of forward and backward
+        int strand = sf.getStrand();
+        strand = strand == 0 ? 1 : strand; // treat unknown as forward
+
+        if (directionSet && strand != direction)
+        {
+          // abort - mix of forward and backward
           System.err.println("Error: forward and backward strand for "
                   + accId);
             return null;
@@ -654,8 +656,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
      */
     Collections.sort(regions, new RangeSorter(direction == 1));
   
-    List<int[]> to = new ArrayList<int[]>();
-    to.add(new int[] { start, start + mappedLength - 1 });
+    List<int[]> to = Arrays.asList(new int[] { start,
+        start + mappedLength - 1 });
   
     return new MapList(regions, to, 1, 1);
   }
@@ -710,7 +712,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
       /*
        * for sequence_variant, make an additional feature with consequence
        */
-      if (SequenceOntology.getInstance().isSequenceVariant(sf.getType()))
+      if (SequenceOntologyFactory.getInstance().isA(sf.getType(),
+              SequenceOntologyI.SEQUENCE_VARIANT))
       {
         String consequence = (String) sf.getValue(CONSEQUENCE_TYPE);
         if (consequence != null)
@@ -741,7 +744,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
     }
 
     SequenceFeature[] sfs = sourceSequence.getSequenceFeatures();
-    MapList mapping = getGenomicRanges(sourceSequence, accessionId,
+    MapList mapping = getGenomicRangesFromFeatures(sourceSequence, accessionId,
             targetSequence.getStart());
     if (mapping == null)
     {
@@ -850,7 +853,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
     
     SequenceFeature[] sfs = sequence.getSequenceFeatures();
     if (sfs != null) {
-      SequenceOntology so = SequenceOntology.getInstance();
+      SequenceOntologyI so = SequenceOntologyFactory.getInstance();
       for (SequenceFeature sf :sfs) {
         if (so.isA(sf.getType(), type))
         {
@@ -888,7 +891,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
     }
   
     AlignmentUtils.transferFeatures(dnaSeq, peptide, dnaToProtein,
-            SequenceOntology.EXON);
+            SequenceOntologyI.EXON);
 
     LinkedHashMap<Integer, String[][]> variants = buildDnaVariantsMap(
             dnaSeq, dnaToProtein);
@@ -909,7 +912,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
         String desc = StringUtils.listToDelimitedString(peptideVariants,
                 ", ");
         SequenceFeature sf = new SequenceFeature(
-                SequenceOntology.SEQUENCE_VARIANT, desc, peptidePos,
+                SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos,
                 peptidePos, 0f, null);
         peptide.addSequenceFeature(sf);
         count++;
@@ -934,7 +937,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
      * LinkedHashMap ensures we add the peptide features in sequence order
      */
     LinkedHashMap<Integer, String[][]> variants = new LinkedHashMap<Integer, String[][]>();
-    SequenceOntology so = SequenceOntology.getInstance();
+    SequenceOntologyI so = SequenceOntologyFactory.getInstance();
   
     SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures();
     if (dnaFeatures == null)
@@ -957,7 +960,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
         // not handling multi-locus variant features
         continue;
       }
-      if (so.isSequenceVariant(sf.getType()))
+      if (so.isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT))
       {
         int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);
         if (mapsTo == null)
@@ -1096,6 +1099,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
   public static boolean isTranscript(String featureType)
   {
     return NMD_VARIANT.equals(featureType)
-            || SequenceOntology.getInstance().isA(featureType, SequenceOntology.TRANSCRIPT);
+            || SequenceOntologyFactory.getInstance().isA(featureType,
+                    SequenceOntologyI.TRANSCRIPT);
   }
 }
index 2e98e4e..d29645b 100644 (file)
@@ -70,12 +70,13 @@ public class Gff3Helper extends GffHelperBase
       String atts = gff[ATTRIBUTES_COL];
       Map<String, List<String>> attributes = parseNameValuePairs(atts);
 
-      if (SequenceOntology.getInstance().isProteinMatch(soTerm))
+      SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+      if (so.isA(soTerm, SequenceOntologyI.PROTEIN_MATCH))
       {
-        sf = processProteinMatch(attributes, seq, gff, align,
-                newseqs, relaxedIdMatching);
+        sf = processProteinMatch(attributes, seq, gff, align, newseqs,
+                relaxedIdMatching);
       }
-      else if (SequenceOntology.getInstance().isNucleotideMatch(soTerm))
+      else if (so.isA(soTerm, SequenceOntologyI.NUCLEOTIDE_MATCH))
       {
         sf = processNucleotideMatch(attributes, seq, gff, align,
                 newseqs, relaxedIdMatching);
@@ -372,9 +373,9 @@ public class Gff3Helper extends GffHelperBase
       desc = target.split(" ")[0];
     }
 
-    SequenceOntology so = SequenceOntology.getInstance();
+    SequenceOntologyI so = SequenceOntologyFactory.getInstance();
     String type = sf.getType();
-    if (so.isSequenceVariant(type))
+    if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT))
     {
       /*
        * Ensembl returns dna variants as 'alleles'
index 3323e27..68d5d4f 100644 (file)
@@ -89,10 +89,11 @@ public class InterProScanHelper extends Gff3Helper
    */
   public static boolean recognises(String[] columns)
   {
-    SequenceOntology so = SequenceOntology.getInstance();
+    SequenceOntologyI so = SequenceOntologyFactory.getInstance();
     String type = columns[TYPE_COL];
-    if (so.isProteinMatch(type)
-            || (".".equals(columns[SOURCE_COL]) && so.isPolypeptide(type)))
+    if (so.isA(type, SequenceOntologyI.PROTEIN_MATCH)
+            || (".".equals(columns[SOURCE_COL]) && so.isA(type,
+                    SequenceOntologyI.POLYPEPTIDE)))
     {
       return true;
     }
index 685b83e..b069eef 100644 (file)
@@ -7,6 +7,7 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.text.ParseException;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -25,32 +26,8 @@ import org.biojava.nbio.ontology.utils.Annotation;
  * A wrapper class that parses the Sequence Ontology and exposes useful access
  * methods. This version uses the BioJava parser.
  */
-public class SequenceOntology
+class SequenceOntology implements SequenceOntologyI
 {
-
-  /*
-   * selected commonly used values for quick reference
-   */
-  // SO:0000316
-  public static final String CDS = "CDS";
-
-  // SO:0001060
-  public static final String SEQUENCE_VARIANT = "sequence_variant";
-
-  // SO:0000147
-  public static final String EXON = "exon";
-
-  // SO:0000673
-  public static final String TRANSCRIPT = "transcript";
-
-  // SO:0000704
-  public static final String GENE = "gene";
-
-  /*
-   * singleton instance of this class
-   */
-  private static SequenceOntology instance;
-
   /*
    * the parsed Ontology data as modelled by BioJava
    */
@@ -73,26 +50,18 @@ public class SequenceOntology
    */
   private Map<Term, List<Term>> termIsA;
 
-  /**
-   * Returns singleton instance
-   * 
-   * @return
-   */
-  public synchronized static SequenceOntology getInstance()
-  {
-    if (instance == null)
-    {
-      instance = new SequenceOntology();
-    }
-    return instance;
-  }
+  private List<String> termsFound;
+
+  private List<String> termsNotFound;
 
   /**
-   * Private constructor to enforce use of singleton. Parses and caches the SO
-   * OBO data file.
+   * Package private constructor to enforce use of singleton. Parses and caches
+   * the SO OBO data file.
    */
-  private SequenceOntology()
+  SequenceOntology()
   {
+    termsFound = new ArrayList<String>();
+    termsNotFound = new ArrayList<String>();
     termsByDescription = new HashMap<String, Term>();
     termIsA = new HashMap<Term, List<Term>>();
 
@@ -248,7 +217,7 @@ public class SequenceOntology
    */
   public boolean isNucleotideMatch(String soTerm)
   {
-    return isA(soTerm, "nucleotide_match");
+    return isA(soTerm, NUCLEOTIDE_MATCH);
   }
 
   /**
@@ -261,7 +230,7 @@ public class SequenceOntology
    */
   public boolean isProteinMatch(String soTerm)
   {
-    return isA(soTerm, "protein_match");
+    return isA(soTerm, PROTEIN_MATCH);
   }
 
   /**
@@ -274,7 +243,7 @@ public class SequenceOntology
    */
   public boolean isPolypeptide(String soTerm)
   {
-    return isA(soTerm, "polypeptide");
+    return isA(soTerm, POLYPEPTIDE);
   }
 
   /**
@@ -285,23 +254,70 @@ public class SequenceOntology
    * @param parent
    * @return
    */
+  @Override
   public boolean isA(String child, String parent)
   {
+    if (child == null || parent == null)
+    {
+      return false;
+    }
     /*
      * optimise trivial checks like isA("CDS", "CDS")
      */
     if (child.equals(parent))
     {
+      termFound(child);
       return true;
     }
 
     Term childTerm = getTerm(child);
+    if (childTerm != null)
+    {
+      termFound(child);
+    }
+    else
+    {
+      termNotFound(child);
+    }
     Term parentTerm = getTerm(parent);
 
     return termIsA(childTerm, parentTerm);
   }
 
   /**
+   * Records a valid term queried for, for reporting purposes
+   * 
+   * @param term
+   */
+  private void termFound(String term)
+  {
+    synchronized (termsFound)
+    {
+      if (!termsFound.contains(term))
+      {
+        termsFound.add(term);
+      }
+    }
+  }
+
+  /**
+   * Records an invalid term queried for, for reporting purposes
+   * 
+   * @param term
+   */
+  private void termNotFound(String term)
+  {
+    synchronized (termsNotFound)
+    {
+      if (!termsNotFound.contains(term))
+      {
+        System.err.println("SO term " + term + " invalid");
+        termsNotFound.add(term);
+      }
+    }
+  }
+
+  /**
    * Returns true if the childTerm 'isA' parentTerm (directly or indirectly).
    * 
    * @param childTerm
@@ -402,6 +418,32 @@ public class SequenceOntology
 
   public boolean isSequenceVariant(String soTerm)
   {
-    return isA(soTerm, "sequence_variant");
+    return isA(soTerm, SEQUENCE_VARIANT);
+  }
+
+  /**
+   * Sorts (case-insensitive) and returns the list of valid terms queried for
+   */
+  @Override
+  public List<String> termsFound()
+  {
+    synchronized (termsFound)
+    {
+      Collections.sort(termsFound, String.CASE_INSENSITIVE_ORDER);
+      return termsFound;
+    }
+  }
+
+  /**
+   * Sorts (case-insensitive) and returns the list of invalid terms queried for
+   */
+  @Override
+  public List<String> termsNotFound()
+  {
+    synchronized (termsNotFound)
+    {
+      Collections.sort(termsNotFound, String.CASE_INSENSITIVE_ORDER);
+      return termsNotFound;
+    }
   }
 }
diff --git a/src/jalview/io/gff/SequenceOntologyFactory.java b/src/jalview/io/gff/SequenceOntologyFactory.java
new file mode 100644 (file)
index 0000000..3eaa5d1
--- /dev/null
@@ -0,0 +1,43 @@
+package jalview.io.gff;
+
+/**
+ * Holds the single shared SequenceOntologyI instance used for Sequence
+ * Ontology 'isA' queries. Access to the instance is synchronized.
+ */
+public class SequenceOntologyFactory
+{
+  /*
+   * the shared instance; created on first request unless one has been set
+   */
+  private static SequenceOntologyI instance;
+
+  /**
+   * Returns the shared instance, first creating a SequenceOntologyLite if no
+   * instance has yet been created or supplied via setInstance
+   * 
+   * @return
+   */
+  public static synchronized SequenceOntologyI getInstance()
+  {
+    if (instance == null)
+    {
+      // the hard-coded SequenceOntologyLite is the default; use setInstance
+      // to substitute another implementation
+      instance = new SequenceOntologyLite();
+    }
+    return instance;
+  }
+
+  /**
+   * Replaces the shared instance. Synchronized on the same lock as
+   * getInstance, so that the new value is visible to threads which
+   * subsequently call getInstance. Passing null causes the next call to
+   * getInstance to create a new default instance.
+   * 
+   * @param so
+   */
+  public static synchronized void setInstance(SequenceOntologyI so)
+  {
+    instance = so;
+  }
+}
diff --git a/src/jalview/io/gff/SequenceOntologyI.java b/src/jalview/io/gff/SequenceOntologyI.java
new file mode 100644 (file)
index 0000000..8128177
--- /dev/null
@@ -0,0 +1,54 @@
+package jalview.io.gff;
+
+import java.util.List;
+
+public interface SequenceOntologyI
+{
+  /*
+   * selected commonly used values for quick reference
+   */
+  public static final String POLYPEPTIDE = "polypeptide";
+
+  public static final String PROTEIN_MATCH = "protein_match";
+
+  public static final String NUCLEOTIDE_MATCH = "nucleotide_match";
+
+  // SO:0000316
+  public static final String CDS = "CDS";
+
+  // SO:0001060
+  public static final String SEQUENCE_VARIANT = "sequence_variant";
+
+  // SO:0000147
+  public static final String EXON = "exon";
+
+  // SO:0000673
+  public static final String TRANSCRIPT = "transcript";
+
+  // SO:0000704
+  public static final String GENE = "gene";
+
+  public boolean isA(String childTerm, String parentTerm);
+
+  /**
+   * Returns a sorted list of all valid terms queried for (i.e. terms processed
+   * which were valid in the SO), using the friendly description.
+   * 
+   * This can be used to check that any hard-coded stand-in for the full SO
+   * includes all the terms needed for correct processing.
+   * 
+   * @return
+   */
+  public List<String> termsFound();
+
+  /**
+   * Returns a sorted list of all invalid terms queried for (i.e. terms
+   * processed which were not found in the SO), using the friendly description.
+   * 
+   * This can be used to report any 'non-compliance' in data, and/or to report
+   * valid terms missing from any hard-coded stand-in for the full SO.
+   * 
+   * @return
+   */
+  public List<String> termsNotFound();
+}
diff --git a/src/jalview/io/gff/SequenceOntologyLite.java b/src/jalview/io/gff/SequenceOntologyLite.java
new file mode 100644 (file)
index 0000000..173dea6
--- /dev/null
@@ -0,0 +1,190 @@
+package jalview.io.gff;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * An implementation of SequenceOntologyI that hard codes terms of interest.
+ *
+ * Use this in unit testing by calling SequenceOntologyFactory.setInstance(new
+ * SequenceOntologyLite()).
+ * 
+ * May also become a stand-in for SequenceOntology in the applet if we want to
+ * avoid the additional jars needed for parsing the full SO.
+ * 
+ * @author gmcarstairs
+ *
+ */
+public class SequenceOntologyLite implements SequenceOntologyI
+{
+  /*
+   * initial selection of types of interest when processing Ensembl features
+   */
+  // @formatter:off
+  private final String[][] TERMS = new String[][] {
+
+    /*
+     * gene sub-types:
+     */
+    { "gene", "gene" }, 
+    { "ncRNA_gene", "gene" }, 
+    { "snRNA_gene", "gene" },
+    
+    /*
+     * transcript sub-types:
+     */
+    { "transcript", "transcript" }, 
+    { "mature_transcript", "transcript" }, 
+    { "ncRNA", "transcript" },
+    { "snRNA", "transcript" },
+    { "aberrant_processed_transcript", "transcript" },
+    
+    /*
+     * sequence_variant sub-types:
+     */
+    { "sequence_variant", "sequence_variant" },
+    { "feature_variant", "sequence_variant" },
+    { "gene_variant", "sequence_variant" },
+    // NB Ensembl uses NMD_transcript_variant as if a 'transcript'
+    // but we model it here correctly as per the SO
+    { "NMD_transcript_variant", "sequence_variant" },
+    { "transcript_variant", "sequence_variant" },
+    { "structural_variant", "sequence_variant" },
+    
+    /*
+     * no sub-types of exon or CDS yet encountered; add if needed
+     */
+    { "exon", "exon" },
+    { "CDS", "CDS" }
+  };
+  // @formatter:on
+
+  /*
+   * hard-coded list of any parents (direct or indirect) 
+   * that we care about for a term
+   */
+  private Map<String, List<String>> parents;
+
+  private List<String> termsFound;
+
+  private List<String> termsNotFound;
+
+  public SequenceOntologyLite()
+  {
+    termsFound = new ArrayList<String>();
+    termsNotFound = new ArrayList<String>();
+    loadStaticData();
+  }
+
+  /**
+   * Loads hard-coded data into a lookup table of {term, {list_of_parents}}
+   */
+  private void loadStaticData()
+  {
+    parents = new HashMap<String, List<String>>();
+    for (String [] pair : TERMS) {
+      List<String> p = parents.get(pair[0]);
+      if (p == null)
+      {
+        p = new ArrayList<String>();
+        parents.put(pair[0], p);
+      }
+      p.add(pair[1]);
+    }
+  }
+
+  /**
+   * Answers true if 'child' isA 'parent' (including equality). In this
+   * implementation, based only on hard-coded values.
+   */
+  @Override
+  public boolean isA(String child, String parent)
+  {
+    if (child == null || parent == null)
+    {
+      return false;
+    }
+    if (child.equals(parent))
+    {
+      termFound(child);
+      return true;
+    }
+
+    List<String> p = parents.get(child);
+    if (p == null)
+    {
+      termNotFound(child);
+      return false;
+    }
+    termFound(child);
+    if (p.contains(parent))
+    {
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * Records a valid term queried for, for reporting purposes. The duplicate
+   * check is made while holding the lock on the list, so that concurrent
+   * callers cannot both add the same term.
+   */
+  private void termFound(String term)
+  {
+    synchronized (termsFound)
+    {
+      if (!termsFound.contains(term))
+      {
+        termsFound.add(term);
+      }
+    }
+  }
+
+  /**
+   * Records an invalid term queried for, for reporting purposes
+   * 
+   * @param term
+   */
+  private void termNotFound(String term)
+  {
+    synchronized (termsNotFound)
+    {
+      if (!termsNotFound.contains(term))
+      {
+        System.out.println("SO term " + term
+                + " not known - either invalid or needs modelled in "
+                + getClass().getName());
+        termsNotFound.add(term);
+      }
+    }
+  }
+
+  /**
+   * Sorts (case-insensitive) and returns the list of valid terms queried for
+   */
+  @Override
+  public List<String> termsFound()
+  {
+    synchronized (termsFound)
+    {
+      Collections.sort(termsFound, String.CASE_INSENSITIVE_ORDER);
+      return termsFound;
+    }
+  }
+
+  /**
+   * Sorts (case-insensitive) and returns the list of invalid terms queried for
+   */
+  @Override
+  public List<String> termsNotFound()
+  {
+    synchronized (termsNotFound)
+    {
+      Collections.sort(termsNotFound, String.CASE_INSENSITIVE_ORDER);
+      return termsNotFound;
+    }
+  }
+}
index c525e95..31745e5 100644 (file)
@@ -4,14 +4,19 @@ import static org.testng.AssertJUnit.assertEquals;
 
 import jalview.datamodel.Alignment;
 import jalview.datamodel.AlignmentI;
+import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceFeature;
 import jalview.datamodel.SequenceI;
 import jalview.io.AppletFormatAdapter;
 import jalview.io.FastaFile;
 import jalview.io.FileParse;
+import jalview.io.gff.SequenceOntologyFactory;
+import jalview.io.gff.SequenceOntologyLite;
 
 import java.lang.reflect.Method;
 import java.net.MalformedURLException;
 import java.net.URL;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 
@@ -280,4 +285,82 @@ public class EnsemblSeqProxyTest
     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
     assertEquals("[C, R, T, W]", variants.toString());
   }
+  
+  /**
+   * Tests for the method that maps the subset of a dna sequence that has CDS
+   * (or subtype) feature.
+   */
+  @Test(groups = "Functional")
+  public void testGetCdsRanges()
+  {
+    EnsemblSeqProxy testee = new EnsemblSeqProxyAdapter();
+
+    SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
+    dnaSeq.createDatasetSequence();
+    SequenceI ds = dnaSeq.getDatasetSequence();
+
+    // CDS for dna 4-6
+    SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
+    ds.addSequenceFeature(sf);
+    // exon feature should be ignored here
+    sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
+    ds.addSequenceFeature(sf);
+    // CDS for dna 10-12
+    sf = new SequenceFeature("some_cds", "", 10, 12, 0f, null);
+    ds.addSequenceFeature(sf);
+
+    SequenceOntologyFactory.setInstance(new SequenceOntologyLite());
+    List<int[]> ranges = new ArrayList<int[]>();
+    int mappedLength = testee.getCdsRanges(dnaSeq, ranges);
+    assertEquals(6, mappedLength);
+    assertEquals(2, ranges.size());
+    assertEquals(4, ranges.get(0)[0]);
+    assertEquals(6, ranges.get(0)[1]);
+    assertEquals(10, ranges.get(1)[0]);
+    assertEquals(12, ranges.get(1)[1]);
+
+  }
+
+  @Test(groups = "Functional")
+  public void getGenomicRangesFromFeatures()
+  {
+
+  }
+
+  /**
+   * Tests for the method that maps the subset of a dna sequence that has CDS
+   * (or subtype) feature - case where the start codon is incomplete.
+   */
+  @Test(groups = "Functional")
+  public void testGetCdsRanges_fivePrimeIncomplete()
+  {
+    EnsemblSeqProxy testee = new EnsemblSeqProxyAdapter();
+  
+    SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
+    dnaSeq.createDatasetSequence();
+    SequenceI ds = dnaSeq.getDatasetSequence();
+  
+    // CDS for dna 5-6 (incomplete codon), 7-9
+    SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
+    sf.setPhase("2"); // skip 2 bases to start of next codon
+    ds.addSequenceFeature(sf);
+    ds.addSequenceFeature(sf);
+    // CDS for dna 13-15
+    sf = new SequenceFeature("some_cds", "", 13, 15, 0f, null);
+    ds.addSequenceFeature(sf);
+  
+    SequenceOntologyFactory.setInstance(new SequenceOntologyLite());
+    List<int[]> ranges = new ArrayList<int[]>();
+    int mappedLength = testee.getCdsRanges(dnaSeq, ranges);
+
+    /*
+     * check the mapping starts with the first complete codon
+     */
+    assertEquals(6, mappedLength);
+    assertEquals(2, ranges.size());
+    assertEquals(7, ranges.get(0)[0]);
+    assertEquals(9, ranges.get(0)[1]);
+    assertEquals(13, ranges.get(1)[0]);
+    assertEquals(15, ranges.get(1)[1]);
+  }
 }
\ No newline at end of file
index 6c9226f..f791a1e 100644 (file)
@@ -8,12 +8,12 @@ import org.testng.annotations.Test;
 
 public class SequenceOntologyTest
 {
-  private SequenceOntology so;
+  private SequenceOntologyI so;
 
   @BeforeMethod
   public void setUp() {
     long now = System.currentTimeMillis();
-    so = SequenceOntology.getInstance();
+    so = SequenceOntologyFactory.getInstance();
     long elapsed = System.currentTimeMillis() - now;
     System.out.println("Load and cache of Sequence Ontology took "
             + elapsed + "ms");
@@ -57,29 +57,6 @@ public class SequenceOntologyTest
   }
 
   @Test(groups = "Functional")
-  public void testIsProteinMatch()
-  {
-    assertTrue(so.isProteinMatch("protein_match"));
-    assertTrue(so.isProteinMatch("protein_hmm_match"));
-    assertFalse(so.isProteinMatch("Protein_match")); // case-sensitive
-  }
-
-  @Test(groups = "Functional")
-  public void testIsNucleotideMatch()
-  {
-    assertTrue(so.isNucleotideMatch("nucleotide_match"));
-    assertTrue(so.isNucleotideMatch("primer_match"));
-    assertTrue(so.isNucleotideMatch("cross_genome_match"));
-    assertTrue(so.isNucleotideMatch("expressed_sequence_match"));
-    assertTrue(so.isNucleotideMatch("translated_nucleotide_match"));
-    assertTrue(so.isNucleotideMatch("UST_match"));
-    assertTrue(so.isNucleotideMatch("RST_match"));
-    assertTrue(so.isNucleotideMatch("cDNA_match"));
-    assertTrue(so.isNucleotideMatch("EST_match"));
-    assertFalse(so.isNucleotideMatch("match")); // parent
-  }
-
-  @Test(groups = "Functional")
   public void testIsCDS()
   {
     assertTrue(so.isA("CDS", "CDS"));