JAL-1191 SequenceOntology wrapping/caching SO.OBO via BioJava library
author gmungoc <g.m.carstairs@dundee.ac.uk>
Tue, 22 Dec 2015 15:49:33 +0000 (15:49 +0000)
committer gmungoc <g.m.carstairs@dundee.ac.uk>
Tue, 22 Dec 2015 15:49:33 +0000 (15:49 +0000)
.classpath
lib/biojava-core-4.1.0.jar [new file with mode: 0644]
lib/biojava-ontology-4.1.0.jar [new file with mode: 0644]
src/jalview/io/gff/SequenceOntology.java
test/jalview/io/gff/SequenceOntologyTest.java [new file with mode: 0644]

diff --git a/.classpath b/.classpath
index 9164f3d..cad9e2b 100644 (file)
@@ -67,5 +67,7 @@
        <classpathentry kind="lib" path="lib/java-json.jar"/>
        <classpathentry kind="lib" path="lib/Jmol-14.2.14_2015.06.11.jar"/>
        <classpathentry kind="con" path="org.testng.TESTNG_CONTAINER"/>
+       <classpathentry kind="lib" path="lib/biojava-core-4.1.0.jar"/>
+       <classpathentry kind="lib" path="lib/biojava-ontology-4.1.0.jar"/>
        <classpathentry kind="output" path="classes"/>
 </classpath>
diff --git a/lib/biojava-core-4.1.0.jar b/lib/biojava-core-4.1.0.jar
new file mode 100644 (file)
index 0000000..5a09c1f
Binary files /dev/null and b/lib/biojava-core-4.1.0.jar differ
diff --git a/lib/biojava-ontology-4.1.0.jar b/lib/biojava-ontology-4.1.0.jar
new file mode 100644 (file)
index 0000000..80737d5
Binary files /dev/null and b/lib/biojava-ontology-4.1.0.jar differ
diff --git a/src/jalview/io/gff/SequenceOntology.java b/src/jalview/io/gff/SequenceOntology.java
index c437f86..8357630 100644 (file)
 package jalview.io.gff;
 
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.NoSuchElementException;
+
+import org.biojava.nbio.ontology.Ontology;
+import org.biojava.nbio.ontology.Term;
+import org.biojava.nbio.ontology.Term.Impl;
+import org.biojava.nbio.ontology.Triple;
+import org.biojava.nbio.ontology.io.OboParser;
+
 /**
  * A wrapper class that parses the Sequence Ontology and exposes useful access
- * methods
+ * methods. This version uses the BioJava parser.
  */
 public class SequenceOntology
 {
   private static SequenceOntology instance = new SequenceOntology();
 
+  private Ontology ontology;
+
+  private Term isA;
+
+  /*
+   * lookup of terms by user readable name (NB not guaranteed unique)
+   */
+  private Map<String, Term> termsByDescription;
+
+  /*
+   * Map where key is a Term and value is a (possibly empty) list of 
+   * all Terms to which the key has a direct 'isA' relationship
+   */
+  private Map<Term, List<Term>> termIsA;
+
   public static SequenceOntology getInstance()
   {
     return instance;
   }
 
   /**
-   * Private constructor to enforce use of singleton.
+   * Private constructor to enforce use of singleton. Parses and caches the SO
+   * OBO data file.
    */
   private SequenceOntology()
   {
-    // TODO: parse and cache so.obo data file e.g. using BioJava
+    termsByDescription = new HashMap<String, Term>();
+    termIsA = new HashMap<Term, List<Term>>();
+
+    OboParser parser = new OboParser();
+    InputStream inStream = null;
+    try
+    {
+      inStream = new FileInputStream(
+              "/Users/gmcarstairs/Documents/ontologies/so-xp-simple.obo");
+
+      BufferedReader oboFile = new BufferedReader(new InputStreamReader(
+              inStream));
+      ontology = parser.parseOBO(oboFile, "SO", "the SO ontology");
+      isA = ontology.getTerm("is_a");
+
+      storeTermNames();
+    } catch (Exception e)
+    {
+      e.printStackTrace();
+    } finally
+    {
+      if (inStream != null)
+      {
+        try
+        {
+          inStream.close();
+        } catch (IOException e)
+        {
+          // ignore
+        }
+      }
+    }
+  }
+
+  protected void storeTermNames()
+  {
+    for (Term term : ontology.getTerms())
+    {
+      if (term instanceof Impl)
+      {
+        String description = term.getDescription();
+        if (description != null)
+        {
+          // System.out.println(term.getName() + "=" + term.getDescription());
+          Term replaced = termsByDescription.put(description, term);
+          if (replaced != null)
+          {
+            System.err.println("Warning: " + term.getName()
+                    + " has replaced " + replaced.getName()
+                    + " for lookup of description "
+                    + description);
+          }
+        }
+      }
+    }
   }
 
   /**
@@ -30,20 +117,7 @@ public class SequenceOntology
    */
   public boolean isNucleotideMatch(String soTerm)
   {
-    // temporary until OBO parser is in place!
-    // (which should also match SO ids e.g. "SO:0000347")
-    String[] nucMatch = { "nucleotide_match", "primer_match",
-        "cross_genome_match", "expressed_sequence_match",
-        "translated_nucleotide_match", "UST_match", "RSF_match",
-        "cDNA_match", "EST_match" };
-    for (int i = 0; i < nucMatch.length; i++)
-    {
-      if (nucMatch[i].equals(soTerm))
-      {
-        return true;
-      }
-    }
-    return false;
+    return isA(soTerm, "nucleotide_match");
   }
 
   /**
@@ -55,13 +129,117 @@ public class SequenceOntology
    */
   public boolean isProteinMatch(String soTerm)
   {
-    // temporary until OBO parser is in place!
-    return "protein_match".equals(soTerm)
-            || "protein_hmm_match".equals(soTerm);
+    return isA(soTerm, "protein_match");
   }
 
   public boolean isPolypeptide(String soTerm)
   {
-    return "polypeptide".equals(soTerm);
+    return isA(soTerm, "polypeptide");
+  }
+
+  /**
+   * Returns true if the given term has a (direct or indirect) 'isA'
+   * relationship with the parent
+   * 
+   * @param child
+   * @param parent
+   * @return
+   */
+  public boolean isA(String child, String parent)
+  {
+    Term childTerm = getTerm(child);
+    Term parentTerm = getTerm(parent);
+
+    return termIsA(childTerm, parentTerm);
+  }
+
+  /**
+   * Returns true if the childTerm 'isA' parentTerm (directly or indirectly).
+   * 
+   * @param childTerm
+   * @param parentTerm
+   * @return
+   */
+  protected synchronized boolean termIsA(Term childTerm, Term parentTerm)
+  {
+    /*
+     * null child term arises from a misspelled SO description
+     */
+    if (childTerm == null || parentTerm == null)
+    {
+      return false;
+    }
+
+    /*
+     * recursive search endpoint:
+     */
+    if (childTerm == parentTerm)
+    {
+      return true;
+    }
+    /*
+     * lazy initialisation - find all of a term's parents the first
+     * time this is called, and save them in a map.
+     */
+    if (!termIsA.containsKey(childTerm))
+    {
+      findParents(childTerm);
+    }
+
+    List<Term> parents = termIsA.get(childTerm);
+    for (Term parent : parents)
+    {
+      if (termIsA(parent, parentTerm))
+      {
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  /**
+   * Finds all the 'isA' parents of the childTerm and stores them as a (possibly
+   * empty) list.
+   * 
+   * @param childTerm
+   */
+  protected synchronized void findParents(Term childTerm)
+  {
+    List<Term> result = new ArrayList<Term>();
+    for (Triple triple : ontology.getTriples(childTerm, null, isA))
+    {
+      Term parent = triple.getObject();
+      result.add(parent);
+
+      /*
+       * and search for the parent's parents recursively
+       */
+      findParents(parent);
+    }
+    termIsA.put(childTerm, result);
+  }
+
+  /**
+   * Returns the Term for a given name (e.g. "SO:0000735") or description (e.g.
+   * "sequence_location"), or null if not found.
+   * 
+   * @param nameOrDescription
+   * @return
+   */
+  protected Term getTerm(String nameOrDescription)
+  {
+    Term t = termsByDescription.get(nameOrDescription);
+    if (t == null)
+    {
+      try
+      {
+        t = ontology.getTerm(nameOrDescription);
+      } catch (NoSuchElementException e)
+      {
+        // not found
+      }
+    }
+    return t;
   }
 }
diff --git a/test/jalview/io/gff/SequenceOntologyTest.java b/test/jalview/io/gff/SequenceOntologyTest.java
new file mode 100644 (file)
index 0000000..54ab5dd
--- /dev/null
@@ -0,0 +1,71 @@
+package jalview.io.gff;
+
+import static org.testng.AssertJUnit.assertFalse;
+import static org.testng.AssertJUnit.assertTrue;
+
+import org.testng.annotations.BeforeMethod;
+import org.testng.annotations.Test;
+
+public class SequenceOntologyTest
+{
+  private SequenceOntology so;
+
+  @BeforeMethod
+  public void setUp() {
+    so = SequenceOntology.getInstance();
+  }
+
+  @Test(groups = "Functional")
+  public void testTermIsA()
+  {
+    assertTrue(so.isA("SO:0000087", "SO:0000704"));
+    assertFalse(so.isA("SO:0000704", "SO:0000087"));
+    assertTrue(so.isA("SO:0000736", "SO:0000735"));
+
+    // direct parent:
+    assertTrue(so.isA("micronuclear_sequence", "organelle_sequence"));
+    // grandparent:
+    assertTrue(so.isA("micronuclear_sequence", "sequence_location"));
+    // great-grandparent:
+    assertTrue(so.isA("micronuclear_sequence", "sequence_attribute"));
+
+    // same thing:
+    assertTrue(so.isA("micronuclear_sequence", "SO:0000084"));
+  }
+
+  @Test(groups = "Functional")
+  public void testIsProteinMatch()
+  {
+    assertTrue(so.isProteinMatch("protein_match"));
+    assertTrue(so.isProteinMatch("protein_hmm_match"));
+    assertFalse(so.isProteinMatch("Protein_match")); // case-sensitive
+  }
+
+  @Test(groups = "Functional")
+  public void testIsNucleotideMatch()
+  {
+    assertTrue(so.isNucleotideMatch("nucleotide_match"));
+    assertTrue(so.isNucleotideMatch("primer_match"));
+    assertTrue(so.isNucleotideMatch("cross_genome_match"));
+    assertTrue(so.isNucleotideMatch("expressed_sequence_match"));
+    assertTrue(so.isNucleotideMatch("translated_nucleotide_match"));
+    assertTrue(so.isNucleotideMatch("UST_match"));
+    assertTrue(so.isNucleotideMatch("RST_match"));
+    assertTrue(so.isNucleotideMatch("cDNA_match"));
+    assertTrue(so.isNucleotideMatch("EST_match"));
+    assertFalse(so.isNucleotideMatch("match")); // parent
+  }
+
+  @Test(groups = "Functional")
+  public void testIsCDS()
+  {
+    assertTrue(so.isA("CDS", "CDS"));
+    assertTrue(so.isA("CDS_predicted", "CDS"));
+    assertTrue(so.isA("transposable_element_CDS", "CDS"));
+    assertTrue(so.isA("edited_CDS", "CDS"));
+    assertTrue(so.isA("CDS_independently_known", "CDS"));
+    assertTrue(so.isA("CDS_fragment", "CDS"));
+    assertFalse(so.isA("CDS_region", "CDS"));// part_of
+    assertFalse(so.isA("polypeptide", "CDS")); // derives_from
+  }
+}