From: gmungoc Date: Tue, 22 Dec 2015 15:49:33 +0000 (+0000) Subject: JAL-1191 SequenceOntology wrapping/caching SO.OBO via BioJava library X-Git-Tag: Release_2_10_0~296^2~97 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=2447e0f9b158c45152803a91e3e17866bd676d4f;p=jalview.git JAL-1191 SequenceOntology wrapping/caching SO.OBO via BioJava library --- diff --git a/.classpath b/.classpath index 9164f3d..cad9e2b 100644 --- a/.classpath +++ b/.classpath @@ -67,5 +67,7 @@ + + diff --git a/lib/biojava-core-4.1.0.jar b/lib/biojava-core-4.1.0.jar new file mode 100644 index 0000000..5a09c1f Binary files /dev/null and b/lib/biojava-core-4.1.0.jar differ diff --git a/lib/biojava-ontology-4.1.0.jar b/lib/biojava-ontology-4.1.0.jar new file mode 100644 index 0000000..80737d5 Binary files /dev/null and b/lib/biojava-ontology-4.1.0.jar differ diff --git a/src/jalview/io/gff/SequenceOntology.java b/src/jalview/io/gff/SequenceOntology.java index c437f86..8357630 100644 --- a/src/jalview/io/gff/SequenceOntology.java +++ b/src/jalview/io/gff/SequenceOntology.java @@ -1,24 +1,111 @@ package jalview.io.gff; +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; + +import org.biojava.nbio.ontology.Ontology; +import org.biojava.nbio.ontology.Term; +import org.biojava.nbio.ontology.Term.Impl; +import org.biojava.nbio.ontology.Triple; +import org.biojava.nbio.ontology.io.OboParser; + /** * A wrapper class that parses the Sequence Ontology and exposes useful access - * methods + * methods. This version uses the BioJava parser. */ public class SequenceOntology { private static SequenceOntology instance = new SequenceOntology(); + private Ontology ontology; + + private Term isA; + + /* + * lookup of terms by user readable name (NB not guaranteed unique) + */ + private Map termsByDescription; + + /* + * Map where key is a Term and value is a (possibly empty) list of + * all Terms to which the key has a direct 'isA' relationship + */ + private Map> termIsA; + public static SequenceOntology getInstance() { return instance; } /** - * Private constructor to enforce use of singleton. + * Private constructor to enforce use of singleton. Parses and caches the SO + * OBO data file. */ private SequenceOntology() { - // TODO: parse and cache so.obo data file e.g. using BioJava + termsByDescription = new HashMap(); + termIsA = new HashMap>(); + + OboParser parser = new OboParser(); + InputStream inStream = null; + try + { + inStream = new FileInputStream( + "/Users/gmcarstairs/Documents/ontologies/so-xp-simple.obo"); + + BufferedReader oboFile = new BufferedReader(new InputStreamReader( + inStream)); + ontology = parser.parseOBO(oboFile, "SO", "the SO ontology"); + isA = ontology.getTerm("is_a"); + + storeTermNames(); + } catch (Exception e) + { + e.printStackTrace(); + } finally + { + if (inStream != null) + { + try + { + inStream.close(); + } catch (IOException e) + { + // ignore + } + } + } + } + + protected void storeTermNames() + { + for (Term term : ontology.getTerms()) + { + if (term instanceof Impl) + { + String description = term.getDescription(); + if (description != null) + { + // System.out.println(term.getName() + "=" + term.getDescription()); + Term replaced = termsByDescription.put(description, term); + if (replaced != null) + { + System.err.println("Warning: " + term.getName() + + " has replaced " + replaced.getName() + + " for lookup of description " + + description); + } + } + } + } } /** @@ -30,20 +117,7 @@ public class SequenceOntology */ public boolean isNucleotideMatch(String soTerm) { - // temporary until OBO parser is in place! - // (which should also match SO ids e.g. "SO:0000347") - String[] nucMatch = { "nucleotide_match", "primer_match", - "cross_genome_match", "expressed_sequence_match", - "translated_nucleotide_match", "UST_match", "RSF_match", - "cDNA_match", "EST_match" }; - for (int i = 0; i < nucMatch.length; i++) - { - if (nucMatch[i].equals(soTerm)) - { - return true; - } - } - return false; + return isA(soTerm, "nucleotide_match"); } /** @@ -55,13 +129,117 @@ public class SequenceOntology */ public boolean isProteinMatch(String soTerm) { - // temporary until OBO parser is in place! - return "protein_match".equals(soTerm) - || "protein_hmm_match".equals(soTerm); + return isA(soTerm, "protein_match"); } public boolean isPolypeptide(String soTerm) { - return "polypeptide".equals(soTerm); + return isA(soTerm, "polypeptide"); + } + + /** + * Returns true if the given term has a (direct or indirect) 'isA' + * relationship with the parent + * + * @param child + * @param parent + * @return + */ + public boolean isA(String child, String parent) + { + Term childTerm = getTerm(child); + Term parentTerm = getTerm(parent); + + return termIsA(childTerm, parentTerm); + } + + /** + * Returns true if the childTerm 'isA' parentTerm (directly or indirectly). + * + * @param childTerm + * @param parentTerm + * @return + */ + protected synchronized boolean termIsA(Term childTerm, Term parentTerm) + { + /* + * null child term arises from a misspelled SO description + */ + if (childTerm == null || parentTerm == null) + { + return false; + } + + /* + * recursive search endpoint: + */ + if (childTerm == parentTerm) + { + return true; + } + /* + * lazy initialisation - find all of a term's parents the first + * time this is called, and save them in a map. + */ + if (!termIsA.containsKey(childTerm)) + { + findParents(childTerm); + } + + List parents = termIsA.get(childTerm); + for (Term parent : parents) + { + if (termIsA(parent, parentTerm)) + { + return true; + } + } + + return false; + } + + /** + * Finds all the 'isA' parents of the childTerm and stores them as a (possibly + * empty) list. + * + * @param childTerm + */ + protected synchronized void findParents(Term childTerm) + { + List result = new ArrayList(); + for (Triple triple : ontology.getTriples(childTerm, null, isA)) + { + Term parent = triple.getObject(); + result.add(parent); + + /* + * and search for the parent's parents recursively + */ + findParents(parent); + } + termIsA.put(childTerm, result); + } + + /** + * Returns the Term for a given name (e.g. "SO:0000735") or description (e.g. + * "sequence_location"), or null if not found. + * + * @param child + * @return + */ + protected Term getTerm(String nameOrDescription) + { + Term t = termsByDescription.get(nameOrDescription); + if (t == null) + { + try + { + t = ontology.getTerm(nameOrDescription); + } catch (NoSuchElementException e) + { + // not found + } + } + return t; } } diff --git a/test/jalview/io/gff/SequenceOntologyTest.java b/test/jalview/io/gff/SequenceOntologyTest.java new file mode 100644 index 0000000..54ab5dd --- /dev/null +++ b/test/jalview/io/gff/SequenceOntologyTest.java @@ -0,0 +1,71 @@ +package jalview.io.gff; + +import static org.testng.AssertJUnit.assertFalse; +import static org.testng.AssertJUnit.assertTrue; + +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +public class SequenceOntologyTest +{ + private SequenceOntology so; + + @BeforeMethod + public void setUp() { + so = SequenceOntology.getInstance(); + } + + @Test(groups = "Functional") + public void testTermIsA() + { + assertTrue(so.isA("SO:0000087", "SO:0000704")); + assertFalse(so.isA("SO:0000704", "SO:0000087")); + assertTrue(so.isA("SO:0000736", "SO:0000735")); + + // direct parent: + assertTrue(so.isA("micronuclear_sequence", "organelle_sequence")); + // grandparent: + assertTrue(so.isA("micronuclear_sequence", "sequence_location")); + // great-grandparent: + assertTrue(so.isA("micronuclear_sequence", "sequence_attribute")); + + // same thing: + assertTrue(so.isA("micronuclear_sequence", "SO:0000084")); + } + + @Test(groups = "Functional") + public void testIsProteinMatch() + { + assertTrue(so.isProteinMatch("protein_match")); + assertTrue(so.isProteinMatch("protein_hmm_match")); + assertFalse(so.isProteinMatch("Protein_match")); // case-sensitive + } + + @Test(groups = "Functional") + public void testIsNucleotideMatch() + { + assertTrue(so.isNucleotideMatch("nucleotide_match")); + assertTrue(so.isNucleotideMatch("primer_match")); + assertTrue(so.isNucleotideMatch("cross_genome_match")); + assertTrue(so.isNucleotideMatch("expressed_sequence_match")); + assertTrue(so.isNucleotideMatch("translated_nucleotide_match")); + assertTrue(so.isNucleotideMatch("UST_match")); + assertTrue(so.isNucleotideMatch("RST_match")); + assertTrue(so.isNucleotideMatch("cDNA_match")); + assertTrue(so.isNucleotideMatch("EST_match")); + assertFalse(so.isNucleotideMatch("match")); // parent + } + + @Test(groups = "Functional") + public void testIsCDS() + { + assertTrue(so.isA("CDS", "CDS")); + assertTrue(so.isA("CDS_predicted", "CDS")); + assertTrue(so.isA("transposable_element_CDS", "CDS")); + assertTrue(so.isA("edited_CDS", "CDS")); + assertTrue(so.isA("CDS_independently_known", "CDS")); + assertTrue(so.isA("CDS_fragment", "CDS")); + assertFalse(so.isA("CDS_region", "CDS"));// part_of + assertFalse(so.isA("polypeptide", "CDS")); // derives_from + } +}