From cd61435e058e0bf9b60292149c20299be7b4cf1f Mon Sep 17 00:00:00 2001 From: gmungoc Date: Fri, 1 Jun 2018 19:29:30 +0100 Subject: [PATCH] JAL-3010 refactoring to support feature type grouping by ontology --- src/jalview/datamodel/ontology/OntologyBase.java | 73 +++++++++++++++++++++ src/jalview/datamodel/ontology/OntologyI.java | 61 +++++++++++++++++ src/jalview/ext/so/SequenceOntology.java | 14 ++-- src/jalview/io/gff/SequenceOntologyI.java | 28 +------- src/jalview/io/gff/SequenceOntologyLite.java | 8 ++- test/jalview/ext/so/SequenceOntologyTest.java | 53 +++++++++++++-- test/jalview/io/gff/SequenceOntologyLiteTest.java | 62 +++++++++++++++-- 7 files changed, 258 insertions(+), 41 deletions(-) create mode 100644 src/jalview/datamodel/ontology/OntologyBase.java create mode 100644 src/jalview/datamodel/ontology/OntologyI.java diff --git a/src/jalview/datamodel/ontology/OntologyBase.java b/src/jalview/datamodel/ontology/OntologyBase.java new file mode 100644 index 0000000..25dae22 --- /dev/null +++ b/src/jalview/datamodel/ontology/OntologyBase.java @@ -0,0 +1,73 @@ +package jalview.datamodel.ontology; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * A base class for models of Sequence Ontology and others + * + * @author gmcarstairs + * + */ +public abstract class OntologyBase implements OntologyI +{ + @Override + public Set getParentTerms(Set terms) + { + Set parents = new HashSet<>(terms); + + boolean childRemoved = true; + while (childRemoved) + { + childRemoved = removeChild(parents); + } + return parents; + } + + /** + * Removes the first term in the given set found which is a child of another + * term in the set. Answers true if a child was found and removed, else false. + * + * @param terms + * @return + */ + boolean removeChild(Set terms) + { + for (String t1 : terms) + { + for (String t2 : terms) + { + if (t1 != t2) + { + if (isA(t1, t2)) + { + terms.remove(t1); + return true; + } + if (isA(t2, t1)) + { + terms.remove(t2); + return true; + } + } + } + } + return false; + } + + @Override + public List getChildTerms(String parent, List terms) + { + List children = new ArrayList<>(); + for (String term : terms) + { + if (!term.equals(parent) && isA(term, parent)) + { + children.add(term); + } + } + return children; + } +} diff --git a/src/jalview/datamodel/ontology/OntologyI.java b/src/jalview/datamodel/ontology/OntologyI.java new file mode 100644 index 0000000..545a3c7 --- /dev/null +++ b/src/jalview/datamodel/ontology/OntologyI.java @@ -0,0 +1,61 @@ +package jalview.datamodel.ontology; + +import java.util.List; +import java.util.Set; + +public interface OntologyI +{ + + /** + * Answers true if childTerm is the same as, or a sub-type + * (specialisation of) parentTerm, else false + * + * @param childTerm + * @param parentTerm + * @return + */ + boolean isA(String childTerm, String parentTerm); + + /** + * Answers those terms in the given set which are not child terms of some + * other term in the set. That is, returns a set of parent terms. The input + * set is not modified. + * + * @param terms + * @return + */ + Set getParentTerms(Set terms); + + /** + * Answers a (possibly empty) list of those terms in the supplied list which + * are a child (directly or indirectly) of parent. The parent + * term itself is not included (even if in the input list) + * + * @param parent + * @param terms + * @return + */ + List getChildTerms(String parent, List terms); + + /** + * Returns a sorted list of all valid terms queried for (i.e. terms processed + * which were valid in the SO), using the friendly description. + * + * This can be used to check that any hard-coded stand-in for the full SO + * includes all the terms needed for correct processing. + * + * @return + */ + List termsFound(); + + /** + * Returns a sorted list of all invalid terms queried for (i.e. terms + * processed which were not found in the SO), using the friendly description. + * + * This can be used to report any 'non-compliance' in data, and/or to report + * valid terms missing from any hard-coded stand-in for the full SO. + * + * @return + */ + List termsNotFound(); +} \ No newline at end of file diff --git a/src/jalview/ext/so/SequenceOntology.java b/src/jalview/ext/so/SequenceOntology.java index 0d631e6..7842294 100644 --- a/src/jalview/ext/so/SequenceOntology.java +++ b/src/jalview/ext/so/SequenceOntology.java @@ -20,6 +20,7 @@ */ package jalview.ext.so; +import jalview.datamodel.ontology.OntologyBase; import jalview.io.gff.SequenceOntologyI; import java.io.BufferedInputStream; @@ -48,7 +49,8 @@ import org.biojava.nbio.ontology.utils.Annotation; * A wrapper class that parses the Sequence Ontology and exposes useful access * methods. This version uses the BioJava parser. */ -public class SequenceOntology implements SequenceOntologyI +public class SequenceOntology extends OntologyBase + implements SequenceOntologyI { /* * the parsed Ontology data as modelled by BioJava @@ -82,10 +84,10 @@ public class SequenceOntology implements SequenceOntologyI */ public SequenceOntology() { - termsFound = new ArrayList(); - termsNotFound = new ArrayList(); - termsByDescription = new HashMap(); - termIsA = new HashMap>(); + termsFound = new ArrayList<>(); + termsNotFound = new ArrayList<>(); + termsByDescription = new HashMap<>(); + termIsA = new HashMap<>(); loadOntologyZipFile("so-xp-simple.obo"); } @@ -404,7 +406,7 @@ public class SequenceOntology implements SequenceOntologyI */ protected synchronized void findParents(Term childTerm) { - List result = new ArrayList(); + List result = new ArrayList<>(); for (Triple triple : ontology.getTriples(childTerm, null, isA)) { Term parent = triple.getObject(); diff --git a/src/jalview/io/gff/SequenceOntologyI.java b/src/jalview/io/gff/SequenceOntologyI.java index 307e1d1..e9b9923 100644 --- a/src/jalview/io/gff/SequenceOntologyI.java +++ b/src/jalview/io/gff/SequenceOntologyI.java @@ -20,9 +20,9 @@ */ package jalview.io.gff; -import java.util.List; +import jalview.datamodel.ontology.OntologyI; -public interface SequenceOntologyI +public interface SequenceOntologyI extends OntologyI { /* * selected commonly used values for quick reference @@ -62,28 +62,4 @@ public interface SequenceOntologyI // SO:0000704 public static final String GENE = "gene"; - - public boolean isA(String childTerm, String parentTerm); - - /** - * Returns a sorted list of all valid terms queried for (i.e. terms processed - * which were valid in the SO), using the friendly description. - * - * This can be used to check that any hard-coded stand-in for the full SO - * includes all the terms needed for correct processing. - * - * @return - */ - public List termsFound(); - - /** - * Returns a sorted list of all invalid terms queried for (i.e. terms - * processed which were not found in the SO), using the friendly description. - * - * This can be used to report any 'non-compliance' in data, and/or to report - * valid terms missing from any hard-coded stand-in for the full SO. - * - * @return - */ - public List termsNotFound(); } diff --git a/src/jalview/io/gff/SequenceOntologyLite.java b/src/jalview/io/gff/SequenceOntologyLite.java index 72e906c..670d887 100644 --- a/src/jalview/io/gff/SequenceOntologyLite.java +++ b/src/jalview/io/gff/SequenceOntologyLite.java @@ -20,6 +20,8 @@ */ package jalview.io.gff; +import jalview.datamodel.ontology.OntologyBase; + import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; @@ -38,7 +40,8 @@ import java.util.Map; * @author gmcarstairs * */ -public class SequenceOntologyLite implements SequenceOntologyI +public class SequenceOntologyLite extends OntologyBase + implements SequenceOntologyI { /* * initial selection of types of interest when processing Ensembl features @@ -80,8 +83,11 @@ public class SequenceOntologyLite implements SequenceOntologyI { "sequence_variant", "sequence_variant" }, { "structural_variant", "sequence_variant" }, { "feature_variant", "sequence_variant" }, + { "upstream_gene_variant", "sequence_variant" }, { "gene_variant", "sequence_variant" }, { "transcript_variant", "sequence_variant" }, + { "non_coding_transcript_variant", "sequence_variant" }, + { "non_coding_transcript_exon_variant", "sequence_variant" }, // NB Ensembl uses NMD_transcript_variant as if a 'transcript' // but we model it here correctly as per the SO { "NMD_transcript_variant", "sequence_variant" }, diff --git a/test/jalview/ext/so/SequenceOntologyTest.java b/test/jalview/ext/so/SequenceOntologyTest.java index 31e1887..c7776a3 100644 --- a/test/jalview/ext/so/SequenceOntologyTest.java +++ b/test/jalview/ext/so/SequenceOntologyTest.java @@ -20,11 +20,18 @@ */ package jalview.ext.so; -import static org.testng.AssertJUnit.assertFalse; -import static org.testng.AssertJUnit.assertTrue; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; +import jalview.datamodel.ontology.OntologyI; import jalview.gui.JvOptionPane; -import jalview.io.gff.SequenceOntologyI; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -39,7 +46,7 @@ public class SequenceOntologyTest JvOptionPane.setMockResponse(JvOptionPane.CANCEL_OPTION); } - private SequenceOntologyI so; + private OntologyI so; @BeforeClass(alwaysRun = true) public void setUp() @@ -132,4 +139,42 @@ public class SequenceOntologyTest assertTrue(so.isA("inframe_deletion", "sequence_variant")); assertTrue(so.isA("inframe_insertion", "sequence_variant")); } + + @Test(groups = "Functional") + public void testGetChildTerms() + { + List terms = Collections. emptyList(); + List children = so.getChildTerms("exon", terms); + assertTrue(children.isEmpty()); + + terms = Arrays.asList("gene", "transcript", "snRNA", "junk", "mRNA"); + children = so.getChildTerms("exon", terms); + assertTrue(children.isEmpty()); + children = so.getChildTerms("transcript", terms); + assertEquals(children.size(), 2); + assertTrue(children.contains("snRNA")); + assertTrue(children.contains("mRNA")); + + terms = Arrays.asList("gene", "transcript", "synonymous_variant", + "stop_lost", "chain"); + children = so.getChildTerms("sequence_variant", terms); + assertEquals(children.size(), 2); + assertTrue(children.contains("synonymous_variant")); + assertTrue(children.contains("stop_lost")); + } + + @Test(groups = "Functional") + public void testGetParentTerms() + { + Set terms = new HashSet<>(); + terms.add("sequence_variant"); + terms.add("NMD_transcript_variant"); + terms.add("stop_lost"); + terms.add("chain"); // not an SO term + + Set parents = so.getParentTerms(terms); + assertEquals(parents.size(), 2); + assertTrue(parents.contains("sequence_variant")); + assertTrue(parents.contains("chain")); + } } diff --git a/test/jalview/io/gff/SequenceOntologyLiteTest.java b/test/jalview/io/gff/SequenceOntologyLiteTest.java index 0766666..3076f96 100644 --- a/test/jalview/io/gff/SequenceOntologyLiteTest.java +++ b/test/jalview/io/gff/SequenceOntologyLiteTest.java @@ -1,17 +1,33 @@ package jalview.io.gff; -import static org.testng.AssertJUnit.assertFalse; -import static org.testng.AssertJUnit.assertTrue; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; +import jalview.datamodel.ontology.OntologyI; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; public class SequenceOntologyLiteTest { + private OntologyI so; + + @BeforeClass(alwaysRun = true) + public void setUp() + { + so = new SequenceOntologyLite(); + } + @Test(groups = "Functional") public void testIsA_sequenceVariant() { - SequenceOntologyI so = new SequenceOntologyLite(); - assertFalse(so.isA("CDS", "sequence_variant")); assertTrue(so.isA("sequence_variant", "sequence_variant")); @@ -34,4 +50,42 @@ public class SequenceOntologyLiteTest assertTrue(so.isA("inframe_insertion", "sequence_variant")); assertTrue(so.isA("splice_region_variant", "sequence_variant")); } + + @Test(groups = "Functional") + public void testGetParentTerms() + { + Set terms = new HashSet<>(); + terms.add("sequence_variant"); + terms.add("NMD_transcript_variant"); + terms.add("stop_lost"); + terms.add("chain"); // not an SO term + + Set parents = so.getParentTerms(terms); + assertEquals(parents.size(), 2); + assertTrue(parents.contains("sequence_variant")); + assertTrue(parents.contains("chain")); + } + + @Test(groups = "Functional") + public void testGetChildTerms() + { + List terms = Collections. emptyList(); + List children = so.getChildTerms("exon", terms); + assertTrue(children.isEmpty()); + + terms = Arrays.asList("gene", "transcript", "snRNA", "junk", "mRNA"); + children = so.getChildTerms("exon", terms); + assertTrue(children.isEmpty()); + children = so.getChildTerms("transcript", terms); + assertEquals(children.size(), 2); + assertTrue(children.contains("snRNA")); + assertTrue(children.contains("mRNA")); + + terms = Arrays.asList("gene", "transcript", "synonymous_variant", + "stop_lost", "chain"); + children = so.getChildTerms("sequence_variant", terms); + assertEquals(children.size(), 2); + assertTrue(children.contains("synonymous_variant")); + assertTrue(children.contains("stop_lost")); + } } -- 1.7.10.2