From: gmungoc Date: Wed, 24 Apr 2019 15:05:18 +0000 (+0100) Subject: JAL-3010 cache synonyms for Sequence Ontology terms X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=375d6ac2f29b2791680fcd1423aa8a2c6ff88cef;p=jalview.git JAL-3010 cache synonyms for Sequence Ontology terms --- diff --git a/src/jalview/datamodel/ontology/OntologyBase.java b/src/jalview/datamodel/ontology/OntologyBase.java index 22dc37e..b19d9bc 100644 --- a/src/jalview/datamodel/ontology/OntologyBase.java +++ b/src/jalview/datamodel/ontology/OntologyBase.java @@ -95,6 +95,7 @@ public abstract class OntologyBase implements OntologyI * candidate terms to 'capture' in ontology groupings * @return */ + @Override public Map<String, List<String>> findSequenceOntologyGroupings( String givenTerm, List<String> targetTerms) { @@ -102,7 +103,11 @@ public abstract class OntologyBase implements OntologyI Collections.sort(sortedTypes); Map<String, List<String>> parents = new HashMap<>(); - + if (!isValidTerm(givenTerm)) + { + return parents; + } + /* * method: * walk up featureType and all of its parents @@ -111,6 +116,7 @@ public abstract class OntologyBase implements OntologyI */ List<String> candidates = new ArrayList<>(); SequenceOntologyI so = SequenceOntologyFactory.getInstance(); + candidates.add(givenTerm); while (!candidates.isEmpty()) { diff --git a/src/jalview/datamodel/ontology/OntologyI.java b/src/jalview/datamodel/ontology/OntologyI.java index f82adb0..b449982 100644 --- a/src/jalview/datamodel/ontology/OntologyI.java +++ b/src/jalview/datamodel/ontology/OntologyI.java @@ -6,6 +6,14 @@ import java.util.Set; public interface OntologyI { + /** + * Answers true if the term can be identified in the ontology (possibly by id, + * description or alias), else false + * + * @param term + * @return + */ + boolean isValidTerm(String term); /** * Answers true if childTerm is the same as, or a sub-type diff --git a/src/jalview/ext/so/SequenceOntology.java b/src/jalview/ext/so/SequenceOntology.java index 3b8bad4..3a775b0 100644 --- 
a/src/jalview/ext/so/SequenceOntology.java +++ b/src/jalview/ext/so/SequenceOntology.java @@ -32,6 +32,7 @@ import java.text.ParseException; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; @@ -40,6 +41,7 @@ import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; import org.biojava.nbio.ontology.Ontology; +import org.biojava.nbio.ontology.Synonym; import org.biojava.nbio.ontology.Term; import org.biojava.nbio.ontology.Term.Impl; import org.biojava.nbio.ontology.Triple; @@ -66,7 +68,7 @@ public class SequenceOntology extends OntologyBase /* * lookup of terms by user readable name (NB not guaranteed unique) */ - private Map<String, Term> termsByDescription; + private Map<String, Term> aliases; /* * Map where key is a Term and value is a (possibly empty) list of @@ -87,7 +89,7 @@ public class SequenceOntology extends OntologyBase { termsFound = new ArrayList<>(); termsNotFound = new ArrayList<>(); - termsByDescription = new HashMap<>(); + aliases = new HashMap<>(); termIsA = new HashMap<>(); loadOntologyZipFile("so-xp-simple.obo"); @@ -161,28 +163,31 @@ public class SequenceOntology extends OntologyBase OboParser parser = new OboParser(); ontology = parser.parseOBO(oboFile, "SO", "the SO ontology"); isA = ontology.getTerm("is_a"); - storeTermNames(); + storeTermAliases(); } /** - * Stores a lookup table of terms by description. Note that description is not - * guaranteed unique. Where duplicate descriptions are found, try to discard - * the term that is flagged as obsolete. However we do store obsolete terms - * where there is no duplication of description. + * Stores a lookup table of terms by description or synonym. Note that + * description is not guaranteed unique. Where duplicate descriptions are + * found, try to discard the term that is flagged as obsolete. 
However we do + * store obsolete terms where there is no duplication of description. */ - protected void storeTermNames() + protected void storeTermAliases() { + Set<String> ambiguous = new HashSet<>(); + for (Term term : ontology.getTerms()) { if (term instanceof Impl) { + boolean newTermIsObsolete = isObsolete(term); String description = term.getDescription(); if (description != null) { - Term replaced = termsByDescription.get(description); + description = canonicalise(description); + Term replaced = aliases.get(description); if (replaced != null) { - boolean newTermIsObsolete = isObsolete(term); boolean oldTermIsObsolete = isObsolete(replaced); if (newTermIsObsolete && !oldTermIsObsolete) { @@ -203,10 +208,93 @@ public class SequenceOntology extends OntologyBase + " for lookup of '" + description + "'"); } } - termsByDescription.put(description, term); + aliases.put(description, term); + + /* + * also store synonyms if not ambiguous + */ + if (!newTermIsObsolete) + { + for (Object syn : term.getSynonyms()) + { + String name = ((Synonym) syn).getName(); + String synonym = canonicalise(name); + if (aliases.containsKey(synonym)) + { + final Term found = aliases.get(synonym); + if (found != term) + { + /* + * this alias is ambiguous - matches description, + * or an alias, of another term + */ + String msg = String.format( + "Ambiguous synonym %s for '%s:%s' and '%s:%s'", + synonym, term.getName(), term.getDescription(), + found.getName(), found.getDescription()); + System.err.println(msg); + + /* + * preserve any entry whose canonical description happens to match + * a synonym (NMD_transcript is a valid description, and also + * a synonym for NMD_transcript_variant) + * also preserve a parent (more general) term + */ + if (synonym.equals(canonicalise(found.getDescription())) + || termIsA(term, found)) + { + // leave it alone + } + /* + * replace a specialised term with a more general one + * with the same alias + */ + // else if + // 
(synonym.equals(canonicalise(term.getDescription()))) + else if (termIsA(found, term)) + { + aliases.put(synonym, term); + } + else + { + ambiguous.add(synonym); + } + } + } + else + { + aliases.put(synonym, term); + } + } + } } } } + + /* + * remove ambiguous synonyms for safety; + * problem: what if a synonym matches a description? + * only one case found: + * nmd_transcript is synonym for SO:0001621:NMD_transcript_variant + * and also the description for SO:0002114:NMD_transcript + */ + for (String syn : ambiguous) + { + aliases.remove(syn); + } + } + + /** + * Converts a string to lower case and changes hyphens and spaces to + * underscores + * + * @param s + * @return + */ + static String canonicalise(String s) + { + return s == null ? null + : s.toLowerCase().replace('-', '_').replace(' ', '_'); } /** @@ -423,14 +511,18 @@ public class SequenceOntology extends OntologyBase /** * Returns the Term for a given name (e.g. "SO:0000735") or description (e.g. - * "sequence_location"), or null if not found. + * "sequence_location"), or alias, or null if not found * * @param child * @return */ - protected Term getTerm(String nameOrDescription) + protected Term getTerm(final String nameOrDescription) { - Term t = termsByDescription.get(nameOrDescription); + if (nameOrDescription == null) + { + return null; + } + Term t = aliases.get(canonicalise(nameOrDescription)); if (t == null) { try @@ -568,4 +660,10 @@ public class SequenceOntology extends OntologyBase } return parents; } + + @Override + public boolean isValidTerm(String term) + { + return getTerm(term) != null; + } } diff --git a/src/jalview/io/gff/SequenceOntologyLite.java b/src/jalview/io/gff/SequenceOntologyLite.java index 6abb5d6..5722159 100644 --- a/src/jalview/io/gff/SequenceOntologyLite.java +++ b/src/jalview/io/gff/SequenceOntologyLite.java @@ -313,4 +313,10 @@ public class SequenceOntologyLite extends OntologyBase List result = parents.get(term); return result == null ? 
new ArrayList<>() : result; } + + @Override + public boolean isValidTerm(String term) + { + return parents.containsKey(term); + } } diff --git a/test/jalview/datamodel/ontology/OntologyBaseTest.java b/test/jalview/datamodel/ontology/OntologyBaseTest.java index 937d260..64812c3 100644 --- a/test/jalview/datamodel/ontology/OntologyBaseTest.java +++ b/test/jalview/datamodel/ontology/OntologyBaseTest.java @@ -29,7 +29,7 @@ public class OntologyBaseTest } @Test(groups = "Functional") - public void testfindSequenceOntologyGroupings() + public void testfindSequenceOntologyGroupings_nucleotide() { /* * typical gnomAD feature types, plus the top level 'sequence_variant' as in dbSNP @@ -42,11 +42,7 @@ public class OntologyBaseTest "missense_variant"); /* - * for stop_gained: - * transcript_variant further adds 5_prime_UTR_variant, - * non_coding_transcript_exon_variant, synonymous_variant, splice_region_variant - * feature_variant further adds upstream_gene_variant - * sequence_variant further adds sequence_variant + * hierarchy from stop_gained */ Map<String, List<String>> map = SequenceOntologyFactory.getInstance() .findSequenceOntologyGroupings("stop_gained", featureTypes); @@ -145,4 +141,24 @@ public class OntologyBaseTest + "stop_lost, synonymous_variant, upstream_gene_variant]"); } + @Test(groups = "Functional") + public void testfindSequenceOntologyGroupings_peptide() + { + /* + * typical Uniprot feature types + */ + List<String> featureTypes = Arrays.asList("BETA-TURN-IR", "NEST-RL", + "BETA-BULGE", "ALPHA-BETA-MOTIF", "ASX-TURN-IR", + "GAMMA-TURN-CLASSIC", "GAMMA-TURN-INVERSE", "BETA-TURN-IL", + "BETA-TURN-IIL"); + + /* + * hierarchy from GAMMA-TURN-INVERSE + * this is a synonym for + */ + Map<String, List<String>> map = SequenceOntologyFactory.getInstance() + .findSequenceOntologyGroupings("TURN", featureTypes); + assertEquals(map.size(), 10); + } + } diff --git a/test/jalview/ext/so/SequenceOntologyTest.java b/test/jalview/ext/so/SequenceOntologyTest.java index 7eb01c9..260b011 100644 --- 
a/test/jalview/ext/so/SequenceOntologyTest.java +++ b/test/jalview/ext/so/SequenceOntologyTest.java @@ -22,10 +22,11 @@ package jalview.ext.so; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotNull; import static org.testng.Assert.assertNull; +import static org.testng.Assert.assertSame; import static org.testng.Assert.assertTrue; -import jalview.datamodel.ontology.OntologyI; import jalview.gui.JvOptionPane; import java.util.Arrays; @@ -35,6 +36,8 @@ import java.util.List; import java.util.Map; import java.util.Set; +import org.biojava.nbio.ontology.Synonym; +import org.biojava.nbio.ontology.Term; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -50,7 +53,7 @@ public class SequenceOntologyTest JvOptionPane.setMockResponse(JvOptionPane.CANCEL_OPTION); } - private OntologyI so; + private SequenceOntology so; @BeforeClass(alwaysRun = true) public void setUp() @@ -236,4 +239,83 @@ public class SequenceOntologyTest assertEquals(parents.size(), 1); assertTrue(parents.contains("sequence_feature")); } + + @Test(groups = "Functional") + public void testGetTerm() + { + assertNull(so.getTerm(null)); + assertNull(so.getTerm("!*£&!")); + + Term t = so.getTerm("SO:0000084"); + assertNotNull(t); + assertEquals(t.getDescription(), "micronuclear_sequence"); + // name lookup is case sensitive + assertNull(so.getTerm("so:0000084")); + + t = so.getTerm("alpha_helix"); + assertNotNull(t); + Object[] synonyms = t.getSynonyms(); + assertEquals(synonyms.length, 2); + assertEquals(((Synonym) synonyms[0]).getName(), "a-helix"); + assertEquals(((Synonym) synonyms[1]).getName(), "helix"); + // case-insensitive description lookup + Term t2 = so.getTerm("ALPHA_HELIX"); + assertSame(t, t2); + // can also retrieve by synonym + t2 = so.getTerm("a-helix"); + assertSame(t, t2); + + t = so.getTerm("serine_threonine_motif"); + t2 = so.getTerm("ST-MOTIF"); // synonym is "st_motif" + 
assertNotNull(t); + assertSame(t, t2); + + /* + * if a synonym is ambiguous within a hierarchy, + * we keep it for the most general term (always valid) + * helix is a synonym for + * alpha_helix (isA) right_handed_peptide_helix (isA) peptide_helix + * motif is a synonym for polypeptide_conserved_motif (isA) polypeptide_motif + * + */ + t = so.getTerm("helix"); + assertNotNull(t); + assertEquals(t.getDescription(), "peptide_helix"); + t = so.getTerm("motif"); + assertNotNull(t); + assertEquals(t.getDescription(), "polypeptide_motif"); + + /* + * ambiguous synonyms with no mutual hierarchy are not cached + * 'sequence variation' is a synonym for + * sequence_alteration SO:0001059 + * alternate_sequence_site SO:0001149 + * and these have no 'isA' relationship + */ + assertNull(so.getTerm("sequence_variation")); + + /* + * nmd_transcript is synonym for SO:0001621:NMD_transcript_variant + * and also the description for SO:0002114:NMD_transcript + * since v3.1 of so-simple.obo + */ + t = so.getTerm("SO:0002114"); + assertNotNull(t); + t2 = so.getTerm("SO:0001621"); + assertNotNull(t2); + assertSame(t, so.getTerm("nmd_transcript")); + assertSame(t2, so.getTerm("nmd_transcript_variant")); + } + + @Test(groups = "Functional") + public void testCanonicalise() + { + assertNull(SequenceOntology.canonicalise(null)); + assertEquals(SequenceOntology.canonicalise("A-b_c"), "a_b_c"); + assertEquals(SequenceOntology.canonicalise("A-b-C"), "a_b_c"); + assertEquals(SequenceOntology.canonicalise("metal binding site"), "metal_binding_site"); + String s = "thisought_nottobe_modified!"; + String s2 = SequenceOntology.canonicalise(s); + assertSame(s, s2); + } }