* candidate terms to 'capture' in ontology groupings
* @return
*/
+ @Override
public Map<String, List<String>> findSequenceOntologyGroupings(
String givenTerm, List<String> targetTerms)
{
Collections.sort(sortedTypes);
Map<String, List<String>> parents = new HashMap<>();
-
+ if (!isValidTerm(givenTerm))
+ {
+ return parents;
+ }
+
/*
* method:
* walk up featureType and all of its parents
*/
List<String> candidates = new ArrayList<>();
SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+
candidates.add(givenTerm);
while (!candidates.isEmpty())
{
public interface OntologyI
{
+ /**
+ * Answers true if the term can be identified in the ontology (possibly by id,
+ * description or alias), else false
+ *
+ * @param term
+ * @return
+ */
+ boolean isValidTerm(String term);
/**
* Answers true if <code>childTerm</code> is the same as, or a sub-type
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.zip.ZipInputStream;
import org.biojava.nbio.ontology.Ontology;
+import org.biojava.nbio.ontology.Synonym;
import org.biojava.nbio.ontology.Term;
import org.biojava.nbio.ontology.Term.Impl;
import org.biojava.nbio.ontology.Triple;
/*
* lookup of terms by user readable name (NB not guaranteed unique)
*/
- private Map<String, Term> termsByDescription;
+ private Map<String, Term> aliases;
/*
* Map where key is a Term and value is a (possibly empty) list of
{
termsFound = new ArrayList<>();
termsNotFound = new ArrayList<>();
- termsByDescription = new HashMap<>();
+ aliases = new HashMap<>();
termIsA = new HashMap<>();
loadOntologyZipFile("so-xp-simple.obo");
OboParser parser = new OboParser();
ontology = parser.parseOBO(oboFile, "SO", "the SO ontology");
isA = ontology.getTerm("is_a");
- storeTermNames();
+ storeTermAliases();
}
/**
- * Stores a lookup table of terms by description. Note that description is not
- * guaranteed unique. Where duplicate descriptions are found, try to discard
- * the term that is flagged as obsolete. However we do store obsolete terms
- * where there is no duplication of description.
+ * Stores a lookup table of terms by description or synonym. Note that
+ * description is not guaranteed unique. Where duplicate descriptions are
+ * found, try to discard the term that is flagged as obsolete. However we do
+ * store obsolete terms where there is no duplication of description.
*/
- protected void storeTermNames()
+ protected void storeTermAliases()
{
+ Set<String> ambiguous = new HashSet<>();
+
for (Term term : ontology.getTerms())
{
if (term instanceof Impl)
{
+ boolean newTermIsObsolete = isObsolete(term);
String description = term.getDescription();
if (description != null)
{
- Term replaced = termsByDescription.get(description);
+ description = canonicalise(description);
+ Term replaced = aliases.get(description);
if (replaced != null)
{
- boolean newTermIsObsolete = isObsolete(term);
boolean oldTermIsObsolete = isObsolete(replaced);
if (newTermIsObsolete && !oldTermIsObsolete)
{
+ " for lookup of '" + description + "'");
}
}
- termsByDescription.put(description, term);
+ aliases.put(description, term);
+
+ /*
+ * also store synonyms if not ambiguous
+ */
+ if (!newTermIsObsolete)
+ {
+ for (Object syn : term.getSynonyms())
+ {
+ String name = ((Synonym) syn).getName();
+ String synonym = canonicalise(name);
+ if (aliases.containsKey(synonym))
+ {
+ final Term found = aliases.get(synonym);
+ if (found != term)
+ {
+ /*
+ * this alias is ambiguous - matches description,
+ * or an alias, of another term
+ */
+ String msg = String.format(
+ "Ambiguous synonym %s for '%s:%s' and '%s:%s'",
+ synonym, term.getName(), term.getDescription(),
+ found.getName(), found.getDescription());
+ System.err.println(msg);
+
+ /*
+ * preserve any entry whose canonical description happens to match
+ * a synonym (NMD_transcript is a valid description, and also
+ * a synonym for NMD_transcript_variant)
+ * also preserve a parent (more general) term
+ */
+ if (synonym.equals(canonicalise(found.getDescription()))
+ || termIsA(term, found))
+ {
+ // leave it alone
+ }
+ /*
+ * replace a specialised term with a more general one
+ * with the same alias
+ */
+ // else if
+ // (synonym.equals(canonicalise(term.getDescription())))
+ else if (termIsA(found, term))
+ {
+ aliases.put(synonym, term);
+ }
+ else
+ {
+ ambiguous.add(synonym);
+ }
+ }
+ }
+ else
+ {
+ aliases.put(synonym, term);
+ }
+ }
+ }
}
}
}
+
+ /*
+ * remove ambiguous synonyms for safety;
+ * problem: what if a synonym matches a description?
+ * only one case found:
+ * nmd_transcript is synonym for SO:0001621:NMD_transcript_variant
+ * and also the description for SO:0002114:NMD_transcript
+ */
+ for (String syn : ambiguous)
+ {
+ aliases.remove(syn);
+ }
+ }
+
+ /**
+ * Converts a string to lower case and changes hyphens and spaces to
+ * underscores. Lower-casing is done with the root locale so that the result
+ * does not depend on the JVM's default locale (with a Turkish default locale,
+ * the no-argument toLowerCase() maps 'I' to a dotless i, so stored aliases
+ * and lookups would no longer match).
+ *
+ * @param s
+ *          the string to normalise (may be null)
+ * @return the normalised string, or null if <code>s</code> is null
+ */
+ static String canonicalise(String s)
+ {
+ return s == null ? null
+ : s.toLowerCase(java.util.Locale.ROOT).replace('-', '_')
+ .replace(' ', '_');
}
/**
/**
* Returns the Term for a given name (e.g. "SO:0000735") or description (e.g.
- * "sequence_location"), or null if not found.
+ * "sequence_location"), or alias, or null if not found
*
* @param child
* @return
*/
- protected Term getTerm(String nameOrDescription)
+ protected Term getTerm(final String nameOrDescription)
{
- Term t = termsByDescription.get(nameOrDescription);
+ if (nameOrDescription == null)
+ {
+ return null;
+ }
+ Term t = aliases.get(canonicalise(nameOrDescription));
if (t == null)
{
try
}
return parents;
}
+
+ @Override
+ public boolean isValidTerm(String term)
+ {
+ // the term is valid if getTerm can resolve it (by name, description or
+ // alias); getTerm answers null otherwise
+ final Term resolved = getTerm(term);
+ return resolved != null;
+ }
}
List<String> result = parents.get(term);
return result == null ? new ArrayList<>() : result;
}
+
+ @Override
+ public boolean isValidTerm(String term)
+ {
+ // a term is recognised only if it is a key of the parents lookup
+ final boolean known = parents.containsKey(term);
+ return known;
+ }
}
}
@Test(groups = "Functional")
- public void testfindSequenceOntologyGroupings()
+ public void testfindSequenceOntologyGroupings_nucleotide()
{
/*
* typical gnomAD feature types, plus the top level 'sequence_variant' as in dbSNP
"missense_variant");
/*
- * for stop_gained:
- * transcript_variant further adds 5_prime_UTR_variant,
- * non_coding_transcript_exon_variant, synonymous_variant, splice_region_variant
- * feature_variant further adds upstream_gene_variant
- * sequence_variant further adds sequence_variant
+ * hierarchy from stop_gained
*/
Map<String, List<String>> map = SequenceOntologyFactory.getInstance()
.findSequenceOntologyGroupings("stop_gained", featureTypes);
+ "stop_lost, synonymous_variant, upstream_gene_variant]");
}
+ @Test(groups = "Functional")
+ public void testfindSequenceOntologyGroupings_peptide()
+ {
+ /*
+ * typical Uniprot feature types
+ */
+ List<String> featureTypes = Arrays.asList("BETA-TURN-IR", "NEST-RL",
+ "BETA-BULGE", "ALPHA-BETA-MOTIF", "ASX-TURN-IR",
+ "GAMMA-TURN-CLASSIC", "GAMMA-TURN-INVERSE", "BETA-TURN-IL",
+ "BETA-TURN-IIL");
+
+ /*
+ * hierarchy from TURN (the term actually passed to the lookup below)
+ * NOTE(review): this comment previously named GAMMA-TURN-INVERSE and
+ * ended mid-sentence ("this is a synonym for") - confirm which term was
+ * intended. The test only checks the number of groupings returned (10),
+ * not their keys or contents.
+ */
+ Map<String, List<String>> map = SequenceOntologyFactory.getInstance()
+ .findSequenceOntologyGroupings("TURN", featureTypes);
+ assertEquals(map.size(), 10);
+ }
+
}
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertFalse;
+import static org.testng.Assert.assertNotNull;
import static org.testng.Assert.assertNull;
+import static org.testng.Assert.assertSame;
import static org.testng.Assert.assertTrue;
-import jalview.datamodel.ontology.OntologyI;
import jalview.gui.JvOptionPane;
import java.util.Arrays;
import java.util.Map;
import java.util.Set;
+import org.biojava.nbio.ontology.Synonym;
+import org.biojava.nbio.ontology.Term;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
JvOptionPane.setMockResponse(JvOptionPane.CANCEL_OPTION);
}
- private OntologyI so;
+ private SequenceOntology so;
@BeforeClass(alwaysRun = true)
public void setUp()
assertEquals(parents.size(), 1);
assertTrue(parents.contains("sequence_feature"));
}
+
+ @Test(groups = "Functional")
+ public void testGetTerm()
+ {
+ assertNull(so.getTerm(null)); // null input answers null
+ assertNull(so.getTerm("!*£&!")); // unrecognised text answers null
+
+ Term t = so.getTerm("SO:0000084");
+ assertNotNull(t);
+ assertEquals(t.getDescription(), "micronuclear_sequence");
+ // lookup by name (the SO: identifier) is case sensitive
+ assertNull(so.getTerm("so:0000084"));
+
+ t = so.getTerm("alpha_helix");
+ assertNotNull(t);
+ Object[] synonyms = t.getSynonyms();
+ assertEquals(synonyms.length, 2);
+ assertEquals(((Synonym) synonyms[0]).getName(), "a-helix");
+ assertEquals(((Synonym) synonyms[1]).getName(), "helix");
+ // description lookup is case-insensitive (input is canonicalised)
+ Term t2 = so.getTerm("ALPHA_HELIX");
+ assertSame(t, t2);
+ // can also retrieve by synonym; hyphens are canonicalised to underscores
+ t2 = so.getTerm("a-helix");
+ assertSame(t, t2);
+
+ t = so.getTerm("serine_threonine_motif");
+ t2 = so.getTerm("ST-MOTIF"); // canonicalised to "st_motif" before lookup
+ assertNotNull(t);
+ assertSame(t, t2);
+
+ /*
+ * if a synonym is ambiguous within a hierarchy,
+ * we keep it for the most general term (always valid):
+ * - helix is a synonym for
+ *   alpha_helix (isA) right_handed_peptide_helix (isA) peptide_helix
+ * - motif is a synonym for
+ *   polypeptide_conserved_motif (isA) polypeptide_motif
+ */
+ t = so.getTerm("helix");
+ assertNotNull(t);
+ assertEquals(t.getDescription(), "peptide_helix");
+ t = so.getTerm("motif");
+ assertNotNull(t);
+ assertEquals(t.getDescription(), "polypeptide_motif");
+
+ /*
+ * ambiguous synonyms with no mutual hierarchy are not cached, so
+ * lookup answers null
+ * 'sequence variation' is a synonym for
+ * sequence_alteration SO:0001059
+ * alternate_sequence_site SO:0001149
+ * and these have no 'isA' relationship
+ */
+ assertNull(so.getTerm("sequence_variation"));
+
+ /*
+ * nmd_transcript is synonym for SO:0001621:NMD_transcript_variant
+ * and also the description for SO:0002114:NMD_transcript
+ * since v3.1 of so-simple.obo;
+ * the description match takes precedence over the synonym
+ */
+ t = so.getTerm("SO:0002114");
+ assertNotNull(t);
+ t2 = so.getTerm("SO:0001621");
+ assertNotNull(t2);
+ assertSame(t, so.getTerm("nmd_transcript"));
+ assertSame(t2, so.getTerm("nmd_transcript_variant"));
+ }
+
+ @Test(groups = "Functional")
+ public void testCanonicalise()
+ {
+ // null in, null out
+ assertNull(SequenceOntology.canonicalise(null));
+
+ // upper case is lowered; hyphens and spaces become underscores
+ String hyphenAndUnderscore = SequenceOntology.canonicalise("A-b_c");
+ assertEquals(hyphenAndUnderscore, "a_b_c");
+ String hyphensOnly = SequenceOntology.canonicalise("A-b-C");
+ assertEquals(hyphensOnly, "a_b_c");
+ String withSpaces = SequenceOntology.canonicalise("metal binding site");
+ assertEquals(withSpaces, "metal_binding_site");
+
+ // a string already in canonical form is returned as the same object
+ final String unchanged = "thisought_nottobe_modified!";
+ assertSame(unchanged, SequenceOntology.canonicalise(unchanged));
+ }
}