+/*
+ * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
+ * Copyright (C) $$Year-Rel$$ The Jalview Authors
+ *
+ * This file is part of Jalview.
+ *
+ * Jalview is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, either version 3
+ * of the License, or (at your option) any later version.
+ *
+ * Jalview is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
+ * The Jalview Authors are detailed in the 'AUTHORS' file.
+ */
package jalview.ext.so;
+import jalview.datamodel.ontology.OntologyBase;
import jalview.io.gff.SequenceOntologyI;
import java.io.BufferedInputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
+import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.biojava.nbio.ontology.Ontology;
+import org.biojava.nbio.ontology.Synonym;
import org.biojava.nbio.ontology.Term;
import org.biojava.nbio.ontology.Term.Impl;
import org.biojava.nbio.ontology.Triple;
* A wrapper class that parses the Sequence Ontology and exposes useful access
* methods. This version uses the BioJava parser.
*/
-public class SequenceOntology implements SequenceOntologyI
+public class SequenceOntology extends OntologyBase
+ implements SequenceOntologyI
{
/*
* the parsed Ontology data as modelled by BioJava
/*
* lookup of terms by user readable name (NB not guaranteed unique)
*/
- private Map<String, Term> termsByDescription;
+ private Map<String, Term> aliases;
/*
* Map where key is a Term and value is a (possibly empty) list of
*/
/**
 * Constructs the ontology wrapper: creates the empty lookup collections
 * (terms found, terms not found, aliases and the term isA map), then loads
 * the ontology by passing "so-xp-simple.obo" to loadOntologyZipFile.
 */
public SequenceOntology()
{
- termsFound = new ArrayList<String>();
- termsNotFound = new ArrayList<String>();
- termsByDescription = new HashMap<String, Term>();
- termIsA = new HashMap<Term, List<Term>>();
+ termsFound = new ArrayList<>();
+ termsNotFound = new ArrayList<>();
+ aliases = new HashMap<>();
+ termIsA = new HashMap<>();
// the collections above must exist before loading starts
loadOntologyZipFile("so-xp-simple.obo");
}
try
{
String zipFile = ontologyFile + ".zip";
- InputStream inStream = this.getClass().getResourceAsStream(
- "/" + zipFile);
+ InputStream inStream = this.getClass()
+ .getResourceAsStream("/" + zipFile);
zipStream = new ZipInputStream(new BufferedInputStream(inStream));
ZipEntry entry;
while ((entry = zipStream.getNextEntry()) != null)
* @throws ParseException
* @throws IOException
*/
- protected void loadOboFile(InputStream is) throws ParseException,
- IOException
+ protected void loadOboFile(InputStream is)
+ throws ParseException, IOException
{
// NOTE(review): the InputStreamReader is created without an explicit
// charset, so the platform default applies - confirm this is intended
BufferedReader oboFile = new BufferedReader(new InputStreamReader(is));
OboParser parser = new OboParser();
ontology = parser.parseOBO(oboFile, "SO", "the SO ontology");
// keep the "is_a" term: it is the relation passed to getTriples when
// looking up the parents of a term
isA = ontology.getTerm("is_a");
// build the alias lookup only once the ontology has been parsed
- storeTermNames();
+ storeTermAliases();
}
/**
- * Stores a lookup table of terms by description. Note that description is not
- * guaranteed unique. Where duplicate descriptions are found, try to discard
- * the term that is flagged as obsolete. However we do store obsolete terms
- * where there is no duplication of description.
+ * Stores a lookup table of terms by description or synonym. Note that
+ * description is not guaranteed unique. Where duplicate descriptions are
+ * found, try to discard the term that is flagged as obsolete. However we do
+ * store obsolete terms where there is no duplication of description.
*/
- protected void storeTermNames()
+ protected void storeTermAliases()
{
+ Set<String> ambiguous = new HashSet<>();
+
for (Term term : ontology.getTerms())
{
if (term instanceof Impl)
{
+ boolean newTermIsObsolete = isObsolete(term);
String description = term.getDescription();
if (description != null)
{
- Term replaced = termsByDescription.get(description);
+ description = canonicalise(description);
+ Term replaced = aliases.get(description);
if (replaced != null)
{
- boolean newTermIsObsolete = isObsolete(term);
boolean oldTermIsObsolete = isObsolete(replaced);
if (newTermIsObsolete && !oldTermIsObsolete)
{
+ " for lookup of '" + description + "'");
}
}
- termsByDescription.put(description, term);
+ aliases.put(description, term);
+
+ /*
+ * also store synonyms if not ambiguous
+ */
+ if (!newTermIsObsolete)
+ {
+ for (Object syn : term.getSynonyms())
+ {
+ String name = ((Synonym) syn).getName();
+ String synonym = canonicalise(name);
+ if (aliases.containsKey(synonym))
+ {
+ final Term found = aliases.get(synonym);
+ if (found != term)
+ {
+ /*
+ * this alias is ambiguous - matches description,
+ * or an alias, of another term
+ */
+ String msg = String.format(
+ "Ambiguous synonym %s for '%s:%s' and '%s:%s'",
+ synonym, term.getName(), term.getDescription(),
+ found.getName(), found.getDescription());
+ System.err.println(msg);
+
+ /*
+ * preserve any entry whose canonical description happens to match
+ * a synonym (NMD_transcript is a valid description, and also
+ * a synonym for NMD_transcript_variant)
+ * also preserve a parent (more general) term
+ */
+ if (synonym.equals(canonicalise(found.getDescription()))
+ || termIsA(term, found))
+ {
+ // leave it alone
+ }
+ /*
+ * replace a specialised term with a more general one
+ * with the same alias
+ */
+ // else if
+ // (synonym.equals(canonicalise(term.getDescription())))
+ else if (termIsA(found, term))
+ {
+ aliases.put(synonym, term);
+ }
+ else
+ {
+ ambiguous.add(synonym);
+ }
+ }
+ }
+ else
+ {
+ aliases.put(synonym, term);
+ }
+ }
+ }
}
}
}
+
+ /*
+ * remove ambiguous synonyms for safety;
+ * problem: what if a synonym matches a description?
+ * only one case found:
+ * nmd_transcript is synonym for SO:0001621:NMD_transcript_variant
+ * and also the description for SO:0002114:NMD_transcript
+ */
+ for (String syn : ambiguous)
+ {
+ aliases.remove(syn);
+ }
+ }
+
+ /**
+ * Converts a string to lower case and changes hyphens and spaces to
+ * underscores
+ *
+ * @param s
+ * @return
+ */
+ static String canonicalise(String s)
+ {
+ return s == null ? null
+ : s.toLowerCase().replace('-', '_').replace(' ', '_');
}
/**
*/
protected synchronized void findParents(Term childTerm)
{
- List<Term> result = new ArrayList<Term>();
+ List<Term> result = new ArrayList<>();
for (Triple triple : ontology.getTriples(childTerm, null, isA))
{
Term parent = triple.getObject();
/**
* Returns the Term for a given name (e.g. "SO:0000735") or description (e.g.
- * "sequence_location"), or null if not found.
+ * "sequence_location"), or alias, or null if not found
*
* @param nameOrDescription
* @return
*/
- protected Term getTerm(String nameOrDescription)
+ protected Term getTerm(final String nameOrDescription)
{
- Term t = termsByDescription.get(nameOrDescription);
+ if (nameOrDescription == null)
+ {
+ return null;
+ }
+ Term t = aliases.get(canonicalise(nameOrDescription));
if (t == null)
{
try
return termsNotFound;
}
}
+
+ /**
+ * {@inheritDoc}
+ *
+ * @throws IllegalStateException
+ * if a loop is detected in the ontology
+ */
+ @Override
+ public List<String> getRootParents(final String term)
+ {
+ /*
+ * check in cache first
+ */
+ if (rootParents.containsKey(term))
+ {
+ return rootParents.get(term);
+ }
+ Term t = getTerm(term);
+ if (t == null)
+ {
+ return null;
+ }
+
+ /*
+ * todo: check for loops using 'seen', allowing for alternate paths e.g.
+ * stop_gained isA feature_truncation isA feature_variant
+ * " isA nonsynonymous_variant ... isA geneVariant isA feature_variant
+ */
+ List<Term> seen = new ArrayList<>();
+ List<Term> top = new ArrayList<>();
+ List<Term> query = new ArrayList<>();
+ query.add(t);
+
+ while (!query.isEmpty())
+ {
+ List<Term> nextQuery = new ArrayList<>();
+ for (Term q : query)
+ {
+ Set<Triple> parents = ontology.getTriples(q, null, isA);
+ if (parents.isEmpty())
+ {
+ /*
+ * q has no parents so is a top level term
+ */
+ top.add(q);
+ }
+ else
+ {
+ /*
+ * search all parent terms
+ */
+ for (Triple triple : parents)
+ {
+ Term parent = triple.getObject();
+ nextQuery.add(parent);
+ }
+ }
+ }
+ query = nextQuery;
+ }
+
+ List<String> result = new ArrayList<>();
+ for (Term found : top)
+ {
+ String desc = found.getDescription();
+ if (!result.contains(desc))
+ {
+ result.add(desc);
+ }
+ }
+
+ /*
+ * save result in cache
+ */
+ rootParents.put(term, result);
+
+ return result;
+ }
+
+ @Override
+ public List<String> getParents(String term)
+ {
+ List<String> parents = new ArrayList<>();
+ Term t = getTerm(term);
+ if (t != null)
+ {
+ for (Triple triple : ontology.getTriples(t, null, isA))
+ {
+ Term parent = triple.getObject();
+ parents.add(parent.getDescription());
+ }
+ }
+ return parents;
+ }
+
+ @Override
+ public boolean isValidTerm(String term)
+ {
+ return getTerm(term) != null;
+ }
}