2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.ext.so;
23 import jalview.datamodel.ontology.OntologyBase;
24 import jalview.io.gff.SequenceOntologyI;
26 import java.io.BufferedInputStream;
27 import java.io.BufferedReader;
28 import java.io.IOException;
29 import java.io.InputStream;
30 import java.io.InputStreamReader;
31 import java.text.ParseException;
32 import java.util.ArrayList;
33 import java.util.Collections;
34 import java.util.HashMap;
35 import java.util.HashSet;
36 import java.util.List;
38 import java.util.NoSuchElementException;
40 import java.util.zip.ZipEntry;
41 import java.util.zip.ZipInputStream;
43 import org.biojava.nbio.ontology.Ontology;
44 import org.biojava.nbio.ontology.Synonym;
45 import org.biojava.nbio.ontology.Term;
46 import org.biojava.nbio.ontology.Term.Impl;
47 import org.biojava.nbio.ontology.Triple;
48 import org.biojava.nbio.ontology.io.OboParser;
49 import org.biojava.nbio.ontology.utils.Annotation;
52 * A wrapper class that parses the Sequence Ontology and exposes useful access
53 * methods. This version uses the BioJava parser.
// Wrapper around a BioJava-parsed Sequence Ontology; the members below hold
// the parsed data plus the lookup and reporting collections built from it.
55 public class SequenceOntology extends OntologyBase
56 implements SequenceOntologyI
59 * the parsed Ontology data as modelled by BioJava
// assigned in loadOboFile() from the result of OboParser.parseOBO()
61 private Ontology ontology;
64 * the ontology term for the isA relationship
// NOTE(review): the field declaration this comment describes is not visible
// in this view; 'isA' is assigned in loadOboFile() from
// ontology.getTerm("is_a") and used as the predicate in getTriples() calls.
69 * lookup of terms by user readable name (NB not guaranteed unique)
// keys are canonicalise()d descriptions and synonyms, see storeTermAliases()
71 private Map<String, Term> aliases;
74 * Map where key is a Term and value is a (possibly empty) list of
75 * all Terms to which the key has an 'isA' relationship, either
76 * directly or indirectly (A isA B isA C)
// entries are created by findParents() and extended by termIsA()
78 private Map<Term, List<Term>> termIsA;
// valid terms queried for, as recorded by termFound(); updates and sorting
// are synchronized on the list itself
80 private List<String> termsFound;
// invalid terms queried for, as recorded by termNotFound(); updates and
// sorting are synchronized on the list itself
82 private List<String> termsNotFound;
85 * Package private constructor to enforce use of singleton. Parses and caches
86 * the SO OBO data file.
88 public SequenceOntology()
90 termsFound = new ArrayList<>();
91 termsNotFound = new ArrayList<>();
92 aliases = new HashMap<>();
93 termIsA = new HashMap<>();
95 loadOntologyZipFile("so-xp-simple.obo");
99 * Loads the given ontology file from a zip file with ".zip" appended
101 * @param ontologyFile
103 protected void loadOntologyZipFile(String ontologyFile)
105 long now = System.currentTimeMillis();
106 ZipInputStream zipStream = null;
109 String zipFile = ontologyFile + ".zip";
110 InputStream inStream = this.getClass()
111 .getResourceAsStream("/" + zipFile);
112 zipStream = new ZipInputStream(new BufferedInputStream(inStream));
114 while ((entry = zipStream.getNextEntry()) != null)
116 if (entry.getName().equals(ontologyFile))
118 loadOboFile(zipStream);
121 long elapsed = System.currentTimeMillis() - now;
122 System.out.println("Loaded Sequence Ontology from " + zipFile + " ("
124 } catch (Exception e)
129 closeStream(zipStream);
134 * Closes the input stream, swallowing all exceptions
138 protected void closeStream(InputStream is)
145 } catch (IOException e)
153 * Reads, parses and stores the OBO file data
156 * @throws ParseException
157 * @throws IOException
159 protected void loadOboFile(InputStream is)
160 throws ParseException, IOException
162 BufferedReader oboFile = new BufferedReader(new InputStreamReader(is));
163 OboParser parser = new OboParser();
164 ontology = parser.parseOBO(oboFile, "SO", "the SO ontology");
165 isA = ontology.getTerm("is_a");
// Populates the 'aliases' map from the parsed ontology: each term is keyed by
// its canonicalise()d description and, for terms not flagged obsolete, by its
// canonicalise()d synonyms. Clashes are reported on stderr.
// NOTE(review): several short lines (braces, else/continue style statements)
// are not visible in this view, so the exact nesting below should be
// confirmed against the full file.
170 * Stores a lookup table of terms by description or synonym. Note that
171 * description is not guaranteed unique. Where duplicate descriptions are
172 * found, try to discard the term that is flagged as obsolete. However we do
173 * store obsolete terms where there is no duplication of description.
175 protected void storeTermAliases()
// synonyms judged ambiguous are collected here and processed by the final loop
177 Set<String> ambiguous = new HashSet<>();
179 for (Term term : ontology.getTerms())
// only terms that are instances of Term.Impl are processed
181 if (term instanceof Impl)
183 boolean newTermIsObsolete = isObsolete(term);
184 String description = term.getDescription();
185 if (description != null)
// the lookup key is the canonical (lower case, underscore separated) form
187 description = canonicalise(description);
// a non-null 'replaced' means an earlier term has the same canonical description
188 Term replaced = aliases.get(description);
189 if (replaced != null)
191 boolean oldTermIsObsolete = isObsolete(replaced);
// case 1: the new term is obsolete and the registered one is not
192 if (newTermIsObsolete && !oldTermIsObsolete)
194 System.err.println("Ignoring " + term.getName()
195 + " as obsolete and duplicated by "
196 + replaced.getName());
// case 2: the registered term is obsolete and the new one is not
199 else if (!newTermIsObsolete && oldTermIsObsolete)
201 System.err.println("Ignoring " + replaced.getName()
202 + " as obsolete and duplicated by " + term.getName());
// remaining case (presumably the final else - keyword not visible here):
// warn that one term displaces the other for this description
206 System.err.println("Warning: " + term.getName()
207 + " has replaced " + replaced.getName()
208 + " for lookup of '" + description + "'");
// register under the canonical description
211 aliases.put(description, term);
214 * also store synonyms if not ambiguous
// synonyms of obsolete terms are never registered
216 if (!newTermIsObsolete)
// getSynonyms() elements are handled as Object and cast to Synonym
218 for (Object syn : term.getSynonyms())
220 String name = ((Synonym) syn).getName();
221 String synonym = canonicalise(name);
// the key is already in use, by a description or by an earlier synonym
222 if (aliases.containsKey(synonym))
224 final Term found = aliases.get(synonym);
228 * this alias is ambiguous - matches description,
229 * or an alias, of another term
231 String msg = String.format(
232 "Ambiguous synonym %s for '%s:%s' and '%s:%s'",
233 synonym, term.getName(), term.getDescription(),
234 found.getName(), found.getDescription());
235 System.err.println(msg);
238 * preserve any entry whose canonical description happens to match
239 * a synonym (NMD_transcript is a valid description, and also
240 * a synonym for NMD_transcript_variant)
241 * also preserve a parent (more general) term
// termIsA(term, found) is true when 'found' is 'term' itself or one of its
// isA ancestors
243 if (synonym.equals(canonicalise(found.getDescription()))
244 || termIsA(term, found))
249 * replace a specialised term with a more general one
250 * with the same alias
253 // (synonym.equals(canonicalise(term.getDescription())))
// here the registered term isA the new term, so the new, more general term
// takes over the alias
254 else if (termIsA(found, term))
256 aliases.put(synonym, term);
// NOTE(review): presumably the fall-back branch when neither test above
// holds - the branch keyword is not visible in this view
260 ambiguous.add(synonym);
// NOTE(review): presumably the branch for a synonym key not yet in use
266 aliases.put(synonym, term);
275 * remove ambiguous synonyms for safety;
276 * problem: what if a synonym matches a description?
277 * only one case found:
278 * nmd_transcript is synonym for SO:0001621:NMD_transcript_variant
279 * and also the description for SO:0002114:NMD_transcript
// the loop body is not visible in this view
281 for (String syn : ambiguous)
288 * Converts a string to lower case and changes hyphens and spaces to
294 static String canonicalise(String s)
296 return s == null ? null
297 : s.toLowerCase().replace('-', '_').replace(' ', '_');
301 * Answers true if the term has property "is_obsolete" with value true, else
307 public static boolean isObsolete(Term term)
309 Annotation ann = term.getAnnotation();
314 if (Boolean.TRUE.equals(ann.getProperty("is_obsolete")))
318 } catch (NoSuchElementException e)
320 // fall through to false
327 * Test whether the given Sequence Ontology term is nucleotide_match (either
328 * directly or via is_a relationship)
331 * SO name or description
334 public boolean isNucleotideMatch(String soTerm)
336 return isA(soTerm, NUCLEOTIDE_MATCH);
340 * Test whether the given Sequence Ontology term is protein_match (either
341 * directly or via is_a relationship)
344 * SO name or description
347 public boolean isProteinMatch(String soTerm)
349 return isA(soTerm, PROTEIN_MATCH);
353 * Test whether the given Sequence Ontology term is polypeptide (either
354 * directly or via is_a relationship)
357 * SO name or description
360 public boolean isPolypeptide(String soTerm)
362 return isA(soTerm, POLYPEPTIDE);
366 * Returns true if the given term has a (direct or indirect) 'isA'
367 * relationship with the parent
374 public boolean isA(String child, String parent)
376 if (child == null || parent == null)
381 * optimise trivial checks like isA("CDS", "CDS")
383 if (child.equals(parent))
389 Term childTerm = getTerm(child);
390 if (childTerm != null)
398 Term parentTerm = getTerm(parent);
400 return termIsA(childTerm, parentTerm);
404 * Records a valid term queried for, for reporting purposes
408 private void termFound(String term)
410 synchronized (termsFound)
412 if (!termsFound.contains(term))
414 termsFound.add(term);
420 * Records an invalid term queried for, for reporting purposes
424 private void termNotFound(String term)
426 synchronized (termsNotFound)
428 if (!termsNotFound.contains(term))
430 System.err.println("SO term " + term + " invalid");
431 termsNotFound.add(term);
437 * Returns true if the childTerm 'isA' parentTerm (directly or indirectly).
443 protected synchronized boolean termIsA(Term childTerm, Term parentTerm)
446 * null term could arise from a misspelled SO description
448 if (childTerm == null || parentTerm == null)
454 * recursive search endpoint:
456 if (childTerm == parentTerm)
462 * lazy initialisation - find all of a term's parents (recursively)
463 * the first time this is called, and save them in a map.
465 if (!termIsA.containsKey(childTerm))
467 findParents(childTerm);
470 List<Term> parents = termIsA.get(childTerm);
471 for (Term parent : parents)
473 if (termIsA(parent, parentTerm))
476 * add (great-)grandparents to parents list as they are discovered,
477 * for faster lookup next time
479 if (!parents.contains(parentTerm))
481 parents.add(parentTerm);
491 * Finds all the 'isA' parents of the childTerm and stores them as a (possibly
496 protected synchronized void findParents(Term childTerm)
498 List<Term> result = new ArrayList<>();
499 for (Triple triple : ontology.getTriples(childTerm, null, isA))
501 Term parent = triple.getObject();
505 * and search for the parent's parents recursively
509 termIsA.put(childTerm, result);
513 * Returns the Term for a given name (e.g. "SO:0000735") or description (e.g.
514 * "sequence_location"), or alias, or null if not found
519 protected Term getTerm(final String nameOrDescription)
521 if (nameOrDescription == null)
525 Term t = aliases.get(canonicalise(nameOrDescription));
530 t = ontology.getTerm(nameOrDescription);
531 } catch (NoSuchElementException e)
539 public boolean isSequenceVariant(String soTerm)
541 return isA(soTerm, SEQUENCE_VARIANT);
545 * Sorts (case-insensitive) and returns the list of valid terms queried for
548 public List<String> termsFound()
550 synchronized (termsFound)
552 Collections.sort(termsFound, String.CASE_INSENSITIVE_ORDER);
558 * Sorts (case-insensitive) and returns the list of invalid terms queried for
561 public List<String> termsNotFound()
563 synchronized (termsNotFound)
565 Collections.sort(termsNotFound, String.CASE_INSENSITIVE_ORDER);
566 return termsNotFound;
573 * @throws IllegalStateException
574 * if a loop is detected in the ontology
577 public List<String> getRootParents(final String term)
580 * check in cache first
582 if (rootParents.containsKey(term))
584 return rootParents.get(term);
586 Term t = getTerm(term);
593 * todo: check for loops using 'seen', allowing for alternate paths e.g.
594 * stop_gained isA feature_truncation isA feature_variant
595 * " isA nonsynonymous_variant ... isA geneVariant isA feature_variant
597 List<Term> seen = new ArrayList<>();
598 List<Term> top = new ArrayList<>();
599 List<Term> query = new ArrayList<>();
602 while (!query.isEmpty())
604 List<Term> nextQuery = new ArrayList<>();
607 Set<Triple> parents = ontology.getTriples(q, null, isA);
608 if (parents.isEmpty())
611 * q has no parents so is a top level term
618 * search all parent terms
620 for (Triple triple : parents)
622 Term parent = triple.getObject();
623 nextQuery.add(parent);
630 List<String> result = new ArrayList<>();
631 for (Term found : top)
633 String desc = found.getDescription();
634 if (!result.contains(desc))
641 * save result in cache
643 rootParents.put(term, result);
649 public List<String> getParents(String term)
651 List<String> parents = new ArrayList<>();
652 Term t = getTerm(term);
655 for (Triple triple : ontology.getTriples(t, null, isA))
657 Term parent = triple.getObject();
658 parents.add(parent.getDescription());
665 public boolean isValidTerm(String term)
667 return getTerm(term) != null;