/*
 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
 * Copyright (C) $$Year-Rel$$ The Jalview Authors
 * 
 * This file is part of Jalview.
 * 
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License 
 * as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *  
 * Jalview is distributed in the hope that it will be useful, but 
 * WITHOUT ANY WARRANTY; without even the implied warranty 
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR 
 * PURPOSE.  See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
 * The Jalview Authors are detailed in the 'AUTHORS' file.
 */
package jalview.ext.so;

import jalview.bin.Cache;
import jalview.datamodel.ontology.OntologyBase;
import jalview.io.gff.SequenceOntologyI;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.biojava.nbio.ontology.Ontology;
import org.biojava.nbio.ontology.Synonym;
import org.biojava.nbio.ontology.Term;
import org.biojava.nbio.ontology.Term.Impl;
import org.biojava.nbio.ontology.Triple;
import org.biojava.nbio.ontology.io.OboParser;
import org.biojava.nbio.ontology.utils.Annotation;

53 * A wrapper class that parses the Sequence Ontology and exposes useful access
54 * methods. This version uses the BioJava parser.
56 public class SequenceOntology extends OntologyBase
57 implements SequenceOntologyI
60 * the parsed Ontology data as modelled by BioJava
62 private Ontology ontology;
65 * the ontology term for the isA relationship
70 * lookup of terms by user readable name (NB not guaranteed unique)
72 private Map<String, Term> aliases;
75 * Map where key is a Term and value is a (possibly empty) list of
76 * all Terms to which the key has an 'isA' relationship, either
77 * directly or indirectly (A isA B isA C)
79 private Map<Term, List<Term>> termIsA;
81 private List<String> termsFound;
83 private List<String> termsNotFound;
86 * Package private constructor to enforce use of singleton. Parses and caches
87 * the SO OBO data file.
89 public SequenceOntology()
91 termsFound = new ArrayList<>();
92 termsNotFound = new ArrayList<>();
93 aliases = new HashMap<>();
94 termIsA = new HashMap<>();
96 loadOntologyZipFile("so-simple.obo");
100 * Loads the given ontology file from a zip file with ".zip" appended
102 * @param ontologyFile
104 protected void loadOntologyZipFile(String ontologyFile)
106 long now = System.currentTimeMillis();
107 ZipInputStream zipStream = null;
110 String zipFile = ontologyFile + ".zip";
111 InputStream inStream = this.getClass()
112 .getResourceAsStream("/" + zipFile);
113 zipStream = new ZipInputStream(new BufferedInputStream(inStream));
115 while ((entry = zipStream.getNextEntry()) != null)
117 if (entry.getName().equals(ontologyFile))
119 loadOboFile(zipStream);
122 long elapsed = System.currentTimeMillis() - now;
123 System.out.println("Loaded Sequence Ontology from " + zipFile + " ("
125 } catch (Exception e)
130 closeStream(zipStream);
135 * Closes the input stream, swallowing all exceptions
139 protected void closeStream(InputStream is)
146 } catch (IOException e)
154 * Reads, parses and stores the OBO file data
157 * @throws ParseException
158 * @throws IOException
160 protected void loadOboFile(InputStream is)
161 throws ParseException, IOException
163 BufferedReader oboFile = new BufferedReader(new InputStreamReader(is));
164 OboParser parser = new OboParser();
165 ontology = parser.parseOBO(oboFile, "SO", "the SO ontology");
166 isA = ontology.getTerm("is_a");
171 * Stores a lookup table of terms by description or synonym. Note that
172 * description is not guaranteed unique. Where duplicate descriptions are
173 * found, try to discard the term that is flagged as obsolete. However we do
174 * store obsolete terms where there is no duplication of description.
176 protected void storeTermAliases()
178 Set<String> ambiguous = new HashSet<>();
180 for (Term term : ontology.getTerms())
182 if (term instanceof Impl)
184 boolean newTermIsObsolete = isObsolete(term);
185 String description = term.getDescription();
186 if (description != null)
188 description = canonicalise(description);
189 Term replaced = aliases.get(description);
190 if (replaced != null)
192 boolean oldTermIsObsolete = isObsolete(replaced);
193 if (newTermIsObsolete && !oldTermIsObsolete)
195 Cache.log.debug("SequenceOntology ignoring " + term.getName()
196 + " as obsolete and duplicated by "
197 + replaced.getName());
200 else if (!newTermIsObsolete && oldTermIsObsolete)
202 Cache.log.debug("SequenceOntology ignoring "
204 + " as obsolete and duplicated by " + term.getName());
208 Cache.log.debug("SequenceOntology warning: " + term.getName()
209 + " has replaced " + replaced.getName()
210 + " for lookup of '" + description + "'");
213 aliases.put(description, term);
216 * also store synonyms if not ambiguous
218 if (!newTermIsObsolete)
220 storeSynonymsForTerm(term, ambiguous);
227 * remove ambiguous synonyms for safety;
228 * problem: what if a synonym matches a description?
229 * only one case found:
230 * nmd_transcript is synonym for SO:0001621:NMD_transcript_variant
231 * and also the description for SO:0002114:NMD_transcript
233 for (String syn : ambiguous)
240 * Stores any synonyms as an alternative lookup for the term, canonicalised
241 * for case/hyphen/space insensitivity on lookup.
243 * Some synonyms may be ambiguous (present for more than one term), and these
244 * are handled as follows:
246 * <li>if a synonym matches the <em>description</em> of another term, it is
247 * not saved, so that a term can always be found by description
249 * <li>Example: {@code nmd_transcript} is the description for
250 * {@code NMD_transcript} and also a synonym for
251 * {@code NMD_transcript_variant} - the synonym is ignored</li>
254 * <li>if one term is a sub-term (directly or indirectly) of the other, the
255 * synonym is retained for the more general term
257 * <li>Example: {@code helix} is a synonym for
258 * {@code alpha_helix, right_handed_peptide_helix, peptide_helix} - it is kept
259 * for {@code peptide_helix} as this is a parent of the other terms</li>
262 * <li>otherwise the synonym is added to the {@code ambiguous} list for
265 * <li>Example: {@code sequence variation} is a synonym for
266 * {@code sequence_alteration} and {@code alternate_sequence_site} but these
267 * have no {@code isA} relationship - the synonym is ignored as ambiguous</li>
274 void storeSynonymsForTerm(Term term, Set<String> ambiguous)
276 for (Object syn : term.getSynonyms())
278 String name = ((Synonym) syn).getName();
279 String synonym = canonicalise(name);
280 if (aliases.containsKey(synonym))
282 final Term found = aliases.get(synonym);
286 * this alias is ambiguous - matches description,
287 * or an alias, of another term
289 String msg = String.format(
290 "SequenceOntology ambiguous synonym %s for '%s:%s' and '%s:%s'",
291 synonym, term.getName(), term.getDescription(),
292 found.getName(), found.getDescription());
293 Cache.log.debug(msg);
296 * preserve any entry whose canonical description happens to match
297 * a synonym (NMD_transcript is a valid description, and also
298 * a synonym for NMD_transcript_variant)
299 * also preserve a parent (more general) term
301 if (synonym.equals(canonicalise(found.getDescription()))
302 || termIsA(term, found))
307 * replace a specialised term with a more general one
308 * with the same alias
311 // (synonym.equals(canonicalise(term.getDescription())))
312 else if (termIsA(found, term))
314 aliases.put(synonym, term);
318 ambiguous.add(synonym);
324 aliases.put(synonym, term);
330 * Converts a string to lower case and changes hyphens and spaces to
336 static String canonicalise(String s)
338 return s == null ? null
339 : s.toLowerCase().replace('-', '_').replace(' ', '_');
343 * Answers true if the term has property "is_obsolete" with value true, else
349 public static boolean isObsolete(Term term)
351 Annotation ann = term.getAnnotation();
356 if (Boolean.TRUE.equals(ann.getProperty("is_obsolete")))
360 } catch (NoSuchElementException e)
362 // fall through to false
369 * Test whether the given Sequence Ontology term is nucleotide_match (either
370 * directly or via is_a relationship)
373 * SO name or description
376 public boolean isNucleotideMatch(String soTerm)
378 return isA(soTerm, NUCLEOTIDE_MATCH);
382 * Test whether the given Sequence Ontology term is protein_match (either
383 * directly or via is_a relationship)
386 * SO name or description
389 public boolean isProteinMatch(String soTerm)
391 return isA(soTerm, PROTEIN_MATCH);
395 * Test whether the given Sequence Ontology term is polypeptide (either
396 * directly or via is_a relationship)
399 * SO name or description
402 public boolean isPolypeptide(String soTerm)
404 return isA(soTerm, POLYPEPTIDE);
408 * Returns true if the given term has a (direct or indirect) 'isA'
409 * relationship with the parent
416 public boolean isA(String child, String parent)
418 if (child == null || parent == null)
423 * optimise trivial checks like isA("CDS", "CDS")
425 if (child.equals(parent))
431 Term childTerm = getTerm(child);
432 if (childTerm != null)
440 Term parentTerm = getTerm(parent);
442 return termIsA(childTerm, parentTerm);
446 * Records a valid term queried for, for reporting purposes
450 private void termFound(String term)
452 synchronized (termsFound)
454 if (!termsFound.contains(term))
456 termsFound.add(term);
462 * Records an invalid term queried for, for reporting purposes
466 private void termNotFound(String term)
468 synchronized (termsNotFound)
470 if (!termsNotFound.contains(term))
472 Cache.log.debug("SequenceOntology term " + term + " invalid");
473 termsNotFound.add(term);
479 * Returns true if the childTerm 'isA' parentTerm (directly or indirectly).
485 protected synchronized boolean termIsA(Term childTerm, Term parentTerm)
488 * null term could arise from a misspelled SO description
490 if (childTerm == null || parentTerm == null)
496 * recursive search endpoint:
498 if (childTerm == parentTerm)
504 * lazy initialisation - find all of a term's parents (recursively)
505 * the first time this is called, and save them in a map.
507 if (!termIsA.containsKey(childTerm))
509 findParents(childTerm);
512 List<Term> parents = termIsA.get(childTerm);
513 for (Term parent : parents)
515 if (termIsA(parent, parentTerm))
518 * add (great-)grandparents to parents list as they are discovered,
519 * for faster lookup next time
521 if (!parents.contains(parentTerm))
523 parents.add(parentTerm);
533 * Finds all the 'isA' parents of the childTerm and stores them as a (possibly
538 protected synchronized void findParents(Term childTerm)
540 List<Term> result = new ArrayList<>();
541 for (Triple triple : ontology.getTriples(childTerm, null, isA))
543 Term parent = triple.getObject();
547 * and search for the parent's parents recursively
551 termIsA.put(childTerm, result);
555 * Returns the Term for a given name (e.g. "SO:0000735") or description (e.g.
556 * "sequence_location"), or alias, or null if not found
561 protected Term getTerm(final String nameOrDescription)
563 if (nameOrDescription == null)
567 Term t = aliases.get(canonicalise(nameOrDescription));
572 t = ontology.getTerm(nameOrDescription);
573 } catch (NoSuchElementException e)
581 public boolean isSequenceVariant(String soTerm)
583 return isA(soTerm, SEQUENCE_VARIANT);
587 * Sorts (case-insensitive) and returns the list of valid terms queried for
590 public List<String> termsFound()
592 synchronized (termsFound)
594 Collections.sort(termsFound, String.CASE_INSENSITIVE_ORDER);
600 * Sorts (case-insensitive) and returns the list of invalid terms queried for
603 public List<String> termsNotFound()
605 synchronized (termsNotFound)
607 Collections.sort(termsNotFound, String.CASE_INSENSITIVE_ORDER);
608 return termsNotFound;
615 * @throws IllegalStateException
616 * if a loop is detected in the ontology
619 public List<String> getRootParents(final String term)
622 * check in cache first
624 if (rootParents.containsKey(term))
626 return rootParents.get(term);
628 Term t = getTerm(term);
635 * todo: check for loops using 'seen', allowing for alternate paths e.g.
636 * stop_gained isA feature_truncation isA feature_variant
637 * " isA nonsynonymous_variant ... isA geneVariant isA feature_variant
639 List<Term> seen = new ArrayList<>();
640 List<Term> top = new ArrayList<>();
641 List<Term> query = new ArrayList<>();
644 while (!query.isEmpty())
646 List<Term> nextQuery = new ArrayList<>();
649 Set<Triple> parents = ontology.getTriples(q, null, isA);
650 if (parents.isEmpty())
653 * q has no parents so is a top level term
660 * search all parent terms
662 for (Triple triple : parents)
664 Term parent = triple.getObject();
665 nextQuery.add(parent);
672 List<String> result = new ArrayList<>();
673 for (Term found : top)
675 String desc = found.getDescription();
676 if (!result.contains(desc))
683 * save result in cache
685 rootParents.put(term, result);
691 public List<String> getParents(String term)
693 List<String> parents = new ArrayList<>();
694 Term t = getTerm(term);
697 for (Triple triple : ontology.getTriples(t, null, isA))
699 Term parent = triple.getObject();
700 parents.add(parent.getDescription());
707 public boolean isValidTerm(String term)
709 return getTerm(term) != null;