2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.io.gff;
23 import jalview.datamodel.ontology.OntologyBase;
25 import java.util.ArrayList;
26 import java.util.Collections;
27 import java.util.HashMap;
28 import java.util.List;
32 * An implementation of SequenceOntologyI that hard codes terms of interest.
34 * Use this in unit testing by calling SequenceOntology.setInstance(new
35 * SequenceOntologyLite()).
37 * May also become a stand-in for SequenceOntology in the applet if we want to
38 * avoid the additional jars needed for parsing the full SO.
43 public class SequenceOntologyLite extends OntologyBase
44 implements SequenceOntologyI
47 * initial selection of types of interest when processing Ensembl features
48 * NB unlike the full SequenceOntology we don't traverse indirect
49 * child-parent relationships here so e.g. need to list every sub-type
50 * (direct or indirect) that is of interest
53 private final String[][] TERMS = new String[][] {
59 { "ncRNA_gene", "gene" },
60 { "snRNA_gene", "gene" },
61 { "miRNA_gene", "gene" },
62 { "lincRNA_gene", "gene" },
63 { "rRNA_gene", "gene" },
66 * transcript sub-types:
68 { "transcript", "transcript" },
69 { "mature_transcript", "transcript" },
70 { "processed_transcript", "transcript" },
71 { "aberrant_processed_transcript", "transcript" },
72 { "ncRNA", "transcript" },
73 { "snRNA", "transcript" },
74 { "miRNA", "transcript" },
75 { "lincRNA", "transcript" },
76 { "lnc_RNA", "transcript" },
77 { "rRNA", "transcript" },
78 { "mRNA", "transcript" },
79 // there are many more sub-types of ncRNA...
82 * sequence_variant sub-types
84 { "sequence_variant", "sequence_variant" },
85 { "structural_variant", "sequence_variant" },
86 { "feature_variant", "sequence_variant" },
87 { "upstream_gene_variant", "sequence_variant" },
88 { "gene_variant", "sequence_variant" },
89 { "transcript_variant", "sequence_variant" },
90 { "non_coding_transcript_variant", "sequence_variant" },
91 { "non_coding_transcript_exon_variant", "sequence_variant" },
92 // NB Ensembl uses NMD_transcript_variant as if a 'transcript'
93 // but we model it here correctly as per the SO
94 { "NMD_transcript_variant", "sequence_variant" },
95 { "missense_variant", "sequence_variant" },
96 { "synonymous_variant", "sequence_variant" },
97 { "frameshift_variant", "sequence_variant" },
98 { "5_prime_UTR_variant", "sequence_variant" },
99 { "3_prime_UTR_variant", "sequence_variant" },
100 { "stop_gained", "sequence_variant" },
101 { "stop_lost", "sequence_variant" },
102 { "inframe_deletion", "sequence_variant" },
103 { "inframe_insertion", "sequence_variant" },
104 { "splice_region_variant", "sequence_variant" },
107 * no sub-types of exon or CDS yet seen in Ensembl
108 * some added here for testing purposes
111 { "coding_exon", "exon" },
113 { "CDS_predicted", "CDS" },
116 * terms used in exonerate or PASA GFF
118 { "protein_match", "protein_match"},
119 { "nucleotide_match", "nucleotide_match"},
120 { "cDNA_match", "nucleotide_match"},
123 * used in InterProScan GFF
125 { "polypeptide", "polypeptide" }
130 * hard-coded list of any parents (direct or indirect)
131 * that we care about for a term
133 private Map<String, List<String>> parents;
135 private List<String> termsFound;
137 private List<String> termsNotFound;
139 public SequenceOntologyLite()
141 termsFound = new ArrayList<>();
142 termsNotFound = new ArrayList<>();
147 * Loads hard-coded data into a lookup table of {term, {list_of_parents}}
149 private void loadStaticData()
151 parents = new HashMap<>();
152 for (String[] pair : TERMS)
154 List<String> p = parents.get(pair[0]);
157 p = new ArrayList<>();
158 parents.put(pair[0], p);
165 * Answers true if 'child' isA 'parent' (including equality). In this
166 * implementation, based only on hard-coded values.
169 public boolean isA(String child, String parent)
171 if (child == null || parent == null)
175 if (child.equals(parent))
181 List<String> p = parents.get(child);
188 if (p.contains(parent))
196 * Records a valid term queried for, for reporting purposes
200 private void termFound(String term)
202 if (!termsFound.contains(term))
204 synchronized (termsFound)
206 termsFound.add(term);
212 * Records an invalid term queried for, for reporting purposes
216 private void termNotFound(String term)
218 synchronized (termsNotFound)
220 if (!termsNotFound.contains(term))
222 // suppress logging here as it reports Uniprot sequence features
223 // (which do not use SO terms) when auto-configuring feature colours
224 // System.out.println("SO term " + term
225 // + " not known - add to model if needed in "
226 // + getClass().getName());
227 termsNotFound.add(term);
233 * Sorts (case-insensitive) and returns the list of valid terms queried for
236 public List<String> termsFound()
238 synchronized (termsFound)
240 Collections.sort(termsFound, String.CASE_INSENSITIVE_ORDER);
246 * Sorts (case-insensitive) and returns the list of invalid terms queried for
249 public List<String> termsNotFound()
251 synchronized (termsNotFound)
253 Collections.sort(termsNotFound, String.CASE_INSENSITIVE_ORDER);
254 return termsNotFound;
259 public List<String> getRootParents(final String term)
262 * check in cache first
264 if (rootParents.containsKey(term))
266 return rootParents.get(term);
269 List<String> top = new ArrayList<>();
270 List<String> query = new ArrayList<>();
273 while (!query.isEmpty())
275 List<String> nextQuery = new ArrayList<>();
276 for (String q : query)
278 List<String> theParents = parents.get(q);
279 if (theParents != null)
281 if (theParents.size() == 1 && theParents.get(0).equals(q))
286 if (!top.contains(q))
293 for (String p : theParents)
306 rootParents.put(term, top);
308 return top.isEmpty() ? null : top;
312 public List<String> getParents(String term)
314 List<String> result = parents.get(term);
315 return result == null ? new ArrayList<>() : result;
319 public boolean isValidTerm(String term)
321 return parents.containsKey(term);