+/*
+ * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
+ * Copyright (C) $$Year-Rel$$ The Jalview Authors
+ *
+ * This file is part of Jalview.
+ *
+ * Jalview is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, either version 3
+ * of the License, or (at your option) any later version.
+ *
+ * Jalview is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
+ * The Jalview Authors are detailed in the 'AUTHORS' file.
+ */
package jalview.io.gff;
+import jalview.datamodel.ontology.OntologyBase;
+
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
* @author gmcarstairs
*
*/
-public class SequenceOntologyLite implements SequenceOntologyI
+public class SequenceOntologyLite extends OntologyBase
+ implements SequenceOntologyI
{
/*
* initial selection of types of interest when processing Ensembl features
+ * NB unlike the full SequenceOntology we don't traverse indirect
+ * child-parent relationships here so e.g. need to list every sub-type
+ * (direct or indirect) that is of interest
*/
// @formatter:off
private final String[][] TERMS = new String[][] {
{ "gene", "gene" },
{ "ncRNA_gene", "gene" },
{ "snRNA_gene", "gene" },
+ { "miRNA_gene", "gene" },
+ { "lincRNA_gene", "gene" },
+ { "rRNA_gene", "gene" },
/*
* transcript sub-types:
*/
{ "transcript", "transcript" },
{ "mature_transcript", "transcript" },
+ { "processed_transcript", "transcript" },
+ { "aberrant_processed_transcript", "transcript" },
{ "ncRNA", "transcript" },
{ "snRNA", "transcript" },
- { "aberrant_processed_transcript", "transcript" },
+ { "miRNA", "transcript" },
+ { "lincRNA", "transcript" },
+ { "lnc_RNA", "transcript" },
+ { "rRNA", "transcript" },
+ { "mRNA", "transcript" },
+ // there are many more sub-types of ncRNA...
/*
- * sequence_variant sub-types:
+ * sequence_variant sub-types
*/
{ "sequence_variant", "sequence_variant" },
+ { "structural_variant", "sequence_variant" },
{ "feature_variant", "sequence_variant" },
+ { "upstream_gene_variant", "sequence_variant" },
{ "gene_variant", "sequence_variant" },
+ { "transcript_variant", "sequence_variant" },
+ { "non_coding_transcript_variant", "sequence_variant" },
+ { "non_coding_transcript_exon_variant", "sequence_variant" },
// NB Ensembl uses NMD_transcript_variant as if a 'transcript'
// but we model it here correctly as per the SO
{ "NMD_transcript_variant", "sequence_variant" },
- { "transcript_variant", "sequence_variant" },
- { "structural_variant", "sequence_variant" },
+ { "missense_variant", "sequence_variant" },
+ { "synonymous_variant", "sequence_variant" },
+ { "frameshift_variant", "sequence_variant" },
+ { "5_prime_UTR_variant", "sequence_variant" },
+ { "3_prime_UTR_variant", "sequence_variant" },
+ { "stop_gained", "sequence_variant" },
+ { "stop_lost", "sequence_variant" },
+ { "inframe_deletion", "sequence_variant" },
+ { "inframe_insertion", "sequence_variant" },
+ { "splice_region_variant", "sequence_variant" },
/*
- * no sub-types of exon or CDS encountered in Ensembl
- * a few added here for testing purposes
+ * no sub-types of exon or CDS yet seen in Ensembl
+ * some added here for testing purposes
*/
{ "exon", "exon" },
{ "coding_exon", "exon" },
{ "CDS_predicted", "CDS" },
/*
- * used in exonerate GFF
+ * terms used in exonerate or PASA GFF
*/
{ "protein_match", "protein_match"},
{ "nucleotide_match", "nucleotide_match"},
+ { "cDNA_match", "nucleotide_match"},
/*
* used in InterProScan GFF
public SequenceOntologyLite()
{
- termsFound = new ArrayList<String>();
- termsNotFound = new ArrayList<String>();
+ termsFound = new ArrayList<>();
+ termsNotFound = new ArrayList<>(); // terms that were looked up but not known are added here, once each
loadStaticData(); // builds the term -> parents lookup from the TERMS table
}
*/
private void loadStaticData()
{
- parents = new HashMap<String, List<String>>();
- for (String [] pair : TERMS) {
+ parents = new HashMap<>();
+ for (String[] pair : TERMS)
+ {
List<String> p = parents.get(pair[0]);
if (p == null)
{
- p = new ArrayList<String>();
+ p = new ArrayList<>();
parents.put(pair[0], p);
}
p.add(pair[1]);
{
if (!termsNotFound.contains(term))
{
- System.out.println("SO term " + term
- + " not known - either invalid or needs modelled in "
- + getClass().getName());
+ // suppress logging here as it reports Uniprot sequence features
+ // (which do not use SO terms) when auto-configuring feature colours
+ // System.out.println("SO term " + term
+ // + " not known - add to model if needed in "
+ // + getClass().getName());
termsNotFound.add(term);
}
}
return termsNotFound;
}
}
+
+  /**
+   * Returns the top-level ancestors of the given term, found by following
+   * the links held in the {@code parents} lookup until terms whose only
+   * parent is themselves are reached. The result is cached per term.
+   * <p>
+   * Returns null - on the first and on every later call alike - if the term
+   * is not known, or if no top-level term can be reached from it.
+   * 
+   * @param term
+   *          the term whose root parent(s) are wanted
+   * @return the root terms (without duplicates), or null if there are none
+   */
+  @Override
+  public List<String> getRootParents(final String term)
+  {
+    /*
+     * check in cache first; a cached empty list records an earlier
+     * unsuccessful search, and is reported as null just as it was then
+     */
+    if (rootParents.containsKey(term))
+    {
+      List<String> cached = rootParents.get(term);
+      return (cached == null || cached.isEmpty()) ? null : cached;
+    }
+
+    List<String> top = new ArrayList<>();
+    List<String> query = new ArrayList<>();
+    query.add(term);
+
+    /*
+     * terms already expanded; guards against looping forever should the
+     * parent relationships ever contain a cycle, and avoids repeating work
+     * for a term reachable by more than one route
+     */
+    List<String> visited = new ArrayList<>();
+
+    while (!query.isEmpty())
+    {
+      List<String> nextQuery = new ArrayList<>();
+      for (String q : query)
+      {
+        if (visited.contains(q))
+        {
+          continue;
+        }
+        visited.add(q);
+
+        List<String> theParents = parents.get(q);
+        if (theParents == null)
+        {
+          // unknown term: nothing to follow
+          continue;
+        }
+
+        if (theParents.size() == 1 && theParents.get(0).equals(q))
+        {
+          /*
+           * top-level term
+           */
+          if (!top.contains(q))
+          {
+            top.add(q);
+          }
+        }
+        else
+        {
+          /*
+           * move up one level, ignoring any self-reference
+           */
+          for (String p : theParents)
+          {
+            if (!p.equals(q))
+            {
+              nextQuery.add(p);
+            }
+          }
+        }
+      }
+      query = nextQuery;
+    }
+
+    rootParents.put(term, top);
+
+    return top.isEmpty() ? null : top;
+  }
+
+  /**
+   * Returns the parent terms recorded for the given term in the
+   * {@code parents} lookup, or a new empty list if the term has no entry.
+   * 
+   * @param term
+   *          the term to look up
+   * @return the term's parents; never null
+   */
+  @Override
+  public List<String> getParents(String term)
+  {
+    final List<String> known = parents.get(term);
+    if (known != null)
+    {
+      return known;
+    }
+    return new ArrayList<>();
+  }
+
+  /**
+   * Answers true if the given term has an entry in the {@code parents}
+   * lookup, else false.
+   * 
+   * @param term
+   *          the term to check
+   * @return true if the term is a key of the parents lookup
+   */
+  @Override
+  public boolean isValidTerm(String term)
+  {
+    final boolean known = parents.containsKey(term);
+    return known;
+  }
}