Merge branch 'develop' into features/JAL-3010ontologyFeatureSettings

[jalview.git] / src / jalview / io / gff / SequenceOntologyLite.java
diff --git a/src/jalview/io/gff/SequenceOntologyLite.java b/src/jalview/io/gff/SequenceOntologyLite.java

index c0ae971..2cbec36 100644 (file)
--- a/src/jalview/io/gff/SequenceOntologyLite.java
+++ b/src/jalview/io/gff/SequenceOntologyLite.java
@@ -1,5 +1,27 @@
+/*
+ * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
+ * Copyright (C) $$Year-Rel$$ The Jalview Authors
+ * 
+ * This file is part of Jalview.
+ * 
+ * Jalview is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License 
+ * as published by the Free Software Foundation, either version 3
+ * of the License, or (at your option) any later version.
+ *  
+ * Jalview is distributed in the hope that it will be useful, but 
+ * WITHOUT ANY WARRANTY; without even the implied warranty 
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR 
+ * PURPOSE.  See the GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
+ * The Jalview Authors are detailed in the 'AUTHORS' file.
+ */
  package jalview.io.gff;
  
+import jalview.datamodel.ontology.OntologyBase;
+
  import java.util.ArrayList;
  import java.util.Collections;
  import java.util.HashMap;
@@ -18,10 +40,14 @@ import java.util.Map;
   * @author gmcarstairs
   *
   */
-public class SequenceOntologyLite implements SequenceOntologyI
+public class SequenceOntologyLite extends OntologyBase
+        implements SequenceOntologyI
  {
    /*
     * initial selection of types of interest when processing Ensembl features
+   * NB unlike the full SequenceOntology, indirect child-parent relationships
+   * are not traversed here, so every sub-type (direct or indirect) that is
+   * of interest has to be listed explicitly
     */
    // @formatter:off
    private final String[][] TERMS = new String[][] {
@@ -32,31 +58,54 @@ public class SequenceOntologyLite implements SequenceOntologyI
      { "gene", "gene" }, 
      { "ncRNA_gene", "gene" }, 
      { "snRNA_gene", "gene" },
+    { "miRNA_gene", "gene" },
+    { "lincRNA_gene", "gene" },
+    { "rRNA_gene", "gene" },
      
      /*
       * transcript sub-types:
       */
      { "transcript", "transcript" }, 
      { "mature_transcript", "transcript" }, 
+    { "processed_transcript", "transcript" }, 
+    { "aberrant_processed_transcript", "transcript" },
      { "ncRNA", "transcript" },
      { "snRNA", "transcript" },
-    { "aberrant_processed_transcript", "transcript" },
+    { "miRNA", "transcript" },
+    { "lincRNA", "transcript" },
+    { "lnc_RNA", "transcript" },
+    { "rRNA", "transcript" },
+    { "mRNA", "transcript" },
+    // there are many more sub-types of ncRNA...
      
      /*
-     * sequence_variant sub-types:
+     * sequence_variant sub-types
       */
      { "sequence_variant", "sequence_variant" },
+    { "structural_variant", "sequence_variant" },
      { "feature_variant", "sequence_variant" },
+    { "upstream_gene_variant", "sequence_variant" },
      { "gene_variant", "sequence_variant" },
+    { "transcript_variant", "sequence_variant" },
+    { "non_coding_transcript_variant", "sequence_variant" },
+    { "non_coding_transcript_exon_variant", "sequence_variant" },
      // NB Ensembl uses NMD_transcript_variant as if a 'transcript'
      // but we model it here correctly as per the SO
      { "NMD_transcript_variant", "sequence_variant" },
-    { "transcript_variant", "sequence_variant" },
-    { "structural_variant", "sequence_variant" },
+    { "missense_variant", "sequence_variant" },
+    { "synonymous_variant", "sequence_variant" },
+    { "frameshift_variant", "sequence_variant" },
+    { "5_prime_UTR_variant", "sequence_variant" },
+    { "3_prime_UTR_variant", "sequence_variant" },
+    { "stop_gained", "sequence_variant" },
+    { "stop_lost", "sequence_variant" },
+    { "inframe_deletion", "sequence_variant" },
+    { "inframe_insertion", "sequence_variant" },
+    { "splice_region_variant", "sequence_variant" },
      
      /*
-     * no sub-types of exon or CDS encountered in Ensembl
-     * a few added here for testing purposes
+     * no sub-types of exon or CDS yet seen in Ensembl
+     * some added here for testing purposes
       */
      { "exon", "exon" },
      { "coding_exon", "exon" },
@@ -64,10 +113,11 @@ public class SequenceOntologyLite implements SequenceOntologyI
      { "CDS_predicted", "CDS" },
      
      /*
-     * used in exonerate GFF
+     * terms used in exonerate or PASA GFF
       */
      { "protein_match", "protein_match"},
      { "nucleotide_match", "nucleotide_match"},
+    { "cDNA_match", "nucleotide_match"},
      
      /*
       * used in InterProScan GFF
@@ -88,8 +138,8 @@ public class SequenceOntologyLite implements SequenceOntologyI
  
    public SequenceOntologyLite()
    {
-    termsFound = new ArrayList<String>();
-    termsNotFound = new ArrayList<String>();
+    termsFound = new ArrayList<>();
+    termsNotFound = new ArrayList<>();
      loadStaticData();
    }
  
@@ -98,12 +148,13 @@ public class SequenceOntologyLite implements SequenceOntologyI
     */
    private void loadStaticData()
    {
-    parents = new HashMap<String, List<String>>();
-    for (String [] pair : TERMS) {
+    parents = new HashMap<>();
+    for (String[] pair : TERMS)
+    {
        List<String> p = parents.get(pair[0]);
        if (p == null)
        {
-        p = new ArrayList<String>();
+        p = new ArrayList<>();
          parents.put(pair[0], p);
        }
        p.add(pair[1]);
@@ -168,9 +219,11 @@ public class SequenceOntologyLite implements SequenceOntologyI
      {
        if (!termsNotFound.contains(term))
        {
-        System.out.println("SO term " + term
-                + " not known - either invalid or needs modelled in "
-                + getClass().getName());
+        // suppress logging here as it reports Uniprot sequence features
+        // (which do not use SO terms) when auto-configuring feature colours
+        // System.out.println("SO term " + term
+        // + " not known - add to model if needed in "
+        // + getClass().getName());
          termsNotFound.add(term);
        }
      }
@@ -201,4 +254,70 @@ public class SequenceOntologyLite implements SequenceOntologyI
        return termsNotFound;
      }
    }
+
+  /**
+   * Answers the top-level term(s) found by following parent links upwards
+   * from the given term, or null if there are none. A term is treated as
+   * top-level if its only parent is itself. Terms with no entry in the
+   * parents map are ignored.
+   * <p>
+   * Results are cached in rootParents. A cache hit gives the same answer as
+   * the first lookup did, that is, null (not an empty list) if no root parent
+   * was found.
+   * 
+   * @param term
+   *          the term to start from
+   * @return the top-level terms reached, or null if none
+   */
+  @Override
+  public List<String> getRootParents(final String term)
+  {
+    /*
+     * check in cache first; 'none found' is cached as an empty list,
+     * but is always reported as null
+     */
+    if (rootParents.containsKey(term))
+    {
+      List<String> cached = rootParents.get(term);
+      return cached == null || cached.isEmpty() ? null : cached;
+    }
+
+    List<String> top = new ArrayList<>();
+    List<String> query = new ArrayList<>();
+    query.add(term);
+
+    /*
+     * walk up one level of parents per iteration; NB this ends only
+     * when no parents remain to follow, so assumes the parent links
+     * contain no cycles (other than a top-level term's link to itself)
+     */
+    while (!query.isEmpty())
+    {
+      List<String> nextQuery = new ArrayList<>();
+      for (String q : query)
+      {
+        List<String> theParents = parents.get(q);
+        if (theParents != null)
+        {
+          if (theParents.size() == 1 && theParents.get(0).equals(q))
+          {
+            /*
+             * top-level term
+             */
+            if (!top.contains(q))
+            {
+              top.add(q);
+            }
+          }
+          else
+          {
+            for (String p : theParents)
+            {
+              if (!p.equals(q))
+              {
+                nextQuery.add(p);
+              }
+            }
+          }
+        }
+      }
+      query = nextQuery;
+    }
+
+    rootParents.put(term, top);
+
+    return top.isEmpty() ? null : top;
+  }
+
+  /**
+   * Answers the list of parents held for the given term, or a new empty
+   * list if there is none (never null)
+   * 
+   * @param term
+   *          the term to look up
+   * @return the parents of the term, or an empty list
+   */
+  @Override
+  public List<String> getParents(String term)
+  {
+    final List<String> known = parents.get(term);
+    if (known != null)
+    {
+      return known;
+    }
+    return new ArrayList<>();
+  }
+
+  /**
+   * Answers true if the given term is a key of the parents map, else false
+   * 
+   * @param term
+   *          the term to check
+   * @return true if the term has an entry in the parents map
+   */
+  @Override
+  public boolean isValidTerm(String term)
+  {
+    final boolean known = parents.containsKey(term);
+    return known;
+  }
  }