mungo merge
[jalview.git] / src / jalview / io / gff / SequenceOntologyLite.java
diff --git a/src/jalview/io/gff/SequenceOntologyLite.java b/src/jalview/io/gff/SequenceOntologyLite.java
new file mode 100644 (file)
index 0000000..b3f8161
--- /dev/null
@@ -0,0 +1,216 @@
+package jalview.io.gff;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * An implementation of SequenceOntologyI that hard codes terms of interest.
+ *
+ * Use this in unit testing by calling SequenceOntology.setInstance(new
+ * SequenceOntologyLite()).
+ * 
+ * May also become a stand-in for SequenceOntology in the applet if we want to
+ * avoid the additional jars needed for parsing the full SO.
+ * 
+ * @author gmcarstairs
+ *
+ */
+public class SequenceOntologyLite implements SequenceOntologyI
+{
+  /*
+   * initial selection of types of interest when processing Ensembl features
+   * NB unlike the full SequenceOntology we don't traverse indirect
+   * child-parent relationships here so e.g. need to list every sub-type
+   * of gene (direct or indirect) that is of interest
+   */
+  // @formatter:off
+  private final String[][] TERMS = new String[][] {
+
+    /*
+     * gene sub-types:
+     */
+    { "gene", "gene" }, 
+    { "ncRNA_gene", "gene" }, 
+    { "snRNA_gene", "gene" },
+    { "miRNA_gene", "gene" },
+    { "lincRNA_gene", "gene" },
+    { "rRNA_gene", "gene" },
+    
+    /*
+     * transcript sub-types:
+     */
+    { "transcript", "transcript" }, 
+    { "mature_transcript", "transcript" }, 
+    { "processed_transcript", "transcript" }, 
+    { "aberrant_processed_transcript", "transcript" },
+    { "ncRNA", "transcript" },
+    { "snRNA", "transcript" },
+    { "miRNA", "transcript" },
+    { "lincRNA", "transcript" },
+    { "rRNA", "transcript" },
+    // there are many more sub-types of ncRNA...
+    
+    /*
+     * sequence_variant sub-types:
+     */
+    { "sequence_variant", "sequence_variant" },
+    { "feature_variant", "sequence_variant" },
+    { "gene_variant", "sequence_variant" },
+    // NB Ensembl uses NMD_transcript_variant as if a 'transcript'
+    // but we model it here correctly as per the SO
+    { "NMD_transcript_variant", "sequence_variant" },
+    { "transcript_variant", "sequence_variant" },
+    { "structural_variant", "sequence_variant" },
+    
+    /*
+     * no sub-types of exon or CDS yet seen in Ensembl
+     * some added here for testing purposes
+     */
+    { "exon", "exon" },
+    { "coding_exon", "exon" },
+    { "CDS", "CDS" },
+    { "CDS_predicted", "CDS" },
+    
+    /*
+     * terms used in exonerate or PASA GFF
+     */
+    { "protein_match", "protein_match"},
+    { "nucleotide_match", "nucleotide_match"},
+    { "cDNA_match", "nucleotide_match"},
+    
+    /*
+     * used in InterProScan GFF
+     */
+    { "polypeptide", "polypeptide" }
+  };
+  // @formatter:on
+
+  /*
+   * hard-coded list of any parents (direct or indirect) 
+   * that we care about for a term
+   */
+  private Map<String, List<String>> parents;
+
+  private List<String> termsFound;
+
+  private List<String> termsNotFound;
+
+  public SequenceOntologyLite()
+  {
+    termsFound = new ArrayList<String>();
+    termsNotFound = new ArrayList<String>();
+    loadStaticData();
+  }
+
+  /**
+   * Loads hard-coded data into a lookup table of {term, {list_of_parents}}
+   */
+  private void loadStaticData()
+  {
+    parents = new HashMap<String, List<String>>();
+    for (String [] pair : TERMS) {
+      List<String> p = parents.get(pair[0]);
+      if (p == null)
+      {
+        p = new ArrayList<String>();
+        parents.put(pair[0], p);
+      }
+      p.add(pair[1]);
+    }
+  }
+
+  /**
+   * Answers true if 'child' isA 'parent' (including equality). In this
+   * implementation, based only on hard-coded values.
+   */
+  @Override
+  public boolean isA(String child, String parent)
+  {
+    if (child == null || parent == null)
+    {
+      return false;
+    }
+    if (child.equals(parent))
+    {
+      termFound(child);
+      return true;
+    }
+
+    List<String> p = parents.get(child);
+    if (p == null)
+    {
+      termNotFound(child);
+      return false;
+    }
+    termFound(child);
+    if (p.contains(parent))
+    {
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * Records a valid term queried for, for reporting purposes
+   * 
+   * @param term
+   */
+  private void termFound(String term)
+  {
+    if (!termsFound.contains(term))
+    {
+      synchronized (termsFound)
+      {
+        termsFound.add(term);
+      }
+    }
+  }
+
+  /**
+   * Records an invalid term queried for, for reporting purposes
+   * 
+   * @param term
+   */
+  private void termNotFound(String term)
+  {
+    synchronized (termsNotFound)
+    {
+      if (!termsNotFound.contains(term))
+      {
+        System.out.println("SO term " + term
+                + " not known - may be invalid, or model if needed in "
+                + getClass().getName());
+        termsNotFound.add(term);
+      }
+    }
+  }
+
+  /**
+   * Sorts (case-insensitive) and returns the list of valid terms queried for
+   */
+  @Override
+  public List<String> termsFound()
+  {
+    synchronized (termsFound)
+    {
+      Collections.sort(termsFound, String.CASE_INSENSITIVE_ORDER);
+      return termsFound;
+    }
+  }
+
+  /**
+   * Sorts (case-insensitive) and returns the list of invalid terms queried for
+   */
+  @Override
+  public List<String> termsNotFound()
+  {
+    synchronized (termsNotFound)
+    {
+      Collections.sort(termsNotFound, String.CASE_INSENSITIVE_ORDER);
+      return termsNotFound;
+    }
+  }
+}