JAL-2049 separate protein variant per dna variant (combinations tbd)

[jalview.git] / src / jalview / io / gff / SequenceOntologyLite.java
diff --git a/src/jalview/io/gff/SequenceOntologyLite.java b/src/jalview/io/gff/SequenceOntologyLite.java

index 173dea6..b3f8161 100644 (file)
--- a/src/jalview/io/gff/SequenceOntologyLite.java
+++ b/src/jalview/io/gff/SequenceOntologyLite.java
@@ -22,6 +22,9 @@ public class SequenceOntologyLite implements SequenceOntologyI
  {
    /*
     * initial selection of types of interest when processing Ensembl features
+   * NB unlike the full SequenceOntology, indirect child-parent
+   * relationships are not traversed here, so every sub-type (direct or
+   * indirect) of e.g. gene that is of interest must be listed explicitly
     */
    // @formatter:off
    private final String[][] TERMS = new String[][] {
@@ -32,15 +35,23 @@ public class SequenceOntologyLite implements SequenceOntologyI
      { "gene", "gene" }, 
      { "ncRNA_gene", "gene" }, 
      { "snRNA_gene", "gene" },
+    { "miRNA_gene", "gene" },
+    { "lincRNA_gene", "gene" },
+    { "rRNA_gene", "gene" },
      
      /*
       * transcript sub-types:
       */
      { "transcript", "transcript" }, 
      { "mature_transcript", "transcript" }, 
+    { "processed_transcript", "transcript" }, 
+    { "aberrant_processed_transcript", "transcript" },
      { "ncRNA", "transcript" },
      { "snRNA", "transcript" },
-    { "aberrant_processed_transcript", "transcript" },
+    { "miRNA", "transcript" },
+    { "lincRNA", "transcript" },
+    { "rRNA", "transcript" },
+    // there are many more sub-types of ncRNA...
      
      /*
       * sequence_variant sub-types:
@@ -55,10 +66,25 @@ public class SequenceOntologyLite implements SequenceOntologyI
      { "structural_variant", "sequence_variant" },
      
      /*
-     * no sub-types of exon or CDS yet encountered; add if needed
+     * no sub-types of exon or CDS have yet been seen in Ensembl;
+     * some are added here for testing purposes
       */
      { "exon", "exon" },
-    { "CDS", "CDS" }
+    { "coding_exon", "exon" },
+    { "CDS", "CDS" },
+    { "CDS_predicted", "CDS" },
+    
+    /*
+     * terms used in exonerate or PASA GFF
+     */
+    { "protein_match", "protein_match"},
+    { "nucleotide_match", "nucleotide_match"},
+    { "cDNA_match", "nucleotide_match"},
+    
+    /*
+     * used in InterProScan GFF
+     */
+    { "polypeptide", "polypeptide" }
    };
    // @formatter:on
  
@@ -155,7 +181,7 @@ public class SequenceOntologyLite implements SequenceOntologyI
        if (!termsNotFound.contains(term))
        {
          System.out.println("SO term " + term
-                + " not known - either invalid or needs modelled in "
+                + " not known - may be invalid, or model if needed in "
                  + getClass().getName());
          termsNotFound.add(term);
        }