src/jalview/io/gff/SequenceOntologyLite.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io.gff;
  22
  23 import java.util.ArrayList;
  24 import java.util.Collections;
  25 import java.util.HashMap;
  26 import java.util.List;
  27 import java.util.Map;
  28
  29 /**
  30  * An implementation of SequenceOntologyI that hard codes terms of interest.
  31  *
  32  * Use this in unit testing by calling SequenceOntology.setInstance(new
  33  * SequenceOntologyLite()).
  34  *
  35  * May also become a stand-in for SequenceOntology in the applet if we want to
  36  * avoid the additional jars needed for parsing the full SO.
  37  *
  38  * @author gmcarstairs
  39  *
  40  */
  41 public class SequenceOntologyLite implements SequenceOntologyI
  42 {
  43   /*
  44    * initial selection of types of interest when processing Ensembl features
  45    * NB unlike the full SequenceOntology we don't traverse indirect
  46    * child-parent relationships here so e.g. need to list every sub-type
  47    * (direct or indirect) that is of interest
  48    */
  49   // @formatter:off
  50   private final String[][] TERMS = new String[][] {
  51
  52     /*
  53      * gene sub-types:
  54      */
  55     { "gene", "gene" },
  56     { "ncRNA_gene", "gene" },
  57     { "snRNA_gene", "gene" },
  58     { "miRNA_gene", "gene" },
  59     { "lincRNA_gene", "gene" },
  60     { "rRNA_gene", "gene" },
  61
  62     /*
  63      * transcript sub-types:
  64      */
  65     { "transcript", "transcript" },
  66     { "mature_transcript", "transcript" },
  67     { "processed_transcript", "transcript" },
  68     { "aberrant_processed_transcript", "transcript" },
  69     { "ncRNA", "transcript" },
  70     { "snRNA", "transcript" },
  71     { "miRNA", "transcript" },
  72     { "lincRNA", "transcript" },
  73     { "rRNA", "transcript" },
  74     { "mRNA", "transcript" },
  75     // there are many more sub-types of ncRNA...
  76
  77     /*
  78      * sequence_variant sub-types
  79      */
  80     { "sequence_variant", "sequence_variant" },
  81     { "structural_variant", "sequence_variant" },
  82     { "feature_variant", "sequence_variant" },
  83     { "gene_variant", "sequence_variant" },
  84     { "transcript_variant", "sequence_variant" },
  85     // NB Ensembl uses NMD_transcript_variant as if a 'transcript'
  86     // but we model it here correctly as per the SO
  87     { "NMD_transcript_variant", "sequence_variant" },
  88     { "missense_variant", "sequence_variant" },
  89     { "synonymous_variant", "sequence_variant" },
  90     { "frameshift_variant", "sequence_variant" },
  91     { "5_prime_UTR_variant", "sequence_variant" },
  92     { "3_prime_UTR_variant", "sequence_variant" },
  93     { "stop_gained", "sequence_variant" },
  94     { "stop_lost", "sequence_variant" },
  95     { "inframe_deletion", "sequence_variant" },
  96     { "inframe_insertion", "sequence_variant" },
  97     { "splice_region_variant", "sequence_variant" },
  98
  99     /*
 100      * no sub-types of exon or CDS yet seen in Ensembl
 101      * some added here for testing purposes
 102      */
 103     { "exon", "exon" },
 104     { "coding_exon", "exon" },
 105     { "CDS", "CDS" },
 106     { "CDS_predicted", "CDS" },
 107
 108     /*
 109      * terms used in exonerate or PASA GFF
 110      */
 111     { "protein_match", "protein_match"},
 112     { "nucleotide_match", "nucleotide_match"},
 113     { "cDNA_match", "nucleotide_match"},
 114
 115     /*
 116      * used in InterProScan GFF
 117      */
 118     { "polypeptide", "polypeptide" }
 119   };
 120   // @formatter:on
 121
 122   /*
 123    * hard-coded list of any parents (direct or indirect)
 124    * that we care about for a term
 125    */
 126   private Map<String, List<String>> parents;
 127
 128   private List<String> termsFound;
 129
 130   private List<String> termsNotFound;
 131
 132   public SequenceOntologyLite()
 133   {
 134     termsFound = new ArrayList<>();
 135     termsNotFound = new ArrayList<>();
 136     loadStaticData();
 137   }
 138
 139   /**
 140    * Loads hard-coded data into a lookup table of {term, {list_of_parents}}
 141    */
 142   private void loadStaticData()
 143   {
 144     parents = new HashMap<>();
 145     for (String[] pair : TERMS)
 146     {
 147       List<String> p = parents.get(pair[0]);
 148       if (p == null)
 149       {
 150         p = new ArrayList<>();
 151         parents.put(pair[0], p);
 152       }
 153       p.add(pair[1]);
 154     }
 155   }
 156
 157   /**
 158    * Answers true if 'child' isA 'parent' (including equality). In this
 159    * implementation, based only on hard-coded values.
 160    */
 161   @Override
 162   public boolean isA(String child, String parent)
 163   {
 164     if (child == null || parent == null)
 165     {
 166       return false;
 167     }
 168     if (child.equals(parent))
 169     {
 170       termFound(child);
 171       return true;
 172     }
 173
 174     List<String> p = parents.get(child);
 175     if (p == null)
 176     {
 177       termNotFound(child);
 178       return false;
 179     }
 180     termFound(child);
 181     if (p.contains(parent))
 182     {
 183       return true;
 184     }
 185     return false;
 186   }
 187
 188   /**
 189    * Records a valid term queried for, for reporting purposes
 190    *
 191    * @param term
 192    */
 193   private void termFound(String term)
 194   {
 195     if (!termsFound.contains(term))
 196     {
 197       synchronized (termsFound)
 198       {
 199         termsFound.add(term);
 200       }
 201     }
 202   }
 203
 204   /**
 205    * Records an invalid term queried for, for reporting purposes
 206    *
 207    * @param term
 208    */
 209   private void termNotFound(String term)
 210   {
 211     synchronized (termsNotFound)
 212     {
 213       if (!termsNotFound.contains(term))
 214       {
 215         // suppress logging here as it reports Uniprot sequence features
 216         // (which do not use SO terms) when auto-configuring feature colours
 217         // System.out.println("SO term " + term
 218         // + " not known - add to model if needed in "
 219         // + getClass().getName());
 220         termsNotFound.add(term);
 221       }
 222     }
 223   }
 224
 225   /**
 226    * Sorts (case-insensitive) and returns the list of valid terms queried for
 227    */
 228   @Override
 229   public List<String> termsFound()
 230   {
 231     synchronized (termsFound)
 232     {
 233       Collections.sort(termsFound, String.CASE_INSENSITIVE_ORDER);
 234       return termsFound;
 235     }
 236   }
 237
 238   /**
 239    * Sorts (case-insensitive) and returns the list of invalid terms queried for
 240    */
 241   @Override
 242   public List<String> termsNotFound()
 243   {
 244     synchronized (termsNotFound)
 245     {
 246       Collections.sort(termsNotFound, String.CASE_INSENSITIVE_ORDER);
 247       return termsNotFound;
 248     }
 249   }
 250 }