src/jalview/io/gff/SequenceOntologyLite.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io.gff;
  22
  23 import jalview.datamodel.ontology.OntologyBase;
  24
  25 import java.util.ArrayList;
  26 import java.util.Collections;
  27 import java.util.HashMap;
  28 import java.util.List;
  29 import java.util.Map;
  30
  31 /**
  32  * An implementation of SequenceOntologyI that hard codes terms of interest.
  33  *
  34  * Use this in unit testing by calling SequenceOntology.setInstance(new
  35  * SequenceOntologyLite()).
  36  *
  37  * May also become a stand-in for SequenceOntology in the applet if we want to
  38  * avoid the additional jars needed for parsing the full SO.
  39  *
  40  * @author gmcarstairs
  41  *
  42  */
  43 public class SequenceOntologyLite extends OntologyBase
  44         implements SequenceOntologyI
  45 {
  46   /*
  47    * initial selection of types of interest when processing Ensembl features
  48    * NB unlike the full SequenceOntology we don't traverse indirect
  49    * child-parent relationships here so e.g. need to list every sub-type
  50    * (direct or indirect) that is of interest
  51    */
  52   // @formatter:off
  53   private final String[][] TERMS = new String[][] {
  54
  55     /*
  56      * gene sub-types:
  57      */
  58     { "gene", "gene" },
  59     { "ncRNA_gene", "gene" },
  60     { "snRNA_gene", "gene" },
  61     { "miRNA_gene", "gene" },
  62     { "lincRNA_gene", "gene" },
  63     { "rRNA_gene", "gene" },
  64
  65     /*
  66      * transcript sub-types:
  67      */
  68     { "transcript", "transcript" },
  69     { "mature_transcript", "transcript" },
  70     { "processed_transcript", "transcript" },
  71     { "aberrant_processed_transcript", "transcript" },
  72     { "ncRNA", "transcript" },
  73     { "snRNA", "transcript" },
  74     { "miRNA", "transcript" },
  75     { "lincRNA", "transcript" },
  76     { "lnc_RNA", "transcript" },
  77     { "rRNA", "transcript" },
  78     { "mRNA", "transcript" },
  79     // there are many more sub-types of ncRNA...
  80
  81     /*
  82      * sequence_variant sub-types
  83      */
  84     { "sequence_variant", "sequence_variant" },
  85     { "structural_variant", "sequence_variant" },
  86     { "feature_variant", "sequence_variant" },
  87     { "upstream_gene_variant", "sequence_variant" },
  88     { "gene_variant", "sequence_variant" },
  89     { "transcript_variant", "sequence_variant" },
  90     { "non_coding_transcript_variant", "sequence_variant" },
  91     { "non_coding_transcript_exon_variant", "sequence_variant" },
  92     // NB Ensembl uses NMD_transcript_variant as if a 'transcript'
  93     // but we model it here correctly as per the SO
  94     { "NMD_transcript_variant", "sequence_variant" },
  95     { "missense_variant", "sequence_variant" },
  96     { "synonymous_variant", "sequence_variant" },
  97     { "frameshift_variant", "sequence_variant" },
  98     { "5_prime_UTR_variant", "sequence_variant" },
  99     { "3_prime_UTR_variant", "sequence_variant" },
 100     { "stop_gained", "sequence_variant" },
 101     { "stop_lost", "sequence_variant" },
 102     { "inframe_deletion", "sequence_variant" },
 103     { "inframe_insertion", "sequence_variant" },
 104     { "splice_region_variant", "sequence_variant" },
 105
 106     /*
 107      * no sub-types of exon or CDS yet seen in Ensembl
 108      * some added here for testing purposes
 109      */
 110     { "exon", "exon" },
 111     { "coding_exon", "exon" },
 112     { "CDS", "CDS" },
 113     { "CDS_predicted", "CDS" },
 114
 115     /*
 116      * terms used in exonerate or PASA GFF
 117      */
 118     { "protein_match", "protein_match"},
 119     { "nucleotide_match", "nucleotide_match"},
 120     { "cDNA_match", "nucleotide_match"},
 121
 122     /*
 123      * used in InterProScan GFF
 124      */
 125     { "polypeptide", "polypeptide" }
 126   };
 127   // @formatter:on
 128
 129   /*
 130    * hard-coded list of any parents (direct or indirect)
 131    * that we care about for a term
 132    */
 133   private Map<String, List<String>> parents;
 134
 135   private List<String> termsFound;
 136
 137   private List<String> termsNotFound;
 138
 139   public SequenceOntologyLite()
 140   {
 141     termsFound = new ArrayList<>();
 142     termsNotFound = new ArrayList<>();
 143     loadStaticData();
 144   }
 145
 146   /**
 147    * Loads hard-coded data into a lookup table of {term, {list_of_parents}}
 148    */
 149   private void loadStaticData()
 150   {
 151     parents = new HashMap<>();
 152     for (String[] pair : TERMS)
 153     {
 154       List<String> p = parents.get(pair[0]);
 155       if (p == null)
 156       {
 157         p = new ArrayList<>();
 158         parents.put(pair[0], p);
 159       }
 160       p.add(pair[1]);
 161     }
 162   }
 163
 164   /**
 165    * Answers true if 'child' isA 'parent' (including equality). In this
 166    * implementation, based only on hard-coded values.
 167    */
 168   @Override
 169   public boolean isA(String child, String parent)
 170   {
 171     if (child == null || parent == null)
 172     {
 173       return false;
 174     }
 175     if (child.equals(parent))
 176     {
 177       termFound(child);
 178       return true;
 179     }
 180
 181     List<String> p = parents.get(child);
 182     if (p == null)
 183     {
 184       termNotFound(child);
 185       return false;
 186     }
 187     termFound(child);
 188     if (p.contains(parent))
 189     {
 190       return true;
 191     }
 192     return false;
 193   }
 194
 195   /**
 196    * Records a valid term queried for, for reporting purposes
 197    *
 198    * @param term
 199    */
 200   private void termFound(String term)
 201   {
 202     if (!termsFound.contains(term))
 203     {
 204       synchronized (termsFound)
 205       {
 206         termsFound.add(term);
 207       }
 208     }
 209   }
 210
 211   /**
 212    * Records an invalid term queried for, for reporting purposes
 213    *
 214    * @param term
 215    */
 216   private void termNotFound(String term)
 217   {
 218     synchronized (termsNotFound)
 219     {
 220       if (!termsNotFound.contains(term))
 221       {
 222         // suppress logging here as it reports Uniprot sequence features
 223         // (which do not use SO terms) when auto-configuring feature colours
 224         // System.out.println("SO term " + term
 225         // + " not known - add to model if needed in "
 226         // + getClass().getName());
 227         termsNotFound.add(term);
 228       }
 229     }
 230   }
 231
 232   /**
 233    * Sorts (case-insensitive) and returns the list of valid terms queried for
 234    */
 235   @Override
 236   public List<String> termsFound()
 237   {
 238     synchronized (termsFound)
 239     {
 240       Collections.sort(termsFound, String.CASE_INSENSITIVE_ORDER);
 241       return termsFound;
 242     }
 243   }
 244
 245   /**
 246    * Sorts (case-insensitive) and returns the list of invalid terms queried for
 247    */
 248   @Override
 249   public List<String> termsNotFound()
 250   {
 251     synchronized (termsNotFound)
 252     {
 253       Collections.sort(termsNotFound, String.CASE_INSENSITIVE_ORDER);
 254       return termsNotFound;
 255     }
 256   }
 257
 258   @Override
 259   public List<String> getRootParents(final String term)
 260   {
 261     /*
 262      * check in cache first
 263      */
 264     if (rootParents.containsKey(term))
 265     {
 266       return rootParents.get(term);
 267     }
 268
 269     List<String> top = new ArrayList<>();
 270     List<String> query = new ArrayList<>();
 271     query.add(term);
 272
 273     while (!query.isEmpty())
 274     {
 275       List<String> nextQuery = new ArrayList<>();
 276       for (String q : query)
 277       {
 278         List<String> theParents = parents.get(q);
 279         if (theParents != null)
 280         {
 281           if (theParents.size() == 1 && theParents.get(0).equals(q))
 282           {
 283             /*
 284              * top-level term
 285              */
 286             if (!top.contains(q))
 287             {
 288               top.add(q);
 289             }
 290           }
 291           else
 292           {
 293             for (String p : theParents)
 294             {
 295               if (!p.equals(q))
 296               {
 297                 nextQuery.add(p);
 298               }
 299             }
 300           }
 301         }
 302       }
 303       query = nextQuery;
 304     }
 305
 306     rootParents.put(term, top);
 307
 308     return top.isEmpty() ? null : top;
 309   }
 310
 311   @Override
 312   public List<String> getParents(String term)
 313   {
 314     List<String> result = parents.get(term);
 315     return result == null ? new ArrayList<>() : result;
 316   }
 317
 318   @Override
 319   public boolean isValidTerm(String term)
 320   {
 321     return parents.containsKey(term);
 322   }
 323 }