src/jalview/io/gff/SequenceOntologyLite.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io.gff;
  22
  23 import jalview.datamodel.ontology.OntologyBase;
  24
  25 import java.util.ArrayList;
  26 import java.util.Collections;
  27 import java.util.HashMap;
  28 import java.util.List;
  29 import java.util.Map;
  30
  31 /**
  32  * An implementation of SequenceOntologyI that hard codes terms of interest.
  33  *
  34  * Use this in unit testing by calling SequenceOntology.setInstance(new
  35  * SequenceOntologyLite()).
  36  *
  37  * May also become a stand-in for SequenceOntology in the applet if we want to
  38  * avoid the additional jars needed for parsing the full SO.
  39  *
  40  * @author gmcarstairs
  41  *
  42  */
  43 public class SequenceOntologyLite extends OntologyBase
  44         implements SequenceOntologyI
  45 {
  46   /*
  47    * initial selection of types of interest when processing Ensembl features
  48    * NB unlike the full SequenceOntology we don't traverse indirect
  49    * child-parent relationships here so e.g. need to list every sub-type
  50    * (direct or indirect) that is of interest
  51    */
  52   // @formatter:off
  53   private final String[][] TERMS = new String[][] {
  54
  55     /*
  56      * gene sub-types:
  57      */
  58     { "gene", "gene" },
  59     { "ncRNA_gene", "gene" },
  60     { "snRNA_gene", "gene" },
  61     { "miRNA_gene", "gene" },
  62     { "lincRNA_gene", "gene" },
  63     { "rRNA_gene", "gene" },
  64
  65     /*
  66      * transcript sub-types:
  67      */
  68     { "transcript", "transcript" },
  69     { "mature_transcript", "transcript" },
  70     { "processed_transcript", "transcript" },
  71     { "aberrant_processed_transcript", "transcript" },
  72     { "ncRNA", "transcript" },
  73     { "snRNA", "transcript" },
  74     { "miRNA", "transcript" },
  75     { "lincRNA", "transcript" },
  76     { "rRNA", "transcript" },
  77     { "mRNA", "transcript" },
  78     // there are many more sub-types of ncRNA...
  79
  80     /*
  81      * sequence_variant sub-types
  82      */
  83     { "sequence_variant", "sequence_variant" },
  84     { "structural_variant", "sequence_variant" },
  85     { "feature_variant", "sequence_variant" },
  86     { "upstream_gene_variant", "sequence_variant" },
  87     { "gene_variant", "sequence_variant" },
  88     { "transcript_variant", "sequence_variant" },
  89     { "non_coding_transcript_variant", "sequence_variant" },
  90     { "non_coding_transcript_exon_variant", "sequence_variant" },
  91     // NB Ensembl uses NMD_transcript_variant as if a 'transcript'
  92     // but we model it here correctly as per the SO
  93     { "NMD_transcript_variant", "sequence_variant" },
  94     { "missense_variant", "sequence_variant" },
  95     { "synonymous_variant", "sequence_variant" },
  96     { "frameshift_variant", "sequence_variant" },
  97     { "5_prime_UTR_variant", "sequence_variant" },
  98     { "3_prime_UTR_variant", "sequence_variant" },
  99     { "stop_gained", "sequence_variant" },
 100     { "stop_lost", "sequence_variant" },
 101     { "inframe_deletion", "sequence_variant" },
 102     { "inframe_insertion", "sequence_variant" },
 103     { "splice_region_variant", "sequence_variant" },
 104
 105     /*
 106      * no sub-types of exon or CDS yet seen in Ensembl
 107      * some added here for testing purposes
 108      */
 109     { "exon", "exon" },
 110     { "coding_exon", "exon" },
 111     { "CDS", "CDS" },
 112     { "CDS_predicted", "CDS" },
 113
 114     /*
 115      * terms used in exonerate or PASA GFF
 116      */
 117     { "protein_match", "protein_match"},
 118     { "nucleotide_match", "nucleotide_match"},
 119     { "cDNA_match", "nucleotide_match"},
 120
 121     /*
 122      * used in InterProScan GFF
 123      */
 124     { "polypeptide", "polypeptide" }
 125   };
 126   // @formatter:on
 127
 128   /*
 129    * hard-coded list of any parents (direct or indirect)
 130    * that we care about for a term
 131    */
 132   private Map<String, List<String>> parents;
 133
 134   private List<String> termsFound;
 135
 136   private List<String> termsNotFound;
 137
 138   public SequenceOntologyLite()
 139   {
 140     termsFound = new ArrayList<>();
 141     termsNotFound = new ArrayList<>();
 142     loadStaticData();
 143   }
 144
 145   /**
 146    * Loads hard-coded data into a lookup table of {term, {list_of_parents}}
 147    */
 148   private void loadStaticData()
 149   {
 150     parents = new HashMap<>();
 151     for (String[] pair : TERMS)
 152     {
 153       List<String> p = parents.get(pair[0]);
 154       if (p == null)
 155       {
 156         p = new ArrayList<>();
 157         parents.put(pair[0], p);
 158       }
 159       p.add(pair[1]);
 160     }
 161   }
 162
 163   /**
 164    * Answers true if 'child' isA 'parent' (including equality). In this
 165    * implementation, based only on hard-coded values.
 166    */
 167   @Override
 168   public boolean isA(String child, String parent)
 169   {
 170     if (child == null || parent == null)
 171     {
 172       return false;
 173     }
 174     if (child.equals(parent))
 175     {
 176       termFound(child);
 177       return true;
 178     }
 179
 180     List<String> p = parents.get(child);
 181     if (p == null)
 182     {
 183       termNotFound(child);
 184       return false;
 185     }
 186     termFound(child);
 187     if (p.contains(parent))
 188     {
 189       return true;
 190     }
 191     return false;
 192   }
 193
 194   /**
 195    * Records a valid term queried for, for reporting purposes
 196    *
 197    * @param term
 198    */
 199   private void termFound(String term)
 200   {
 201     if (!termsFound.contains(term))
 202     {
 203       synchronized (termsFound)
 204       {
 205         termsFound.add(term);
 206       }
 207     }
 208   }
 209
 210   /**
 211    * Records an invalid term queried for, for reporting purposes
 212    *
 213    * @param term
 214    */
 215   private void termNotFound(String term)
 216   {
 217     synchronized (termsNotFound)
 218     {
 219       if (!termsNotFound.contains(term))
 220       {
 221         // suppress logging here as it reports Uniprot sequence features
 222         // (which do not use SO terms) when auto-configuring feature colours
 223         // System.out.println("SO term " + term
 224         // + " not known - add to model if needed in "
 225         // + getClass().getName());
 226         termsNotFound.add(term);
 227       }
 228     }
 229   }
 230
 231   /**
 232    * Sorts (case-insensitive) and returns the list of valid terms queried for
 233    */
 234   @Override
 235   public List<String> termsFound()
 236   {
 237     synchronized (termsFound)
 238     {
 239       Collections.sort(termsFound, String.CASE_INSENSITIVE_ORDER);
 240       return termsFound;
 241     }
 242   }
 243
 244   /**
 245    * Sorts (case-insensitive) and returns the list of invalid terms queried for
 246    */
 247   @Override
 248   public List<String> termsNotFound()
 249   {
 250     synchronized (termsNotFound)
 251     {
 252       Collections.sort(termsNotFound, String.CASE_INSENSITIVE_ORDER);
 253       return termsNotFound;
 254     }
 255   }
 256
 257   @Override
 258   public List<String> getRootParents(final String term)
 259   {
 260     /*
 261      * check in cache first
 262      */
 263     if (rootParents.containsKey(term))
 264     {
 265       return rootParents.get(term);
 266     }
 267
 268     List<String> top = new ArrayList<>();
 269     List<String> query = new ArrayList<>();
 270     query.add(term);
 271
 272     while (!query.isEmpty())
 273     {
 274       List<String> nextQuery = new ArrayList<>();
 275       for (String q : query)
 276       {
 277         List<String> theParents = parents.get(q);
 278         if (theParents != null)
 279         {
 280           if (theParents.size() == 1 && theParents.get(0).equals(q))
 281           {
 282             /*
 283              * top-level term
 284              */
 285             if (!top.contains(q))
 286             {
 287               top.add(q);
 288             }
 289           }
 290           else
 291           {
 292             for (String p : theParents)
 293             {
 294               if (!p.equals(q))
 295               {
 296                 nextQuery.add(p);
 297               }
 298             }
 299           }
 300         }
 301       }
 302       query = nextQuery;
 303     }
 304
 305     rootParents.put(term, top);
 306
 307     return top.isEmpty() ? null : top;
 308   }
 309 }