/*
* Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
* Copyright (C) $$Year-Rel$$ The Jalview Authors
*
* This file is part of Jalview.
*
* Jalview is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* Jalview is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Jalview. If not, see <http://www.gnu.org/licenses/>.
* The Jalview Authors are detailed in the 'AUTHORS' file.
*/
package jalview.io.gff;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* An implementation of SequenceOntologyI that hard codes terms of interest.
*
* Use this in unit testing by calling SequenceOntology.setInstance(new
* SequenceOntologyLite()).
*
* May also become a stand-in for SequenceOntology in the applet if we want to
* avoid the additional jars needed for parsing the full SO.
*
* @author gmcarstairs
*
*/
public class SequenceOntologyLite implements SequenceOntologyI
{
/*
* initial selection of types of interest when processing Ensembl features
* NB unlike the full SequenceOntology we don't traverse indirect
* child-parent relationships here so e.g. need to list every sub-type
* (direct or indirect) that is of interest
*/
// @formatter:off
private final String[][] TERMS = new String[][] {
/*
* gene sub-types:
*/
{ "gene", "gene" },
{ "ncRNA_gene", "gene" },
{ "snRNA_gene", "gene" },
{ "miRNA_gene", "gene" },
{ "lincRNA_gene", "gene" },
{ "rRNA_gene", "gene" },
/*
* transcript sub-types:
*/
{ "transcript", "transcript" },
{ "mature_transcript", "transcript" },
{ "processed_transcript", "transcript" },
{ "aberrant_processed_transcript", "transcript" },
{ "ncRNA", "transcript" },
{ "snRNA", "transcript" },
{ "miRNA", "transcript" },
{ "lincRNA", "transcript" },
{ "rRNA", "transcript" },
{ "mRNA", "transcript" },
// there are many more sub-types of ncRNA...
/*
* sequence_variant sub-types
*/
{ "sequence_variant", "sequence_variant" },
{ "structural_variant", "sequence_variant" },
{ "feature_variant", "sequence_variant" },
{ "gene_variant", "sequence_variant" },
{ "transcript_variant", "sequence_variant" },
// NB Ensembl uses NMD_transcript_variant as if a 'transcript'
// but we model it here correctly as per the SO
{ "NMD_transcript_variant", "sequence_variant" },
{ "missense_variant", "sequence_variant" },
{ "synonymous_variant", "sequence_variant" },
{ "frameshift_variant", "sequence_variant" },
{ "5_prime_UTR_variant", "sequence_variant" },
{ "3_prime_UTR_variant", "sequence_variant" },
{ "stop_gained", "sequence_variant" },
{ "stop_lost", "sequence_variant" },
{ "inframe_deletion", "sequence_variant" },
{ "inframe_insertion", "sequence_variant" },
{ "splice_region_variant", "sequence_variant" },
/*
* no sub-types of exon or CDS yet seen in Ensembl
* some added here for testing purposes
*/
{ "exon", "exon" },
{ "coding_exon", "exon" },
{ "CDS", "CDS" },
{ "CDS_predicted", "CDS" },
/*
* terms used in exonerate or PASA GFF
*/
{ "protein_match", "protein_match"},
{ "nucleotide_match", "nucleotide_match"},
{ "cDNA_match", "nucleotide_match"},
/*
* used in InterProScan GFF
*/
{ "polypeptide", "polypeptide" }
};
// @formatter:on
/*
* hard-coded list of any parents (direct or indirect)
* that we care about for a term
*/
private Map> parents;
private List termsFound;
private List termsNotFound;
public SequenceOntologyLite()
{
termsFound = new ArrayList<>();
termsNotFound = new ArrayList<>();
loadStaticData();
}
/**
* Loads hard-coded data into a lookup table of {term, {list_of_parents}}
*/
private void loadStaticData()
{
parents = new HashMap<>();
for (String[] pair : TERMS)
{
List p = parents.get(pair[0]);
if (p == null)
{
p = new ArrayList<>();
parents.put(pair[0], p);
}
p.add(pair[1]);
}
}
/**
* Answers true if 'child' isA 'parent' (including equality). In this
* implementation, based only on hard-coded values.
*/
@Override
public boolean isA(String child, String parent)
{
if (child == null || parent == null)
{
return false;
}
if (child.equals(parent))
{
termFound(child);
return true;
}
List p = parents.get(child);
if (p == null)
{
termNotFound(child);
return false;
}
termFound(child);
if (p.contains(parent))
{
return true;
}
return false;
}
/**
* Records a valid term queried for, for reporting purposes
*
* @param term
*/
private void termFound(String term)
{
if (!termsFound.contains(term))
{
synchronized (termsFound)
{
termsFound.add(term);
}
}
}
/**
* Records an invalid term queried for, for reporting purposes
*
* @param term
*/
private void termNotFound(String term)
{
synchronized (termsNotFound)
{
if (!termsNotFound.contains(term))
{
// suppress logging here as it reports Uniprot sequence features
// (which do not use SO terms) when auto-configuring feature colours
// System.out.println("SO term " + term
// + " not known - add to model if needed in "
// + getClass().getName());
termsNotFound.add(term);
}
}
}
/**
* Sorts (case-insensitive) and returns the list of valid terms queried for
*/
@Override
public List termsFound()
{
synchronized (termsFound)
{
Collections.sort(termsFound, String.CASE_INSENSITIVE_ORDER);
return termsFound;
}
}
/**
* Sorts (case-insensitive) and returns the list of invalid terms queried for
*/
@Override
public List termsNotFound()
{
synchronized (termsNotFound)
{
Collections.sort(termsNotFound, String.CASE_INSENSITIVE_ORDER);
return termsNotFound;
}
}
}