{
/*
* initial selection of types of interest when processing Ensembl features
+ * NB: unlike the full SequenceOntology, indirect child-parent
+ * relationships are not traversed here, so every sub-type of gene
+ * (direct or indirect) that is of interest must be listed explicitly
*/
// @formatter:off
// Lookup table of Sequence Ontology terms. Each row is a two-element pair:
// the first element is a term, the second is the parent term it is listed
// under here (a top-level term such as "gene" is paired with itself).
// Rows are grouped by parent term.
private final String[][] TERMS = new String[][] {
// gene and its sub-types:
{ "gene", "gene" },
{ "ncRNA_gene", "gene" },
{ "snRNA_gene", "gene" },
+ { "miRNA_gene", "gene" },
+ { "lincRNA_gene", "gene" },
+ { "rRNA_gene", "gene" },
/*
* transcript sub-types:
*/
{ "transcript", "transcript" },
{ "mature_transcript", "transcript" },
+ { "processed_transcript", "transcript" },
+ { "aberrant_processed_transcript", "transcript" },
{ "ncRNA", "transcript" },
{ "snRNA", "transcript" },
- { "aberrant_processed_transcript", "transcript" },
+ { "miRNA", "transcript" },
+ { "lincRNA", "transcript" },
+ { "rRNA", "transcript" },
+ // there are many more sub-types of ncRNA...
// NOTE(review): in this view the block comment opened below is not closed
// before the "structural_variant" row, so lines appear to be elided at this
// point - confirm the full sequence_variant section against the complete file.
/*
* sequence_variant sub-types:
{ "structural_variant", "sequence_variant" },
/*
- * no sub-types of exon or CDS yet encountered; add if needed
+ * no sub-types of exon or CDS yet seen in Ensembl
+ * some added here for testing purposes
*/
{ "exon", "exon" },
- { "CDS", "CDS" }
+ { "coding_exon", "exon" },
+ { "CDS", "CDS" },
+ { "CDS_predicted", "CDS" },
+
+ /*
+ * terms used in exonerate or PASA GFF
+ */
+ { "protein_match", "protein_match"},
+ { "nucleotide_match", "nucleotide_match"},
+ { "cDNA_match", "nucleotide_match"},
+
+ /*
+ * used in InterProScan GFF
+ */
+ { "polypeptide", "polypeptide" }
};
// @formatter:on
if (!termsNotFound.contains(term))
{
System.out.println("SO term " + term
- + " not known - either invalid or needs modelled in "
+ + " not known - may be invalid, or model if needed in "
+ getClass().getName());
termsNotFound.add(term);
}