From 3cccac4697c371b1964236e17b532fd3d180e1c4 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Mon, 1 Feb 2016 10:17:49 +0000 Subject: [PATCH] JAL-1705 JAL-1191 SequenceOntologyLite added as hard-coded alternative --- src/jalview/analysis/AlignmentUtils.java | 5 +- src/jalview/ext/ensembl/EnsemblCdna.java | 7 +- src/jalview/ext/ensembl/EnsemblCds.java | 11 +- src/jalview/ext/ensembl/EnsemblGene.java | 26 ++- src/jalview/ext/ensembl/EnsemblOverlap.java | 3 - src/jalview/ext/ensembl/EnsemblRestClient.java | 3 + src/jalview/ext/ensembl/EnsemblSeqProxy.java | 70 ++++---- src/jalview/io/gff/Gff3Helper.java | 13 +- src/jalview/io/gff/InterProScanHelper.java | 7 +- src/jalview/io/gff/SequenceOntology.java | 132 +++++++++----- src/jalview/io/gff/SequenceOntologyFactory.java | 21 +++ src/jalview/io/gff/SequenceOntologyI.java | 54 ++++++ src/jalview/io/gff/SequenceOntologyLite.java | 190 +++++++++++++++++++++ test/jalview/ext/ensembl/EnsemblSeqProxyTest.java | 83 +++++++++ test/jalview/io/gff/SequenceOntologyTest.java | 27 +-- 15 files changed, 519 insertions(+), 133 deletions(-) create mode 100644 src/jalview/io/gff/SequenceOntologyFactory.java create mode 100644 src/jalview/io/gff/SequenceOntologyI.java create mode 100644 src/jalview/io/gff/SequenceOntologyLite.java diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index 41538eb..34eaa60 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -34,7 +34,8 @@ import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceGroup; import jalview.datamodel.SequenceI; -import jalview.io.gff.SequenceOntology; +import jalview.io.gff.SequenceOntologyFactory; +import jalview.io.gff.SequenceOntologyI; import jalview.schemes.ResidueProperties; import jalview.util.DBRefUtils; import jalview.util.MapList; @@ -1435,7 +1436,7 @@ public class AlignmentUtils copyTo = copyTo.getDatasetSequence(); } - 
SequenceOntology so = SequenceOntology.getInstance(); + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); int count = 0; SequenceFeature[] sfs = fromSeq.getSequenceFeatures(); if (sfs != null) diff --git a/src/jalview/ext/ensembl/EnsemblCdna.java b/src/jalview/ext/ensembl/EnsemblCdna.java index 139e44f..373286f 100644 --- a/src/jalview/ext/ensembl/EnsemblCdna.java +++ b/src/jalview/ext/ensembl/EnsemblCdna.java @@ -1,7 +1,8 @@ package jalview.ext.ensembl; import jalview.datamodel.SequenceFeature; -import jalview.io.gff.SequenceOntology; +import jalview.io.gff.SequenceOntologyFactory; +import jalview.io.gff.SequenceOntologyI; import java.util.List; @@ -68,8 +69,8 @@ public class EnsemblCdna extends EnsemblSeqProxy @Override protected boolean identifiesSequence(SequenceFeature sf, String accId) { - if (SequenceOntology.getInstance().isA(sf.getType(), - SequenceOntology.EXON)) + if (SequenceOntologyFactory.getInstance().isA(sf.getType(), + SequenceOntologyI.EXON)) { String parentFeature = (String) sf.getValue(PARENT); if (("transcript:" + accId).equals(parentFeature)) diff --git a/src/jalview/ext/ensembl/EnsemblCds.java b/src/jalview/ext/ensembl/EnsemblCds.java index 22c0a06..ec5780f 100644 --- a/src/jalview/ext/ensembl/EnsemblCds.java +++ b/src/jalview/ext/ensembl/EnsemblCds.java @@ -2,7 +2,8 @@ package jalview.ext.ensembl; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; -import jalview.io.gff.SequenceOntology; +import jalview.io.gff.SequenceOntologyFactory; +import jalview.io.gff.SequenceOntologyI; import java.util.List; @@ -51,8 +52,8 @@ public class EnsemblCds extends EnsemblSeqProxy @Override protected boolean retainFeature(SequenceFeature sf, String accessionId) { - if (SequenceOntology.getInstance().isA(sf.getType(), - SequenceOntology.CDS)) + if (SequenceOntologyFactory.getInstance().isA(sf.getType(), + SequenceOntologyI.CDS)) { return false; } @@ -67,8 +68,8 @@ public class EnsemblCds extends EnsemblSeqProxy @Override 
protected boolean identifiesSequence(SequenceFeature sf, String accId) { - if (SequenceOntology.getInstance().isA(sf.getType(), - SequenceOntology.CDS)) + if (SequenceOntologyFactory.getInstance().isA(sf.getType(), + SequenceOntologyI.CDS)) { String parentFeature = (String) sf.getValue(PARENT); if (("transcript:" + accId).equals(parentFeature)) diff --git a/src/jalview/ext/ensembl/EnsemblGene.java b/src/jalview/ext/ensembl/EnsemblGene.java index 1325bec..df246f8 100644 --- a/src/jalview/ext/ensembl/EnsemblGene.java +++ b/src/jalview/ext/ensembl/EnsemblGene.java @@ -4,7 +4,8 @@ import jalview.datamodel.AlignmentI; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; -import jalview.io.gff.SequenceOntology; +import jalview.io.gff.SequenceOntologyFactory; +import jalview.io.gff.SequenceOntologyI; import jalview.util.MapList; import java.util.ArrayList; @@ -132,10 +133,10 @@ public class EnsemblGene extends EnsemblSeqProxy */ String parentId = "transcript:" + accId; List splices = findFeatures(gene, - SequenceOntology.EXON, parentId); + SequenceOntologyI.EXON, parentId); if (splices.isEmpty()) { - splices = findFeatures(gene, SequenceOntology.CDS, parentId); + splices = findFeatures(gene, SequenceOntologyI.CDS, parentId); } int transcriptLength = 0; @@ -176,7 +177,7 @@ public class EnsemblGene extends EnsemblSeqProxy /* * and finally fetch the protein product and save as a cross-reference */ - addProteinProduct(transcript); + new EnsemblCdna().addProteinProduct(transcript); return transcript; } @@ -240,8 +241,8 @@ public class EnsemblGene extends EnsemblSeqProxy @Override protected boolean identifiesSequence(SequenceFeature sf, String accId) { - if (SequenceOntology.getInstance().isA(sf.getType(), - SequenceOntology.GENE)) + if (SequenceOntologyFactory.getInstance().isA(sf.getType(), + SequenceOntologyI.GENE)) { String id = (String) sf.getValue(ID); if (("gene:" + accId).equals(id)) @@ -262,8 +263,8 @@ public 
class EnsemblGene extends EnsemblSeqProxy @Override protected boolean retainFeature(SequenceFeature sf, String accessionId) { - if (SequenceOntology.getInstance().isA(sf.getType(), - SequenceOntology.GENE)) + if (SequenceOntologyFactory.getInstance().isA(sf.getType(), + SequenceOntologyI.GENE)) { return false; } @@ -299,4 +300,13 @@ public class EnsemblGene extends EnsemblSeqProxy return super.getCrossReferenceDatabases(); } + /** + * Override to do nothing as Ensembl doesn't return a protein sequence for a + * gene identifier + */ + @Override + protected void addProteinProduct(SequenceI querySeq) + { + } + } diff --git a/src/jalview/ext/ensembl/EnsemblOverlap.java b/src/jalview/ext/ensembl/EnsemblOverlap.java index b1514d8..507b6f8 100644 --- a/src/jalview/ext/ensembl/EnsemblOverlap.java +++ b/src/jalview/ext/ensembl/EnsemblOverlap.java @@ -42,14 +42,11 @@ public class EnsemblOverlap extends EnsemblRestClient @Override public AlignmentI getSequenceRecords(String query) throws IOException { - long now = System.currentTimeMillis(); // TODO: use a vararg String... for getSequenceRecords instead? 
List queries = new ArrayList(); queries.add(query); FileParse fp = getSequenceReader(queries); FeaturesFile fr = new FeaturesFile(fp); - System.out.println(getClass().getName() + " took " - + (System.currentTimeMillis() - now) + "ms to fetch"); return new Alignment(fr.getSeqsAsArray()); } diff --git a/src/jalview/ext/ensembl/EnsemblRestClient.java b/src/jalview/ext/ensembl/EnsemblRestClient.java index 2fd7fa3..dc4cc88 100644 --- a/src/jalview/ext/ensembl/EnsemblRestClient.java +++ b/src/jalview/ext/ensembl/EnsemblRestClient.java @@ -140,6 +140,7 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher protected BufferedReader getHttpResponse(URL url, List ids) throws IOException { + long now = System.currentTimeMillis(); HttpURLConnection connection = (HttpURLConnection) url.openConnection(); /* @@ -175,6 +176,8 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher "Response code was not 200. Detected response was " + responseCode); } + System.out.println(getClass().getName() + " took " + + (System.currentTimeMillis() - now) + "ms to fetch"); BufferedReader reader = null; reader = new BufferedReader(new InputStreamReader(response, "UTF-8")); diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index 0bfeda1..744aa49 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -11,7 +11,8 @@ import jalview.datamodel.SequenceI; import jalview.exceptions.JalviewException; import jalview.io.FastaFile; import jalview.io.FileParse; -import jalview.io.gff.SequenceOntology; +import jalview.io.gff.SequenceOntologyFactory; +import jalview.io.gff.SequenceOntologyI; import jalview.schemes.ResidueProperties; import jalview.util.DBRefUtils; import jalview.util.MapList; @@ -127,7 +128,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient @Override public AlignmentI getSequenceRecords(String query) throws Exception { - long now = 
System.currentTimeMillis(); // TODO use a String... query vararg instead? // danger: accession separator used as a regex here, a string elsewhere @@ -156,14 +156,15 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient + " chunks. Unexpected problem (" + r.getLocalizedMessage() + ")"; System.err.println(msg); - if (alignment != null) - { - break; // return what we got - } - else - { - throw new JalviewException(msg, r); - } + break; + // if (alignment != null) + // { + // break; // return what we got + // } + // else + // { + // throw new JalviewException(msg, r); + // } } } @@ -181,8 +182,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient getCrossReferences(seq); } - System.out.println(getClass().getName() + " took " - + (System.currentTimeMillis() - now) + "ms to fetch"); return alignment; } @@ -368,11 +367,12 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** - * Adds CDS ranges to the ranges list, and returns the total length mapped. + * Adds CDS ranges to the ranges list, and returns the total length mapped + * from. * - * No need to worry about reverse strand dna here since the retrieved sequence - * is as transcribed (reverse complement for reverse strand), i.e in the same - * sense as the peptide. + * No need to worry about reverse strand dna here, since the retrieved + * sequence is as transcribed (reverse complement for reverse strand), i.e. in + * the same sense as the peptide. 
* * @param dnaSeq * @param ranges @@ -391,7 +391,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient /* * process a CDS feature (or a sub-type of CDS) */ - if (SequenceOntology.getInstance().isA(sf.getType(), SequenceOntology.CDS)) + if (SequenceOntologyFactory.getInstance().isA(sf.getType(), + SequenceOntologyI.CDS)) { int phase = 0; try { @@ -579,7 +580,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * the start position of the sequence we are mapping to * @return */ - protected MapList getGenomicRanges(SequenceI sourceSequence, + protected MapList getGenomicRangesFromFeatures(SequenceI sourceSequence, String accId, int start) { SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); @@ -605,11 +606,12 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ if (identifiesSequence(sf, accId)) { - int strand = sf.getStrand(); - - if (directionSet && strand != direction) - { - // abort - mix of forward and backward + int strand = sf.getStrand(); + strand = strand == 0 ? 
1 : strand; // treat unknown as forward + + if (directionSet && strand != direction) + { + // abort - mix of forward and backward System.err.println("Error: forward and backward strand for " + accId); return null; @@ -654,8 +656,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ Collections.sort(regions, new RangeSorter(direction == 1)); - List to = new ArrayList(); - to.add(new int[] { start, start + mappedLength - 1 }); + List to = Arrays.asList(new int[] { start, + start + mappedLength - 1 }); return new MapList(regions, to, 1, 1); } @@ -710,7 +712,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient /* * for sequence_variant, make an additional feature with consequence */ - if (SequenceOntology.getInstance().isSequenceVariant(sf.getType())) + if (SequenceOntologyFactory.getInstance().isA(sf.getType(), + SequenceOntologyI.SEQUENCE_VARIANT)) { String consequence = (String) sf.getValue(CONSEQUENCE_TYPE); if (consequence != null) @@ -741,7 +744,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); - MapList mapping = getGenomicRanges(sourceSequence, accessionId, + MapList mapping = getGenomicRangesFromFeatures(sourceSequence, accessionId, targetSequence.getStart()); if (mapping == null) { @@ -850,7 +853,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient SequenceFeature[] sfs = sequence.getSequenceFeatures(); if (sfs != null) { - SequenceOntology so = SequenceOntology.getInstance(); + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); for (SequenceFeature sf :sfs) { if (so.isA(sf.getType(), type)) { @@ -888,7 +891,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } AlignmentUtils.transferFeatures(dnaSeq, peptide, dnaToProtein, - SequenceOntology.EXON); + SequenceOntologyI.EXON); LinkedHashMap variants = buildDnaVariantsMap( dnaSeq, dnaToProtein); @@ -909,7 +912,7 @@ public abstract class EnsemblSeqProxy 
extends EnsemblRestClient String desc = StringUtils.listToDelimitedString(peptideVariants, ", "); SequenceFeature sf = new SequenceFeature( - SequenceOntology.SEQUENCE_VARIANT, desc, peptidePos, + SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos, peptidePos, 0f, null); peptide.addSequenceFeature(sf); count++; @@ -934,7 +937,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * LinkedHashMap ensures we add the peptide features in sequence order */ LinkedHashMap variants = new LinkedHashMap(); - SequenceOntology so = SequenceOntology.getInstance(); + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures(); if (dnaFeatures == null) @@ -957,7 +960,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient // not handling multi-locus variant features continue; } - if (so.isSequenceVariant(sf.getType())) + if (so.isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT)) { int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol); if (mapsTo == null) @@ -1096,6 +1099,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient public static boolean isTranscript(String featureType) { return NMD_VARIANT.equals(featureType) - || SequenceOntology.getInstance().isA(featureType, SequenceOntology.TRANSCRIPT); + || SequenceOntologyFactory.getInstance().isA(featureType, + SequenceOntologyI.TRANSCRIPT); } } diff --git a/src/jalview/io/gff/Gff3Helper.java b/src/jalview/io/gff/Gff3Helper.java index 2e98e4e..d29645b 100644 --- a/src/jalview/io/gff/Gff3Helper.java +++ b/src/jalview/io/gff/Gff3Helper.java @@ -70,12 +70,13 @@ public class Gff3Helper extends GffHelperBase String atts = gff[ATTRIBUTES_COL]; Map> attributes = parseNameValuePairs(atts); - if (SequenceOntology.getInstance().isProteinMatch(soTerm)) + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); + if (so.isA(soTerm, SequenceOntologyI.PROTEIN_MATCH)) { - sf = processProteinMatch(attributes, seq, gff, align, - 
newseqs, relaxedIdMatching); + sf = processProteinMatch(attributes, seq, gff, align, newseqs, + relaxedIdMatching); } - else if (SequenceOntology.getInstance().isNucleotideMatch(soTerm)) + else if (so.isA(soTerm, SequenceOntologyI.NUCLEOTIDE_MATCH)) { sf = processNucleotideMatch(attributes, seq, gff, align, newseqs, relaxedIdMatching); @@ -372,9 +373,9 @@ public class Gff3Helper extends GffHelperBase desc = target.split(" ")[0]; } - SequenceOntology so = SequenceOntology.getInstance(); + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); String type = sf.getType(); - if (so.isSequenceVariant(type)) + if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT)) { /* * Ensembl returns dna variants as 'alleles' diff --git a/src/jalview/io/gff/InterProScanHelper.java b/src/jalview/io/gff/InterProScanHelper.java index 3323e27..68d5d4f 100644 --- a/src/jalview/io/gff/InterProScanHelper.java +++ b/src/jalview/io/gff/InterProScanHelper.java @@ -89,10 +89,11 @@ public class InterProScanHelper extends Gff3Helper */ public static boolean recognises(String[] columns) { - SequenceOntology so = SequenceOntology.getInstance(); + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); String type = columns[TYPE_COL]; - if (so.isProteinMatch(type) - || (".".equals(columns[SOURCE_COL]) && so.isPolypeptide(type))) + if (so.isA(type, SequenceOntologyI.PROTEIN_MATCH) + || (".".equals(columns[SOURCE_COL]) && so.isA(type, + SequenceOntologyI.POLYPEPTIDE))) { return true; } diff --git a/src/jalview/io/gff/SequenceOntology.java b/src/jalview/io/gff/SequenceOntology.java index 685b83e..b069eef 100644 --- a/src/jalview/io/gff/SequenceOntology.java +++ b/src/jalview/io/gff/SequenceOntology.java @@ -7,6 +7,7 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.text.ParseException; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -25,32 +26,8 @@ import 
org.biojava.nbio.ontology.utils.Annotation; * A wrapper class that parses the Sequence Ontology and exposes useful access * methods. This version uses the BioJava parser. */ -public class SequenceOntology +class SequenceOntology implements SequenceOntologyI { - - /* - * selected commonly used values for quick reference - */ - // SO:0000316 - public static final String CDS = "CDS"; - - // SO:0001060 - public static final String SEQUENCE_VARIANT = "sequence_variant"; - - // SO:0000147 - public static final String EXON = "exon"; - - // SO:0000673 - public static final String TRANSCRIPT = "transcript"; - - // SO:0000704 - public static final String GENE = "gene"; - - /* - * singleton instance of this class - */ - private static SequenceOntology instance; - /* * the parsed Ontology data as modelled by BioJava */ @@ -73,26 +50,18 @@ public class SequenceOntology */ private Map> termIsA; - /** - * Returns singleton instance - * - * @return - */ - public synchronized static SequenceOntology getInstance() - { - if (instance == null) - { - instance = new SequenceOntology(); - } - return instance; - } + private List termsFound; + + private List termsNotFound; /** - * Private constructor to enforce use of singleton. Parses and caches the SO - * OBO data file. + * Package private constructor to enforce use of singleton. Parses and caches + * the SO OBO data file. 
*/ - private SequenceOntology() + SequenceOntology() { + termsFound = new ArrayList(); + termsNotFound = new ArrayList(); termsByDescription = new HashMap(); termIsA = new HashMap>(); @@ -248,7 +217,7 @@ public class SequenceOntology */ public boolean isNucleotideMatch(String soTerm) { - return isA(soTerm, "nucleotide_match"); + return isA(soTerm, NUCLEOTIDE_MATCH); } /** @@ -261,7 +230,7 @@ public class SequenceOntology */ public boolean isProteinMatch(String soTerm) { - return isA(soTerm, "protein_match"); + return isA(soTerm, PROTEIN_MATCH); } /** @@ -274,7 +243,7 @@ public class SequenceOntology */ public boolean isPolypeptide(String soTerm) { - return isA(soTerm, "polypeptide"); + return isA(soTerm, POLYPEPTIDE); } /** @@ -285,23 +254,70 @@ public class SequenceOntology * @param parent * @return */ + @Override public boolean isA(String child, String parent) { + if (child == null || parent == null) + { + return false; + } /* * optimise trivial checks like isA("CDS", "CDS") */ if (child.equals(parent)) { + termFound(child); return true; } Term childTerm = getTerm(child); + if (childTerm != null) + { + termFound(child); + } + else + { + termNotFound(child); + } Term parentTerm = getTerm(parent); return termIsA(childTerm, parentTerm); } /** + * Records a valid term queried for, for reporting purposes + * + * @param term + */ + private void termFound(String term) + { + synchronized (termsFound) + { + if (!termsFound.contains(term)) + { + termsFound.add(term); + } + } + } + + /** + * Records an invalid term queried for, for reporting purposes + * + * @param term + */ + private void termNotFound(String term) + { + synchronized (termsNotFound) + { + if (!termsNotFound.contains(term)) + { + System.err.println("SO term " + term + " invalid"); + termsNotFound.add(term); + } + } + } + + /** * Returns true if the childTerm 'isA' parentTerm (directly or indirectly). 
* * @param childTerm @@ -402,6 +418,32 @@ public class SequenceOntology public boolean isSequenceVariant(String soTerm) { - return isA(soTerm, "sequence_variant"); + return isA(soTerm, SEQUENCE_VARIANT); + } + + /** + * Sorts (case-insensitive) and returns the list of valid terms queried for + */ + @Override + public List termsFound() + { + synchronized (termsFound) + { + Collections.sort(termsFound, String.CASE_INSENSITIVE_ORDER); + return termsFound; + } + } + + /** + * Sorts (case-insensitive) and returns the list of invalid terms queried for + */ + @Override + public List termsNotFound() + { + synchronized (termsNotFound) + { + Collections.sort(termsNotFound, String.CASE_INSENSITIVE_ORDER); + return termsNotFound; + } } } diff --git a/src/jalview/io/gff/SequenceOntologyFactory.java b/src/jalview/io/gff/SequenceOntologyFactory.java new file mode 100644 index 0000000..3eaa5d1 --- /dev/null +++ b/src/jalview/io/gff/SequenceOntologyFactory.java @@ -0,0 +1,21 @@ +package jalview.io.gff; + +public class SequenceOntologyFactory +{ + private static SequenceOntologyI instance; + + public static synchronized SequenceOntologyI getInstance() + { + if (instance == null) + { + // instance = new SequenceOntology(); + instance = new SequenceOntologyLite(); + } + return instance; + } + + public static void setInstance(SequenceOntologyI so) + { + instance = so; + } +} diff --git a/src/jalview/io/gff/SequenceOntologyI.java b/src/jalview/io/gff/SequenceOntologyI.java new file mode 100644 index 0000000..8128177 --- /dev/null +++ b/src/jalview/io/gff/SequenceOntologyI.java @@ -0,0 +1,54 @@ +package jalview.io.gff; + +import java.util.List; + +public interface SequenceOntologyI +{ + /* + * selected commonly used values for quick reference + */ + public static final String POLYPEPTIDE = "polypeptide"; + + public static final String PROTEIN_MATCH = "protein_match"; + + public static final String NUCLEOTIDE_MATCH = "nucleotide_match"; + + // SO:0000316 + public static final String CDS = 
"CDS"; + + // SO:0001060 + public static final String SEQUENCE_VARIANT = "sequence_variant"; + + // SO:0000147 + public static final String EXON = "exon"; + + // SO:0000673 + public static final String TRANSCRIPT = "transcript"; + + // SO:0000704 + public static final String GENE = "gene"; + + public boolean isA(String childTerm, String parentTerm); + + /** + * Returns a sorted list of all valid terms queried for (i.e. terms processed + * which were valid in the SO), using the friendly description. + * + * This can be used to check that any hard-coded stand-in for the full SO + * includes all the terms needed for correct processing. + * + * @return + */ + public List termsFound(); + + /** + * Returns a sorted list of all invalid terms queried for (i.e. terms + * processed which were not found in the SO), using the friendly description. + * + * This can be used to report any 'non-compliance' in data, and/or to report + * valid terms missing from any hard-coded stand-in for the full SO. + * + * @return + */ + public List termsNotFound(); +} diff --git a/src/jalview/io/gff/SequenceOntologyLite.java b/src/jalview/io/gff/SequenceOntologyLite.java new file mode 100644 index 0000000..173dea6 --- /dev/null +++ b/src/jalview/io/gff/SequenceOntologyLite.java @@ -0,0 +1,190 @@ +package jalview.io.gff; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * An implementation of SequenceOntologyI that hard codes terms of interest. + * + * Use this in unit testing by calling SequenceOntologyFactory.setInstance(new + * SequenceOntologyLite()). + * + * May also become a stand-in for SequenceOntology in the applet if we want to + * avoid the additional jars needed for parsing the full SO. 
+ * + * @author gmcarstairs + * + */ +public class SequenceOntologyLite implements SequenceOntologyI +{ + /* + * initial selection of types of interest when processing Ensembl features + */ + // @formatter:off + private final String[][] TERMS = new String[][] { + + /* + * gene sub-types: + */ + { "gene", "gene" }, + { "ncRNA_gene", "gene" }, + { "snRNA_gene", "gene" }, + + /* + * transcript sub-types: + */ + { "transcript", "transcript" }, + { "mature_transcript", "transcript" }, + { "ncRNA", "transcript" }, + { "snRNA", "transcript" }, + { "aberrant_processed_transcript", "transcript" }, + + /* + * sequence_variant sub-types: + */ + { "sequence_variant", "sequence_variant" }, + { "feature_variant", "sequence_variant" }, + { "gene_variant", "sequence_variant" }, + // NB Ensembl uses NMD_transcript_variant as if a 'transcript' + // but we model it here correctly as per the SO + { "NMD_transcript_variant", "sequence_variant" }, + { "transcript_variant", "sequence_variant" }, + { "structural_variant", "sequence_variant" }, + + /* + * no sub-types of exon or CDS yet encountered; add if needed + */ + { "exon", "exon" }, + { "CDS", "CDS" } + }; + // @formatter:on + + /* + * hard-coded list of any parents (direct or indirect) + * that we care about for a term + */ + private Map> parents; + + private List termsFound; + + private List termsNotFound; + + public SequenceOntologyLite() + { + termsFound = new ArrayList(); + termsNotFound = new ArrayList(); + loadStaticData(); + } + + /** + * Loads hard-coded data into a lookup table of {term, {list_of_parents}} + */ + private void loadStaticData() + { + parents = new HashMap>(); + for (String [] pair : TERMS) { + List p = parents.get(pair[0]); + if (p == null) + { + p = new ArrayList(); + parents.put(pair[0], p); + } + p.add(pair[1]); + } + } + + /** + * Answers true if 'child' isA 'parent' (including equality). In this + * implementation, based only on hard-coded values. 
+ */ + @Override + public boolean isA(String child, String parent) + { + if (child == null || parent == null) + { + return false; + } + if (child.equals(parent)) + { + termFound(child); + return true; + } + + List p = parents.get(child); + if (p == null) + { + termNotFound(child); + return false; + } + termFound(child); + if (p.contains(parent)) + { + return true; + } + return false; + } + + /** + * Records a valid term queried for, for reporting purposes + * + * @param term + */ + private void termFound(String term) + { + if (!termsFound.contains(term)) + { + synchronized (termsFound) + { + termsFound.add(term); + } + } + } + + /** + * Records an invalid term queried for, for reporting purposes + * + * @param term + */ + private void termNotFound(String term) + { + synchronized (termsNotFound) + { + if (!termsNotFound.contains(term)) + { + System.out.println("SO term " + term + + " not known - either invalid or needs modelled in " + + getClass().getName()); + termsNotFound.add(term); + } + } + } + + /** + * Sorts (case-insensitive) and returns the list of valid terms queried for + */ + @Override + public List termsFound() + { + synchronized (termsFound) + { + Collections.sort(termsFound, String.CASE_INSENSITIVE_ORDER); + return termsFound; + } + } + + /** + * Sorts (case-insensitive) and returns the list of invalid terms queried for + */ + @Override + public List termsNotFound() + { + synchronized (termsNotFound) + { + Collections.sort(termsNotFound, String.CASE_INSENSITIVE_ORDER); + return termsNotFound; + } + } +} diff --git a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java index c525e95..31745e5 100644 --- a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java +++ b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java @@ -4,14 +4,19 @@ import static org.testng.AssertJUnit.assertEquals; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; +import jalview.datamodel.Sequence; +import 
jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.io.AppletFormatAdapter; import jalview.io.FastaFile; import jalview.io.FileParse; +import jalview.io.gff.SequenceOntologyFactory; +import jalview.io.gff.SequenceOntologyLite; import java.lang.reflect.Method; import java.net.MalformedURLException; import java.net.URL; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -280,4 +285,82 @@ public class EnsemblSeqProxyTest variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S"); assertEquals("[C, R, T, W]", variants.toString()); } + + /** + * Tests for the method that maps the subset of a dna sequence that has CDS + * (or subtype) feature. + */ + @Test(groups = "Functional") + public void testGetCdsRanges() + { + EnsemblSeqProxy testee = new EnsemblSeqProxyAdapter(); + + SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt"); + dnaSeq.createDatasetSequence(); + SequenceI ds = dnaSeq.getDatasetSequence(); + + // CDS for dna 4-6 + SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null); + ds.addSequenceFeature(sf); + // exon feature should be ignored here + sf = new SequenceFeature("exon", "", 7, 9, 0f, null); + ds.addSequenceFeature(sf); + // CDS for dna 10-12 + sf = new SequenceFeature("some_cds", "", 10, 12, 0f, null); + ds.addSequenceFeature(sf); + + SequenceOntologyFactory.setInstance(new SequenceOntologyLite()); + List ranges = new ArrayList(); + int mappedLength = testee.getCdsRanges(dnaSeq, ranges); + assertEquals(6, mappedLength); + assertEquals(2, ranges.size()); + assertEquals(4, ranges.get(0)[0]); + assertEquals(6, ranges.get(0)[1]); + assertEquals(10, ranges.get(1)[0]); + assertEquals(12, ranges.get(1)[1]); + + } + + @Test(groups = "Functional") + public void getGenomicRangesFromFeatures() + { + + } + + /** + * Tests for the method that maps the subset of a dna sequence that has CDS + * (or subtype) feature - case where the start codon is incomplete. 
+ */ + @Test(groups = "Functional") + public void testGetCdsRanges_fivePrimeIncomplete() + { + EnsemblSeqProxy testee = new EnsemblSeqProxyAdapter(); + + SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt"); + dnaSeq.createDatasetSequence(); + SequenceI ds = dnaSeq.getDatasetSequence(); + + // CDS for dna 5-6 (incomplete codon), 7-9 + SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null); + sf.setPhase("2"); // skip 2 bases to start of next codon + ds.addSequenceFeature(sf); + ds.addSequenceFeature(sf); + // CDS for dna 13-15 + sf = new SequenceFeature("some_cds", "", 13, 15, 0f, null); + ds.addSequenceFeature(sf); + + SequenceOntologyFactory.setInstance(new SequenceOntologyLite()); + List ranges = new ArrayList(); + int mappedLength = testee.getCdsRanges(dnaSeq, ranges); + + /* + * check the mapping starts with the first complete codon + */ + assertEquals(6, mappedLength); + assertEquals(2, ranges.size()); + assertEquals(7, ranges.get(0)[0]); + assertEquals(9, ranges.get(0)[1]); + assertEquals(13, ranges.get(1)[0]); + assertEquals(15, ranges.get(1)[1]); + } } \ No newline at end of file diff --git a/test/jalview/io/gff/SequenceOntologyTest.java b/test/jalview/io/gff/SequenceOntologyTest.java index 6c9226f..f791a1e 100644 --- a/test/jalview/io/gff/SequenceOntologyTest.java +++ b/test/jalview/io/gff/SequenceOntologyTest.java @@ -8,12 +8,12 @@ import org.testng.annotations.Test; public class SequenceOntologyTest { - private SequenceOntology so; + private SequenceOntologyI so; @BeforeMethod public void setUp() { long now = System.currentTimeMillis(); - so = SequenceOntology.getInstance(); + so = SequenceOntologyFactory.getInstance(); long elapsed = System.currentTimeMillis() - now; System.out.println("Load and cache of Sequence Ontology took " + elapsed + "ms"); @@ -57,29 +57,6 @@ public class SequenceOntologyTest } @Test(groups = "Functional") - public void testIsProteinMatch() - { - assertTrue(so.isProteinMatch("protein_match")); - 
assertTrue(so.isProteinMatch("protein_hmm_match")); - assertFalse(so.isProteinMatch("Protein_match")); // case-sensitive - } - - @Test(groups = "Functional") - public void testIsNucleotideMatch() - { - assertTrue(so.isNucleotideMatch("nucleotide_match")); - assertTrue(so.isNucleotideMatch("primer_match")); - assertTrue(so.isNucleotideMatch("cross_genome_match")); - assertTrue(so.isNucleotideMatch("expressed_sequence_match")); - assertTrue(so.isNucleotideMatch("translated_nucleotide_match")); - assertTrue(so.isNucleotideMatch("UST_match")); - assertTrue(so.isNucleotideMatch("RST_match")); - assertTrue(so.isNucleotideMatch("cDNA_match")); - assertTrue(so.isNucleotideMatch("EST_match")); - assertFalse(so.isNucleotideMatch("match")); // parent - } - - @Test(groups = "Functional") public void testIsCDS() { assertTrue(so.isA("CDS", "CDS")); -- 1.7.10.2