From 46b2bc114452c0e30463214b3e82eb0a5c98d23f Mon Sep 17 00:00:00 2001 From: gmungoc Date: Thu, 4 Feb 2016 10:29:09 +0000 Subject: [PATCH] JAL-1705 additional tests, validation regexp tweaks, javadoc --- src/jalview/ext/ensembl/EnsemblCdna.java | 7 +- src/jalview/ext/ensembl/EnsemblFeatures.java | 123 ++++++++++++++++++++ src/jalview/ext/ensembl/EnsemblGene.java | 12 ++ src/jalview/ext/ensembl/EnsemblGenome.java | 2 +- src/jalview/ext/ensembl/EnsemblProtein.java | 11 ++ src/jalview/ext/ensembl/EnsemblSeqProxy.java | 34 +++++- .../ext/ensembl/EnsemblSequenceFetcher.java | 18 ++- src/jalview/ext/ensembl/EnsemblXref.java | 9 +- test/jalview/datamodel/SequenceTest.java | 7 ++ test/jalview/ext/ensembl/EnsemblProteinTest.java | 19 +++ test/jalview/ext/ensembl/EnsemblSeqProxyTest.java | 16 --- 11 files changed, 227 insertions(+), 31 deletions(-) create mode 100644 src/jalview/ext/ensembl/EnsemblFeatures.java create mode 100644 test/jalview/ext/ensembl/EnsemblProteinTest.java diff --git a/src/jalview/ext/ensembl/EnsemblCdna.java b/src/jalview/ext/ensembl/EnsemblCdna.java index 373286f..d4d1c08 100644 --- a/src/jalview/ext/ensembl/EnsemblCdna.java +++ b/src/jalview/ext/ensembl/EnsemblCdna.java @@ -10,8 +10,11 @@ import com.stevesoft.pat.Regex; public class EnsemblCdna extends EnsemblSeqProxy { + // TODO modify to accept other species e.g. 
ENSMUSTnnn + private static final Regex ACCESSION_REGEX = new Regex("((ENST|ENSG|CCDS)[0-9.]{3,})"); + /* - * fetch exon features on genomic sequence (to identify the cdnaregions) + * fetch exon features on genomic sequence (to identify the cdna regions) * and cds and variation features (to retain) */ private static final EnsemblFeatureType[] FEATURES_TO_FETCH = { @@ -38,7 +41,7 @@ public class EnsemblCdna extends EnsemblSeqProxy @Override public Regex getAccessionValidator() { - return new Regex("((ENST|ENSG|CCDS)[0-9.]{3,})"); + return ACCESSION_REGEX; } @Override diff --git a/src/jalview/ext/ensembl/EnsemblFeatures.java b/src/jalview/ext/ensembl/EnsemblFeatures.java new file mode 100644 index 0000000..22faba9 --- /dev/null +++ b/src/jalview/ext/ensembl/EnsemblFeatures.java @@ -0,0 +1,123 @@ +package jalview.ext.ensembl; + +import jalview.datamodel.Alignment; +import jalview.datamodel.AlignmentI; +import jalview.io.FeaturesFile; +import jalview.io.FileParse; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; + +/** + * A client for fetching and processing Ensembl feature data in GFF format by + * calling the overlap REST service + * + * @author gmcarstairs + * @see http://rest.ensembl.org/documentation/info/overlap_id + */ +class EnsemblFeatures extends EnsemblRestClient +{ + /* + * The default features to retrieve from Ensembl + * can override in getSequenceRecords parameter + */ + private EnsemblFeatureType[] featuresWanted = { EnsemblFeatureType.cds, + EnsemblFeatureType.exon, EnsemblFeatureType.variation }; + + @Override + public String getDbName() + { + return "ENSEMBL (features)"; + } + + /** + * Makes a query to the REST overlap endpoint for the given sequence + * identifier. This returns an 'alignment' consisting of one 'dummy sequence' + * (the genomic sequence for which overlap features are returned by the + * service). 
This sequence will have on it sequence features which are the + * real information of interest, such as CDS regions or sequence variations. + */ + @Override + public AlignmentI getSequenceRecords(String query) throws IOException + { + // TODO: use a vararg String... for getSequenceRecords instead? + List queries = new ArrayList(); + queries.add(query); + FileParse fp = getSequenceReader(queries); + FeaturesFile fr = new FeaturesFile(fp); + return new Alignment(fr.getSeqsAsArray()); + } + + /** + * Returns a URL for the REST overlap endpoint + * + * @param ids + * @return + */ + @Override + protected URL getUrl(List ids) throws MalformedURLException + { + StringBuffer urlstring = new StringBuffer(128); + urlstring.append(ENSEMBL_REST).append("/overlap/id/") + .append(ids.get(0)); + + // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats + urlstring.append("?content-type=text/x-gff3"); + + /* + * specify features to retrieve + * @see http://rest.ensembl.org/documentation/info/overlap_id + * could make the list a configurable entry in jalview.properties + */ + for (EnsemblFeatureType feature : featuresWanted) + { + urlstring.append("&feature=").append(feature.name()); + } + + return new URL(urlstring.toString()); + } + + @Override + protected boolean useGetRequest() + { + return true; + } + + /** + * Returns the MIME type for GFF3. For GET requests the Content-type header + * describes the required encoding of the response. + */ + @Override + protected String getRequestMimeType(boolean multipleIds) + { + return "text/x-gff3"; + } + + /** + * Returns the MIME type for GFF3. 
+ */ + @Override + protected String getResponseMimeType() + { + return "text/x-gff3"; + } + + /** + * Overloaded method that allows a list of features to retrieve to be + * specified + * + * @param accId + * @param features + * @return + * @throws IOException + */ + protected AlignmentI getSequenceRecords(String accId, + EnsemblFeatureType[] features) throws IOException + { + featuresWanted = features; + return getSequenceRecords(accId); + } +} diff --git a/src/jalview/ext/ensembl/EnsemblGene.java b/src/jalview/ext/ensembl/EnsemblGene.java index df246f8..dc28796 100644 --- a/src/jalview/ext/ensembl/EnsemblGene.java +++ b/src/jalview/ext/ensembl/EnsemblGene.java @@ -12,6 +12,8 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import com.stevesoft.pat.Regex; + /** * A class that fetches genomic sequence and all transcripts for an Ensembl gene * @@ -19,6 +21,10 @@ import java.util.List; */ public class EnsemblGene extends EnsemblSeqProxy { + // TODO modify to accept other species e.g. 
ENSMUSGnnn + private static final Regex ACCESSION_REGEX = new Regex( + "((ENSG)[0-9]{11})"); + private static final EnsemblFeatureType[] FEATURES_TO_FETCH = { EnsemblFeatureType.gene, EnsemblFeatureType.transcript, EnsemblFeatureType.exon, EnsemblFeatureType.cds, @@ -309,4 +315,10 @@ public class EnsemblGene extends EnsemblSeqProxy { } + @Override + public Regex getAccessionValidator() + { + return ACCESSION_REGEX; + } + } diff --git a/src/jalview/ext/ensembl/EnsemblGenome.java b/src/jalview/ext/ensembl/EnsemblGenome.java index 6bbc3e9..e977e62 100644 --- a/src/jalview/ext/ensembl/EnsemblGenome.java +++ b/src/jalview/ext/ensembl/EnsemblGenome.java @@ -20,7 +20,7 @@ public class EnsemblGenome extends EnsemblSeqProxy @Override public String getDbName() { - return "ENSEMBL (Genome)"; + return "ENSEMBL (Genomic)"; } @Override diff --git a/src/jalview/ext/ensembl/EnsemblProtein.java b/src/jalview/ext/ensembl/EnsemblProtein.java index c40fdd0..8f23984 100644 --- a/src/jalview/ext/ensembl/EnsemblProtein.java +++ b/src/jalview/ext/ensembl/EnsemblProtein.java @@ -6,8 +6,13 @@ import jalview.datamodel.SequenceFeature; import java.util.Arrays; import java.util.List; +import com.stevesoft.pat.Regex; + public class EnsemblProtein extends EnsemblSeqProxy { + // TODO modify to accept other species e.g. 
ENSMUSPnnn + private static final Regex ACCESSION_REGEX = new Regex( + "((ENSP|CCDS)[0-9.]{3,})"); private static final List CROSSREFS = Arrays.asList(new String[] { "PDB", "Uniprot/SPTREMBL", "Uniprot/SWISSPROT" }); @@ -76,4 +81,10 @@ public class EnsemblProtein extends EnsemblSeqProxy return CROSSREFS; } + @Override + public Regex getAccessionValidator() + { + return ACCESSION_REGEX; + } + } diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index b2804f2..e77051d 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -48,25 +48,28 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient protected static final String NAME = "Name"; + /* + * enum for 'type' parameter to the /sequence REST service + */ public enum EnsemblSeqType { /** - * type=genomic for the full dna including introns + * type=genomic to fetch full dna including introns */ GENOMIC("genomic"), /** - * type=cdna for transcribed dna including UTRs + * type=cdna to fetch dna including UTRs */ CDNA("cdna"), /** - * type=cds for coding dna excluding UTRs + * type=cds to fetch coding dna excluding UTRs */ CDS("cds"), /** - * type=protein for the peptide product sequence + * type=protein to fetch peptide product sequence */ PROTEIN("protein"); @@ -201,7 +204,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * get 'dummy' genomic sequence with exon, cds and variation features */ SequenceI genomicSequence = null; - EnsemblOverlap gffFetcher = new EnsemblOverlap(); + EnsemblFeatures gffFetcher = new EnsemblFeatures(); EnsemblFeatureType[] features = getFeaturesToFetch(); AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId, features); @@ -268,7 +271,10 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient MapList mapList = mapCdsToProtein(querySeq, proteinSeq); if (mapList != null) { - Mapping map = new Mapping(proteinSeq.getDatasetSequence(), 
mapList); + // clunky: ensure Uniprot xref if we have one is on mapped sequence + SequenceI ds = proteinSeq.getDatasetSequence(); + ds.setSourceDBRef(proteinSeq.getSourceDBRef()); + Mapping map = new Mapping(ds, mapList); DBRefEntry dbr = new DBRefEntry(getDbSource(), getDbVersion(), accId, map); querySeq.getDatasetSequence().addDBRef(dbr); @@ -919,6 +925,22 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient count++; } } + + /* + * ugly sort to get sequence features in start position order + * - would be better to store in Sequence as a TreeSet instead? + */ + Arrays.sort(peptide.getSequenceFeatures(), + new Comparator() + { + @Override + public int compare(SequenceFeature o1, SequenceFeature o2) + { + int c = Integer.compare(o1.getBegin(), o2.getBegin()); + return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd()) + : c; + } + }); return count; } diff --git a/src/jalview/ext/ensembl/EnsemblSequenceFetcher.java b/src/jalview/ext/ensembl/EnsemblSequenceFetcher.java index f1b96e2..2e32bd2 100644 --- a/src/jalview/ext/ensembl/EnsemblSequenceFetcher.java +++ b/src/jalview/ext/ensembl/EnsemblSequenceFetcher.java @@ -9,13 +9,15 @@ import com.stevesoft.pat.Regex; * A base class for Ensembl sequence fetchers * * @author gmcarstairs - * */ -public abstract class EnsemblSequenceFetcher extends DbSourceProxyImpl +abstract class EnsemblSequenceFetcher extends DbSourceProxyImpl { + // TODO modify to accept other species e.g. 
ENSMUSTnnn + private static final Regex ACCESSION_REGEX = new Regex("((ENSP|ENST|ENSG|CCDS)[0-9.]{3,})"); + /* - * possible values for the 'feature' parameter of the REST overlap endpoint - * @see + * possible values for the 'feature' parameter of the /overlap REST service + * @see http://rest.ensembl.org/documentation/info/overlap_id */ protected enum EnsemblFeatureType { @@ -43,10 +45,16 @@ public abstract class EnsemblSequenceFetcher extends DbSourceProxyImpl return " "; } + /** + * Ensembl accessions are ENST + 11 digits for human transcript, ENSG for human + * gene. Other species insert 3 letters e.g. ENSMUST..., ENSMUSG... + * + * @see http://www.ensembl.org/Help/View?id=151 + */ @Override public Regex getAccessionValidator() { - return new Regex("((ENSP|ENST|ENSG|CCDS)[0-9.]{3,})"); + return ACCESSION_REGEX; } @Override diff --git a/src/jalview/ext/ensembl/EnsemblXref.java b/src/jalview/ext/ensembl/EnsemblXref.java index 36bd7c5..d4c5b18 100644 --- a/src/jalview/ext/ensembl/EnsemblXref.java +++ b/src/jalview/ext/ensembl/EnsemblXref.java @@ -17,7 +17,14 @@ import org.json.simple.JSONObject; import org.json.simple.parser.JSONParser; import org.json.simple.parser.ParseException; -public class EnsemblXref extends EnsemblRestClient +/** + * A class to fetch cross-references from Ensembl by calling the /xrefs REST + * service + * + * @author gmcarstairs + * + */ +class EnsemblXref extends EnsemblRestClient { @Override diff --git a/test/jalview/datamodel/SequenceTest.java b/test/jalview/datamodel/SequenceTest.java index dcc8ef7..0d40037 100644 --- a/test/jalview/datamodel/SequenceTest.java +++ b/test/jalview/datamodel/SequenceTest.java @@ -384,11 +384,18 @@ public class SequenceTest { SequenceI sq = new Sequence("Seq1", "CD"); sq.setDatasetSequence(new Sequence("Seq1", "ABCDEF")); + sq.getDatasetSequence().addSequenceFeature( + new SequenceFeature("", "", 1, 2, 0f, null)); sq.setStart(3); sq.setEnd(4); SequenceI derived = sq.deriveSequence(); assertEquals("CD", 
derived.getSequenceAsString()); assertSame(sq.getDatasetSequence(), derived.getDatasetSequence()); + + assertNull(((Sequence) seq).sequenceFeatures); + assertNull(((Sequence) derived).sequenceFeatures); + assertNotNull(seq.getSequenceFeatures()); + assertSame(seq.getSequenceFeatures(), derived.getSequenceFeatures()); } /** diff --git a/test/jalview/ext/ensembl/EnsemblProteinTest.java b/test/jalview/ext/ensembl/EnsemblProteinTest.java new file mode 100644 index 0000000..bd0e7b3 --- /dev/null +++ b/test/jalview/ext/ensembl/EnsemblProteinTest.java @@ -0,0 +1,19 @@ +package jalview.ext.ensembl; + +import org.testng.Assert; +import org.testng.annotations.Test; + +public class EnsemblProteinTest +{ + + @Test(groups = "Functional") + public void testIsValidReference() throws Exception + { + EnsemblSequenceFetcher esq = new EnsemblProtein(); + Assert.assertTrue(esq.isValidReference("CCDS5863.1")); + Assert.assertTrue(esq.isValidReference("ENSP00000288602")); + Assert.assertFalse(esq.isValidReference("ENST00000288602")); + Assert.assertFalse(esq.isValidReference("ENSG00000288602")); + } + +} diff --git a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java index ed936d5..a6694eb 100644 --- a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java +++ b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java @@ -120,22 +120,6 @@ public class EnsemblSeqProxyTest SequenceOntologyFactory.setInstance(null); } - @DataProvider(name = "queries") - public Object[][] createQueryData(Method m) - { - return new Object[][] { { "CCDS5863.1" }, { "ENSP00000288602" } }; - } - - @Test(dataProvider = "queries") - public void testIsValidReference(String query) throws Exception - { - EnsemblSequenceFetcher esq = new EnsemblProtein(); - Assert.assertTrue(esq.isValidReference(query), - "Expected reference string " + query - + " to be valid for regex " - + esq.getAccessionValidator().toString()); - } - @DataProvider(name = "ens_seqs") public Object[][] 
createData(Method m) { -- 1.7.10.2