From e24933a537e0f640c75d4685c468615872bc77fc Mon Sep 17 00:00:00 2001 From: gmungoc Date: Fri, 19 Feb 2016 17:00:19 +0000 Subject: [PATCH] JAL-1705 regular expression updates, tests, other refactoring --- src/jalview/ext/ensembl/EnsemblCdna.java | 8 +++-- src/jalview/ext/ensembl/EnsemblProtein.java | 8 +++-- src/jalview/ext/ensembl/EnsemblRestClient.java | 18 ++++++++++ src/jalview/ext/ensembl/EnsemblSeqProxy.java | 36 +++++++++----------- .../ext/ensembl/EnsemblSequenceFetcher.java | 8 +++-- src/jalview/ext/ensembl/EnsemblXref.java | 5 ++- test/jalview/ext/ensembl/EnsemblCdnaTest.java | 14 ++++++++ test/jalview/ext/ensembl/EnsemblCdsTest.java | 14 ++++++++ test/jalview/ext/ensembl/EnsemblProteinTest.java | 2 ++ test/jalview/ext/ensembl/EnsemblSeqProxyTest.java | 14 ++++++++ 10 files changed, 99 insertions(+), 28 deletions(-) diff --git a/src/jalview/ext/ensembl/EnsemblCdna.java b/src/jalview/ext/ensembl/EnsemblCdna.java index f60125b..467fc6d 100644 --- a/src/jalview/ext/ensembl/EnsemblCdna.java +++ b/src/jalview/ext/ensembl/EnsemblCdna.java @@ -10,9 +10,13 @@ import com.stevesoft.pat.Regex; public class EnsemblCdna extends EnsemblSeqProxy { - // TODO modify to accept other species e.g. ENSMUSPnnn + /* + * accepts ENST or ENSG with 11 digits + * or ENSMUST or similar for other species + * or CCDSnnnnn.nn with at least 3 digits + */ private static final Regex ACCESSION_REGEX = new Regex( - "(ENST|ENSG|CCDS)[0-9.]{3,}$"); + "(ENS([A-Z]{3}|)[TG][0-9]{11}$)" + "|" + "(CCDS[0-9.]{3,}$)"); /* * fetch exon features on genomic sequence (to identify the cdna regions) diff --git a/src/jalview/ext/ensembl/EnsemblProtein.java b/src/jalview/ext/ensembl/EnsemblProtein.java index 29c7eda..fb79ccf 100644 --- a/src/jalview/ext/ensembl/EnsemblProtein.java +++ b/src/jalview/ext/ensembl/EnsemblProtein.java @@ -10,9 +10,13 @@ import com.stevesoft.pat.Regex; public class EnsemblProtein extends EnsemblSeqProxy { - // TODO modify to accept other species e.g. 
ENSMUSPnnn + /* + * accepts ENSP with 11 digits + * or ENSMUSP or similar for other species + * or CCDSnnnnn.nn with at least 3 digits + */ private static final Regex ACCESSION_REGEX = new Regex( - "(ENSP|CCDS)[0-9.]{3,}$"); + "(ENS([A-Z]{3}|)P[0-9]{11}$)" + "|" + "(CCDS[0-9.]{3,}$)"); private static final List CROSSREFS = Arrays.asList(new String[] { "PDB", "Uniprot/SPTREMBL", "Uniprot/SWISSPROT" }); diff --git a/src/jalview/ext/ensembl/EnsemblRestClient.java b/src/jalview/ext/ensembl/EnsemblRestClient.java index 215eb7a..297f71b 100644 --- a/src/jalview/ext/ensembl/EnsemblRestClient.java +++ b/src/jalview/ext/ensembl/EnsemblRestClient.java @@ -14,6 +14,8 @@ import java.util.List; import javax.ws.rs.HttpMethod; +import com.stevesoft.pat.Regex; + /** * Base class for Ensembl REST service clients * @@ -31,12 +33,28 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher private final static long RETEST_INTERVAL = 10000L; // 10 seconds + private static final Regex TRANSCRIPT_REGEX = new Regex( + "(ENS)([A-Z]{3}|)T[0-9]{11}$"); + + private static final Regex GENE_REGEX = new Regex( + "(ENS)([A-Z]{3}|)G[0-9]{11}$"); + private static boolean ensemblRestAvailable = false; private static long lastCheck = -1; protected volatile boolean inProgress = false; + public static boolean isTranscriptIdentifier(String query) + { + return query == null ? false : TRANSCRIPT_REGEX.search(query); + } + + public static boolean isGeneIdentifier(String query) + { + return query == null ? 
false : GENE_REGEX.search(query); + } + @Override public boolean queryInProgress() { diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index a2be17b..77263ff 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -30,8 +30,6 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map.Entry; -import com.stevesoft.pat.Regex; - /** * Base class for Ensembl sequence fetchers * @@ -39,12 +37,8 @@ import com.stevesoft.pat.Regex; */ public abstract class EnsemblSeqProxy extends EnsemblRestClient { - // TODO modify to accept other species e.g. ENSMUSTnnn - private static final Regex TRANSCRIPT_REGEX = new Regex( - "(ENST)[0-9]{11}$"); - private static final List CROSS_REFERENCES = Arrays - .asList(new String[] { "CCDS" }); + .asList(new String[] { "CCDS", "Uniprot/SWISSPROT" }); protected static final String CONSEQUENCE_TYPE = "consequence_type"; @@ -163,6 +157,11 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } } + if (alignment == null) + { + return null; + } + /* * fetch and transfer genomic sequence features, * fetch protein product and add as cross-reference @@ -344,7 +343,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient int mappedDnaLength = getCdsRanges(dnaSeq, ranges); int proteinLength = proteinSeq.getLength(); - List proteinRange = new ArrayList(); int proteinStart = 1; /* @@ -356,15 +354,20 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient proteinStart = 2; proteinLength--; } - proteinRange.add(new int[] { proteinStart, proteinLength }); + List proteinRange = new ArrayList(); /* * dna length should map to protein (or protein plus stop codon) */ int codesForResidues = mappedDnaLength / 3; - if (codesForResidues == proteinLength - || codesForResidues == (proteinLength + 1)) + if (codesForResidues == (proteinLength + 1)) + { + MappingUtils.unmapStopCodon(ranges, mappedDnaLength); 
+ codesForResidues--; + } + if (codesForResidues == proteinLength) { + proteinRange.add(new int[] { proteinStart, proteinLength }); return new MapList(ranges, proteinRange, 3, 1); } return null; @@ -389,14 +392,14 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient { return 0; } + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); int mappedDnaLength = 0; for (SequenceFeature sf : sfs) { /* * process a CDS feature (or a sub-type of CDS) */ - if (SequenceOntologyFactory.getInstance().isA(sf.getType(), - SequenceOntologyI.CDS)) + if (so.isA(sf.getType(), SequenceOntologyI.CDS)) { int phase = 0; try { @@ -411,7 +414,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ int begin = sf.getBegin(); int end = sf.getEnd(); - if (ranges.isEmpty() && phase > 0) + if (ranges.isEmpty()) { begin += phase; if (begin > end) @@ -1129,9 +1132,4 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient || SequenceOntologyFactory.getInstance().isA(featureType, SequenceOntologyI.TRANSCRIPT); } - - public static boolean isTranscriptIdentifier(String query) - { - return query == null ? false : TRANSCRIPT_REGEX.search(query); - } } diff --git a/src/jalview/ext/ensembl/EnsemblSequenceFetcher.java b/src/jalview/ext/ensembl/EnsemblSequenceFetcher.java index 67c5e63..9a4952e 100644 --- a/src/jalview/ext/ensembl/EnsemblSequenceFetcher.java +++ b/src/jalview/ext/ensembl/EnsemblSequenceFetcher.java @@ -12,9 +12,13 @@ import com.stevesoft.pat.Regex; */ abstract class EnsemblSequenceFetcher extends DbSourceProxyImpl { - // TODO modify to accept other species e.g. 
ENSMUSTnnn + /* + * accepts ENSG/T/E/P with 11 digits + * or ENSMUSP or similar for other species + * or CCDSnnnnn.nn with at least 3 digits + */ private static final Regex ACCESSION_REGEX = new Regex( - "(ENSP|ENST|ENSG|CCDS)[0-9.]{3,}$"); + "(ENS([A-Z]{3}|)[GTEP]{1}[0-9]{11}$)" + "|" + "(CCDS[0-9.]{3,}$)"); /* * possible values for the 'feature' parameter of the /overlap REST service diff --git a/src/jalview/ext/ensembl/EnsemblXref.java b/src/jalview/ext/ensembl/EnsemblXref.java index d4c5b18..514e44a 100644 --- a/src/jalview/ext/ensembl/EnsemblXref.java +++ b/src/jalview/ext/ensembl/EnsemblXref.java @@ -22,7 +22,7 @@ import org.json.simple.parser.ParseException; * service * * @author gmcarstairs - * + * @see http://rest.ensembl.org/documentation/info/xref_id */ class EnsemblXref extends EnsemblRestClient { @@ -42,8 +42,7 @@ class EnsemblXref extends EnsemblRestClient @Override protected URL getUrl(List ids) throws MalformedURLException { - // TODO Auto-generated method stub - return null; + return getUrl(ids.get(0)); } @Override diff --git a/test/jalview/ext/ensembl/EnsemblCdnaTest.java b/test/jalview/ext/ensembl/EnsemblCdnaTest.java index 2d99a52..90c38d4 100644 --- a/test/jalview/ext/ensembl/EnsemblCdnaTest.java +++ b/test/jalview/ext/ensembl/EnsemblCdnaTest.java @@ -14,6 +14,7 @@ import jalview.util.MapList; import java.util.List; +import org.testng.Assert; import org.testng.annotations.AfterClass; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -234,4 +235,17 @@ public class EnsemblCdnaTest sf.setType("CDS"); assertFalse(testee.identifiesSequence(sf, accId)); } + + @Test(groups = "Functional") + public void testIsValidReference() throws Exception + { + EnsemblSequenceFetcher esq = new EnsemblCdna(); + Assert.assertTrue(esq.isValidReference("CCDS5863.1")); + Assert.assertTrue(esq.isValidReference("ENST00000288602")); + Assert.assertTrue(esq.isValidReference("ENSG00000288602")); + 
Assert.assertFalse(esq.isValidReference("ENSP00000288602")); + Assert.assertFalse(esq.isValidReference("ENST0000288602")); + // non-human species having a 3 character identifier included: + Assert.assertTrue(esq.isValidReference("ENSMUSG00000099398")); + } } diff --git a/test/jalview/ext/ensembl/EnsemblCdsTest.java b/test/jalview/ext/ensembl/EnsemblCdsTest.java index fb17845..183f933 100644 --- a/test/jalview/ext/ensembl/EnsemblCdsTest.java +++ b/test/jalview/ext/ensembl/EnsemblCdsTest.java @@ -13,6 +13,7 @@ import jalview.util.MapList; import java.util.List; +import org.testng.Assert; import org.testng.annotations.AfterClass; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -151,4 +152,17 @@ public class EnsemblCdsTest assertFalse(testee.identifiesSequence(sf, accId)); } + @Test(groups = "Functional") + public void testIsValidReference() throws Exception + { + EnsemblSequenceFetcher esq = new EnsemblCds(); + Assert.assertTrue(esq.isValidReference("CCDS5863.1")); + Assert.assertTrue(esq.isValidReference("ENST00000288602")); + Assert.assertTrue(esq.isValidReference("ENSG00000288602")); + Assert.assertTrue(esq.isValidReference("ENSP00000288602")); + Assert.assertFalse(esq.isValidReference("ENST0000288602")); + // non-human species have a 3 character identifier included: + Assert.assertTrue(esq.isValidReference("ENSMUSG00000099398")); + } + } diff --git a/test/jalview/ext/ensembl/EnsemblProteinTest.java b/test/jalview/ext/ensembl/EnsemblProteinTest.java index c5db0a8..e6f6683 100644 --- a/test/jalview/ext/ensembl/EnsemblProteinTest.java +++ b/test/jalview/ext/ensembl/EnsemblProteinTest.java @@ -16,6 +16,8 @@ public class EnsemblProteinTest Assert.assertTrue(esq.isValidReference("ENSP00000288602")); Assert.assertFalse(esq.isValidReference("ENST00000288602")); Assert.assertFalse(esq.isValidReference("ENSG00000288602")); + // non-human species having a 3 character identifier included: + 
Assert.assertTrue(esq.isValidReference("ENSMUSP00000099398")); } @Test(groups = "Functional") diff --git a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java index 73d2858..7ef8dd7 100644 --- a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java +++ b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java @@ -368,8 +368,22 @@ public class EnsemblSeqProxyTest assertFalse(EnsemblSeqProxy.isTranscriptIdentifier("")); assertFalse(EnsemblSeqProxy.isTranscriptIdentifier("ENSG00000012345")); assertTrue(EnsemblSeqProxy.isTranscriptIdentifier("ENST00000012345")); + assertTrue(EnsemblSeqProxy.isTranscriptIdentifier("ENSMUST00000012345")); assertFalse(EnsemblSeqProxy.isTranscriptIdentifier("enst00000012345")); assertFalse(EnsemblSeqProxy.isTranscriptIdentifier("ENST000000123456")); assertFalse(EnsemblSeqProxy.isTranscriptIdentifier("ENST0000001234")); } + + @Test(groups = "Functional") + public void testIsGeneIdentifier() + { + assertFalse(EnsemblSeqProxy.isGeneIdentifier(null)); + assertFalse(EnsemblSeqProxy.isGeneIdentifier("")); + assertFalse(EnsemblSeqProxy.isGeneIdentifier("ENST00000012345")); + assertTrue(EnsemblSeqProxy.isGeneIdentifier("ENSG00000012345")); + assertTrue(EnsemblSeqProxy.isGeneIdentifier("ENSMUSG00000012345")); + assertFalse(EnsemblSeqProxy.isGeneIdentifier("ensg00000012345")); + assertFalse(EnsemblSeqProxy.isGeneIdentifier("ENSG000000123456")); + assertFalse(EnsemblSeqProxy.isGeneIdentifier("ENSG0000001234")); + } } \ No newline at end of file -- 1.7.10.2