From f23f5cefe9c66bfa8b877c707d25862cabba3ddf Mon Sep 17 00:00:00 2001 From: gmungoc Date: Fri, 14 Oct 2016 11:10:25 +0100 Subject: [PATCH] JAL-2253 safer simpler resolution of identifiers using lookup endpoint; handle error status conditions Conflicts: src/jalview/ext/ensembl/EnsemblGene.java src/jalview/ext/ensembl/EnsemblRestClient.java --- src/jalview/ext/ensembl/EnsemblGene.java | 77 ++++----------------- src/jalview/ext/ensembl/EnsemblGenomes.java | 6 -- src/jalview/ext/ensembl/EnsemblLookup.java | 48 +++++++++++-- src/jalview/ext/ensembl/EnsemblProtein.java | 2 - src/jalview/ext/ensembl/EnsemblRestClient.java | 72 +++++-------------- src/jalview/ext/ensembl/EnsemblSymbol.java | 45 +++++++++--- src/jalview/ext/ensembl/EnsemblXref.java | 18 ++--- test/jalview/ext/ensembl/EnsemblSeqProxyTest.java | 28 -------- 8 files changed, 118 insertions(+), 178 deletions(-) diff --git a/src/jalview/ext/ensembl/EnsemblGene.java b/src/jalview/ext/ensembl/EnsemblGene.java index 365c1c2..f46cb1b 100644 --- a/src/jalview/ext/ensembl/EnsemblGene.java +++ b/src/jalview/ext/ensembl/EnsemblGene.java @@ -161,8 +161,9 @@ public class EnsemblGene extends EnsemblSeqProxy } /** - * Converts a query, which may contain one or more gene or transcript - * identifiers, into a non-redundant list of gene identifiers. + * Converts a query, which may contain one or more gene, transcript, or + * external (to Ensembl) identifiers, into a non-redundant list of gene + * identifiers. 
* * @param accessions * @return @@ -173,54 +174,30 @@ public class EnsemblGene extends EnsemblSeqProxy for (String acc : accessions.split(getAccessionSeparator())) { - if (isGeneIdentifier(acc)) - { - if (!geneIds.contains(acc)) - { - geneIds.add(acc); - } - } - /* - * if given a transcript id, look up its gene parent + * First try lookup as an Ensembl (gene or transcript) identifier */ - else if (isTranscriptIdentifier(acc)) + String geneId = new EnsemblLookup(getDomain()).getGeneId(acc); + if (geneId != null) { - String geneId = new EnsemblLookup(getDomain()).getParent(acc); - if (geneId != null && !geneIds.contains(geneId)) + if (!geneIds.contains(geneId)) { geneIds.add(geneId); } } - else if (isProteinIdentifier(acc)) - { - String tscriptId = new EnsemblLookup(getDomain()).getParent(acc); - if (tscriptId != null) - { - String geneId = new EnsemblLookup(getDomain()) - .getParent(tscriptId); - - if (geneId != null && !geneIds.contains(geneId)) - { - geneIds.add(geneId); - } - } - // NOTE - acc is lost if it resembles an ENS.+ ID but isn't actually - // resolving to one... e.g. ENSMICP00000009241 - } - /* - * if given a gene or other external name, lookup and fetch - * the corresponding gene for all model organisms - */ else { + /* + * if given a gene or other external name, lookup and fetch + * the corresponding gene for all model organisms + */ List ids = new EnsemblSymbol(getDomain(), getDbSource(), - getDbVersion()).getIds(acc); - for (String geneId : ids) + getDbVersion()).getGeneIds(acc); + for (String id : ids) { - if (!geneIds.contains(geneId)) + if (!geneIds.contains(id)) { - geneIds.add(geneId); + geneIds.add(id); } } } @@ -229,30 +206,6 @@ public class EnsemblGene extends EnsemblSeqProxy } /** - * Attempts to get Ensembl stable identifiers for model organisms for a gene - * name by calling the xrefs symbol REST service to resolve the gene name. 
- * - * @param query - * @return - */ - protected String getGeneIdentifiersForName(String query) - { - List ids = new EnsemblSymbol(getDomain(), getDbSource(), - getDbVersion()).getIds(query); - if (ids != null) - { - for (String id : ids) - { - if (isGeneIdentifier(id)) - { - return id; - } - } - } - return null; - } - - /** * Constructs all transcripts for the gene, as identified by "transcript" * features whose Parent is the requested gene. The coding transcript * sequences (i.e. with introns omitted) are added to the alignment. diff --git a/src/jalview/ext/ensembl/EnsemblGenomes.java b/src/jalview/ext/ensembl/EnsemblGenomes.java index ef46a5b..5a390f2 100644 --- a/src/jalview/ext/ensembl/EnsemblGenomes.java +++ b/src/jalview/ext/ensembl/EnsemblGenomes.java @@ -39,12 +39,6 @@ public class EnsemblGenomes extends EnsemblGene } @Override - public boolean isGeneIdentifier(String query) - { - return true; - } - - @Override public String getDbName() { return "EnsemblGenomes"; diff --git a/src/jalview/ext/ensembl/EnsemblLookup.java b/src/jalview/ext/ensembl/EnsemblLookup.java index eb8f90e..6e824cc 100644 --- a/src/jalview/ext/ensembl/EnsemblLookup.java +++ b/src/jalview/ext/ensembl/EnsemblLookup.java @@ -43,6 +43,13 @@ import org.json.simple.parser.ParseException; public class EnsemblLookup extends EnsemblRestClient { + private static final String OBJECT_TYPE_TRANSLATION = "Translation"; + private static final String PARENT = "Parent"; + private static final String OBJECT_TYPE_TRANSCRIPT = "Transcript"; + private static final String ID = "id"; + private static final String OBJECT_TYPE_GENE = "Gene"; + private static final String OBJECT_TYPE = "object_type"; + /** * Default constructor (to use rest.ensembl.org) */ @@ -122,7 +129,7 @@ public class EnsemblLookup extends EnsemblRestClient * @param identifier * @return */ - public String getParent(String identifier) + public String getGeneId(String identifier) { List ids = Arrays.asList(new String[] { identifier }); @@ 
-134,7 +141,7 @@ public class EnsemblLookup extends EnsemblRestClient { br = getHttpResponse(url, ids); } - return (parseResponse(br)); + return br == null ? null : parseResponse(br); } catch (IOException e) { // ignore @@ -155,8 +162,10 @@ public class EnsemblLookup extends EnsemblRestClient } /** - * Parses "Parent" from the JSON response and returns the value, or null if - * not found + * Parses the JSON response and returns the gene identifier, or null if not + * found. If the returned object_type is Gene, returns the id, if Transcript + * returns the Parent. If it is Translation (peptide identifier), then the + * Parent is the transcript identifier, so we redo the search with this value. * * @param br * @return @@ -164,17 +173,42 @@ public class EnsemblLookup extends EnsemblRestClient */ protected String parseResponse(BufferedReader br) throws IOException { - String parent = null; + String geneId = null; JSONParser jp = new JSONParser(); try { JSONObject val = (JSONObject) jp.parse(br); - parent = val.get("Parent").toString(); + String type = val.get(OBJECT_TYPE).toString(); + if (OBJECT_TYPE_GENE.equalsIgnoreCase(type)) + { + geneId = val.get(ID).toString(); + } + else if (OBJECT_TYPE_TRANSCRIPT.equalsIgnoreCase(type)) + { + geneId = val.get(PARENT).toString(); + } + else if (OBJECT_TYPE_TRANSLATION.equalsIgnoreCase(type)) + { + String transcriptId = val.get(PARENT).toString(); + try + { + geneId = getGeneId(transcriptId); + } catch (StackOverflowError e) + { + /* + * unlikely data condition error! 
+ */ + System.err + .println("** Ensembl lookup " + + getUrl(transcriptId).toString() + + " looping on Parent!"); + } + } } catch (ParseException e) { // ignore } - return parent; + return geneId; } } diff --git a/src/jalview/ext/ensembl/EnsemblProtein.java b/src/jalview/ext/ensembl/EnsemblProtein.java index 1554a0b..99006aa 100644 --- a/src/jalview/ext/ensembl/EnsemblProtein.java +++ b/src/jalview/ext/ensembl/EnsemblProtein.java @@ -23,8 +23,6 @@ package jalview.ext.ensembl; import jalview.datamodel.AlignmentI; import jalview.datamodel.SequenceFeature; -import java.util.List; - import com.stevesoft.pat.Regex; /** diff --git a/src/jalview/ext/ensembl/EnsemblRestClient.java b/src/jalview/ext/ensembl/EnsemblRestClient.java index ad6c70c..7752e9c 100644 --- a/src/jalview/ext/ensembl/EnsemblRestClient.java +++ b/src/jalview/ext/ensembl/EnsemblRestClient.java @@ -42,8 +42,6 @@ import org.json.simple.JSONArray; import org.json.simple.JSONObject; import org.json.simple.parser.JSONParser; -import com.stevesoft.pat.Regex; - /** * Base class for Ensembl REST service clients * @@ -76,15 +74,6 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher private final static long VERSION_RETEST_INTERVAL = 1000L * 3600; // 1 hr - private static final Regex PROTEIN_REGEX = new Regex( - "(ENS)([A-Z]{3}|)P[0-9]{11}$"); - - private static final Regex TRANSCRIPT_REGEX = new Regex( - "(ENS)([A-Z]{3}|)T[0-9]{11}$"); - - private static final Regex GENE_REGEX = new Regex( - "(ENS)([A-Z]{3}|)G[0-9]{11}$"); - static { domainData = new HashMap(); @@ -114,42 +103,6 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher setDomain(d); } - /** - * Answers true if the query matches the regular expression pattern for an - * Ensembl transcript stable identifier - * - * @param query - * @return - */ - public boolean isTranscriptIdentifier(String query) - { - return query == null ? 
false : TRANSCRIPT_REGEX.search(query); - } - - /** - * Answers true if the query matches the regular expression pattern for an - * Ensembl protein stable identifier - * - * @param query - * @return - */ - public boolean isProteinIdentifier(String query) - { - return query == null ? false : PROTEIN_REGEX.search(query); - } - - /** - * Answers true if the query matches the regular expression pattern for an - * Ensembl gene stable identifier - * - * @param query - * @return - */ - public boolean isGeneIdentifier(String query) - { - return query == null ? false : GENE_REGEX.search(query); - } - @Override public boolean queryInProgress() { @@ -219,6 +172,11 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher * if ping takes more than 2 seconds to respond, treat as if unavailable */ br = getHttpResponse(ping, null, 2 * 1000); + if (br == null) + { + // error response status + return false; + } JSONParser jp = new JSONParser(); JSONObject val = (JSONObject) jp.parse(br); String pingString = val.get("ping").toString(); @@ -355,13 +313,13 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher void checkRateLimits(HttpURLConnection connection) { // number of requests allowed per time interval: - String limit = connection.getHeaderField("X-RateLimit-Limit"); + // String limit = connection.getHeaderField("X-RateLimit-Limit"); // length of quota time interval in seconds: // String period = connection.getHeaderField("X-RateLimit-Period"); // seconds remaining until usage quota is reset: - String reset = connection.getHeaderField("X-RateLimit-Reset"); + // String reset = connection.getHeaderField("X-RateLimit-Reset"); // number of requests remaining from quota for current period: - String remaining = connection.getHeaderField("X-RateLimit-Remaining"); + // String remaining = connection.getHeaderField("X-RateLimit-Remaining"); // number of seconds to wait before retrying (if remaining == 0) String retryDelay = connection.getHeaderField("Retry-After"); @@ 
-500,6 +458,10 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher url = new URL( getDomain() + "/info/rest?content-type=application/json"); BufferedReader br = getHttpResponse(url, null); + if (br == null) + { + return; + } JSONObject val = (JSONObject) jp.parse(br); String version = val.get("release").toString(); String majorVersion = version.substring(0, version.indexOf(".")); @@ -563,9 +525,13 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher url = new URL( getDomain() + "/info/data?content-type=application/json"); BufferedReader br = getHttpResponse(url, null); - JSONObject val = (JSONObject) jp.parse(br); - JSONArray versions = (JSONArray) val.get("releases"); - domainData.get(getDomain()).dataVersion = versions.get(0).toString(); + if (br != null) + { + JSONObject val = (JSONObject) jp.parse(br); + JSONArray versions = (JSONArray) val.get("releases"); + domainData.get(getDomain()).dataVersion = versions.get(0) + .toString(); + } } catch (Throwable t) { System.err.println( diff --git a/src/jalview/ext/ensembl/EnsemblSymbol.java b/src/jalview/ext/ensembl/EnsemblSymbol.java index 9f86731..671bfec 100644 --- a/src/jalview/ext/ensembl/EnsemblSymbol.java +++ b/src/jalview/ext/ensembl/EnsemblSymbol.java @@ -74,7 +74,8 @@ public class EnsemblSymbol extends EnsemblXref { JSONObject val = (JSONObject) rvals.next(); String id = val.get("id").toString(); - if (id != null && isGeneIdentifier(id)) + String type = val.get("type").toString(); + if (id != null && "gene".equals(type)) { result = id; break; @@ -87,12 +88,31 @@ public class EnsemblSymbol extends EnsemblXref return result; } - protected URL getUrl(String id, Species species) + /** + * Constructs the URL for the REST symbol endpoint + * + * @param id + * the accession id (Ensembl or external) + * @param species + * a species name recognisable by Ensembl + * @param type + * an optional type to filter the response (gene, transcript, + * translation) + * @return + */ + protected URL 
getUrl(String id, Species species, String... type) { - String url = getDomain() + "/xrefs/symbol/" + species.toString() + "/" - + id + "?content-type=application/json"; + StringBuilder sb = new StringBuilder(); + sb.append(getDomain()).append("/xrefs/symbol/") + .append(species.toString()).append("/").append(id) + .append("?content-type=application/json"); + for (String t : type) + { + sb.append("&object_type=").append(t); + } try { + String url = sb.toString(); return new URL(url); } catch (MalformedURLException e) { @@ -107,7 +127,7 @@ public class EnsemblSymbol extends EnsemblXref * @param identifier * @return */ - public List getIds(String identifier) + public List getGeneIds(String identifier) { List result = new ArrayList(); List ids = new ArrayList(); @@ -123,15 +143,18 @@ public class EnsemblSymbol extends EnsemblXref { if (taxon.isModelOrganism()) { - URL url = getUrl(query, taxon); + URL url = getUrl(query, taxon, "gene"); if (url != null) { br = getHttpResponse(url, ids); - } - String geneId = parseSymbolResponse(br); - if (geneId != null) - { - result.add(geneId); + if (br != null) + { + String geneId = parseSymbolResponse(br); + if (geneId != null && !result.contains(geneId)) + { + result.add(geneId); + } + } } } } diff --git a/src/jalview/ext/ensembl/EnsemblXref.java b/src/jalview/ext/ensembl/EnsemblXref.java index c0b00b1..9634ad3 100644 --- a/src/jalview/ext/ensembl/EnsemblXref.java +++ b/src/jalview/ext/ensembl/EnsemblXref.java @@ -125,7 +125,10 @@ class EnsemblXref extends EnsemblRestClient { br = getHttpResponse(url, ids); } - return (parseResponse(br)); + if (br != null) + { + result = parseResponse(br); + } } catch (IOException e) { // ignore @@ -168,16 +171,13 @@ class EnsemblXref extends EnsemblRestClient while (rvals.hasNext()) { JSONObject val = (JSONObject) rvals.next(); - String dbName = val.get("dbname").toString(); - if (dbName.equals(GO_GENE_ONTOLOGY)) - { - continue; - } + String db = val.get("dbname").toString(); String id = 
val.get("primary_id").toString(); - if (dbName != null && id != null) + if (db != null && id != null + && !GO_GENE_ONTOLOGY.equals(db)) { - dbName = DBRefUtils.getCanonicalName(dbName); - DBRefEntry dbref = new DBRefEntry(dbName, getXRefVersion(), id); + db = DBRefUtils.getCanonicalName(db); + DBRefEntry dbref = new DBRefEntry(db, getXRefVersion(), id); result.add(dbref); } } diff --git a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java index aa2c315..e2af26b 100644 --- a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java +++ b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java @@ -191,34 +191,6 @@ public class EnsemblSeqProxyTest } - @Test(groups = "Functional") - public void testIsTranscriptIdentifier() - { - EnsemblSeqProxy testee = new EnsemblGene(); - assertFalse(testee.isTranscriptIdentifier(null)); - assertFalse(testee.isTranscriptIdentifier("")); - assertFalse(testee.isTranscriptIdentifier("ENSG00000012345")); - assertTrue(testee.isTranscriptIdentifier("ENST00000012345")); - assertTrue(testee.isTranscriptIdentifier("ENSMUST00000012345")); - assertFalse(testee.isTranscriptIdentifier("enst00000012345")); - assertFalse(testee.isTranscriptIdentifier("ENST000000123456")); - assertFalse(testee.isTranscriptIdentifier("ENST0000001234")); - } - - @Test(groups = "Functional") - public void testIsGeneIdentifier() - { - EnsemblSeqProxy testee = new EnsemblGene(); - assertFalse(testee.isGeneIdentifier(null)); - assertFalse(testee.isGeneIdentifier("")); - assertFalse(testee.isGeneIdentifier("ENST00000012345")); - assertTrue(testee.isGeneIdentifier("ENSG00000012345")); - assertTrue(testee.isGeneIdentifier("ENSMUSG00000012345")); - assertFalse(testee.isGeneIdentifier("ensg00000012345")); - assertFalse(testee.isGeneIdentifier("ENSG000000123456")); - assertFalse(testee.isGeneIdentifier("ENSG0000001234")); - } - /** * Test the method that appends a single allele's reverse complement to a * string buffer -- 1.7.10.2