From 949fed115506ff00c221669e096546f9c39a6ace Mon Sep 17 00:00:00 2001 From: gmungoc Date: Sat, 30 Jan 2016 06:33:01 +0000 Subject: [PATCH] JAL-1705 refactored cross-reference fetching (CCDS, Uniprot, PDB) --- src/jalview/ext/ensembl/EnsemblCdna.java | 10 ++++++ src/jalview/ext/ensembl/EnsemblGene.java | 9 +++++ src/jalview/ext/ensembl/EnsemblProtein.java | 12 +++++++ src/jalview/ext/ensembl/EnsemblSeqProxy.java | 36 +++++++++++++------ src/jalview/ext/ensembl/EnsemblXref.java | 49 +++++++++++++++----------- 5 files changed, 84 insertions(+), 32 deletions(-) diff --git a/src/jalview/ext/ensembl/EnsemblCdna.java b/src/jalview/ext/ensembl/EnsemblCdna.java index a2ecfcd..139e44f 100644 --- a/src/jalview/ext/ensembl/EnsemblCdna.java +++ b/src/jalview/ext/ensembl/EnsemblCdna.java @@ -3,6 +3,8 @@ package jalview.ext.ensembl; import jalview.datamodel.SequenceFeature; import jalview.io.gff.SequenceOntology; +import java.util.List; + import com.stevesoft.pat.Regex; public class EnsemblCdna extends EnsemblSeqProxy @@ -78,4 +80,12 @@ public class EnsemblCdna extends EnsemblSeqProxy return false; } + @Override + protected List getCrossReferenceDatabases() + { + return super.getCrossReferenceDatabases(); + // 30/01/16 also found Vega_transcript, OTTT, ENS_LRG_transcript, UCSC, + // HGNC_trans_name, RefSeq_mRNA, RefSeq_mRNA_predicted + } + } diff --git a/src/jalview/ext/ensembl/EnsemblGene.java b/src/jalview/ext/ensembl/EnsemblGene.java index b5ea686..1325bec 100644 --- a/src/jalview/ext/ensembl/EnsemblGene.java +++ b/src/jalview/ext/ensembl/EnsemblGene.java @@ -290,4 +290,13 @@ public class EnsemblGene extends EnsemblSeqProxy return false; } + @Override + protected List getCrossReferenceDatabases() + { + // found these for ENSG00000157764 on 30/01/2016: + // return new String[] {"Vega_gene", "OTTG", "ENS_LRG_gene", "ArrayExpress", + // "EntrezGene", "HGNC", "MIM_GENE", "MIM_MORBID", "WikiGene"}; + return super.getCrossReferenceDatabases(); + } + } diff --git a/src/jalview/ext/ensembl/EnsemblProtein.java b/src/jalview/ext/ensembl/EnsemblProtein.java index 5238f98..c40fdd0 100644 --- a/src/jalview/ext/ensembl/EnsemblProtein.java +++ b/src/jalview/ext/ensembl/EnsemblProtein.java @@ -3,9 +3,15 @@ package jalview.ext.ensembl; import jalview.datamodel.AlignmentI; import jalview.datamodel.SequenceFeature; +import java.util.Arrays; +import java.util.List; + public class EnsemblProtein extends EnsemblSeqProxy { + private static final List CROSSREFS = Arrays.asList(new String[] { + "PDB", "Uniprot/SPTREMBL", "Uniprot/SWISSPROT" }); + public EnsemblProtein() { super(); @@ -64,4 +70,10 @@ public class EnsemblProtein extends EnsemblSeqProxy return false; } + @Override + protected List getCrossReferenceDatabases() + { + return CROSSREFS; + } + } diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index 8698b78..0bfeda1 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -36,6 +36,9 @@ import java.util.Map.Entry; */ public abstract class EnsemblSeqProxy extends EnsemblRestClient { + private static final List CROSS_REFERENCES = Arrays + .asList(new String[] { "CCDS" }); + protected static final String CONSEQUENCE_TYPE = "consequence_type"; protected static final String PARENT = "Parent"; @@ -173,7 +176,11 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient addFeaturesAndProduct(accId, alignment); } - inProgress = false; + for (SequenceI seq : alignment.getSequences()) + { + getCrossReferences(seq); + } + System.out.println(getClass().getName() + " took " + (System.currentTimeMillis() - now) + "ms to fetch"); return alignment; @@ -265,8 +272,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient proteinSeq.createDatasetSequence(); querySeq.createDatasetSequence(); - getProteinCrossReferences(proteinSeq); - MapList mapList = mapCdsToProtein(querySeq, proteinSeq); if (mapList != null) { @@ -293,26 +298,35 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * Get Uniprot and PDB xrefs from Ensembl, and attach them to the protein * sequence * - * @param proteinSeq + * @param seq */ - protected void getProteinCrossReferences(SequenceI proteinSeq) + protected void getCrossReferences(SequenceI seq) { - while (proteinSeq.getDatasetSequence() != null) + while (seq.getDatasetSequence() != null) { - proteinSeq = proteinSeq.getDatasetSequence(); + seq = seq.getDatasetSequence(); } EnsemblXref xrefFetcher = new EnsemblXref(); - List xrefs = xrefFetcher.getCrossReferences( - proteinSeq.getName(), "PDB", "Uniprot/SPTREMBL", - "Uniprot/SWISSPROT"); + List xrefs = xrefFetcher.getCrossReferences(seq.getName(), + getCrossReferenceDatabases()); for (DBRefEntry xref : xrefs) { - proteinSeq.addDBRef(xref); + seq.addDBRef(xref); } } /** + * Returns a list of database names to be used when fetching cross-references. + * + * @return + */ + protected List getCrossReferenceDatabases() + { + return CROSS_REFERENCES; + } + + /** * Returns a mapping from dna to protein by inspecting sequence features of * type "CDS" on the dna. * diff --git a/src/jalview/ext/ensembl/EnsemblXref.java b/src/jalview/ext/ensembl/EnsemblXref.java index 6a4f369..36bd7c5 100644 --- a/src/jalview/ext/ensembl/EnsemblXref.java +++ b/src/jalview/ext/ensembl/EnsemblXref.java @@ -60,15 +60,16 @@ public class EnsemblXref extends EnsemblRestClient /** * Calls the Ensembl xrefs REST endpoint and retrieves any cross-references * ("primary_id") for the given identifier (Ensembl accession id) and database - * name. The "dbname" returned by Ensembl is canonicalised to Jalview's - * standard version, and a DBRefEntry constructed. + * names. The "dbname" returned by Ensembl is canonicalised to Jalview's + * standard version, and a DBRefEntry constructed. If no databases are + * specified, all available cross-references are retrieved. * * @param identifier - * @param database + * @param databases * @return */ public List getCrossReferences(String identifier, - String... database) + List databases) { List result = new ArrayList(); List ids = new ArrayList(); @@ -77,22 +78,12 @@ public class EnsemblXref extends EnsemblRestClient BufferedReader br = null; try { - for (String db : database) - { - URL url = getUrl(identifier, db); + URL url = getUrl(identifier); if (url != null) { br = getHttpResponse(url, ids); } - for (DBRefEntry xref : parseResponse(br)) - { - if (!result.contains(xref)) - { - result.add(xref); - } - } - br.close(); - } + return (parseResponse(br, databases)); } catch (IOException e) { // ignore @@ -114,14 +105,17 @@ public class EnsemblXref extends EnsemblRestClient } /** - * Parses "primary_id" and "dbname" values from the JSON response and returns - * a list of DBRefEntry constructed. + * Parses "primary_id" and "dbname" values from the JSON response and + * constructs a DBRefEntry if the dbname is in the list supplied. Returns a + * list of DBRefEntry created. * * @param br + * @param databases * @return * @throws IOException */ - protected List parseResponse(BufferedReader br) + protected List parseResponse(BufferedReader br, + List databases) throws IOException { JSONParser jp = new JSONParser(); @@ -134,6 +128,11 @@ public class EnsemblXref extends EnsemblRestClient { JSONObject val = (JSONObject) rvals.next(); String dbName = val.get("dbname").toString(); + if (databases != null && !databases.isEmpty() + && !databases.contains(dbName)) + { + continue; + } String id = val.get("primary_id").toString(); if (dbName != null && id != null) { @@ -149,10 +148,18 @@ public class EnsemblXref extends EnsemblRestClient return result; } - protected URL getUrl(String identifier, String db) + /** + * Returns the URL for the REST endpoint to fetch all cross-references for an + * identifier. Note this may return protein cross-references for nucleotide. + * Filter the returned list as required. + * + * @param identifier + * @return + */ + protected URL getUrl(String identifier) { String url = ENSEMBL_REST + "/xrefs/id/" + identifier - + "?content-type=application/json&external_db=" + db; + + "?content-type=application/json&all_levels=1"; try { return new URL(url); -- 1.7.10.2