From 550c391f0c113658e540783dc89034a34280ef18 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Fri, 19 Feb 2016 16:49:46 +0000 Subject: [PATCH] JAL-1705 return Ensembl genes for model species for a gene name --- src/jalview/ext/ensembl/EnsemblGene.java | 129 +++++++++++------------ src/jalview/ext/ensembl/EnsemblLookup.java | 135 +++++++++++++++++++++++++ src/jalview/ext/ensembl/EnsemblSymbol.java | 121 ++++++++++++++++++++++ src/jalview/ext/ensembl/Species.java | 32 ++++++ test/jalview/ext/ensembl/EnsemblGeneTest.java | 13 +-- 5 files changed, 354 insertions(+), 76 deletions(-) create mode 100644 src/jalview/ext/ensembl/EnsemblLookup.java create mode 100644 src/jalview/ext/ensembl/EnsemblSymbol.java create mode 100644 src/jalview/ext/ensembl/Species.java diff --git a/src/jalview/ext/ensembl/EnsemblGene.java b/src/jalview/ext/ensembl/EnsemblGene.java index 73649b4..10841bd 100644 --- a/src/jalview/ext/ensembl/EnsemblGene.java +++ b/src/jalview/ext/ensembl/EnsemblGene.java @@ -7,8 +7,8 @@ import jalview.datamodel.SequenceI; import jalview.io.gff.SequenceOntologyFactory; import jalview.io.gff.SequenceOntologyI; import jalview.util.MapList; +import jalview.util.StringUtils; -import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -24,9 +24,11 @@ public class EnsemblGene extends EnsemblSeqProxy { private static final String GENE_PREFIX = "gene:"; - // TODO modify to accept other species e.g. ENSMUSGnnn - private static final Regex ACCESSION_REGEX = new Regex( - "(ENSG|ENST)[0-9]{11}$"); + /* + * accepts anything as we will attempt lookup of gene or + * transcript id or gene name + */ + private static final Regex ACCESSION_REGEX = new Regex(".*"); private static final EnsemblFeatureType[] FEATURES_TO_FETCH = { EnsemblFeatureType.gene, EnsemblFeatureType.transcript, @@ -52,8 +54,15 @@ public class EnsemblGene extends EnsemblSeqProxy } /** - * Builds an alignment of all transcripts for the requested gene: + * Returns an alignment containing the gene(s) for the given gene or + * transcript identifier, or external identifier (e.g. Uniprot id). If given a + * gene name or external identifier, returns any related gene sequences found + * for model organisms. If only a single gene is queried for, then its + * transcripts are also retrieved and added to the alignment.
+ * Method: * + * + * @param query + * one or more identifiers separated by a space + * @return an alignment containing one or more genes, and possibly + * transcripts, or null */ @Override public AlignmentI getSequenceRecords(String query) throws Exception { - List transcriptsWanted = null; + // todo: tidy up handling of one or multiple accession ids + String[] queries = query.split(getAccessionSeparator()); + /* + * if given a transcript id, look up its gene parent + */ if (isTranscriptIdentifier(query)) { - transcriptsWanted = Arrays.asList(query - .split(getAccessionSeparator())); - query = getGeneForTranscript(query); + // we are assuming all transcripts have the same gene parent here + query = new EnsemblLookup().getParent(queries[0]); if (query == null) { return null; } } + /* + * if given a gene or other external name, lookup and fetch + * the corresponding gene for all model organisms + */ + if (!isGeneIdentifier(query)) + { + List geneIds = new EnsemblSymbol().getIds(query); + if (geneIds.isEmpty()) + { + return null; + } + String theIds = StringUtils.listToDelimitedString(geneIds, + getAccessionSeparator()); + return getSequenceRecords(theIds); + } + AlignmentI al = super.getSequenceRecords(query); - if (al.getHeight() > 0) + + /* + * if we retrieved a single gene, get its transcripts as well + */ + if (al.getHeight() == 1) { - getTranscripts(al, query, transcriptsWanted); + getTranscripts(al, query); } return al; } /** - * Gets the parent gene identifier for a given transcript identifier, by - * retrieving 'transcript' features overlapping the transcript, and finding - * the Parent property of the feature whose id is the given identifier. + * Attempts to get Ensembl stable identifiers for model organisms for a gene + * name by calling the xrefs symbol REST service to resolve the gene name. * * @param query * @return */ - protected String getGeneForTranscript(String transcriptId) + protected String getGeneIdentifiersForName(String query) { - String geneId = null; - - /* - * reduce multiple transcripts (e.g. from Uniprot x-ref) to the first - * one only as representative (they should all have the same gene) - */ - transcriptId = transcriptId.split(getAccessionSeparator())[0]; - - try + List ids = new EnsemblSymbol().getIds(query); + if (ids != null) { - EnsemblFeatureType[] geneFeature = new EnsemblFeatureType[] { EnsemblFeatureType.transcript }; - AlignmentI al = new EnsemblFeatures().getSequenceRecords( - transcriptId, geneFeature); - if (al != null && al.getHeight() > 0) + for (String id : ids) { - SequenceFeature[] sfs = al.getSequenceAt(0).getSequenceFeatures(); - if (sfs != null) + if (isGeneIdentifier(id)) { - for (SequenceFeature sf : sfs) - { - if (transcriptId.equals(getTranscriptId(sf))) - { - String parent = (String) sf.getValue(PARENT); - if (parent != null && parent.startsWith(GENE_PREFIX)) - { - geneId = parent.substring(5); - } - break; - } - } + return id; } } - return geneId; - } catch (IOException e) - { - System.err.println("Error retrieving gene id for " + transcriptId - + ": " + e.getMessage()); - return null; } + return null; } /** @@ -149,17 +158,14 @@ public class EnsemblGene extends EnsemblSeqProxy * * @param al * @param accId - * @param transcriptsWanted - * optional list of transcript ids to filter by * @throws Exception */ - protected void getTranscripts(AlignmentI al, String accId, - List transcriptsWanted) + protected void getTranscripts(AlignmentI al, String accId) throws Exception { SequenceI gene = al.getSequenceAt(0); List transcriptFeatures = getTranscriptFeatures(accId, - gene, transcriptsWanted); + gene); for (SequenceFeature transcriptFeature : transcriptFeatures) { @@ -250,6 +256,11 @@ public class EnsemblGene extends EnsemblSeqProxy transcript.getDatasetSequence(), mapping, parentId); /* + * fetch and save cross-references + */ + super.getCrossReferences(transcript); + + /* * and finally fetch the protein product and save as a cross-reference */ new EnsemblCdna().addProteinProduct(transcript); @@ -274,12 +285,10 @@ public class EnsemblGene extends EnsemblSeqProxy * * @param accId * @param geneSequence - * @param transcriptsWanted - * optional list of ids to filter on * @return */ protected List getTranscriptFeatures(String accId, - SequenceI geneSequence, List transcriptsWanted) + SequenceI geneSequence) { List transcriptFeatures = new ArrayList(); @@ -292,14 +301,6 @@ public class EnsemblGene extends EnsemblSeqProxy { if (isTranscript(sf.getType())) { - if (transcriptsWanted != null) - { - String transcriptId = (String) sf.getValue("transcript_id"); - if (!transcriptsWanted.contains(transcriptId)) - { - // continue; - } - } String parent = (String) sf.getValue(PARENT); if (parentIdentifier.equals(parent)) { diff --git a/src/jalview/ext/ensembl/EnsemblLookup.java b/src/jalview/ext/ensembl/EnsemblLookup.java new file mode 100644 index 0000000..cd792b5 --- /dev/null +++ b/src/jalview/ext/ensembl/EnsemblLookup.java @@ -0,0 +1,135 @@ +package jalview.ext.ensembl; + +import jalview.datamodel.AlignmentI; + +import java.io.BufferedReader; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Arrays; +import java.util.List; + +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.json.simple.parser.ParseException; + +public class EnsemblLookup extends EnsemblRestClient +{ + + @Override + public String getDbName() + { + return "ENSEMBL"; + } + + @Override + public AlignmentI getSequenceRecords(String queries) throws Exception + { + return null; + } + + @Override + protected URL getUrl(List ids) throws MalformedURLException + { + String identifier = ids.get(0); + return getUrl(identifier); + } + + /** + * @param identifier + * @return + */ + protected URL getUrl(String identifier) + { + String url = ENSEMBL_REST + "/lookup/id/" + identifier + + "?content-type=application/json"; + try + { + return new URL(url); + } catch (MalformedURLException e) + { + return null; + } + } + + @Override + protected boolean useGetRequest() + { + return true; + } + + @Override + protected String getRequestMimeType(boolean multipleIds) + { + return "application/json"; + } + + @Override + protected String getResponseMimeType() + { + return "application/json"; + } + + /** + * Calls the Ensembl lookup REST endpoint and retrieves the 'Parent' for the + * given identifier, or null if not found + * + * @param identifier + * @return + */ + public String getParent(String identifier) + { + List ids = Arrays.asList(new String[] { identifier }); + + BufferedReader br = null; + try + { + URL url = getUrl(identifier); + if (url != null) + { + br = getHttpResponse(url, ids); + } + return (parseResponse(br)); + } catch (IOException e) + { + // ignore + return null; + } finally + { + if (br != null) + { + try + { + br.close(); + } catch (IOException e) + { + // ignore + } + } + } + } + + /** + * Parses "Parent" from the JSON response and returns the value, or null if + * not found + * + * @param br + * @return + * @throws IOException + */ + protected String parseResponse(BufferedReader br) throws IOException + { + String parent = null; + JSONParser jp = new JSONParser(); + try + { + JSONObject val = (JSONObject) jp.parse(br); + parent = val.get("Parent").toString(); + } catch (ParseException e) + { + // ignore + } + return parent; + } + +} diff --git a/src/jalview/ext/ensembl/EnsemblSymbol.java b/src/jalview/ext/ensembl/EnsemblSymbol.java new file mode 100644 index 0000000..5b3baa1 --- /dev/null +++ b/src/jalview/ext/ensembl/EnsemblSymbol.java @@ -0,0 +1,121 @@ +package jalview.ext.ensembl; + +import java.io.BufferedReader; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.json.simple.parser.ParseException; + +public class EnsemblSymbol extends EnsemblXref +{ + /** + * Returns the first "id" value in gene identifier format from the JSON + * response, or null if none found + * + * @param br + * @return + * @throws IOException + */ + protected String parseResponse(BufferedReader br) + throws IOException + { + JSONParser jp = new JSONParser(); + String result = null; + try + { + JSONArray responses = (JSONArray) jp.parse(br); + Iterator rvals = responses.iterator(); + while (rvals.hasNext()) + { + JSONObject val = (JSONObject) rvals.next(); + String id = val.get("id").toString(); + if (id != null && isGeneIdentifier(id)) + { + result = id; + break; + } + } + } catch (ParseException e) + { + // ignore + } + return result; + } + + protected URL getUrl(String id, Species species) + { + String url = ENSEMBL_REST + "/xrefs/symbol/" + species.toString() + "/" + + id + + "?content-type=application/json"; + try + { + return new URL(url); + } catch (MalformedURLException e) + { + return null; + } + } + + /** + * Calls the Ensembl xrefs REST 'symbol' endpoint and retrieves any gene ids + * for the given identifier, for any known model organisms + * + * @param identifier + * @return + */ + public List getIds(String identifier) + { + List result = new ArrayList(); + List ids = new ArrayList(); + ids.add(identifier); + + String[] queries = identifier.split(getAccessionSeparator()); + BufferedReader br = null; + try + { + for (String query : queries) + { + for (Species taxon : Species.values()) + { + if (taxon.isModelOrganism()) + { + URL url = getUrl(query, taxon); + if (url != null) + { + br = getHttpResponse(url, ids); + } + String geneId = parseResponse(br); + if (geneId != null) + { + result.add(geneId); + } + } + } + } + } catch (IOException e) + { + // ignore + } finally + { + if (br != null) + { + try + { + br.close(); + } catch (IOException e) + { + // ignore + } + } + } + return result; + } + +} diff --git a/src/jalview/ext/ensembl/Species.java b/src/jalview/ext/ensembl/Species.java new file mode 100644 index 0000000..d8a00a5 --- /dev/null +++ b/src/jalview/ext/ensembl/Species.java @@ -0,0 +1,32 @@ +package jalview.ext.ensembl; + +/** + * Selected species identifiers used by Ensembl + * + * @author gmcarstairs + * @see http://rest.ensembl.org/info/species?content-type=text/xml + */ +enum Species +{ + /* + * using any suitably readable alias as the enum name; these are all + * valid species parameters to Ensembl REST services where applicable + */ + human(true), mouse(true), s_cerevisiae(true), cow(false), pig(false), + rat(true), celegans(true), sheep(false), horse(false), gorilla(false), + rabbit(false), gibbon(false), dog(false), orangutan(false), + xenopus(true), chimpanzee(false), cat(false), zebrafish(true), chicken( + true), dmelanogaster(true); + + boolean modelOrganism; + + private Species(boolean model) + { + this.modelOrganism = model; + } + + boolean isModelOrganism() + { + return modelOrganism; + } +} diff --git a/test/jalview/ext/ensembl/EnsemblGeneTest.java b/test/jalview/ext/ensembl/EnsemblGeneTest.java index a262c1e..d1c7e2f 100644 --- a/test/jalview/ext/ensembl/EnsemblGeneTest.java +++ b/test/jalview/ext/ensembl/EnsemblGeneTest.java @@ -12,7 +12,6 @@ import jalview.io.gff.SequenceOntologyFactory; import jalview.io.gff.SequenceOntologyLite; import jalview.util.MapList; -import java.util.Arrays; import java.util.List; import org.testng.annotations.AfterClass; @@ -160,21 +159,11 @@ public class EnsemblGeneTest * with no filter */ List features = testee.getTranscriptFeatures(geneId, - genomic, null); + genomic); assertEquals(3, features.size()); assertSame(sf1, features.get(0)); assertSame(sf2, features.get(1)); assertSame(sf3, features.get(2)); - - /* - * with filter - */ - List ids = Arrays.asList(new String[] { "transcript2", - "transcript3" }); - features = testee.getTranscriptFeatures(geneId, genomic, ids); - assertEquals(2, features.size()); - assertSame(sf2, features.get(0)); - assertSame(sf3, features.get(1)); } /** -- 1.7.10.2