From: gmungoc Date: Fri, 10 Aug 2018 09:59:42 +0000 (+0100) Subject: JAL-3076 fetch Ensembl sequence as JSON instead of Fasta X-Git-Tag: Release_2_11_0~20^2~3 X-Git-Url: http://source.jalview.org/gitweb/?p=jalview.git;a=commitdiff_plain;h=f59dd9efbb3dfc313ab0b0507832e21cd0076fe1 JAL-3076 fetch Ensembl sequence as JSON instead of Fasta --- diff --git a/src/jalview/ext/ensembl/EnsemblFeatures.java b/src/jalview/ext/ensembl/EnsemblFeatures.java index cb6f548..582eac6 100644 --- a/src/jalview/ext/ensembl/EnsemblFeatures.java +++ b/src/jalview/ext/ensembl/EnsemblFeatures.java @@ -22,9 +22,11 @@ package jalview.ext.ensembl; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; +import jalview.io.DataSourceType; import jalview.io.FeaturesFile; import jalview.io.FileParse; +import java.io.BufferedReader; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; @@ -84,12 +86,13 @@ class EnsemblFeatures extends EnsemblRestClient // TODO: use a vararg String... for getSequenceRecords instead? List queries = new ArrayList<>(); queries.add(query); - FileParse fp = getSequenceReader(queries); - if (fp == null || !fp.isValid()) + BufferedReader fp = getSequenceReader(queries); + if (fp == null) { return null; } - FeaturesFile fr = new FeaturesFile(fp); + FeaturesFile fr = new FeaturesFile( + new FileParse(fp, null, DataSourceType.URL)); return new Alignment(fr.getSeqsAsArray()); } @@ -140,13 +143,13 @@ class EnsemblFeatures extends EnsemblRestClient * describes the required encoding of the response. */ @Override - protected String getRequestMimeType(boolean multipleIds) + protected String getRequestMimeType() { return "text/x-gff3"; } /** - * Returns the MIME type for GFF3. + * Returns the MIME type for GFF3 */ @Override protected String getResponseMimeType() diff --git a/src/jalview/ext/ensembl/EnsemblInfo.java b/src/jalview/ext/ensembl/EnsemblInfo.java index 37dff51..fa24f1e 100644 --- a/src/jalview/ext/ensembl/EnsemblInfo.java +++ b/src/jalview/ext/ensembl/EnsemblInfo.java @@ -61,18 +61,6 @@ public class EnsemblInfo extends EnsemblRestClient return true; } - @Override - protected String getRequestMimeType(boolean multipleIds) - { - return "application/json"; - } - - @Override - protected String getResponseMimeType() - { - return "application/json"; - } - /** * Answers the domain (http://rest.ensembl.org or * http://rest.ensemblgenomes.org) for the given division, or null if not diff --git a/src/jalview/ext/ensembl/EnsemblLookup.java b/src/jalview/ext/ensembl/EnsemblLookup.java index 82690a3..c6b794a 100644 --- a/src/jalview/ext/ensembl/EnsemblLookup.java +++ b/src/jalview/ext/ensembl/EnsemblLookup.java @@ -117,18 +117,6 @@ public class EnsemblLookup extends EnsemblRestClient return true; } - @Override - protected String getRequestMimeType(boolean multipleIds) - { - return "application/json"; - } - - @Override - protected String getResponseMimeType() - { - return "application/json"; - } - /** * Returns the gene id related to the given identifier (which may be for a * gene, transcript or protein), or null if none is found diff --git a/src/jalview/ext/ensembl/EnsemblMap.java b/src/jalview/ext/ensembl/EnsemblMap.java index 8ca60de..f01bd4f 100644 --- a/src/jalview/ext/ensembl/EnsemblMap.java +++ b/src/jalview/ext/ensembl/EnsemblMap.java @@ -98,18 +98,6 @@ public class EnsemblMap extends EnsemblRestClient } @Override - protected String getRequestMimeType(boolean multipleIds) - { - return "application/json"; - } - - @Override - protected String getResponseMimeType() - { - return "application/json"; - } - - @Override protected URL getUrl(List ids) throws MalformedURLException { return null; // not used diff --git a/src/jalview/ext/ensembl/EnsemblRestClient.java b/src/jalview/ext/ensembl/EnsemblRestClient.java index f36e111..19940ff 100644 --- a/src/jalview/ext/ensembl/EnsemblRestClient.java +++ b/src/jalview/ext/ensembl/EnsemblRestClient.java @@ -20,8 +20,6 @@ */ package jalview.ext.ensembl; -import jalview.io.DataSourceType; -import jalview.io.FileParse; import jalview.util.StringUtils; import java.io.BufferedReader; @@ -153,22 +151,28 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher protected abstract boolean useGetRequest(); /** - * Return the desired value for the Content-Type request header - * - * @param multipleIds + * Returns the desired value for the Content-Type request header. Default is + * application/json, override if required to vary this. * * @return * @see https://github.com/Ensembl/ensembl-rest/wiki/HTTP-Headers */ - protected abstract String getRequestMimeType(boolean multipleIds); + protected String getRequestMimeType() + { + return "application/json"; + } /** - * Return the desired value for the Accept request header + * Return the desired value for the Accept request header. Default is + * application/json, override if required to vary this. * * @return * @see https://github.com/Ensembl/ensembl-rest/wiki/HTTP-Headers */ - protected abstract String getResponseMimeType(); + protected String getResponseMimeType() + { + return "application/json"; + } /** * Checks Ensembl's REST 'ping' endpoint, and returns true if response @@ -222,25 +226,20 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher } /** - * returns a reader to a Fasta response from the Ensembl sequence endpoint + * Returns a reader to a (Json) response from the Ensembl sequence endpoint. + * If the request failed the return value may be null. * * @param ids * @return * @throws IOException */ - protected FileParse getSequenceReader(List ids) throws IOException + protected BufferedReader getSequenceReader(List ids) + throws IOException { URL url = getUrl(ids); BufferedReader reader = getHttpResponse(url, ids); - if (reader == null) - { - // request failed - return null; - } - FileParse fp = new FileParse(reader, url.toString(), - DataSourceType.URL); - return fp; + return reader; } /** @@ -332,8 +331,7 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher boolean multipleIds = ids != null && ids.size() > 1; connection.setRequestMethod( multipleIds ? HttpMethod.POST : HttpMethod.GET); - connection.setRequestProperty("Content-Type", - getRequestMimeType(multipleIds)); + connection.setRequestProperty("Content-Type", getRequestMimeType()); connection.setRequestProperty("Accept", getResponseMimeType()); connection.setUseCaches(false); diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index 9229379..f96f1d5 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -28,12 +28,11 @@ import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; import jalview.datamodel.Mapping; +import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.datamodel.features.SequenceFeatures; import jalview.exceptions.JalviewException; -import jalview.io.FastaFile; -import jalview.io.FileParse; import jalview.io.gff.Gff3Helper; import jalview.io.gff.SequenceOntologyFactory; import jalview.io.gff.SequenceOntologyI; @@ -42,6 +41,7 @@ import jalview.util.DBRefUtils; import jalview.util.IntRangeComparator; import jalview.util.MapList; +import java.io.BufferedReader; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; @@ -50,6 +50,10 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.json.simple.parser.ParseException; + /** * Base class for Ensembl sequence fetchers * @@ -385,50 +389,44 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient inProgress = false; throw new JalviewException("ENSEMBL Rest API not available."); } - FileParse fp = getSequenceReader(ids); - if (fp == null) + BufferedReader br = getSequenceReader(ids); + if (br == null) { return alignment; } - FastaFile fr = new FastaFile(fp); - if (fr.hasWarningMessage()) + List seqs = parseSequenceJson(br); + + if (seqs.isEmpty()) { - System.out.println( - String.format("Warning when retrieving %d ids %s\n%s", - ids.size(), ids.toString(), fr.getWarningMessage())); + throw new IOException("No data returned for " + ids); } - else if (fr.getSeqs().size() != ids.size()) + + if (seqs.size() != ids.size()) { System.out.println(String.format( "Only retrieved %d sequences for %d query strings", - fr.getSeqs().size(), ids.size())); + seqs.size(), ids.size())); } - if (fr.getSeqs().size() == 1 && fr.getSeqs().get(0).getLength() == 0) + if (!seqs.isEmpty()) { - /* - * POST request has returned an empty FASTA file e.g. for invalid id - */ - throw new IOException("No data returned for " + ids); - } - - if (fr.getSeqs().size() > 0) - { - AlignmentI seqal = new Alignment(fr.getSeqsAsArray()); - for (SequenceI sq : seqal.getSequences()) + AlignmentI seqal = new Alignment( + seqs.toArray(new SequenceI[seqs.size()])); + for (SequenceI seq : seqs) { - if (sq.getDescription() == null) + if (seq.getDescription() == null) { - sq.setDescription(getDbName()); + seq.setDescription(getDbName()); } - String name = sq.getName(); + String name = seq.getName(); if (ids.contains(name) || ids.contains(name.replace("ENSP", "ENST"))) { - DBRefEntry dbref = DBRefUtils.parseToDbRef(sq, getDbSource(), + // TODO JAL-3077 use true accession version in dbref + DBRefEntry dbref = DBRefUtils.parseToDbRef(seq, getDbSource(), getEnsemblDataVersion(), name); - sq.addDBRef(dbref); + seq.addDBRef(dbref); } } if (alignment == null) @@ -444,6 +442,49 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** + * Parses a JSON response into a list of sequences + * + * @param br + * @return + * @see http://rest.ensembl.org/documentation/info/sequence_id + */ + protected List parseSequenceJson(BufferedReader br) + { + JSONParser jp = new JSONParser(); + List result = new ArrayList<>(); + try + { + /* + * for now, assumes only one sequence returned; refactor if needed + * in future to handle a JSONArray with more than one + */ + final JSONObject val = (JSONObject) jp.parse(br); + Object s = val.get("desc"); + String desc = s == null ? null : s.toString(); + s = val.get("id"); + String id = s == null ? null : s.toString(); + s = val.get("seq"); + String seq = s == null ? null : s.toString(); + Sequence sequence = new Sequence(id, seq); + if (desc != null) + { + sequence.setDescription(desc); + } + // todo JAL-3077 make a DBRefEntry with true accession version + // s = val.get("version"); + // String version = s == null ? "0" : s.toString(); + // DBRefEntry dbref = new DBRefEntry(getDbSource(), version, id); + // sequence.addDBRef(dbref); + result.add(sequence); + } catch (ParseException | IOException e) + { + System.err.println("Error processing JSON response: " + e.toString()); + // ignore + } + return result; + } + + /** * Returns the URL for the REST call * * @return @@ -464,7 +505,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats urlstring.append("?type=").append(getSourceEnsemblType().getType()); - urlstring.append(("&Accept=text/x-fasta")); + urlstring.append(("&Accept=application/json")); + urlstring.append(("&Content-Type=application/json")); String objectType = getObjectType(); if (objectType != null) @@ -504,18 +546,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient return false; } - @Override - protected String getRequestMimeType(boolean multipleIds) - { - return multipleIds ? "application/json" : "text/x-fasta"; - } - - @Override - protected String getResponseMimeType() - { - return "text/x-fasta"; - } - /** * * @return the configured sequence return type for this source @@ -656,6 +686,20 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient String accId); /** + * Answers a list of sequence features that mark positions of the genomic + * sequence feature which are within the sequence being retrieved. For + * example, an 'exon' feature whose parent is the target transcript marks the + * cdna positions of the transcript. For a gene sequence, this is trivially + * just the 'gene' feature with matching gene id. + * + * @param seq + * @param accId + * @return + */ + protected abstract List getIdentifyingFeatures( + SequenceI seq, String accId); + + /** * Transfers the sequence feature to the target sequence, locating its start * and end range based on the mapping. Features which do not overlap the * target sequence are ignored. diff --git a/src/jalview/ext/ensembl/EnsemblXref.java b/src/jalview/ext/ensembl/EnsemblXref.java index 27c448e..77768a6 100644 --- a/src/jalview/ext/ensembl/EnsemblXref.java +++ b/src/jalview/ext/ensembl/EnsemblXref.java @@ -88,18 +88,6 @@ class EnsemblXref extends EnsemblRestClient return true; } - @Override - protected String getRequestMimeType(boolean multipleIds) - { - return "application/json"; - } - - @Override - protected String getResponseMimeType() - { - return "application/json"; - } - /** * Calls the Ensembl xrefs REST endpoint and retrieves any cross-references * ("primary_id") for the given identifier (Ensembl accession id) and database @@ -113,8 +101,8 @@ class EnsemblXref extends EnsemblRestClient */ public List getCrossReferences(String identifier) { - List result = new ArrayList(); - List ids = new ArrayList(); + List result = new ArrayList<>(); + List ids = new ArrayList<>(); ids.add(identifier); BufferedReader br = null; @@ -163,7 +151,7 @@ class EnsemblXref extends EnsemblRestClient throws IOException { JSONParser jp = new JSONParser(); - List result = new ArrayList(); + List result = new ArrayList<>(); try { JSONArray responses = (JSONArray) jp.parse(br); diff --git a/test/jalview/ext/ensembl/EnsemblRestClientTest.java b/test/jalview/ext/ensembl/EnsemblRestClientTest.java index cc3a3db..460d16c 100644 --- a/test/jalview/ext/ensembl/EnsemblRestClientTest.java +++ b/test/jalview/ext/ensembl/EnsemblRestClientTest.java @@ -79,19 +79,6 @@ public class EnsemblRestClientTest { return false; } - - @Override - protected String getRequestMimeType(boolean b) - { - return null; - } - - @Override - protected String getResponseMimeType() - { - return null; - } - }; } diff --git a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java index 42afa82..69b2ad4 100644 --- a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java +++ b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java @@ -23,14 +23,13 @@ package jalview.ext.ensembl; import static org.testng.AssertJUnit.assertEquals; import static org.testng.AssertJUnit.assertSame; -import jalview.datamodel.Alignment; +import jalview.datamodel.AlignmentI; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.datamodel.features.SequenceFeatures; import jalview.gui.JvOptionPane; import jalview.io.DataSourceType; import jalview.io.FastaFile; -import jalview.io.FileParse; import jalview.io.gff.SequenceOntologyFactory; import jalview.io.gff.SequenceOntologyLite; @@ -125,7 +124,11 @@ public class EnsemblSeqProxyTest + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n" + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n" + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n" - + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n" + + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDG\n" + // ? insertion added in ENSP00000288602.11, not in P15056 + + "APLNQLMRCLRKYQSRTPSPLLHSVPSEIVFDFEPGPVFR\n" + // end insertion + + "GSTTGLSATPPASLPGSLTNVKALQKSP\n" + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n" + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n" + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n" @@ -153,22 +156,21 @@ public class EnsemblSeqProxyTest } @Test(dataProvider = "ens_seqs", suiteName = "live") - public void testGetOneSeqs(EnsemblRestClient proxy, String sq, + public void testGetSequenceRecords(EnsemblSeqProxy proxy, String sq, String fastasq) throws Exception { - FileParse fp = proxy.getSequenceReader(Arrays - .asList(new String[] { sq })); - SequenceI[] sqs = new FastaFile(fp).getSeqsAsArray(); FastaFile trueRes = new FastaFile(fastasq, DataSourceType.PASTE); - SequenceI[] trueSqs = trueRes.getSeqsAsArray(); - Assert.assertEquals(sqs.length, trueSqs.length, + SequenceI[] expected = trueRes.getSeqsAsArray(); + AlignmentI retrieved = proxy.getSequenceRecords(sq); + + Assert.assertEquals(retrieved.getHeight(), expected.length, "Different number of sequences retrieved for query " + sq); - Alignment ral = new Alignment(sqs); - for (SequenceI tr : trueSqs) + + for (SequenceI tr : expected) { SequenceI[] rseq; Assert.assertNotNull( - rseq = ral.findSequenceMatch(tr.getName()), + rseq = retrieved.findSequenceMatch(tr.getName()), "Couldn't find sequences matching expected sequence " + tr.getName()); Assert.assertEquals(rseq.length, 1, @@ -179,7 +181,6 @@ public class EnsemblSeqProxyTest "Sequences differ for " + tr.getName() + "\n" + "Exp:" + tr.getSequenceAsString() + "\n" + "Got:" + rseq[0].getSequenceAsString()); - } }