JAL-3076 fetch Ensembl sequence as JSON instead of Fasta
author gmungoc <g.m.carstairs@dundee.ac.uk>
Fri, 10 Aug 2018 09:59:42 +0000 (10:59 +0100)
committer gmungoc <g.m.carstairs@dundee.ac.uk>
Mon, 3 Sep 2018 14:02:28 +0000 (15:02 +0100)
Conflicts:
src/jalview/ext/ensembl/EnsemblInfo.java
src/jalview/ext/ensembl/EnsemblMap.java
src/jalview/ext/ensembl/EnsemblSeqProxy.java

src/jalview/ext/ensembl/EnsemblFeatures.java
src/jalview/ext/ensembl/EnsemblLookup.java
src/jalview/ext/ensembl/EnsemblMap.java [new file with mode: 0644]
src/jalview/ext/ensembl/EnsemblRestClient.java
src/jalview/ext/ensembl/EnsemblSeqProxy.java
src/jalview/ext/ensembl/EnsemblXref.java
test/jalview/ext/ensembl/EnsemblRestClientTest.java
test/jalview/ext/ensembl/EnsemblSeqProxyTest.java

index cb6f548..582eac6 100644 (file)
@@ -22,9 +22,11 @@ package jalview.ext.ensembl;
 
 import jalview.datamodel.Alignment;
 import jalview.datamodel.AlignmentI;
+import jalview.io.DataSourceType;
 import jalview.io.FeaturesFile;
 import jalview.io.FileParse;
 
+import java.io.BufferedReader;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
@@ -84,12 +86,13 @@ class EnsemblFeatures extends EnsemblRestClient
     // TODO: use a vararg String... for getSequenceRecords instead?
     List<String> queries = new ArrayList<>();
     queries.add(query);
-    FileParse fp = getSequenceReader(queries);
-    if (fp == null || !fp.isValid())
+    BufferedReader fp = getSequenceReader(queries);
+    if (fp == null)
     {
       return null;
     }
-    FeaturesFile fr = new FeaturesFile(fp);
+    FeaturesFile fr = new FeaturesFile(
+            new FileParse(fp, null, DataSourceType.URL));
     return new Alignment(fr.getSeqsAsArray());
   }
 
@@ -140,13 +143,13 @@ class EnsemblFeatures extends EnsemblRestClient
    * describes the required encoding of the response.
    */
   @Override
-  protected String getRequestMimeType(boolean multipleIds)
+  protected String getRequestMimeType()
   {
     return "text/x-gff3";
   }
 
   /**
-   * Returns the MIME type for GFF3.
+   * Returns the MIME type for GFF3
    */
   @Override
   protected String getResponseMimeType()
index 877331d..d7f1b07 100644 (file)
@@ -110,18 +110,6 @@ public class EnsemblLookup extends EnsemblRestClient
     return true;
   }
 
-  @Override
-  protected String getRequestMimeType(boolean multipleIds)
-  {
-    return "application/json";
-  }
-
-  @Override
-  protected String getResponseMimeType()
-  {
-    return "application/json";
-  }
-
   /**
    * Returns the gene id related to the given identifier, which may be for a
    * gene, transcript or protein
diff --git a/src/jalview/ext/ensembl/EnsemblMap.java b/src/jalview/ext/ensembl/EnsemblMap.java
new file mode 100644 (file)
index 0000000..c94528e
--- /dev/null
@@ -0,0 +1,213 @@
+package jalview.ext.ensembl;
+
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.DBRefSource;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Iterator;
+import java.util.List;
+
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.json.simple.parser.ParseException;
+
+/**
+ * Client for the Ensembl REST /map endpoints. Maps a chromosomal range from
+ * one assembly to another, and constructs URLs for the /map/cds and /map/cdna
+ * services.
+ */
+public class EnsemblMap extends EnsemblRestClient
+{
+  private static final String MAPPED = "mapped";
+
+  private static final String MAPPINGS = "mappings";
+
+  /**
+   * Default constructor (to use rest.ensembl.org)
+   */
+  public EnsemblMap()
+  {
+    super();
+  }
+
+  /**
+   * Constructor given the target domain to fetch data from
+   *
+   * @param domain
+   *          the REST service domain to send requests to
+   */
+  public EnsemblMap(String domain)
+  {
+    super(domain);
+  }
+
+  @Override
+  public String getDbName()
+  {
+    return DBRefSource.ENSEMBL;
+  }
+
+  /**
+   * Not used by this client; always returns null
+   */
+  @Override
+  public AlignmentI getSequenceRecords(String queries) throws Exception
+  {
+    return null; // not used
+  }
+
+  /**
+   * Constructs a URL of the format <code>
+   * http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37?content-type=application/json
+   * </code>
+   *
+   * @param species
+   *          species name as used in the URL path, e.g. human
+   * @param chromosome
+   *          chromosome (region) name, e.g. 17
+   * @param fromRef
+   *          assembly the given positions refer to, e.g. GRCh38
+   * @param toRef
+   *          assembly to map to, e.g. GRCh37
+   * @param startPos
+   *          range start; may be greater than endPos to denote reverse strand
+   * @param endPos
+   *          range end
+   * @return the request URL
+   * @throws MalformedURLException
+   *           if the formatted string is not a valid URL
+   */
+  protected URL getAssemblyMapUrl(String species, String chromosome, String fromRef,
+          String toRef, int startPos, int endPos)
+          throws MalformedURLException
+  {
+    /*
+     * start-end might be reverse strand - present forwards to the service
+     */
+    boolean forward = startPos <= endPos;
+    int start = forward ? startPos : endPos;
+    int end = forward ? endPos : startPos;
+    String strand = forward ? "1" : "-1";
+    String url = String.format(
+            "%s/map/%s/%s/%s:%d..%d:%s/%s?content-type=application/json",
+            getDomain(), species, fromRef, chromosome, start, end, strand,
+            toRef);
+    return new URL(url);
+  }
+
+  @Override
+  protected boolean useGetRequest()
+  {
+    return true;
+  }
+
+  /**
+   * Not used by this client; always returns null
+   */
+  @Override
+  protected URL getUrl(List<String> ids) throws MalformedURLException
+  {
+    return null; // not used
+  }
+
+  /**
+   * Calls the REST /map service to get the chromosomal coordinates (start/end)
+   * in 'toRef' that correspond to the (start/end) queryRange in 'fromRef'
+   *
+   * @param species
+   * @param chromosome
+   * @param fromRef
+   * @param toRef
+   * @param queryRange
+   *          a two element array holding the start and end positions
+   * @return the mapped range as [start, end] (with start and end swapped if
+   *         the mapped strand is not "1"), or null if the request or the
+   *         parsing of its response failed
+   * @see http://rest.ensemblgenomes.org/documentation/info/assembly_map
+   */
+  public int[] getAssemblyMapping(String species, String chromosome,
+          String fromRef, String toRef, int[] queryRange)
+  {
+    URL url = null;
+    BufferedReader br = null;
+
+    try
+    {
+      url = getAssemblyMapUrl(species, chromosome, fromRef, toRef, queryRange[0],
+              queryRange[1]);
+      br = getHttpResponse(url, null);
+      return parseAssemblyMappingResponse(br);
+    } catch (Throwable t)
+    {
+      /*
+       * best-effort lookup: any failure (including a null or too short
+       * queryRange) is reported and results in a null return value
+       */
+      System.out.println("Error calling " + url + ": " + t.getMessage());
+      return null;
+    } finally
+    {
+      if (br != null)
+      {
+        try
+        {
+          br.close();
+        } catch (IOException e)
+        {
+          // ignore - nothing useful can be done if close fails
+        }
+      }
+    }
+  }
+
+  /**
+   * Parses the JSON response from the /map/&lt;species&gt;/ REST service. The
+   * format is (with some fields omitted)
+   *
+   * <pre>
+   *  {"mappings":
+   *    [{
+   *       "original": {"end":45109016,"start":45051610},
+   *       "mapped"  : {"end":43186384,"start":43128978}
+   *  }] }
+   * </pre>
+   *
+   * Entries that have no "mapped" object, or whose "mapped" object lacks any
+   * of "start", "end" or "strand", are skipped. If more than one usable entry
+   * is present, the last one determines the result.
+   *
+   * @param br
+   *          reader positioned at the start of the JSON response
+   * @return [start, end] if the mapped strand is "1", else [end, start]; null
+   *         if the response holds no usable mapping or could not be parsed
+   */
+  protected int[] parseAssemblyMappingResponse(BufferedReader br)
+  {
+    int[] result = null;
+
+    try
+    {
+      Object parsed = new JSONParser().parse(br);
+      Object mappings = parsed instanceof JSONObject
+              ? ((JSONObject) parsed).get(MAPPINGS)
+              : null;
+      if (!(mappings instanceof JSONArray))
+      {
+        // not the expected shape, e.g. a response with no "mappings" array
+        return null;
+      }
+
+      for (Object val : (JSONArray) mappings)
+      {
+        Object mapped = val instanceof JSONObject
+                ? ((JSONObject) val).get(MAPPED)
+                : null;
+        if (!(mapped instanceof JSONObject))
+        {
+          continue;
+        }
+        JSONObject region = (JSONObject) mapped;
+        Object startVal = region.get("start");
+        Object endVal = region.get("end");
+        Object strandVal = region.get("strand");
+        if (startVal == null || endVal == null || strandVal == null)
+        {
+          continue;
+        }
+        int start = Integer.parseInt(startVal.toString());
+        int end = Integer.parseInt(endVal.toString());
+        if ("1".equals(strandVal.toString()))
+        {
+          result = new int[] { start, end };
+        }
+        else
+        {
+          result = new int[] { end, start };
+        }
+      }
+    } catch (IOException | ParseException | NumberFormatException e)
+    {
+      /*
+       * report rather than silently swallow; any mapping parsed before the
+       * failure is still returned
+       */
+      System.err.println("Error processing JSON response: " + e.toString());
+    }
+    return result;
+  }
+
+  /**
+   * Constructs a URL to the /map/cds/&lt;id&gt; or /map/cdna/&lt;id&gt; REST
+   * service. The request is addressed to the domain supplied by the caller
+   * (described in the original comment as either ensembl or ensemblgenomes,
+   * as determined from the division, e.g. Ensembl or EnsemblProtists).
+   *
+   * @param domain
+   *          the REST service domain to address
+   * @param accession
+   *          the identifier placed in the URL path
+   * @param start
+   *          range start
+   * @param end
+   *          range end
+   * @param cdsOrCdna
+   *          the path element selecting the service, i.e. cds or cdna
+   * @return the request URL
+   * @throws MalformedURLException
+   *           if the formatted string is not a valid URL
+   */
+  URL getIdMapUrl(String domain, String accession, int start, int end,
+          String cdsOrCdna) throws MalformedURLException
+  {
+    String url = String
+            .format("%s/map/%s/%s/%d..%d?include_original_region=1&content-type=application/json",
+                    domain, cdsOrCdna, accession, start, end);
+    return new URL(url);
+  }
+
+}
index b19f557..e6b1264 100644 (file)
@@ -20,8 +20,6 @@
  */
 package jalview.ext.ensembl;
 
-import jalview.io.DataSourceType;
-import jalview.io.FileParse;
 import jalview.util.StringUtils;
 
 import java.io.BufferedReader;
@@ -153,22 +151,28 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher
   protected abstract boolean useGetRequest();
 
   /**
-   * Return the desired value for the Content-Type request header
-   * 
-   * @param multipleIds
+   * Returns the desired value for the Content-Type request header. Default is
+   * application/json, override if required to vary this.
    * 
    * @return
    * @see https://github.com/Ensembl/ensembl-rest/wiki/HTTP-Headers
    */
-  protected abstract String getRequestMimeType(boolean multipleIds);
+  protected String getRequestMimeType()
+  {
+    return "application/json";
+  }
 
   /**
-   * Return the desired value for the Accept request header
+   * Return the desired value for the Accept request header. Default is
+   * application/json, override if required to vary this.
    * 
    * @return
    * @see https://github.com/Ensembl/ensembl-rest/wiki/HTTP-Headers
    */
-  protected abstract String getResponseMimeType();
+  protected String getResponseMimeType()
+  {
+    return "application/json";
+  }
 
   /**
    * Checks Ensembl's REST 'ping' endpoint, and returns true if response
@@ -222,25 +226,20 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher
   }
 
   /**
-   * returns a reader to a Fasta response from the Ensembl sequence endpoint
+   * Returns a reader to a (Json) response from the Ensembl sequence endpoint.
+   * If the request failed the return value may be null.
    * 
    * @param ids
    * @return
    * @throws IOException
    */
-  protected FileParse getSequenceReader(List<String> ids) throws IOException
+  protected BufferedReader getSequenceReader(List<String> ids)
+          throws IOException
   {
     URL url = getUrl(ids);
 
     BufferedReader reader = getHttpResponse(url, ids);
-    if (reader == null)
-    {
-      // request failed
-      return null;
-    }
-    FileParse fp = new FileParse(reader, url.toString(),
-            DataSourceType.URL);
-    return fp;
+    return reader;
   }
 
   /**
@@ -332,8 +331,7 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher
     boolean multipleIds = ids != null && ids.size() > 1;
     connection.setRequestMethod(
             multipleIds ? HttpMethod.POST : HttpMethod.GET);
-    connection.setRequestProperty("Content-Type",
-            getRequestMimeType(multipleIds));
+    connection.setRequestProperty("Content-Type", getRequestMimeType());
     connection.setRequestProperty("Accept", getResponseMimeType());
 
     connection.setUseCaches(false);
index b2ebb1a..c903de3 100644 (file)
@@ -28,12 +28,11 @@ import jalview.datamodel.AlignmentI;
 import jalview.datamodel.DBRefEntry;
 import jalview.datamodel.DBRefSource;
 import jalview.datamodel.Mapping;
+import jalview.datamodel.Sequence;
 import jalview.datamodel.SequenceFeature;
 import jalview.datamodel.SequenceI;
 import jalview.datamodel.features.SequenceFeatures;
 import jalview.exceptions.JalviewException;
-import jalview.io.FastaFile;
-import jalview.io.FileParse;
 import jalview.io.gff.SequenceOntologyFactory;
 import jalview.io.gff.SequenceOntologyI;
 import jalview.util.Comparison;
@@ -41,6 +40,7 @@ import jalview.util.DBRefUtils;
 import jalview.util.IntRangeComparator;
 import jalview.util.MapList;
 
+import java.io.BufferedReader;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
@@ -49,6 +49,10 @@ import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.json.simple.parser.ParseException;
+
 /**
  * Base class for Ensembl sequence fetchers
  * 
@@ -386,50 +390,44 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
       inProgress = false;
       throw new JalviewException("ENSEMBL Rest API not available.");
     }
-    FileParse fp = getSequenceReader(ids);
-    if (fp == null)
+    BufferedReader br = getSequenceReader(ids);
+    if (br == null)
     {
       return alignment;
     }
 
-    FastaFile fr = new FastaFile(fp);
-    if (fr.hasWarningMessage())
+    List<SequenceI> seqs = parseSequenceJson(br);
+
+    if (seqs.isEmpty())
     {
-      System.out.println(
-              String.format("Warning when retrieving %d ids %s\n%s",
-                      ids.size(), ids.toString(), fr.getWarningMessage()));
+      throw new IOException("No data returned for " + ids);
     }
-    else if (fr.getSeqs().size() != ids.size())
+
+    if (seqs.size() != ids.size())
     {
       System.out.println(String.format(
               "Only retrieved %d sequences for %d query strings",
-              fr.getSeqs().size(), ids.size()));
+              seqs.size(), ids.size()));
     }
 
-    if (fr.getSeqs().size() == 1 && fr.getSeqs().get(0).getLength() == 0)
+    if (!seqs.isEmpty())
     {
-      /*
-       * POST request has returned an empty FASTA file e.g. for invalid id
-       */
-      throw new IOException("No data returned for " + ids);
-    }
-
-    if (fr.getSeqs().size() > 0)
-    {
-      AlignmentI seqal = new Alignment(fr.getSeqsAsArray());
-      for (SequenceI sq : seqal.getSequences())
+      AlignmentI seqal = new Alignment(
+              seqs.toArray(new SequenceI[seqs.size()]));
+      for (SequenceI seq : seqs)
       {
-        if (sq.getDescription() == null)
+        if (seq.getDescription() == null)
         {
-          sq.setDescription(getDbName());
+          seq.setDescription(getDbName());
         }
-        String name = sq.getName();
+        String name = seq.getName();
         if (ids.contains(name)
                 || ids.contains(name.replace("ENSP", "ENST")))
         {
-          DBRefEntry dbref = DBRefUtils.parseToDbRef(sq, getDbSource(),
+          // TODO JAL-3077 use true accession version in dbref
+          DBRefEntry dbref = DBRefUtils.parseToDbRef(seq, getDbSource(),
                   getEnsemblDataVersion(), name);
-          sq.addDBRef(dbref);
+          seq.addDBRef(dbref);
         }
       }
       if (alignment == null)
@@ -445,6 +443,49 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
   }
 
   /**
+   * Parses a JSON response into a list of sequences
+   * 
+   * @param br
+   * @return
+   * @see http://rest.ensembl.org/documentation/info/sequence_id
+   */
+  protected List<SequenceI> parseSequenceJson(BufferedReader br)
+  {
+    JSONParser jp = new JSONParser();
+    List<SequenceI> result = new ArrayList<>();
+    try
+    {
+      /*
+       * for now, assumes only one sequence returned; refactor if needed
+       * in future to handle a JSONArray with more than one
+       */
+      final JSONObject val = (JSONObject) jp.parse(br);
+      Object s = val.get("desc");
+      String desc = s == null ? null : s.toString();
+      s = val.get("id");
+      String id = s == null ? null : s.toString();
+      s = val.get("seq");
+      String seq = s == null ? null : s.toString();
+      Sequence sequence = new Sequence(id, seq);
+      if (desc != null)
+      {
+        sequence.setDescription(desc);
+      }
+      // todo JAL-3077 make a DBRefEntry with true accession version
+      // s = val.get("version");
+      // String version = s == null ? "0" : s.toString();
+      // DBRefEntry dbref = new DBRefEntry(getDbSource(), version, id);
+      // sequence.addDBRef(dbref);
+      result.add(sequence);
+    } catch (ParseException | IOException e)
+    {
+      System.err.println("Error processing JSON response: " + e.toString());
+      // ignore
+    }
+    return result;
+  }
+
+  /**
    * Returns the URL for the REST call
    * 
    * @return
@@ -465,7 +506,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
     }
     // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats
     urlstring.append("?type=").append(getSourceEnsemblType().getType());
-    urlstring.append(("&Accept=text/x-fasta"));
+    urlstring.append(("&Accept=application/json"));
+    urlstring.append(("&Content-Type=application/json"));
 
     String objectType = getObjectType();
     if (objectType != null)
@@ -505,18 +547,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
     return false;
   }
 
-  @Override
-  protected String getRequestMimeType(boolean multipleIds)
-  {
-    return multipleIds ? "application/json" : "text/x-fasta";
-  }
-
-  @Override
-  protected String getResponseMimeType()
-  {
-    return "text/x-fasta";
-  }
-
   /**
    * 
    * @return the configured sequence return type for this source
index 27c448e..77768a6 100644 (file)
@@ -88,18 +88,6 @@ class EnsemblXref extends EnsemblRestClient
     return true;
   }
 
-  @Override
-  protected String getRequestMimeType(boolean multipleIds)
-  {
-    return "application/json";
-  }
-
-  @Override
-  protected String getResponseMimeType()
-  {
-    return "application/json";
-  }
-
   /**
    * Calls the Ensembl xrefs REST endpoint and retrieves any cross-references
    * ("primary_id") for the given identifier (Ensembl accession id) and database
@@ -113,8 +101,8 @@ class EnsemblXref extends EnsemblRestClient
    */
   public List<DBRefEntry> getCrossReferences(String identifier)
   {
-    List<DBRefEntry> result = new ArrayList<DBRefEntry>();
-    List<String> ids = new ArrayList<String>();
+    List<DBRefEntry> result = new ArrayList<>();
+    List<String> ids = new ArrayList<>();
     ids.add(identifier);
 
     BufferedReader br = null;
@@ -163,7 +151,7 @@ class EnsemblXref extends EnsemblRestClient
           throws IOException
   {
     JSONParser jp = new JSONParser();
-    List<DBRefEntry> result = new ArrayList<DBRefEntry>();
+    List<DBRefEntry> result = new ArrayList<>();
     try
     {
       JSONArray responses = (JSONArray) jp.parse(br);
index cc3a3db..460d16c 100644 (file)
@@ -79,19 +79,6 @@ public class EnsemblRestClientTest
       {
         return false;
       }
-
-      @Override
-      protected String getRequestMimeType(boolean b)
-      {
-        return null;
-      }
-
-      @Override
-      protected String getResponseMimeType()
-      {
-        return null;
-      }
-
     };
   }
 
index e2af26b..72f5a34 100644 (file)
@@ -25,14 +25,13 @@ import static org.testng.AssertJUnit.assertFalse;
 import static org.testng.AssertJUnit.assertSame;
 import static org.testng.AssertJUnit.assertTrue;
 
-import jalview.datamodel.Alignment;
+import jalview.datamodel.AlignmentI;
 import jalview.datamodel.SequenceFeature;
 import jalview.datamodel.SequenceI;
 import jalview.datamodel.features.SequenceFeatures;
 import jalview.gui.JvOptionPane;
 import jalview.io.DataSourceType;
 import jalview.io.FastaFile;
-import jalview.io.FileParse;
 import jalview.io.gff.SequenceOntologyFactory;
 import jalview.io.gff.SequenceOntologyLite;
 
@@ -127,7 +126,11 @@ public class EnsemblSeqProxyTest
                   + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
                   + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
                   + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
-                  + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
+                  + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDG\n"
+                  // ? insertion added in ENSP00000288602.11, not in P15056
+                  + "APLNQLMRCLRKYQSRTPSPLLHSVPSEIVFDFEPGPVFR\n"
+                  // end insertion
+                  + "GSTTGLSATPPASLPGSLTNVKALQKSP\n"
                   + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
                   + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
                   + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
@@ -155,22 +158,21 @@ public class EnsemblSeqProxyTest
   }
 
   @Test(dataProvider = "ens_seqs", suiteName = "live")
-  public void testGetOneSeqs(EnsemblRestClient proxy, String sq,
+  public void testGetSequenceRecords(EnsemblSeqProxy proxy, String sq,
           String fastasq) throws Exception
   {
-    FileParse fp = proxy.getSequenceReader(Arrays
-            .asList(new String[] { sq }));
-    SequenceI[] sqs = new FastaFile(fp).getSeqsAsArray();
     FastaFile trueRes = new FastaFile(fastasq, DataSourceType.PASTE);
-    SequenceI[] trueSqs = trueRes.getSeqsAsArray();
-    Assert.assertEquals(sqs.length, trueSqs.length,
+    SequenceI[] expected = trueRes.getSeqsAsArray();
+    AlignmentI retrieved = proxy.getSequenceRecords(sq);
+
+    Assert.assertEquals(retrieved.getHeight(), expected.length,
             "Different number of sequences retrieved for query " + sq);
-    Alignment ral = new Alignment(sqs);
-    for (SequenceI tr : trueSqs)
+
+    for (SequenceI tr : expected)
     {
       SequenceI[] rseq;
       Assert.assertNotNull(
-              rseq = ral.findSequenceMatch(tr.getName()),
+              rseq = retrieved.findSequenceMatch(tr.getName()),
               "Couldn't find sequences matching expected sequence "
                       + tr.getName());
       Assert.assertEquals(rseq.length, 1,
@@ -181,7 +183,6 @@ public class EnsemblSeqProxyTest
               "Sequences differ for " + tr.getName() + "\n" + "Exp:"
                       + tr.getSequenceAsString() + "\n" + "Got:"
                       + rseq[0].getSequenceAsString());
-
     }
   }