Merge remote-tracking branch 'origin/features/JAL-1705_ensembl' into
author gmungoc <g.m.carstairs@dundee.ac.uk>
Tue, 5 Jan 2016 13:09:44 +0000 (13:09 +0000)
committer gmungoc <g.m.carstairs@dundee.ac.uk>
Tue, 5 Jan 2016 13:09:44 +0000 (13:09 +0000)
features/JAL-653_JAL-1766_htslib_refseqsupport

15 files changed:
src/jalview/ext/ensembl/EnsemblCdna.java [new file with mode: 0644]
src/jalview/ext/ensembl/EnsemblCds.java [new file with mode: 0644]
src/jalview/ext/ensembl/EnsemblGenome.java [new file with mode: 0644]
src/jalview/ext/ensembl/EnsemblProtein.java [new file with mode: 0644]
src/jalview/ext/ensembl/EnsemblSeqProxy.java [new file with mode: 0644]
src/jalview/ext/ensembl/EnsemblTranscript.java [new file with mode: 0644]
src/jalview/ext/ensembl/SeqFetcher.java [new file with mode: 0644]
src/jalview/gui/SequenceFetcher.java
src/jalview/io/AlignFile.java
src/jalview/io/FileParse.java
src/jalview/util/DBRefUtils.java
src/jalview/ws/SequenceFetcher.java
src/jalview/ws/seqfetcher/DbSourceProxy.java
test/jalview/ext/ensembl/ENSG00000157764.gff [new file with mode: 0644]
test/jalview/ext/ensembl/EnsemblSeqProxyTest.java [new file with mode: 0644]

diff --git a/src/jalview/ext/ensembl/EnsemblCdna.java b/src/jalview/ext/ensembl/EnsemblCdna.java
new file mode 100644 (file)
index 0000000..757b3c8
--- /dev/null
@@ -0,0 +1,39 @@
+package jalview.ext.ensembl;
+
+import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
+
+import com.stevesoft.pat.Regex;
+
+/**
+ * Ensembl sequence source that requests cDNA sequences.
+ */
+public class EnsemblCdna extends EnsemblSeqProxy
+{
+  private static final String DB_NAME = "ENSEMBL (CDNA)";
+
+  /**
+   * accepts ENST, ENSG or CCDS prefixes followed by at least three digits or
+   * dots
+   */
+  private static final String ACCESSION_PATTERN = "((ENST|ENSG|CCDS)[0-9.]{3,})";
+
+  private static final String TEST_QUERY = "ENST00000288602";
+
+  public EnsemblCdna() throws Exception
+  {
+    super();
+  }
+
+  /**
+   * @return the sequence type requested by this source (cDNA)
+   */
+  @Override
+  protected EnsemblSeqType getSourceEnsemblType()
+  {
+    return EnsemblSeqType.CDNA;
+  }
+
+  /**
+   * @return the name of this source
+   */
+  @Override
+  public String getDbName()
+  {
+    return DB_NAME;
+  }
+
+  /**
+   * @return a new Regex for the accession pattern, built on each call
+   */
+  @Override
+  public Regex getAccessionValidator()
+  {
+    return new Regex(ACCESSION_PATTERN);
+  }
+
+  /**
+   * @return an example accession for this source
+   */
+  @Override
+  public String getTestQuery()
+  {
+    return TEST_QUERY;
+  }
+
+}
diff --git a/src/jalview/ext/ensembl/EnsemblCds.java b/src/jalview/ext/ensembl/EnsemblCds.java
new file mode 100644 (file)
index 0000000..1f63e05
--- /dev/null
@@ -0,0 +1,25 @@
+package jalview.ext.ensembl;
+
+import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
+
+/**
+ * Ensembl sequence source that requests CDS sequences.
+ */
+public class EnsemblCds extends EnsemblSeqProxy
+{
+  private static final String DB_NAME = "ENSEMBL (CDS)";
+
+  public EnsemblCds() throws Exception
+  {
+    super();
+  }
+
+  /**
+   * @return the sequence type requested by this source (CDS)
+   */
+  @Override
+  protected EnsemblSeqType getSourceEnsemblType()
+  {
+    return EnsemblSeqType.CDS;
+  }
+
+  /**
+   * @return the name of this source
+   */
+  @Override
+  public String getDbName()
+  {
+    return DB_NAME;
+  }
+
+}
diff --git a/src/jalview/ext/ensembl/EnsemblGenome.java b/src/jalview/ext/ensembl/EnsemblGenome.java
new file mode 100644 (file)
index 0000000..37e8e2b
--- /dev/null
@@ -0,0 +1,25 @@
+package jalview.ext.ensembl;
+
+import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
+
+/**
+ * Ensembl sequence source that requests genomic sequences.
+ */
+public class EnsemblGenome extends EnsemblSeqProxy
+{
+  private static final String DB_NAME = "ENSEMBL (Genome)";
+
+  public EnsemblGenome() throws Exception
+  {
+    super();
+  }
+
+  /**
+   * @return the sequence type requested by this source (genomic)
+   */
+  @Override
+  protected EnsemblSeqType getSourceEnsemblType()
+  {
+    return EnsemblSeqType.GENOMIC;
+  }
+
+  /**
+   * @return the name of this source
+   */
+  @Override
+  public String getDbName()
+  {
+    return DB_NAME;
+  }
+
+}
diff --git a/src/jalview/ext/ensembl/EnsemblProtein.java b/src/jalview/ext/ensembl/EnsemblProtein.java
new file mode 100644 (file)
index 0000000..db8d9d5
--- /dev/null
@@ -0,0 +1,25 @@
+package jalview.ext.ensembl;
+
+import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
+
+/**
+ * Ensembl sequence source that requests protein sequences.
+ */
+public class EnsemblProtein extends EnsemblSeqProxy
+{
+  private static final String DB_NAME = "ENSEMBL (Protein)";
+
+  public EnsemblProtein() throws Exception
+  {
+    super();
+  }
+
+  /**
+   * @return the sequence type requested by this source (protein)
+   */
+  @Override
+  protected EnsemblSeqType getSourceEnsemblType()
+  {
+    return EnsemblSeqType.PROTEIN;
+  }
+
+  /**
+   * @return the name of this source
+   */
+  @Override
+  public String getDbName()
+  {
+    return DB_NAME;
+  }
+
+}
diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java
new file mode 100644 (file)
index 0000000..137c9b0
--- /dev/null
@@ -0,0 +1,178 @@
+package jalview.ext.ensembl;
+
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.DBRefSource;
+import jalview.datamodel.SequenceI;
+import jalview.exceptions.JalviewException;
+import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
+import jalview.io.FastaFile;
+import jalview.io.FileParse;
+import jalview.util.DBRefUtils;
+import jalview.ws.seqfetcher.DbSourceProxy;
+import jalview.ws.seqfetcher.DbSourceProxyImpl;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import com.stevesoft.pat.Regex;
+
+/**
+ * Base class for sequence sources backed by the Ensembl REST service.
+ * Sequences are fetched as FASTA through {@link SeqFetcher}; each concrete
+ * subclass chooses the sequence type requested via
+ * {@link #getSourceEnsemblType()}.
+ */
+public abstract class EnsemblSeqProxy extends DbSourceProxyImpl implements
+        DbSourceProxy
+{
+  /**
+   * maximum number of ids sent to Ensembl in one request
+   */
+  private static final int CHUNK_SIZE = 50;
+
+  SeqFetcher sf;
+
+  /**
+   * true while getSequenceRecords is running
+   */
+  private volatile boolean inProgress = false;
+
+  public EnsemblSeqProxy() throws Exception
+  {
+    sf = new SeqFetcher();
+    addDbSourceProperty(DBRefSource.MULTIACC);
+    addDbSourceProperty(DBRefSource.SEQDB);
+    // decide whether these need to be filtered according to return type
+    addDbSourceProperty(DBRefSource.PROTSEQDB);
+    addDbSourceProperty(DBRefSource.DNACODINGSEQDB);
+    addDbSourceProperty(DBRefSource.DNASEQDB);
+  }
+
+  @Override
+  public String getDbSource()
+  {
+    return "ENSEMBL";
+  }
+
+  @Override
+  public String getDbVersion()
+  {
+    return "0"; // sf.getVersion();
+  }
+
+  @Override
+  public String getAccessionSeparator()
+  {
+    return " ";
+  }
+
+  /**
+   * @return a new Regex accepting ENSP, ENST, ENSG or CCDS prefixes followed
+   *         by at least three digits or dots
+   */
+  @Override
+  public Regex getAccessionValidator()
+  {
+    return new Regex("((ENSP|ENST|ENSG|CCDS)[0-9.]{3,})");
+  }
+
+  @Override
+  public String getTestQuery()
+  {
+    return "ENSP00000288602";
+  }
+
+  /**
+   * @return true if the accession validator finds a match in accession
+   */
+  @Override
+  public boolean isValidReference(String accession)
+  {
+    return getAccessionValidator().search(accession);
+  }
+
+  /**
+   * Retrieve sequences for a space separated list of ids. Ids are sent to
+   * Ensembl in chunks of at most CHUNK_SIZE. If a chunk fails after earlier
+   * chunks returned sequences, retrieval stops and the sequences fetched so
+   * far are returned; if nothing has been retrieved yet the failure is
+   * rethrown.
+   * 
+   * @param queries
+   *          one or more ids separated by spaces
+   * @return alignment of the retrieved sequences, or null if none were
+   *         retrieved
+   * @throws JalviewException
+   *           if a request fails before any sequence has been retrieved
+   */
+  @Override
+  public AlignmentI getSequenceRecords(String queries) throws Exception
+  {
+    inProgress = true;
+    try
+    {
+      // trim first so a leading separator does not yield an empty id
+      List<String> allIds = Arrays.asList(queries.trim().split(" +"));
+      AlignmentI rtn = null;
+      for (int v = 0, vSize = allIds.size(); v < vSize; v += CHUNK_SIZE)
+      {
+        List<String> ids = allIds.subList(v,
+                Math.min(v + CHUNK_SIZE, vSize));
+        try
+        {
+          AlignmentI seqal = fetchChunk(ids);
+          if (seqal != null)
+          {
+            if (rtn == null)
+            {
+              rtn = seqal;
+            }
+            else
+            {
+              rtn.append(seqal);
+            }
+          }
+        } catch (Throwable r)
+        {
+          // v counts ids; report the number of chunks completed before this
+          // one
+          String aborted = "Aborting ID retrieval after " + (v / CHUNK_SIZE)
+                  + " chunks.";
+          if (rtn == null)
+          {
+            throw new JalviewException(aborted + " Unexpected problem ("
+                    + r.getLocalizedMessage() + ")", r);
+          }
+          System.err.println(aborted);
+          r.printStackTrace();
+          // actually abort: return what was retrieved so far
+          break;
+        }
+      }
+      return rtn;
+    } finally
+    {
+      inProgress = false;
+    }
+  }
+
+  /**
+   * Fetch one chunk of ids and add a database reference to each returned
+   * sequence whose name is one of the requested ids.
+   * 
+   * @param ids
+   *          the ids to request
+   * @return alignment of the sequences returned, or null if there were none
+   * @throws Exception
+   *           if Ensembl is not available or the request or parse fails
+   */
+  private AlignmentI fetchChunk(List<String> ids) throws Exception
+  {
+    if (!sf.isEnsemblAvailable())
+    {
+      throw new JalviewException("ENSEMBL Rest API not available.");
+    }
+    FileParse fp = new FileParse(sf.getSequenceReader(
+            getSourceEnsemblType(), ids));
+    FastaFile fr = new FastaFile(fp);
+    if (fr.hasWarningMessage())
+    {
+      System.out.println("Warning when retrieving " + ids.size() + " ids"
+              + ids.toString() + "\n" + fr.getWarningMessage());
+    }
+    else if (fr.getSeqs().size() != ids.size())
+    {
+      System.out.println("Only retrieved " + fr.getSeqs().size()
+              + " sequences for " + ids.size() + " query strings.");
+    }
+    if (fr.getSeqs().size() == 0)
+    {
+      return null;
+    }
+    AlignmentI seqal = new jalview.datamodel.Alignment(fr.getSeqsAsArray());
+    for (SequenceI sq : seqal.getSequences())
+    {
+      if (ids.contains(sq.getName()))
+      {
+        DBRefUtils.parseToDbRef(sq, getDbSource(), getDbVersion(),
+                sq.getName());
+      }
+    }
+    return seqal;
+  }
+
+  /**
+   * 
+   * @return the configured sequence return type for this source
+   */
+  protected abstract EnsemblSeqType getSourceEnsemblType();
+
+  /**
+   * @return true while a call to getSequenceRecords is running
+   */
+  @Override
+  public boolean queryInProgress()
+  {
+    return inProgress;
+  }
+
+  /**
+   * @return always null
+   */
+  @Override
+  public StringBuffer getRawRecords()
+  {
+    return null;
+  }
+
+  @Override
+  public int getTier()
+  {
+    return 0;
+  }
+}
diff --git a/src/jalview/ext/ensembl/EnsemblTranscript.java b/src/jalview/ext/ensembl/EnsemblTranscript.java
new file mode 100644 (file)
index 0000000..68ed310
--- /dev/null
@@ -0,0 +1,25 @@
+package jalview.ext.ensembl;
+
+import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
+
+/**
+ * Ensembl sequence source for transcripts. Requests
+ * {@link EnsemblSeqType#TRANSCRIPT} and reports its own source name, so that
+ * it is distinct from EnsemblProtein.
+ */
+public class EnsemblTranscript extends EnsemblSeqProxy
+{
+
+  public EnsemblTranscript() throws Exception
+  {
+    super();
+  }
+
+  /**
+   * @return the name of this source
+   */
+  @Override
+  public String getDbName()
+  {
+    return "ENSEMBL (Transcript)";
+  }
+
+  /**
+   * @return the sequence type requested by this source (transcript)
+   */
+  @Override
+  protected EnsemblSeqType getSourceEnsemblType()
+  {
+    return EnsemblSeqType.TRANSCRIPT;
+  }
+
+}
diff --git a/src/jalview/ext/ensembl/SeqFetcher.java b/src/jalview/ext/ensembl/SeqFetcher.java
new file mode 100644 (file)
index 0000000..7c913bf
--- /dev/null
@@ -0,0 +1,222 @@
+package jalview.ext.ensembl;
+
+import jalview.io.FileParse;
+
+import java.io.BufferedReader;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.net.URLConnection;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.http.NameValuePair;
+import org.apache.http.message.BasicNameValuePair;
+
+/**
+ * Client for the sequence endpoint of the Ensembl REST service, with a cached
+ * check of whether the service responds. The availability flag and the time
+ * of the last check are static, so they are shared by all instances.
+ */
+public class SeqFetcher
+{
+  private final static String ENSEMBL_REST = "rest.ensembl.org";
+
+  /**
+   * interval (ms) after which a positive availability result is re-checked
+   */
+  private static final long RECHECK_INTERVAL_MS = 10000;
+
+  private static boolean ensemblRestavailable = false;
+
+  private static long lastCheck = -1;
+
+  private boolean isTesting, testEnsemblStatus;
+
+  public SeqFetcher()
+  {
+  }
+
+  /**
+   * Report whether the Ensembl REST service is available. The check is
+   * repeated on every call when in testing mode or when the last check found
+   * the service unavailable; otherwise the cached result is reused until it
+   * is more than RECHECK_INTERVAL_MS old.
+   * 
+   * @return true if the last check found the service available
+   */
+  public boolean isEnsemblAvailable()
+  {
+    if (isTesting || !ensemblRestavailable
+            || System.currentTimeMillis() - lastCheck > RECHECK_INTERVAL_MS)
+    {
+      checkEnsembl();
+      lastCheck = System.currentTimeMillis();
+    }
+    return ensemblRestavailable;
+  }
+
+  /**
+   * @return the isTesting
+   */
+  public boolean isTesting()
+  {
+    return isTesting;
+  }
+
+  /**
+   * @param isTesting
+   *          the isTesting to set
+   */
+  public void setTesting(boolean isTesting)
+  {
+    this.isTesting = isTesting;
+  }
+
+  /**
+   * @return the testEnsemblStatus
+   */
+  public boolean isTestEnsemblStatus()
+  {
+    return testEnsemblStatus;
+  }
+
+  /**
+   * @param testEnsemblStatus
+   *          the testEnsemblStatus to set
+   */
+  public void setTestEnsemblStatus(boolean testEnsemblStatus)
+  {
+    this.testEnsemblStatus = testEnsemblStatus;
+  }
+
+  /**
+   * Update the availability flag. In testing mode the flag is set to the
+   * configured test status and no request is made; otherwise the service's
+   * ping URL is requested and the flag is true only for a 2xx response.
+   */
+  private void checkEnsembl()
+  {
+    if (isTesting)
+    {
+      ensemblRestavailable = testEnsemblStatus;
+      return;
+    }
+    boolean available = false;
+    HttpURLConnection conn = null;
+    try
+    {
+      URL ping = new URL("http://" + ENSEMBL_REST + "/info/ping");
+      conn = (HttpURLConnection) (ping.openConnection());
+      int status = conn.getResponseCode();
+      available = status >= 200 && status < 300;
+    } catch (Throwable t)
+    {
+      // any Error or Exception means the service is treated as unavailable
+      t.printStackTrace();
+    } finally
+    {
+      if (conn != null)
+      {
+        // the response body is never read, so release the connection
+        conn.disconnect();
+      }
+    }
+    ensemblRestavailable = available;
+  }
+
+  /**
+   * Sequence types that can be requested, each holding the value sent as the
+   * 'type' argument of a request.
+   */
+  public enum EnsemblSeqType
+  {
+    GENOMIC("genomic"), CDS("cds"), TRANSCRIPT("cds"), PROTEIN("protein"), CDNA(
+            "cdna");
+
+    private final String type;
+
+    EnsemblSeqType(String t)
+    {
+      type = t;
+    }
+
+    public String getType()
+    {
+      return type;
+    }
+  }
+
+  /**
+   * resolve request type as an argument for sequence and features queries
+   * 
+   * @param type
+   * @return a single-element list holding the 'type' name/value pair
+   */
+  public List<NameValuePair> getObjectTypeArg(EnsemblSeqType type)
+  {
+    NameValuePair nameValue = new BasicNameValuePair("type", type.getType());
+    return Collections.singletonList(nameValue);
+  }
+
+  /**
+   * return a reader to a Fasta response from the Ensembl sequence endpoint
+   * 
+   * @param returnType
+   *          the sequence type to request
+   * @param ids
+   *          the ids to request; each is trimmed before being sent
+   * @return a FileParse wrapping the response, with the request URL as its
+   *         origin and "HTTP_POST" as its type
+   * @throws IOException
+   *           if the request cannot be sent or the response cannot be opened
+   * @throws RuntimeException
+   *           if the response code is not 200
+   */
+  public FileParse getSequenceReader(EnsemblSeqType returnType,
+          List<String> ids) throws IOException
+  {
+
+    // adapted From the rest.ensembl.org documentation for sequence_id
+
+    StringBuilder urls = new StringBuilder("http://" + ENSEMBL_REST
+            + "/sequence/id");
+    char separator = '?';
+    for (NameValuePair nvp : getObjectTypeArg(returnType))
+    {
+      urls.append(separator).append(nvp.getName()).append('=')
+              .append(nvp.getValue());
+      separator = '&';
+    }
+    URL url = new URL(urls.toString());
+
+    byte[] thepostbody = buildPostBody(ids);
+
+    HttpURLConnection httpConnection = (HttpURLConnection) url
+            .openConnection();
+    httpConnection.setRequestMethod("POST");
+    httpConnection.setRequestProperty("Content-Type", "application/json");
+    httpConnection.setRequestProperty("Accept", "text/x-fasta");
+    httpConnection.setRequestProperty("Content-Length",
+            Integer.toString(thepostbody.length));
+    httpConnection.setUseCaches(false);
+    httpConnection.setDoInput(true);
+    httpConnection.setDoOutput(true);
+
+    DataOutputStream wr = new DataOutputStream(
+            httpConnection.getOutputStream());
+    try
+    {
+      wr.write(thepostbody);
+      wr.flush();
+    } finally
+    {
+      wr.close();
+    }
+
+    // check the status before opening the input stream: getInputStream()
+    // itself throws for error responses, which would bypass this check
+    int responseCode = httpConnection.getResponseCode();
+    if (responseCode != 200)
+    {
+      httpConnection.disconnect();
+      throw new RuntimeException(
+              "Response code was not 200. Detected response was "
+                      + responseCode);
+    }
+
+    InputStream response = httpConnection.getInputStream();
+    BufferedReader reader = new BufferedReader(new InputStreamReader(
+            response, "UTF-8"));
+    return new FileParse(reader, url.toString(), "HTTP_POST");
+  }
+
+  /**
+   * Build the JSON request body {"ids":["id1","id2",...]} as UTF-8 bytes.
+   * 
+   * @param ids
+   *          the ids to include; each is trimmed
+   * @return the encoded body
+   * @throws IOException
+   *           if UTF-8 encoding is not supported
+   */
+  private static byte[] buildPostBody(List<String> ids) throws IOException
+  {
+    StringBuilder postBody = new StringBuilder("{\"ids\":[");
+    boolean first = true;
+    for (String id : ids)
+    {
+      if (first)
+      {
+        first = false;
+      }
+      else
+      {
+        postBody.append(",");
+      }
+      postBody.append("\"");
+      postBody.append(id.trim());
+      postBody.append("\"");
+    }
+    postBody.append("]}");
+    return postBody.toString().getBytes("UTF-8");
+  }
+
+}
index 207d2bc..09d33c8 100755 (executable)
@@ -54,8 +54,6 @@ import javax.swing.JTextArea;
 import javax.swing.SwingConstants;
 import javax.swing.tree.DefaultMutableTreeNode;
 
-import com.stevesoft.pat.Regex;
-
 public class SequenceFetcher extends JPanel implements Runnable
 {
   JLabel dbeg = new JLabel();
@@ -518,6 +516,7 @@ public class SequenceFetcher extends JPanel implements Runnable
       resetDialog();
       return;
     }
+    // TODO: Refactor to GUI independent code and write tests.
     // indicate if successive sources should be merged into one alignment.
     boolean addToLast = false;
     ArrayList<String> aresultq = new ArrayList<String>(), presultTitle = new ArrayList<String>();
@@ -639,23 +638,9 @@ public class SequenceFetcher extends JPanel implements Runnable
                 DBRefEntry dbr = new DBRefEntry(), found[] = null;
                 dbr.setSource(proxy.getDbSource());
                 dbr.setVersion(null);
-                if (proxy.getAccessionValidator() != null)
-                {
-                  Regex vgr = proxy.getAccessionValidator();
-                  vgr.search(q);
-                  if (vgr.numSubs() > 0)
-                  {
-                    dbr.setAccessionId(vgr.stringMatched(1));
-                  }
-                  else
-                  {
-                    dbr.setAccessionId(vgr.stringMatched());
-                  }
-                }
-                else
-                {
-                  dbr.setAccessionId(q);
-                }
+                String accId = DBRefUtils.processQueryToAccessionFor(proxy,
+                        q);
+                dbr.setAccessionId(accId);
                 boolean rfound = false;
                 for (int r = 0; r < rs.length; r++)
                 {
index 7e0cabd..eb43701 100755 (executable)
@@ -47,7 +47,8 @@ public abstract class AlignFile extends FileParse
   int maxLength = 0;
 
   /**
-   * Sequences to be added to form a new alignment.
+   * Sequences to be added to form a new alignment. TODO: remove vector in this
+   * class
    */
   protected Vector<SequenceI> seqs;
 
index 8711354..61f5127 100755 (executable)
@@ -274,6 +274,30 @@ public class FileParse
   }
 
   /**
+   * not for general use, creates a fileParse object for an existing reader with
+   * configurable values for the origin and the type of the source
+   */
+  public FileParse(BufferedReader source, String originString,
+          String typeString)
+  {
+    // take the source type and name from the caller; inFile is left null
+    type = typeString;
+    error = false;
+    inFile = null;
+    dataName = originString;
+    dataIn = source;
+    try
+    {
+      // mark the current position of the reader when marking is supported
+      if (dataIn.markSupported())
+      {
+        dataIn.mark(READAHEAD_LIMIT);
+      }
+    } catch (IOException q)
+    {
+      // NOTE(review): a failure to mark is silently ignored here - confirm
+      // that nothing downstream relies on the mark having been set
+    }
+  }
+
+  /**
    * Create a datasource for input to Jalview. See AppletFormatAdapter for the
    * types of sources that are handled.
    * 
index 518c310..b8f1dd5 100755 (executable)
@@ -24,6 +24,7 @@ import jalview.datamodel.DBRefEntry;
 import jalview.datamodel.DBRefSource;
 import jalview.datamodel.PDBEntry;
 import jalview.datamodel.SequenceI;
+import jalview.ws.seqfetcher.DbSourceProxy;
 
 import java.util.ArrayList;
 import java.util.HashMap;
@@ -472,4 +473,36 @@ public class DBRefUtils
     return ref;
   }
 
+  /**
+   * Extract valid accession strings from a query string. Used by the
+   * SequenceFetcher and DBRefFetcher to create valid accession strings from an
+   * ID string for database sources with a Regex validation field.
+   * 
+   * @param proxy
+   *          the source whose accession validator, if any, is applied
+   * @param q
+   *          the query string
+   * @return q if proxy.getAccessionValidator()==null, otherwise the matched
+   *         region or the first subgroup match from the matched region
+   */
+  public static String processQueryToAccessionFor(DbSourceProxy proxy,
+          String q)
+  {
+    // fetch the validator once: implementations may build a new Regex on
+    // every call
+    Regex vgr = proxy.getAccessionValidator();
+    if (vgr == null)
+    {
+      return q;
+    }
+    // NOTE(review): the result of search() is not checked, so what is
+    // returned for a non-matching query depends on Regex.stringMatched -
+    // confirm callers handle that case
+    vgr.search(q);
+    return vgr.numSubs() > 0 ? vgr.stringMatched(1) : vgr.stringMatched();
+  }
+
 }
index 6a612a0..fcc4457 100644 (file)
@@ -55,6 +55,12 @@ public class SequenceFetcher extends ASequenceFetcher
 
   public SequenceFetcher(boolean addDas)
   {
+    addDBRefSourceImpl(jalview.ext.ensembl.EnsemblProtein.class);
+    addDBRefSourceImpl(jalview.ext.ensembl.EnsemblTranscript.class);
+    addDBRefSourceImpl(jalview.ext.ensembl.EnsemblCds.class);
+    addDBRefSourceImpl(jalview.ext.ensembl.EnsemblGenome.class);
+    addDBRefSourceImpl(jalview.ext.ensembl.EnsemblCdna.class);
+
     addDBRefSourceImpl(jalview.ws.dbsources.EmblSource.class);
     addDBRefSourceImpl(jalview.ws.dbsources.EmblCdsSouce.class);
     addDBRefSourceImpl(jalview.ws.dbsources.Uniprot.class);
index 556df1f..33f62b6 100644 (file)
@@ -94,7 +94,9 @@ public interface DbSourceProxy
   public String getTestQuery();
 
   /**
-   * optionally implemented
+   * Required for sources supporting multiple query retrieval for use with the
+   * DBRefFetcher, which attempts to limit its queries with putative accession
+   * strings for a source to only those that are likely to be valid.
    * 
    * @param accession
    * @return
@@ -107,7 +109,9 @@ public interface DbSourceProxy
    * 
    * @param queries
    *          - one or more queries for database in expected form
-   * @return null if queries were successful but result was not parsable
+   * @return null if queries were successful but result was not parsable.
+   *         Otherwise, an AlignmentI object containing properly annotated data
+   *         (e.g. sequences with accessions for this datasource)
    * @throws Exception
    *           - propagated from underlying transport to database (note -
    *           exceptions are not raised if query not found in database)
diff --git a/test/jalview/ext/ensembl/ENSG00000157764.gff b/test/jalview/ext/ensembl/ENSG00000157764.gff
new file mode 100644 (file)
index 0000000..21cef29
--- /dev/null
@@ -0,0 +1,56 @@
+##gff-version 3
+# retrieved via http://rest.ensembl.org/overlap/id/ENSG00000157764?feature=gene&feature=transcript&feature=cds&content-type=text/x-gff3
+##sequence-region   7 140719327 140924764
+7      ensembl_havana  gene    140719327       140722790       .       +       .       ID=gene:ENSG00000090266;assembly_name=GRCh38;biotype=protein_coding;description=NADH dehydrogenase (ubiquinone) 1 beta subcomplex%2C 2%2C 8kDa [Source:HGNC Symbol%3BAcc:HGNC:7697];external_name=NDUFB2;logic_name=ensembl_havana_gene;version=11
+7      ensembl_havana  gene    140719327       140924764       .       -       .       ID=gene:ENSG00000157764;assembly_name=GRCh38;biotype=protein_coding;description=B-Raf proto-oncogene%2C serine/threonine kinase [Source:HGNC Symbol%3BAcc:HGNC:1097];external_name=BRAF;logic_name=ensembl_havana_gene;version=11
+7      ensembl snRNA_gene      140884072       140884178       .       +       .       ID=gene:ENSG00000271932;assembly_name=GRCh38;biotype=snRNA;description=RNA%2C U6 small nuclear 85%2C pseudogene [Source:HGNC Symbol%3BAcc:HGNC:47048];external_name=RNU6-85P;logic_name=ncrna;version=1
+7      ensembl_havana  transcript      140719327       140722790       .       +       .       ID=transcript:ENST00000476279;Parent=gene:ENSG00000090266;assembly_name=GRCh38;biotype=protein_coding;external_name=NDUFB2-003;logic_name=havana;version=4
+7      ensembl_havana  transcript      140719327       140721955       .       +       .       ID=transcript:ENST00000461457;Parent=gene:ENSG00000090266;assembly_name=GRCh38;biotype=protein_coding;external_name=NDUFB2-004;logic_name=havana;version=1
+7      ensembl_havana  transcript      140719327       140783157       .       -       .       ID=transcript:ENST00000496384;Parent=gene:ENSG00000157764;assembly_name=GRCh38;biotype=protein_coding;external_name=BRAF-003;logic_name=havana;version=5
+7      ensembl_havana  transcript      140734479       140924764       .       -       .       ID=transcript:ENST00000288602;Parent=gene:ENSG00000157764;assembly_name=GRCh38;biotype=protein_coding;external_name=BRAF-001;logic_name=ensembl_havana_transcript;version=9
+7      ensembl_havana  NMD_transcript_variant  140734521       140754211       .       -       .       ID=transcript:ENST00000479537;Parent=gene:ENSG00000157764;assembly_name=GRCh38;biotype=nonsense_mediated_decay;external_name=BRAF-005;logic_name=havana;version=4
+7      ensembl_havana  NMD_transcript_variant  140734597       140924658       .       -       .       ID=transcript:ENST00000497784;Parent=gene:ENSG00000157764;assembly_name=GRCh38;biotype=nonsense_mediated_decay;external_name=BRAF-002;logic_name=havana;version=1
+7      ensembl_havana  aberrant_processed_transcript   140834061       140924709       .       -       .       ID=transcript:ENST00000469930;Parent=gene:ENSG00000157764;assembly_name=GRCh38;biotype=retained_intron;external_name=BRAF-004;logic_name=havana;version=1
+7      ensembl snRNA   140884072       140884178       .       +       .       ID=transcript:ENST00000605989;Parent=gene:ENSG00000271932;assembly_name=GRCh38;biotype=snRNA;external_name=RNU6-85P-201;logic_name=ncrna;version=1
+7      havana  CDS     140696745       140696842       .       +       0       ID=CDS:ENSP00000419087;Parent=transcript:ENST00000476279;assembly_name=GRCh38
+7      havana  CDS     140702866       140703010       .       +       1       ID=CDS:ENSP00000419087;Parent=transcript:ENST00000476279;assembly_name=GRCh38
+7      havana  CDS     140704860       140704934       .       +       0       ID=CDS:ENSP00000419087;Parent=transcript:ENST00000476279;assembly_name=GRCh38
+7      havana  CDS     140696745       140696842       .       +       0       ID=CDS:ENSP00000420062;Parent=transcript:ENST00000461457;assembly_name=GRCh38
+7      havana  CDS     140721552       140721744       .       +       1       ID=CDS:ENSP00000420062;Parent=transcript:ENST00000461457;assembly_name=GRCh38
+7      havana  CDS     140783021       140783157       .       -       2       ID=CDS:ENSP00000419060;Parent=transcript:ENST00000496384;assembly_name=GRCh38
+7      havana  CDS     140781576       140781693       .       -       0       ID=CDS:ENSP00000419060;Parent=transcript:ENST00000496384;assembly_name=GRCh38
+7      havana  CDS     140777991       140778075       .       -       2       ID=CDS:ENSP00000419060;Parent=transcript:ENST00000496384;assembly_name=GRCh38
+7      havana  CDS     140776912       140777088       .       -       1       ID=CDS:ENSP00000419060;Parent=transcript:ENST00000496384;assembly_name=GRCh38
+7      havana  CDS     140754187       140754233       .       -       1       ID=CDS:ENSP00000419060;Parent=transcript:ENST00000496384;assembly_name=GRCh38
+7      havana  CDS     140753275       140753393       .       -       2       ID=CDS:ENSP00000419060;Parent=transcript:ENST00000496384;assembly_name=GRCh38
+7      havana  CDS     140749287       140749418       .       -       0       ID=CDS:ENSP00000419060;Parent=transcript:ENST00000496384;assembly_name=GRCh38
+7      havana  CDS     140739812       140739946       .       -       0       ID=CDS:ENSP00000419060;Parent=transcript:ENST00000496384;assembly_name=GRCh38
+7      havana  CDS     140734617       140734770       .       -       0       ID=CDS:ENSP00000419060;Parent=transcript:ENST00000496384;assembly_name=GRCh38
+7      havana  CDS     140726494       140726516       .       -       2       ID=CDS:ENSP00000419060;Parent=transcript:ENST00000496384;assembly_name=GRCh38
+7      ensembl_havana  CDS     140924566       140924703       .       -       0       ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7      ensembl_havana  CDS     140850111       140850212       .       -       0       ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7      ensembl_havana  CDS     140834609       140834872       .       -       0       ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7      ensembl_havana  CDS     140808892       140808995       .       -       0       ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7      ensembl_havana  CDS     140807960       140808062       .       -       1       ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7      ensembl_havana  CDS     140801412       140801560       .       -       0       ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7      ensembl_havana  CDS     140800362       140800481       .       -       1       ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7      ensembl_havana  CDS     140794308       140794467       .       -       1       ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7      ensembl_havana  CDS     140787548       140787584       .       -       0       ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7      ensembl_havana  CDS     140783021       140783157       .       -       2       ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7      ensembl_havana  CDS     140781576       140781693       .       -       0       ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7      ensembl_havana  CDS     140777991       140778075       .       -       2       ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7      ensembl_havana  CDS     140776912       140777088       .       -       1       ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7      ensembl_havana  CDS     140754187       140754233       .       -       1       ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7      ensembl_havana  CDS     140753275       140753393       .       -       2       ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7      ensembl_havana  CDS     140749287       140749418       .       -       0       ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7      ensembl_havana  CDS     140739812       140739946       .       -       0       ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7      ensembl_havana  CDS     140734597       140734770       .       -       0       ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7      havana  CDS     140754187       140754211       .       -       0       ID=CDS:ENSP00000418033;Parent=transcript:ENST00000479537;assembly_name=GRCh38
+7      havana  CDS     140753275       140753393       .       -       2       ID=CDS:ENSP00000418033;Parent=transcript:ENST00000479537;assembly_name=GRCh38
+7      havana  CDS     140749287       140749418       .       -       0       ID=CDS:ENSP00000418033;Parent=transcript:ENST00000479537;assembly_name=GRCh38
+7      havana  CDS     140747415       140747447       .       -       0       ID=CDS:ENSP00000418033;Parent=transcript:ENST00000479537;assembly_name=GRCh38
+7      havana  CDS     140924566       140924658       .       -       0       ID=CDS:ENSP00000420119;Parent=transcript:ENST00000497784;assembly_name=GRCh38
+7      havana  CDS     140850111       140850212       .       -       0       ID=CDS:ENSP00000420119;Parent=transcript:ENST00000497784;assembly_name=GRCh38
+7      havana  CDS     140834609       140834872       .       -       0       ID=CDS:ENSP00000420119;Parent=transcript:ENST00000497784;assembly_name=GRCh38
+7      havana  CDS     140808892       140808995       .       -       0       ID=CDS:ENSP00000420119;Parent=transcript:ENST00000497784;assembly_name=GRCh38
+7      havana  CDS     140808295       140808316       .       -       1       ID=CDS:ENSP00000420119;Parent=transcript:ENST00000497784;assembly_name=GRCh38
diff --git a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java
new file mode 100644 (file)
index 0000000..978316b
--- /dev/null
@@ -0,0 +1,199 @@
+package jalview.ext.ensembl;
+
+import jalview.datamodel.Alignment;
+import jalview.datamodel.SequenceI;
+import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
+import jalview.io.AppletFormatAdapter;
+import jalview.io.FastaFile;
+import jalview.io.FileParse;
+import jalview.util.DBRefUtils;
+
+import java.lang.reflect.Method;
+import java.util.Arrays;
+
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+/**
+ * Tests for the Ensembl sequence fetcher ({@link SeqFetcher}) and for the
+ * accession validation offered by the Ensembl sequence proxies.
+ * <p>
+ * NOTE(review): some tests below are annotated with
+ * {@code @Test(suiteName = "live")}. TestNG documents {@code suiteName} as
+ * being ignored unless {@code @Test} is placed at class level, so the
+ * attribute does not separate these tests from the rest of the class. A TestNG
+ * group is probably what was intended - confirm against the build
+ * configuration before changing it.
+ */
+public class EnsemblSeqProxyTest
+{
+  /**
+   * Checks that, after setTesting(true), isEnsemblAvailable() reports the
+   * status that was supplied through setTestEnsemblStatus().
+   */
+  @Test
+  public void testCheckEnsembl()
+  {
+    SeqFetcher sf = new SeqFetcher();
+    sf.setTestEnsemblStatus(true);
+    sf.setTesting(true);
+    Assert.assertTrue(sf.isEnsemblAvailable());
+    sf.setTestEnsemblStatus(false);
+    Assert.assertFalse(sf.isEnsemblAvailable());
+  }
+
+  /**
+   * Asks a default SeqFetcher whether Ensembl is available and prints the
+   * answer. No assertion is made, so this test only reports the status and
+   * does not fail when isEnsemblAvailable() returns false.
+   */
+  @Test(suiteName = "live")
+  public void testLiveCheckEnsembl()
+  {
+    SeqFetcher sf = new SeqFetcher();
+    boolean isAvailable = sf.isEnsemblAvailable();
+    System.out.println("Ensembl is "
+            + (isAvailable ? "UP!" : "DOWN ******************* BAD!"));
+  }
+
+  /**
+   * Data provider "ens_seqs": prints the name of the requesting test method
+   * and supplies every row of {@link #allSeqs}.
+   * 
+   * @param m
+   *          the test method the data is being provided for
+   * @return the rows of {@link #allSeqs}
+   */
+  @DataProvider(name = "ens_seqs")
+  public Object[][] createData(Method m)
+  {
+    System.out.println(m.getName());
+    return allSeqs;
+  }
+
+  /**
+   * Test data. Each row holds, in order: the sequence type to request, the
+   * query identifier, and the expected result as FASTA text.
+   */
+  public static Object[][] allSeqs = new Object[][]
+  {
+      {
+          EnsemblSeqType.PROTEIN,
+          "CCDS5863.1",
+          ">CCDS5863.1\n"
+                  + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
+                  + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
+                  + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
+                  + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
+                  + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
+                  + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
+                  + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
+                  + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
+                  + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
+                  + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
+                  + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
+                  + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
+                  + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH\n" },
+      {
+          EnsemblSeqType.TRANSCRIPT,
+          "CCDS5863.1",
+          ">CCDS5863.1\n"
+                  + "ATGGCGGCGCTGAGCGGTGGCGGTGGTGGCGGCGCGGAGCCGGGCCAGGCTCTGTTCAAC\n"
+                  + "GGGGACATGGAGCCCGAGGCCGGCGCCGGCGCCGGCGCCGCGGCCTCTTCGGCTGCGGAC\n"
+                  + "CCTGCCATTCCGGAGGAGGTGTGGAATATCAAACAAATGATTAAGTTGACACAGGAACAT\n"
+                  + "ATAGAGGCCCTATTGGACAAATTTGGTGGGGAGCATAATCCACCATCAATATATCTGGAG\n"
+                  + "GCCTATGAAGAATACACCAGCAAGCTAGATGCACTCCAACAAAGAGAACAACAGTTATTG\n"
+                  + "GAATCTCTGGGGAACGGAACTGATTTTTCTGTTTCTAGCTCTGCATCAATGGATACCGTT\n"
+                  + "ACATCTTCTTCCTCTTCTAGCCTTTCAGTGCTACCTTCATCTCTTTCAGTTTTTCAAAAT\n"
+                  + "CCCACAGATGTGGCACGGAGCAACCCCAAGTCACCACAAAAACCTATCGTTAGAGTCTTC\n"
+                  + "CTGCCCAACAAACAGAGGACAGTGGTACCTGCAAGGTGTGGAGTTACAGTCCGAGACAGT\n"
+                  + "CTAAAGAAAGCACTGATGATGAGAGGTCTAATCCCAGAGTGCTGTGCTGTTTACAGAATT\n"
+                  + "CAGGATGGAGAGAAGAAACCAATTGGTTGGGACACTGATATTTCCTGGCTTACTGGAGAA\n"
+                  + "GAATTGCATGTGGAAGTGTTGGAGAATGTTCCACTTACAACACACAACTTTGTACGAAAA\n"
+                  + "ACGTTTTTCACCTTAGCATTTTGTGACTTTTGTCGAAAGCTGCTTTTCCAGGGTTTCCGC\n"
+                  + "TGTCAAACATGTGGTTATAAATTTCACCAGCGTTGTAGTACAGAAGTTCCACTGATGTGT\n"
+                  + "GTTAATTATGACCAACTTGATTTGCTGTTTGTCTCCAAGTTCTTTGAACACCACCCAATA\n"
+                  + "CCACAGGAAGAGGCGTCCTTAGCAGAGACTGCCCTAACATCTGGATCATCCCCTTCCGCA\n"
+                  + "CCCGCCTCGGACTCTATTGGGCCCCAAATTCTCACCAGTCCGTCTCCTTCAAAATCCATT\n"
+                  + "CCAATTCCACAGCCCTTCCGACCAGCAGATGAAGATCATCGAAATCAATTTGGGCAACGA\n"
+                  + "GACCGATCCTCATCAGCTCCCAATGTGCATATAAACACAATAGAACCTGTCAATATTGAT\n"
+                  + "GACTTGATTAGAGACCAAGGATTTCGTGGTGATGGAGGATCAACCACAGGTTTGTCTGCT\n"
+                  + "ACCCCCCCTGCCTCATTACCTGGCTCACTAACTAACGTGAAAGCCTTACAGAAATCTCCA\n"
+                  + "GGACCTCAGCGAGAAAGGAAGTCATCTTCATCCTCAGAAGACAGGAATCGAATGAAAACA\n"
+                  + "CTTGGTAGACGGGACTCGAGTGATGATTGGGAGATTCCTGATGGGCAGATTACAGTGGGA\n"
+                  + "CAAAGAATTGGATCTGGATCATTTGGAACAGTCTACAAGGGAAAGTGGCATGGTGATGTG\n"
+                  + "GCAGTGAAAATGTTGAATGTGACAGCACCTACACCTCAGCAGTTACAAGCCTTCAAAAAT\n"
+                  + "GAAGTAGGAGTACTCAGGAAAACACGACATGTGAATATCCTACTCTTCATGGGCTATTCC\n"
+                  + "ACAAAGCCACAACTGGCTATTGTTACCCAGTGGTGTGAGGGCTCCAGCTTGTATCACCAT\n"
+                  + "CTCCATATCATTGAGACCAAATTTGAGATGATCAAACTTATAGATATTGCACGACAGACT\n"
+                  + "GCACAGGGCATGGATTACTTACACGCCAAGTCAATCATCCACAGAGACCTCAAGAGTAAT\n"
+                  + "AATATATTTCTTCATGAAGACCTCACAGTAAAAATAGGTGATTTTGGTCTAGCTACAGTG\n"
+                  + "AAATCTCGATGGAGTGGGTCCCATCAGTTTGAACAGTTGTCTGGATCCATTTTGTGGATG\n"
+                  + "GCACCAGAAGTCATCAGAATGCAAGATAAAAATCCATACAGCTTTCAGTCAGATGTATAT\n"
+                  + "GCATTTGGAATTGTTCTGTATGAATTGATGACTGGACAGTTACCTTATTCAAACATCAAC\n"
+                  + "AACAGGGACCAGATAATTTTTATGGTGGGACGAGGATACCTGTCTCCAGATCTCAGTAAG\n"
+                  + "GTACGGAGTAACTGTCCAAAAGCCATGAAGAGATTAATGGCAGAGTGCCTCAAAAAGAAA\n"
+                  + "AGAGATGAGAGACCACTCTTTCCCCAAATTCTCGCCTCTATTGAGCTGCTGGCCCGCTCA\n"
+                  + "TTGCCAAAAATTCACCGCAGTGCATCAGAACCCTCCTTGAATCGGGCTGGTTTCCAAACA\n"
+                  + "GAGGATTTTAGTCTATATGCTTGTGCTTCTCCAAAAACACCCATCCAGGCAGGGGGATAT\n"
+                  + "GGTGCGTTTCCTGTCCACTGA\n" },
+      {
+          EnsemblSeqType.PROTEIN,
+          "ENSP00000288602",
+          ">ENSP00000288602\n"
+                  + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
+                  + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
+                  + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
+                  + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
+                  + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
+                  + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
+                  + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
+                  + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
+                  + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
+                  + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
+                  + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
+                  + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
+                  + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH" } };
+
+  /**
+   * Fetches a single query through SeqFetcher.getSequenceReader(), parses the
+   * response as FASTA and compares it with the expected FASTA text: the number
+   * of sequences must agree, and each expected sequence must be matched by
+   * name to exactly one retrieved sequence with an identical sequence string.
+   * 
+   * @param type
+   *          the sequence type to request
+   * @param sq
+   *          the query identifier
+   * @param fastasq
+   *          the expected result as FASTA text
+   * @throws Exception
+   *           if fetching or parsing fails
+   */
+  @Test(dataProvider = "ens_seqs", suiteName = "live")
+  public void testGetOneSeqs(EnsemblSeqType type, String sq, String fastasq)
+          throws Exception
+  {
+    SeqFetcher sf = new SeqFetcher();
+    FileParse fp = sf.getSequenceReader(type, Arrays.asList(sq));
+    SequenceI[] sqs = new FastaFile(fp).getSeqsAsArray();
+    FastaFile trueRes = new FastaFile(fastasq, AppletFormatAdapter.PASTE);
+    SequenceI[] trueSqs = trueRes.getSeqsAsArray();
+    Assert.assertEquals(sqs.length, trueSqs.length,
+            "Different number of sequences retrieved for query " + sq);
+    Alignment ral = new Alignment(sqs);
+    for (SequenceI tr : trueSqs)
+    {
+      // look up the retrieved sequence(s) carrying the expected name
+      SequenceI[] rseq = ral.findSequenceMatch(tr.getName());
+      Assert.assertNotNull(rseq,
+              "Couldn't find sequences matching expected sequence "
+                      + tr.getName());
+      Assert.assertEquals(rseq.length, 1,
+              "Expected only one sequence for sequence ID " + tr.getName());
+      Assert.assertEquals(
+              rseq[0].getSequenceAsString(),
+              tr.getSequenceAsString(),
+              "Sequences differ for " + tr.getName() + "\n" + "Exp:"
+                      + tr.getSequenceAsString() + "\n" + "Got:"
+                      + rseq[0].getSequenceAsString());
+
+    }
+  }
+
+  /**
+   * Checks that each query identifier is accepted by the accession validator
+   * of EnsemblProtein, and that DBRefUtils.processQueryToAccessionFor()
+   * returns the identifier unchanged.
+   * <p>
+   * NOTE(review): 'type' and 'fastasq' are not used here - every identifier,
+   * including the TRANSCRIPT row, is validated against EnsemblProtein. Confirm
+   * whether the proxy should instead be chosen from 'type'.
+   * 
+   * @param type
+   *          the sequence type of the data row (currently unused)
+   * @param sq
+   *          the query identifier to validate
+   * @param fastasq
+   *          the expected FASTA text of the data row (currently unused)
+   * @throws Exception
+   *           if the proxy cannot be constructed
+   */
+  @Test(dataProvider = "ens_seqs")
+  public void testRegexForProxy(EnsemblSeqType type, String sq,
+          String fastasq) throws Exception
+  {
+    EnsemblSeqProxy esq = new EnsemblProtein();
+    Assert.assertTrue(esq.isValidReference(sq),
+            "Expected reference string " + sq + " to be valid for regex "
+                    + esq.getAccessionValidator().toString());
+
+    // TestNG argument order is (actual, expected, message)
+    Assert.assertEquals(DBRefUtils.processQueryToAccessionFor(esq, sq), sq,
+            "Regex for " + esq.getClass().toString() + " not correct.");
+  }
+  // TODO:
+  // sequence query with ENSG and anything other than a genomic type will yield
+  // sequences with different IDs which will
+  // break the post-processing stage where DBRefs are assigned to sequences.
+  // -> multiple_sequences = true is needed as an additional parameter
+  // http://rest.ensembl.org/sequence/id/ENSG00000157764?content-type=text/x-json;type=protein;multiple_sequences=true
+  // result with four transcripts, cds, cdna, and protein products.
+  // *
+  // features for ENSG -
+  // http://rest.ensembl.org/overlap/id/ENSG00000157764?feature=cds&feature=exon&feature=transcript&content-type=text/x-gff3
+  // transcript: gives locus, all transcript products with ENSG parents
+  // gene: give all ENSG on locus
+  // exon: all exon boundaries. CDS same info.
+
+  // Disabled draft of additional DBRef checks for testGetOneSeqs:
+  // @Test(dataProvider = "ens_seqs", suiteName = "live")
+  // public void testGetOneSeqs(EnsemblSeqType type, String sq, String fastasq)
+  // throws Exception
+  // {
+  //
+  // {
+  // Assert.assertTrue(rseq[0].getDBRef() != null
+  // && rseq[0].getDBRef().length > 0,
+  // "No database references added to sequence by fetcher.");
+  // Assert.assertNotNull(DBRefUtils.searchRefs(rseq[0].getDBRef(),
+  // new DBRefEntry("ENSEMBL", null, sq)),
+  // "Couldn't find database references added to sequence by fetcher.");
+  //
+  // }
+}
\ No newline at end of file