--- /dev/null
+package jalview.ext.ensembl;
+
+import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
+
+import com.stevesoft.pat.Regex;
+
+/**
+ * Ensembl sequence source configured to request cDNA sequences
+ * ({@link EnsemblSeqType#CDNA}).
+ */
+public class EnsemblCdna extends EnsemblSeqProxy
+{
+  /** Name reported by {@link #getDbName()}. */
+  private static final String DB_NAME = "ENSEMBL (CDNA)";
+
+  /** Pattern text used to build the accession validator. */
+  private static final String ACCESSION_PATTERN = "((ENST|ENSG|CCDS)[0-9.]{3,})";
+
+  /** Accession reported by {@link #getTestQuery()}. */
+  private static final String TEST_QUERY = "ENST00000288602";
+
+  /**
+   * Creates the source; all set-up is delegated to the superclass.
+   *
+   * @throws Exception
+   *           propagated from the superclass constructor
+   */
+  public EnsemblCdna() throws Exception
+  {
+    super();
+  }
+
+  /**
+   * @return the name of this source
+   */
+  @Override
+  public String getDbName()
+  {
+    return DB_NAME;
+  }
+
+  /**
+   * @return the sequence type requested by this source (cDNA)
+   */
+  @Override
+  protected EnsemblSeqType getSourceEnsemblType()
+  {
+    return EnsemblSeqType.CDNA;
+  }
+
+  /**
+   * @return a newly constructed validator accepting ENST, ENSG and CCDS
+   *         prefixed accessions
+   */
+  @Override
+  public Regex getAccessionValidator()
+  {
+    return new Regex(ACCESSION_PATTERN);
+  }
+
+  /**
+   * @return an example accession for this source
+   */
+  @Override
+  public String getTestQuery()
+  {
+    return TEST_QUERY;
+  }
+
+}
--- /dev/null
+package jalview.ext.ensembl;
+
+import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
+
+/**
+ * Ensembl sequence source configured to request coding sequences
+ * ({@link EnsemblSeqType#CDS}).
+ */
+public class EnsemblCds extends EnsemblSeqProxy
+{
+  /** Name reported by {@link #getDbName()}. */
+  private static final String DB_NAME = "ENSEMBL (CDS)";
+
+  /**
+   * Creates the source; all set-up is delegated to the superclass.
+   *
+   * @throws Exception
+   *           propagated from the superclass constructor
+   */
+  public EnsemblCds() throws Exception
+  {
+    super();
+  }
+
+  /**
+   * @return the name of this source
+   */
+  @Override
+  public String getDbName()
+  {
+    return DB_NAME;
+  }
+
+  /**
+   * @return the sequence type requested by this source (CDS)
+   */
+  @Override
+  protected EnsemblSeqType getSourceEnsemblType()
+  {
+    return EnsemblSeqType.CDS;
+  }
+
+}
--- /dev/null
+package jalview.ext.ensembl;
+
+import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
+
+/**
+ * Ensembl sequence source configured to request genomic sequences
+ * ({@link EnsemblSeqType#GENOMIC}).
+ */
+public class EnsemblGenome extends EnsemblSeqProxy
+{
+  /** Name reported by {@link #getDbName()}. */
+  private static final String DB_NAME = "ENSEMBL (Genome)";
+
+  /**
+   * Creates the source; all set-up is delegated to the superclass.
+   *
+   * @throws Exception
+   *           propagated from the superclass constructor
+   */
+  public EnsemblGenome() throws Exception
+  {
+    super();
+  }
+
+  /**
+   * @return the name of this source
+   */
+  @Override
+  public String getDbName()
+  {
+    return DB_NAME;
+  }
+
+  /**
+   * @return the sequence type requested by this source (genomic)
+   */
+  @Override
+  protected EnsemblSeqType getSourceEnsemblType()
+  {
+    return EnsemblSeqType.GENOMIC;
+  }
+
+}
--- /dev/null
+package jalview.ext.ensembl;
+
+import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
+
+/**
+ * Ensembl sequence source configured to request protein sequences
+ * ({@link EnsemblSeqType#PROTEIN}).
+ */
+public class EnsemblProtein extends EnsemblSeqProxy
+{
+  /** Name reported by {@link #getDbName()}. */
+  private static final String DB_NAME = "ENSEMBL (Protein)";
+
+  /**
+   * Creates the source; all set-up is delegated to the superclass.
+   *
+   * @throws Exception
+   *           propagated from the superclass constructor
+   */
+  public EnsemblProtein() throws Exception
+  {
+    super();
+  }
+
+  /**
+   * @return the name of this source
+   */
+  @Override
+  public String getDbName()
+  {
+    return DB_NAME;
+  }
+
+  /**
+   * @return the sequence type requested by this source (protein)
+   */
+  @Override
+  protected EnsemblSeqType getSourceEnsemblType()
+  {
+    return EnsemblSeqType.PROTEIN;
+  }
+
+}
--- /dev/null
+package jalview.ext.ensembl;
+
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.DBRefSource;
+import jalview.datamodel.SequenceI;
+import jalview.exceptions.JalviewException;
+import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
+import jalview.io.FastaFile;
+import jalview.io.FileParse;
+import jalview.util.DBRefUtils;
+import jalview.ws.seqfetcher.DbSourceProxy;
+import jalview.ws.seqfetcher.DbSourceProxyImpl;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import com.stevesoft.pat.Regex;
+
+/**
+ * Base class for sequence sources that retrieve FASTA records from the Ensembl
+ * REST service through a {@link SeqFetcher}. Subclasses select the kind of
+ * sequence requested by implementing {@link #getSourceEnsemblType()}.
+ */
+public abstract class EnsemblSeqProxy extends DbSourceProxyImpl implements
+        DbSourceProxy
+{
+  /** Maximum number of accessions sent to the service in one request. */
+  private static final int CHUNK_SIZE = 50;
+
+  /** Client used for all communication with the Ensembl REST service. */
+  SeqFetcher sf;
+
+  /** True while {@link #getSequenceRecords(String)} is executing. */
+  private volatile boolean inProgress = false;
+
+  /**
+   * Creates the REST client and registers the properties of this source.
+   *
+   * @throws Exception
+   *           if set-up fails
+   */
+  public EnsemblSeqProxy() throws Exception
+  {
+    sf = new SeqFetcher();
+    addDbSourceProperty(DBRefSource.MULTIACC);
+    addDbSourceProperty(DBRefSource.SEQDB);
+    // decide whether these need to be filtered according to return type
+    addDbSourceProperty(DBRefSource.PROTSEQDB);
+    addDbSourceProperty(DBRefSource.DNACODINGSEQDB);
+    addDbSourceProperty(DBRefSource.DNASEQDB);
+  }
+
+  /**
+   * @return the database source name recorded on retrieved sequences
+   */
+  @Override
+  public String getDbSource()
+  {
+    return "ENSEMBL";
+  }
+
+  /**
+   * @return a fixed placeholder version string
+   */
+  @Override
+  public String getDbVersion()
+  {
+    return "0"; // sf.getVersion();
+  }
+
+  /**
+   * @return the separator expected between accessions in a query string
+   */
+  @Override
+  public String getAccessionSeparator()
+  {
+    return " ";
+  }
+
+  /**
+   * @return a newly constructed validator accepting ENSP, ENST, ENSG and CCDS
+   *         prefixed accessions
+   */
+  @Override
+  public Regex getAccessionValidator()
+  {
+    return new Regex("((ENSP|ENST|ENSG|CCDS)[0-9.]{3,})");
+  }
+
+  /**
+   * @return an example accession for this source
+   */
+  @Override
+  public String getTestQuery()
+  {
+    return "ENSP00000288602";
+  }
+
+  /**
+   * @param accession
+   *          candidate accession string
+   * @return true if the accession validator finds a match in the string
+   */
+  @Override
+  public boolean isValidReference(String accession)
+  {
+    return getAccessionValidator().search(accession);
+  }
+
+  /**
+   * Retrieves the sequences for a space separated list of accessions. The
+   * accessions are sent to the service in chunks of at most
+   * {@value #CHUNK_SIZE}; the sequences of all chunks are combined into one
+   * alignment. Empty tokens (caused by leading or repeated spaces) are skipped.
+   *
+   * If a chunk fails after earlier chunks succeeded, the problem is reported on
+   * stderr, retrieval stops and the sequences fetched so far are returned.
+   *
+   * @param queries
+   *          one or more accessions separated by spaces
+   * @return the retrieved sequences, or null if none were retrieved
+   * @throws Exception
+   *           a JalviewException wrapping the cause if the failure happens
+   *           before any sequence was retrieved
+   */
+  @Override
+  public AlignmentI getSequenceRecords(String queries) throws Exception
+  {
+    inProgress = true;
+    try
+    {
+      List<String> allIds = new ArrayList<String>();
+      for (String token : queries.split(" +"))
+      {
+        if (token.length() > 0)
+        {
+          allIds.add(token);
+        }
+      }
+
+      AlignmentI result = null;
+      int total = allIds.size();
+      for (int from = 0; from < total; from += CHUNK_SIZE)
+      {
+        List<String> ids = allIds.subList(from,
+                Math.min(from + CHUNK_SIZE, total));
+        try
+        {
+          AlignmentI chunk = fetchChunk(ids);
+          if (chunk != null)
+          {
+            if (result == null)
+            {
+              result = chunk;
+            }
+            else
+            {
+              result.append(chunk);
+            }
+          }
+        } catch (Throwable r)
+        {
+          String aborted = "Aborting ID retrieval after "
+                  + (from / CHUNK_SIZE) + " chunks.";
+          if (result == null)
+          {
+            throw new JalviewException(aborted + " Unexpected problem ("
+                    + r.getLocalizedMessage() + ")", r);
+          }
+          // keep what was retrieved so far, but do not try further chunks
+          System.err.println(aborted);
+          r.printStackTrace();
+          break;
+        }
+      }
+      return result;
+    } finally
+    {
+      // cleared exactly once, whichever way the method is left
+      inProgress = false;
+    }
+  }
+
+  /**
+   * Fetches one chunk of accessions and attaches a database reference to every
+   * returned sequence whose name is one of the requested ids.
+   *
+   * @param ids
+   *          the accessions of this chunk
+   * @return the retrieved sequences, or null if the response held none
+   * @throws Exception
+   *           if the service is unavailable or the request or parsing fails
+   */
+  private AlignmentI fetchChunk(List<String> ids) throws Exception
+  {
+    if (!sf.isEnsemblAvailable())
+    {
+      throw new JalviewException("ENSEMBL Rest API not available.");
+    }
+    FileParse fp = new FileParse(sf.getSequenceReader(
+            getSourceEnsemblType(), ids));
+    FastaFile fr = new FastaFile(fp);
+    if (fr.hasWarningMessage())
+    {
+      System.out.println("Warning when retrieving " + ids.size() + " ids"
+              + ids.toString() + "\n" + fr.getWarningMessage());
+    }
+    else if (fr.getSeqs().size() != ids.size())
+    {
+      System.out.println("Only retrieved " + fr.getSeqs().size()
+              + " sequences for " + ids.size() + " query strings.");
+    }
+    if (fr.getSeqs().size() == 0)
+    {
+      return null;
+    }
+    AlignmentI seqal = new jalview.datamodel.Alignment(fr.getSeqsAsArray());
+    for (SequenceI sq : seqal.getSequences())
+    {
+      if (ids.contains(sq.getName()))
+      {
+        DBRefUtils.parseToDbRef(sq, getDbSource(), getDbVersion(),
+                sq.getName());
+      }
+    }
+    return seqal;
+  }
+
+  /**
+   *
+   * @return the configured sequence return type for this source
+   */
+  protected abstract EnsemblSeqType getSourceEnsemblType();
+
+  /**
+   * @return true while a call to getSequenceRecords is running
+   */
+  @Override
+  public boolean queryInProgress()
+  {
+    return inProgress;
+  }
+
+  /**
+   * @return always null; raw records are not kept by this source
+   */
+  @Override
+  public StringBuffer getRawRecords()
+  {
+    return null;
+  }
+
+  /**
+   * @return the tier of this source, always 0
+   */
+  @Override
+  public int getTier()
+  {
+    return 0;
+  }
+}
--- /dev/null
+package jalview.ext.ensembl;
+
+import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
+
+/**
+ * Ensembl sequence source configured to request transcript sequences
+ * ({@link EnsemblSeqType#TRANSCRIPT}).
+ */
+public class EnsemblTranscript extends EnsemblSeqProxy
+{
+
+  /**
+   * Creates the source; all set-up is delegated to the superclass.
+   *
+   * @throws Exception
+   *           propagated from the superclass constructor
+   */
+  public EnsemblTranscript() throws Exception
+  {
+    super();
+  }
+
+  /**
+   * @return the name of this source; distinct from the name used by
+   *         EnsemblProtein so that both sources can be registered together
+   */
+  @Override
+  public String getDbName()
+  {
+    return "ENSEMBL (Transcript)";
+  }
+
+  /**
+   * @return the sequence type requested by this source (transcript)
+   */
+  @Override
+  protected EnsemblSeqType getSourceEnsemblType()
+  {
+    return EnsemblSeqType.TRANSCRIPT;
+  }
+
+}
--- /dev/null
+package jalview.ext.ensembl;
+
+import jalview.io.FileParse;
+
+import java.io.BufferedReader;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.net.URLConnection;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.http.NameValuePair;
+import org.apache.http.message.BasicNameValuePair;
+
+public class SeqFetcher
+{
+ private final static String ENSEMBL_REST = "rest.ensembl.org";
+
+ private static boolean ensemblRestavailable = false;
+
+ private static long lastCheck = -1;
+
+ public boolean isEnsemblAvailable()
+ {
+ if (isTesting || !ensemblRestavailable
+ || System.currentTimeMillis() - lastCheck > 10000)
+ {
+ checkEnsembl();
+ lastCheck = System.currentTimeMillis();
+ }
+ return ensemblRestavailable;
+ }
+
+ private boolean isTesting, testEnsemblStatus;
+
+ /**
+ * @return the isTesting
+ */
+ public boolean isTesting()
+ {
+ return isTesting;
+ }
+
+ /**
+ * @param isTesting
+ * the isTesting to set
+ */
+ public void setTesting(boolean isTesting)
+ {
+ this.isTesting = isTesting;
+ }
+
+ /**
+ * @return the testEnsemblStatus
+ */
+ public boolean isTestEnsemblStatus()
+ {
+ return testEnsemblStatus;
+ }
+
+ /**
+ * @param testEnsemblStatus
+ * the testEnsemblStatus to set
+ */
+ public void setTestEnsemblStatus(boolean testEnsemblStatus)
+ {
+ this.testEnsemblStatus = testEnsemblStatus;
+ }
+
+ private void checkEnsembl()
+ {
+ if (isTesting)
+ {
+ ensemblRestavailable = testEnsemblStatus;
+ return;
+ }
+ try
+ {
+ URL ping = new URL("http://" + ENSEMBL_REST + "/info/ping");
+ HttpURLConnection conn = (HttpURLConnection) (ping.openConnection());
+ if (conn.getResponseCode() >= 200 && conn.getResponseCode() < 300)
+ {
+ ensemblRestavailable = true;
+ return;
+ }
+ } catch (Error err)
+ {
+ err.printStackTrace();
+ } catch (Exception exx)
+ {
+ exx.printStackTrace();
+ }
+ ensemblRestavailable = false;
+ }
+
+ public SeqFetcher()
+ {
+ }
+
+ public enum EnsemblSeqType
+ {
+ GENOMIC("genomic"), CDS("cds"), TRANSCRIPT("cds"), PROTEIN("protein"), CDNA(
+ "cdna");
+
+ private String type;
+
+ EnsemblSeqType(String t)
+ {
+ type = t;
+ }
+
+ public String getType()
+ {
+ return type;
+ }
+ }
+
+ /**
+ * reolve request type as an argument for sequence and features queries
+ *
+ * @param type
+ */
+ public List<NameValuePair> getObjectTypeArg(EnsemblSeqType type)
+ {
+ NameValuePair nameValue = new BasicNameValuePair("type", type.getType());
+ return Collections.singletonList(nameValue);
+ }
+
+ /**
+ * return a reader to a Fasta response from the Ensembl sequence endpoint
+ *
+ * @param returnType
+ * @param ids
+ * @return
+ * @throws IOException
+ */
+ public FileParse getSequenceReader(EnsemblSeqType returnType,
+ List<String> ids) throws IOException
+ {
+
+ // adapted From the rest.ensembl.org documentation for sequence_id
+
+ String urls = "http://" + ENSEMBL_REST + "/sequence/id";
+ List<NameValuePair> vals = getObjectTypeArg(returnType);
+ boolean f = true;
+ for (NameValuePair nvp : vals)
+ {
+ if (f)
+ {
+ f = false;
+ urls += "?";
+ }
+ else
+ {
+ urls += "&";
+ }
+ urls += nvp.getName() + "=" + nvp.getValue();
+ }
+
+ URL url = new URL(urls);
+
+ URLConnection connection = url.openConnection();
+ HttpURLConnection httpConnection = (HttpURLConnection) connection;
+
+ httpConnection.setRequestMethod("POST");
+ httpConnection.setRequestProperty("Content-Type", "application/json");
+ httpConnection.setRequestProperty("Accept", "text/x-fasta");
+ byte[] thepostbody;
+ {
+ StringBuilder postBody = new StringBuilder();
+ postBody.append("{\"ids\":[");
+ boolean first = true;
+ for (String id : ids)
+ {
+ if (first)
+ {
+ first = false;
+ }
+ else
+ {
+ postBody.append(",");
+ }
+ postBody.append("\"");
+ postBody.append(id.trim());
+ postBody.append("\"");
+ }
+ postBody.append("]}");
+ thepostbody = postBody.toString().getBytes();
+ }
+ httpConnection.setRequestProperty("Content-Length",
+ Integer.toString(thepostbody.length));
+ httpConnection.setUseCaches(false);
+ httpConnection.setDoInput(true);
+ httpConnection.setDoOutput(true);
+
+ DataOutputStream wr = new DataOutputStream(
+ httpConnection.getOutputStream());
+ wr.write(thepostbody);
+ wr.flush();
+ wr.close();
+
+ InputStream response = connection.getInputStream();
+ int responseCode = httpConnection.getResponseCode();
+
+ if (responseCode != 200)
+ {
+ throw new RuntimeException(
+ "Response code was not 200. Detected response was "
+ + responseCode);
+ }
+
+ BufferedReader reader = null;
+ reader = new BufferedReader(new InputStreamReader(response, "UTF-8"));
+ FileParse fp = new FileParse(reader, url.toString(), "HTTP_POST");
+ return fp;
+ }
+
+}
import javax.swing.SwingConstants;
import javax.swing.tree.DefaultMutableTreeNode;
-import com.stevesoft.pat.Regex;
-
public class SequenceFetcher extends JPanel implements Runnable
{
JLabel dbeg = new JLabel();
resetDialog();
return;
}
+ // TODO: Refactor to GUI independent code and write tests.
// indicate if successive sources should be merged into one alignment.
boolean addToLast = false;
ArrayList<String> aresultq = new ArrayList<String>(), presultTitle = new ArrayList<String>();
DBRefEntry dbr = new DBRefEntry(), found[] = null;
dbr.setSource(proxy.getDbSource());
dbr.setVersion(null);
- if (proxy.getAccessionValidator() != null)
- {
- Regex vgr = proxy.getAccessionValidator();
- vgr.search(q);
- if (vgr.numSubs() > 0)
- {
- dbr.setAccessionId(vgr.stringMatched(1));
- }
- else
- {
- dbr.setAccessionId(vgr.stringMatched());
- }
- }
- else
- {
- dbr.setAccessionId(q);
- }
+ String accId = DBRefUtils.processQueryToAccessionFor(proxy,
+ q);
+ dbr.setAccessionId(accId);
boolean rfound = false;
for (int r = 0; r < rs.length; r++)
{
int maxLength = 0;
/**
- * Sequences to be added to form a new alignment.
+ * Sequences to be added to form a new alignment. TODO: remove vector in this
+ * class
*/
protected Vector<SequenceI> seqs;
}
/**
+ * not for general use, creates a fileParse object for an existing reader with
+ * configurable values for the origin and the type of the source
+ */
+ public FileParse(BufferedReader source, String originString,
+ String typeString)
+ {
+ type = typeString;
+ error = false;
+ inFile = null;
+ dataName = originString;
+ dataIn = source;
+ try
+ {
+ if (dataIn.markSupported())
+ {
+ dataIn.mark(READAHEAD_LIMIT);
+ }
+ } catch (IOException q)
+ {
+
+ }
+ }
+
+ /**
* Create a datasource for input to Jalview. See AppletFormatAdapter for the
* types of sources that are handled.
*
import jalview.datamodel.DBRefSource;
import jalview.datamodel.PDBEntry;
import jalview.datamodel.SequenceI;
+import jalview.ws.seqfetcher.DbSourceProxy;
import java.util.ArrayList;
import java.util.HashMap;
return ref;
}
+ /**
+ * Extract valid accession strings from a query string. Used by the
+ * SequenceFetcher and DBRefFetcher to create valid accession strings from an
+ * ID string for database sources with a Regex validation field.
+ *
+ * @param proxy
+ * @param q
+ * @return q if proxy.getAccessionValidator()==null, otherwise the matched
+ * region or the first subgroup match from the matched region
+ */
+ public static String processQueryToAccessionFor(DbSourceProxy proxy,
+ String q)
+ {
+ if (proxy.getAccessionValidator() != null)
+ {
+ Regex vgr = proxy.getAccessionValidator();
+ vgr.search(q);
+ if (vgr.numSubs() > 0)
+ {
+ return (vgr.stringMatched(1));
+ }
+ else
+ {
+ return (vgr.stringMatched());
+ }
+ }
+ else
+ {
+ return (q);
+ }
+ }
+
}
public SequenceFetcher(boolean addDas)
{
+ addDBRefSourceImpl(jalview.ext.ensembl.EnsemblProtein.class);
+ addDBRefSourceImpl(jalview.ext.ensembl.EnsemblTranscript.class);
+ addDBRefSourceImpl(jalview.ext.ensembl.EnsemblCds.class);
+ addDBRefSourceImpl(jalview.ext.ensembl.EnsemblGenome.class);
+ addDBRefSourceImpl(jalview.ext.ensembl.EnsemblCdna.class);
+
addDBRefSourceImpl(jalview.ws.dbsources.EmblSource.class);
addDBRefSourceImpl(jalview.ws.dbsources.EmblCdsSouce.class);
addDBRefSourceImpl(jalview.ws.dbsources.Uniprot.class);
public String getTestQuery();
/**
- * optionally implemented
+ * Required for sources supporting multiple query retrieval for use with the
+ * DBRefFetcher, which attempts to limit its queries with putative accession
+ * strings for a source to only those that are likely to be valid.
*
* @param accession
* @return
*
* @param queries
* - one or more queries for database in expected form
- * @return null if queries were successful but result was not parsable
+ * @return null if queries were successful but result was not parsable.
+ * Otherwise, an AlignmentI object containing properly annotated data
+ * (e.g. sequences with accessions for this datasource)
* @throws Exception
* - propagated from underlying transport to database (note -
* exceptions are not raised if query not found in database)
--- /dev/null
+##gff-version 3
+# retrieved via http://rest.ensembl.org/overlap/id/ENSG00000157764?feature=gene&feature=transcript&feature=cds&content-type=text/x-gff3
+##sequence-region 7 140719327 140924764
+7 ensembl_havana gene 140719327 140722790 . + . ID=gene:ENSG00000090266;assembly_name=GRCh38;biotype=protein_coding;description=NADH dehydrogenase (ubiquinone) 1 beta subcomplex%2C 2%2C 8kDa [Source:HGNC Symbol%3BAcc:HGNC:7697];external_name=NDUFB2;logic_name=ensembl_havana_gene;version=11
+7 ensembl_havana gene 140719327 140924764 . - . ID=gene:ENSG00000157764;assembly_name=GRCh38;biotype=protein_coding;description=B-Raf proto-oncogene%2C serine/threonine kinase [Source:HGNC Symbol%3BAcc:HGNC:1097];external_name=BRAF;logic_name=ensembl_havana_gene;version=11
+7 ensembl snRNA_gene 140884072 140884178 . + . ID=gene:ENSG00000271932;assembly_name=GRCh38;biotype=snRNA;description=RNA%2C U6 small nuclear 85%2C pseudogene [Source:HGNC Symbol%3BAcc:HGNC:47048];external_name=RNU6-85P;logic_name=ncrna;version=1
+7 ensembl_havana transcript 140719327 140722790 . + . ID=transcript:ENST00000476279;Parent=gene:ENSG00000090266;assembly_name=GRCh38;biotype=protein_coding;external_name=NDUFB2-003;logic_name=havana;version=4
+7 ensembl_havana transcript 140719327 140721955 . + . ID=transcript:ENST00000461457;Parent=gene:ENSG00000090266;assembly_name=GRCh38;biotype=protein_coding;external_name=NDUFB2-004;logic_name=havana;version=1
+7 ensembl_havana transcript 140719327 140783157 . - . ID=transcript:ENST00000496384;Parent=gene:ENSG00000157764;assembly_name=GRCh38;biotype=protein_coding;external_name=BRAF-003;logic_name=havana;version=5
+7 ensembl_havana transcript 140734479 140924764 . - . ID=transcript:ENST00000288602;Parent=gene:ENSG00000157764;assembly_name=GRCh38;biotype=protein_coding;external_name=BRAF-001;logic_name=ensembl_havana_transcript;version=9
+7 ensembl_havana NMD_transcript_variant 140734521 140754211 . - . ID=transcript:ENST00000479537;Parent=gene:ENSG00000157764;assembly_name=GRCh38;biotype=nonsense_mediated_decay;external_name=BRAF-005;logic_name=havana;version=4
+7 ensembl_havana NMD_transcript_variant 140734597 140924658 . - . ID=transcript:ENST00000497784;Parent=gene:ENSG00000157764;assembly_name=GRCh38;biotype=nonsense_mediated_decay;external_name=BRAF-002;logic_name=havana;version=1
+7 ensembl_havana aberrant_processed_transcript 140834061 140924709 . - . ID=transcript:ENST00000469930;Parent=gene:ENSG00000157764;assembly_name=GRCh38;biotype=retained_intron;external_name=BRAF-004;logic_name=havana;version=1
+7 ensembl snRNA 140884072 140884178 . + . ID=transcript:ENST00000605989;Parent=gene:ENSG00000271932;assembly_name=GRCh38;biotype=snRNA;external_name=RNU6-85P-201;logic_name=ncrna;version=1
+7 havana CDS 140696745 140696842 . + 0 ID=CDS:ENSP00000419087;Parent=transcript:ENST00000476279;assembly_name=GRCh38
+7 havana CDS 140702866 140703010 . + 1 ID=CDS:ENSP00000419087;Parent=transcript:ENST00000476279;assembly_name=GRCh38
+7 havana CDS 140704860 140704934 . + 0 ID=CDS:ENSP00000419087;Parent=transcript:ENST00000476279;assembly_name=GRCh38
+7 havana CDS 140696745 140696842 . + 0 ID=CDS:ENSP00000420062;Parent=transcript:ENST00000461457;assembly_name=GRCh38
+7 havana CDS 140721552 140721744 . + 1 ID=CDS:ENSP00000420062;Parent=transcript:ENST00000461457;assembly_name=GRCh38
+7 havana CDS 140783021 140783157 . - 2 ID=CDS:ENSP00000419060;Parent=transcript:ENST00000496384;assembly_name=GRCh38
+7 havana CDS 140781576 140781693 . - 0 ID=CDS:ENSP00000419060;Parent=transcript:ENST00000496384;assembly_name=GRCh38
+7 havana CDS 140777991 140778075 . - 2 ID=CDS:ENSP00000419060;Parent=transcript:ENST00000496384;assembly_name=GRCh38
+7 havana CDS 140776912 140777088 . - 1 ID=CDS:ENSP00000419060;Parent=transcript:ENST00000496384;assembly_name=GRCh38
+7 havana CDS 140754187 140754233 . - 1 ID=CDS:ENSP00000419060;Parent=transcript:ENST00000496384;assembly_name=GRCh38
+7 havana CDS 140753275 140753393 . - 2 ID=CDS:ENSP00000419060;Parent=transcript:ENST00000496384;assembly_name=GRCh38
+7 havana CDS 140749287 140749418 . - 0 ID=CDS:ENSP00000419060;Parent=transcript:ENST00000496384;assembly_name=GRCh38
+7 havana CDS 140739812 140739946 . - 0 ID=CDS:ENSP00000419060;Parent=transcript:ENST00000496384;assembly_name=GRCh38
+7 havana CDS 140734617 140734770 . - 0 ID=CDS:ENSP00000419060;Parent=transcript:ENST00000496384;assembly_name=GRCh38
+7 havana CDS 140726494 140726516 . - 2 ID=CDS:ENSP00000419060;Parent=transcript:ENST00000496384;assembly_name=GRCh38
+7 ensembl_havana CDS 140924566 140924703 . - 0 ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7 ensembl_havana CDS 140850111 140850212 . - 0 ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7 ensembl_havana CDS 140834609 140834872 . - 0 ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7 ensembl_havana CDS 140808892 140808995 . - 0 ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7 ensembl_havana CDS 140807960 140808062 . - 1 ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7 ensembl_havana CDS 140801412 140801560 . - 0 ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7 ensembl_havana CDS 140800362 140800481 . - 1 ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7 ensembl_havana CDS 140794308 140794467 . - 1 ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7 ensembl_havana CDS 140787548 140787584 . - 0 ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7 ensembl_havana CDS 140783021 140783157 . - 2 ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7 ensembl_havana CDS 140781576 140781693 . - 0 ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7 ensembl_havana CDS 140777991 140778075 . - 2 ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7 ensembl_havana CDS 140776912 140777088 . - 1 ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7 ensembl_havana CDS 140754187 140754233 . - 1 ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7 ensembl_havana CDS 140753275 140753393 . - 2 ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7 ensembl_havana CDS 140749287 140749418 . - 0 ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7 ensembl_havana CDS 140739812 140739946 . - 0 ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7 ensembl_havana CDS 140734597 140734770 . - 0 ID=CDS:ENSP00000288602;Parent=transcript:ENST00000288602;assembly_name=GRCh38
+7 havana CDS 140754187 140754211 . - 0 ID=CDS:ENSP00000418033;Parent=transcript:ENST00000479537;assembly_name=GRCh38
+7 havana CDS 140753275 140753393 . - 2 ID=CDS:ENSP00000418033;Parent=transcript:ENST00000479537;assembly_name=GRCh38
+7 havana CDS 140749287 140749418 . - 0 ID=CDS:ENSP00000418033;Parent=transcript:ENST00000479537;assembly_name=GRCh38
+7 havana CDS 140747415 140747447 . - 0 ID=CDS:ENSP00000418033;Parent=transcript:ENST00000479537;assembly_name=GRCh38
+7 havana CDS 140924566 140924658 . - 0 ID=CDS:ENSP00000420119;Parent=transcript:ENST00000497784;assembly_name=GRCh38
+7 havana CDS 140850111 140850212 . - 0 ID=CDS:ENSP00000420119;Parent=transcript:ENST00000497784;assembly_name=GRCh38
+7 havana CDS 140834609 140834872 . - 0 ID=CDS:ENSP00000420119;Parent=transcript:ENST00000497784;assembly_name=GRCh38
+7 havana CDS 140808892 140808995 . - 0 ID=CDS:ENSP00000420119;Parent=transcript:ENST00000497784;assembly_name=GRCh38
+7 havana CDS 140808295 140808316 . - 1 ID=CDS:ENSP00000420119;Parent=transcript:ENST00000497784;assembly_name=GRCh38
--- /dev/null
+package jalview.ext.ensembl;
+
+import jalview.datamodel.Alignment;
+import jalview.datamodel.SequenceI;
+import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
+import jalview.io.AppletFormatAdapter;
+import jalview.io.FastaFile;
+import jalview.io.FileParse;
+import jalview.util.DBRefUtils;
+
+import java.lang.reflect.Method;
+import java.util.Arrays;
+
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+public class EnsemblSeqProxyTest
+{
+ /**
+  * In testing mode the availability reported by the fetcher must follow the
+  * configured test status, without contacting the service.
+  */
+ @Test
+ public void testCheckEnsembl()
+ {
+   SeqFetcher fetcher = new SeqFetcher();
+   fetcher.setTesting(true);
+
+   fetcher.setTestEnsemblStatus(true);
+   Assert.assertTrue(fetcher.isEnsemblAvailable());
+
+   fetcher.setTestEnsemblStatus(false);
+   Assert.assertFalse(fetcher.isEnsemblAvailable());
+ }
+
+ /**
+  * Asks the fetcher whether the service is available and prints the answer;
+  * no assertion is made on the result.
+  */
+ @Test(suiteName = "live")
+ public void testLiveCheckEnsembl()
+ {
+   String status;
+   if (new SeqFetcher().isEnsemblAvailable())
+   {
+     status = "UP!";
+   }
+   else
+   {
+     status = "DOWN ******************* BAD!";
+   }
+   System.out.println("Ensembl is " + status);
+ }
+
+ /**
+  * Data provider "ens_seqs": prints the name of the requesting test method
+  * and supplies the rows of {@link #allSeqs}.
+  *
+  * @param m
+  *          the test method the data is provided for
+  * @return the shared table of sequence type, query id and expected FASTA
+  */
+ @DataProvider(name = "ens_seqs")
+ public Object[][] createData(Method m)
+ {
+   String requester = m.getName();
+   System.out.println(requester);
+   return allSeqs;
+ }
+
+ public static Object[][] allSeqs = new Object[][]
+ {
+ {
+ EnsemblSeqType.PROTEIN,
+ "CCDS5863.1",
+ ">CCDS5863.1\n"
+ + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
+ + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
+ + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
+ + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
+ + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
+ + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
+ + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
+ + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
+ + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
+ + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
+ + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
+ + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
+ + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH\n" },
+ {
+ EnsemblSeqType.TRANSCRIPT,
+ "CCDS5863.1",
+ ">CCDS5863.1\n"
+ + "ATGGCGGCGCTGAGCGGTGGCGGTGGTGGCGGCGCGGAGCCGGGCCAGGCTCTGTTCAAC\n"
+ + "GGGGACATGGAGCCCGAGGCCGGCGCCGGCGCCGGCGCCGCGGCCTCTTCGGCTGCGGAC\n"
+ + "CCTGCCATTCCGGAGGAGGTGTGGAATATCAAACAAATGATTAAGTTGACACAGGAACAT\n"
+ + "ATAGAGGCCCTATTGGACAAATTTGGTGGGGAGCATAATCCACCATCAATATATCTGGAG\n"
+ + "GCCTATGAAGAATACACCAGCAAGCTAGATGCACTCCAACAAAGAGAACAACAGTTATTG\n"
+ + "GAATCTCTGGGGAACGGAACTGATTTTTCTGTTTCTAGCTCTGCATCAATGGATACCGTT\n"
+ + "ACATCTTCTTCCTCTTCTAGCCTTTCAGTGCTACCTTCATCTCTTTCAGTTTTTCAAAAT\n"
+ + "CCCACAGATGTGGCACGGAGCAACCCCAAGTCACCACAAAAACCTATCGTTAGAGTCTTC\n"
+ + "CTGCCCAACAAACAGAGGACAGTGGTACCTGCAAGGTGTGGAGTTACAGTCCGAGACAGT\n"
+ + "CTAAAGAAAGCACTGATGATGAGAGGTCTAATCCCAGAGTGCTGTGCTGTTTACAGAATT\n"
+ + "CAGGATGGAGAGAAGAAACCAATTGGTTGGGACACTGATATTTCCTGGCTTACTGGAGAA\n"
+ + "GAATTGCATGTGGAAGTGTTGGAGAATGTTCCACTTACAACACACAACTTTGTACGAAAA\n"
+ + "ACGTTTTTCACCTTAGCATTTTGTGACTTTTGTCGAAAGCTGCTTTTCCAGGGTTTCCGC\n"
+ + "TGTCAAACATGTGGTTATAAATTTCACCAGCGTTGTAGTACAGAAGTTCCACTGATGTGT\n"
+ + "GTTAATTATGACCAACTTGATTTGCTGTTTGTCTCCAAGTTCTTTGAACACCACCCAATA\n"
+ + "CCACAGGAAGAGGCGTCCTTAGCAGAGACTGCCCTAACATCTGGATCATCCCCTTCCGCA\n"
+ + "CCCGCCTCGGACTCTATTGGGCCCCAAATTCTCACCAGTCCGTCTCCTTCAAAATCCATT\n"
+ + "CCAATTCCACAGCCCTTCCGACCAGCAGATGAAGATCATCGAAATCAATTTGGGCAACGA\n"
+ + "GACCGATCCTCATCAGCTCCCAATGTGCATATAAACACAATAGAACCTGTCAATATTGAT\n"
+ + "GACTTGATTAGAGACCAAGGATTTCGTGGTGATGGAGGATCAACCACAGGTTTGTCTGCT\n"
+ + "ACCCCCCCTGCCTCATTACCTGGCTCACTAACTAACGTGAAAGCCTTACAGAAATCTCCA\n"
+ + "GGACCTCAGCGAGAAAGGAAGTCATCTTCATCCTCAGAAGACAGGAATCGAATGAAAACA\n"
+ + "CTTGGTAGACGGGACTCGAGTGATGATTGGGAGATTCCTGATGGGCAGATTACAGTGGGA\n"
+ + "CAAAGAATTGGATCTGGATCATTTGGAACAGTCTACAAGGGAAAGTGGCATGGTGATGTG\n"
+ + "GCAGTGAAAATGTTGAATGTGACAGCACCTACACCTCAGCAGTTACAAGCCTTCAAAAAT\n"
+ + "GAAGTAGGAGTACTCAGGAAAACACGACATGTGAATATCCTACTCTTCATGGGCTATTCC\n"
+ + "ACAAAGCCACAACTGGCTATTGTTACCCAGTGGTGTGAGGGCTCCAGCTTGTATCACCAT\n"
+ + "CTCCATATCATTGAGACCAAATTTGAGATGATCAAACTTATAGATATTGCACGACAGACT\n"
+ + "GCACAGGGCATGGATTACTTACACGCCAAGTCAATCATCCACAGAGACCTCAAGAGTAAT\n"
+ + "AATATATTTCTTCATGAAGACCTCACAGTAAAAATAGGTGATTTTGGTCTAGCTACAGTG\n"
+ + "AAATCTCGATGGAGTGGGTCCCATCAGTTTGAACAGTTGTCTGGATCCATTTTGTGGATG\n"
+ + "GCACCAGAAGTCATCAGAATGCAAGATAAAAATCCATACAGCTTTCAGTCAGATGTATAT\n"
+ + "GCATTTGGAATTGTTCTGTATGAATTGATGACTGGACAGTTACCTTATTCAAACATCAAC\n"
+ + "AACAGGGACCAGATAATTTTTATGGTGGGACGAGGATACCTGTCTCCAGATCTCAGTAAG\n"
+ + "GTACGGAGTAACTGTCCAAAAGCCATGAAGAGATTAATGGCAGAGTGCCTCAAAAAGAAA\n"
+ + "AGAGATGAGAGACCACTCTTTCCCCAAATTCTCGCCTCTATTGAGCTGCTGGCCCGCTCA\n"
+ + "TTGCCAAAAATTCACCGCAGTGCATCAGAACCCTCCTTGAATCGGGCTGGTTTCCAAACA\n"
+ + "GAGGATTTTAGTCTATATGCTTGTGCTTCTCCAAAAACACCCATCCAGGCAGGGGGATAT\n"
+ + "GGTGCGTTTCCTGTCCACTGA\n" },
+ {
+ EnsemblSeqType.PROTEIN,
+ "ENSP00000288602",
+ ">ENSP00000288602\n"
+ + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
+ + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
+ + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
+ + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
+ + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
+ + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
+ + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
+ + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
+ + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
+ + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
+ + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
+ + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
+ + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH" } };
+
+ /**
+  * Retrieves a single id from the live service and compares the returned
+  * sequences with the expected FASTA text: the same number of sequences, and
+  * for every expected sequence exactly one retrieved sequence of that name
+  * with an identical sequence string.
+  *
+  * @param type
+  *          the kind of sequence to request
+  * @param sq
+  *          the id to query
+  * @param fastasq
+  *          the expected response in FASTA format
+  */
+ @Test(dataProvider = "ens_seqs", suiteName = "live")
+ public void testGetOneSeqs(EnsemblSeqType type, String sq, String fastasq)
+         throws Exception
+ {
+   FileParse response = new SeqFetcher().getSequenceReader(type,
+           Arrays.asList(sq));
+   SequenceI[] retrieved = new FastaFile(response).getSeqsAsArray();
+   SequenceI[] expected = new FastaFile(fastasq, AppletFormatAdapter.PASTE)
+           .getSeqsAsArray();
+   Assert.assertEquals(retrieved.length, expected.length,
+           "Different number of sequences retrieved for query " + sq);
+
+   Alignment retrievedAl = new Alignment(retrieved);
+   for (SequenceI want : expected)
+   {
+     String name = want.getName();
+     SequenceI[] matches = retrievedAl.findSequenceMatch(name);
+     Assert.assertNotNull(matches,
+             "Couldn't find sequences matching expected sequence " + name);
+     Assert.assertEquals(matches.length, 1,
+             "Expected only one sequence for sequence ID " + name);
+
+     String got = matches[0].getSequenceAsString();
+     Assert.assertEquals(got, want.getSequenceAsString(),
+             "Sequences differ for " + name + "\n" + "Exp:"
+                     + want.getSequenceAsString() + "\n" + "Got:" + got);
+   }
+ }
+
+ /**
+  * Checks that every query id of the data table is accepted by the accession
+  * validator of an EnsemblProtein source, and that extracting the accession
+  * from the id returns the id unchanged.
+  *
+  * @param type
+  *          unused; part of the shared data table
+  * @param sq
+  *          the id to validate
+  * @param fastasq
+  *          unused; part of the shared data table
+  */
+ @Test(dataProvider = "ens_seqs")
+ public void testRegexForProxy(EnsemblSeqType type, String sq,
+         String fastasq) throws Exception
+ {
+   EnsemblSeqProxy esq = new EnsemblProtein();
+   Assert.assertTrue(esq.isValidReference(sq),
+           "Expected reference string " + sq + " to be valid for regex "
+                   + esq.getAccessionValidator().toString());
+
+   // TestNG takes (actual, expected, message): the extracted accession is the
+   // value under test and the query id is the expectation
+   Assert.assertEquals(DBRefUtils.processQueryToAccessionFor(esq, sq), sq,
+           "Regex for " + esq.getClass().toString() + " not correct.");
+ }
+ // TODO:
+ // A sequence query with an ENSG id and any type other than genomic will
+ // yield sequences whose IDs differ from the query, which will break the
+ // post-processing stage where DBRefs are assigned to sequences.
+ // -> the additional parameter multiple_sequences=true is needed
+ // http://rest.ensembl.org/sequence/id/ENSG00000157764?content-type=text/x-json;type=protein;multiple_sequences=true
+ // result with four transcripts, cds, cdna, and protein products.
+ // *
+ // features for ENG -
+ // http://rest.ensembl.org/overlap/id/ENSG00000157764?feature=cds&feature=exon&feature=transcript&content-type=text/x-gff3
+ // transcript: gives locus, all transcript products with ENSG parents
+ // gene: give all ENSG on locus
+ // exon: all exon boundaries. CDS same info.
+
+ // @Test(dataProvider = "ens_seqs", suiteName = "live")
+ // public void testGetOneSeqs(EnsemblSeqType type, String sq, String fastasq)
+ // throws Exception
+ // {
+ //
+ // {
+ // Assert.assertTrue(rseq[0].getDBRef() != null
+ // && rseq[0].getDBRef().length > 0,
+ // "No database references added to sequence by fetcher.");
+ // Assert.assertNotNull(DBRefUtils.searchRefs(rseq[0].getDBRef(),
+ // new DBRefEntry("ENSEMBL", null, sq)),
+ // "Could't find database references added to sequence by fetcher.");
+ //
+ // }
+}
\ No newline at end of file