package jalview.ext.ensembl;
-import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
+import jalview.datamodel.SequenceFeature;
+import jalview.io.gff.SequenceOntology;
import com.stevesoft.pat.Regex;
public class EnsemblCdna extends EnsemblSeqProxy
{
+ /*
+ * fetch exon features on genomic sequence (to identify the cdnaregions)
+ * and cds and variation features (to retain)
+ */
+ private static final EnsemblFeatureType[] FEATURES_TO_FETCH = {
+ EnsemblFeatureType.exon, EnsemblFeatureType.cds,
+ EnsemblFeatureType.variation };
public EnsemblCdna()
{
}
@Override
- public String getTestQuery()
+ protected EnsemblFeatureType[] getFeaturesToFetch()
{
- return "ENST00000288602";
+ return FEATURES_TO_FETCH;
+ }
+
+ /**
+  * Decides whether a feature of the given type is kept on the retrieved
+  * sequence. Exon features (including Sequence Ontology sub-types of exon)
+  * are rejected: they are fetched only to locate the exon ranges, and add
+  * nothing once placed on the exon sequence itself.
+  *
+  * @param type
+  *          the feature type to test
+  * @return false for 'exon' or an exon sub-type, otherwise true
+  */
+ @Override
+ protected boolean retainFeature(String type)
+ {
+   SequenceOntology ontology = SequenceOntology.getInstance();
+   boolean isExon = ontology.isA(type, SequenceOntology.EXON);
+   return !isExon;
+ }
+
+ /**
+  * Tests whether a genomic feature marks part of the sequence being
+  * retrieved: its type must be 'exon' (or a Sequence Ontology sub-type of
+  * exon) and its "Parent" attribute must name the requested transcript.
+  *
+  * @param sf
+  *          the genomic feature to test
+  * @param accId
+  *          the transcript accession being retrieved
+  * @return true if the feature is an exon whose Parent equals
+  *         "transcript:" followed by accId
+  */
+ @Override
+ protected boolean identifiesSequence(SequenceFeature sf, String accId)
+ {
+   SequenceOntology ontology = SequenceOntology.getInstance();
+   if (!ontology.isA(sf.getType(), SequenceOntology.EXON))
+   {
+     return false;
+   }
+   String parent = (String) sf.getValue("Parent");
+   String wantedParent = "transcript:" + accId;
+   return wantedParent.equals(parent);
+ }
}
}
package jalview.ext.ensembl;
-import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
+import jalview.datamodel.SequenceFeature;
+import jalview.io.gff.SequenceOntology;
public class EnsemblCds extends EnsemblSeqProxy
{
+ /*
+ * fetch cds features on genomic sequence (to identify the CDS regions)
+ * and variation features (to retain)
+ */
+ private static final EnsemblFeatureType[] FEATURES_TO_FETCH = {
+ EnsemblFeatureType.cds, EnsemblFeatureType.variation };
+ /**
+ * Constructor
+ */
public EnsemblCds()
{
super();
return EnsemblSeqType.CDS;
}
+ @Override
+ protected EnsemblFeatureType[] getFeaturesToFetch()
+ {
+ return FEATURES_TO_FETCH; // cds (locates the CDS regions) and variation (retained)
+ }
+
+ /**
+  * Decides whether a feature of the given type is kept on the retrieved
+  * sequence. CDS features (including Sequence Ontology sub-types of CDS) are
+  * rejected: they are fetched only to locate the cds ranges, and add nothing
+  * once placed on the cds sequence itself.
+  *
+  * @param type
+  *          the feature type to test
+  * @return false for 'CDS' or a CDS sub-type, otherwise true
+  */
+ @Override
+ protected boolean retainFeature(String type)
+ {
+   SequenceOntology ontology = SequenceOntology.getInstance();
+   boolean isCds = ontology.isA(type, SequenceOntology.CDS);
+   return !isCds;
+ }
+
+ /**
+  * Tests whether a genomic feature marks part of the sequence being
+  * retrieved: its type must be 'CDS' (or a Sequence Ontology sub-type of CDS)
+  * and its "Parent" attribute must name the requested transcript.
+  *
+  * @param sf
+  *          the genomic feature to test
+  * @param accId
+  *          the transcript accession being retrieved
+  * @return true if the feature is a CDS whose Parent equals
+  *         "transcript:" followed by accId
+  */
+ @Override
+ protected boolean identifiesSequence(SequenceFeature sf, String accId)
+ {
+   SequenceOntology ontology = SequenceOntology.getInstance();
+   if (!ontology.isA(sf.getType(), SequenceOntology.CDS))
+   {
+     return false;
+   }
+   String parent = (String) sf.getValue("Parent");
+   String wantedParent = "transcript:" + accId;
+   return wantedParent.equals(parent);
+ }
+
}
package jalview.ext.ensembl;
-import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
+import jalview.datamodel.SequenceFeature;
+import jalview.io.gff.SequenceOntology;
public class EnsemblGenome extends EnsemblSeqProxy
{
+ /*
+ * fetch transcript features on genomic sequence (to identify the transcript
+ * regions) and cds, exon and variation features (to retain)
+ */
+ private static final EnsemblFeatureType[] FEATURES_TO_FETCH = {
+ EnsemblFeatureType.transcript, EnsemblFeatureType.exon,
+ EnsemblFeatureType.cds, EnsemblFeatureType.variation };
public EnsemblGenome()
{
return EnsemblSeqType.GENOMIC;
}
+ @Override
+ protected EnsemblFeatureType[] getFeaturesToFetch()
+ {
+ return FEATURES_TO_FETCH; // transcript (locates the sequence) plus exon, cds and variation (retained)
+ }
+
+ /**
+  * Decides whether a feature of the given type is kept on the retrieved
+  * sequence. Transcript features (including Sequence Ontology sub-types of
+  * transcript) are rejected: they are fetched only to locate the transcript
+  * range, and add nothing once placed on the transcript sequence itself.
+  *
+  * @param type
+  *          the feature type to test
+  * @return false for 'transcript' or a transcript sub-type, otherwise true
+  */
+ @Override
+ protected boolean retainFeature(String type)
+ {
+   SequenceOntology ontology = SequenceOntology.getInstance();
+   boolean isTranscript = ontology.isA(type, SequenceOntology.TRANSCRIPT);
+   return !isTranscript;
+ }
+
+ /**
+  * Tests whether a genomic feature marks the sequence being retrieved: its
+  * type must be 'transcript' (or a Sequence Ontology sub-type of transcript)
+  * and its "ID" attribute must name the requested transcript.
+  *
+  * @param sf
+  *          the genomic feature to test
+  * @param accId
+  *          the transcript accession being retrieved
+  * @return true if the feature is a transcript whose ID equals
+  *         "transcript:" followed by accId
+  */
+ @Override
+ protected boolean identifiesSequence(SequenceFeature sf, String accId)
+ {
+   SequenceOntology ontology = SequenceOntology.getInstance();
+   if (!ontology.isA(sf.getType(), SequenceOntology.TRANSCRIPT))
+   {
+     return false;
+   }
+   String featureId = (String) sf.getValue("ID");
+   String wantedId = "transcript:" + accId;
+   return wantedId.equals(featureId);
+ }
+
}
--- /dev/null
+package jalview.ext.ensembl;
+
+import jalview.datamodel.Alignment;
+import jalview.datamodel.AlignmentI;
+import jalview.io.FeaturesFile;
+import jalview.io.FileParse;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * REST client that requests Ensembl 'overlap' data for a sequence identifier
+ * in GFF3 format and parses the response into sequence features
+ *
+ * @author gmcarstairs
+ * @see http://rest.ensembl.org/documentation/info/overlap_id
+ */
+public class EnsemblOverlap extends EnsemblRestClient
+{
+  /*
+   * The feature types requested from Ensembl; these defaults are replaced
+   * when the two-argument getSequenceRecords is called
+   */
+  private EnsemblFeatureType[] featuresWanted = { EnsemblFeatureType.cds,
+      EnsemblFeatureType.exon, EnsemblFeatureType.variation };
+
+  @Override
+  public String getDbName()
+  {
+    return "ENSEMBL (overlap)";
+  }
+
+  /**
+   * Queries the REST overlap endpoint for a single sequence identifier. The
+   * result is an 'alignment' built from the sequences parsed out of the GFF
+   * response: a 'dummy sequence' standing for the genomic sequence, carrying
+   * the sequence features (such as CDS regions or sequence variations) that
+   * are the real information of interest.
+   *
+   * @param query
+   *          the sequence identifier to look up
+   * @return an alignment of the sequence(s) parsed from the response
+   * @throws IOException
+   *           if the request or the parsing of its response fails
+   */
+  @Override
+  public AlignmentI getSequenceRecords(String query) throws IOException
+  {
+    // TODO: use a vararg String... for getSequenceRecords instead?
+    List<String> singleQuery = new ArrayList<String>();
+    singleQuery.add(query);
+    FeaturesFile gff = new FeaturesFile(getSequenceReader(singleQuery));
+    return new Alignment(gff.getSeqsAsArray());
+  }
+
+  /**
+   * Builds the URL for the REST overlap endpoint. Only the first id is used;
+   * the GFF3 content type and one 'feature' parameter per wanted feature type
+   * are appended as query parameters.
+   *
+   * @param ids
+   *          the query ids, of which the first is placed in the URL path
+   * @return the overlap request URL
+   * @throws MalformedURLException
+   *           if the assembled string is not a valid URL
+   */
+  @Override
+  protected URL getUrl(List<String> ids) throws MalformedURLException
+  {
+    StringBuilder address = new StringBuilder(128);
+    address.append(ENSEMBL_REST);
+    address.append("/overlap/id/");
+    address.append(ids.get(0));
+
+    // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats
+    address.append("?content-type=text/x-gff3");
+
+    /*
+     * specify features to retrieve
+     * @see http://rest.ensembl.org/documentation/info/overlap_id
+     * could make the list a configurable entry in jalview.properties
+     */
+    for (int i = 0; i < featuresWanted.length; i++)
+    {
+      address.append("&feature=");
+      address.append(featuresWanted[i].name());
+    }
+
+    return new URL(address.toString());
+  }
+
+  /**
+   * Answers true: this client issues GET requests
+   */
+  @Override
+  public boolean useGetRequest()
+  {
+    return true;
+  }
+
+  /**
+   * Returns the MIME type for GFF3, used as the Content-Type request header
+   */
+  @Override
+  public String getRequestMimeType()
+  {
+    return "text/x-gff3";
+  }
+
+  /**
+   * Returns the MIME type for GFF3, used as the Accept request header
+   */
+  @Override
+  public String getResponseMimeType()
+  {
+    return "text/x-gff3";
+  }
+
+  /**
+   * Overload that first replaces the feature types to request and then
+   * performs the query. The replacement is stored on this instance, so it
+   * also applies to later calls of the single-argument method.
+   *
+   * @param accId
+   *          the sequence identifier to look up
+   * @param features
+   *          the feature types to request
+   * @return an alignment of the sequence(s) parsed from the response
+   * @throws IOException
+   *           if the request or the parsing of its response fails
+   */
+  public AlignmentI getSequenceRecords(String accId,
+          EnsemblFeatureType[] features) throws IOException
+  {
+    this.featuresWanted = features;
+    return getSequenceRecords(accId);
+  }
+}
package jalview.ext.ensembl;
-import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.SequenceFeature;
public class EnsemblProtein extends EnsemblSeqProxy
{
return EnsemblSeqType.PROTEIN;
}
+ /**
+ * Returns false, as this fetcher does not retrieve DNA sequences.
+ */
@Override
public boolean isDnaCoding()
{
return "ENSP00000288602";
}
+ /**
+ * Overrides base class method to do nothing - genomic features are not
+ * applicable to the protein product sequence
+ *
+ * @param accId
+ *          ignored
+ * @param alignment
+ *          ignored, and left unmodified
+ */
+ @Override
+ protected void addFeaturesAndProduct(String accId, AlignmentI alignment)
+ {
+ }
+
+ @Override
+ protected EnsemblFeatureType[] getFeaturesToFetch()
+ {
+ // not applicable - can't fetch genomic features for a protein sequence
+ return null; // NOTE(review): null rather than an empty array - a caller would have to null-check
+ }
+
+ @Override
+ protected boolean identifiesSequence(SequenceFeature sf, String accId)
+ {
+ // not applicable - protein sequence is not a 'subset' of genomic sequence
+ return false; // both arguments are ignored
+ }
+
}
--- /dev/null
+package jalview.ext.ensembl;
+
+import jalview.io.FileParse;
+
+import java.io.BufferedReader;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.List;
+
+import javax.ws.rs.HttpMethod;
+
+/**
+ * Base class for Ensembl REST service clients
+ *
+ * @author gmcarstairs
+ */
+abstract class EnsemblRestClient extends EnsemblSequenceFetcher
+{
+ protected final static String ENSEMBL_REST = "http://rest.ensembl.org";
+
+ protected static final String SEQUENCE_ID_URL = ENSEMBL_REST
+ + "/sequence/id";
+
+ // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats
+ private static final String PING_URL = "http://rest.ensembl.org/info/ping.json";
+
+ private final static long RETEST_INTERVAL = 10000L; // 10 seconds
+
+ private static boolean ensemblRestAvailable = false;
+
+ private static long lastCheck = -1;
+
+ protected volatile boolean inProgress = false;
+
+ @Override
+ public boolean queryInProgress()
+ {
+ return inProgress;
+ }
+
+ @Override
+ public StringBuffer getRawRecords()
+ {
+ return null;
+ }
+
+ /**
+ * Returns the URL for the client http request
+ *
+ * @param ids
+ * @return
+ * @throws MalformedURLException
+ */
+ protected abstract URL getUrl(List<String> ids)
+ throws MalformedURLException;
+
+ /**
+ * Returns true if client uses GET method, false if it uses POST
+ *
+ * @return
+ */
+ public abstract boolean useGetRequest();
+
+ /**
+ * Return the desired value for the Content-Type request header
+ *
+ * @return
+ * @see https://github.com/Ensembl/ensembl-rest/wiki/HTTP-Headers
+ */
+ public abstract String getRequestMimeType();
+
+ /**
+ * Return the desired value for the Accept request header
+ *
+ * @return
+ * @see https://github.com/Ensembl/ensembl-rest/wiki/HTTP-Headers
+ */
+ public abstract String getResponseMimeType();
+
+ /**
+ * Tries to connect to Ensembl's REST 'ping' endpoint, and returns true if
+ * successful, else false
+ *
+ * @return
+ */
+ private boolean checkEnsembl()
+ {
+ try
+ {
+ URL ping = new URL(PING_URL);
+ HttpURLConnection conn = (HttpURLConnection) ping.openConnection();
+ int rc = conn.getResponseCode();
+ conn.disconnect();
+ if (rc >= 200 && rc < 300)
+ {
+ return true;
+ }
+ } catch (Throwable t)
+ {
+ System.err.println("Error connecting to " + PING_URL + ": "
+ + t.getMessage());
+ }
+ return false;
+ }
+
+ /**
+ * returns a reader to a Fasta response from the Ensembl sequence endpoint
+ *
+ * @param ids
+ * @return
+ * @throws IOException
+ */
+ public FileParse getSequenceReader(List<String> ids)
+ throws IOException
+ {
+ URL url = getUrl(ids);
+
+ HttpURLConnection connection = (HttpURLConnection) url.openConnection();
+
+ /*
+ * POST method allows multiple queries in one request; it is supported for
+ * sequence queries, but not for overlap
+ */
+ connection.setRequestMethod(useGetRequest() ? HttpMethod.GET
+ : HttpMethod.POST);
+ connection.setRequestProperty("Content-Type", getRequestMimeType());
+ connection.setRequestProperty("Accept", getResponseMimeType());
+
+ connection.setUseCaches(false);
+ connection.setDoInput(true);
+ connection.setDoOutput(true);
+
+ if (!useGetRequest())
+ {
+ writePostBody(connection, ids);
+ }
+
+ InputStream response = connection.getInputStream();
+ int responseCode = connection.getResponseCode();
+
+ if (responseCode != 200)
+ {
+ throw new RuntimeException(
+ "Response code was not 200. Detected response was "
+ + responseCode);
+ }
+
+ BufferedReader reader = null;
+ reader = new BufferedReader(new InputStreamReader(response, "UTF-8"));
+ FileParse fp = new FileParse(reader, url.toString(), "HTTP_POST");
+ return fp;
+ }
+
+ /**
+ * Rechecks if Ensembl is responding, unless the last check was successful and
+ * the retest interval has not yet elapsed. Returns true if Ensembl is up,
+ * else false.
+ *
+ * @return
+ */
+ public boolean isEnsemblAvailable()
+ {
+ long now = System.currentTimeMillis();
+ boolean retest = now - lastCheck > RETEST_INTERVAL;
+ if (ensemblRestAvailable && !retest)
+ {
+ return true;
+ }
+ ensemblRestAvailable = checkEnsembl();
+ lastCheck = now;
+ return ensemblRestAvailable;
+ }
+
+ /**
+ * Constructs, writes and flushes the POST body of the request, containing the
+ * query ids in JSON format
+ *
+ * @param connection
+ * @param ids
+ * @throws IOException
+ */
+ protected void writePostBody(HttpURLConnection connection,
+ List<String> ids) throws IOException
+ {
+ boolean first;
+ StringBuilder postBody = new StringBuilder(64);
+ postBody.append("{\"ids\":[");
+ first = true;
+ for (String id : ids)
+ {
+ if (!first)
+ {
+ postBody.append(",");
+ }
+ first = false;
+ postBody.append("\"");
+ postBody.append(id.trim());
+ postBody.append("\"");
+ }
+ postBody.append("]}");
+ byte[] thepostbody = postBody.toString().getBytes();
+ connection.setRequestProperty("Content-Length",
+ Integer.toString(thepostbody.length));
+ DataOutputStream wr = new DataOutputStream(connection.getOutputStream());
+ wr.write(thepostbody);
+ wr.flush();
+ wr.close();
+ }
+
+}
import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentI;
+import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.DBRefSource;
+import jalview.datamodel.Mapping;
+import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.exceptions.JalviewException;
-import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
import jalview.io.FastaFile;
import jalview.io.FileParse;
+import jalview.io.gff.SequenceOntology;
import jalview.util.DBRefUtils;
-import jalview.ws.seqfetcher.DbSourceProxyImpl;
+import jalview.util.MapList;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
import java.util.List;
-import com.stevesoft.pat.Regex;
-
-public abstract class EnsemblSeqProxy extends DbSourceProxyImpl
+/**
+ * Base class for Ensembl sequence fetchers
+ *
+ * @author gmcarstairs
+ */
+public abstract class EnsemblSeqProxy extends EnsemblRestClient
{
- SeqFetcher sf;
-
- public EnsemblSeqProxy()
+ public enum EnsemblSeqType
{
- sf = new SeqFetcher();
- }
+ /**
+ * type=genomic for the full dna including introns
+ */
+ GENOMIC("genomic"),
- @Override
- public String getDbSource()
- {
- return "ENSEMBL";
- }
+ /**
+ * type=cdna for transcribed dna including UTRs
+ */
+ CDNA("cdna"),
+ /**
+ * type=cds for coding dna excluding UTRs
+ */
+ CDS("cds"),
- @Override
- public String getDbVersion()
- {
- return "0"; // sf.getVersion();
- }
+ /**
+ * type=protein for the peptide product sequence
+ */
+ PROTEIN("protein");
- @Override
- public String getAccessionSeparator()
- {
- return " ";
- }
+ /*
+ * the value of the 'type' parameter to fetch this version of
+ * an Ensembl sequence
+ */
+ private String type;
+
+ EnsemblSeqType(String t)
+ {
+ type = t;
+ }
+
+ public String getType()
+ {
+ return type;
+ }
- @Override
- public Regex getAccessionValidator()
- {
- return new Regex("((ENSP|ENST|ENSG|CCDS)[0-9.]{3,})");
}
/**
- * Default test query is a transcript
+ * A comparator to sort ranges into ascending start position order
*/
- @Override
- public String getTestQuery()
+ private class RangeSorter implements Comparator<int[]>
{
- return "ENST00000288602";
- }
+ boolean forwards;
- @Override
- public boolean isValidReference(String accession)
+ RangeSorter(boolean forward)
+ {
+ forwards = forward;
+ }
+
+ @Override
+ public int compare(int[] o1, int[] o2)
+ {
+ return (forwards ? 1 : -1) * Integer.compare(o1[0], o2[0]);
+ }
+
+ };
+
+ /**
+ * Constructor
+ */
+ public EnsemblSeqProxy()
{
- return getAccessionValidator().search(accession);
}
- private volatile boolean inProgress = false;
-
+ /**
+ * Makes the sequence queries to Ensembl's REST service and returns an
+ * alignment consisting of the returned sequences
+ */
@Override
- public AlignmentI getSequenceRecords(String queries) throws Exception
+ public AlignmentI getSequenceRecords(String query) throws Exception
{
+ // TODO use a String... query vararg instead?
+
+ // danger: accession separator used as a regex here, a string elsewhere
+ // in this case it is ok (it is just a space), but (e.g.) '\' would not be
+ List<String> allIds = Arrays.asList(query.split(getAccessionSeparator()));
+ AlignmentI alignment = null;
inProgress = true;
- List<String> tids, ids = new ArrayList<String>();
- tids = Arrays.asList(queries.split(" +"));
- AlignmentI rtn = null;
/*
* execute queries, if necessary in batches of the
* maximum allowed number of ids
*/
int maxQueryCount = getMaximumQueryCount();
- for (int v = 0, vSize = tids.size(); v < vSize; v += maxQueryCount)
+ for (int v = 0, vSize = allIds.size(); v < vSize; v += maxQueryCount)
{
int p = Math.min(vSize, v + maxQueryCount);
- ids = tids.subList(v, p);
+ List<String> ids = allIds.subList(v, p);
try
{
- if (!sf.isEnsemblAvailable())
- {
- inProgress = false;
- throw new JalviewException("ENSEMBL Rest API not available.");
- }
- FileParse fp = new FileParse(sf.getSequenceReader(
- getSourceEnsemblType(), ids));
- FastaFile fr = new FastaFile(fp);
- if (fr.hasWarningMessage())
- {
- System.out
- .println("Warning when retrieving " + ids.size() + " ids"
- + ids.toString() + "\n" + fr.getWarningMessage());
- }
- else if (fr.getSeqs().size() != ids.size())
- {
- System.out.println("Only retrieved " + fr.getSeqs().size()
- + " sequences for " + ids.size() + " query strings.");
- }
- if (fr.getSeqs().size() > 0)
- {
- AlignmentI seqal = new Alignment(
- fr.getSeqsAsArray());
- for (SequenceI sq:seqal.getSequences())
- {
- if (ids.contains((sq.getName())))
- {
- DBRefUtils.parseToDbRef(sq, "ENSEMBL", "0", sq.getName());
- }
- }
- if (rtn == null)
- {
- rtn = seqal;
- }
- else
- {
- rtn.append(seqal);
- }
- }
+ alignment = fetchSequences(ids, alignment);
} catch (Throwable r)
{
inProgress = false;
- if (rtn != null)
+ String msg = "Aborting ID retrieval after " + v
+ + " chunks. Unexpected problem (" + r.getLocalizedMessage()
+ + ")";
+ System.err.println(msg);
+ if (alignment != null)
{
- System.err.println("Aborting ID retrieval after " + v
- + " chunks.");
- r.printStackTrace();
+ break; // return what we got
}
else
{
-
- throw new JalviewException("Aborting ID retrieval after " + v
- + " chunks. Unexpected problem ("
- + r.getLocalizedMessage() + ")", r);
+ throw new JalviewException(msg, r);
}
-
}
}
+
+ /*
+ * fetch and transfer genomic sequence features
+ */
+ for (String accId : allIds)
+ {
+ addFeaturesAndProduct(accId, alignment);
+ }
+
inProgress = false;
- return rtn;
+ return alignment;
}
/**
+ * Fetches Ensembl features using the /overlap REST endpoint, and adds them to
+ * the sequence in the alignment. Also fetches the protein product, maps it
+ * from the CDS features of the sequence, and saves it as a cross-reference of
+ * the dna sequence.
*
- * @return the configured sequence return type for this source
+ * @param accId
+ * @param alignment
*/
- protected abstract EnsemblSeqType getSourceEnsemblType();
+ protected void addFeaturesAndProduct(String accId, AlignmentI alignment)
+ {
+   if (alignment == null)
+   {
+     // no sequences were retrieved, so there is nothing to annotate
+     return;
+   }
+   try
+   {
+     /*
+      * get 'dummy' genomic sequence carrying the features this fetcher
+      * asks for (see getFeaturesToFetch)
+      */
+     EnsemblOverlap gffFetcher = new EnsemblOverlap();
+     EnsemblFeatureType[] features = getFeaturesToFetch();
+     AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId,
+             features);
+     if (geneFeatures.getHeight() > 0)
+     {
+       SequenceI genomicSequence = geneFeatures.getSequenceAt(0);
+       SequenceI querySeq = alignment.findName(accId);
+       if (querySeq == null)
+       {
+         // this accession is not among the retrieved sequences
+         return;
+       }
+
+       /*
+        * transfer features to the query sequence
+        */
+       transferFeatures(accId, genomicSequence, querySeq);
- @Override
- public boolean queryInProgress()
+       /*
+        * fetch and map protein product, and add it as a cross-reference
+        * of the retrieved sequence
+        */
+       addProteinProduct(querySeq);
+     }
+   } catch (IOException e)
+   {
+     // best effort: the sequence is still returned, just without features
+     System.err.println("Error transferring Ensembl features: "
+             + e.getMessage());
+   }
+ }
+
+ /**
+ * Returns those sequence feature types to fetch from Ensembl. We may want
+ * features either because they are of interest to the user, or as means to
+ * identify the locations of the sequence on the genomic sequence (CDS
+ * features identify CDS, exon features identify cDNA etc).
+ *
+ * @return
+ */
+ protected abstract EnsemblFeatureType[] getFeaturesToFetch();
+
+ /**
+ * Fetches and maps the protein product, and adds it as a cross-reference of
+ * the retrieved sequence
+ */
+ protected void addProteinProduct(SequenceI querySeq)
{
- return inProgress;
+ String accId = querySeq.getName();
+ try
+ {
+ AlignmentI protein = new EnsemblProtein().getSequenceRecords(accId);
+ if (protein == null || protein.getHeight() == 0)
+ {
+ System.out.println("Failed to retrieve protein for " + accId);
+ return;
+ }
+ SequenceI proteinSeq = protein.getSequenceAt(0);
+
+ /*
+ * need dataset sequences (to be the subject of mappings)
+ */
+ proteinSeq.createDatasetSequence();
+ querySeq.createDatasetSequence();
+
+ MapList mapList = mapCdsToProtein(querySeq, proteinSeq);
+ if (mapList != null)
+ {
+ Mapping map = new Mapping(proteinSeq.getDatasetSequence(), mapList);
+ DBRefEntry dbr = new DBRefEntry(getDbSource(), getDbVersion(),
+ accId, map);
+ querySeq.getDatasetSequence().addDBRef(dbr);
+ }
+ } catch (Exception e)
+ {
+ System.err
+ .println(String.format("Error retrieving protein for %s: %s",
+ accId, e.getMessage()));
+ }
}
- @Override
- public StringBuffer getRawRecords()
+ /**
+ * Returns a mapping from dna to protein by inspecting sequence features of
+ * type "CDS" on the dna.
+ *
+ * @param dnaSeq
+ * @param proteinSeq
+ * @return
+ */
+ protected MapList mapCdsToProtein(SequenceI dnaSeq, SequenceI proteinSeq)
{
+ SequenceFeature[] sfs = dnaSeq.getSequenceFeatures();
+ if (sfs == null)
+ {
+ return null;
+ }
+
+ List<int[]> ranges = new ArrayList<int[]>(50);
+ SequenceOntology so = SequenceOntology.getInstance();
+
+ int mappedDnaLength = 0;
+
+ /*
+ * Map CDS columns of dna to peptide. No need to worry about reverse strand
+ * dna here since the retrieved sequence is as transcribed (reverse
+ * complement for reverse strand), i.e in the same sense as the peptide.
+ */
+ for (SequenceFeature sf : sfs)
+ {
+ /*
+ * process a CDS feature (or a sub-type of CDS)
+ */
+ if (so.isA(sf.getType(), SequenceOntology.CDS))
+ {
+ ranges.add(new int[] { sf.getBegin(), sf.getEnd() });
+ mappedDnaLength += Math.abs(sf.getEnd() - sf.getBegin()) + 1;
+ }
+ }
+ int proteinLength = proteinSeq.getLength();
+ List<int[]> proteinRange = new ArrayList<int[]>();
+ proteinRange.add(new int[] { 1, proteinLength });
+
+ /*
+ * dna length should map to protein (or protein minus stop codon)
+ */
+ if (mappedDnaLength == 3 * proteinLength
+ || mappedDnaLength == 3 * (proteinLength + 1))
+ {
+ return new MapList(ranges, proteinRange, 3, 1);
+ }
return null;
}
+ /**
+  * Retrieves the sequences for one batch of accession ids, parsing the
+  * response as Fasta, and merges them into the alignment passed in (or
+  * starts a new alignment if none was passed). Sequences without a
+  * description are given the database name as description; sequences whose
+  * name (or whose name with "ENSP" replaced by "ENST") is one of the queried
+  * ids get an ENSEMBL database reference.
+  *
+  * @param ids
+  *          the accession ids to fetch
+  * @param alignment
+  *          the alignment to extend, or null to create one
+  * @return the extended (or created) alignment; the argument unchanged if no
+  *         sequences were retrieved
+  * @throws JalviewException
+  *           if the Ensembl REST service is not available
+  * @throws IOException
+  *           if the request or its parsing fails
+  */
+ protected AlignmentI fetchSequences(List<String> ids, AlignmentI alignment)
+         throws JalviewException, IOException
+ {
+   if (!isEnsemblAvailable())
+   {
+     inProgress = false;
+     throw new JalviewException("ENSEMBL Rest API not available.");
+   }
+
+   FastaFile fasta = new FastaFile(getSequenceReader(ids));
+   if (fasta.hasWarningMessage())
+   {
+     String warning = String.format(
+             "Warning when retrieving %d ids %s\n%s", ids.size(),
+             ids.toString(), fasta.getWarningMessage());
+     System.out.println(warning);
+   }
+   else if (fasta.getSeqs().size() != ids.size())
+   {
+     String shortfall = String.format(
+             "Only retrieved %d sequences for %d query strings", fasta
+                     .getSeqs().size(), ids.size());
+     System.out.println(shortfall);
+   }
+
+   if (!(fasta.getSeqs().size() > 0))
+   {
+     // nothing came back for this batch
+     return alignment;
+   }
+
+   AlignmentI fetched = new Alignment(fasta.getSeqsAsArray());
+   for (SequenceI seq : fetched.getSequences())
+   {
+     if (seq.getDescription() == null)
+     {
+       seq.setDescription(getDbName());
+     }
+     String name = seq.getName();
+     boolean wasQueried = ids.contains(name)
+             || ids.contains(name.replace("ENSP", "ENST"));
+     if (wasQueried)
+     {
+       DBRefUtils.parseToDbRef(seq, DBRefSource.ENSEMBL, "0", name);
+     }
+   }
+
+   if (alignment == null)
+   {
+     return fetched;
+   }
+   alignment.append(fetched);
+   return alignment;
+ }
+
+ /**
+ * Returns the URL for the REST call
+ *
+ * @return
+ * @throws MalformedURLException
+ */
@Override
- public int getTier()
+ protected URL getUrl(List<String> ids) throws MalformedURLException
{
- return 0;
+ // ids are not used - they go in the POST body instead
+ StringBuffer urlstring = new StringBuffer(128);
+ urlstring.append(SEQUENCE_ID_URL);
+
+ // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats
+ urlstring.append("?type=").append(getSourceEnsemblType().getType());
+ urlstring.append(("&Accept=text/x-fasta"));
+
+ URL url = new URL(urlstring.toString());
+ return url;
}
/**
}
@Override
- public boolean isDnaCoding()
+ public boolean useGetRequest()
+ {
+ return false;
+ }
+
+ @Override
+ public String getRequestMimeType()
+ {
+ return "application/json";
+ }
+
+ @Override
+ public String getResponseMimeType()
+ {
+ return "text/x-fasta";
+ }
+
+ /**
+ *
+ * @return the configured sequence return type for this source
+ */
+ protected abstract EnsemblSeqType getSourceEnsemblType();
+
+ /**
+  * Returns a mapping from the [start, end] genomic ranges corresponding to
+  * the sequence being retrieved, to the range [1, total mapped length].
+  *
+  * The correspondence between the frames of reference is made by locating
+  * those features on the genomic sequence which identify the retrieved
+  * sequence. Specifically
+  * <ul>
+  * <li>genomic sequence is identified by "transcript" features with
+  * ID=transcript:transcriptId</li>
+  * <li>cdna sequence is identified by "exon" features with
+  * Parent=transcript:transcriptId</li>
+  * <li>cds sequence is identified by "CDS" features with
+  * Parent=transcript:transcriptId</li>
+  * </ul>
+  *
+  * The genomic ranges are sorted to run forwards (for positive strand) or
+  * backwards (for negative strand). Aborts and returns null if both positive
+  * and negative strand are found (this should not normally happen), or if
+  * there are no features to inspect.
+  *
+  * @param sfs
+  *          the features of the genomic sequence; may be null
+  * @param accId
+  *          the accession of the sequence being retrieved
+  * @return the mapping, or null if it cannot be determined
+  */
+ protected MapList getGenomicRanges(SequenceFeature[] sfs, String accId)
+ {
+   if (sfs == null)
+   {
+     // no features at all, so nothing locates the retrieved sequence
+     return null;
+   }
+
+   /*
+    * generously size for initial number of cds regions
+    * (worst case titin Q8WZ42 has c. 313 exons)
+    */
+   List<int[]> regions = new ArrayList<int[]>(100);
+   int mappedLength = 0;
+   int direction = 1; // forward
+   boolean directionSet = false;
+
+   for (SequenceFeature sf : sfs)
+   {
+     /*
+      * accept the target feature type or a specialisation of it
+      * (e.g. coding_exon for exon)
+      */
+     if (identifiesSequence(sf, accId))
+     {
+       int strand = sf.getStrand();
+
+       if (directionSet && strand != direction)
+       {
+         // abort - mix of forward and backward
+         System.err.println("Error: forward and backward strand for "
+                 + accId);
+         return null;
+       }
+       direction = strand;
+       directionSet = true;
+
+       /*
+        * add to the ranges, semi-sorted forwards/backwards
+        */
+       if (strand < 0)
+       {
+         regions.add(0, new int[] { sf.getEnd(), sf.getBegin() });
+       }
+       else
+       {
+         regions.add(new int[] { sf.getBegin(), sf.getEnd() });
+       }
+
+       /*
+        * length of an inclusive range is |end - begin| + 1; the +1 must
+        * be outside the abs() or a range with end < begin is miscounted
+        */
+       mappedLength += Math.abs(sf.getEnd() - sf.getBegin()) + 1;
+     }
+   }
+
+   /*
+    * a final sort is needed since Ensembl returns CDS sorted within source
+    * (havana / ensembl_havana)
+    */
+   Collections.sort(regions, new RangeSorter(direction == 1));
+
+   List<int[]> to = new ArrayList<int[]>();
+   to.add(new int[] { 1, mappedLength });
+
+   return new MapList(regions, to, 1, 1);
+ }
+
+ /**
+ * Returns true if the sequence feature identifies positions of the genomic
+ * sequence feature which are within the sequence being retrieved.
+ *
+ * @param sf
+ * @param accId
+ * @return
+ */
+ protected abstract boolean identifiesSequence(SequenceFeature sf,
+ String accId);
+
+ /**
+  * Copies one genomic feature onto the target sequence, with its begin and
+  * end converted through the 'overlap' mapping and shifted by the target
+  * sequence's start position. Nothing is added if the feature has a
+  * "Parent" attribute that does not contain the target sequence's name, or
+  * if the mapping cannot locate the feature's range.
+  *
+  * @param sf
+  *          the genomic feature to copy
+  * @param targetSequence
+  *          the sequence to receive the copy
+  * @param overlap
+  *          mapping from genomic ranges to the target sequence
+  */
+ protected void transferFeature(SequenceFeature sf,
+         SequenceI targetSequence, MapList overlap)
+ {
+   String parent = (String) sf.getValue("Parent");
+   boolean otherTranscript = parent != null
+           && !parent.contains(targetSequence.getName());
+   if (otherTranscript)
+   {
+     // this genomic feature belongs to a different transcript
+     return;
+   }
+
+   int[] mappedRange = overlap.locateInTo(sf.getBegin(), sf.getEnd());
+   if (mappedRange == null)
+   {
+     return;
+   }
+
+   SequenceFeature copy = new SequenceFeature(sf);
+   int offset = targetSequence.getStart() - 1;
+   int low = Math.min(mappedRange[0], mappedRange[1]);
+   int high = Math.max(mappedRange[0], mappedRange[1]);
+   copy.setBegin(offset + low);
+   copy.setEnd(offset + high);
+   targetSequence.addSequenceFeature(copy);
+ }
+
+ /**
+  * Transfers features from sourceSequence to targetSequence. Only features
+  * accepted by retainFeature() are offered for transfer. Does nothing if
+  * either sequence is null, if the source has no features, or if the genomic
+  * ranges of the target cannot be determined.
+  *
+  * @param accessionId
+  *          the accession of the sequence being retrieved
+  * @param sourceSequence
+  *          the genomic sequence holding the features
+  * @param targetSequence
+  *          the retrieved sequence to receive the features
+  */
+ protected void transferFeatures(String accessionId,
+         SequenceI sourceSequence, SequenceI targetSequence)
+ {
+   if (sourceSequence == null || targetSequence == null)
+   {
+     return;
+   }
+
+   SequenceFeature[] sfs = sourceSequence.getSequenceFeatures();
+   if (sfs == null)
+   {
+     // nothing to transfer
+     return;
+   }
+
+   MapList overlap = getGenomicRanges(sfs, accessionId);
+   if (overlap == null)
+   {
+     // ranges could not be resolved (e.g. mixed strands, already reported)
+     return;
+   }
+
+   final boolean forwardStrand = overlap.isFromForwardStrand();
+
+   /*
+    * sort features by start position (descending if reverse strand)
+    * before transferring (in forwards order) to the target sequence;
+    * note the sort is applied in place to the array obtained above
+    */
+   Arrays.sort(sfs, new Comparator<SequenceFeature>()
+   {
+     @Override
+     public int compare(SequenceFeature o1, SequenceFeature o2)
+     {
+       int c = Integer.compare(o1.getBegin(), o2.getBegin());
+       return forwardStrand ? c : -c;
+     }
+   });
+
+   for (SequenceFeature sf : sfs)
+   {
+     if (retainFeature(sf.getType()))
+     {
+       transferFeature(sf, targetSequence, overlap);
+     }
+   }
+ }
+
+ /**
+ * Answers true if the feature type is one to attach to the retrieved sequence
+ *
+ * @param type
+ * @return
+ */
+ protected boolean retainFeature(@SuppressWarnings("unused") String type)
{
- return true;
+ return true; // default is to keep all
}
}
--- /dev/null
+package jalview.ext.ensembl;
+
+import jalview.datamodel.DBRefSource;
+import jalview.ws.seqfetcher.DbSourceProxyImpl;
+
+import com.stevesoft.pat.Regex;
+
+/**
+ * A base class for Ensembl sequence fetchers. Supplies the values shared by
+ * the Ensembl clients: database source and version, accession separator and
+ * validator, tier, default test query and the DNA-coding flag.
+ *
+ * @author gmcarstairs
+ *
+ */
+public abstract class EnsemblSequenceFetcher extends DbSourceProxyImpl
+{
+ /*
+ * possible values for the 'feature' parameter of the REST overlap endpoint
+ * @see http://rest.ensembl.org/documentation/info/overlap_id
+ */
+ protected enum EnsemblFeatureType
+ {
+ gene, transcript, cds, exon, repeat, simple, misc, variation,
+ somatic_variation, structural_variation, somatic_structural_variation,
+ constrained, regulatory
+ }
+
+ @Override
+ public String getDbSource()
+ {
+ // NB ensure Uniprot xrefs are canonicalised from "Ensembl" to "ENSEMBL"
+ return DBRefSource.ENSEMBL; // "ENSEMBL"
+ }
+
+ @Override
+ public String getDbVersion()
+ {
+ return "0"; // fixed placeholder version
+ }
+
+ @Override
+ public String getAccessionSeparator()
+ {
+ return " "; // a single space
+ }
+
+ @Override
+ public Regex getAccessionValidator()
+ {
+ return new Regex("((ENSP|ENST|ENSG|CCDS)[0-9.]{3,})"); // a new Regex is built on every call
+ }
+
+ @Override
+ public boolean isValidReference(String accession)
+ {
+ return getAccessionValidator().search(accession); // search() result on the validator pattern
+ }
+
+ @Override
+ public int getTier()
+ {
+ return 0;
+ }
+
+ /**
+ * Default test query is a transcript
+ */
+ @Override
+ public String getTestQuery()
+ {
+ // has CDS on reverse strand:
+ return "ENST00000288602";
+ // ENST00000461457 // forward strand
+ }
+
+ @Override
+ public boolean isDnaCoding()
+ {
+ return true; // default for this base class
+ }
+}
+++ /dev/null
-package jalview.ext.ensembl;
-
-import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
-
-/**
- * Sequence fetcher that presents itself as "ENSEMBL (Protein)" and requests
- * the PROTEIN sequence type
- */
-public class EnsemblTranscript extends EnsemblSeqProxy
-{
-
- // TODO is this class needed? it seems to duplicate EnsemblProtein
- public EnsemblTranscript()
- {
- // superclass no-argument constructor is invoked implicitly
- }
-
- @Override
- protected EnsemblSeqType getSourceEnsemblType()
- {
- return EnsemblSeqType.PROTEIN;
- }
-
- @Override
- public String getDbName()
- {
- return "ENSEMBL (Protein)";
- }
-
-}
+++ /dev/null
-package jalview.ext.ensembl;
-
-import jalview.io.FileParse;
-
-import java.io.BufferedReader;
-import java.io.DataOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.net.HttpURLConnection;
-import java.net.URL;
-import java.net.URLConnection;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.http.NameValuePair;
-import org.apache.http.message.BasicNameValuePair;
-
-public class SeqFetcher
-{
- private final static String ENSEMBL_REST = "rest.ensembl.org";
-
- private static final String SEQUENCE_ID_URL = "http://" + ENSEMBL_REST + "/sequence/id";
-
- private static final String PING_URL = "http://" + ENSEMBL_REST + "/info/ping";
-
- private final static long RETEST_INTERVAL = 10000L; // 10 seconds
-
- private static boolean ensemblRestAvailable = false;
-
- private static long lastCheck = -1;
-
- /**
- * Rechecks if Ensembl is responding, unless the last check was successful and
- * the retest interval has not yet elapsed. Returns true if Ensembl is up,
- * else false.
- *
- * @return
- */
- public boolean isEnsemblAvailable()
- {
- long now = System.currentTimeMillis();
- boolean retest = now - lastCheck > RETEST_INTERVAL;
- if (ensemblRestAvailable && !retest)
- {
- return true;
- }
- ensemblRestAvailable = checkEnsembl();
- lastCheck = now;
- return ensemblRestAvailable;
- }
-
- /**
- * Tries to connect to Ensembl's REST 'ping' endpoint, and returns true if
- * successful, else false
- *
- * @return
- */
- private boolean checkEnsembl()
- {
- try
- {
- URL ping = new URL(PING_URL);
- HttpURLConnection conn = (HttpURLConnection) ping.openConnection();
- int rc = conn.getResponseCode();
- conn.disconnect();
- if (rc >= 200 && rc < 300)
- {
- return true;
- }
- } catch (Throwable t)
- {
- System.err.println("Error connecting to " + PING_URL + ": "
- + t.getMessage());
- }
- return false;
- }
-
- public SeqFetcher()
- {
- }
-
- public enum EnsemblSeqType
- {
- GENOMIC("genomic"), CDS("cds"), TRANSCRIPT("cds"), PROTEIN("protein"), CDNA(
- "cdna");
-
- private String type;
-
- EnsemblSeqType(String t)
- {
- type = t;
- }
-
- public String getType()
- {
- return type;
- }
- }
-
- /**
- * Returns a list of additional URL query parameters to specify the desired
- * sequence type (genomic/cds/protein etc), and data format Fasta
- *
- * @param type
- */
- public List<NameValuePair> getAdditionalParameters(EnsemblSeqType type)
- {
- List<NameValuePair> params = new ArrayList<NameValuePair>();
- params.add(new BasicNameValuePair("type", type.getType()));
- params.add(new BasicNameValuePair("content-type", "text/x-fasta"));
- return params;
- }
-
- /**
- * return a reader to a Fasta response from the Ensembl sequence endpoint
- *
- * @param returnType
- * @param ids
- * @return
- * @throws IOException
- */
- public FileParse getSequenceReader(EnsemblSeqType returnType,
- List<String> ids) throws IOException
- {
- // see http://rest.ensembl.org/documentation/info/sequence_id
-
- String urlstring = SEQUENCE_ID_URL;
- List<NameValuePair> vals = getAdditionalParameters(returnType);
- boolean first = true;
- for (NameValuePair nvp : vals)
- {
- urlstring += first ? "?" : "&";
- first = false;
- urlstring += nvp.getName() + "=" + nvp.getValue();
- }
-
- URL url = new URL(urlstring);
-
- URLConnection connection = url.openConnection();
- HttpURLConnection httpConnection = (HttpURLConnection) connection;
-
- httpConnection.setRequestMethod("POST");
- httpConnection.setRequestProperty("Content-Type", "application/json");
- httpConnection.setRequestProperty("Accept", "text/x-fasta");
- byte[] thepostbody;
- {
- StringBuilder postBody = new StringBuilder();
- postBody.append("{\"ids\":[");
- first = true;
- for (String id : ids)
- {
- if (!first)
- {
- postBody.append(",");
- }
- first = false;
- postBody.append("\"");
- postBody.append(id.trim());
- postBody.append("\"");
- }
- postBody.append("]}");
- thepostbody = postBody.toString().getBytes();
- }
- httpConnection.setRequestProperty("Content-Length",
- Integer.toString(thepostbody.length));
- httpConnection.setUseCaches(false);
- httpConnection.setDoInput(true);
- httpConnection.setDoOutput(true);
-
- DataOutputStream wr = new DataOutputStream(
- httpConnection.getOutputStream());
- wr.write(thepostbody);
- wr.flush();
- wr.close();
-
- InputStream response = connection.getInputStream();
- int responseCode = httpConnection.getResponseCode();
-
- if (responseCode != 200)
- {
- throw new RuntimeException(
- "Response code was not 200. Detected response was "
- + responseCode);
- }
-
- BufferedReader reader = null;
- reader = new BufferedReader(new InputStreamReader(response, "UTF-8"));
- FileParse fp = new FileParse(reader, url.toString(), "HTTP_POST");
- return fp;
- }
-
-}
--- /dev/null
+package jalview.ext.ensembl;
+
+import jalview.datamodel.AlignmentI;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.List;
+
+import org.testng.annotations.Test;
+
+/**
+ * Live check that the Ensembl REST service can be reached
+ */
+public class EnsemblRestClientTest
+{
+
+ /**
+ * Reports (without failing) whether Ensembl answered the availability check,
+ * using a stub client whose abstract methods are never exercised
+ */
+ @Test(suiteName = "live")
+ public void testLiveCheckEnsembl()
+ {
+ EnsemblRestClient client = new EnsemblRestClient()
+ {
+
+ @Override
+ public String getDbName()
+ {
+ return null;
+ }
+
+ @Override
+ public String getRequestMimeType()
+ {
+ return null;
+ }
+
+ @Override
+ public String getResponseMimeType()
+ {
+ return null;
+ }
+
+ @Override
+ public boolean useGetRequest()
+ {
+ return false;
+ }
+
+ @Override
+ protected URL getUrl(List<String> ids) throws MalformedURLException
+ {
+ return null;
+ }
+
+ @Override
+ public AlignmentI getSequenceRecords(String queries) throws Exception
+ {
+ return null;
+ }
+
+ };
+ if (!client.isEnsemblAvailable())
+ {
+ System.err
+ .println("Ensembl is DOWN or unreachable ******************* BAD!");
+ return;
+ }
+ System.out.println("Ensembl is UP!");
+ }
+
+}
package jalview.ext.ensembl;
+import jalview.datamodel.Alignment;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.SequenceI;
+import jalview.io.AppletFormatAdapter;
+import jalview.io.FastaFile;
+import jalview.io.FileParse;
+
import java.lang.reflect.Method;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.List;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
public class EnsemblSeqProxyTest
{
+ private static final Object[][] allSeqs = new Object[][] {
+ {
+ new EnsemblProtein(),
+ "CCDS5863.1",
+ ">CCDS5863.1\n"
+ + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
+ + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
+ + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
+ + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
+ + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
+ + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
+ + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
+ + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
+ + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
+ + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
+ + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
+ + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
+ + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH\n" },
+ {
+ new EnsemblCdna(),
+ "CCDS5863.1",
+ ">CCDS5863.1\n"
+ + "ATGGCGGCGCTGAGCGGTGGCGGTGGTGGCGGCGCGGAGCCGGGCCAGGCTCTGTTCAAC\n"
+ + "GGGGACATGGAGCCCGAGGCCGGCGCCGGCGCCGGCGCCGCGGCCTCTTCGGCTGCGGAC\n"
+ + "CCTGCCATTCCGGAGGAGGTGTGGAATATCAAACAAATGATTAAGTTGACACAGGAACAT\n"
+ + "ATAGAGGCCCTATTGGACAAATTTGGTGGGGAGCATAATCCACCATCAATATATCTGGAG\n"
+ + "GCCTATGAAGAATACACCAGCAAGCTAGATGCACTCCAACAAAGAGAACAACAGTTATTG\n"
+ + "GAATCTCTGGGGAACGGAACTGATTTTTCTGTTTCTAGCTCTGCATCAATGGATACCGTT\n"
+ + "ACATCTTCTTCCTCTTCTAGCCTTTCAGTGCTACCTTCATCTCTTTCAGTTTTTCAAAAT\n"
+ + "CCCACAGATGTGGCACGGAGCAACCCCAAGTCACCACAAAAACCTATCGTTAGAGTCTTC\n"
+ + "CTGCCCAACAAACAGAGGACAGTGGTACCTGCAAGGTGTGGAGTTACAGTCCGAGACAGT\n"
+ + "CTAAAGAAAGCACTGATGATGAGAGGTCTAATCCCAGAGTGCTGTGCTGTTTACAGAATT\n"
+ + "CAGGATGGAGAGAAGAAACCAATTGGTTGGGACACTGATATTTCCTGGCTTACTGGAGAA\n"
+ + "GAATTGCATGTGGAAGTGTTGGAGAATGTTCCACTTACAACACACAACTTTGTACGAAAA\n"
+ + "ACGTTTTTCACCTTAGCATTTTGTGACTTTTGTCGAAAGCTGCTTTTCCAGGGTTTCCGC\n"
+ + "TGTCAAACATGTGGTTATAAATTTCACCAGCGTTGTAGTACAGAAGTTCCACTGATGTGT\n"
+ + "GTTAATTATGACCAACTTGATTTGCTGTTTGTCTCCAAGTTCTTTGAACACCACCCAATA\n"
+ + "CCACAGGAAGAGGCGTCCTTAGCAGAGACTGCCCTAACATCTGGATCATCCCCTTCCGCA\n"
+ + "CCCGCCTCGGACTCTATTGGGCCCCAAATTCTCACCAGTCCGTCTCCTTCAAAATCCATT\n"
+ + "CCAATTCCACAGCCCTTCCGACCAGCAGATGAAGATCATCGAAATCAATTTGGGCAACGA\n"
+ + "GACCGATCCTCATCAGCTCCCAATGTGCATATAAACACAATAGAACCTGTCAATATTGAT\n"
+ + "GACTTGATTAGAGACCAAGGATTTCGTGGTGATGGAGGATCAACCACAGGTTTGTCTGCT\n"
+ + "ACCCCCCCTGCCTCATTACCTGGCTCACTAACTAACGTGAAAGCCTTACAGAAATCTCCA\n"
+ + "GGACCTCAGCGAGAAAGGAAGTCATCTTCATCCTCAGAAGACAGGAATCGAATGAAAACA\n"
+ + "CTTGGTAGACGGGACTCGAGTGATGATTGGGAGATTCCTGATGGGCAGATTACAGTGGGA\n"
+ + "CAAAGAATTGGATCTGGATCATTTGGAACAGTCTACAAGGGAAAGTGGCATGGTGATGTG\n"
+ + "GCAGTGAAAATGTTGAATGTGACAGCACCTACACCTCAGCAGTTACAAGCCTTCAAAAAT\n"
+ + "GAAGTAGGAGTACTCAGGAAAACACGACATGTGAATATCCTACTCTTCATGGGCTATTCC\n"
+ + "ACAAAGCCACAACTGGCTATTGTTACCCAGTGGTGTGAGGGCTCCAGCTTGTATCACCAT\n"
+ + "CTCCATATCATTGAGACCAAATTTGAGATGATCAAACTTATAGATATTGCACGACAGACT\n"
+ + "GCACAGGGCATGGATTACTTACACGCCAAGTCAATCATCCACAGAGACCTCAAGAGTAAT\n"
+ + "AATATATTTCTTCATGAAGACCTCACAGTAAAAATAGGTGATTTTGGTCTAGCTACAGTG\n"
+ + "AAATCTCGATGGAGTGGGTCCCATCAGTTTGAACAGTTGTCTGGATCCATTTTGTGGATG\n"
+ + "GCACCAGAAGTCATCAGAATGCAAGATAAAAATCCATACAGCTTTCAGTCAGATGTATAT\n"
+ + "GCATTTGGAATTGTTCTGTATGAATTGATGACTGGACAGTTACCTTATTCAAACATCAAC\n"
+ + "AACAGGGACCAGATAATTTTTATGGTGGGACGAGGATACCTGTCTCCAGATCTCAGTAAG\n"
+ + "GTACGGAGTAACTGTCCAAAAGCCATGAAGAGATTAATGGCAGAGTGCCTCAAAAAGAAA\n"
+ + "AGAGATGAGAGACCACTCTTTCCCCAAATTCTCGCCTCTATTGAGCTGCTGGCCCGCTCA\n"
+ + "TTGCCAAAAATTCACCGCAGTGCATCAGAACCCTCCTTGAATCGGGCTGGTTTCCAAACA\n"
+ + "GAGGATTTTAGTCTATATGCTTGTGCTTCTCCAAAAACACCCATCCAGGCAGGGGGATAT\n"
+ + "GGTGCGTTTCCTGTCCACTGA\n" },
+ {
+ new EnsemblProtein(),
+ "ENSP00000288602",
+ ">ENSP00000288602\n"
+ + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
+ + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
+ + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
+ + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
+ + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
+ + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
+ + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
+ + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
+ + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
+ + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
+ + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
+ + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
+ + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH" } };
+
/**
 * Supplies the "queries" data set used by testIsValidReference: one CCDS
 * accession and one ENSP accession, each as a single-argument row
 */
@DataProvider(name = "queries")
- public Object[][] createData(Method m)
+ public Object[][] createQueryData(Method m)
{
return new Object[][] { { "CCDS5863.1" }, { "ENSP00000288602" } };
}
/**
 * Verifies that each accession from the "queries" data provider is accepted
 * by isValidReference on an EnsemblProtein fetcher; the failure message
 * quotes the accession validator's regular expression
 */
@Test(dataProvider = "queries")
public void testIsValidReference(String query) throws Exception
{
- EnsemblSeqProxy esq = new EnsemblProtein();
+ EnsemblSequenceFetcher esq = new EnsemblProtein();
Assert.assertTrue(esq.isValidReference(query),
"Expected reference string " + query
+ " to be valid for regex "
+ esq.getAccessionValidator().toString());
}
+
+ /**
+ * Supplies the "ens_seqs" data set (fetcher, accession, expected Fasta),
+ * first printing the name of the test method that asked for it
+ */
+ @DataProvider(name = "ens_seqs")
+ public Object[][] createData(Method m)
+ {
+ final String requestingTest = m.getName();
+ System.out.println(requestingTest);
+ return allSeqs;
+ }
+
+ /**
+ * Live test: fetches one accession through the given Ensembl client, and
+ * checks that the sequences returned match the expected Fasta by count, name
+ * and residues
+ */
+ @Test(dataProvider = "ens_seqs", suiteName = "live")
+ public void testGetOneSeqs(EnsemblRestClient proxy, String sq, String fastasq)
+ throws Exception
+ {
+ List<String> query = Arrays.asList(new String[] { sq });
+ FileParse fetched = proxy.getSequenceReader(query);
+ SequenceI[] sqs = new FastaFile(fetched).getSeqsAsArray();
+ SequenceI[] expected = new FastaFile(fastasq, AppletFormatAdapter.PASTE)
+ .getSeqsAsArray();
+ Assert.assertEquals(sqs.length, expected.length,
+ "Different number of sequences retrieved for query " + sq);
+
+ Alignment retrieved = new Alignment(sqs);
+ for (int i = 0; i < expected.length; i++)
+ {
+ SequenceI wanted = expected[i];
+ SequenceI[] matches = retrieved.findSequenceMatch(wanted.getName());
+ Assert.assertNotNull(matches,
+ "Couldn't find sequences matching expected sequence "
+ + wanted.getName());
+ Assert.assertEquals(matches.length, 1,
+ "Expected only one sequence for sequence ID " + wanted.getName());
+
+ StringBuilder mismatch = new StringBuilder("Sequences differ for ");
+ mismatch.append(wanted.getName()).append("\n");
+ mismatch.append("Exp:").append(wanted.getSequenceAsString()).append("\n");
+ mismatch.append("Got:").append(matches[0].getSequenceAsString());
+ Assert.assertEquals(matches[0].getSequenceAsString(),
+ wanted.getSequenceAsString(), mismatch.toString());
+ }
+ }
+
+ /**
+ * Prints (without failing) whether Ensembl answered the availability check,
+ * using a stub client whose abstract methods are never exercised
+ */
+ @Test(suiteName = "live")
+ public void testLiveCheckEnsembl()
+ {
+ EnsemblRestClient client = new EnsemblRestClient()
+ {
+
+ @Override
+ public String getDbName()
+ {
+ // not needed for the availability check
+ return null;
+ }
+
+ @Override
+ public String getRequestMimeType()
+ {
+ // not needed for the availability check
+ return null;
+ }
+
+ @Override
+ public String getResponseMimeType()
+ {
+ // not needed for the availability check
+ return null;
+ }
+
+ @Override
+ public boolean useGetRequest()
+ {
+ // not needed for the availability check
+ return false;
+ }
+
+ @Override
+ protected URL getUrl(List<String> ids) throws MalformedURLException
+ {
+ // not needed for the availability check
+ return null;
+ }
+
+ @Override
+ public AlignmentI getSequenceRecords(String queries) throws Exception
+ {
+ // not needed for the availability check
+ return null;
+ }
+
+ };
+ String status;
+ if (client.isEnsemblAvailable())
+ {
+ status = "UP!";
+ }
+ else
+ {
+ status = "DOWN or unreachable ******************* BAD!";
+ }
+ System.out.println("Ensembl is " + status);
+ }
}
\ No newline at end of file
+++ /dev/null
-package jalview.ext.ensembl;
-
-import jalview.datamodel.Alignment;
-import jalview.datamodel.SequenceI;
-import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
-import jalview.io.AppletFormatAdapter;
-import jalview.io.FastaFile;
-import jalview.io.FileParse;
-
-import java.lang.reflect.Method;
-import java.util.Arrays;
-
-import org.testng.Assert;
-import org.testng.annotations.DataProvider;
-import org.testng.annotations.Test;
-
-public class SeqFetcherTest
-{
- private static final Object[][] allSeqs = new Object[][] {
- {
- EnsemblSeqType.PROTEIN,
- "CCDS5863.1",
- ">CCDS5863.1\n"
- + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
- + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
- + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
- + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
- + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
- + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
- + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
- + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
- + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
- + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
- + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
- + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
- + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH\n" },
- {
- EnsemblSeqType.TRANSCRIPT,
- "CCDS5863.1",
- ">CCDS5863.1\n"
- + "ATGGCGGCGCTGAGCGGTGGCGGTGGTGGCGGCGCGGAGCCGGGCCAGGCTCTGTTCAAC\n"
- + "GGGGACATGGAGCCCGAGGCCGGCGCCGGCGCCGGCGCCGCGGCCTCTTCGGCTGCGGAC\n"
- + "CCTGCCATTCCGGAGGAGGTGTGGAATATCAAACAAATGATTAAGTTGACACAGGAACAT\n"
- + "ATAGAGGCCCTATTGGACAAATTTGGTGGGGAGCATAATCCACCATCAATATATCTGGAG\n"
- + "GCCTATGAAGAATACACCAGCAAGCTAGATGCACTCCAACAAAGAGAACAACAGTTATTG\n"
- + "GAATCTCTGGGGAACGGAACTGATTTTTCTGTTTCTAGCTCTGCATCAATGGATACCGTT\n"
- + "ACATCTTCTTCCTCTTCTAGCCTTTCAGTGCTACCTTCATCTCTTTCAGTTTTTCAAAAT\n"
- + "CCCACAGATGTGGCACGGAGCAACCCCAAGTCACCACAAAAACCTATCGTTAGAGTCTTC\n"
- + "CTGCCCAACAAACAGAGGACAGTGGTACCTGCAAGGTGTGGAGTTACAGTCCGAGACAGT\n"
- + "CTAAAGAAAGCACTGATGATGAGAGGTCTAATCCCAGAGTGCTGTGCTGTTTACAGAATT\n"
- + "CAGGATGGAGAGAAGAAACCAATTGGTTGGGACACTGATATTTCCTGGCTTACTGGAGAA\n"
- + "GAATTGCATGTGGAAGTGTTGGAGAATGTTCCACTTACAACACACAACTTTGTACGAAAA\n"
- + "ACGTTTTTCACCTTAGCATTTTGTGACTTTTGTCGAAAGCTGCTTTTCCAGGGTTTCCGC\n"
- + "TGTCAAACATGTGGTTATAAATTTCACCAGCGTTGTAGTACAGAAGTTCCACTGATGTGT\n"
- + "GTTAATTATGACCAACTTGATTTGCTGTTTGTCTCCAAGTTCTTTGAACACCACCCAATA\n"
- + "CCACAGGAAGAGGCGTCCTTAGCAGAGACTGCCCTAACATCTGGATCATCCCCTTCCGCA\n"
- + "CCCGCCTCGGACTCTATTGGGCCCCAAATTCTCACCAGTCCGTCTCCTTCAAAATCCATT\n"
- + "CCAATTCCACAGCCCTTCCGACCAGCAGATGAAGATCATCGAAATCAATTTGGGCAACGA\n"
- + "GACCGATCCTCATCAGCTCCCAATGTGCATATAAACACAATAGAACCTGTCAATATTGAT\n"
- + "GACTTGATTAGAGACCAAGGATTTCGTGGTGATGGAGGATCAACCACAGGTTTGTCTGCT\n"
- + "ACCCCCCCTGCCTCATTACCTGGCTCACTAACTAACGTGAAAGCCTTACAGAAATCTCCA\n"
- + "GGACCTCAGCGAGAAAGGAAGTCATCTTCATCCTCAGAAGACAGGAATCGAATGAAAACA\n"
- + "CTTGGTAGACGGGACTCGAGTGATGATTGGGAGATTCCTGATGGGCAGATTACAGTGGGA\n"
- + "CAAAGAATTGGATCTGGATCATTTGGAACAGTCTACAAGGGAAAGTGGCATGGTGATGTG\n"
- + "GCAGTGAAAATGTTGAATGTGACAGCACCTACACCTCAGCAGTTACAAGCCTTCAAAAAT\n"
- + "GAAGTAGGAGTACTCAGGAAAACACGACATGTGAATATCCTACTCTTCATGGGCTATTCC\n"
- + "ACAAAGCCACAACTGGCTATTGTTACCCAGTGGTGTGAGGGCTCCAGCTTGTATCACCAT\n"
- + "CTCCATATCATTGAGACCAAATTTGAGATGATCAAACTTATAGATATTGCACGACAGACT\n"
- + "GCACAGGGCATGGATTACTTACACGCCAAGTCAATCATCCACAGAGACCTCAAGAGTAAT\n"
- + "AATATATTTCTTCATGAAGACCTCACAGTAAAAATAGGTGATTTTGGTCTAGCTACAGTG\n"
- + "AAATCTCGATGGAGTGGGTCCCATCAGTTTGAACAGTTGTCTGGATCCATTTTGTGGATG\n"
- + "GCACCAGAAGTCATCAGAATGCAAGATAAAAATCCATACAGCTTTCAGTCAGATGTATAT\n"
- + "GCATTTGGAATTGTTCTGTATGAATTGATGACTGGACAGTTACCTTATTCAAACATCAAC\n"
- + "AACAGGGACCAGATAATTTTTATGGTGGGACGAGGATACCTGTCTCCAGATCTCAGTAAG\n"
- + "GTACGGAGTAACTGTCCAAAAGCCATGAAGAGATTAATGGCAGAGTGCCTCAAAAAGAAA\n"
- + "AGAGATGAGAGACCACTCTTTCCCCAAATTCTCGCCTCTATTGAGCTGCTGGCCCGCTCA\n"
- + "TTGCCAAAAATTCACCGCAGTGCATCAGAACCCTCCTTGAATCGGGCTGGTTTCCAAACA\n"
- + "GAGGATTTTAGTCTATATGCTTGTGCTTCTCCAAAAACACCCATCCAGGCAGGGGGATAT\n"
- + "GGTGCGTTTCCTGTCCACTGA\n" },
- {
- EnsemblSeqType.PROTEIN,
- "ENSP00000288602",
- ">ENSP00000288602\n"
- + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
- + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
- + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
- + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
- + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
- + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
- + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
- + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
- + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
- + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
- + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
- + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
- + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH" } };
-
- /**
- * Supplies the "ens_seqs" data set (sequence type, accession, expected
- * Fasta), first printing the name of the test method that asked for it
- */
- @DataProvider(name = "ens_seqs")
- public Object[][] createData(Method m)
- {
- final String requestingTest = m.getName();
- System.out.println(requestingTest);
- return allSeqs;
- }
-
- /**
- * Live test: fetches one accession of the given type with a SeqFetcher, and
- * checks that the sequences returned match the expected Fasta by count, name
- * and residues
- */
- @Test(dataProvider = "ens_seqs", suiteName = "live")
- public void testGetOneSeqs(EnsemblSeqType type, String sq, String fastasq)
- throws Exception
- {
- FileParse fetched = new SeqFetcher().getSequenceReader(type,
- Arrays.asList(sq));
- SequenceI[] sqs = new FastaFile(fetched).getSeqsAsArray();
- SequenceI[] expected = new FastaFile(fastasq, AppletFormatAdapter.PASTE)
- .getSeqsAsArray();
- Assert.assertEquals(sqs.length, expected.length,
- "Different number of sequences retrieved for query " + sq);
-
- Alignment retrieved = new Alignment(sqs);
- for (int i = 0; i < expected.length; i++)
- {
- SequenceI wanted = expected[i];
- SequenceI[] matches = retrieved.findSequenceMatch(wanted.getName());
- Assert.assertNotNull(matches,
- "Couldn't find sequences matching expected sequence "
- + wanted.getName());
- Assert.assertEquals(matches.length, 1,
- "Expected only one sequence for sequence ID " + wanted.getName());
-
- StringBuilder mismatch = new StringBuilder("Sequences differ for ");
- mismatch.append(wanted.getName()).append("\n");
- mismatch.append("Exp:").append(wanted.getSequenceAsString()).append("\n");
- mismatch.append("Got:").append(matches[0].getSequenceAsString());
- Assert.assertEquals(matches[0].getSequenceAsString(),
- wanted.getSequenceAsString(), mismatch.toString());
- }
- }
-
- /**
- * Prints (without failing) whether Ensembl answered the availability check
- */
- @Test(suiteName = "live")
- public void testLiveCheckEnsembl()
- {
- String status;
- if (new SeqFetcher().isEnsemblAvailable())
- {
- status = "UP!";
- }
- else
- {
- status = "DOWN or unreachable ******************* BAD!";
- }
- System.out.println("Ensembl is " + status);
- }
- // TODO:
- // sequence query with ENSG and anything other than a genomic type will yield
- // sequences with different IDs which will
- // break the post-processing stage where DBRefs are assigned to sequences.
- // -> multiple_sequences = true is needed additional parameter
- // http://rest.ensembl.org/sequence/id/ENSG00000157764?content-type=text/x-json;type=protein;multiple_sequences=true
- // result with four transcripts, cds, cdna, and protein products.
- // *
- // features for ENSG -
- // http://rest.ensembl.org/overlap/id/ENSG00000157764?feature=cds&feature=exon&feature=transcript&content-type=text/x-gff3
- // transcript: gives locus, all transcript products with ENSG parents
- // gene: give all ENSG on locus
- // exon: all exon boundaries. CDS same info.
-
- // @Test(dataProvider = "ens_seqs", suiteName = "live")
- // public void testGetOneSeqs(EnsemblSeqType type, String sq, String fastasq)
- // throws Exception
- // {
- //
- // {
- // Assert.assertTrue(rseq[0].getDBRef() != null
- // && rseq[0].getDBRef().length > 0,
- // "No database references added to sequence by fetcher.");
- // Assert.assertNotNull(DBRefUtils.searchRefs(rseq[0].getDBRef(),
- // new DBRefEntry("ENSEMBL", null, sq)),
- // "Could't find database references added to sequence by fetcher.");
- //
- // }
-
-}
--- /dev/null
+package jalview.ext.jmol;
+
+import jalview.datamodel.Alignment;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceI;
+import jalview.gui.AlignFrame;
+import jalview.gui.SequenceRenderer;
+import jalview.structure.StructureMappingcommandSet;
+import jalview.structure.StructureSelectionManager;
+
+import org.testng.annotations.Test;
+
+/**
+ * Tests for JmolCommands
+ */
+public class JmolCommandsTest
+{
+
+ /**
+ * Requests colour by sequence commands for a two sequence alignment, with one
+ * structure file name per sequence and no structure mappings registered.
+ * Currently only checks that the call completes; nothing is asserted about
+ * the commands returned.
+ */
+ @Test(groups = { "Functional" })
+ public void testGetColourBySequenceCommand_noFeatures()
+ {
+ SequenceI first = new Sequence("seq1", "MHRSQTRALK");
+ SequenceI second = new Sequence("seq2", "MRLEITQSGD");
+ AlignmentI alignment = new Alignment(new SequenceI[] { first, second });
+ AlignFrame frame = new AlignFrame(alignment, 800, 500);
+ SequenceRenderer renderer = new SequenceRenderer(frame.getViewport());
+
+ /*
+ * one row of sequences per structure file
+ */
+ SequenceI[][] seqsPerFile = new SequenceI[2][];
+ seqsPerFile[0] = new SequenceI[] { first };
+ seqsPerFile[1] = new SequenceI[] { second };
+ String[] pdbFiles = { "seq1.pdb", "seq2.pdb" };
+ StructureSelectionManager manager = new StructureSelectionManager();
+
+ // need some mappings!
+
+ StructureMappingcommandSet[] commands = JmolCommands
+ .getColourBySequenceCommand(manager, pdbFiles, seqsPerFile, renderer,
+ null, alignment);
+ }
+}