From b03ec66ae6238b44bd20d2403d1157cadc5f0e01 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Thu, 21 Jan 2016 16:01:19 +0000 Subject: [PATCH] JAL-1705 reworked Ensembl clients now fetching and mapping features & peptide --- src/jalview/ext/ensembl/EnsemblCdna.java | 46 +- src/jalview/ext/ensembl/EnsemblCds.java | 50 +- src/jalview/ext/ensembl/EnsemblGenome.java | 49 +- src/jalview/ext/ensembl/EnsemblOverlap.java | 123 ++++ src/jalview/ext/ensembl/EnsemblProtein.java | 29 +- src/jalview/ext/ensembl/EnsemblRestClient.java | 215 +++++++ src/jalview/ext/ensembl/EnsemblSeqProxy.java | 617 ++++++++++++++++---- .../ext/ensembl/EnsemblSequenceFetcher.java | 80 +++ src/jalview/ext/ensembl/EnsemblTranscript.java | 26 - src/jalview/ext/ensembl/SeqFetcher.java | 193 ------ .../jalview/ext/ensembl/EnsemblRestClientTest.java | 69 +++ test/jalview/ext/ensembl/EnsemblSeqProxyTest.java | 188 +++++- test/jalview/ext/ensembl/SeqFetcherTest.java | 175 ------ test/jalview/ext/jmol/JmolCommandsTest.java | 34 ++ 14 files changed, 1387 insertions(+), 507 deletions(-) create mode 100644 src/jalview/ext/ensembl/EnsemblOverlap.java create mode 100644 src/jalview/ext/ensembl/EnsemblRestClient.java create mode 100644 src/jalview/ext/ensembl/EnsemblSequenceFetcher.java delete mode 100644 src/jalview/ext/ensembl/EnsemblTranscript.java delete mode 100644 src/jalview/ext/ensembl/SeqFetcher.java create mode 100644 test/jalview/ext/ensembl/EnsemblRestClientTest.java delete mode 100644 test/jalview/ext/ensembl/SeqFetcherTest.java create mode 100644 test/jalview/ext/jmol/JmolCommandsTest.java diff --git a/src/jalview/ext/ensembl/EnsemblCdna.java b/src/jalview/ext/ensembl/EnsemblCdna.java index 9c88b7c..b8c9c3f 100644 --- a/src/jalview/ext/ensembl/EnsemblCdna.java +++ b/src/jalview/ext/ensembl/EnsemblCdna.java @@ -1,11 +1,19 @@ package jalview.ext.ensembl; -import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType; +import jalview.datamodel.SequenceFeature; +import jalview.io.gff.SequenceOntology; import 
com.stevesoft.pat.Regex; public class EnsemblCdna extends EnsemblSeqProxy { + /* + * fetch exon features on genomic sequence (to identify the cdnaregions) + * and cds and variation features (to retain) + */ + private static final EnsemblFeatureType[] FEATURES_TO_FETCH = { + EnsemblFeatureType.exon, EnsemblFeatureType.cds, + EnsemblFeatureType.variation }; public EnsemblCdna() { @@ -31,9 +39,41 @@ public class EnsemblCdna extends EnsemblSeqProxy } @Override - public String getTestQuery() + protected EnsemblFeatureType[] getFeaturesToFetch() { - return "ENST00000288602"; + return FEATURES_TO_FETCH; + } + + /** + * Answers true unless the feature type is 'exon' (or a sub-type of exon in + * the Sequence Ontology). Exon features are only retrieved in order to + * identify the exon sequence range, and are redundant information on the exon + * sequence itself. + */ + @Override + protected boolean retainFeature(String type) + { + return !SequenceOntology.getInstance().isA(type, SequenceOntology.EXON); + } + + /** + * Answers true if the sequence feature type is 'exon' (or a subtype of exon + * in the Sequence Ontology), and the Parent of the feature is the transcript + * we are retrieving + */ + @Override + protected boolean identifiesSequence(SequenceFeature sf, String accId) + { + if (SequenceOntology.getInstance().isA(sf.getType(), + SequenceOntology.EXON)) + { + String parentFeature = (String) sf.getValue("Parent"); + if (("transcript:" + accId).equals(parentFeature)) + { + return true; + } + } + return false; } } diff --git a/src/jalview/ext/ensembl/EnsemblCds.java b/src/jalview/ext/ensembl/EnsemblCds.java index dc92348..897371d 100644 --- a/src/jalview/ext/ensembl/EnsemblCds.java +++ b/src/jalview/ext/ensembl/EnsemblCds.java @@ -1,10 +1,20 @@ package jalview.ext.ensembl; -import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType; +import jalview.datamodel.SequenceFeature; +import jalview.io.gff.SequenceOntology; public class EnsemblCds extends EnsemblSeqProxy { + /* + * 
fetch cds features on genomic sequence (to identify the CDS regions) + * and variation features (to retain) + */ + private static final EnsemblFeatureType[] FEATURES_TO_FETCH = { + EnsemblFeatureType.cds, EnsemblFeatureType.variation }; + /** + * Constructor + */ public EnsemblCds() { super(); @@ -22,4 +32,42 @@ public class EnsemblCds extends EnsemblSeqProxy return EnsemblSeqType.CDS; } + @Override + protected EnsemblFeatureType[] getFeaturesToFetch() + { + return FEATURES_TO_FETCH; + } + + /** + * Answers true unless the feature type is 'CDS' (or a sub-type of CDS in the + * Sequence Ontology). CDS features are only retrieved in order to identify + * the cds sequence range, and are redundant information on the cds sequence + * itself. + */ + @Override + protected boolean retainFeature(String type) + { + return !SequenceOntology.getInstance().isA(type, SequenceOntology.CDS); + } + + /** + * Answers true if the sequence feature type is 'CDS' (or a subtype of CDS in + * the Sequence Ontology), and the Parent of the feature is the transcript we + * are retrieving + */ + @Override + protected boolean identifiesSequence(SequenceFeature sf, String accId) + { + if (SequenceOntology.getInstance().isA(sf.getType(), + SequenceOntology.CDS)) + { + String parentFeature = (String) sf.getValue("Parent"); + if (("transcript:" + accId).equals(parentFeature)) + { + return true; + } + } + return false; + } + } diff --git a/src/jalview/ext/ensembl/EnsemblGenome.java b/src/jalview/ext/ensembl/EnsemblGenome.java index 39dfac0..6b4a1f6 100644 --- a/src/jalview/ext/ensembl/EnsemblGenome.java +++ b/src/jalview/ext/ensembl/EnsemblGenome.java @@ -1,9 +1,17 @@ package jalview.ext.ensembl; -import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType; +import jalview.datamodel.SequenceFeature; +import jalview.io.gff.SequenceOntology; public class EnsemblGenome extends EnsemblSeqProxy { + /* + * fetch transcript features on genomic sequence (to identify the transcript + * regions) and cds, exon and 
variation features (to retain) + */ + private static final EnsemblFeatureType[] FEATURES_TO_FETCH = { + EnsemblFeatureType.transcript, EnsemblFeatureType.exon, + EnsemblFeatureType.cds, EnsemblFeatureType.variation }; public EnsemblGenome() { @@ -22,4 +30,43 @@ public class EnsemblGenome extends EnsemblSeqProxy return EnsemblSeqType.GENOMIC; } + @Override + protected EnsemblFeatureType[] getFeaturesToFetch() + { + return FEATURES_TO_FETCH; + } + + /** + * Answers true unless the feature type is 'transcript' (or a sub-type of + * transcript in the Sequence Ontology). Transcript features are only + * retrieved in order to identify the transcript sequence range, and are + * redundant information on the transcript sequence itself. + */ + @Override + protected boolean retainFeature(String type) + { + return !SequenceOntology.getInstance().isA(type, + SequenceOntology.TRANSCRIPT); + } + + /** + * Answers true if the sequence feature type is 'transcript' (or a subtype of + * transcript in the Sequence Ontology), and the ID of the feature is the + * transcript we are retrieving + */ + @Override + protected boolean identifiesSequence(SequenceFeature sf, String accId) + { + if (SequenceOntology.getInstance().isA(sf.getType(), + SequenceOntology.TRANSCRIPT)) + { + String parentFeature = (String) sf.getValue("ID"); + if (("transcript:" + accId).equals(parentFeature)) + { + return true; + } + } + return false; + } + } diff --git a/src/jalview/ext/ensembl/EnsemblOverlap.java b/src/jalview/ext/ensembl/EnsemblOverlap.java new file mode 100644 index 0000000..732b518 --- /dev/null +++ b/src/jalview/ext/ensembl/EnsemblOverlap.java @@ -0,0 +1,123 @@ +package jalview.ext.ensembl; + +import jalview.datamodel.Alignment; +import jalview.datamodel.AlignmentI; +import jalview.io.FeaturesFile; +import jalview.io.FileParse; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; + +/** + * A client for 
fetching and processing Ensembl overlap data in GFF feature + * format + * + * @author gmcarstairs + * @see http://rest.ensembl.org/documentation/info/overlap_id + */ +public class EnsemblOverlap extends EnsemblRestClient +{ + /* + * The default features to retrieve from Ensembl; can override in getSequenceRecords + */ + private EnsemblFeatureType[] featuresWanted = { EnsemblFeatureType.cds, + EnsemblFeatureType.exon, EnsemblFeatureType.variation }; + + @Override + public String getDbName() + { + return "ENSEMBL (overlap)"; + } + + /** + * Makes a query to the REST overlap endpoint for the given sequence + * identifier. This returns an 'alignment' consisting of one 'dummy sequence' + * (the genomic sequence for which overlap features are returned by the + * service). This sequence will have on it sequence features which are the + * real information of interest, such as CDS regions or sequence variations. + */ + @Override + public AlignmentI getSequenceRecords(String query) throws IOException + { + // TODO: use a vararg String... for getSequenceRecords instead? 
+ List queries = new ArrayList(); + queries.add(query); + FileParse fp = getSequenceReader(queries); + FeaturesFile fr = new FeaturesFile(fp); + return new Alignment(fr.getSeqsAsArray()); + } + + /** + * Returns a URL for the REST overlap endpoint + * + * @param ids + * @return + */ + @Override + protected URL getUrl(List ids) throws MalformedURLException + { + StringBuffer urlstring = new StringBuffer(128); + urlstring.append(ENSEMBL_REST).append("/overlap/id/") + .append(ids.get(0)); + + // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats + urlstring.append("?content-type=text/x-gff3"); + + /* + * specify features to retrieve + * @see http://rest.ensembl.org/documentation/info/overlap_id + * could make the list a configurable entry in jalview.properties + */ + for (EnsemblFeatureType feature : featuresWanted) + { + urlstring.append("&feature=").append(feature.name()); + } + + return new URL(urlstring.toString()); + } + + @Override + public boolean useGetRequest() + { + return true; + } + + /** + * Returns the MIME type for GFF3. For GET requests the Content-type header + * describes the required encoding of the response. + */ + @Override + public String getRequestMimeType() + { + return "text/x-gff3"; + } + + /** + * Returns the MIME type for GFF3. 
+ */ + @Override + public String getResponseMimeType() + { + return "text/x-gff3"; + } + + /** + * Overloaded method that allows a list of features to retrieve to be + * specified + * + * @param accId + * @param features + * @return + * @throws IOException + */ + public AlignmentI getSequenceRecords(String accId, + EnsemblFeatureType[] features) + throws IOException + { + featuresWanted = features; + return getSequenceRecords(accId); + } +} diff --git a/src/jalview/ext/ensembl/EnsemblProtein.java b/src/jalview/ext/ensembl/EnsemblProtein.java index 4cc43ab..5238f98 100644 --- a/src/jalview/ext/ensembl/EnsemblProtein.java +++ b/src/jalview/ext/ensembl/EnsemblProtein.java @@ -1,6 +1,7 @@ package jalview.ext.ensembl; -import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType; +import jalview.datamodel.AlignmentI; +import jalview.datamodel.SequenceFeature; public class EnsemblProtein extends EnsemblSeqProxy { @@ -22,6 +23,9 @@ public class EnsemblProtein extends EnsemblSeqProxy return EnsemblSeqType.PROTEIN; } + /** + * Returns false, as this fetcher does not retrieve DNA sequences. 
+ */ @Override public boolean isDnaCoding() { @@ -37,4 +41,27 @@ public class EnsemblProtein extends EnsemblSeqProxy return "ENSP00000288602"; } + /** + * Overrides base class method to do nothing - genomic features are not + * applicable to the protein product sequence + */ + @Override + protected void addFeaturesAndProduct(String accId, AlignmentI alignment) + { + } + + @Override + protected EnsemblFeatureType[] getFeaturesToFetch() + { + // not applicable - can't fetch genomic features for a protein sequence + return null; + } + + @Override + protected boolean identifiesSequence(SequenceFeature sf, String accId) + { + // not applicable - protein sequence is not a 'subset' of genomic sequence + return false; + } + } diff --git a/src/jalview/ext/ensembl/EnsemblRestClient.java b/src/jalview/ext/ensembl/EnsemblRestClient.java new file mode 100644 index 0000000..52993e9 --- /dev/null +++ b/src/jalview/ext/ensembl/EnsemblRestClient.java @@ -0,0 +1,215 @@ +package jalview.ext.ensembl; + +import jalview.io.FileParse; + +import java.io.BufferedReader; +import java.io.DataOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.List; + +import javax.ws.rs.HttpMethod; + +/** + * Base class for Ensembl REST service clients + * + * @author gmcarstairs + */ +abstract class EnsemblRestClient extends EnsemblSequenceFetcher +{ + protected final static String ENSEMBL_REST = "http://rest.ensembl.org"; + + protected static final String SEQUENCE_ID_URL = ENSEMBL_REST + + "/sequence/id"; + + // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats + private static final String PING_URL = "http://rest.ensembl.org/info/ping.json"; + + private final static long RETEST_INTERVAL = 10000L; // 10 seconds + + private static boolean ensemblRestAvailable = false; + + private static long lastCheck = -1; + + protected 
volatile boolean inProgress = false; + + @Override + public boolean queryInProgress() + { + return inProgress; + } + + @Override + public StringBuffer getRawRecords() + { + return null; + } + + /** + * Returns the URL for the client http request + * + * @param ids + * @return + * @throws MalformedURLException + */ + protected abstract URL getUrl(List ids) + throws MalformedURLException; + + /** + * Returns true if client uses GET method, false if it uses POST + * + * @return + */ + public abstract boolean useGetRequest(); + + /** + * Return the desired value for the Content-Type request header + * + * @return + * @see https://github.com/Ensembl/ensembl-rest/wiki/HTTP-Headers + */ + public abstract String getRequestMimeType(); + + /** + * Return the desired value for the Accept request header + * + * @return + * @see https://github.com/Ensembl/ensembl-rest/wiki/HTTP-Headers + */ + public abstract String getResponseMimeType(); + + /** + * Tries to connect to Ensembl's REST 'ping' endpoint, and returns true if + * successful, else false + * + * @return + */ + private boolean checkEnsembl() + { + try + { + URL ping = new URL(PING_URL); + HttpURLConnection conn = (HttpURLConnection) ping.openConnection(); + int rc = conn.getResponseCode(); + conn.disconnect(); + if (rc >= 200 && rc < 300) + { + return true; + } + } catch (Throwable t) + { + System.err.println("Error connecting to " + PING_URL + ": " + + t.getMessage()); + } + return false; + } + + /** + * returns a reader to a Fasta response from the Ensembl sequence endpoint + * + * @param ids + * @return + * @throws IOException + */ + public FileParse getSequenceReader(List ids) + throws IOException + { + URL url = getUrl(ids); + + HttpURLConnection connection = (HttpURLConnection) url.openConnection(); + + /* + * POST method allows multiple queries in one request; it is supported for + * sequence queries, but not for overlap + */ + connection.setRequestMethod(useGetRequest() ? 
HttpMethod.GET + : HttpMethod.POST); + connection.setRequestProperty("Content-Type", getRequestMimeType()); + connection.setRequestProperty("Accept", getResponseMimeType()); + + connection.setUseCaches(false); + connection.setDoInput(true); + connection.setDoOutput(true); + + if (!useGetRequest()) + { + writePostBody(connection, ids); + } + + InputStream response = connection.getInputStream(); + int responseCode = connection.getResponseCode(); + + if (responseCode != 200) + { + throw new RuntimeException( + "Response code was not 200. Detected response was " + + responseCode); + } + + BufferedReader reader = null; + reader = new BufferedReader(new InputStreamReader(response, "UTF-8")); + FileParse fp = new FileParse(reader, url.toString(), "HTTP_POST"); + return fp; + } + + /** + * Rechecks if Ensembl is responding, unless the last check was successful and + * the retest interval has not yet elapsed. Returns true if Ensembl is up, + * else false. + * + * @return + */ + public boolean isEnsemblAvailable() + { + long now = System.currentTimeMillis(); + boolean retest = now - lastCheck > RETEST_INTERVAL; + if (ensemblRestAvailable && !retest) + { + return true; + } + ensemblRestAvailable = checkEnsembl(); + lastCheck = now; + return ensemblRestAvailable; + } + + /** + * Constructs, writes and flushes the POST body of the request, containing the + * query ids in JSON format + * + * @param connection + * @param ids + * @throws IOException + */ + protected void writePostBody(HttpURLConnection connection, + List ids) throws IOException + { + boolean first; + StringBuilder postBody = new StringBuilder(64); + postBody.append("{\"ids\":["); + first = true; + for (String id : ids) + { + if (!first) + { + postBody.append(","); + } + first = false; + postBody.append("\""); + postBody.append(id.trim()); + postBody.append("\""); + } + postBody.append("]}"); + byte[] thepostbody = postBody.toString().getBytes(); + connection.setRequestProperty("Content-Length", + 
Integer.toString(thepostbody.length)); + DataOutputStream wr = new DataOutputStream(connection.getOutputStream()); + wr.write(thepostbody); + wr.flush(); + wr.close(); + } + +} diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index 4f85bd0..e986ba8 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -2,174 +2,381 @@ package jalview.ext.ensembl; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; +import jalview.datamodel.DBRefEntry; +import jalview.datamodel.DBRefSource; +import jalview.datamodel.Mapping; +import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.exceptions.JalviewException; -import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType; import jalview.io.FastaFile; import jalview.io.FileParse; +import jalview.io.gff.SequenceOntology; import jalview.util.DBRefUtils; -import jalview.ws.seqfetcher.DbSourceProxyImpl; +import jalview.util.MapList; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; import java.util.List; -import com.stevesoft.pat.Regex; - -public abstract class EnsemblSeqProxy extends DbSourceProxyImpl +/** + * Base class for Ensembl sequence fetchers + * + * @author gmcarstairs + */ +public abstract class EnsemblSeqProxy extends EnsemblRestClient { - SeqFetcher sf; - - public EnsemblSeqProxy() + public enum EnsemblSeqType { - sf = new SeqFetcher(); - } + /** + * type=genomic for the full dna including introns + */ + GENOMIC("genomic"), - @Override - public String getDbSource() - { - return "ENSEMBL"; - } + /** + * type=cdna for transcribed dna including UTRs + */ + CDNA("cdna"), + /** + * type=cds for coding dna excluding UTRs + */ + CDS("cds"), - @Override - public String getDbVersion() - { - return "0"; // sf.getVersion(); 
- } + /** + * type=protein for the peptide product sequence + */ + PROTEIN("protein"); - @Override - public String getAccessionSeparator() - { - return " "; - } + /* + * the value of the 'type' parameter to fetch this version of + * an Ensembl sequence + */ + private String type; + + EnsemblSeqType(String t) + { + type = t; + } + + public String getType() + { + return type; + } - @Override - public Regex getAccessionValidator() - { - return new Regex("((ENSP|ENST|ENSG|CCDS)[0-9.]{3,})"); } /** - * Default test query is a transcript + * A comparator to sort ranges into ascending start position order */ - @Override - public String getTestQuery() + private class RangeSorter implements Comparator { - return "ENST00000288602"; - } + boolean forwards; - @Override - public boolean isValidReference(String accession) + RangeSorter(boolean forward) + { + forwards = forward; + } + + @Override + public int compare(int[] o1, int[] o2) + { + return (forwards ? 1 : -1) * Integer.compare(o1[0], o2[0]); + } + + }; + + /** + * Constructor + */ + public EnsemblSeqProxy() { - return getAccessionValidator().search(accession); } - private volatile boolean inProgress = false; - + /** + * Makes the sequence queries to Ensembl's REST service and returns an + * alignment consisting of the returned sequences + */ @Override - public AlignmentI getSequenceRecords(String queries) throws Exception + public AlignmentI getSequenceRecords(String query) throws Exception { + // TODO use a String... query vararg instead? + + // danger: accession separator used as a regex here, a string elsewhere + // in this case it is ok (it is just a space), but (e.g.) 
'\' would not be + List allIds = Arrays.asList(query.split(getAccessionSeparator())); + AlignmentI alignment = null; inProgress = true; - List tids, ids = new ArrayList(); - tids = Arrays.asList(queries.split(" +")); - AlignmentI rtn = null; /* * execute queries, if necessary in batches of the * maximum allowed number of ids */ int maxQueryCount = getMaximumQueryCount(); - for (int v = 0, vSize = tids.size(); v < vSize; v += maxQueryCount) + for (int v = 0, vSize = allIds.size(); v < vSize; v += maxQueryCount) { int p = Math.min(vSize, v + maxQueryCount); - ids = tids.subList(v, p); + List ids = allIds.subList(v, p); try { - if (!sf.isEnsemblAvailable()) - { - inProgress = false; - throw new JalviewException("ENSEMBL Rest API not available."); - } - FileParse fp = new FileParse(sf.getSequenceReader( - getSourceEnsemblType(), ids)); - FastaFile fr = new FastaFile(fp); - if (fr.hasWarningMessage()) - { - System.out - .println("Warning when retrieving " + ids.size() + " ids" - + ids.toString() + "\n" + fr.getWarningMessage()); - } - else if (fr.getSeqs().size() != ids.size()) - { - System.out.println("Only retrieved " + fr.getSeqs().size() - + " sequences for " + ids.size() + " query strings."); - } - if (fr.getSeqs().size() > 0) - { - AlignmentI seqal = new Alignment( - fr.getSeqsAsArray()); - for (SequenceI sq:seqal.getSequences()) - { - if (ids.contains((sq.getName()))) - { - DBRefUtils.parseToDbRef(sq, "ENSEMBL", "0", sq.getName()); - } - } - if (rtn == null) - { - rtn = seqal; - } - else - { - rtn.append(seqal); - } - } + alignment = fetchSequences(ids, alignment); } catch (Throwable r) { inProgress = false; - if (rtn != null) + String msg = "Aborting ID retrieval after " + v + + " chunks. 
Unexpected problem (" + r.getLocalizedMessage() + + ")"; + System.err.println(msg); + if (alignment != null) { - System.err.println("Aborting ID retrieval after " + v - + " chunks."); - r.printStackTrace(); + break; // return what we got } else { - - throw new JalviewException("Aborting ID retrieval after " + v - + " chunks. Unexpected problem (" - + r.getLocalizedMessage() + ")", r); + throw new JalviewException(msg, r); } - } } + + /* + * fetch and transfer genomic sequence features + */ + for (String accId : allIds) + { + addFeaturesAndProduct(accId, alignment); + } + inProgress = false; - return rtn; + return alignment; } /** + * Fetches Ensembl features using the /overlap REST endpoint, and adds them to + * the sequence in the alignment. Also fetches the protein product, maps it + * from the CDS features of the sequence, and saves it as a cross-reference of + * the dna sequence. * - * @return the configured sequence return type for this source + * @param accId + * @param alignment */ - protected abstract EnsemblSeqType getSourceEnsemblType(); + protected void addFeaturesAndProduct(String accId, AlignmentI alignment) + { + try + { + /* + * get 'dummy' genomic sequence with exon, cds and variation features + */ + EnsemblOverlap gffFetcher = new EnsemblOverlap(); + EnsemblFeatureType[] features = getFeaturesToFetch(); + AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId, + features); + if (geneFeatures.getHeight() > 0) + { + /* + * transfer features to the query sequence + */ + SequenceI genomicSequence = geneFeatures.getSequenceAt(0); + SequenceI querySeq = alignment.findName(accId); + transferFeatures(accId, genomicSequence, querySeq); - @Override - public boolean queryInProgress() + /* + * fetch and map protein product, and add it as a cross-reference + * of the retrieved sequence + */ + addProteinProduct(querySeq); + } + } catch (IOException e) + { + System.err.println("Error transferring Ensembl features: " + + e.getMessage()); + } + } + + /** + * 
Returns those sequence feature types to fetch from Ensembl. We may want + * features either because they are of interest to the user, or as means to + * identify the locations of the sequence on the genomic sequence (CDS + * features identify CDS, exon features identify cDNA etc). + * + * @return + */ + protected abstract EnsemblFeatureType[] getFeaturesToFetch(); + + /** + * Fetches and maps the protein product, and adds it as a cross-reference of + * the retrieved sequence + */ + protected void addProteinProduct(SequenceI querySeq) { - return inProgress; + String accId = querySeq.getName(); + try + { + AlignmentI protein = new EnsemblProtein().getSequenceRecords(accId); + if (protein == null || protein.getHeight() == 0) + { + System.out.println("Failed to retrieve protein for " + accId); + return; + } + SequenceI proteinSeq = protein.getSequenceAt(0); + + /* + * need dataset sequences (to be the subject of mappings) + */ + proteinSeq.createDatasetSequence(); + querySeq.createDatasetSequence(); + + MapList mapList = mapCdsToProtein(querySeq, proteinSeq); + if (mapList != null) + { + Mapping map = new Mapping(proteinSeq.getDatasetSequence(), mapList); + DBRefEntry dbr = new DBRefEntry(getDbSource(), getDbVersion(), + accId, map); + querySeq.getDatasetSequence().addDBRef(dbr); + } + } catch (Exception e) + { + System.err + .println(String.format("Error retrieving protein for %s: %s", + accId, e.getMessage())); + } } - @Override - public StringBuffer getRawRecords() + /** + * Returns a mapping from dna to protein by inspecting sequence features of + * type "CDS" on the dna. 
+ * + * @param dnaSeq + * @param proteinSeq + * @return + */ + protected MapList mapCdsToProtein(SequenceI dnaSeq, SequenceI proteinSeq) { + SequenceFeature[] sfs = dnaSeq.getSequenceFeatures(); + if (sfs == null) + { + return null; + } + + List ranges = new ArrayList(50); + SequenceOntology so = SequenceOntology.getInstance(); + + int mappedDnaLength = 0; + + /* + * Map CDS columns of dna to peptide. No need to worry about reverse strand + * dna here since the retrieved sequence is as transcribed (reverse + * complement for reverse strand), i.e in the same sense as the peptide. + */ + for (SequenceFeature sf : sfs) + { + /* + * process a CDS feature (or a sub-type of CDS) + */ + if (so.isA(sf.getType(), SequenceOntology.CDS)) + { + ranges.add(new int[] { sf.getBegin(), sf.getEnd() }); + mappedDnaLength += Math.abs(sf.getEnd() - sf.getBegin()) + 1; + } + } + int proteinLength = proteinSeq.getLength(); + List proteinRange = new ArrayList(); + proteinRange.add(new int[] { 1, proteinLength }); + + /* + * dna length should map to protein (or protein minus stop codon) + */ + if (mappedDnaLength == 3 * proteinLength + || mappedDnaLength == 3 * (proteinLength + 1)) + { + return new MapList(ranges, proteinRange, 3, 1); + } return null; } + /** + * Fetches sequences for the list of accession ids and adds them to the + * alignment. Returns the extended (or created) alignment. 
+ * + * @param ids + * @param alignment + * @return + * @throws JalviewException + * @throws IOException + */ + protected AlignmentI fetchSequences(List ids, AlignmentI alignment) + throws JalviewException, IOException + { + if (!isEnsemblAvailable()) + { + inProgress = false; + throw new JalviewException("ENSEMBL Rest API not available."); + } + FileParse fp = getSequenceReader(ids); + FastaFile fr = new FastaFile(fp); + if (fr.hasWarningMessage()) + { + System.out.println(String.format( + "Warning when retrieving %d ids %s\n%s", ids.size(), + ids.toString(), fr.getWarningMessage())); + } + else if (fr.getSeqs().size() != ids.size()) + { + System.out.println(String.format( + "Only retrieved %d sequences for %d query strings", fr + .getSeqs().size(), ids.size())); + } + if (fr.getSeqs().size() > 0) + { + AlignmentI seqal = new Alignment( + fr.getSeqsAsArray()); + for (SequenceI sq:seqal.getSequences()) + { + if (sq.getDescription() == null) + { + sq.setDescription(getDbName()); + } + String name = sq.getName(); + if (ids.contains(name) + || ids.contains(name.replace("ENSP", "ENST"))) + { + DBRefUtils.parseToDbRef(sq, DBRefSource.ENSEMBL, "0", name); + } + } + if (alignment == null) + { + alignment = seqal; + } + else + { + alignment.append(seqal); + } + } + return alignment; + } + + /** + * Returns the URL for the REST call + * + * @return + * @throws MalformedURLException + */ @Override - public int getTier() + protected URL getUrl(List ids) throws MalformedURLException { - return 0; + // ids are not used - they go in the POST body instead + StringBuffer urlstring = new StringBuffer(128); + urlstring.append(SEQUENCE_ID_URL); + + // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats + urlstring.append("?type=").append(getSourceEnsemblType().getType()); + urlstring.append(("&Accept=text/x-fasta")); + + URL url = new URL(urlstring.toString()); + return url; } /** @@ -184,8 +391,208 @@ public abstract class EnsemblSeqProxy extends DbSourceProxyImpl } 
@Override - public boolean isDnaCoding() + public boolean useGetRequest() + { + return false; + } + + @Override + public String getRequestMimeType() + { + return "application/json"; + } + + @Override + public String getResponseMimeType() + { + return "text/x-fasta"; + } + + /** + * + * @return the configured sequence return type for this source + */ + protected abstract EnsemblSeqType getSourceEnsemblType(); + + /** + * Returns a list of [start, end] genomic ranges corresponding to the sequence + * being retrieved. + * + * The correspondence between the frames of reference is made by locating + * those features on the genomic sequence which identify the retrieved + * sequence. Specifically + *
<ul> + *
<li>genomic sequence is identified by "transcript" features with + * ID=transcript:transcriptId</li>
+ *
<li>cdna sequence is identified by "exon" features with + * Parent=transcript:transcriptId</li>
+ *
<li>cds sequence is identified by "CDS" features with + * Parent=transcript:transcriptId</li>
+ * </ul>
+ * + * The returned ranges are sorted to run forwards (for positive strand) or + * backwards (for negative strand). Aborts and returns null if both positive + * and negative strand are found (this should not normally happen). + * + * @param sfs + * @param accId + * @return + */ + protected MapList getGenomicRanges(SequenceFeature[] sfs, String accId) + { + /* + * generously size for initial number of cds regions + * (worst case titin Q8WZ42 has c. 313 exons) + */ + List regions = new ArrayList(100); + int mappedLength = 0; + int direction = 1; // forward + boolean directionSet = false; + + for (SequenceFeature sf : sfs) + { + /* + * accept the target feature type or a specialisation of it + * (e.g. coding_exon for exon) + */ + if (identifiesSequence(sf, accId)) + { + int strand = sf.getStrand(); + + if (directionSet && strand != direction) + { + // abort - mix of forward and backward + System.err.println("Error: forward and backward strand for " + + accId); + return null; + } + direction = strand; + directionSet = true; + + /* + * add to CDS ranges, semi-sorted forwards/backwards + */ + if (strand < 0) + { + regions.add(0, new int[] { sf.getEnd(), sf.getBegin() }); + } + else + { + regions.add(new int[] { sf.getBegin(), sf.getEnd() }); + } + mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1); + } + } + + /* + * a final sort is needed since Ensembl returns CDS sorted within source + * (havana / ensembl_havana) + */ + Collections.sort(regions, new RangeSorter(direction == 1)); + + List to = new ArrayList(); + to.add(new int[] { 1, mappedLength }); + + return new MapList(regions, to, 1, 1); + } + + /** + * Returns true if the sequence feature identifies positions of the genomic + * sequence feature which are within the sequence being retrieved. 
+ * + * @param sf + * @param accId + * @return + */ + protected abstract boolean identifiesSequence(SequenceFeature sf, + String accId); + + /** + * Transfers the sequence feature to the target sequence, adjusting its start + * and end range based on the 'overlap' ranges. Features which do not overlap + * the target sequence are ignored, as are features with a parent other than + * the target sequence id. + * + * @param sf + * @param targetSequence + * @param overlap + */ + protected void transferFeature(SequenceFeature sf, + SequenceI targetSequence, MapList overlap) + { + String parent = (String) sf.getValue("Parent"); + if (parent != null && !parent.contains(targetSequence.getName())) + { + // this genomic feature belongs to a different transcript + return; + } + + int start = sf.getBegin(); + int end = sf.getEnd(); + int[] mappedRange = overlap.locateInTo(start, end); + + if (mappedRange != null) + { + SequenceFeature copy = new SequenceFeature(sf); + int offset = targetSequence.getStart() - 1; + copy.setBegin(offset + Math.min(mappedRange[0], mappedRange[1])); + copy.setEnd(offset + Math.max(mappedRange[0], mappedRange[1])); + targetSequence.addSequenceFeature(copy); + } + + } + + /** + * Transfers features from sourceSequence to targetSequence + * + * @param accessionId + * @param sourceSequence + * @param targetSequence + */ + protected void transferFeatures(String accessionId, + SequenceI sourceSequence, SequenceI targetSequence) + { + if (sourceSequence == null || targetSequence == null) + { + return; + } + + SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); + MapList overlap = getGenomicRanges(sfs, accessionId); + + final boolean forwardStrand = overlap.isFromForwardStrand(); + + /* + * sort features by start position (descending if reverse strand) + * before transferring (in forwards order) to the target sequence + */ + Arrays.sort(sfs, new Comparator() + { + @Override + public int compare(SequenceFeature o1, SequenceFeature o2) + { + int c = 
Integer.compare(o1.getBegin(), o2.getBegin()); + return forwardStrand ? c : -c; + } + }); + + for (SequenceFeature sf : sfs) + { + if (retainFeature(sf.getType())) + { + transferFeature(sf, targetSequence, overlap); + } + } + } + + /** + * Answers true if the feature type is one to attach to the retrieved sequence + * + * @param type + * @return + */ + protected boolean retainFeature(@SuppressWarnings("unused") String type) { - return true; + return true; // default is to keep all } } diff --git a/src/jalview/ext/ensembl/EnsemblSequenceFetcher.java b/src/jalview/ext/ensembl/EnsemblSequenceFetcher.java new file mode 100644 index 0000000..f1b96e2 --- /dev/null +++ b/src/jalview/ext/ensembl/EnsemblSequenceFetcher.java @@ -0,0 +1,80 @@ +package jalview.ext.ensembl; + +import jalview.datamodel.DBRefSource; +import jalview.ws.seqfetcher.DbSourceProxyImpl; + +import com.stevesoft.pat.Regex; + +/** + * A base class for Ensembl sequence fetchers + * + * @author gmcarstairs + * + */ +public abstract class EnsemblSequenceFetcher extends DbSourceProxyImpl +{ + /* + * possible values for the 'feature' parameter of the REST overlap endpoint + * @see + */ + protected enum EnsemblFeatureType + { + gene, transcript, cds, exon, repeat, simple, misc, variation, + somatic_variation, structural_variation, somatic_structural_variation, + constrained, regulatory + } + + @Override + public String getDbSource() + { + // NB ensure Uniprot xrefs are canonicalised from "Ensembl" to "ENSEMBL" + return DBRefSource.ENSEMBL; // "ENSEMBL" + } + + @Override + public String getDbVersion() + { + return "0"; + } + + @Override + public String getAccessionSeparator() + { + return " "; + } + + @Override + public Regex getAccessionValidator() + { + return new Regex("((ENSP|ENST|ENSG|CCDS)[0-9.]{3,})"); + } + + @Override + public boolean isValidReference(String accession) + { + return getAccessionValidator().search(accession); + } + + @Override + public int getTier() + { + return 0; + } + + /** + * Default 
test query is a transcript + */ + @Override + public String getTestQuery() + { + // has CDS on reverse strand: + return "ENST00000288602"; + // ENST00000461457 // forward strand + } + + @Override + public boolean isDnaCoding() + { + return true; + } +} diff --git a/src/jalview/ext/ensembl/EnsemblTranscript.java b/src/jalview/ext/ensembl/EnsemblTranscript.java deleted file mode 100644 index c2d0b6e..0000000 --- a/src/jalview/ext/ensembl/EnsemblTranscript.java +++ /dev/null @@ -1,26 +0,0 @@ -package jalview.ext.ensembl; - -import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType; - -public class EnsemblTranscript extends EnsemblSeqProxy -{ - - // TODO is this class needed? it seems to duplicate EnsemblProtein - public EnsemblTranscript() - { - super(); - } - - @Override - public String getDbName() - { - return "ENSEMBL (Protein)"; - } - - @Override - protected EnsemblSeqType getSourceEnsemblType() - { - return EnsemblSeqType.PROTEIN; - } - -} diff --git a/src/jalview/ext/ensembl/SeqFetcher.java b/src/jalview/ext/ensembl/SeqFetcher.java deleted file mode 100644 index 57f000f..0000000 --- a/src/jalview/ext/ensembl/SeqFetcher.java +++ /dev/null @@ -1,193 +0,0 @@ -package jalview.ext.ensembl; - -import jalview.io.FileParse; - -import java.io.BufferedReader; -import java.io.DataOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.net.HttpURLConnection; -import java.net.URL; -import java.net.URLConnection; -import java.util.ArrayList; -import java.util.List; - -import org.apache.http.NameValuePair; -import org.apache.http.message.BasicNameValuePair; - -public class SeqFetcher -{ - private final static String ENSEMBL_REST = "rest.ensembl.org"; - - private static final String SEQUENCE_ID_URL = "http://" + ENSEMBL_REST + "/sequence/id"; - - private static final String PING_URL = "http://" + ENSEMBL_REST + "/info/ping"; - - private final static long RETEST_INTERVAL = 10000L; // 10 seconds - - private static boolean 
ensemblRestAvailable = false; - - private static long lastCheck = -1; - - /** - * Rechecks if Ensembl is responding, unless the last check was successful and - * the retest interval has not yet elapsed. Returns true if Ensembl is up, - * else false. - * - * @return - */ - public boolean isEnsemblAvailable() - { - long now = System.currentTimeMillis(); - boolean retest = now - lastCheck > RETEST_INTERVAL; - if (ensemblRestAvailable && !retest) - { - return true; - } - ensemblRestAvailable = checkEnsembl(); - lastCheck = now; - return ensemblRestAvailable; - } - - /** - * Tries to connect to Ensembl's REST 'ping' endpoint, and returns true if - * successful, else false - * - * @return - */ - private boolean checkEnsembl() - { - try - { - URL ping = new URL(PING_URL); - HttpURLConnection conn = (HttpURLConnection) ping.openConnection(); - int rc = conn.getResponseCode(); - conn.disconnect(); - if (rc >= 200 && rc < 300) - { - return true; - } - } catch (Throwable t) - { - System.err.println("Error connecting to " + PING_URL + ": " - + t.getMessage()); - } - return false; - } - - public SeqFetcher() - { - } - - public enum EnsemblSeqType - { - GENOMIC("genomic"), CDS("cds"), TRANSCRIPT("cds"), PROTEIN("protein"), CDNA( - "cdna"); - - private String type; - - EnsemblSeqType(String t) - { - type = t; - } - - public String getType() - { - return type; - } - } - - /** - * Returns a list of additional URL query parameters to specify the desired - * sequence type (genomic/cds/protein etc), and data format Fasta - * - * @param type - */ - public List getAdditionalParameters(EnsemblSeqType type) - { - List params = new ArrayList(); - params.add(new BasicNameValuePair("type", type.getType())); - params.add(new BasicNameValuePair("content-type", "text/x-fasta")); - return params; - } - - /** - * return a reader to a Fasta response from the Ensembl sequence endpoint - * - * @param returnType - * @param ids - * @return - * @throws IOException - */ - public FileParse 
getSequenceReader(EnsemblSeqType returnType, - List ids) throws IOException - { - // see http://rest.ensembl.org/documentation/info/sequence_id - - String urlstring = SEQUENCE_ID_URL; - List vals = getAdditionalParameters(returnType); - boolean first = true; - for (NameValuePair nvp : vals) - { - urlstring += first ? "?" : "&"; - first = false; - urlstring += nvp.getName() + "=" + nvp.getValue(); - } - - URL url = new URL(urlstring); - - URLConnection connection = url.openConnection(); - HttpURLConnection httpConnection = (HttpURLConnection) connection; - - httpConnection.setRequestMethod("POST"); - httpConnection.setRequestProperty("Content-Type", "application/json"); - httpConnection.setRequestProperty("Accept", "text/x-fasta"); - byte[] thepostbody; - { - StringBuilder postBody = new StringBuilder(); - postBody.append("{\"ids\":["); - first = true; - for (String id : ids) - { - if (!first) - { - postBody.append(","); - } - first = false; - postBody.append("\""); - postBody.append(id.trim()); - postBody.append("\""); - } - postBody.append("]}"); - thepostbody = postBody.toString().getBytes(); - } - httpConnection.setRequestProperty("Content-Length", - Integer.toString(thepostbody.length)); - httpConnection.setUseCaches(false); - httpConnection.setDoInput(true); - httpConnection.setDoOutput(true); - - DataOutputStream wr = new DataOutputStream( - httpConnection.getOutputStream()); - wr.write(thepostbody); - wr.flush(); - wr.close(); - - InputStream response = connection.getInputStream(); - int responseCode = httpConnection.getResponseCode(); - - if (responseCode != 200) - { - throw new RuntimeException( - "Response code was not 200. 
Detected response was " - + responseCode); - } - - BufferedReader reader = null; - reader = new BufferedReader(new InputStreamReader(response, "UTF-8")); - FileParse fp = new FileParse(reader, url.toString(), "HTTP_POST"); - return fp; - } - -} diff --git a/test/jalview/ext/ensembl/EnsemblRestClientTest.java b/test/jalview/ext/ensembl/EnsemblRestClientTest.java new file mode 100644 index 0000000..086adbb --- /dev/null +++ b/test/jalview/ext/ensembl/EnsemblRestClientTest.java @@ -0,0 +1,69 @@ +package jalview.ext.ensembl; + +import jalview.datamodel.AlignmentI; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.List; + +import org.testng.annotations.Test; + +public class EnsemblRestClientTest +{ + + @Test(suiteName = "live") + public void testLiveCheckEnsembl() + { + EnsemblRestClient sf = new EnsemblRestClient() + { + + @Override + public String getDbName() + { + return null; + } + + @Override + public AlignmentI getSequenceRecords(String queries) throws Exception + { + return null; + } + + @Override + protected URL getUrl(List ids) throws MalformedURLException + { + return null; + } + + @Override + public boolean useGetRequest() + { + return false; + } + + @Override + public String getRequestMimeType() + { + return null; + } + + @Override + public String getResponseMimeType() + { + return null; + } + + }; + boolean isAvailable = sf.isEnsemblAvailable(); + if (isAvailable) + { + System.out.println("Ensembl is UP!"); + } + else + { + System.err + .println("Ensembl is DOWN or unreachable ******************* BAD!"); + } + } + +} diff --git a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java index f3526bc..3ca74b0 100644 --- a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java +++ b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java @@ -1,6 +1,17 @@ package jalview.ext.ensembl; +import jalview.datamodel.Alignment; +import jalview.datamodel.AlignmentI; +import jalview.datamodel.SequenceI; 
+import jalview.io.AppletFormatAdapter; +import jalview.io.FastaFile; +import jalview.io.FileParse; + import java.lang.reflect.Method; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Arrays; +import java.util.List; import org.testng.Assert; import org.testng.annotations.DataProvider; @@ -9,8 +20,87 @@ import org.testng.annotations.Test; public class EnsemblSeqProxyTest { + private static final Object[][] allSeqs = new Object[][] { + { + new EnsemblProtein(), + "CCDS5863.1", + ">CCDS5863.1\n" + + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n" + + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n" + + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n" + + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n" + + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n" + + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n" + + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n" + + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n" + + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n" + + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n" + + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n" + + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n" + + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH\n" }, + { + new EnsemblCdna(), + "CCDS5863.1", + ">CCDS5863.1\n" + + "ATGGCGGCGCTGAGCGGTGGCGGTGGTGGCGGCGCGGAGCCGGGCCAGGCTCTGTTCAAC\n" + + "GGGGACATGGAGCCCGAGGCCGGCGCCGGCGCCGGCGCCGCGGCCTCTTCGGCTGCGGAC\n" + + "CCTGCCATTCCGGAGGAGGTGTGGAATATCAAACAAATGATTAAGTTGACACAGGAACAT\n" + + "ATAGAGGCCCTATTGGACAAATTTGGTGGGGAGCATAATCCACCATCAATATATCTGGAG\n" + + "GCCTATGAAGAATACACCAGCAAGCTAGATGCACTCCAACAAAGAGAACAACAGTTATTG\n" + + "GAATCTCTGGGGAACGGAACTGATTTTTCTGTTTCTAGCTCTGCATCAATGGATACCGTT\n" + + "ACATCTTCTTCCTCTTCTAGCCTTTCAGTGCTACCTTCATCTCTTTCAGTTTTTCAAAAT\n" + + 
"CCCACAGATGTGGCACGGAGCAACCCCAAGTCACCACAAAAACCTATCGTTAGAGTCTTC\n" + + "CTGCCCAACAAACAGAGGACAGTGGTACCTGCAAGGTGTGGAGTTACAGTCCGAGACAGT\n" + + "CTAAAGAAAGCACTGATGATGAGAGGTCTAATCCCAGAGTGCTGTGCTGTTTACAGAATT\n" + + "CAGGATGGAGAGAAGAAACCAATTGGTTGGGACACTGATATTTCCTGGCTTACTGGAGAA\n" + + "GAATTGCATGTGGAAGTGTTGGAGAATGTTCCACTTACAACACACAACTTTGTACGAAAA\n" + + "ACGTTTTTCACCTTAGCATTTTGTGACTTTTGTCGAAAGCTGCTTTTCCAGGGTTTCCGC\n" + + "TGTCAAACATGTGGTTATAAATTTCACCAGCGTTGTAGTACAGAAGTTCCACTGATGTGT\n" + + "GTTAATTATGACCAACTTGATTTGCTGTTTGTCTCCAAGTTCTTTGAACACCACCCAATA\n" + + "CCACAGGAAGAGGCGTCCTTAGCAGAGACTGCCCTAACATCTGGATCATCCCCTTCCGCA\n" + + "CCCGCCTCGGACTCTATTGGGCCCCAAATTCTCACCAGTCCGTCTCCTTCAAAATCCATT\n" + + "CCAATTCCACAGCCCTTCCGACCAGCAGATGAAGATCATCGAAATCAATTTGGGCAACGA\n" + + "GACCGATCCTCATCAGCTCCCAATGTGCATATAAACACAATAGAACCTGTCAATATTGAT\n" + + "GACTTGATTAGAGACCAAGGATTTCGTGGTGATGGAGGATCAACCACAGGTTTGTCTGCT\n" + + "ACCCCCCCTGCCTCATTACCTGGCTCACTAACTAACGTGAAAGCCTTACAGAAATCTCCA\n" + + "GGACCTCAGCGAGAAAGGAAGTCATCTTCATCCTCAGAAGACAGGAATCGAATGAAAACA\n" + + "CTTGGTAGACGGGACTCGAGTGATGATTGGGAGATTCCTGATGGGCAGATTACAGTGGGA\n" + + "CAAAGAATTGGATCTGGATCATTTGGAACAGTCTACAAGGGAAAGTGGCATGGTGATGTG\n" + + "GCAGTGAAAATGTTGAATGTGACAGCACCTACACCTCAGCAGTTACAAGCCTTCAAAAAT\n" + + "GAAGTAGGAGTACTCAGGAAAACACGACATGTGAATATCCTACTCTTCATGGGCTATTCC\n" + + "ACAAAGCCACAACTGGCTATTGTTACCCAGTGGTGTGAGGGCTCCAGCTTGTATCACCAT\n" + + "CTCCATATCATTGAGACCAAATTTGAGATGATCAAACTTATAGATATTGCACGACAGACT\n" + + "GCACAGGGCATGGATTACTTACACGCCAAGTCAATCATCCACAGAGACCTCAAGAGTAAT\n" + + "AATATATTTCTTCATGAAGACCTCACAGTAAAAATAGGTGATTTTGGTCTAGCTACAGTG\n" + + "AAATCTCGATGGAGTGGGTCCCATCAGTTTGAACAGTTGTCTGGATCCATTTTGTGGATG\n" + + "GCACCAGAAGTCATCAGAATGCAAGATAAAAATCCATACAGCTTTCAGTCAGATGTATAT\n" + + "GCATTTGGAATTGTTCTGTATGAATTGATGACTGGACAGTTACCTTATTCAAACATCAAC\n" + + "AACAGGGACCAGATAATTTTTATGGTGGGACGAGGATACCTGTCTCCAGATCTCAGTAAG\n" + + "GTACGGAGTAACTGTCCAAAAGCCATGAAGAGATTAATGGCAGAGTGCCTCAAAAAGAAA\n" + + "AGAGATGAGAGACCACTCTTTCCCCAAATTCTCGCCTCTATTGAGCTGCTGGCCCGCTCA\n" + 
+ "TTGCCAAAAATTCACCGCAGTGCATCAGAACCCTCCTTGAATCGGGCTGGTTTCCAAACA\n" + + "GAGGATTTTAGTCTATATGCTTGTGCTTCTCCAAAAACACCCATCCAGGCAGGGGGATAT\n" + + "GGTGCGTTTCCTGTCCACTGA\n" }, + { + new EnsemblProtein(), + "ENSP00000288602", + ">ENSP00000288602\n" + + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n" + + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n" + + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n" + + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n" + + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n" + + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n" + + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n" + + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n" + + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n" + + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n" + + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n" + + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n" + + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH" } }; + @DataProvider(name = "queries") - public Object[][] createData(Method m) + public Object[][] createQueryData(Method m) { return new Object[][] { { "CCDS5863.1" }, { "ENSP00000288602" } }; } @@ -18,10 +108,104 @@ public class EnsemblSeqProxyTest @Test(dataProvider = "queries") public void testIsValidReference(String query) throws Exception { - EnsemblSeqProxy esq = new EnsemblProtein(); + EnsemblSequenceFetcher esq = new EnsemblProtein(); Assert.assertTrue(esq.isValidReference(query), "Expected reference string " + query + " to be valid for regex " + esq.getAccessionValidator().toString()); } + + @DataProvider(name = "ens_seqs") + public Object[][] createData(Method m) + { + System.out.println(m.getName()); + return allSeqs; + } + + @Test(dataProvider = "ens_seqs", suiteName = "live") + public void testGetOneSeqs(EnsemblRestClient proxy, String sq, String 
fastasq) + throws Exception + { + FileParse fp = proxy.getSequenceReader(Arrays + .asList(new String[] + { sq })); + SequenceI[] sqs = new FastaFile(fp).getSeqsAsArray(); + FastaFile trueRes = new FastaFile(fastasq, AppletFormatAdapter.PASTE); + SequenceI[] trueSqs = trueRes.getSeqsAsArray(); + Assert.assertEquals(sqs.length, trueSqs.length, + "Different number of sequences retrieved for query " + sq); + Alignment ral = new Alignment(sqs); + for (SequenceI tr : trueSqs) + { + SequenceI[] rseq; + Assert.assertNotNull( + rseq = ral.findSequenceMatch(tr.getName()), + "Couldn't find sequences matching expected sequence " + + tr.getName()); + Assert.assertEquals(rseq.length, 1, + "Expected only one sequence for sequence ID " + tr.getName()); + Assert.assertEquals( + rseq[0].getSequenceAsString(), + tr.getSequenceAsString(), + "Sequences differ for " + tr.getName() + "\n" + "Exp:" + + tr.getSequenceAsString() + "\n" + "Got:" + + rseq[0].getSequenceAsString()); + + } + } + + @Test(suiteName = "live") + public void testLiveCheckEnsembl() + { + EnsemblRestClient sf = new EnsemblRestClient() + { + + @Override + public String getDbName() + { + // TODO Auto-generated method stub + return null; + } + + @Override + public AlignmentI getSequenceRecords(String queries) throws Exception + { + // TODO Auto-generated method stub + return null; + } + + @Override + protected URL getUrl(List ids) throws MalformedURLException + { + // TODO Auto-generated method stub + return null; + } + + @Override + public boolean useGetRequest() + { + // TODO Auto-generated method stub + return false; + } + + @Override + public String getRequestMimeType() + { + // TODO Auto-generated method stub + return null; + } + + @Override + public String getResponseMimeType() + { + // TODO Auto-generated method stub + return null; + } + + }; + boolean isAvailable = sf.isEnsemblAvailable(); + System.out.println("Ensembl is " + + (isAvailable ? "UP!" 
+ : "DOWN or unreachable ******************* BAD!")); + } } \ No newline at end of file diff --git a/test/jalview/ext/ensembl/SeqFetcherTest.java b/test/jalview/ext/ensembl/SeqFetcherTest.java deleted file mode 100644 index 8762698..0000000 --- a/test/jalview/ext/ensembl/SeqFetcherTest.java +++ /dev/null @@ -1,175 +0,0 @@ -package jalview.ext.ensembl; - -import jalview.datamodel.Alignment; -import jalview.datamodel.SequenceI; -import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType; -import jalview.io.AppletFormatAdapter; -import jalview.io.FastaFile; -import jalview.io.FileParse; - -import java.lang.reflect.Method; -import java.util.Arrays; - -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -public class SeqFetcherTest -{ - private static final Object[][] allSeqs = new Object[][] { - { - EnsemblSeqType.PROTEIN, - "CCDS5863.1", - ">CCDS5863.1\n" - + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n" - + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n" - + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n" - + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n" - + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n" - + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n" - + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n" - + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n" - + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n" - + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n" - + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n" - + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n" - + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH\n" }, - { - EnsemblSeqType.TRANSCRIPT, - "CCDS5863.1", - ">CCDS5863.1\n" - + "ATGGCGGCGCTGAGCGGTGGCGGTGGTGGCGGCGCGGAGCCGGGCCAGGCTCTGTTCAAC\n" - + 
"GGGGACATGGAGCCCGAGGCCGGCGCCGGCGCCGGCGCCGCGGCCTCTTCGGCTGCGGAC\n" - + "CCTGCCATTCCGGAGGAGGTGTGGAATATCAAACAAATGATTAAGTTGACACAGGAACAT\n" - + "ATAGAGGCCCTATTGGACAAATTTGGTGGGGAGCATAATCCACCATCAATATATCTGGAG\n" - + "GCCTATGAAGAATACACCAGCAAGCTAGATGCACTCCAACAAAGAGAACAACAGTTATTG\n" - + "GAATCTCTGGGGAACGGAACTGATTTTTCTGTTTCTAGCTCTGCATCAATGGATACCGTT\n" - + "ACATCTTCTTCCTCTTCTAGCCTTTCAGTGCTACCTTCATCTCTTTCAGTTTTTCAAAAT\n" - + "CCCACAGATGTGGCACGGAGCAACCCCAAGTCACCACAAAAACCTATCGTTAGAGTCTTC\n" - + "CTGCCCAACAAACAGAGGACAGTGGTACCTGCAAGGTGTGGAGTTACAGTCCGAGACAGT\n" - + "CTAAAGAAAGCACTGATGATGAGAGGTCTAATCCCAGAGTGCTGTGCTGTTTACAGAATT\n" - + "CAGGATGGAGAGAAGAAACCAATTGGTTGGGACACTGATATTTCCTGGCTTACTGGAGAA\n" - + "GAATTGCATGTGGAAGTGTTGGAGAATGTTCCACTTACAACACACAACTTTGTACGAAAA\n" - + "ACGTTTTTCACCTTAGCATTTTGTGACTTTTGTCGAAAGCTGCTTTTCCAGGGTTTCCGC\n" - + "TGTCAAACATGTGGTTATAAATTTCACCAGCGTTGTAGTACAGAAGTTCCACTGATGTGT\n" - + "GTTAATTATGACCAACTTGATTTGCTGTTTGTCTCCAAGTTCTTTGAACACCACCCAATA\n" - + "CCACAGGAAGAGGCGTCCTTAGCAGAGACTGCCCTAACATCTGGATCATCCCCTTCCGCA\n" - + "CCCGCCTCGGACTCTATTGGGCCCCAAATTCTCACCAGTCCGTCTCCTTCAAAATCCATT\n" - + "CCAATTCCACAGCCCTTCCGACCAGCAGATGAAGATCATCGAAATCAATTTGGGCAACGA\n" - + "GACCGATCCTCATCAGCTCCCAATGTGCATATAAACACAATAGAACCTGTCAATATTGAT\n" - + "GACTTGATTAGAGACCAAGGATTTCGTGGTGATGGAGGATCAACCACAGGTTTGTCTGCT\n" - + "ACCCCCCCTGCCTCATTACCTGGCTCACTAACTAACGTGAAAGCCTTACAGAAATCTCCA\n" - + "GGACCTCAGCGAGAAAGGAAGTCATCTTCATCCTCAGAAGACAGGAATCGAATGAAAACA\n" - + "CTTGGTAGACGGGACTCGAGTGATGATTGGGAGATTCCTGATGGGCAGATTACAGTGGGA\n" - + "CAAAGAATTGGATCTGGATCATTTGGAACAGTCTACAAGGGAAAGTGGCATGGTGATGTG\n" - + "GCAGTGAAAATGTTGAATGTGACAGCACCTACACCTCAGCAGTTACAAGCCTTCAAAAAT\n" - + "GAAGTAGGAGTACTCAGGAAAACACGACATGTGAATATCCTACTCTTCATGGGCTATTCC\n" - + "ACAAAGCCACAACTGGCTATTGTTACCCAGTGGTGTGAGGGCTCCAGCTTGTATCACCAT\n" - + "CTCCATATCATTGAGACCAAATTTGAGATGATCAAACTTATAGATATTGCACGACAGACT\n" - + "GCACAGGGCATGGATTACTTACACGCCAAGTCAATCATCCACAGAGACCTCAAGAGTAAT\n" - + "AATATATTTCTTCATGAAGACCTCACAGTAAAAATAGGTGATTTTGGTCTAGCTACAGTG\n" - 
+ "AAATCTCGATGGAGTGGGTCCCATCAGTTTGAACAGTTGTCTGGATCCATTTTGTGGATG\n" - + "GCACCAGAAGTCATCAGAATGCAAGATAAAAATCCATACAGCTTTCAGTCAGATGTATAT\n" - + "GCATTTGGAATTGTTCTGTATGAATTGATGACTGGACAGTTACCTTATTCAAACATCAAC\n" - + "AACAGGGACCAGATAATTTTTATGGTGGGACGAGGATACCTGTCTCCAGATCTCAGTAAG\n" - + "GTACGGAGTAACTGTCCAAAAGCCATGAAGAGATTAATGGCAGAGTGCCTCAAAAAGAAA\n" - + "AGAGATGAGAGACCACTCTTTCCCCAAATTCTCGCCTCTATTGAGCTGCTGGCCCGCTCA\n" - + "TTGCCAAAAATTCACCGCAGTGCATCAGAACCCTCCTTGAATCGGGCTGGTTTCCAAACA\n" - + "GAGGATTTTAGTCTATATGCTTGTGCTTCTCCAAAAACACCCATCCAGGCAGGGGGATAT\n" - + "GGTGCGTTTCCTGTCCACTGA\n" }, - { - EnsemblSeqType.PROTEIN, - "ENSP00000288602", - ">ENSP00000288602\n" - + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n" - + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n" - + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n" - + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n" - + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n" - + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n" - + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n" - + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n" - + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n" - + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n" - + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n" - + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n" - + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH" } }; - - @DataProvider(name = "ens_seqs") - public Object[][] createData(Method m) - { - System.out.println(m.getName()); - return allSeqs; - } - - @Test(dataProvider = "ens_seqs", suiteName = "live") - public void testGetOneSeqs(EnsemblSeqType type, String sq, String fastasq) - throws Exception - { - SeqFetcher sf = new SeqFetcher(); - FileParse fp = sf.getSequenceReader(type, Arrays.asList(new String[] - { sq })); - SequenceI[] sqs = new 
FastaFile(fp).getSeqsAsArray(); - FastaFile trueRes = new FastaFile(fastasq, AppletFormatAdapter.PASTE); - SequenceI[] trueSqs = trueRes.getSeqsAsArray(); - Assert.assertEquals(sqs.length, trueSqs.length, - "Different number of sequences retrieved for query " + sq); - Alignment ral = new Alignment(sqs); - for (SequenceI tr : trueSqs) - { - SequenceI[] rseq; - Assert.assertNotNull( - rseq = ral.findSequenceMatch(tr.getName()), - "Couldn't find sequences matching expected sequence " - + tr.getName()); - Assert.assertEquals(rseq.length, 1, - "Expected only one sequence for sequence ID " + tr.getName()); - Assert.assertEquals( - rseq[0].getSequenceAsString(), - tr.getSequenceAsString(), - "Sequences differ for " + tr.getName() + "\n" + "Exp:" - + tr.getSequenceAsString() + "\n" + "Got:" - + rseq[0].getSequenceAsString()); - - } - } - - @Test(suiteName = "live") - public void testLiveCheckEnsembl() - { - SeqFetcher sf = new SeqFetcher(); - boolean isAvailable = sf.isEnsemblAvailable(); - System.out.println("Ensembl is " - + (isAvailable ? "UP!" - : "DOWN or unreachable ******************* BAD!")); - } - // TODO: - // sequence query with ENSG and anything other than a genomic type will yield - // sequences with different IDs which will - // break the post-processing stage where DBRefs are assigned to sequences. - // -> multiple_sequences = true is needed additional parameter - // http://rest.ensembl.org/sequence/id/ENSG00000157764?content-type=text/x-json;type=protein;multiple_sequences=true - // result with four transcripts, cds, cdna, and protein products. - // * - // features for ENG - - // http://rest.ensembl.org/overlap/id/ENSG00000157764?feature=cds&feature=exon&feature=transcript&content-type=text/x-gff3 - // transcript: gives locus, all transcript products with ENSG parents - // gene: give all ENSG on locus - // exon: all exon boundaries. CDS same info. 
- - // @Test(dataProvider = "ens_seqs", suiteName = "live") - // public void testGetOneSeqs(EnsemblSeqType type, String sq, String fastasq) - // throws Exception - // { - // - // { - // Assert.assertTrue(rseq[0].getDBRef() != null - // && rseq[0].getDBRef().length > 0, - // "No database references added to sequence by fetcher."); - // Assert.assertNotNull(DBRefUtils.searchRefs(rseq[0].getDBRef(), - // new DBRefEntry("ENSEMBL", null, sq)), - // "Could't find database references added to sequence by fetcher."); - // - // } - -} diff --git a/test/jalview/ext/jmol/JmolCommandsTest.java b/test/jalview/ext/jmol/JmolCommandsTest.java new file mode 100644 index 0000000..46fa241 --- /dev/null +++ b/test/jalview/ext/jmol/JmolCommandsTest.java @@ -0,0 +1,34 @@ +package jalview.ext.jmol; + +import jalview.datamodel.Alignment; +import jalview.datamodel.AlignmentI; +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceI; +import jalview.gui.AlignFrame; +import jalview.gui.SequenceRenderer; +import jalview.structure.StructureMappingcommandSet; +import jalview.structure.StructureSelectionManager; + +import org.testng.annotations.Test; + +public class JmolCommandsTest +{ + + @Test(groups = { "Functional" }) + public void testGetColourBySequenceCommand_noFeatures() + { + SequenceI seq1 = new Sequence("seq1", "MHRSQTRALK"); + SequenceI seq2 = new Sequence("seq2", "MRLEITQSGD"); + AlignmentI al = new Alignment(new SequenceI[] { seq1, seq2 }); + AlignFrame af = new AlignFrame(al, 800, 500); + SequenceRenderer sr = new SequenceRenderer(af.getViewport()); + SequenceI[][] seqs = new SequenceI[][] { { seq1 }, { seq2 } }; + String[] files = new String[] { "seq1.pdb", "seq2.pdb" }; + StructureSelectionManager ssm = new StructureSelectionManager(); + + // need some mappings! + + StructureMappingcommandSet[] commands = JmolCommands + .getColourBySequenceCommand(ssm, files, seqs, sr, null, al); + } +} -- 1.7.10.2