package jalview.ext.ensembl; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; import jalview.datamodel.Mapping; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.exceptions.JalviewException; import jalview.io.FastaFile; import jalview.io.FileParse; import jalview.io.gff.SequenceOntology; import jalview.util.DBRefUtils; import jalview.util.MapList; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.List; /** * Base class for Ensembl sequence fetchers * * @author gmcarstairs */ public abstract class EnsemblSeqProxy extends EnsemblRestClient { protected static final String PARENT = "Parent"; protected static final String ID = "ID"; public enum EnsemblSeqType { /** * type=genomic for the full dna including introns */ GENOMIC("genomic"), /** * type=cdna for transcribed dna including UTRs */ CDNA("cdna"), /** * type=cds for coding dna excluding UTRs */ CDS("cds"), /** * type=protein for the peptide product sequence */ PROTEIN("protein"); /* * the value of the 'type' parameter to fetch this version of * an Ensembl sequence */ private String type; EnsemblSeqType(String t) { type = t; } public String getType() { return type; } } /** * A comparator to sort ranges into ascending start position order */ private class RangeSorter implements Comparator { boolean forwards; RangeSorter(boolean forward) { forwards = forward; } @Override public int compare(int[] o1, int[] o2) { return (forwards ? 1 : -1) * Integer.compare(o1[0], o2[0]); } }; /** * Constructor */ public EnsemblSeqProxy() { } /** * Makes the sequence queries to Ensembl's REST service and returns an * alignment consisting of the returned sequences */ @Override public AlignmentI getSequenceRecords(String query) throws Exception { long now = System.currentTimeMillis(); // TODO use a String... query vararg instead? // danger: accession separator used as a regex here, a string elsewhere // in this case it is ok (it is just a space), but (e.g.) '\' would not be List allIds = Arrays.asList(query.split(getAccessionSeparator())); AlignmentI alignment = null; inProgress = true; /* * execute queries, if necessary in batches of the * maximum allowed number of ids */ int maxQueryCount = getMaximumQueryCount(); for (int v = 0, vSize = allIds.size(); v < vSize; v += maxQueryCount) { int p = Math.min(vSize, v + maxQueryCount); List ids = allIds.subList(v, p); try { alignment = fetchSequences(ids, alignment); } catch (Throwable r) { inProgress = false; String msg = "Aborting ID retrieval after " + v + " chunks. Unexpected problem (" + r.getLocalizedMessage() + ")"; System.err.println(msg); if (alignment != null) { break; // return what we got } else { throw new JalviewException(msg, r); } } } /* * fetch and transfer genomic sequence features */ for (String accId : allIds) { addFeaturesAndProduct(accId, alignment); } inProgress = false; System.out.println(getClass().getName() + " took " + (System.currentTimeMillis() - now) + "ms to fetch"); return alignment; } /** * Fetches Ensembl features using the /overlap REST endpoint, and adds them to * the sequence in the alignment. Also fetches the protein product, maps it * from the CDS features of the sequence, and saves it as a cross-reference of * the dna sequence. * * @param accId * @param alignment */ protected void addFeaturesAndProduct(String accId, AlignmentI alignment) { try { /* * get 'dummy' genomic sequence with exon, cds and variation features */ EnsemblOverlap gffFetcher = new EnsemblOverlap(); EnsemblFeatureType[] features = getFeaturesToFetch(); AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId, features); if (geneFeatures.getHeight() > 0) { /* * transfer features to the query sequence */ SequenceI genomicSequence = geneFeatures.getSequenceAt(0); SequenceI querySeq = alignment.findName(accId); transferFeatures(accId, genomicSequence, querySeq); /* * fetch and map protein product, and add it as a cross-reference * of the retrieved sequence */ addProteinProduct(querySeq); } } catch (IOException e) { System.err.println("Error transferring Ensembl features: " + e.getMessage()); } } /** * Returns those sequence feature types to fetch from Ensembl. We may want * features either because they are of interest to the user, or as means to * identify the locations of the sequence on the genomic sequence (CDS * features identify CDS, exon features identify cDNA etc). * * @return */ protected abstract EnsemblFeatureType[] getFeaturesToFetch(); /** * Fetches and maps the protein product, and adds it as a cross-reference of * the retrieved sequence */ protected void addProteinProduct(SequenceI querySeq) { String accId = querySeq.getName(); try { AlignmentI protein = new EnsemblProtein().getSequenceRecords(accId); if (protein == null || protein.getHeight() == 0) { System.out.println("Failed to retrieve protein for " + accId); return; } SequenceI proteinSeq = protein.getSequenceAt(0); /* * need dataset sequences (to be the subject of mappings) */ proteinSeq.createDatasetSequence(); querySeq.createDatasetSequence(); MapList mapList = mapCdsToProtein(querySeq, proteinSeq); if (mapList != null) { Mapping map = new Mapping(proteinSeq.getDatasetSequence(), mapList); DBRefEntry dbr = new DBRefEntry(getDbSource(), getDbVersion(), accId, map); querySeq.getDatasetSequence().addDBRef(dbr); } } catch (Exception e) { System.err .println(String.format("Error retrieving protein for %s: %s", accId, e.getMessage())); } } /** * Returns a mapping from dna to protein by inspecting sequence features of * type "CDS" on the dna. * * @param dnaSeq * @param proteinSeq * @return */ protected MapList mapCdsToProtein(SequenceI dnaSeq, SequenceI proteinSeq) { SequenceFeature[] sfs = dnaSeq.getSequenceFeatures(); if (sfs == null) { return null; } List ranges = new ArrayList(50); SequenceOntology so = SequenceOntology.getInstance(); int mappedDnaLength = 0; /* * Map CDS columns of dna to peptide. No need to worry about reverse strand * dna here since the retrieved sequence is as transcribed (reverse * complement for reverse strand), i.e in the same sense as the peptide. */ for (SequenceFeature sf : sfs) { /* * process a CDS feature (or a sub-type of CDS) */ if (so.isA(sf.getType(), SequenceOntology.CDS)) { ranges.add(new int[] { sf.getBegin(), sf.getEnd() }); mappedDnaLength += Math.abs(sf.getEnd() - sf.getBegin()) + 1; } } int proteinLength = proteinSeq.getLength(); List proteinRange = new ArrayList(); proteinRange.add(new int[] { 1, proteinLength }); /* * dna length should map to protein (or protein minus stop codon) */ if (mappedDnaLength == 3 * proteinLength || mappedDnaLength == 3 * (proteinLength + 1)) { return new MapList(ranges, proteinRange, 3, 1); } return null; } /** * Fetches sequences for the list of accession ids and adds them to the * alignment. Returns the extended (or created) alignment. * * @param ids * @param alignment * @return * @throws JalviewException * @throws IOException */ protected AlignmentI fetchSequences(List ids, AlignmentI alignment) throws JalviewException, IOException { if (!isEnsemblAvailable()) { inProgress = false; throw new JalviewException("ENSEMBL Rest API not available."); } FileParse fp = getSequenceReader(ids); FastaFile fr = new FastaFile(fp); if (fr.hasWarningMessage()) { System.out.println(String.format( "Warning when retrieving %d ids %s\n%s", ids.size(), ids.toString(), fr.getWarningMessage())); } else if (fr.getSeqs().size() != ids.size()) { System.out.println(String.format( "Only retrieved %d sequences for %d query strings", fr .getSeqs().size(), ids.size())); } if (fr.getSeqs().size() > 0) { AlignmentI seqal = new Alignment( fr.getSeqsAsArray()); for (SequenceI sq:seqal.getSequences()) { if (sq.getDescription() == null) { sq.setDescription(getDbName()); } String name = sq.getName(); if (ids.contains(name) || ids.contains(name.replace("ENSP", "ENST"))) { DBRefUtils.parseToDbRef(sq, DBRefSource.ENSEMBL, "0", name); } } if (alignment == null) { alignment = seqal; } else { alignment.append(seqal); } } return alignment; } /** * Returns the URL for the REST call * * @return * @throws MalformedURLException */ @Override protected URL getUrl(List ids) throws MalformedURLException { // ids are not used - they go in the POST body instead StringBuffer urlstring = new StringBuffer(128); urlstring.append(SEQUENCE_ID_URL); // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats urlstring.append("?type=").append(getSourceEnsemblType().getType()); urlstring.append(("&Accept=text/x-fasta")); URL url = new URL(urlstring.toString()); return url; } /** * A sequence/id POST request currently allows up to 50 queries * * @see http://rest.ensembl.org/documentation/info/sequence_id_post */ @Override public int getMaximumQueryCount() { return 50; } @Override public boolean useGetRequest() { return false; } @Override public String getRequestMimeType() { return "application/json"; } @Override public String getResponseMimeType() { return "text/x-fasta"; } /** * * @return the configured sequence return type for this source */ protected abstract EnsemblSeqType getSourceEnsemblType(); /** * Returns a list of [start, end] genomic ranges corresponding to the sequence * being retrieved. * * The correspondence between the frames of reference is made by locating * those features on the genomic sequence which identify the retrieved * sequence. Specifically *
    *
  • genomic sequence is identified by "transcript" features with * ID=transcript:transcriptId
  • *
  • cdna sequence is identified by "exon" features with * Parent=transcript:transcriptId
  • *
  • cds sequence is identified by "CDS" features with * Parent=transcript:transcriptId
  • *
* * The returned ranges are sorted to run forwards (for positive strand) or * backwards (for negative strand). Aborts and returns null if both positive * and negative strand are found (this should not normally happen). * * @param sfs * @param accId * @return */ protected MapList getGenomicRanges(SequenceFeature[] sfs, String accId) { /* * generously size for initial number of cds regions * (worst case titin Q8WZ42 has c. 313 exons) */ List regions = new ArrayList(100); int mappedLength = 0; int direction = 1; // forward boolean directionSet = false; for (SequenceFeature sf : sfs) { /* * accept the target feature type or a specialisation of it * (e.g. coding_exon for exon) */ if (identifiesSequence(sf, accId)) { int strand = sf.getStrand(); if (directionSet && strand != direction) { // abort - mix of forward and backward System.err.println("Error: forward and backward strand for " + accId); return null; } direction = strand; directionSet = true; /* * add to CDS ranges, semi-sorted forwards/backwards */ if (strand < 0) { regions.add(0, new int[] { sf.getEnd(), sf.getBegin() }); } else { regions.add(new int[] { sf.getBegin(), sf.getEnd() }); } mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1); } } /* * a final sort is needed since Ensembl returns CDS sorted within source * (havana / ensembl_havana) */ Collections.sort(regions, new RangeSorter(direction == 1)); List to = new ArrayList(); to.add(new int[] { 1, mappedLength }); return new MapList(regions, to, 1, 1); } /** * Returns true if the sequence feature identifies positions of the genomic * sequence feature which are within the sequence being retrieved. * * @param sf * @param accId * @return */ protected abstract boolean identifiesSequence(SequenceFeature sf, String accId); /** * Transfers the sequence feature to the target sequence, adjusting its start * and end range based on the 'overlap' ranges. Features which do not overlap * the target sequence are ignored, as are features with a parent other than * the target sequence id. * * @param sf * @param targetSequence * @param overlap */ protected void transferFeature(SequenceFeature sf, SequenceI targetSequence, MapList overlap) { String parent = (String) sf.getValue(PARENT); if (parent != null && !parent.contains(targetSequence.getName())) { // this genomic feature belongs to a different transcript return; } int start = sf.getBegin(); int end = sf.getEnd(); int[] mappedRange = overlap.locateInTo(start, end); if (mappedRange != null) { SequenceFeature copy = new SequenceFeature(sf); int offset = targetSequence.getStart() - 1; copy.setBegin(offset + Math.min(mappedRange[0], mappedRange[1])); copy.setEnd(offset + Math.max(mappedRange[0], mappedRange[1])); targetSequence.addSequenceFeature(copy); /* * for sequence_variant, make an additional feature with consequence */ if (SequenceOntology.getInstance().isSequenceVariant(sf.getType())) { String consequence = (String) sf.getValue("consequence_type"); if (consequence != null) { SequenceFeature sf2 = new SequenceFeature("consequence", consequence, copy.getBegin(), copy.getEnd(), 0f, null); targetSequence.addSequenceFeature(sf2); } } } } /** * Transfers features from sourceSequence to targetSequence * * @param accessionId * @param sourceSequence * @param targetSequence */ protected void transferFeatures(String accessionId, SequenceI sourceSequence, SequenceI targetSequence) { if (sourceSequence == null || targetSequence == null) { return; } SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); MapList overlap = getGenomicRanges(sfs, accessionId); final boolean forwardStrand = overlap.isFromForwardStrand(); /* * sort features by start position (descending if reverse strand) * before transferring (in forwards order) to the target sequence */ Arrays.sort(sfs, new Comparator() { @Override public int compare(SequenceFeature o1, SequenceFeature o2) { int c = Integer.compare(o1.getBegin(), o2.getBegin()); return forwardStrand ? c : -c; } }); for (SequenceFeature sf : sfs) { if (retainFeature(sf.getType())) { transferFeature(sf, targetSequence, overlap); } } } /** * Answers true if the feature type is one to attach to the retrieved sequence * * @param type * @return */ protected boolean retainFeature(@SuppressWarnings("unused") String type) { return true; // default is to keep all } }