X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fext%2Fensembl%2FEnsemblSeqProxy.java;h=e4fa53da00a1e97ea485b5bc6046379c4b3b2cb9;hb=cb8e52fbbc5f725e3f7f48c672cdddb0690bd978;hp=e241874b850928798810e0bb6f01a6771a7565c9;hpb=56b5f4d5ca50971a34c9284bbb4b0507f7ba8a71;p=jalview.git diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index e241874..e4fa53d 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -1,73 +1,89 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see . + * The Jalview Authors are detailed in the 'AUTHORS' file. + */ package jalview.ext.ensembl; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import org.json.simple.parser.ParseException; + +import jalview.analysis.AlignmentUtils; +import jalview.analysis.Dna; +import jalview.bin.Console; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; import jalview.datamodel.Mapping; +import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; +import jalview.datamodel.features.SequenceFeatures; import jalview.exceptions.JalviewException; -import jalview.io.FastaFile; -import jalview.io.FileParse; -import jalview.io.gff.SequenceOntology; -import jalview.schemes.ResidueProperties; +import jalview.io.gff.Gff3Helper; +import jalview.io.gff.SequenceOntologyFactory; +import jalview.io.gff.SequenceOntologyI; +import jalview.util.Comparison; import jalview.util.DBRefUtils; +import jalview.util.IntRangeComparator; import jalview.util.MapList; -import jalview.util.MappingUtils; -import jalview.util.StringUtils; - -import java.io.IOException; -import java.net.MalformedURLException; -import java.net.URL; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.Comparator; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map.Entry; /** * Base class for Ensembl sequence fetchers * + * @see http://rest.ensembl.org/documentation/info/sequence_id * @author gmcarstairs */ public abstract class EnsemblSeqProxy extends EnsemblRestClient { - protected static final String CONSEQUENCE_TYPE = "consequence_type"; - - protected static final String PARENT = "Parent"; - - protected static final String ID = "ID"; + protected static final String DESCRIPTION = "description"; /* - * this needs special handling, as it isA sequence_variant in the - * Sequence Ontology, but behaves in Ensembl as if it isA transcript + * enum for 'type' parameter to the /sequence REST service */ - protected static final String NMD_VARIANT = "NMD_transcript_variant"; - - protected static final String NAME = "Name"; - public enum EnsemblSeqType { /** - * type=genomic for the full dna including introns + * type=genomic to fetch full dna including introns */ GENOMIC("genomic"), /** - * type=cdna for transcribed dna including UTRs + * type=cdna to fetch coding dna including UTRs */ CDNA("cdna"), /** - * type=cds for coding dna excluding UTRs + * type=cds to fetch coding dna excluding UTRs */ CDS("cds"), /** - * type=protein for the peptide product sequence + * type=protein to fetch peptide product sequence */ PROTEIN("protein"); @@ -90,30 +106,19 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** - * A comparator to sort ranges into ascending start position order + * Default constructor (to use rest.ensembl.org) */ - private class RangeSorter implements Comparator + public EnsemblSeqProxy() { - boolean forwards; - - RangeSorter(boolean forward) - { - forwards = forward; - } - - @Override - public int compare(int[] o1, int[] o2) - { - return (forwards ? 1 : -1) * Integer.compare(o1[0], o2[0]); - } - + super(); } /** - * Constructor + * Constructor given the target domain to fetch data from */ - public EnsemblSeqProxy() + public EnsemblSeqProxy(String d) { + super(d); } /** @@ -123,13 +128,12 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient @Override public AlignmentI getSequenceRecords(String query) throws Exception { - long now = System.currentTimeMillis(); // TODO use a String... query vararg instead? // danger: accession separator used as a regex here, a string elsewhere // in this case it is ok (it is just a space), but (e.g.) '\' would not be - List allIds = Arrays.asList(query - .split(getAccessionSeparator())); + List allIds = Arrays + .asList(query.split(getAccessionSeparator())); AlignmentI alignment = null; inProgress = true; @@ -152,29 +156,31 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient + " chunks. Unexpected problem (" + r.getLocalizedMessage() + ")"; System.err.println(msg); - if (alignment != null) - { - break; // return what we got - } - else - { - throw new JalviewException(msg, r); - } + r.printStackTrace(); + break; } } + if (alignment == null) + { + return null; + } + /* * fetch and transfer genomic sequence features, * fetch protein product and add as cross-reference */ - for (String accId : allIds) + for (int i = 0, n = allIds.size(); i < n; i++) + { + addFeaturesAndProduct(allIds.get(i), alignment); + } + + List seqs = alignment.getSequences(); + for (int i = 0, n = seqs.size(); i < n; i++) { - addFeaturesAndProduct(accId, alignment); + getCrossReferences(seqs.get(i)); } - inProgress = false; - System.out.println(getClass().getName() + " took " - + (System.currentTimeMillis() - now) + "ms to fetch"); return alignment; } @@ -197,23 +203,30 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient try { /* - * get 'dummy' genomic sequence with exon, cds and variation features + * get 'dummy' genomic sequence with gene, transcript, + * exon, cds and variation features */ SequenceI genomicSequence = null; - EnsemblOverlap gffFetcher = new EnsemblOverlap(); - EnsemblFeatureType[] features = getFeaturesToFetch(); + EnsemblFeatures gffFetcher = new EnsemblFeatures(getDomain()); + EnsemblFeatureType[] features = getFeaturesToFetch(); + + // Platform.timeCheck("ESP.getsequencerec1", Platform.TIME_MARK); + AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId, features); - if (geneFeatures.getHeight() > 0) + if (geneFeatures != null && geneFeatures.getHeight() > 0) { genomicSequence = geneFeatures.getSequenceAt(0); } + + // Platform.timeCheck("ESP.getsequencerec2", Platform.TIME_MARK); + if (genomicSequence != null) { /* * transfer features to the query sequence */ - SequenceI querySeq = alignment.findName(accId); + SequenceI querySeq = alignment.findName(accId, true); if (transferFeatures(accId, genomicSequence, querySeq)) { @@ -221,14 +234,16 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * fetch and map protein product, and add it as a cross-reference * of the retrieved sequence */ + // Platform.timeCheck("ESP.transferFeatures", Platform.TIME_MARK); addProteinProduct(querySeq); } } } catch (IOException e) { - System.err.println("Error transferring Ensembl features: " - + e.getMessage()); + System.err.println( + "Error transferring Ensembl features: " + e.getMessage()); } + // Platform.timeCheck("ESP.addfeat done", Platform.TIME_MARK); } /** @@ -250,10 +265,12 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient String accId = querySeq.getName(); try { - AlignmentI protein = new EnsemblProtein().getSequenceRecords(accId); + System.out.println("Adding protein product for " + accId); + AlignmentI protein = new EnsemblProtein(getDomain()) + .getSequenceRecords(accId); if (protein == null || protein.getHeight() == 0) { - System.out.println("Failed to retrieve protein for " + accId); + System.out.println("No protein product found for " + accId); return; } SequenceI proteinSeq = protein.getSequenceAt(0); @@ -264,19 +281,65 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient proteinSeq.createDatasetSequence(); querySeq.createDatasetSequence(); - MapList mapList = mapCdsToProtein(querySeq, proteinSeq); + MapList mapList = AlignmentUtils.mapCdsToProtein(querySeq, + proteinSeq); if (mapList != null) { - Mapping map = new Mapping(proteinSeq.getDatasetSequence(), mapList); - DBRefEntry dbr = new DBRefEntry(getDbSource(), getDbVersion(), - accId, map); + // clunky: ensure Uniprot xref if we have one is on mapped sequence + SequenceI ds = proteinSeq.getDatasetSequence(); + // TODO: Verify ensp primary ref is on proteinSeq.getDatasetSequence() + Mapping map = new Mapping(ds, mapList); + DBRefEntry dbr = new DBRefEntry(getDbSource(), + getEnsemblDataVersion(), proteinSeq.getName(), map); querySeq.getDatasetSequence().addDBRef(dbr); - + List uprots = DBRefUtils.selectRefs(ds.getDBRefs(), + new String[] + { DBRefSource.UNIPROT }); + List upxrefs = DBRefUtils.selectRefs(querySeq.getDBRefs(), + new String[] + { DBRefSource.UNIPROT }); + if (uprots != null) + { + for (DBRefEntry up : uprots) + { + // locate local uniprot ref and map + List upx = DBRefUtils.searchRefs(upxrefs, + up.getAccessionId()); + DBRefEntry upxref; + if (upx.size() != 0) + { + upxref = upx.get(0); + + if (upx.size() > 1) + { + Console.warn( + "Implementation issue - multiple uniprot acc on product sequence."); + } + } + else + { + upxref = new DBRefEntry(DBRefSource.UNIPROT, + getEnsemblDataVersion(), up.getAccessionId()); + } + + Mapping newMap = new Mapping(ds, mapList); + upxref.setVersion(getEnsemblDataVersion()); + upxref.setMap(newMap); + if (upx.size() == 0) + { + // add the new uniprot ref + querySeq.getDatasetSequence().addDBRef(upxref); + } + + } + } + /* - * compute peptide variants from dna variants and add as - * sequence features on the protein sequence ta-da + * copy exon features to protein, compute peptide variants from dna + * variants and add as features on the protein sequence ta-da */ - computeProteinFeatures(querySeq, proteinSeq, mapList); + // JAL-3187 render on the fly instead + // AlignmentUtils.computeProteinFeatures(querySeq, proteinSeq, mapList); } } catch (Exception e) { @@ -287,98 +350,52 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** - * Returns a mapping from dna to protein by inspecting sequence features of - * type "CDS" on the dna. + * Get database xrefs from Ensembl, and attach them to the sequence * - * @param dnaSeq - * @param proteinSeq - * @return + * @param seq */ - protected MapList mapCdsToProtein(SequenceI dnaSeq, SequenceI proteinSeq) + protected void getCrossReferences(SequenceI seq) { - List ranges = new ArrayList(50); + + // Platform.timeCheck("ESP. getdataseq ", Platform.TIME_MARK); - int mappedDnaLength = getCdsRanges(dnaSeq, ranges); + + while (seq.getDatasetSequence() != null) + { + seq = seq.getDatasetSequence(); + } - int proteinLength = proteinSeq.getLength(); - List proteinRange = new ArrayList(); - int proteinStart = 1; + // Platform.timeCheck("ESP. getxref ", Platform.TIME_MARK); - /* - * incomplete start codon may mean X at start of peptide - * we ignore both for mapping purposes - */ - if (proteinSeq.getCharAt(0) == 'X') + EnsemblXref xrefFetcher = new EnsemblXref(getDomain(), getDbSource(), + getEnsemblDataVersion()); + List xrefs = xrefFetcher.getCrossReferences(seq.getName()); + + for (int i = 0, n = xrefs.size(); i < n; i++) { - proteinStart = 2; - proteinLength--; +// Platform.timeCheck("ESP. getxref + " + (i) + "/" + n, Platform.TIME_MARK); + // BH 2019.01.25 this next method was taking 174 ms PER addition for a 266-reference example. + // DBRefUtils.ensurePrimaries(seq) + // was at the end of seq.addDBRef, so executed after ever addition! + // This method was moved to seq.getPrimaryDBRefs() + seq.addDBRef(xrefs.get(i)); } - proteinRange.add(new int[] { proteinStart, proteinLength }); +// System.out.println("primaries are " + seq.getPrimaryDBRefs().toString()); /* - * dna length should map to protein (or protein plus stop codon) + * and add a reference to itself */ - int codesForResidues = mappedDnaLength / 3; - if (codesForResidues == proteinLength - || codesForResidues == (proteinLength + 1)) - { - return new MapList(ranges, proteinRange, 3, 1); - } - return null; - } + +// Platform.timeCheck("ESP. getxref self ", Platform.TIME_MARK); - /** - * Adds CDS ranges to the ranges list, and returns the total length mapped. - * - * No need to worry about reverse strand dna here since the retrieved sequence - * is as transcribed (reverse complement for reverse strand), i.e in the same - * sense as the peptide. - * - * @param dnaSeq - * @param ranges - * @return - */ - protected int getCdsRanges(SequenceI dnaSeq, List ranges) - { - SequenceFeature[] sfs = dnaSeq.getSequenceFeatures(); - if (sfs == null) - { - return 0; - } - int mappedDnaLength = 0; - for (SequenceFeature sf : sfs) - { - /* - * process a CDS feature (or a sub-type of CDS) - */ - if (SequenceOntology.getInstance().isA(sf.getType(), SequenceOntology.CDS)) - { - int phase = 0; - try { - phase = Integer.parseInt(sf.getPhase()); - } catch (NumberFormatException e) - { - // ignore - } - /* - * phase > 0 on first codon means 5' incomplete - skip to the start - * of the next codon; example ENST00000496384 - */ - int begin = sf.getBegin(); - int end = sf.getEnd(); - if (ranges.isEmpty() && phase > 0) - { - begin += phase; - if (begin > end) - { - continue; // shouldn't happen? - } - } - ranges.add(new int[] { begin, end }); - mappedDnaLength += Math.abs(end - begin) + 1; - } - } - return mappedDnaLength; + DBRefEntry self = new DBRefEntry(getDbSource(), getEnsemblDataVersion(), + seq.getName()); + +// Platform.timeCheck("ESP. getxref self add ", Platform.TIME_MARK); + + seq.addDBRef(self); + + // Platform.timeCheck("ESP. seqprox done ", Platform.TIME_MARK); } /** @@ -391,52 +408,50 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * @throws JalviewException * @throws IOException */ - protected AlignmentI fetchSequences(List ids, AlignmentI alignment) - throws JalviewException, IOException + protected AlignmentI fetchSequences(List ids, + AlignmentI alignment) throws JalviewException, IOException { if (!isEnsemblAvailable()) { inProgress = false; throw new JalviewException("ENSEMBL Rest API not available."); } - FileParse fp = getSequenceReader(ids); - FastaFile fr = new FastaFile(fp); - if (fr.hasWarningMessage()) - { - System.out.println(String.format( - "Warning when retrieving %d ids %s\n%s", ids.size(), - ids.toString(), fr.getWarningMessage())); - } - else if (fr.getSeqs().size() != ids.size()) + // Platform.timeCheck("EnsemblSeqProx.fetchSeq ", Platform.TIME_MARK); + + List seqs = parseSequenceJson(ids); + if (seqs == null) + return alignment; + + if (seqs.isEmpty()) { - System.out.println(String.format( - "Only retrieved %d sequences for %d query strings", fr - .getSeqs().size(), ids.size())); + throw new IOException("No data returned for " + ids); } - if (fr.getSeqs().size() == 1 && fr.getSeqs().get(0).getLength() == 0) + if (seqs.size() != ids.size()) { - /* - * POST request has returned an empty FASTA file e.g. for invalid id - */ - throw new IOException("No data returned for " + ids); + System.out.println(String.format( + "Only retrieved %d sequences for %d query strings", + seqs.size(), ids.size())); } - if (fr.getSeqs().size() > 0) + if (!seqs.isEmpty()) { AlignmentI seqal = new Alignment( - fr.getSeqsAsArray()); - for (SequenceI sq:seqal.getSequences()) + seqs.toArray(new SequenceI[seqs.size()])); + for (SequenceI seq : seqs) { - if (sq.getDescription() == null) + if (seq.getDescription() == null) { - sq.setDescription(getDbName()); + seq.setDescription(getDbName()); } - String name = sq.getName(); + String name = seq.getName(); if (ids.contains(name) || ids.contains(name.replace("ENSP", "ENST"))) { - DBRefUtils.parseToDbRef(sq, DBRefSource.ENSEMBL, "0", name); + // TODO JAL-3077 use true accession version in dbref + DBRefEntry dbref = DBRefUtils.parseToDbRef(seq, getDbSource(), + getEnsemblDataVersion(), name); + seq.addDBRef(dbref); } } if (alignment == null) @@ -452,6 +467,53 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** + * Parses a JSON response for a single sequence ID query + * + * @param br + * @return a single jalview.datamodel.Sequence + * @see http://rest.ensembl.org/documentation/info/sequence_id + */ + @SuppressWarnings("unchecked") + protected List parseSequenceJson(List ids) + { + List result = new ArrayList<>(); + try + { + /* + * for now, assumes only one sequence returned; refactor if needed + * in future to handle a JSONArray with more than one + */ + // Platform.timeCheck("ENS seqproxy", Platform.TIME_MARK); + Map val = (Map) getJSON(null, ids, -1, MODE_MAP, null); + if (val == null) + return null; + Object s = val.get("desc"); + String desc = s == null ? null : s.toString(); + s = val.get("id"); + String id = s == null ? null : s.toString(); + s = val.get("seq"); + String seq = s == null ? null : s.toString(); + Sequence sequence = new Sequence(id, seq); + if (desc != null) + { + sequence.setDescription(desc); + } + // todo JAL-3077 make a DBRefEntry with true accession version + // s = val.get("version"); + // String version = s == null ? "0" : s.toString(); + // DBRefEntry dbref = new DBRefEntry(getDbSource(), version, id); + // sequence.addDBRef(dbref); + result.add(sequence); + } catch (ParseException | IOException e) + { + System.err.println("Error processing JSON response: " + e.toString()); + // ignore + } + // Platform.timeCheck("ENS seqproxy2", Platform.TIME_MARK); + return result; + } + + /** * Returns the URL for the REST call * * @return @@ -465,20 +527,38 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * multiple ids go in the POST body instead */ StringBuffer urlstring = new StringBuffer(128); - urlstring.append(SEQUENCE_ID_URL); + urlstring.append(getDomain() + "/sequence/id"); if (ids.size() == 1) { urlstring.append("/").append(ids.get(0)); } // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats urlstring.append("?type=").append(getSourceEnsemblType().getType()); - urlstring.append(("&Accept=text/x-fasta")); + urlstring.append(("&Accept=application/json")); + urlstring.append(("&content-type=application/json")); + + String objectType = getObjectType(); + if (objectType != null) + { + urlstring.append("&").append(OBJECT_TYPE).append("=") + .append(objectType); + } URL url = new URL(urlstring.toString()); return url; } /** + * Override this method to specify object_type request parameter + * + * @return + */ + protected String getObjectType() + { + return null; + } + + /** * A sequence/id POST request currently allows up to 50 queries * * @see http://rest.ensembl.org/documentation/info/sequence_id_post @@ -495,18 +575,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient return false; } - @Override - protected String getRequestMimeType(boolean multipleIds) - { - return multipleIds ? "application/json" : "text/x-fasta"; - } - - @Override - protected String getResponseMimeType() - { - return "text/x-fasta"; - } - /** * * @return the configured sequence return type for this source @@ -539,11 +607,12 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * the start position of the sequence we are mapping to * @return */ - protected MapList getGenomicRanges(SequenceI sourceSequence, + protected MapList getGenomicRangesFromFeatures(SequenceI sourceSequence, String accId, int start) { - SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); - if (sfs == null) + List sfs = getIdentifyingFeatures(sourceSequence, + accId); + if (sfs.isEmpty()) { return null; } @@ -552,55 +621,40 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * generously initial size for number of cds regions * (worst case titin Q8WZ42 has c. 313 exons) */ - List regions = new ArrayList(100); + List regions = new ArrayList<>(100); int mappedLength = 0; int direction = 1; // forward boolean directionSet = false; - + for (SequenceFeature sf : sfs) { + int strand = sf.getStrand(); + strand = strand == 0 ? 1 : strand; // treat unknown as forward + + if (directionSet && strand != direction) + { + // abort - mix of forward and backward + System.err + .println("Error: forward and backward strand for " + accId); + return null; + } + direction = strand; + directionSet = true; + /* - * accept the target feature type or a specialisation of it - * (e.g. coding_exon for exon) + * add to CDS ranges, semi-sorted forwards/backwards */ - if (identifiesSequence(sf, accId)) + if (strand < 0) { - int strand = sf.getStrand(); - - if (directionSet && strand != direction) - { - // abort - mix of forward and backward - System.err.println("Error: forward and backward strand for " - + accId); - return null; - } - direction = strand; - directionSet = true; - - /* - * add to CDS ranges, semi-sorted forwards/backwards - */ - if (strand < 0) - { - regions.add(0, new int[] { sf.getEnd(), sf.getBegin() }); - } - else - { - regions.add(new int[] { sf.getBegin(), sf.getEnd() }); - } - mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1); - - if (!isSpliceable()) - { - /* - * 'gene' sequence is contiguous so we can stop as soon as its - * identifying feature has been found - */ - break; - } + regions.add(0, new int[] { sf.getEnd(), sf.getBegin() }); + } + else + { + regions.add(new int[] { sf.getBegin(), sf.getEnd() }); } + mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1); } - + if (regions.isEmpty()) { System.out.println("Failed to identify target sequence for " + accId @@ -612,36 +666,32 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * a final sort is needed since Ensembl returns CDS sorted within source * (havana / ensembl_havana) */ - Collections.sort(regions, new RangeSorter(direction == 1)); - - List to = new ArrayList(); - to.add(new int[] { start, start + mappedLength - 1 }); - - return new MapList(regions, to, 1, 1); - } + Collections.sort(regions, direction == 1 ? IntRangeComparator.ASCENDING + : IntRangeComparator.DESCENDING); - /** - * Answers true if the sequence being retrieved may occupy discontiguous - * regions on the genomic sequence. - */ - protected boolean isSpliceable() - { - return true; + List to = Arrays + .asList(new int[] + { start, start + mappedLength - 1 }); + + return new MapList(regions, to, 1, 1); } /** - * Returns true if the sequence feature marks positions of the genomic + * Answers a list of sequence features that mark positions of the genomic * sequence feature which are within the sequence being retrieved. For * example, an 'exon' feature whose parent is the target transcript marks the - * cdna positions of the transcript. + * cdna positions of the transcript. For a gene sequence, this is trivially + * just the 'gene' feature with matching gene id. * - * @param sf + * @param seq * @param accId * @return */ - protected abstract boolean identifiesSequence(SequenceFeature sf, - String accId); - + protected abstract List getIdentifyingFeatures( + SequenceI seq, String accId); + + int bhtest = 0; + /** * Transfers the sequence feature to the target sequence, locating its start * and end range based on the mapping. Features which do not overlap the @@ -652,34 +702,96 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * @param mapping * mapping from the sequence feature's coordinates to the target * sequence + * @param forwardStrand */ protected void transferFeature(SequenceFeature sf, - SequenceI targetSequence, MapList mapping) + SequenceI targetSequence, MapList mapping, boolean forwardStrand) { int start = sf.getBegin(); int end = sf.getEnd(); int[] mappedRange = mapping.locateInTo(start, end); - + if (mappedRange != null) { - SequenceFeature copy = new SequenceFeature(sf); - copy.setBegin(Math.min(mappedRange[0], mappedRange[1])); - copy.setEnd(Math.max(mappedRange[0], mappedRange[1])); +// Platform.timeCheck(null, Platform.TIME_SET); + String group = sf.getFeatureGroup(); + if (".".equals(group)) + { + group = getDbSource(); + } + int newBegin = Math.min(mappedRange[0], mappedRange[1]); + int newEnd = Math.max(mappedRange[0], mappedRange[1]); +// Platform.timeCheck(null, Platform.TIME_MARK); + bhtest++; + // 280 ms/1000 here: + SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd, group, sf.getScore()); + // 0.175 ms here: targetSequence.addSequenceFeature(copy); /* - * for sequence_variant, make an additional feature with consequence + * for sequence_variant on reverse strand, have to convert the allele + * values to their complements */ - if (SequenceOntology.getInstance().isSequenceVariant(sf.getType())) + if (!forwardStrand && SequenceOntologyFactory.getInstance() + .isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT)) { - String consequence = (String) sf.getValue(CONSEQUENCE_TYPE); - if (consequence != null) - { - SequenceFeature sf2 = new SequenceFeature("consequence", - consequence, copy.getBegin(), copy.getEnd(), 0f, - null); - targetSequence.addSequenceFeature(sf2); - } + reverseComplementAlleles(copy); + } + } + } + + /** + * Change the 'alleles' value of a feature by converting to complementary + * bases, and also update the feature description to match + * + * @param sf + */ + static void reverseComplementAlleles(SequenceFeature sf) + { + final String alleles = (String) sf.getValue(Gff3Helper.ALLELES); + if (alleles == null) + { + return; + } + StringBuilder complement = new StringBuilder(alleles.length()); + for (String allele : alleles.split(",")) + { + reverseComplementAllele(complement, allele); + } + String comp = complement.toString(); + sf.setValue(Gff3Helper.ALLELES, comp); + sf.setDescription(comp); + } + + /** + * Makes the 'reverse complement' of the given allele and appends it to the + * buffer, after a comma separator if not the first + * + * @param complement + * @param allele + */ + static void reverseComplementAllele(StringBuilder complement, + String allele) + { + if (complement.length() > 0) + { + complement.append(","); + } + + /* + * some 'alleles' are actually descriptive terms + * e.g. HGMD_MUTATION, PhenCode_variation + * - we don't want to 'reverse complement' these + */ + if (!Comparison.isNucleotideSequence(allele, true)) + { + complement.append(allele); + } + else + { + for (int i = allele.length() - 1; i >= 0; i--) + { + complement.append(Dna.getComplement(allele.charAt(i))); } } } @@ -700,15 +812,25 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient return false; } - SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); - MapList mapping = getGenomicRanges(sourceSequence, accessionId, - targetSequence.getStart()); +// long start = System.currentTimeMillis(); + List sfs = sourceSequence.getFeatures() + .getPositionalFeatures(); + MapList mapping = getGenomicRangesFromFeatures(sourceSequence, + accessionId, targetSequence.getStart()); if (mapping == null) - { + { return false; } - return transferFeatures(sfs, targetSequence, mapping, accessionId); + // Platform.timeCheck("ESP. xfer " + sfs.size(), Platform.TIME_MARK); + + boolean result = transferFeatures(sfs, targetSequence, mapping, + accessionId); +// System.out.println("transferFeatures (" + (sfs.size()) + " --> " +// + targetSequence.getFeatures().getFeatureCount(true) + ") to " +// + targetSequence.getName() + " took " +// + (System.currentTimeMillis() - start) + "ms"); + return result; } /** @@ -716,40 +838,42 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * converted using the mapping. Features which do not overlap are ignored. * Features whose parent is not the specified identifier are also ignored. * - * @param features + * @param sfs * @param targetSequence * @param mapping * @param parentId * @return */ - protected boolean transferFeatures(SequenceFeature[] features, + protected boolean transferFeatures(List sfs, SequenceI targetSequence, MapList mapping, String parentId) { final boolean forwardStrand = mapping.isFromForwardStrand(); /* - * sort features by start position (descending if reverse strand) - * before transferring (in forwards order) to the target sequence + * sort features by start position (which corresponds to end + * position descending if reverse strand) so as to add them in + * 'forwards' order to the target sequence */ - Arrays.sort(features, new Comparator() - { - @Override - public int compare(SequenceFeature o1, SequenceFeature o2) - { - int c = Integer.compare(o1.getBegin(), o2.getBegin()); - return forwardStrand ? c : -c; - } - }); + SequenceFeatures.sortFeatures(sfs, forwardStrand); boolean transferred = false; - for (SequenceFeature sf : features) + + for (int i = 0, n = sfs.size(); i < n; i++) { + +// if ((i%1000) == 0) { +//// Platform.timeCheck("Feature " + bhtest, Platform.TIME_GET); +// Platform.timeCheck("ESP. xferFeature + " + (i) + "/" + n, Platform.TIME_MARK); +// } + + SequenceFeature sf = sfs.get(i); if (retainFeature(sf, parentId)) { - transferFeature(sf, targetSequence, mapping); + transferFeature(sf, targetSequence, mapping, forwardStrand); transferred = true; } } + return transferred; } @@ -777,8 +901,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient protected boolean featureMayBelong(SequenceFeature sf, String identifier) { String parent = (String) sf.getValue(PARENT); - // using contains to allow for prefix "gene:", "transcript:" etc - if (parent != null && !parent.contains(identifier)) + if (parent != null + && !parent.equalsIgnoreCase(identifier)) { // this genomic feature belongs to a different transcript return false; @@ -786,6 +910,9 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient return true; } + /** + * Answers a short description of the sequence fetcher + */ @Override public String getDescription() { @@ -795,307 +922,51 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient /** * Returns a (possibly empty) list of features on the sequence which have the - * specified sequence ontology type (or a sub-type of it), and the given + * specified sequence ontology term (or a sub-type of it), and the given * identifier as parent * * @param sequence - * @param type + * @param term * @param parentId * @return */ protected List findFeatures(SequenceI sequence, - String type, String parentId) + String term, String parentId) { - List result = new ArrayList(); - - SequenceFeature[] sfs = sequence.getSequenceFeatures(); - if (sfs != null) { - SequenceOntology so = SequenceOntology.getInstance(); - for (SequenceFeature sf :sfs) { - if (so.isA(sf.getType(), type)) - { - String parent = (String) sf.getValue(PARENT); - if (parent.equals(parentId)) - { - result.add(sf); - } - } - } - } - return result; - } - - /** - * Maps exon features from dna to protein, and computes variants in peptide - * product generated by variants in dna, and adds them as sequence_variant - * features on the protein sequence. Returns the number of variant features - * added. - * - * @param dnaSeq - * @param peptide - * @param dnaToProtein - */ - static int computeProteinFeatures(SequenceI dnaSeq, - SequenceI peptide, MapList dnaToProtein) - { - while (dnaSeq.getDatasetSequence() != null) - { - dnaSeq = dnaSeq.getDatasetSequence(); - } - while (peptide.getDatasetSequence() != null) - { - peptide = peptide.getDatasetSequence(); - } - - mapExonFeaturesToProtein(dnaSeq, peptide, dnaToProtein); - - LinkedHashMap variants = buildDnaVariantsMap( - dnaSeq, dnaToProtein); - - /* - * scan codon variations, compute peptide variants and add to peptide sequence - */ - int count = 0; - for (Entry variant : variants.entrySet()) - { - int peptidePos = variant.getKey(); - String[][] codonVariants = variant.getValue(); - String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based - List peptideVariants = computePeptideVariants(codonVariants, - residue); - if (!peptideVariants.isEmpty()) - { - String desc = StringUtils.listToDelimitedString(peptideVariants, - ", "); - SequenceFeature sf = new SequenceFeature( - SequenceOntology.SEQUENCE_VARIANT, desc, peptidePos, - peptidePos, 0f, null); - peptide.addSequenceFeature(sf); - count++; - } - } - return count; - } - - /** - * Transfers exon features to the corresponding mapped regions of the protein - * sequence. This is useful because it allows visualisation of exon boundaries - * on the peptide (using 'colour by label' for the exon name). Returns the - * number of features written. - * - * @param dnaSeq - * @param peptide - * @param dnaToProtein - */ - static int mapExonFeaturesToProtein(SequenceI dnaSeq, SequenceI peptide, - MapList dnaToProtein) - { - SequenceFeature[] sfs = dnaSeq.getSequenceFeatures(); - if (sfs == null) - { - return 0; - } - - SequenceOntology so = SequenceOntology.getInstance(); - int count = 0; + List result = new ArrayList<>(); + List sfs = sequence.getFeatures() + .getFeaturesByOntology(term); for (SequenceFeature sf : sfs) { - if (so.isA(sf.getType(), SequenceOntology.EXON)) + String parent = (String) sf.getValue(PARENT); + if (parent != null && parent.equalsIgnoreCase(parentId)) { - int start = sf.getBegin(); - int end = sf.getEnd(); - int[] mapsTo = dnaToProtein.locateInTo(start, end); - if (mapsTo != null) - { - SequenceFeature copy = new SequenceFeature(SequenceOntology.EXON, - sf.getDescription(), mapsTo[0], mapsTo[1], 0f, null); - peptide.addSequenceFeature(copy); - count++; - } + result.add(sf); } } - return count; - } - /** - * Builds a map whose key is position in the protein sequence, and value is an - * array of all variants for the coding codon positions - * - * @param dnaSeq - * @param dnaToProtein - * @return - */ - static LinkedHashMap buildDnaVariantsMap( - SequenceI dnaSeq, MapList dnaToProtein) - { - /* - * map from peptide position to all variant features of the codon for it - * LinkedHashMap ensures we add the peptide features in sequence order - */ - LinkedHashMap variants = new LinkedHashMap(); - SequenceOntology so = SequenceOntology.getInstance(); - - SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures(); - if (dnaFeatures == null) - { - return variants; - } - - int dnaStart = dnaSeq.getStart(); - int[] lastCodon = null; - int lastPeptidePostion = 0; - - /* - * build a map of codon variations for peptides - */ - for (SequenceFeature sf : dnaFeatures) - { - int dnaCol = sf.getBegin(); - if (dnaCol != sf.getEnd()) - { - // not handling multi-locus variant features - continue; - } - if (so.isSequenceVariant(sf.getType())) - { - int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol); - if (mapsTo == null) - { - // feature doesn't lie within coding region - continue; - } - int peptidePosition = mapsTo[0]; - String[][] codonVariants = variants.get(peptidePosition); - if (codonVariants == null) - { - codonVariants = new String[3][]; - variants.put(peptidePosition, codonVariants); - } - - /* - * extract dna variants to a string array - */ - String alls = (String) sf.getValue("alleles"); - if (alls == null) - { - continue; - } - String[] alleles = alls.split(","); - - /* - * get this peptides codon positions e.g. [3, 4, 5] or [4, 7, 10] - */ - int[] codon = peptidePosition == lastPeptidePostion ? lastCodon - : MappingUtils.flattenRanges(dnaToProtein.locateInFrom( - peptidePosition, peptidePosition)); - lastPeptidePostion = peptidePosition; - lastCodon = codon; - - /* - * save nucleotide (and this variant) for each codon position - */ - for (int codonPos = 0; codonPos < 3; codonPos++) - { - String nucleotide = String.valueOf(dnaSeq - .getCharAt(codon[codonPos] - dnaStart)); - if (codon[codonPos] == dnaCol) - { - /* - * record current dna base and its alleles - */ - String[] dnaVariants = new String[alleles.length + 1]; - dnaVariants[0] = nucleotide; - System.arraycopy(alleles, 0, dnaVariants, 1, alleles.length); - codonVariants[codonPos] = dnaVariants; - } - else if (codonVariants[codonPos] == null) - { - /* - * record current dna base only - * (at least until we find any variation and overwrite it) - */ - codonVariants[codonPos] = new String[] { nucleotide }; - } - } - } - } - return variants; - } - - /** - * Returns a sorted, non-redundant list of all peptide translations generated - * by the given dna variants, excluding the current residue value - * - * @param codonVariants - * an array of base values (acgtACGT) for codon positions 1, 2, 3 - * @param residue - * the current residue translation - * @return - */ - static List computePeptideVariants( - String[][] codonVariants, String residue) - { - List result = new ArrayList(); - for (String base1 : codonVariants[0]) - { - for (String base2 : codonVariants[1]) - { - for (String base3 : codonVariants[2]) - { - String codon = base1 + base2 + base3; - // TODO: report frameshift/insertion/deletion - // and multiple-base variants?! - String peptide = codon.contains("-") ? "-" : ResidueProperties - .codonTranslate(codon); - if (peptide != null && !result.contains(peptide) - && !peptide.equalsIgnoreCase(residue)) - { - result.add(peptide); - } - } - } - } - - /* - * sort alphabetically with STOP at the end - */ - Collections.sort(result, new Comparator() - { - - @Override - public int compare(String o1, String o2) - { - if ("STOP".equals(o1)) - { - return 1; - } - else if ("STOP".equals(o2)) - { - return -1; - } - else - { - return o1.compareTo(o2); - } - } - }); return result; } /** * Answers true if the feature type is either 'NMD_transcript_variant' or - * 'transcript' or one of its sub-types in the Sequence Ontology. This is - * needed because NMD_transcript_variant behaves like 'transcript' in Ensembl + * 'transcript' (or one of its sub-types in the Sequence Ontology). This is + * because NMD_transcript_variant behaves like 'transcript' in Ensembl * although strictly speaking it is not (it is a sub-type of * sequence_variant). + *

+ * (This test was needed when fetching transcript features as GFF. As we are + * now fetching as JSON, all features have type 'transcript' so the check for + * NMD_transcript_variant is redundant. Left in for any future case arising.) * * @param featureType * @return */ public static boolean isTranscript(String featureType) { - return NMD_VARIANT.equals(featureType) - || SequenceOntology.getInstance().isA(featureType, SequenceOntology.TRANSCRIPT); + return SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(featureType) + || SequenceOntologyFactory.getInstance().isA(featureType, + SequenceOntologyI.TRANSCRIPT); } }