X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fext%2Fensembl%2FEnsemblSeqProxy.java;h=c86469f7b5ab393157107bd323bf0d8000bae725;hb=1c6ddae580d69eb0fa5b4291ba84fd6ba9b83621;hp=b8054176318b791cef3b6da0bebc74a497d39b21;hpb=ecc50d775a514f9840cdcc10a9d5b1e49c60582c;p=jalview.git diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index b805417..c86469f 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -1,5 +1,7 @@ package jalview.ext.ensembl; +import jalview.analysis.AlignmentUtils; +import jalview.analysis.Dna; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; @@ -10,7 +12,9 @@ import jalview.datamodel.SequenceI; import jalview.exceptions.JalviewException; import jalview.io.FastaFile; import jalview.io.FileParse; -import jalview.io.gff.SequenceOntology; +import jalview.io.gff.SequenceOntologyFactory; +import jalview.io.gff.SequenceOntologyI; +import jalview.util.Comparison; import jalview.util.DBRefUtils; import jalview.util.MapList; @@ -26,33 +30,43 @@ import java.util.List; /** * Base class for Ensembl sequence fetchers * + * @see http://rest.ensembl.org/documentation/info/sequence_id * @author gmcarstairs */ public abstract class EnsemblSeqProxy extends EnsemblRestClient { + private static final String ALLELES = "alleles"; + protected static final String PARENT = "Parent"; protected static final String ID = "ID"; + protected static final String NAME = "Name"; + + protected static final String DESCRIPTION = "description"; + + /* + * enum for 'type' parameter to the /sequence REST service + */ public enum EnsemblSeqType { /** - * type=genomic for the full dna including introns + * type=genomic to fetch full dna including introns */ GENOMIC("genomic"), /** - * type=cdna for transcribed dna including UTRs + * type=cdna to fetch coding dna including UTRs */ CDNA("cdna"), /** - * type=cds for coding dna excluding UTRs + * type=cds to fetch coding dna excluding UTRs */ CDS("cds"), /** - * type=protein for the peptide product sequence + * type=protein to fetch peptide product sequence */ PROTEIN("protein"); @@ -92,28 +106,37 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient return (forwards ? 1 : -1) * Integer.compare(o1[0], o2[0]); } - }; + } /** - * Constructor + * Default constructor (to use rest.ensembl.org) */ public EnsemblSeqProxy() { + super(); + } + + /** + * Constructor given the target domain to fetch data from + */ + public EnsemblSeqProxy(String d) + { + super(d); } /** * Makes the sequence queries to Ensembl's REST service and returns an - * alignment consisting of the returned sequences + * alignment consisting of the returned sequences. */ @Override public AlignmentI getSequenceRecords(String query) throws Exception { - long now = System.currentTimeMillis(); // TODO use a String... query vararg instead? // danger: accession separator used as a regex here, a string elsewhere // in this case it is ok (it is just a space), but (e.g.) '\' would not be - List allIds = Arrays.asList(query.split(getAccessionSeparator())); + List allIds = Arrays.asList(query + .split(getAccessionSeparator())); AlignmentI alignment = null; inProgress = true; @@ -136,28 +159,29 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient + " chunks. Unexpected problem (" + r.getLocalizedMessage() + ")"; System.err.println(msg); - if (alignment != null) - { - break; // return what we got - } - else - { - throw new JalviewException(msg, r); - } + break; } } + if (alignment == null) + { + return null; + } + /* - * fetch and transfer genomic sequence features + * fetch and transfer genomic sequence features, + * fetch protein product and add as cross-reference */ for (String accId : allIds) { addFeaturesAndProduct(accId, alignment); } - inProgress = false; - System.out.println(getClass().getName() + " took " - + (System.currentTimeMillis() - now) + "ms to fetch"); + for (SequenceI seq : alignment.getSequences()) + { + getCrossReferences(seq); + } + return alignment; } @@ -172,29 +196,40 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ protected void addFeaturesAndProduct(String accId, AlignmentI alignment) { + if (alignment == null) + { + return; + } + try { /* * get 'dummy' genomic sequence with exon, cds and variation features */ - EnsemblOverlap gffFetcher = new EnsemblOverlap(); + SequenceI genomicSequence = null; + EnsemblFeatures gffFetcher = new EnsemblFeatures(getDomain()); EnsemblFeatureType[] features = getFeaturesToFetch(); AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId, features); if (geneFeatures.getHeight() > 0) { + genomicSequence = geneFeatures.getSequenceAt(0); + } + if (genomicSequence != null) + { /* * transfer features to the query sequence */ - SequenceI genomicSequence = geneFeatures.getSequenceAt(0); SequenceI querySeq = alignment.findName(accId); - transferFeatures(accId, genomicSequence, querySeq); + if (transferFeatures(accId, genomicSequence, querySeq)) + { - /* - * fetch and map protein product, and add it as a cross-reference - * of the retrieved sequence - */ - addProteinProduct(querySeq); + /* + * fetch and map protein product, and add it as a cross-reference + * of the retrieved sequence + */ + addProteinProduct(querySeq); + } } } catch (IOException e) { @@ -222,10 +257,11 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient String accId = querySeq.getName(); try { - AlignmentI protein = new EnsemblProtein().getSequenceRecords(accId); + AlignmentI protein = new EnsemblProtein(getDomain()) + .getSequenceRecords(accId); if (protein == null || protein.getHeight() == 0) { - System.out.println("Failed to retrieve protein for " + accId); + System.out.println("No protein product found for " + accId); return; } SequenceI proteinSeq = protein.getSequenceAt(0); @@ -236,13 +272,23 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient proteinSeq.createDatasetSequence(); querySeq.createDatasetSequence(); - MapList mapList = mapCdsToProtein(querySeq, proteinSeq); + MapList mapList = AlignmentUtils.mapCdsToProtein(querySeq, proteinSeq); if (mapList != null) { - Mapping map = new Mapping(proteinSeq.getDatasetSequence(), mapList); - DBRefEntry dbr = new DBRefEntry(getDbSource(), getDbVersion(), - accId, map); + // clunky: ensure Uniprot xref if we have one is on mapped sequence + SequenceI ds = proteinSeq.getDatasetSequence(); + ds.setSourceDBRef(proteinSeq.getSourceDBRef()); + + Mapping map = new Mapping(ds, mapList); + DBRefEntry dbr = new DBRefEntry(getDbSource(), + getEnsemblDataVersion(), proteinSeq.getName(), map); querySeq.getDatasetSequence().addDBRef(dbr); + + /* + * copy exon features to protein, compute peptide variants from dna + * variants and add as features on the protein sequence ta-da + */ + AlignmentUtils.computeProteinFeatures(querySeq, proteinSeq, mapList); } } catch (Exception e) { @@ -253,55 +299,37 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** - * Returns a mapping from dna to protein by inspecting sequence features of - * type "CDS" on the dna. + * Get database xrefs from Ensembl, and attach them to the sequence * - * @param dnaSeq - * @param proteinSeq - * @return + * @param seq */ - protected MapList mapCdsToProtein(SequenceI dnaSeq, SequenceI proteinSeq) + protected void getCrossReferences(SequenceI seq) { - SequenceFeature[] sfs = dnaSeq.getSequenceFeatures(); - if (sfs == null) + while (seq.getDatasetSequence() != null) { - return null; + seq = seq.getDatasetSequence(); } - List ranges = new ArrayList(50); - SequenceOntology so = SequenceOntology.getInstance(); - - int mappedDnaLength = 0; - - /* - * Map CDS columns of dna to peptide. No need to worry about reverse strand - * dna here since the retrieved sequence is as transcribed (reverse - * complement for reverse strand), i.e in the same sense as the peptide. - */ - for (SequenceFeature sf : sfs) + EnsemblXref xrefFetcher = new EnsemblXref(getDomain()); + List xrefs = xrefFetcher.getCrossReferences(seq.getName()); + for (DBRefEntry xref : xrefs) { + seq.addDBRef(xref); /* - * process a CDS feature (or a sub-type of CDS) + * Save any Uniprot xref to be the reference for SIFTS mapping */ - if (so.isA(sf.getType(), SequenceOntology.CDS)) + if (DBRefSource.UNIPROT.equals(xref.getSource())) { - ranges.add(new int[] { sf.getBegin(), sf.getEnd() }); - mappedDnaLength += Math.abs(sf.getEnd() - sf.getBegin()) + 1; + seq.setSourceDBRef(xref); } } - int proteinLength = proteinSeq.getLength(); - List proteinRange = new ArrayList(); - proteinRange.add(new int[] { 1, proteinLength }); /* - * dna length should map to protein (or protein minus stop codon) + * and add a reference to itself */ - if (mappedDnaLength == 3 * proteinLength - || mappedDnaLength == 3 * (proteinLength + 1)) - { - return new MapList(ranges, proteinRange, 3, 1); - } - return null; + DBRefEntry self = new DBRefEntry(getDbSource(), + getEnsemblDataVersion(), seq.getName()); + seq.addDBRef(self); } /** @@ -336,6 +364,15 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient "Only retrieved %d sequences for %d query strings", fr .getSeqs().size(), ids.size())); } + + if (fr.getSeqs().size() == 1 && fr.getSeqs().get(0).getLength() == 0) + { + /* + * POST request has returned an empty FASTA file e.g. for invalid id + */ + throw new IOException("No data returned for " + ids); + } + if (fr.getSeqs().size() > 0) { AlignmentI seqal = new Alignment( @@ -350,7 +387,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient if (ids.contains(name) || ids.contains(name.replace("ENSP", "ENST"))) { - DBRefUtils.parseToDbRef(sq, DBRefSource.ENSEMBL, "0", name); + DBRefUtils.parseToDbRef(sq, getDbSource(), + getEnsemblDataVersion(), name); } } if (alignment == null) @@ -374,10 +412,16 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient @Override protected URL getUrl(List ids) throws MalformedURLException { - // ids are not used - they go in the POST body instead + /* + * a single id is included in the URL path + * multiple ids go in the POST body instead + */ StringBuffer urlstring = new StringBuffer(128); - urlstring.append(SEQUENCE_ID_URL); - + urlstring.append(getDomain() + "/sequence/id"); + if (ids.size() == 1) + { + urlstring.append("/").append(ids.get(0)); + } // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats urlstring.append("?type=").append(getSourceEnsemblType().getType()); urlstring.append(("&Accept=text/x-fasta")); @@ -398,19 +442,19 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } @Override - public boolean useGetRequest() + protected boolean useGetRequest() { return false; } @Override - public String getRequestMimeType() + protected String getRequestMimeType(boolean multipleIds) { - return "application/json"; + return multipleIds ? "application/json" : "text/x-fasta"; } @Override - public String getResponseMimeType() + protected String getResponseMimeType() { return "text/x-fasta"; } @@ -441,14 +485,23 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * backwards (for negative strand). Aborts and returns null if both positive * and negative strand are found (this should not normally happen). * - * @param sfs + * @param sourceSequence * @param accId + * @param start + * the start position of the sequence we are mapping to * @return */ - protected MapList getGenomicRanges(SequenceFeature[] sfs, String accId) + protected MapList getGenomicRangesFromFeatures(SequenceI sourceSequence, + String accId, int start) { + SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); + if (sfs == null) + { + return null; + } + /* - * generously size for initial number of cds regions + * generously initial size for number of cds regions * (worst case titin Q8WZ42 has c. 313 exons) */ List regions = new ArrayList(100); @@ -464,11 +517,12 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ if (identifiesSequence(sf, accId)) { - int strand = sf.getStrand(); - - if (directionSet && strand != direction) - { - // abort - mix of forward and backward + int strand = sf.getStrand(); + strand = strand == 0 ? 1 : strand; // treat unknown as forward + + if (directionSet && strand != direction) + { + // abort - mix of forward and backward System.err.println("Error: forward and backward strand for " + accId); return null; @@ -485,27 +539,54 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } else { - regions.add(new int[] { sf.getBegin(), sf.getEnd() }); - } - mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1); + regions.add(new int[] { sf.getBegin(), sf.getEnd() }); } + mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1); + + if (!isSpliceable()) + { + /* + * 'gene' sequence is contiguous so we can stop as soon as its + * identifying feature has been found + */ + break; + } + } } + if (regions.isEmpty()) + { + System.out.println("Failed to identify target sequence for " + accId + + " from genomic features"); + return null; + } + /* * a final sort is needed since Ensembl returns CDS sorted within source * (havana / ensembl_havana) */ Collections.sort(regions, new RangeSorter(direction == 1)); - List to = new ArrayList(); - to.add(new int[] { 1, mappedLength }); + List to = Arrays.asList(new int[] { start, + start + mappedLength - 1 }); return new MapList(regions, to, 1, 1); } /** - * Returns true if the sequence feature identifies positions of the genomic - * sequence feature which are within the sequence being retrieved. + * Answers true if the sequence being retrieved may occupy discontiguous + * regions on the genomic sequence. + */ + protected boolean isSpliceable() + { + return true; + } + + /** + * Returns true if the sequence feature marks positions of the genomic + * sequence feature which are within the sequence being retrieved. For + * example, an 'exon' feature whose parent is the target transcript marks the + * cdna positions of the transcript. * * @param sf * @param accId @@ -515,53 +596,109 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient String accId); /** - * Transfers the sequence feature to the target sequence, adjusting its start - * and end range based on the 'overlap' ranges. Features which do not overlap - * the target sequence are ignored, as are features with a parent other than - * the target sequence id. + * Transfers the sequence feature to the target sequence, locating its start + * and end range based on the mapping. Features which do not overlap the + * target sequence are ignored. * * @param sf * @param targetSequence - * @param overlap + * @param mapping + * mapping from the sequence feature's coordinates to the target + * sequence + * @param forwardStrand */ protected void transferFeature(SequenceFeature sf, - SequenceI targetSequence, MapList overlap) + SequenceI targetSequence, MapList mapping, boolean forwardStrand) { - String parent = (String) sf.getValue(PARENT); - if (parent != null && !parent.contains(targetSequence.getName())) - { - // this genomic feature belongs to a different transcript - return; - } - int start = sf.getBegin(); int end = sf.getEnd(); - int[] mappedRange = overlap.locateInTo(start, end); + int[] mappedRange = mapping.locateInTo(start, end); if (mappedRange != null) { SequenceFeature copy = new SequenceFeature(sf); - int offset = targetSequence.getStart() - 1; - copy.setBegin(offset + Math.min(mappedRange[0], mappedRange[1])); - copy.setEnd(offset + Math.max(mappedRange[0], mappedRange[1])); + copy.setBegin(Math.min(mappedRange[0], mappedRange[1])); + copy.setEnd(Math.max(mappedRange[0], mappedRange[1])); targetSequence.addSequenceFeature(copy); /* - * for sequence_variant, make an additional feature with consequence + * for sequence_variant on reverse strand, have to convert the allele + * values to their complements */ - if (SequenceOntology.getInstance().isSequenceVariant(sf.getType())) + if (!forwardStrand + && SequenceOntologyFactory.getInstance().isA(sf.getType(), + SequenceOntologyI.SEQUENCE_VARIANT)) { - String consequence = (String) sf.getValue("consequence_type"); - if (consequence != null) - { - SequenceFeature sf2 = new SequenceFeature("consequence", - consequence, copy.getBegin(), copy.getEnd(), 0f, - null); - targetSequence.addSequenceFeature(sf2); - } + reverseComplementAlleles(copy); + } + } + } + + /** + * Change the 'alleles' value of a feature by converting to complementary + * bases, and also update the feature description to match + * + * @param sf + */ + static void reverseComplementAlleles(SequenceFeature sf) + { + final String alleles = (String) sf.getValue(ALLELES); + if (alleles == null) + { + return; + } + StringBuilder complement = new StringBuilder(alleles.length()); + for (String allele : alleles.split(",")) + { + reverseComplementAllele(complement, allele); + } + String comp = complement.toString(); + sf.setValue(ALLELES, comp); + sf.setDescription(comp); + + /* + * replace value of "alleles=" in sf.ATTRIBUTES as well + * so 'output as GFF' shows reverse complement alleles + */ + String atts = sf.getAttributes(); + if (atts != null) + { + atts = atts.replace(ALLELES + "=" + alleles, ALLELES + "=" + comp); + sf.setAttributes(atts); + } + } + + /** + * Makes the 'reverse complement' of the given allele and appends it to the + * buffer, after a comma separator if not the first + * + * @param complement + * @param allele + */ + static void reverseComplementAllele(StringBuilder complement, + String allele) + { + if (complement.length() > 0) + { + complement.append(","); + } + + /* + * some 'alleles' are actually descriptive terms + * e.g. HGMD_MUTATION, PhenCode_variation + * - we don't want to 'reverse complement' these + */ + if (!Comparison.isNucleotideSequence(allele, true)) + { + complement.append(allele); + } + else + { + for (int i = allele.length() - 1; i >= 0; i--) + { + complement.append(Dna.getComplement(allele.charAt(i))); } } - } /** @@ -570,51 +707,182 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * @param accessionId * @param sourceSequence * @param targetSequence + * @return true if any features were transferred, else false */ - protected void transferFeatures(String accessionId, + protected boolean transferFeatures(String accessionId, SequenceI sourceSequence, SequenceI targetSequence) { if (sourceSequence == null || targetSequence == null) { - return; + return false; } + // long start = System.currentTimeMillis(); SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); - MapList overlap = getGenomicRanges(sfs, accessionId); + MapList mapping = getGenomicRangesFromFeatures(sourceSequence, accessionId, + targetSequence.getStart()); + if (mapping == null) + { + return false; + } - final boolean forwardStrand = overlap.isFromForwardStrand(); + boolean result = transferFeatures(sfs, targetSequence, mapping, + accessionId); + // System.out.println("transferFeatures (" + (sfs.length) + " --> " + // + targetSequence.getSequenceFeatures().length + ") to " + // + targetSequence.getName() + // + " took " + (System.currentTimeMillis() - start) + "ms"); + return result; + } + + /** + * Transfer features to the target sequence. The start/end positions are + * converted using the mapping. Features which do not overlap are ignored. + * Features whose parent is not the specified identifier are also ignored. + * + * @param features + * @param targetSequence + * @param mapping + * @param parentId + * @return + */ + protected boolean transferFeatures(SequenceFeature[] features, + SequenceI targetSequence, MapList mapping, String parentId) + { + final boolean forwardStrand = mapping.isFromForwardStrand(); /* - * sort features by start position (descending if reverse strand) - * before transferring (in forwards order) to the target sequence + * sort features by start position (which corresponds to end + * position descending if reverse strand) so as to add them in + * 'forwards' order to the target sequence */ - Arrays.sort(sfs, new Comparator() + sortFeatures(features, forwardStrand); + + boolean transferred = false; + for (SequenceFeature sf : features) + { + if (retainFeature(sf, parentId)) + { + transferFeature(sf, targetSequence, mapping, forwardStrand); + transferred = true; + } + } + return transferred; + } + + /** + * Sort features by start position ascending (if on forward strand), or end + * position descending (if on reverse strand) + * + * @param features + * @param forwardStrand + */ + protected static void sortFeatures(SequenceFeature[] features, + final boolean forwardStrand) + { + Arrays.sort(features, new Comparator() { @Override public int compare(SequenceFeature o1, SequenceFeature o2) { - int c = Integer.compare(o1.getBegin(), o2.getBegin()); - return forwardStrand ? c : -c; + if (forwardStrand) + { + return Integer.compare(o1.getBegin(), o2.getBegin()); + } + else + { + return Integer.compare(o2.getEnd(), o1.getEnd()); + } } }); + } - for (SequenceFeature sf : sfs) + /** + * Answers true if the feature type is one we want to keep for the sequence. + * Some features are only retrieved in order to identify the sequence range, + * and may then be discarded as redundant information (e.g. "CDS" feature for + * a CDS sequence). + */ + @SuppressWarnings("unused") + protected boolean retainFeature(SequenceFeature sf, String accessionId) + { + return true; // override as required + } + + /** + * Answers true if the feature has a Parent which refers to the given + * accession id, or if the feature has no parent. Answers false if the + * feature's Parent is for a different accession id. + * + * @param sf + * @param identifier + * @return + */ + protected boolean featureMayBelong(SequenceFeature sf, String identifier) + { + String parent = (String) sf.getValue(PARENT); + // using contains to allow for prefix "gene:", "transcript:" etc + if (parent != null && !parent.contains(identifier)) { - if (retainFeature(sf.getType())) - { - transferFeature(sf, targetSequence, overlap); - } + // this genomic feature belongs to a different transcript + return false; } + return true; + } + + @Override + public String getDescription() + { + return "Ensembl " + getSourceEnsemblType().getType() + + " sequence with variant features"; } /** - * Answers true if the feature type is one to attach to the retrieved sequence + * Returns a (possibly empty) list of features on the sequence which have the + * specified sequence ontology type (or a sub-type of it), and the given + * identifier as parent * + * @param sequence * @param type + * @param parentId + * @return + */ + protected List findFeatures(SequenceI sequence, + String type, String parentId) + { + List result = new ArrayList(); + + SequenceFeature[] sfs = sequence.getSequenceFeatures(); + if (sfs != null) { + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); + for (SequenceFeature sf :sfs) { + if (so.isA(sf.getType(), type)) + { + String parent = (String) sf.getValue(PARENT); + if (parent.equals(parentId)) + { + result.add(sf); + } + } + } + } + return result; + } + + /** + * Answers true if the feature type is either 'NMD_transcript_variant' or + * 'transcript' or one of its sub-types in the Sequence Ontology. This is + * needed because NMD_transcript_variant behaves like 'transcript' in Ensembl + * although strictly speaking it is not (it is a sub-type of + * sequence_variant). + * + * @param featureType * @return */ - protected boolean retainFeature(@SuppressWarnings("unused") String type) + public static boolean isTranscript(String featureType) { - return true; // default is to keep all + return SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(featureType) + || SequenceOntologyFactory.getInstance().isA(featureType, + SequenceOntologyI.TRANSCRIPT); } }