X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fext%2Fensembl%2FEnsemblSeqProxy.java;h=4af6525c7a3bdb97475f1766f63986206d12e57f;hb=a064561d8665ee9db217b17cda826fceac90cbbc;hp=a6d838bf9adae2927c4f977006116948fd6a7cb9;hpb=cb6d2306e75ecea509fb1fde9736ff593e8e5837;p=jalview.git diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index a6d838b..4af6525 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -1,5 +1,6 @@ package jalview.ext.ensembl; +import jalview.analysis.AlignmentUtils; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; @@ -10,7 +11,8 @@ import jalview.datamodel.SequenceI; import jalview.exceptions.JalviewException; import jalview.io.FastaFile; import jalview.io.FileParse; -import jalview.io.gff.SequenceOntology; +import jalview.io.gff.SequenceOntologyFactory; +import jalview.io.gff.SequenceOntologyI; import jalview.util.DBRefUtils; import jalview.util.MapList; @@ -26,43 +28,47 @@ import java.util.List; /** * Base class for Ensembl sequence fetchers * + * @see http://rest.ensembl.org/documentation/info/sequence_id * @author gmcarstairs */ public abstract class EnsemblSeqProxy extends EnsemblRestClient { + private static final List CROSS_REFERENCES = Arrays + .asList(new String[] { "CCDS", "Uniprot/SWISSPROT", + "Uniprot/SPTREMBL" }); + protected static final String CONSEQUENCE_TYPE = "consequence_type"; protected static final String PARENT = "Parent"; protected static final String ID = "ID"; - /* - * this needs special handling, as it isA sequence_variant in the - * Sequence Ontology, but behaves in Ensembl as if it isA transcript - */ - protected static final String NMD_VARIANT = "NMD_transcript_variant"; - protected static final String NAME = "Name"; + protected static final String DESCRIPTION = "description"; + + /* + * enum for 'type' parameter to the /sequence REST service + */ public enum EnsemblSeqType { /** - * type=genomic for the full dna including introns + * type=genomic to fetch full dna including introns */ GENOMIC("genomic"), /** - * type=cdna for transcribed dna including UTRs + * type=cdna to fetch dna including UTRs */ CDNA("cdna"), /** - * type=cds for coding dna excluding UTRs + * type=cds to fetch coding dna excluding UTRs */ CDS("cds"), /** - * type=protein for the peptide product sequence + * type=protein to fetch peptide product sequence */ PROTEIN("protein"); @@ -105,10 +111,19 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** - * Constructor + * Default constructor (to use rest.ensembl.org) */ public EnsemblSeqProxy() { + super(); + } + + /** + * Constructor given the target domain to fetch data from + */ + public EnsemblSeqProxy(String d) + { + super(d); } /** @@ -118,7 +133,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient @Override public AlignmentI getSequenceRecords(String query) throws Exception { - long now = System.currentTimeMillis(); // TODO use a String... query vararg instead? // danger: accession separator used as a regex here, a string elsewhere @@ -147,17 +161,15 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient + " chunks. Unexpected problem (" + r.getLocalizedMessage() + ")"; System.err.println(msg); - if (alignment != null) - { - break; // return what we got - } - else - { - throw new JalviewException(msg, r); - } + break; } } + if (alignment == null) + { + return null; + } + /* * fetch and transfer genomic sequence features, * fetch protein product and add as cross-reference @@ -167,9 +179,11 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient addFeaturesAndProduct(accId, alignment); } - inProgress = false; - System.out.println(getClass().getName() + " took " - + (System.currentTimeMillis() - now) + "ms to fetch"); + for (SequenceI seq : alignment.getSequences()) + { + getCrossReferences(seq); + } + return alignment; } @@ -195,7 +209,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * get 'dummy' genomic sequence with exon, cds and variation features */ SequenceI genomicSequence = null; - EnsemblOverlap gffFetcher = new EnsemblOverlap(); + EnsemblFeatures gffFetcher = new EnsemblFeatures(getDomain()); EnsemblFeatureType[] features = getFeaturesToFetch(); AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId, features); @@ -245,10 +259,11 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient String accId = querySeq.getName(); try { - AlignmentI protein = new EnsemblProtein().getSequenceRecords(accId); + AlignmentI protein = new EnsemblProtein(getDomain()) + .getSequenceRecords(accId); if (protein == null || protein.getHeight() == 0) { - System.out.println("Failed to retrieve protein for " + accId); + System.out.println("No protein product found for " + accId); return; } SequenceI proteinSeq = protein.getSequenceAt(0); @@ -259,13 +274,22 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient proteinSeq.createDatasetSequence(); querySeq.createDatasetSequence(); - MapList mapList = mapCdsToProtein(querySeq, proteinSeq); + MapList mapList = AlignmentUtils.mapCdsToProtein(querySeq, proteinSeq); if (mapList != null) { - Mapping map = new Mapping(proteinSeq.getDatasetSequence(), mapList); + // clunky: ensure Uniprot xref if we have one is on mapped sequence + SequenceI ds = proteinSeq.getDatasetSequence(); + ds.setSourceDBRef(proteinSeq.getSourceDBRef()); + Mapping map = new Mapping(ds, mapList); DBRefEntry dbr = new DBRefEntry(getDbSource(), getDbVersion(), accId, map); querySeq.getDatasetSequence().addDBRef(dbr); + + /* + * copy exon features to protein, compute peptide variants from dna + * variants and add as features on the protein sequence ta-da + */ + AlignmentUtils.computeProteinFeatures(querySeq, proteinSeq, mapList); } } catch (Exception e) { @@ -276,85 +300,43 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** - * Returns a mapping from dna to protein by inspecting sequence features of - * type "CDS" on the dna. + * Get database xrefs from Ensembl, and attach them to the sequence * - * @param dnaSeq - * @param proteinSeq - * @return + * @param seq */ - protected MapList mapCdsToProtein(SequenceI dnaSeq, SequenceI proteinSeq) + protected void getCrossReferences(SequenceI seq) { - SequenceFeature[] sfs = dnaSeq.getSequenceFeatures(); - if (sfs == null) + while (seq.getDatasetSequence() != null) { - return null; + seq = seq.getDatasetSequence(); } - List ranges = new ArrayList(50); - SequenceOntology so = SequenceOntology.getInstance(); - - int mappedDnaLength = 0; - - /* - * Map CDS columns of dna to peptide. No need to worry about reverse strand - * dna here since the retrieved sequence is as transcribed (reverse - * complement for reverse strand), i.e in the same sense as the peptide. - */ - boolean fivePrimeIncomplete = false; - for (SequenceFeature sf : sfs) + EnsemblXref xrefFetcher = new EnsemblXref(getDomain()); + List xrefs = xrefFetcher.getCrossReferences(seq.getName(), + getCrossReferenceDatabases()); + for (DBRefEntry xref : xrefs) { + seq.addDBRef(xref); /* - * process a CDS feature (or a sub-type of CDS) + * Save any Uniprot xref to be the reference for SIFTS mapping */ - if (so.isA(sf.getType(), SequenceOntology.CDS)) + if (DBRefSource.UNIPROT.equals(xref.getSource())) { - int phase = 0; - try { - phase = Integer.parseInt(sf.getPhase()); - } catch (NumberFormatException e) - { - // ignore - } - /* - * phase > 0 on first codon means 5' incomplete - skip to the start - * of the next codon; example ENST00000496384 - */ - int begin = sf.getBegin(); - int end = sf.getEnd(); - if (ranges.isEmpty() && phase > 0) - { - fivePrimeIncomplete = true; - begin += phase; - if (begin > end) - { - continue; // shouldn't happen? - } - } - ranges.add(new int[] { begin, end }); - mappedDnaLength += Math.abs(end - begin) + 1; + seq.setSourceDBRef(xref); } } - int proteinLength = proteinSeq.getLength(); - List proteinRange = new ArrayList(); - int proteinStart = 1; - if (fivePrimeIncomplete && proteinSeq.getCharAt(0) == 'X') - { - proteinStart = 2; - proteinLength--; - } - proteinRange.add(new int[] { proteinStart, proteinLength }); + } - /* - * dna length should map to protein (or protein plus stop codon) - */ - int codesForResidues = mappedDnaLength / 3; - if (codesForResidues == proteinLength - || codesForResidues == (proteinLength + 1)) - { - return new MapList(ranges, proteinRange, 3, 1); - } - return null; + /** + * Returns a list of database names to be used when fetching cross-references. + * Specifically, the names are used to filter data returned by the Ensembl + * xrefs REST service on the value in field 'dbname'. + * + * @return + */ + protected List getCrossReferenceDatabases() + { + return CROSS_REFERENCES; } /** @@ -441,7 +423,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * multiple ids go in the POST body instead */ StringBuffer urlstring = new StringBuffer(128); - urlstring.append(SEQUENCE_ID_URL); + urlstring.append(getDomain() + "/sequence/id"); if (ids.size() == 1) { urlstring.append("/").append(ids.get(0)); @@ -515,7 +497,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * the start position of the sequence we are mapping to * @return */ - protected MapList getGenomicRanges(SequenceI sourceSequence, + protected MapList getGenomicRangesFromFeatures(SequenceI sourceSequence, String accId, int start) { SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); @@ -541,11 +523,12 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ if (identifiesSequence(sf, accId)) { - int strand = sf.getStrand(); - - if (directionSet && strand != direction) - { - // abort - mix of forward and backward + int strand = sf.getStrand(); + strand = strand == 0 ? 1 : strand; // treat unknown as forward + + if (directionSet && strand != direction) + { + // abort - mix of forward and backward System.err.println("Error: forward and backward strand for " + accId); return null; @@ -590,8 +573,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ Collections.sort(regions, new RangeSorter(direction == 1)); - List to = new ArrayList(); - to.add(new int[] { start, start + mappedLength - 1 }); + List to = Arrays.asList(new int[] { start, + start + mappedLength - 1 }); return new MapList(regions, to, 1, 1); } @@ -646,17 +629,18 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient /* * for sequence_variant, make an additional feature with consequence */ - if (SequenceOntology.getInstance().isSequenceVariant(sf.getType())) - { - String consequence = (String) sf.getValue(CONSEQUENCE_TYPE); - if (consequence != null) - { - SequenceFeature sf2 = new SequenceFeature("consequence", - consequence, copy.getBegin(), copy.getEnd(), 0f, - null); - targetSequence.addSequenceFeature(sf2); - } - } + // if (SequenceOntologyFactory.getInstance().isA(sf.getType(), + // SequenceOntologyI.SEQUENCE_VARIANT)) + // { + // String consequence = (String) sf.getValue(CONSEQUENCE_TYPE); + // if (consequence != null) + // { + // SequenceFeature sf2 = new SequenceFeature("consequence", + // consequence, copy.getBegin(), copy.getEnd(), 0f, + // null); + // targetSequence.addSequenceFeature(sf2); + // } + // } } } @@ -676,15 +660,22 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient return false; } + // long start = System.currentTimeMillis(); SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); - MapList mapping = getGenomicRanges(sourceSequence, accessionId, + MapList mapping = getGenomicRangesFromFeatures(sourceSequence, accessionId, targetSequence.getStart()); if (mapping == null) { return false; } - return transferFeatures(sfs, targetSequence, mapping, accessionId); + boolean result = transferFeatures(sfs, targetSequence, mapping, + accessionId); + // System.out.println("transferFeatures (" + (sfs.length) + " --> " + // + targetSequence.getSequenceFeatures().length + ") to " + // + targetSequence.getName() + // + " took " + (System.currentTimeMillis() - start) + "ms"); + return result; } /** @@ -786,7 +777,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient SequenceFeature[] sfs = sequence.getSequenceFeatures(); if (sfs != null) { - SequenceOntology so = SequenceOntology.getInstance(); + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); for (SequenceFeature sf :sfs) { if (so.isA(sf.getType(), type)) { @@ -813,7 +804,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ public static boolean isTranscript(String featureType) { - return NMD_VARIANT.equals(featureType) - || SequenceOntology.getInstance().isA(featureType, SequenceOntology.TRANSCRIPT); + return SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(featureType) + || SequenceOntologyFactory.getInstance().isA(featureType, + SequenceOntologyI.TRANSCRIPT); } }