X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fext%2Fensembl%2FEnsemblSeqProxy.java;h=e77051d493bf6d676de3b8d03ed936bc958f07a5;hb=46b2bc114452c0e30463214b3e82eb0a5c98d23f;hp=a6d838bf9adae2927c4f977006116948fd6a7cb9;hpb=cb6d2306e75ecea509fb1fde9736ff593e8e5837;p=jalview.git diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index a6d838b..e77051d 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -1,5 +1,6 @@ package jalview.ext.ensembl; +import jalview.analysis.AlignmentUtils; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; @@ -10,9 +11,13 @@ import jalview.datamodel.SequenceI; import jalview.exceptions.JalviewException; import jalview.io.FastaFile; import jalview.io.FileParse; -import jalview.io.gff.SequenceOntology; +import jalview.io.gff.SequenceOntologyFactory; +import jalview.io.gff.SequenceOntologyI; +import jalview.schemes.ResidueProperties; import jalview.util.DBRefUtils; import jalview.util.MapList; +import jalview.util.MappingUtils; +import jalview.util.StringUtils; import java.io.IOException; import java.net.MalformedURLException; @@ -21,7 +26,9 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map.Entry; /** * Base class for Ensembl sequence fetchers @@ -30,39 +37,39 @@ import java.util.List; */ public abstract class EnsemblSeqProxy extends EnsemblRestClient { + private static final List CROSS_REFERENCES = Arrays + .asList(new String[] { "CCDS" }); + protected static final String CONSEQUENCE_TYPE = "consequence_type"; protected static final String PARENT = "Parent"; protected static final String ID = "ID"; - /* - * this needs special handling, as it isA sequence_variant in the - * Sequence Ontology, but behaves in Ensembl as if it isA transcript - */ - protected static final String NMD_VARIANT = "NMD_transcript_variant"; - protected static final String NAME = "Name"; + /* + * enum for 'type' parameter to the /sequence REST service + */ public enum EnsemblSeqType { /** - * type=genomic for the full dna including introns + * type=genomic to fetch full dna including introns */ GENOMIC("genomic"), /** - * type=cdna for transcribed dna including UTRs + * type=cdna to fetch dna including UTRs */ CDNA("cdna"), /** - * type=cds for coding dna excluding UTRs + * type=cds to fetch coding dna excluding UTRs */ CDS("cds"), /** - * type=protein for the peptide product sequence + * type=protein to fetch peptide product sequence */ PROTEIN("protein"); @@ -118,7 +125,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient @Override public AlignmentI getSequenceRecords(String query) throws Exception { - long now = System.currentTimeMillis(); // TODO use a String... query vararg instead? // danger: accession separator used as a regex here, a string elsewhere @@ -147,14 +153,15 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient + " chunks. Unexpected problem (" + r.getLocalizedMessage() + ")"; System.err.println(msg); - if (alignment != null) - { - break; // return what we got - } - else - { - throw new JalviewException(msg, r); - } + break; + // if (alignment != null) + // { + // break; // return what we got + // } + // else + // { + // throw new JalviewException(msg, r); + // } } } @@ -167,9 +174,11 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient addFeaturesAndProduct(accId, alignment); } - inProgress = false; - System.out.println(getClass().getName() + " took " - + (System.currentTimeMillis() - now) + "ms to fetch"); + for (SequenceI seq : alignment.getSequences()) + { + getCrossReferences(seq); + } + return alignment; } @@ -195,7 +204,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * get 'dummy' genomic sequence with exon, cds and variation features */ SequenceI genomicSequence = null; - EnsemblOverlap gffFetcher = new EnsemblOverlap(); + EnsemblFeatures gffFetcher = new EnsemblFeatures(); EnsemblFeatureType[] features = getFeaturesToFetch(); AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId, features); @@ -262,10 +271,19 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient MapList mapList = mapCdsToProtein(querySeq, proteinSeq); if (mapList != null) { - Mapping map = new Mapping(proteinSeq.getDatasetSequence(), mapList); + // clunky: ensure Uniprot xref if we have one is on mapped sequence + SequenceI ds = proteinSeq.getDatasetSequence(); + ds.setSourceDBRef(proteinSeq.getSourceDBRef()); + Mapping map = new Mapping(ds, mapList); DBRefEntry dbr = new DBRefEntry(getDbSource(), getDbVersion(), accId, map); querySeq.getDatasetSequence().addDBRef(dbr); + + /* + * compute peptide variants from dna variants and add as + * sequence features on the protein sequence ta-da + */ + computeProteinFeatures(querySeq, proteinSeq, mapList); } } catch (Exception e) { @@ -276,6 +294,45 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** + * Get Uniprot and PDB xrefs from Ensembl, and attach them to the protein + * sequence + * + * @param seq + */ + protected void getCrossReferences(SequenceI seq) + { + while (seq.getDatasetSequence() != null) + { + seq = seq.getDatasetSequence(); + } + + EnsemblXref xrefFetcher = new EnsemblXref(); + List xrefs = xrefFetcher.getCrossReferences(seq.getName(), + getCrossReferenceDatabases()); + for (DBRefEntry xref : xrefs) + { + seq.addDBRef(xref); + /* + * Save any Uniprot xref to be the reference for SIFTS mapping + */ + if (DBRefSource.UNIPROT.equals(xref.getSource())) + { + seq.setSourceDBRef(xref); + } + } + } + + /** + * Returns a list of database names to be used when fetching cross-references. + * + * @return + */ + protected List getCrossReferenceDatabases() + { + return CROSS_REFERENCES; + } + + /** * Returns a mapping from dna to protein by inspecting sequence features of * type "CDS" on the dna. * @@ -285,29 +342,64 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ protected MapList mapCdsToProtein(SequenceI dnaSeq, SequenceI proteinSeq) { - SequenceFeature[] sfs = dnaSeq.getSequenceFeatures(); - if (sfs == null) + List ranges = new ArrayList(50); + + int mappedDnaLength = getCdsRanges(dnaSeq, ranges); + + int proteinLength = proteinSeq.getLength(); + List proteinRange = new ArrayList(); + int proteinStart = 1; + + /* + * incomplete start codon may mean X at start of peptide + * we ignore both for mapping purposes + */ + if (proteinSeq.getCharAt(0) == 'X') { - return null; + proteinStart = 2; + proteinLength--; } + proteinRange.add(new int[] { proteinStart, proteinLength }); - List ranges = new ArrayList(50); - SequenceOntology so = SequenceOntology.getInstance(); - - int mappedDnaLength = 0; - /* - * Map CDS columns of dna to peptide. No need to worry about reverse strand - * dna here since the retrieved sequence is as transcribed (reverse - * complement for reverse strand), i.e in the same sense as the peptide. + * dna length should map to protein (or protein plus stop codon) */ - boolean fivePrimeIncomplete = false; + int codesForResidues = mappedDnaLength / 3; + if (codesForResidues == proteinLength + || codesForResidues == (proteinLength + 1)) + { + return new MapList(ranges, proteinRange, 3, 1); + } + return null; + } + + /** + * Adds CDS ranges to the ranges list, and returns the total length mapped + * from. + * + * No need to worry about reverse strand dna, here since the retrieved + * sequence is as transcribed (reverse complement for reverse strand), i.e in + * the same sense as the peptide. + * + * @param dnaSeq + * @param ranges + * @return + */ + protected int getCdsRanges(SequenceI dnaSeq, List ranges) + { + SequenceFeature[] sfs = dnaSeq.getSequenceFeatures(); + if (sfs == null) + { + return 0; + } + int mappedDnaLength = 0; for (SequenceFeature sf : sfs) { /* * process a CDS feature (or a sub-type of CDS) */ - if (so.isA(sf.getType(), SequenceOntology.CDS)) + if (SequenceOntologyFactory.getInstance().isA(sf.getType(), + SequenceOntologyI.CDS)) { int phase = 0; try { @@ -324,7 +416,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient int end = sf.getEnd(); if (ranges.isEmpty() && phase > 0) { - fivePrimeIncomplete = true; begin += phase; if (begin > end) { @@ -335,26 +426,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient mappedDnaLength += Math.abs(end - begin) + 1; } } - int proteinLength = proteinSeq.getLength(); - List proteinRange = new ArrayList(); - int proteinStart = 1; - if (fivePrimeIncomplete && proteinSeq.getCharAt(0) == 'X') - { - proteinStart = 2; - proteinLength--; - } - proteinRange.add(new int[] { proteinStart, proteinLength }); - - /* - * dna length should map to protein (or protein plus stop codon) - */ - int codesForResidues = mappedDnaLength / 3; - if (codesForResidues == proteinLength - || codesForResidues == (proteinLength + 1)) - { - return new MapList(ranges, proteinRange, 3, 1); - } - return null; + return mappedDnaLength; } /** @@ -515,7 +587,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * the start position of the sequence we are mapping to * @return */ - protected MapList getGenomicRanges(SequenceI sourceSequence, + protected MapList getGenomicRangesFromFeatures(SequenceI sourceSequence, String accId, int start) { SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); @@ -541,11 +613,12 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ if (identifiesSequence(sf, accId)) { - int strand = sf.getStrand(); - - if (directionSet && strand != direction) - { - // abort - mix of forward and backward + int strand = sf.getStrand(); + strand = strand == 0 ? 1 : strand; // treat unknown as forward + + if (directionSet && strand != direction) + { + // abort - mix of forward and backward System.err.println("Error: forward and backward strand for " + accId); return null; @@ -590,8 +663,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ Collections.sort(regions, new RangeSorter(direction == 1)); - List to = new ArrayList(); - to.add(new int[] { start, start + mappedLength - 1 }); + List to = Arrays.asList(new int[] { start, + start + mappedLength - 1 }); return new MapList(regions, to, 1, 1); } @@ -646,7 +719,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient /* * for sequence_variant, make an additional feature with consequence */ - if (SequenceOntology.getInstance().isSequenceVariant(sf.getType())) + if (SequenceOntologyFactory.getInstance().isA(sf.getType(), + SequenceOntologyI.SEQUENCE_VARIANT)) { String consequence = (String) sf.getValue(CONSEQUENCE_TYPE); if (consequence != null) @@ -677,7 +751,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); - MapList mapping = getGenomicRanges(sourceSequence, accessionId, + MapList mapping = getGenomicRangesFromFeatures(sourceSequence, accessionId, targetSequence.getStart()); if (mapping == null) { @@ -786,7 +860,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient SequenceFeature[] sfs = sequence.getSequenceFeatures(); if (sfs != null) { - SequenceOntology so = SequenceOntology.getInstance(); + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); for (SequenceFeature sf :sfs) { if (so.isA(sf.getType(), type)) { @@ -802,6 +876,240 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** + * Maps exon features from dna to protein, and computes variants in peptide + * product generated by variants in dna, and adds them as sequence_variant + * features on the protein sequence. Returns the number of variant features + * added. + * + * @param dnaSeq + * @param peptide + * @param dnaToProtein + */ + static int computeProteinFeatures(SequenceI dnaSeq, + SequenceI peptide, MapList dnaToProtein) + { + while (dnaSeq.getDatasetSequence() != null) + { + dnaSeq = dnaSeq.getDatasetSequence(); + } + while (peptide.getDatasetSequence() != null) + { + peptide = peptide.getDatasetSequence(); + } + + AlignmentUtils.transferFeatures(dnaSeq, peptide, dnaToProtein, + SequenceOntologyI.EXON); + + LinkedHashMap variants = buildDnaVariantsMap( + dnaSeq, dnaToProtein); + + /* + * scan codon variations, compute peptide variants and add to peptide sequence + */ + int count = 0; + for (Entry variant : variants.entrySet()) + { + int peptidePos = variant.getKey(); + String[][] codonVariants = variant.getValue(); + String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based + List peptideVariants = computePeptideVariants(codonVariants, + residue); + if (!peptideVariants.isEmpty()) + { + String desc = StringUtils.listToDelimitedString(peptideVariants, + ", "); + SequenceFeature sf = new SequenceFeature( + SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos, + peptidePos, 0f, null); + peptide.addSequenceFeature(sf); + count++; + } + } + + /* + * ugly sort to get sequence features in start position order + * - would be better to store in Sequence as a TreeSet instead? + */ + Arrays.sort(peptide.getSequenceFeatures(), + new Comparator() + { + @Override + public int compare(SequenceFeature o1, SequenceFeature o2) + { + int c = Integer.compare(o1.getBegin(), o2.getBegin()); + return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd()) + : c; + } + }); + return count; + } + + /** + * Builds a map whose key is position in the protein sequence, and value is an + * array of all variants for the coding codon positions + * + * @param dnaSeq + * @param dnaToProtein + * @return + */ + static LinkedHashMap buildDnaVariantsMap( + SequenceI dnaSeq, MapList dnaToProtein) + { + /* + * map from peptide position to all variant features of the codon for it + * LinkedHashMap ensures we add the peptide features in sequence order + */ + LinkedHashMap variants = new LinkedHashMap(); + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); + + SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures(); + if (dnaFeatures == null) + { + return variants; + } + + int dnaStart = dnaSeq.getStart(); + int[] lastCodon = null; + int lastPeptidePostion = 0; + + /* + * build a map of codon variations for peptides + */ + for (SequenceFeature sf : dnaFeatures) + { + int dnaCol = sf.getBegin(); + if (dnaCol != sf.getEnd()) + { + // not handling multi-locus variant features + continue; + } + if (so.isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT)) + { + int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol); + if (mapsTo == null) + { + // feature doesn't lie within coding region + continue; + } + int peptidePosition = mapsTo[0]; + String[][] codonVariants = variants.get(peptidePosition); + if (codonVariants == null) + { + codonVariants = new String[3][]; + variants.put(peptidePosition, codonVariants); + } + + /* + * extract dna variants to a string array + */ + String alls = (String) sf.getValue("alleles"); + if (alls == null) + { + continue; + } + String[] alleles = alls.split(","); + + /* + * get this peptides codon positions e.g. [3, 4, 5] or [4, 7, 10] + */ + int[] codon = peptidePosition == lastPeptidePostion ? lastCodon + : MappingUtils.flattenRanges(dnaToProtein.locateInFrom( + peptidePosition, peptidePosition)); + lastPeptidePostion = peptidePosition; + lastCodon = codon; + + /* + * save nucleotide (and this variant) for each codon position + */ + for (int codonPos = 0; codonPos < 3; codonPos++) + { + String nucleotide = String.valueOf(dnaSeq + .getCharAt(codon[codonPos] - dnaStart)); + if (codon[codonPos] == dnaCol) + { + /* + * record current dna base and its alleles + */ + String[] dnaVariants = new String[alleles.length + 1]; + dnaVariants[0] = nucleotide; + System.arraycopy(alleles, 0, dnaVariants, 1, alleles.length); + codonVariants[codonPos] = dnaVariants; + } + else if (codonVariants[codonPos] == null) + { + /* + * record current dna base only + * (at least until we find any variation and overwrite it) + */ + codonVariants[codonPos] = new String[] { nucleotide }; + } + } + } + } + return variants; + } + + /** + * Returns a sorted, non-redundant list of all peptide translations generated + * by the given dna variants, excluding the current residue value + * + * @param codonVariants + * an array of base values (acgtACGT) for codon positions 1, 2, 3 + * @param residue + * the current residue translation + * @return + */ + static List computePeptideVariants( + String[][] codonVariants, String residue) + { + List result = new ArrayList(); + for (String base1 : codonVariants[0]) + { + for (String base2 : codonVariants[1]) + { + for (String base3 : codonVariants[2]) + { + String codon = base1 + base2 + base3; + // TODO: report frameshift/insertion/deletion + // and multiple-base variants?! + String peptide = codon.contains("-") ? "-" : ResidueProperties + .codonTranslate(codon); + if (peptide != null && !result.contains(peptide) + && !peptide.equalsIgnoreCase(residue)) + { + result.add(peptide); + } + } + } + } + + /* + * sort alphabetically with STOP at the end + */ + Collections.sort(result, new Comparator() + { + + @Override + public int compare(String o1, String o2) + { + if ("STOP".equals(o1)) + { + return 1; + } + else if ("STOP".equals(o2)) + { + return -1; + } + else + { + return o1.compareTo(o2); + } + } + }); + return result; + } + + /** * Answers true if the feature type is either 'NMD_transcript_variant' or * 'transcript' or one of its sub-types in the Sequence Ontology. This is * needed because NMD_transcript_variant behaves like 'transcript' in Ensembl @@ -813,7 +1121,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ public static boolean isTranscript(String featureType) { - return NMD_VARIANT.equals(featureType) - || SequenceOntology.getInstance().isA(featureType, SequenceOntology.TRANSCRIPT); + return SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(featureType) + || SequenceOntologyFactory.getInstance().isA(featureType, + SequenceOntologyI.TRANSCRIPT); } }