X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fext%2Fensembl%2FEnsemblSeqProxy.java;h=31552afcc72e4e4a7beb21c791d4d95088a9d12b;hb=08c202dd3ef0f34bde305c11da9eec03520174dc;hp=869a7028f1d2c8f5da9462a36877ad202359d1de;hpb=1e8c7a9ab9f5da589d0aa2482fd2e3361c320d57;p=jalview.git diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index 869a702..31552af 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -1,10 +1,10 @@ package jalview.ext.ensembl; import jalview.analysis.AlignmentUtils; +import jalview.analysis.Dna; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; -import jalview.datamodel.DBRefSource; import jalview.datamodel.Mapping; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; @@ -13,11 +13,9 @@ import jalview.io.FastaFile; import jalview.io.FileParse; import jalview.io.gff.SequenceOntologyFactory; import jalview.io.gff.SequenceOntologyI; -import jalview.schemes.ResidueProperties; +import jalview.util.Comparison; import jalview.util.DBRefUtils; import jalview.util.MapList; -import jalview.util.MappingUtils; -import jalview.util.StringUtils; import java.io.IOException; import java.net.MalformedURLException; @@ -26,21 +24,17 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; -import java.util.LinkedHashMap; import java.util.List; -import java.util.Map.Entry; /** * Base class for Ensembl sequence fetchers * + * @see http://rest.ensembl.org/documentation/info/sequence_id * @author gmcarstairs */ public abstract class EnsemblSeqProxy extends EnsemblRestClient { - private static final List CROSS_REFERENCES = Arrays - .asList(new String[] { "CCDS", "Uniprot/SWISSPROT" }); - - protected static final String CONSEQUENCE_TYPE = "consequence_type"; + private static final String ALLELES = "alleles"; protected static final String PARENT = "Parent"; @@ -48,6 +42,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient protected static final String NAME = "Name"; + protected static final String DESCRIPTION = "description"; + /* * enum for 'type' parameter to the /sequence REST service */ @@ -59,7 +55,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient GENOMIC("genomic"), /** - * type=cdna to fetch dna including UTRs + * type=cdna to fetch coding dna including UTRs */ CDNA("cdna"), @@ -112,10 +108,19 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** - * Constructor + * Default constructor (to use rest.ensembl.org) */ public EnsemblSeqProxy() { + super(); + } + + /** + * Constructor given the target domain to fetch data from + */ + public EnsemblSeqProxy(String d) + { + super(d); } /** @@ -201,7 +206,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * get 'dummy' genomic sequence with exon, cds and variation features */ SequenceI genomicSequence = null; - EnsemblFeatures gffFetcher = new EnsemblFeatures(); + EnsemblFeatures gffFetcher = new EnsemblFeatures(getDomain()); EnsemblFeatureType[] features = getFeaturesToFetch(); AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId, features); @@ -251,10 +256,11 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient String accId = querySeq.getName(); try { - AlignmentI protein = new EnsemblProtein().getSequenceRecords(accId); + AlignmentI protein = new EnsemblProtein(getDomain()) + .getSequenceRecords(accId); if (protein == null || protein.getHeight() == 0) { - System.out.println("Failed to retrieve protein for " + accId); + System.out.println("No protein product found for " + accId); return; } SequenceI proteinSeq = protein.getSequenceAt(0); @@ -265,22 +271,23 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient proteinSeq.createDatasetSequence(); querySeq.createDatasetSequence(); - MapList mapList = mapCdsToProtein(querySeq, proteinSeq); + MapList mapList = AlignmentUtils.mapCdsToProtein(querySeq, proteinSeq); if (mapList != null) { // clunky: ensure Uniprot xref if we have one is on mapped sequence SequenceI ds = proteinSeq.getDatasetSequence(); ds.setSourceDBRef(proteinSeq.getSourceDBRef()); + Mapping map = new Mapping(ds, mapList); - DBRefEntry dbr = new DBRefEntry(getDbSource(), getDbVersion(), - accId, map); + DBRefEntry dbr = new DBRefEntry(getDbSource(), + getEnsemblDataVersion(), proteinSeq.getName(), map); querySeq.getDatasetSequence().addDBRef(dbr); /* - * compute peptide variants from dna variants and add as - * sequence features on the protein sequence ta-da + * copy exon features to protein, compute peptide variants from dna + * variants and add as features on the protein sequence ta-da */ - computeProteinFeatures(querySeq, proteinSeq, mapList); + AlignmentUtils.computeProteinFeatures(querySeq, proteinSeq, mapList); } } catch (Exception e) { @@ -302,132 +309,20 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient seq = seq.getDatasetSequence(); } - EnsemblXref xrefFetcher = new EnsemblXref(); - List xrefs = xrefFetcher.getCrossReferences(seq.getName(), - getCrossReferenceDatabases()); + EnsemblXref xrefFetcher = new EnsemblXref(getDomain()); + List xrefs = xrefFetcher.getCrossReferences(seq.getName()); for (DBRefEntry xref : xrefs) { seq.addDBRef(xref); - /* - * Save any Uniprot xref to be the reference for SIFTS mapping - */ - if (DBRefSource.UNIPROT.equals(xref.getSource())) - { - seq.setSourceDBRef(xref); - } - } - } - - /** - * Returns a list of database names to be used when fetching cross-references. - * - * @return - */ - protected List getCrossReferenceDatabases() - { - return CROSS_REFERENCES; - } - - /** - * Returns a mapping from dna to protein by inspecting sequence features of - * type "CDS" on the dna. - * - * @param dnaSeq - * @param proteinSeq - * @return - */ - protected MapList mapCdsToProtein(SequenceI dnaSeq, SequenceI proteinSeq) - { - List ranges = new ArrayList(50); - - int mappedDnaLength = getCdsRanges(dnaSeq, ranges); - - int proteinLength = proteinSeq.getLength(); - int proteinEnd = proteinLength; - int proteinStart = 1; - - /* - * incomplete start codon may mean X at start of peptide - * we ignore both for mapping purposes - */ - if (proteinSeq.getCharAt(0) == 'X') - { - proteinStart = 2; - proteinLength--; } - List proteinRange = new ArrayList(); /* - * dna length should map to protein (or protein plus stop codon) + * and add a reference to itself */ - int codesForResidues = mappedDnaLength / 3; - if (codesForResidues == (proteinLength + 1)) - { - MappingUtils.unmapStopCodon(ranges, mappedDnaLength); - codesForResidues--; - } - if (codesForResidues == proteinLength) - { - proteinRange.add(new int[] { proteinStart, proteinEnd }); - return new MapList(ranges, proteinRange, 3, 1); - } - return null; - } - - /** - * Adds CDS ranges to the ranges list, and returns the total length mapped - * from. - * - * No need to worry about reverse strand dna, here since the retrieved - * sequence is as transcribed (reverse complement for reverse strand), i.e in - * the same sense as the peptide. - * - * @param dnaSeq - * @param ranges - * @return - */ - protected int getCdsRanges(SequenceI dnaSeq, List ranges) - { - SequenceFeature[] sfs = dnaSeq.getSequenceFeatures(); - if (sfs == null) - { - return 0; - } - SequenceOntologyI so = SequenceOntologyFactory.getInstance(); - int mappedDnaLength = 0; - for (SequenceFeature sf : sfs) - { - /* - * process a CDS feature (or a sub-type of CDS) - */ - if (so.isA(sf.getType(), SequenceOntologyI.CDS)) - { - int phase = 0; - try { - phase = Integer.parseInt(sf.getPhase()); - } catch (NumberFormatException e) - { - // ignore - } - /* - * phase > 0 on first codon means 5' incomplete - skip to the start - * of the next codon; example ENST00000496384 - */ - int begin = sf.getBegin(); - int end = sf.getEnd(); - if (ranges.isEmpty()) - { - begin += phase; - if (begin > end) - { - continue; // shouldn't happen? - } - } - ranges.add(new int[] { begin, end }); - mappedDnaLength += Math.abs(end - begin) + 1; - } - } - return mappedDnaLength; + DBRefEntry self = new DBRefEntry(getDbSource(), + getEnsemblDataVersion(), seq.getName()); + seq.addDBRef(self); + seq.setSourceDBRef(self); } /** @@ -485,7 +380,9 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient if (ids.contains(name) || ids.contains(name.replace("ENSP", "ENST"))) { - DBRefUtils.parseToDbRef(sq, DBRefSource.ENSEMBL, "0", name); + DBRefEntry dbref = DBRefUtils.parseToDbRef(sq, getDbSource(), + getEnsemblDataVersion(), name); + sq.setSourceDBRef(dbref); } } if (alignment == null) @@ -514,7 +411,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * multiple ids go in the POST body instead */ StringBuffer urlstring = new StringBuffer(128); - urlstring.append(SEQUENCE_ID_URL); + urlstring.append(getDomain() + "/sequence/id"); if (ids.size() == 1) { urlstring.append("/").append(ids.get(0)); @@ -702,9 +599,10 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * @param mapping * mapping from the sequence feature's coordinates to the target * sequence + * @param forwardStrand */ protected void transferFeature(SequenceFeature sf, - SequenceI targetSequence, MapList mapping) + SequenceI targetSequence, MapList mapping, boolean forwardStrand) { int start = sf.getBegin(); int end = sf.getEnd(); @@ -718,20 +616,82 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient targetSequence.addSequenceFeature(copy); /* - * for sequence_variant, make an additional feature with consequence + * for sequence_variant on reverse strand, have to convert the allele + * values to their complements */ - // if (SequenceOntologyFactory.getInstance().isA(sf.getType(), - // SequenceOntologyI.SEQUENCE_VARIANT)) - // { - // String consequence = (String) sf.getValue(CONSEQUENCE_TYPE); - // if (consequence != null) - // { - // SequenceFeature sf2 = new SequenceFeature("consequence", - // consequence, copy.getBegin(), copy.getEnd(), 0f, - // null); - // targetSequence.addSequenceFeature(sf2); - // } - // } + if (!forwardStrand + && SequenceOntologyFactory.getInstance().isA(sf.getType(), + SequenceOntologyI.SEQUENCE_VARIANT)) + { + reverseComplementAlleles(copy); + } + } + } + + /** + * Change the 'alleles' value of a feature by converting to complementary + * bases, and also update the feature description to match + * + * @param sf + */ + static void reverseComplementAlleles(SequenceFeature sf) + { + final String alleles = (String) sf.getValue(ALLELES); + if (alleles == null) + { + return; + } + StringBuilder complement = new StringBuilder(alleles.length()); + for (String allele : alleles.split(",")) + { + reverseComplementAllele(complement, allele); + } + String comp = complement.toString(); + sf.setValue(ALLELES, comp); + sf.setDescription(comp); + + /* + * replace value of "alleles=" in sf.ATTRIBUTES as well + * so 'output as GFF' shows reverse complement alleles + */ + String atts = sf.getAttributes(); + if (atts != null) + { + atts = atts.replace(ALLELES + "=" + alleles, ALLELES + "=" + comp); + sf.setAttributes(atts); + } + } + + /** + * Makes the 'reverse complement' of the given allele and appends it to the + * buffer, after a comma separator if not the first + * + * @param complement + * @param allele + */ + static void reverseComplementAllele(StringBuilder complement, + String allele) + { + if (complement.length() > 0) + { + complement.append(","); + } + + /* + * some 'alleles' are actually descriptive terms + * e.g. HGMD_MUTATION, PhenCode_variation + * - we don't want to 'reverse complement' these + */ + if (!Comparison.isNucleotideSequence(allele, true)) + { + complement.append(allele); + } + else + { + for (int i = allele.length() - 1; i >= 0; i--) + { + complement.append(Dna.getComplement(allele.charAt(i))); + } } } @@ -786,25 +746,18 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient final boolean forwardStrand = mapping.isFromForwardStrand(); /* - * sort features by start position (descending if reverse strand) - * before transferring (in forwards order) to the target sequence + * sort features by start position (which corresponds to end + * position descending if reverse strand) so as to add them in + * 'forwards' order to the target sequence */ - Arrays.sort(features, new Comparator() - { - @Override - public int compare(SequenceFeature o1, SequenceFeature o2) - { - int c = Integer.compare(o1.getBegin(), o2.getBegin()); - return forwardStrand ? c : -c; - } - }); + sortFeatures(features, forwardStrand); boolean transferred = false; for (SequenceFeature sf : features) { if (retainFeature(sf, parentId)) { - transferFeature(sf, targetSequence, mapping); + transferFeature(sf, targetSequence, mapping, forwardStrand); transferred = true; } } @@ -812,6 +765,33 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** + * Sort features by start position ascending (if on forward strand), or end + * position descending (if on reverse strand) + * + * @param features + * @param forwardStrand + */ + protected static void sortFeatures(SequenceFeature[] features, + final boolean forwardStrand) + { + Arrays.sort(features, new Comparator() + { + @Override + public int compare(SequenceFeature o1, SequenceFeature o2) + { + if (forwardStrand) + { + return Integer.compare(o1.getBegin(), o2.getBegin()); + } + else + { + return Integer.compare(o2.getEnd(), o1.getEnd()); + } + } + }); + } + + /** * Answers true if the feature type is one we want to keep for the sequence. * Some features are only retrieved in order to identify the sequence range, * and may then be discarded as redundant information (e.g. "CDS" feature for @@ -884,240 +864,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** - * Maps exon features from dna to protein, and computes variants in peptide - * product generated by variants in dna, and adds them as sequence_variant - * features on the protein sequence. Returns the number of variant features - * added. - * - * @param dnaSeq - * @param peptide - * @param dnaToProtein - */ - static int computeProteinFeatures(SequenceI dnaSeq, - SequenceI peptide, MapList dnaToProtein) - { - while (dnaSeq.getDatasetSequence() != null) - { - dnaSeq = dnaSeq.getDatasetSequence(); - } - while (peptide.getDatasetSequence() != null) - { - peptide = peptide.getDatasetSequence(); - } - - AlignmentUtils.transferFeatures(dnaSeq, peptide, dnaToProtein, - SequenceOntologyI.EXON); - - LinkedHashMap variants = buildDnaVariantsMap( - dnaSeq, dnaToProtein); - - /* - * scan codon variations, compute peptide variants and add to peptide sequence - */ - int count = 0; - for (Entry variant : variants.entrySet()) - { - int peptidePos = variant.getKey(); - String[][] codonVariants = variant.getValue(); - String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based - List peptideVariants = computePeptideVariants(codonVariants, - residue); - if (!peptideVariants.isEmpty()) - { - String desc = StringUtils.listToDelimitedString(peptideVariants, - ", "); - SequenceFeature sf = new SequenceFeature( - SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos, - peptidePos, 0f, null); - peptide.addSequenceFeature(sf); - count++; - } - } - - /* - * ugly sort to get sequence features in start position order - * - would be better to store in Sequence as a TreeSet instead? - */ - Arrays.sort(peptide.getSequenceFeatures(), - new Comparator() - { - @Override - public int compare(SequenceFeature o1, SequenceFeature o2) - { - int c = Integer.compare(o1.getBegin(), o2.getBegin()); - return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd()) - : c; - } - }); - return count; - } - - /** - * Builds a map whose key is position in the protein sequence, and value is an - * array of all variants for the coding codon positions - * - * @param dnaSeq - * @param dnaToProtein - * @return - */ - static LinkedHashMap buildDnaVariantsMap( - SequenceI dnaSeq, MapList dnaToProtein) - { - /* - * map from peptide position to all variant features of the codon for it - * LinkedHashMap ensures we add the peptide features in sequence order - */ - LinkedHashMap variants = new LinkedHashMap(); - SequenceOntologyI so = SequenceOntologyFactory.getInstance(); - - SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures(); - if (dnaFeatures == null) - { - return variants; - } - - int dnaStart = dnaSeq.getStart(); - int[] lastCodon = null; - int lastPeptidePostion = 0; - - /* - * build a map of codon variations for peptides - */ - for (SequenceFeature sf : dnaFeatures) - { - int dnaCol = sf.getBegin(); - if (dnaCol != sf.getEnd()) - { - // not handling multi-locus variant features - continue; - } - if (so.isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT)) - { - int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol); - if (mapsTo == null) - { - // feature doesn't lie within coding region - continue; - } - int peptidePosition = mapsTo[0]; - String[][] codonVariants = variants.get(peptidePosition); - if (codonVariants == null) - { - codonVariants = new String[3][]; - variants.put(peptidePosition, codonVariants); - } - - /* - * extract dna variants to a string array - */ - String alls = (String) sf.getValue("alleles"); - if (alls == null) - { - continue; - } - String[] alleles = alls.split(","); - - /* - * get this peptides codon positions e.g. [3, 4, 5] or [4, 7, 10] - */ - int[] codon = peptidePosition == lastPeptidePostion ? lastCodon - : MappingUtils.flattenRanges(dnaToProtein.locateInFrom( - peptidePosition, peptidePosition)); - lastPeptidePostion = peptidePosition; - lastCodon = codon; - - /* - * save nucleotide (and this variant) for each codon position - */ - for (int codonPos = 0; codonPos < 3; codonPos++) - { - String nucleotide = String.valueOf(dnaSeq - .getCharAt(codon[codonPos] - dnaStart)); - if (codon[codonPos] == dnaCol) - { - /* - * record current dna base and its alleles - */ - String[] dnaVariants = new String[alleles.length + 1]; - dnaVariants[0] = nucleotide; - System.arraycopy(alleles, 0, dnaVariants, 1, alleles.length); - codonVariants[codonPos] = dnaVariants; - } - else if (codonVariants[codonPos] == null) - { - /* - * record current dna base only - * (at least until we find any variation and overwrite it) - */ - codonVariants[codonPos] = new String[] { nucleotide }; - } - } - } - } - return variants; - } - - /** - * Returns a sorted, non-redundant list of all peptide translations generated - * by the given dna variants, excluding the current residue value - * - * @param codonVariants - * an array of base values (acgtACGT) for codon positions 1, 2, 3 - * @param residue - * the current residue translation - * @return - */ - static List computePeptideVariants( - String[][] codonVariants, String residue) - { - List result = new ArrayList(); - for (String base1 : codonVariants[0]) - { - for (String base2 : codonVariants[1]) - { - for (String base3 : codonVariants[2]) - { - String codon = base1 + base2 + base3; - // TODO: report frameshift/insertion/deletion - // and multiple-base variants?! - String peptide = codon.contains("-") ? "-" : ResidueProperties - .codonTranslate(codon); - if (peptide != null && !result.contains(peptide) - && !peptide.equalsIgnoreCase(residue)) - { - result.add(peptide); - } - } - } - } - - /* - * sort alphabetically with STOP at the end - */ - Collections.sort(result, new Comparator() - { - - @Override - public int compare(String o1, String o2) - { - if ("STOP".equals(o1)) - { - return 1; - } - else if ("STOP".equals(o2)) - { - return -1; - } - else - { - return o1.compareTo(o2); - } - } - }); - return result; - } - - /** * Answers true if the feature type is either 'NMD_transcript_variant' or * 'transcript' or one of its sub-types in the Sequence Ontology. This is * needed because NMD_transcript_variant behaves like 'transcript' in Ensembl