X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fext%2Fensembl%2FEnsemblSeqProxy.java;h=869a7028f1d2c8f5da9462a36877ad202359d1de;hb=1e8c7a9ab9f5da589d0aa2482fd2e3361c320d57;hp=e986ba832d7994e38bab3469f25a48262389ef26;hpb=b03ec66ae6238b44bd20d2403d1157cadc5f0e01;p=jalview.git diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index e986ba8..869a702 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -1,5 +1,6 @@ package jalview.ext.ensembl; +import jalview.analysis.AlignmentUtils; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; @@ -10,9 +11,13 @@ import jalview.datamodel.SequenceI; import jalview.exceptions.JalviewException; import jalview.io.FastaFile; import jalview.io.FileParse; -import jalview.io.gff.SequenceOntology; +import jalview.io.gff.SequenceOntologyFactory; +import jalview.io.gff.SequenceOntologyI; +import jalview.schemes.ResidueProperties; import jalview.util.DBRefUtils; import jalview.util.MapList; +import jalview.util.MappingUtils; +import jalview.util.StringUtils; import java.io.IOException; import java.net.MalformedURLException; @@ -21,7 +26,9 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map.Entry; /** * Base class for Ensembl sequence fetchers @@ -30,25 +37,39 @@ import java.util.List; */ public abstract class EnsemblSeqProxy extends EnsemblRestClient { + private static final List CROSS_REFERENCES = Arrays + .asList(new String[] { "CCDS", "Uniprot/SWISSPROT" }); + + protected static final String CONSEQUENCE_TYPE = "consequence_type"; + + protected static final String PARENT = "Parent"; + + protected static final String ID = "ID"; + + protected static final String NAME = "Name"; + + /* + * enum for 'type' parameter to the /sequence REST service + */ public enum EnsemblSeqType { /** - * type=genomic for the full dna including introns + * type=genomic to fetch full dna including introns */ GENOMIC("genomic"), /** - * type=cdna for transcribed dna including UTRs + * type=cdna to fetch dna including UTRs */ CDNA("cdna"), /** - * type=cds for coding dna excluding UTRs + * type=cds to fetch coding dna excluding UTRs */ CDS("cds"), /** - * type=protein for the peptide product sequence + * type=protein to fetch peptide product sequence */ PROTEIN("protein"); @@ -88,7 +109,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient return (forwards ? 1 : -1) * Integer.compare(o1[0], o2[0]); } - }; + } /** * Constructor @@ -99,7 +120,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient /** * Makes the sequence queries to Ensembl's REST service and returns an - * alignment consisting of the returned sequences + * alignment consisting of the returned sequences. */ @Override public AlignmentI getSequenceRecords(String query) throws Exception @@ -108,7 +129,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient // danger: accession separator used as a regex here, a string elsewhere // in this case it is ok (it is just a space), but (e.g.) '\' would not be - List allIds = Arrays.asList(query.split(getAccessionSeparator())); + List allIds = Arrays.asList(query + .split(getAccessionSeparator())); AlignmentI alignment = null; inProgress = true; @@ -131,26 +153,29 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient + " chunks. Unexpected problem (" + r.getLocalizedMessage() + ")"; System.err.println(msg); - if (alignment != null) - { - break; // return what we got - } - else - { - throw new JalviewException(msg, r); - } + break; } } + if (alignment == null) + { + return null; + } + /* - * fetch and transfer genomic sequence features + * fetch and transfer genomic sequence features, + * fetch protein product and add as cross-reference */ for (String accId : allIds) { addFeaturesAndProduct(accId, alignment); } - inProgress = false; + for (SequenceI seq : alignment.getSequences()) + { + getCrossReferences(seq); + } + return alignment; } @@ -165,29 +190,40 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ protected void addFeaturesAndProduct(String accId, AlignmentI alignment) { + if (alignment == null) + { + return; + } + try { /* * get 'dummy' genomic sequence with exon, cds and variation features */ - EnsemblOverlap gffFetcher = new EnsemblOverlap(); + SequenceI genomicSequence = null; + EnsemblFeatures gffFetcher = new EnsemblFeatures(); EnsemblFeatureType[] features = getFeaturesToFetch(); AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId, features); if (geneFeatures.getHeight() > 0) { + genomicSequence = geneFeatures.getSequenceAt(0); + } + if (genomicSequence != null) + { /* * transfer features to the query sequence */ - SequenceI genomicSequence = geneFeatures.getSequenceAt(0); SequenceI querySeq = alignment.findName(accId); - transferFeatures(accId, genomicSequence, querySeq); + if (transferFeatures(accId, genomicSequence, querySeq)) + { - /* - * fetch and map protein product, and add it as a cross-reference - * of the retrieved sequence - */ - addProteinProduct(querySeq); + /* + * fetch and map protein product, and add it as a cross-reference + * of the retrieved sequence + */ + addProteinProduct(querySeq); + } } } catch (IOException e) { @@ -232,10 +268,19 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient MapList mapList = mapCdsToProtein(querySeq, proteinSeq); if (mapList != null) { - Mapping map = new Mapping(proteinSeq.getDatasetSequence(), mapList); + // clunky: ensure Uniprot xref if we have one is on mapped sequence + SequenceI ds = proteinSeq.getDatasetSequence(); + ds.setSourceDBRef(proteinSeq.getSourceDBRef()); + Mapping map = new Mapping(ds, mapList); DBRefEntry dbr = new DBRefEntry(getDbSource(), getDbVersion(), accId, map); querySeq.getDatasetSequence().addDBRef(dbr); + + /* + * compute peptide variants from dna variants and add as + * sequence features on the protein sequence ta-da + */ + computeProteinFeatures(querySeq, proteinSeq, mapList); } } catch (Exception e) { @@ -246,6 +291,44 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** + * Get database xrefs from Ensembl, and attach them to the sequence + * + * @param seq + */ + protected void getCrossReferences(SequenceI seq) + { + while (seq.getDatasetSequence() != null) + { + seq = seq.getDatasetSequence(); + } + + EnsemblXref xrefFetcher = new EnsemblXref(); + List xrefs = xrefFetcher.getCrossReferences(seq.getName(), + getCrossReferenceDatabases()); + for (DBRefEntry xref : xrefs) + { + seq.addDBRef(xref); + /* + * Save any Uniprot xref to be the reference for SIFTS mapping + */ + if (DBRefSource.UNIPROT.equals(xref.getSource())) + { + seq.setSourceDBRef(xref); + } + } + } + + /** + * Returns a list of database names to be used when fetching cross-references. + * + * @return + */ + protected List getCrossReferenceDatabases() + { + return CROSS_REFERENCES; + } + + /** * Returns a mapping from dna to protein by inspecting sequence features of * type "CDS" on the dna. * @@ -255,49 +338,99 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ protected MapList mapCdsToProtein(SequenceI dnaSeq, SequenceI proteinSeq) { - SequenceFeature[] sfs = dnaSeq.getSequenceFeatures(); - if (sfs == null) - { - return null; - } - List ranges = new ArrayList(50); - SequenceOntology so = SequenceOntology.getInstance(); - int mappedDnaLength = 0; - + int mappedDnaLength = getCdsRanges(dnaSeq, ranges); + + int proteinLength = proteinSeq.getLength(); + int proteinEnd = proteinLength; + int proteinStart = 1; + /* - * Map CDS columns of dna to peptide. No need to worry about reverse strand - * dna here since the retrieved sequence is as transcribed (reverse - * complement for reverse strand), i.e in the same sense as the peptide. + * incomplete start codon may mean X at start of peptide + * we ignore both for mapping purposes */ - for (SequenceFeature sf : sfs) + if (proteinSeq.getCharAt(0) == 'X') { - /* - * process a CDS feature (or a sub-type of CDS) - */ - if (so.isA(sf.getType(), SequenceOntology.CDS)) - { - ranges.add(new int[] { sf.getBegin(), sf.getEnd() }); - mappedDnaLength += Math.abs(sf.getEnd() - sf.getBegin()) + 1; - } + proteinStart = 2; + proteinLength--; } - int proteinLength = proteinSeq.getLength(); List proteinRange = new ArrayList(); - proteinRange.add(new int[] { 1, proteinLength }); /* - * dna length should map to protein (or protein minus stop codon) + * dna length should map to protein (or protein plus stop codon) */ - if (mappedDnaLength == 3 * proteinLength - || mappedDnaLength == 3 * (proteinLength + 1)) + int codesForResidues = mappedDnaLength / 3; + if (codesForResidues == (proteinLength + 1)) + { + MappingUtils.unmapStopCodon(ranges, mappedDnaLength); + codesForResidues--; + } + if (codesForResidues == proteinLength) { + proteinRange.add(new int[] { proteinStart, proteinEnd }); return new MapList(ranges, proteinRange, 3, 1); } return null; } /** + * Adds CDS ranges to the ranges list, and returns the total length mapped + * from. + * + * No need to worry about reverse strand dna, here since the retrieved + * sequence is as transcribed (reverse complement for reverse strand), i.e in + * the same sense as the peptide. + * + * @param dnaSeq + * @param ranges + * @return + */ + protected int getCdsRanges(SequenceI dnaSeq, List ranges) + { + SequenceFeature[] sfs = dnaSeq.getSequenceFeatures(); + if (sfs == null) + { + return 0; + } + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); + int mappedDnaLength = 0; + for (SequenceFeature sf : sfs) + { + /* + * process a CDS feature (or a sub-type of CDS) + */ + if (so.isA(sf.getType(), SequenceOntologyI.CDS)) + { + int phase = 0; + try { + phase = Integer.parseInt(sf.getPhase()); + } catch (NumberFormatException e) + { + // ignore + } + /* + * phase > 0 on first codon means 5' incomplete - skip to the start + * of the next codon; example ENST00000496384 + */ + int begin = sf.getBegin(); + int end = sf.getEnd(); + if (ranges.isEmpty()) + { + begin += phase; + if (begin > end) + { + continue; // shouldn't happen? + } + } + ranges.add(new int[] { begin, end }); + mappedDnaLength += Math.abs(end - begin) + 1; + } + } + return mappedDnaLength; + } + + /** * Fetches sequences for the list of accession ids and adds them to the * alignment. Returns the extended (or created) alignment. * @@ -329,6 +462,15 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient "Only retrieved %d sequences for %d query strings", fr .getSeqs().size(), ids.size())); } + + if (fr.getSeqs().size() == 1 && fr.getSeqs().get(0).getLength() == 0) + { + /* + * POST request has returned an empty FASTA file e.g. for invalid id + */ + throw new IOException("No data returned for " + ids); + } + if (fr.getSeqs().size() > 0) { AlignmentI seqal = new Alignment( @@ -367,10 +509,16 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient @Override protected URL getUrl(List ids) throws MalformedURLException { - // ids are not used - they go in the POST body instead + /* + * a single id is included in the URL path + * multiple ids go in the POST body instead + */ StringBuffer urlstring = new StringBuffer(128); urlstring.append(SEQUENCE_ID_URL); - + if (ids.size() == 1) + { + urlstring.append("/").append(ids.get(0)); + } // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats urlstring.append("?type=").append(getSourceEnsemblType().getType()); urlstring.append(("&Accept=text/x-fasta")); @@ -391,19 +539,19 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } @Override - public boolean useGetRequest() + protected boolean useGetRequest() { return false; } @Override - public String getRequestMimeType() + protected String getRequestMimeType(boolean multipleIds) { - return "application/json"; + return multipleIds ? "application/json" : "text/x-fasta"; } @Override - public String getResponseMimeType() + protected String getResponseMimeType() { return "text/x-fasta"; } @@ -434,14 +582,23 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * backwards (for negative strand). Aborts and returns null if both positive * and negative strand are found (this should not normally happen). * - * @param sfs + * @param sourceSequence * @param accId + * @param start + * the start position of the sequence we are mapping to * @return */ - protected MapList getGenomicRanges(SequenceFeature[] sfs, String accId) + protected MapList getGenomicRangesFromFeatures(SequenceI sourceSequence, + String accId, int start) { + SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); + if (sfs == null) + { + return null; + } + /* - * generously size for initial number of cds regions + * generously initial size for number of cds regions * (worst case titin Q8WZ42 has c. 313 exons) */ List regions = new ArrayList(100); @@ -457,11 +614,12 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ if (identifiesSequence(sf, accId)) { - int strand = sf.getStrand(); - - if (directionSet && strand != direction) - { - // abort - mix of forward and backward + int strand = sf.getStrand(); + strand = strand == 0 ? 1 : strand; // treat unknown as forward + + if (directionSet && strand != direction) + { + // abort - mix of forward and backward System.err.println("Error: forward and backward strand for " + accId); return null; @@ -478,27 +636,54 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } else { - regions.add(new int[] { sf.getBegin(), sf.getEnd() }); - } - mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1); + regions.add(new int[] { sf.getBegin(), sf.getEnd() }); } + mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1); + + if (!isSpliceable()) + { + /* + * 'gene' sequence is contiguous so we can stop as soon as its + * identifying feature has been found + */ + break; + } + } } + if (regions.isEmpty()) + { + System.out.println("Failed to identify target sequence for " + accId + + " from genomic features"); + return null; + } + /* * a final sort is needed since Ensembl returns CDS sorted within source * (havana / ensembl_havana) */ Collections.sort(regions, new RangeSorter(direction == 1)); - List to = new ArrayList(); - to.add(new int[] { 1, mappedLength }); + List to = Arrays.asList(new int[] { start, + start + mappedLength - 1 }); return new MapList(regions, to, 1, 1); } /** - * Returns true if the sequence feature identifies positions of the genomic - * sequence feature which are within the sequence being retrieved. + * Answers true if the sequence being retrieved may occupy discontiguous + * regions on the genomic sequence. + */ + protected boolean isSpliceable() + { + return true; + } + + /** + * Returns true if the sequence feature marks positions of the genomic + * sequence feature which are within the sequence being retrieved. For + * example, an 'exon' feature whose parent is the target transcript marks the + * cdna positions of the transcript. * * @param sf * @param accId @@ -508,38 +693,46 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient String accId); /** - * Transfers the sequence feature to the target sequence, adjusting its start - * and end range based on the 'overlap' ranges. Features which do not overlap - * the target sequence are ignored, as are features with a parent other than - * the target sequence id. + * Transfers the sequence feature to the target sequence, locating its start + * and end range based on the mapping. Features which do not overlap the + * target sequence are ignored. * * @param sf * @param targetSequence - * @param overlap + * @param mapping + * mapping from the sequence feature's coordinates to the target + * sequence */ protected void transferFeature(SequenceFeature sf, - SequenceI targetSequence, MapList overlap) + SequenceI targetSequence, MapList mapping) { - String parent = (String) sf.getValue("Parent"); - if (parent != null && !parent.contains(targetSequence.getName())) - { - // this genomic feature belongs to a different transcript - return; - } - int start = sf.getBegin(); int end = sf.getEnd(); - int[] mappedRange = overlap.locateInTo(start, end); + int[] mappedRange = mapping.locateInTo(start, end); if (mappedRange != null) { SequenceFeature copy = new SequenceFeature(sf); - int offset = targetSequence.getStart() - 1; - copy.setBegin(offset + Math.min(mappedRange[0], mappedRange[1])); - copy.setEnd(offset + Math.max(mappedRange[0], mappedRange[1])); + copy.setBegin(Math.min(mappedRange[0], mappedRange[1])); + copy.setEnd(Math.max(mappedRange[0], mappedRange[1])); targetSequence.addSequenceFeature(copy); + + /* + * for sequence_variant, make an additional feature with consequence + */ + // if (SequenceOntologyFactory.getInstance().isA(sf.getType(), + // SequenceOntologyI.SEQUENCE_VARIANT)) + // { + // String consequence = (String) sf.getValue(CONSEQUENCE_TYPE); + // if (consequence != null) + // { + // SequenceFeature sf2 = new SequenceFeature("consequence", + // consequence, copy.getBegin(), copy.getEnd(), 0f, + // null); + // targetSequence.addSequenceFeature(sf2); + // } + // } } - } /** @@ -548,25 +741,55 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * @param accessionId * @param sourceSequence * @param targetSequence + * @return true if any features were transferred, else false */ - protected void transferFeatures(String accessionId, + protected boolean transferFeatures(String accessionId, SequenceI sourceSequence, SequenceI targetSequence) { if (sourceSequence == null || targetSequence == null) { - return; + return false; } + // long start = System.currentTimeMillis(); SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); - MapList overlap = getGenomicRanges(sfs, accessionId); + MapList mapping = getGenomicRangesFromFeatures(sourceSequence, accessionId, + targetSequence.getStart()); + if (mapping == null) + { + return false; + } - final boolean forwardStrand = overlap.isFromForwardStrand(); + boolean result = transferFeatures(sfs, targetSequence, mapping, + accessionId); + // System.out.println("transferFeatures (" + (sfs.length) + " --> " + // + targetSequence.getSequenceFeatures().length + ") to " + // + targetSequence.getName() + // + " took " + (System.currentTimeMillis() - start) + "ms"); + return result; + } + + /** + * Transfer features to the target sequence. The start/end positions are + * converted using the mapping. Features which do not overlap are ignored. + * Features whose parent is not the specified identifier are also ignored. + * + * @param features + * @param targetSequence + * @param mapping + * @param parentId + * @return + */ + protected boolean transferFeatures(SequenceFeature[] features, + SequenceI targetSequence, MapList mapping, String parentId) + { + final boolean forwardStrand = mapping.isFromForwardStrand(); /* * sort features by start position (descending if reverse strand) * before transferring (in forwards order) to the target sequence */ - Arrays.sort(sfs, new Comparator() + Arrays.sort(features, new Comparator() { @Override public int compare(SequenceFeature o1, SequenceFeature o2) @@ -576,23 +799,338 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } }); - for (SequenceFeature sf : sfs) + boolean transferred = false; + for (SequenceFeature sf : features) { - if (retainFeature(sf.getType())) + if (retainFeature(sf, parentId)) { - transferFeature(sf, targetSequence, overlap); + transferFeature(sf, targetSequence, mapping); + transferred = true; } } + return transferred; } /** - * Answers true if the feature type is one to attach to the retrieved sequence + * Answers true if the feature type is one we want to keep for the sequence. + * Some features are only retrieved in order to identify the sequence range, + * and may then be discarded as redundant information (e.g. "CDS" feature for + * a CDS sequence). + */ + @SuppressWarnings("unused") + protected boolean retainFeature(SequenceFeature sf, String accessionId) + { + return true; // override as required + } + + /** + * Answers true if the feature has a Parent which refers to the given + * accession id, or if the feature has no parent. Answers false if the + * feature's Parent is for a different accession id. + * + * @param sf + * @param identifier + * @return + */ + protected boolean featureMayBelong(SequenceFeature sf, String identifier) + { + String parent = (String) sf.getValue(PARENT); + // using contains to allow for prefix "gene:", "transcript:" etc + if (parent != null && !parent.contains(identifier)) + { + // this genomic feature belongs to a different transcript + return false; + } + return true; + } + + @Override + public String getDescription() + { + return "Ensembl " + getSourceEnsemblType().getType() + + " sequence with variant features"; + } + + /** + * Returns a (possibly empty) list of features on the sequence which have the + * specified sequence ontology type (or a sub-type of it), and the given + * identifier as parent * + * @param sequence * @param type + * @param parentId + * @return + */ + protected List findFeatures(SequenceI sequence, + String type, String parentId) + { + List result = new ArrayList(); + + SequenceFeature[] sfs = sequence.getSequenceFeatures(); + if (sfs != null) { + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); + for (SequenceFeature sf :sfs) { + if (so.isA(sf.getType(), type)) + { + String parent = (String) sf.getValue(PARENT); + if (parent.equals(parentId)) + { + result.add(sf); + } + } + } + } + return result; + } + + /** + * Maps exon features from dna to protein, and computes variants in peptide + * product generated by variants in dna, and adds them as sequence_variant + * features on the protein sequence. Returns the number of variant features + * added. + * + * @param dnaSeq + * @param peptide + * @param dnaToProtein + */ + static int computeProteinFeatures(SequenceI dnaSeq, + SequenceI peptide, MapList dnaToProtein) + { + while (dnaSeq.getDatasetSequence() != null) + { + dnaSeq = dnaSeq.getDatasetSequence(); + } + while (peptide.getDatasetSequence() != null) + { + peptide = peptide.getDatasetSequence(); + } + + AlignmentUtils.transferFeatures(dnaSeq, peptide, dnaToProtein, + SequenceOntologyI.EXON); + + LinkedHashMap variants = buildDnaVariantsMap( + dnaSeq, dnaToProtein); + + /* + * scan codon variations, compute peptide variants and add to peptide sequence + */ + int count = 0; + for (Entry variant : variants.entrySet()) + { + int peptidePos = variant.getKey(); + String[][] codonVariants = variant.getValue(); + String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based + List peptideVariants = computePeptideVariants(codonVariants, + residue); + if (!peptideVariants.isEmpty()) + { + String desc = StringUtils.listToDelimitedString(peptideVariants, + ", "); + SequenceFeature sf = new SequenceFeature( + SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos, + peptidePos, 0f, null); + peptide.addSequenceFeature(sf); + count++; + } + } + + /* + * ugly sort to get sequence features in start position order + * - would be better to store in Sequence as a TreeSet instead? + */ + Arrays.sort(peptide.getSequenceFeatures(), + new Comparator() + { + @Override + public int compare(SequenceFeature o1, SequenceFeature o2) + { + int c = Integer.compare(o1.getBegin(), o2.getBegin()); + return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd()) + : c; + } + }); + return count; + } + + /** + * Builds a map whose key is position in the protein sequence, and value is an + * array of all variants for the coding codon positions + * + * @param dnaSeq + * @param dnaToProtein + * @return + */ + static LinkedHashMap buildDnaVariantsMap( + SequenceI dnaSeq, MapList dnaToProtein) + { + /* + * map from peptide position to all variant features of the codon for it + * LinkedHashMap ensures we add the peptide features in sequence order + */ + LinkedHashMap variants = new LinkedHashMap(); + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); + + SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures(); + if (dnaFeatures == null) + { + return variants; + } + + int dnaStart = dnaSeq.getStart(); + int[] lastCodon = null; + int lastPeptidePostion = 0; + + /* + * build a map of codon variations for peptides + */ + for (SequenceFeature sf : dnaFeatures) + { + int dnaCol = sf.getBegin(); + if (dnaCol != sf.getEnd()) + { + // not handling multi-locus variant features + continue; + } + if (so.isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT)) + { + int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol); + if (mapsTo == null) + { + // feature doesn't lie within coding region + continue; + } + int peptidePosition = mapsTo[0]; + String[][] codonVariants = variants.get(peptidePosition); + if (codonVariants == null) + { + codonVariants = new String[3][]; + variants.put(peptidePosition, codonVariants); + } + + /* + * extract dna variants to a string array + */ + String alls = (String) sf.getValue("alleles"); + if (alls == null) + { + continue; + } + String[] alleles = alls.split(","); + + /* + * get this peptides codon positions e.g. [3, 4, 5] or [4, 7, 10] + */ + int[] codon = peptidePosition == lastPeptidePostion ? lastCodon + : MappingUtils.flattenRanges(dnaToProtein.locateInFrom( + peptidePosition, peptidePosition)); + lastPeptidePostion = peptidePosition; + lastCodon = codon; + + /* + * save nucleotide (and this variant) for each codon position + */ + for (int codonPos = 0; codonPos < 3; codonPos++) + { + String nucleotide = String.valueOf(dnaSeq + .getCharAt(codon[codonPos] - dnaStart)); + if (codon[codonPos] == dnaCol) + { + /* + * record current dna base and its alleles + */ + String[] dnaVariants = new String[alleles.length + 1]; + dnaVariants[0] = nucleotide; + System.arraycopy(alleles, 0, dnaVariants, 1, alleles.length); + codonVariants[codonPos] = dnaVariants; + } + else if (codonVariants[codonPos] == null) + { + /* + * record current dna base only + * (at least until we find any variation and overwrite it) + */ + codonVariants[codonPos] = new String[] { nucleotide }; + } + } + } + } + return variants; + } + + /** + * Returns a sorted, non-redundant list of all peptide translations generated + * by the given dna variants, excluding the current residue value + * + * @param codonVariants + * an array of base values (acgtACGT) for codon positions 1, 2, 3 + * @param residue + * the current residue translation + * @return + */ + static List computePeptideVariants( + String[][] codonVariants, String residue) + { + List result = new ArrayList(); + for (String base1 : codonVariants[0]) + { + for (String base2 : codonVariants[1]) + { + for (String base3 : codonVariants[2]) + { + String codon = base1 + base2 + base3; + // TODO: report frameshift/insertion/deletion + // and multiple-base variants?! + String peptide = codon.contains("-") ? "-" : ResidueProperties + .codonTranslate(codon); + if (peptide != null && !result.contains(peptide) + && !peptide.equalsIgnoreCase(residue)) + { + result.add(peptide); + } + } + } + } + + /* + * sort alphabetically with STOP at the end + */ + Collections.sort(result, new Comparator() + { + + @Override + public int compare(String o1, String o2) + { + if ("STOP".equals(o1)) + { + return 1; + } + else if ("STOP".equals(o2)) + { + return -1; + } + else + { + return o1.compareTo(o2); + } + } + }); + return result; + } + + /** + * Answers true if the feature type is either 'NMD_transcript_variant' or + * 'transcript' or one of its sub-types in the Sequence Ontology. This is + * needed because NMD_transcript_variant behaves like 'transcript' in Ensembl + * although strictly speaking it is not (it is a sub-type of + * sequence_variant). + * + * @param featureType * @return */ - protected boolean retainFeature(@SuppressWarnings("unused") String type) + public static boolean isTranscript(String featureType) { - return true; // default is to keep all + return SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(featureType) + || SequenceOntologyFactory.getInstance().isA(featureType, + SequenceOntologyI.TRANSCRIPT); } }