From ecc50d775a514f9840cdcc10a9d5b1e49c60582c Mon Sep 17 00:00:00 2001 From: gmungoc Date: Sat, 23 Jan 2016 06:11:30 +0000 Subject: [PATCH] JAL-1705 compute peptide variants from dna; add "consequence" feature on CDS --- src/jalview/analysis/AlignmentUtils.java | 10 +- src/jalview/analysis/CrossRef.java | 204 ++++++++++++++++++++++++++ src/jalview/ext/ensembl/EnsemblCdna.java | 2 +- src/jalview/ext/ensembl/EnsemblCds.java | 2 +- src/jalview/ext/ensembl/EnsemblGenome.java | 4 +- src/jalview/ext/ensembl/EnsemblOverlap.java | 3 + src/jalview/ext/ensembl/EnsemblSeqProxy.java | 24 ++- src/jalview/io/gff/Gff3Helper.java | 8 +- src/jalview/io/gff/GffHelperBase.java | 2 +- 9 files changed, 241 insertions(+), 18 deletions(-) diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index 194e6af..2311dea 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -1394,7 +1394,7 @@ public class AlignmentUtils for (Mapping seqMapping : seqMappings) { - SequenceI cds = makeCdsSequence(dnaSeq, seqMapping, newMappings); + SequenceI cds = makeCdsSequence(dnaSeq, seqMapping); cdsSequences.add(cds); /* @@ -1507,19 +1507,16 @@ public class AlignmentUtils /** * Makes and returns a CDS-only sequence, where the CDS regions are identified - * as the 'from' ranges of the mapping on the dna. Any sequence features on - * the dna which overlap the CDS regions are copied to the new sequence. + * as the 'from' ranges of the mapping on the dna. * * @param dnaSeq * nucleotide sequence * @param seqMapping * mappings from CDS regions of nucleotide - * @param exonMappings - * CDS-to-peptide mapping (to add to) * @return */ protected static SequenceI makeCdsSequence(SequenceI dnaSeq, - Mapping seqMapping, AlignedCodonFrame exonMappings) + Mapping seqMapping) { StringBuilder newSequence = new StringBuilder(dnaSeq.getLength()); final char[] dna = dnaSeq.getSequence(); @@ -1542,6 +1539,7 @@ public class AlignmentUtils newSequence.toString()); transferDbRefs(seqMapping.getTo(), cds); + return cds; } diff --git a/src/jalview/analysis/CrossRef.java b/src/jalview/analysis/CrossRef.java index e96d9d7..68f6c93 100644 --- a/src/jalview/analysis/CrossRef.java +++ b/src/jalview/analysis/CrossRef.java @@ -27,13 +27,22 @@ import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; +import jalview.io.gff.SequenceOntology; +import jalview.schemes.ResidueProperties; import jalview.util.DBRefUtils; +import jalview.util.MapList; +import jalview.util.MappingUtils; +import jalview.util.StringUtils; import jalview.ws.SequenceFetcher; import jalview.ws.seqfetcher.ASequenceFetcher; import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map.Entry; import java.util.Vector; /** @@ -269,6 +278,12 @@ public class CrossRef // map should be from protein seq to its coding dna cf.addMap(rsq, dss, xref.getMap().getMap().getInverse()); } + + /* + * compute peptide variants from dna variants + */ + rsq.createDatasetSequence(); + computeProteinVariants(seq, rsq, xref.getMap().getMap()); } found = true; } @@ -570,6 +585,195 @@ public class CrossRef } /** + * Computes variants in peptide product generated by variants in dna, and adds + * them as sequence_variant features on the protein sequence. Returns the + * number of variant features added. + * + * @param dnaSeq + * @param peptide + * @param dnaToProtein + */ + protected static int computeProteinVariants(SequenceI dnaSeq, + SequenceI peptide, MapList dnaToProtein) + { + /* + * map from peptide position to all variant features of the codon for it + * LinkedHashMap ensures we add the peptide features in sequence order + */ + LinkedHashMap variants = new LinkedHashMap(); + SequenceOntology so = SequenceOntology.getInstance(); + + SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures(); + if (dnaFeatures == null) + { + return 0; + } + + int[] lastCodon = null; + int lastPeptidePostion = 0; + + /* + * build a map of codon variations for peptides + */ + for (SequenceFeature sf : dnaFeatures) + { + int dnaCol = sf.getBegin(); + if (dnaCol != sf.getEnd()) + { + // not handling multi-locus variant features + continue; + } + if (so.isSequenceVariant(sf.getType())) + { + int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol); + if (mapsTo == null) + { + // feature doesn't lie within coding region + continue; + } + int peptidePosition = mapsTo[0]; + String[][] codonVariants = variants.get(peptidePosition); + if (codonVariants == null) + { + codonVariants = new String[3][]; + variants.put(peptidePosition, codonVariants); + } + + /* + * extract dna variants to a string array + */ + String alls = (String) sf.getValue("alleles"); + if (alls == null) + { + continue; + } + String[] alleles = alls.split(","); + + /* + * get this peptides codon positions e.g. [3, 4, 5] or [4, 7, 10] + */ + int[] codon = peptidePosition == lastPeptidePostion ? lastCodon + : MappingUtils.flattenRanges(dnaToProtein.locateInFrom( + peptidePosition, peptidePosition)); + lastPeptidePostion = peptidePosition; + lastCodon = codon; + + /* + * save nucleotide (and this variant) for each codon position + */ + for (int codonPos = 0; codonPos < 3; codonPos++) + { + String nucleotide = String.valueOf(dnaSeq + .getCharAt(codon[codonPos] - 1)); + if (codon[codonPos] == dnaCol) + { + /* + * record current dna base and its alleles + */ + String[] dnaVariants = new String[alleles.length + 1]; + dnaVariants[0] = nucleotide; + System.arraycopy(alleles, 0, dnaVariants, 1, alleles.length); + codonVariants[codonPos] = dnaVariants; + } + else if (codonVariants[codonPos] == null) + { + /* + * record current dna base only + * (at least until we find any variation and overwrite it) + */ + codonVariants[codonPos] = new String[] { nucleotide }; + } + } + } + } + + /* + * scan codon variations, compute peptide variants and add to peptide sequence + */ + int count = 0; + for (Entry variant : variants.entrySet()) + { + int peptidePos = variant.getKey(); + String[][] codonVariants = variant.getValue(); + String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based + List peptideVariants = computePeptideVariants(codonVariants, + residue); + if (!peptideVariants.isEmpty()) + { + Collections.sort(peptideVariants); + String desc = StringUtils.listToDelimitedString(peptideVariants, + ", "); + SequenceFeature sf = new SequenceFeature( + SequenceOntology.SEQUENCE_VARIANT, desc, peptidePos, + peptidePos, Float.NaN, null); + peptide.getDatasetSequence().addSequenceFeature(sf); + count++; + } + } + return count; + } + + /** + * Returns a non-redundant list of all peptide translations generated by the + * given dna variants, excluding the current residue value + * + * @param codonVariants + * an array of base values for codon positions 1, 2, 3 + * @param residue + * the current residue translation + * @return + */ + protected static List computePeptideVariants( + String[][] codonVariants, String residue) + { + List result = new ArrayList(); + for (String base1 : codonVariants[0]) + { + for (String base2 : codonVariants[1]) + { + for (String base3 : codonVariants[2]) + { + String codon = base1 + base2 + base3; + // TODO: report frameshift/insertion/deletion + // and multiple-base variants?! + String peptide = codon.contains("-") ? "-" : ResidueProperties + .codonTranslate(codon); + if (peptide != null && !result.contains(peptide) + && !peptide.equals(residue)) + { + result.add(peptide); + } + } + } + } + return result; + } + + /** + * Computes a list of all peptide variants given dna variants + * + * @param dnaSeq + * the coding dna sequence + * @param codonVariants + * variant features for each codon position (null if no variant) + * @param residue + * the canonical protein translation + * @return + */ + protected static List computePeptideVariants(SequenceI dnaSeq, + SequenceFeature[] codonVariants, String residue) + { + List result = new ArrayList(); + int[][] dnaVariants = new int[3][]; + for (int i = 0; i < 3; i++) + { + + } + // TODO Auto-generated method stub + return null; + } + + /** * precalculate different products that can be found for seqs in dataset and * return them. * diff --git a/src/jalview/ext/ensembl/EnsemblCdna.java b/src/jalview/ext/ensembl/EnsemblCdna.java index b8c9c3f..2ac4956 100644 --- a/src/jalview/ext/ensembl/EnsemblCdna.java +++ b/src/jalview/ext/ensembl/EnsemblCdna.java @@ -67,7 +67,7 @@ public class EnsemblCdna extends EnsemblSeqProxy if (SequenceOntology.getInstance().isA(sf.getType(), SequenceOntology.EXON)) { - String parentFeature = (String) sf.getValue("Parent"); + String parentFeature = (String) sf.getValue(PARENT); if (("transcript:" + accId).equals(parentFeature)) { return true; diff --git a/src/jalview/ext/ensembl/EnsemblCds.java b/src/jalview/ext/ensembl/EnsemblCds.java index 897371d..e366569 100644 --- a/src/jalview/ext/ensembl/EnsemblCds.java +++ b/src/jalview/ext/ensembl/EnsemblCds.java @@ -61,7 +61,7 @@ public class EnsemblCds extends EnsemblSeqProxy if (SequenceOntology.getInstance().isA(sf.getType(), SequenceOntology.CDS)) { - String parentFeature = (String) sf.getValue("Parent"); + String parentFeature = (String) sf.getValue(PARENT); if (("transcript:" + accId).equals(parentFeature)) { return true; diff --git a/src/jalview/ext/ensembl/EnsemblGenome.java b/src/jalview/ext/ensembl/EnsemblGenome.java index 6b4a1f6..60f8c46 100644 --- a/src/jalview/ext/ensembl/EnsemblGenome.java +++ b/src/jalview/ext/ensembl/EnsemblGenome.java @@ -60,8 +60,8 @@ public class EnsemblGenome extends EnsemblSeqProxy if (SequenceOntology.getInstance().isA(sf.getType(), SequenceOntology.TRANSCRIPT)) { - String parentFeature = (String) sf.getValue("ID"); - if (("transcript:" + accId).equals(parentFeature)) + String id = (String) sf.getValue(ID); + if (("transcript:" + accId).equals(id)) { return true; } diff --git a/src/jalview/ext/ensembl/EnsemblOverlap.java b/src/jalview/ext/ensembl/EnsemblOverlap.java index 732b518..2ec4664 100644 --- a/src/jalview/ext/ensembl/EnsemblOverlap.java +++ b/src/jalview/ext/ensembl/EnsemblOverlap.java @@ -42,11 +42,14 @@ public class EnsemblOverlap extends EnsemblRestClient @Override public AlignmentI getSequenceRecords(String query) throws IOException { + long now = System.currentTimeMillis(); // TODO: use a vararg String... for getSequenceRecords instead? List queries = new ArrayList(); queries.add(query); FileParse fp = getSequenceReader(queries); FeaturesFile fr = new FeaturesFile(fp); + System.out.println(getClass().getName() + " took " + + (System.currentTimeMillis() - now) + "ms to fetch"); return new Alignment(fr.getSeqsAsArray()); } diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index e986ba8..b805417 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -30,6 +30,10 @@ import java.util.List; */ public abstract class EnsemblSeqProxy extends EnsemblRestClient { + protected static final String PARENT = "Parent"; + + protected static final String ID = "ID"; + public enum EnsemblSeqType { /** @@ -104,6 +108,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient @Override public AlignmentI getSequenceRecords(String query) throws Exception { + long now = System.currentTimeMillis(); // TODO use a String... query vararg instead? // danger: accession separator used as a regex here, a string elsewhere @@ -151,6 +156,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } inProgress = false; + System.out.println(getClass().getName() + " took " + + (System.currentTimeMillis() - now) + "ms to fetch"); return alignment; } @@ -520,7 +527,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient protected void transferFeature(SequenceFeature sf, SequenceI targetSequence, MapList overlap) { - String parent = (String) sf.getValue("Parent"); + String parent = (String) sf.getValue(PARENT); if (parent != null && !parent.contains(targetSequence.getName())) { // this genomic feature belongs to a different transcript @@ -538,6 +545,21 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient copy.setBegin(offset + Math.min(mappedRange[0], mappedRange[1])); copy.setEnd(offset + Math.max(mappedRange[0], mappedRange[1])); targetSequence.addSequenceFeature(copy); + + /* + * for sequence_variant, make an additional feature with consequence + */ + if (SequenceOntology.getInstance().isSequenceVariant(sf.getType())) + { + String consequence = (String) sf.getValue("consequence_type"); + if (consequence != null) + { + SequenceFeature sf2 = new SequenceFeature("consequence", + consequence, copy.getBegin(), copy.getEnd(), 0f, + null); + targetSequence.addSequenceFeature(sf2); + } + } } } diff --git a/src/jalview/io/gff/Gff3Helper.java b/src/jalview/io/gff/Gff3Helper.java index 55a0f9a..d878f64 100644 --- a/src/jalview/io/gff/Gff3Helper.java +++ b/src/jalview/io/gff/Gff3Helper.java @@ -374,14 +374,10 @@ public class Gff3Helper extends GffHelperBase if (SequenceOntology.getInstance().isSequenceVariant(sf.getType())) { /* - * Ensembl returns alleles and consequence_type (amongst other details) + * Ensembl returns dna variants as 'alleles' */ - String alleles = StringUtils.listToDelimitedString( + desc = StringUtils.listToDelimitedString( attributes.get("alleles"), ","); - String consequence = StringUtils.listToDelimitedString( - attributes.get("consequence_type"), ","); - desc = String.format("alleles %s; consequence %s", alleles, - consequence); } return desc; } diff --git a/src/jalview/io/gff/GffHelperBase.java b/src/jalview/io/gff/GffHelperBase.java index 6d9327b..d4a5358 100644 --- a/src/jalview/io/gff/GffHelperBase.java +++ b/src/jalview/io/gff/GffHelperBase.java @@ -349,7 +349,7 @@ public abstract class GffHelperBase implements GffHelperI for (Entry> attr : attributes.entrySet()) { String values = StringUtils.listToDelimitedString( - attr.getValue(), "; "); + attr.getValue(), ","); sf.setValue(attr.getKey(), values); if (NOTE.equals(attr.getKey())) { -- 1.7.10.2