From: gmungoc Date: Wed, 27 Jan 2016 16:57:11 +0000 (+0000) Subject: JAL-1705 add exon name to features and render on peptide (to show X-Git-Tag: Release_2_10_0~296^2~56 X-Git-Url: http://source.jalview.org/gitweb/?p=jalview.git;a=commitdiff_plain;h=b4fac54ddfb91688f281a6a2ede0d8d44ec1dd13 JAL-1705 add exon name to features and render on peptide (to show boundaries) --- diff --git a/src/jalview/analysis/CrossRef.java b/src/jalview/analysis/CrossRef.java index 68f6c93..e96d9d7 100644 --- a/src/jalview/analysis/CrossRef.java +++ b/src/jalview/analysis/CrossRef.java @@ -27,22 +27,13 @@ import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; -import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; -import jalview.io.gff.SequenceOntology; -import jalview.schemes.ResidueProperties; import jalview.util.DBRefUtils; -import jalview.util.MapList; -import jalview.util.MappingUtils; -import jalview.util.StringUtils; import jalview.ws.SequenceFetcher; import jalview.ws.seqfetcher.ASequenceFetcher; import java.util.ArrayList; -import java.util.Collections; -import java.util.LinkedHashMap; import java.util.List; -import java.util.Map.Entry; import java.util.Vector; /** @@ -278,12 +269,6 @@ public class CrossRef // map should be from protein seq to its coding dna cf.addMap(rsq, dss, xref.getMap().getMap().getInverse()); } - - /* - * compute peptide variants from dna variants - */ - rsq.createDatasetSequence(); - computeProteinVariants(seq, rsq, xref.getMap().getMap()); } found = true; } @@ -585,195 +570,6 @@ public class CrossRef } /** - * Computes variants in peptide product generated by variants in dna, and adds - * them as sequence_variant features on the protein sequence. Returns the - * number of variant features added. - * - * @param dnaSeq - * @param peptide - * @param dnaToProtein - */ - protected static int computeProteinVariants(SequenceI dnaSeq, - SequenceI peptide, MapList dnaToProtein) - { - /* - * map from peptide position to all variant features of the codon for it - * LinkedHashMap ensures we add the peptide features in sequence order - */ - LinkedHashMap variants = new LinkedHashMap(); - SequenceOntology so = SequenceOntology.getInstance(); - - SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures(); - if (dnaFeatures == null) - { - return 0; - } - - int[] lastCodon = null; - int lastPeptidePostion = 0; - - /* - * build a map of codon variations for peptides - */ - for (SequenceFeature sf : dnaFeatures) - { - int dnaCol = sf.getBegin(); - if (dnaCol != sf.getEnd()) - { - // not handling multi-locus variant features - continue; - } - if (so.isSequenceVariant(sf.getType())) - { - int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol); - if (mapsTo == null) - { - // feature doesn't lie within coding region - continue; - } - int peptidePosition = mapsTo[0]; - String[][] codonVariants = variants.get(peptidePosition); - if (codonVariants == null) - { - codonVariants = new String[3][]; - variants.put(peptidePosition, codonVariants); - } - - /* - * extract dna variants to a string array - */ - String alls = (String) sf.getValue("alleles"); - if (alls == null) - { - continue; - } - String[] alleles = alls.split(","); - - /* - * get this peptides codon positions e.g. [3, 4, 5] or [4, 7, 10] - */ - int[] codon = peptidePosition == lastPeptidePostion ? lastCodon - : MappingUtils.flattenRanges(dnaToProtein.locateInFrom( - peptidePosition, peptidePosition)); - lastPeptidePostion = peptidePosition; - lastCodon = codon; - - /* - * save nucleotide (and this variant) for each codon position - */ - for (int codonPos = 0; codonPos < 3; codonPos++) - { - String nucleotide = String.valueOf(dnaSeq - .getCharAt(codon[codonPos] - 1)); - if (codon[codonPos] == dnaCol) - { - /* - * record current dna base and its alleles - */ - String[] dnaVariants = new String[alleles.length + 1]; - dnaVariants[0] = nucleotide; - System.arraycopy(alleles, 0, dnaVariants, 1, alleles.length); - codonVariants[codonPos] = dnaVariants; - } - else if (codonVariants[codonPos] == null) - { - /* - * record current dna base only - * (at least until we find any variation and overwrite it) - */ - codonVariants[codonPos] = new String[] { nucleotide }; - } - } - } - } - - /* - * scan codon variations, compute peptide variants and add to peptide sequence - */ - int count = 0; - for (Entry variant : variants.entrySet()) - { - int peptidePos = variant.getKey(); - String[][] codonVariants = variant.getValue(); - String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based - List peptideVariants = computePeptideVariants(codonVariants, - residue); - if (!peptideVariants.isEmpty()) - { - Collections.sort(peptideVariants); - String desc = StringUtils.listToDelimitedString(peptideVariants, - ", "); - SequenceFeature sf = new SequenceFeature( - SequenceOntology.SEQUENCE_VARIANT, desc, peptidePos, - peptidePos, Float.NaN, null); - peptide.getDatasetSequence().addSequenceFeature(sf); - count++; - } - } - return count; - } - - /** - * Returns a non-redundant list of all peptide translations generated by the - * given dna variants, excluding the current residue value - * - * @param codonVariants - * an array of base values for codon positions 1, 2, 3 - * @param residue - * the current residue translation - * @return - */ - protected static List computePeptideVariants( - String[][] codonVariants, String residue) - { - List result = new ArrayList(); - for (String base1 : codonVariants[0]) - { - for (String base2 : codonVariants[1]) - { - for (String base3 : codonVariants[2]) - { - String codon = base1 + base2 + base3; - // TODO: report frameshift/insertion/deletion - // and multiple-base variants?! - String peptide = codon.contains("-") ? "-" : ResidueProperties - .codonTranslate(codon); - if (peptide != null && !result.contains(peptide) - && !peptide.equals(residue)) - { - result.add(peptide); - } - } - } - } - return result; - } - - /** - * Computes a list of all peptide variants given dna variants - * - * @param dnaSeq - * the coding dna sequence - * @param codonVariants - * variant features for each codon position (null if no variant) - * @param residue - * the canonical protein translation - * @return - */ - protected static List computePeptideVariants(SequenceI dnaSeq, - SequenceFeature[] codonVariants, String residue) - { - List result = new ArrayList(); - int[][] dnaVariants = new int[3][]; - for (int i = 0; i < 3; i++) - { - - } - // TODO Auto-generated method stub - return null; - } - - /** * precalculate different products that can be found for seqs in dataset and * return them. * diff --git a/src/jalview/ext/ensembl/EnsemblCdna.java b/src/jalview/ext/ensembl/EnsemblCdna.java index 0a97425..a2ecfcd 100644 --- a/src/jalview/ext/ensembl/EnsemblCdna.java +++ b/src/jalview/ext/ensembl/EnsemblCdna.java @@ -45,18 +45,13 @@ public class EnsemblCdna extends EnsemblSeqProxy } /** - * Answers true unless the feature type is 'transcript' or 'exon' (or a - * sub-type in the Sequence Ontology). These features are only retrieved in - * order to identify the exon sequence loci, and are redundant information on - * the exon sequence itself. + * Answers true unless the feature type is 'transcript' (or a sub-type in the + * Sequence Ontology). */ @Override protected boolean retainFeature(SequenceFeature sf, String accessionId) { - SequenceOntology so = SequenceOntology.getInstance(); - String type = sf.getType(); - - if (isTranscript(type) || so.isA(type, SequenceOntology.EXON)) + if (isTranscript(sf.getType())) { return false; } diff --git a/src/jalview/ext/ensembl/EnsemblCds.java b/src/jalview/ext/ensembl/EnsemblCds.java index 1f875a7..7d0b6fd 100644 --- a/src/jalview/ext/ensembl/EnsemblCds.java +++ b/src/jalview/ext/ensembl/EnsemblCds.java @@ -7,10 +7,11 @@ public class EnsemblCds extends EnsemblSeqProxy { /* * fetch cds features on genomic sequence (to identify the CDS regions) - * and variation features (to retain) + * and exon and variation features (to retain for display) */ private static final EnsemblFeatureType[] FEATURES_TO_FETCH = { - EnsemblFeatureType.cds, EnsemblFeatureType.variation }; + EnsemblFeatureType.cds, EnsemblFeatureType.exon, + EnsemblFeatureType.variation }; /** * Constructor diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index a6d838b..fb0b01c 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -11,8 +11,11 @@ import jalview.exceptions.JalviewException; import jalview.io.FastaFile; import jalview.io.FileParse; import jalview.io.gff.SequenceOntology; +import jalview.schemes.ResidueProperties; import jalview.util.DBRefUtils; import jalview.util.MapList; +import jalview.util.MappingUtils; +import jalview.util.StringUtils; import java.io.IOException; import java.net.MalformedURLException; @@ -21,7 +24,9 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map.Entry; /** * Base class for Ensembl sequence fetchers @@ -266,6 +271,12 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient DBRefEntry dbr = new DBRefEntry(getDbSource(), getDbVersion(), accId, map); querySeq.getDatasetSequence().addDBRef(dbr); + + /* + * compute peptide variants from dna variants and add as + * sequence features on the protein sequence ta-da + */ + computeProteinFeatures(querySeq, proteinSeq, mapList); } } catch (Exception e) { @@ -802,6 +813,240 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** + * Maps exon features from dna to protein, and computes variants in peptide + * product generated by variants in dna, and adds them as sequence_variant + * features on the protein sequence. Returns the number of variant features + * added. + * + * @param dnaSeq + * @param peptide + * @param dnaToProtein + */ + static int computeProteinFeatures(SequenceI dnaSeq, + SequenceI peptide, MapList dnaToProtein) + { + while (dnaSeq.getDatasetSequence() != null) + { + dnaSeq = dnaSeq.getDatasetSequence(); + } + while (peptide.getDatasetSequence() != null) + { + peptide = peptide.getDatasetSequence(); + } + + mapExonsToProtein(dnaSeq, peptide, dnaToProtein); + + LinkedHashMap variants = buildDnaVariantsMap( + dnaSeq, dnaToProtein); + + /* + * scan codon variations, compute peptide variants and add to peptide sequence + */ + int count = 0; + for (Entry variant : variants.entrySet()) + { + int peptidePos = variant.getKey(); + String[][] codonVariants = variant.getValue(); + String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based + List peptideVariants = computePeptideVariants(codonVariants, + residue); + if (!peptideVariants.isEmpty()) + { + Collections.sort(peptideVariants); + String desc = StringUtils.listToDelimitedString(peptideVariants, + ", "); + SequenceFeature sf = new SequenceFeature( + SequenceOntology.SEQUENCE_VARIANT, desc, peptidePos, + peptidePos, 0f, null); + peptide.addSequenceFeature(sf); + count++; + } + } + return count; + } + + /** + * Transfers exon features to the corresponding mapped regions of the protein + * sequence. This is useful because it allows visualisation of exon boundaries + * on the peptide (using 'colour by label' for the exon name). Returns the + * number of features written. + * + * @param dnaSeq + * @param peptide + * @param dnaToProtein + */ + static int mapExonsToProtein(SequenceI dnaSeq, SequenceI peptide, + MapList dnaToProtein) + { + SequenceFeature[] sfs = dnaSeq.getSequenceFeatures(); + if (sfs == null) + { + return 0; + } + + SequenceOntology so = SequenceOntology.getInstance(); + int count = 0; + + for (SequenceFeature sf : sfs) + { + if (so.isA(sf.getType(), SequenceOntology.EXON)) + { + int start = sf.getBegin(); + int end = sf.getEnd(); + int[] mapsTo = dnaToProtein.locateInTo(start, end); + if (mapsTo != null) + { + SequenceFeature copy = new SequenceFeature(SequenceOntology.EXON, + sf.getDescription(), mapsTo[0], mapsTo[1], 0f, null); + peptide.addSequenceFeature(copy); + count++; + } + } + } + return count; + } + + /** + * Builds a map whose key is position in the protein sequence, and value is an + * array of all variants for the coding codon positions + * + * @param dnaSeq + * @param dnaToProtein + * @return + */ + static LinkedHashMap buildDnaVariantsMap( + SequenceI dnaSeq, MapList dnaToProtein) + { + /* + * map from peptide position to all variant features of the codon for it + * LinkedHashMap ensures we add the peptide features in sequence order + */ + LinkedHashMap variants = new LinkedHashMap(); + SequenceOntology so = SequenceOntology.getInstance(); + + SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures(); + if (dnaFeatures == null) + { + return variants; + } + + int[] lastCodon = null; + int lastPeptidePostion = 0; + + /* + * build a map of codon variations for peptides + */ + for (SequenceFeature sf : dnaFeatures) + { + int dnaCol = sf.getBegin(); + if (dnaCol != sf.getEnd()) + { + // not handling multi-locus variant features + continue; + } + if (so.isSequenceVariant(sf.getType())) + { + int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol); + if (mapsTo == null) + { + // feature doesn't lie within coding region + continue; + } + int peptidePosition = mapsTo[0]; + String[][] codonVariants = variants.get(peptidePosition); + if (codonVariants == null) + { + codonVariants = new String[3][]; + variants.put(peptidePosition, codonVariants); + } + + /* + * extract dna variants to a string array + */ + String alls = (String) sf.getValue("alleles"); + if (alls == null) + { + continue; + } + String[] alleles = alls.split(","); + + /* + * get this peptides codon positions e.g. [3, 4, 5] or [4, 7, 10] + */ + int[] codon = peptidePosition == lastPeptidePostion ? lastCodon + : MappingUtils.flattenRanges(dnaToProtein.locateInFrom( + peptidePosition, peptidePosition)); + lastPeptidePostion = peptidePosition; + lastCodon = codon; + + /* + * save nucleotide (and this variant) for each codon position + */ + for (int codonPos = 0; codonPos < 3; codonPos++) + { + String nucleotide = String.valueOf(dnaSeq + .getCharAt(codon[codonPos] - 1)); + if (codon[codonPos] == dnaCol) + { + /* + * record current dna base and its alleles + */ + String[] dnaVariants = new String[alleles.length + 1]; + dnaVariants[0] = nucleotide; + System.arraycopy(alleles, 0, dnaVariants, 1, alleles.length); + codonVariants[codonPos] = dnaVariants; + } + else if (codonVariants[codonPos] == null) + { + /* + * record current dna base only + * (at least until we find any variation and overwrite it) + */ + codonVariants[codonPos] = new String[] { nucleotide }; + } + } + } + } + return variants; + } + + /** + * Returns a non-redundant list of all peptide translations generated by the + * given dna variants, excluding the current residue value + * + * @param codonVariants + * an array of base values for codon positions 1, 2, 3 + * @param residue + * the current residue translation + * @return + */ + static List computePeptideVariants( + String[][] codonVariants, String residue) + { + List result = new ArrayList(); + for (String base1 : codonVariants[0]) + { + for (String base2 : codonVariants[1]) + { + for (String base3 : codonVariants[2]) + { + String codon = base1 + base2 + base3; + // TODO: report frameshift/insertion/deletion + // and multiple-base variants?! + String peptide = codon.contains("-") ? "-" : ResidueProperties + .codonTranslate(codon); + if (peptide != null && !result.contains(peptide) + && !peptide.equals(residue)) + { + result.add(peptide); + } + } + } + } + return result; + } + + /** * Answers true if the feature type is either 'NMD_transcript_variant' or * 'transcript' or one of its sub-types in the Sequence Ontology. This is * needed because NMD_transcript_variant behaves like 'transcript' in Ensembl diff --git a/src/jalview/io/gff/Gff3Helper.java b/src/jalview/io/gff/Gff3Helper.java index 4a2a50e..2e98e4e 100644 --- a/src/jalview/io/gff/Gff3Helper.java +++ b/src/jalview/io/gff/Gff3Helper.java @@ -372,7 +372,9 @@ public class Gff3Helper extends GffHelperBase desc = target.split(" ")[0]; } - if (SequenceOntology.getInstance().isSequenceVariant(sf.getType())) + SequenceOntology so = SequenceOntology.getInstance(); + String type = sf.getType(); + if (so.isSequenceVariant(type)) { /* * Ensembl returns dna variants as 'alleles' @@ -382,9 +384,11 @@ public class Gff3Helper extends GffHelperBase } /* - * Ensembl returns gene name as 'Name' for a transcript + * extract 'Name' for a transcript (to show gene name) + * or an exon (so 'colour by label' shows exon boundaries) */ - if (EnsemblSeqProxy.isTranscript(sf.getType())) + if (EnsemblSeqProxy.isTranscript(type) + || so.isA(type, SequenceOntology.EXON)) { desc = StringUtils.listToDelimitedString(attributes.get("Name"), ","); } diff --git a/src/jalview/io/gff/GffHelperBase.java b/src/jalview/io/gff/GffHelperBase.java index 499fa7b..feeec1d 100644 --- a/src/jalview/io/gff/GffHelperBase.java +++ b/src/jalview/io/gff/GffHelperBase.java @@ -321,13 +321,18 @@ public abstract class GffHelperBase implements GffHelperI { int start = Integer.parseInt(gff[START_COL]); int end = Integer.parseInt(gff[END_COL]); - float score = Float.NaN; + + /* + * default 'score' is 0 rather than Float.NaN as the latter currently + * disables the 'graduated colour => colour by label' option + */ + float score = 0f; try { score = Float.parseFloat(gff[SCORE_COL]); } catch (NumberFormatException nfe) { - // e.g. '.' - leave as NaN to indicate no score + // e.g. '.' - leave as zero } SequenceFeature sf = new SequenceFeature(gff[TYPE_COL],