From 56b5f4d5ca50971a34c9284bbb4b0507f7ba8a71 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Thu, 28 Jan 2016 15:00:17 +0000 Subject: [PATCH] JAL-1705 tests added, minor bugfix and refactoring --- src/jalview/ext/ensembl/EnsemblCds.java | 17 +++ src/jalview/ext/ensembl/EnsemblSeqProxy.java | 121 ++++++++++++++------- test/jalview/ext/ensembl/EnsemblSeqProxyTest.java | 73 ++++++++++++- 3 files changed, 168 insertions(+), 43 deletions(-) diff --git a/src/jalview/ext/ensembl/EnsemblCds.java b/src/jalview/ext/ensembl/EnsemblCds.java index 7d0b6fd..22c0a06 100644 --- a/src/jalview/ext/ensembl/EnsemblCds.java +++ b/src/jalview/ext/ensembl/EnsemblCds.java @@ -1,8 +1,11 @@ package jalview.ext.ensembl; import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; import jalview.io.gff.SequenceOntology; +import java.util.List; + public class EnsemblCds extends EnsemblSeqProxy { /* @@ -76,4 +79,18 @@ public class EnsemblCds extends EnsemblSeqProxy return false; } + /** + * Overrides this method to trivially return a range which is the whole of the + * nucleotide sequence. This is both faster than scanning for CDS features, + * and also means we don't need to keep CDS features on CDS sequence (where + * they are redundant information). + */ + @Override + protected int getCdsRanges(SequenceI dnaSeq, List ranges) + { + int len = dnaSeq.getLength(); + ranges.add(new int[] { 1, len }); + return len; + } + } diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index fb0b01c..e241874 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -296,29 +296,62 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ protected MapList mapCdsToProtein(SequenceI dnaSeq, SequenceI proteinSeq) { - SequenceFeature[] sfs = dnaSeq.getSequenceFeatures(); - if (sfs == null) + List ranges = new ArrayList(50); + + int mappedDnaLength = getCdsRanges(dnaSeq, ranges); + + int proteinLength = proteinSeq.getLength(); + List proteinRange = new ArrayList(); + int proteinStart = 1; + + /* + * incomplete start codon may mean X at start of peptide + * we ignore both for mapping purposes + */ + if (proteinSeq.getCharAt(0) == 'X') { - return null; + proteinStart = 2; + proteinLength--; } + proteinRange.add(new int[] { proteinStart, proteinLength }); - List ranges = new ArrayList(50); - SequenceOntology so = SequenceOntology.getInstance(); - - int mappedDnaLength = 0; - /* - * Map CDS columns of dna to peptide. No need to worry about reverse strand - * dna here since the retrieved sequence is as transcribed (reverse - * complement for reverse strand), i.e in the same sense as the peptide. + * dna length should map to protein (or protein plus stop codon) */ - boolean fivePrimeIncomplete = false; + int codesForResidues = mappedDnaLength / 3; + if (codesForResidues == proteinLength + || codesForResidues == (proteinLength + 1)) + { + return new MapList(ranges, proteinRange, 3, 1); + } + return null; + } + + /** + * Adds CDS ranges to the ranges list, and returns the total length mapped. + * + * No need to worry about reverse strand dna here since the retrieved sequence + * is as transcribed (reverse complement for reverse strand), i.e in the same + * sense as the peptide. + * + * @param dnaSeq + * @param ranges + * @return + */ + protected int getCdsRanges(SequenceI dnaSeq, List ranges) + { + SequenceFeature[] sfs = dnaSeq.getSequenceFeatures(); + if (sfs == null) + { + return 0; + } + int mappedDnaLength = 0; for (SequenceFeature sf : sfs) { /* * process a CDS feature (or a sub-type of CDS) */ - if (so.isA(sf.getType(), SequenceOntology.CDS)) + if (SequenceOntology.getInstance().isA(sf.getType(), SequenceOntology.CDS)) { int phase = 0; try { @@ -335,7 +368,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient int end = sf.getEnd(); if (ranges.isEmpty() && phase > 0) { - fivePrimeIncomplete = true; begin += phase; if (begin > end) { @@ -346,26 +378,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient mappedDnaLength += Math.abs(end - begin) + 1; } } - int proteinLength = proteinSeq.getLength(); - List proteinRange = new ArrayList(); - int proteinStart = 1; - if (fivePrimeIncomplete && proteinSeq.getCharAt(0) == 'X') - { - proteinStart = 2; - proteinLength--; - } - proteinRange.add(new int[] { proteinStart, proteinLength }); - - /* - * dna length should map to protein (or protein plus stop codon) - */ - int codesForResidues = mappedDnaLength / 3; - if (codesForResidues == proteinLength - || codesForResidues == (proteinLength + 1)) - { - return new MapList(ranges, proteinRange, 3, 1); - } - return null; + return mappedDnaLength; } /** @@ -834,7 +847,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient peptide = peptide.getDatasetSequence(); } - mapExonsToProtein(dnaSeq, peptide, dnaToProtein); + mapExonFeaturesToProtein(dnaSeq, peptide, dnaToProtein); LinkedHashMap variants = buildDnaVariantsMap( dnaSeq, dnaToProtein); @@ -852,7 +865,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient residue); if (!peptideVariants.isEmpty()) { - Collections.sort(peptideVariants); String desc = StringUtils.listToDelimitedString(peptideVariants, ", "); SequenceFeature sf = new SequenceFeature( @@ -875,7 +887,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * @param peptide * @param dnaToProtein */ - static int mapExonsToProtein(SequenceI dnaSeq, SequenceI peptide, + static int mapExonFeaturesToProtein(SequenceI dnaSeq, SequenceI peptide, MapList dnaToProtein) { SequenceFeature[] sfs = dnaSeq.getSequenceFeatures(); @@ -930,6 +942,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient return variants; } + int dnaStart = dnaSeq.getStart(); int[] lastCodon = null; int lastPeptidePostion = 0; @@ -985,7 +998,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient for (int codonPos = 0; codonPos < 3; codonPos++) { String nucleotide = String.valueOf(dnaSeq - .getCharAt(codon[codonPos] - 1)); + .getCharAt(codon[codonPos] - dnaStart)); if (codon[codonPos] == dnaCol) { /* @@ -1011,11 +1024,11 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** - * Returns a non-redundant list of all peptide translations generated by the - * given dna variants, excluding the current residue value + * Returns a sorted, non-redundant list of all peptide translations generated + * by the given dna variants, excluding the current residue value * * @param codonVariants - * an array of base values for codon positions 1, 2, 3 + * an array of base values (acgtACGT) for codon positions 1, 2, 3 * @param residue * the current residue translation * @return @@ -1036,13 +1049,37 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient String peptide = codon.contains("-") ? "-" : ResidueProperties .codonTranslate(codon); if (peptide != null && !result.contains(peptide) - && !peptide.equals(residue)) + && !peptide.equalsIgnoreCase(residue)) { result.add(peptide); } } } } + + /* + * sort alphabetically with STOP at the end + */ + Collections.sort(result, new Comparator() + { + + @Override + public int compare(String o1, String o2) + { + if ("STOP".equals(o1)) + { + return 1; + } + else if ("STOP".equals(o2)) + { + return -1; + } + else + { + return o1.compareTo(o2); + } + } + }); return result; } diff --git a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java index 10ecfe0..c525e95 100644 --- a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java +++ b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java @@ -1,5 +1,7 @@ package jalview.ext.ensembl; +import static org.testng.AssertJUnit.assertEquals; + import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.SequenceI; @@ -208,5 +210,74 @@ public class EnsemblSeqProxyTest + (isAvailable ? "UP!" : "DOWN or unreachable ******************* BAD!")); } - // todo lots of tests + + /** + * Tests for the method that computes all peptide variants given codon + * variants + */ + @Test(groups = "Functional") + public void testComputePeptideVariants() + { + String[][] codonVariants = new String[][] { { "A" }, { "G" }, { "T" } }; + + /* + * AGT codes for S - this is not included in the variants returned + */ + List variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S"); + assertEquals("[]", variants.toString()); + + // S is reported if it differs from the current value (A): + variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "A"); + assertEquals("[S]", variants.toString()); + + /* + * synonymous variant is not reported + */ + codonVariants = new String[][] { { "A" }, { "G" }, { "C", "T" } }; + // AGC and AGT both code for S + variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "s"); + assertEquals("[]", variants.toString()); + + /* + * equivalent variants are only reported once + */ + codonVariants = new String[][] { { "C" }, { "T" }, + { "A", "C", "G", "T" } }; + // CTA CTC CTG CTT all code for L + variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S"); + assertEquals("[L]", variants.toString()); + + /* + * vary codons 1 and 2; variant products are sorted and non-redundant + */ + codonVariants = new String[][] { { "a", "C" }, { "g", "T" }, { "A" } }; + // aga ata cga cta code for R, I, R, L + variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S"); + assertEquals("[I, L, R]", variants.toString()); + + /* + * vary codons 2 and 3 + */ + codonVariants = new String[][] { { "a" }, { "g", "T" }, { "A", "c" } }; + // aga agc ata atc code for R, S, I, I + variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S"); + assertEquals("[I, R]", variants.toString()); + + /* + * vary codons 1 and 3 + */ + codonVariants = new String[][] { { "a", "t" }, { "a" }, { "t", "g" } }; + // aat aag tat tag code for N, K, Y, STOP - STOP sorted to end + variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S"); + assertEquals("[K, N, Y, STOP]", variants.toString()); + + /* + * vary codons 1, 2 and 3 + */ + codonVariants = new String[][] { { "a", "t" }, { "G", "C" }, + { "t", "g" } }; + // agt agg act acg tgt tgg tct tcg code for S, R, T, T, C, W, S, S + variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S"); + assertEquals("[C, R, T, W]", variants.toString()); + } } \ No newline at end of file -- 1.7.10.2