From a9cc938067b06b2ee3d40245f0814ccf43008988 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Wed, 11 Mar 2015 08:47:00 +0000 Subject: [PATCH] JAL-845 cDNA mapping: select on sequence translation instead of name --- src/jalview/analysis/AlignmentUtils.java | 124 ++++++++++++++++-------- src/jalview/schemes/ResidueProperties.java | 8 +- test/jalview/analysis/AlignmentUtilsTests.java | 45 +++++++++ 3 files changed, 133 insertions(+), 44 deletions(-) diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index 6b7d18b..99ad525 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -214,11 +214,11 @@ public class AlignmentUtils /** * Build mapping of protein to cDNA alignment. Mappings are made between - * sequences which have the same name and compatible lengths. Any new mappings - * are added to the protein alignment. Has a 3-valued result: either Mapped - * (at least one sequence mapping was created), AlreadyMapped (all possible - * sequence mappings already exist), or NotMapped (no possible sequence - * mappings exist). + * sequences where the cDNA translates to the protein sequence. Any new + * mappings are added to the protein alignment. Has a 3-valued result: either + * Mapped (at least one sequence mapping was created), AlreadyMapped (all + * possible sequence mappings already exist), or NotMapped (no possible + * sequence mappings exist). * * @param proteinAlignment * @param cdnaAlignment @@ -236,29 +236,25 @@ public class AlignmentUtils boolean mappingPossible = false; boolean mappingPerformed = false; + List mapped = new ArrayList(); + List thisSeqs = proteinAlignment.getSequences(); - /* - * Build a look-up of cDNA sequences by name, for matching purposes. - */ - Map> cdnaSeqs = cdnaAlignment - .getSequencesByName(); - for (SequenceI aaSeq : thisSeqs) { AlignedCodonFrame acf = new AlignedCodonFrame(); - List candidates = cdnaSeqs.get(aaSeq.getName()); - if (candidates == null) + + for (SequenceI cdnaSeq : cdnaAlignment.getSequences()) { /* - * No cDNA sequence with matching name, so no mapping possible for this - * protein sequence + * Heuristic rule: don't map more than one AA sequence to the same cDNA; + * map progressively assuming that alignments have mappable sequences in + * the same respective order */ - continue; - } - mappingPossible = true; - for (SequenceI cdnaSeq : candidates) - { + if (mapped.contains(cdnaSeq)) + { + continue; + } if (!mappingExists(proteinAlignment.getCodonFrames(), aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence())) { @@ -267,6 +263,12 @@ public class AlignmentUtils { acf.addMap(cdnaSeq, aaSeq, map); mappingPerformed = true; + mapped.add(cdnaSeq); + + /* + * Heuristic rule #2: don't map AA sequence to more than one cDNA + */ + break; } } } @@ -311,7 +313,8 @@ public class AlignmentUtils /** * Build a mapping (if possible) of a protein to a cDNA sequence. The cDNA * must be three times the length of the protein, possibly after ignoring - * start and/or stop codons. Returns null if no mapping is determined. + * start and/or stop codons, and must translate to the protein. Returns null + * if no mapping is determined. * * @param proteinSeqs * @param cdnaSeq @@ -321,34 +324,41 @@ public class AlignmentUtils SequenceI cdnaSeq) { /* - * Here we handle either dataset sequence set (desktop) or absent (applet) + * Here we handle either dataset sequence set (desktop) or absent (applet). + * Use only the char[] form of the sequence to avoid creating possibly large + * String objects. */ final SequenceI proteinDataset = proteinSeq.getDatasetSequence(); - String aaSeqString = proteinDataset != null ? proteinDataset - .getSequenceAsString() : proteinSeq.getSequenceAsString(); + char[] aaSeqChars = proteinDataset != null ? proteinDataset + .getSequence() : proteinSeq.getSequence(); final SequenceI cdnaDataset = cdnaSeq.getDatasetSequence(); - String cdnaSeqString = cdnaDataset != null ? cdnaDataset - .getSequenceAsString() : cdnaSeq.getSequenceAsString(); - if (aaSeqString == null || cdnaSeqString == null) + char[] cdnaSeqChars = cdnaDataset != null ? cdnaDataset.getSequence() + : cdnaSeq.getSequence(); + if (aaSeqChars == null || cdnaSeqChars == null) { return null; } - final int mappedLength = 3 * aaSeqString.length(); - int cdnaLength = cdnaSeqString.length(); + /* + * cdnaStart/End, proteinStartEnd are base 1 (for dataset sequence mapping) + */ + final int mappedLength = 3 * aaSeqChars.length; + int cdnaLength = cdnaSeqChars.length; int cdnaStart = 1; int cdnaEnd = cdnaLength; final int proteinStart = 1; - final int proteinEnd = aaSeqString.length(); + final int proteinEnd = aaSeqChars.length; /* * If lengths don't match, try ignoring stop codon. */ - if (cdnaLength != mappedLength) + if (cdnaLength != mappedLength && cdnaLength > 2) { - for (Object stop : ResidueProperties.STOP) + String lastCodon = String.valueOf(cdnaSeqChars, cdnaLength - 3, 3) + .toUpperCase(); + for (String stop : ResidueProperties.STOP) { - if (cdnaSeqString.toUpperCase().endsWith((String) stop)) + if (lastCodon.equals(stop)) { cdnaEnd -= 3; cdnaLength -= 3; @@ -361,24 +371,58 @@ public class AlignmentUtils * If lengths still don't match, try ignoring start codon. */ if (cdnaLength != mappedLength - && cdnaSeqString.toUpperCase().startsWith( + && cdnaLength > 2 + && String.valueOf(cdnaSeqChars, 0, 3).toUpperCase() + .equals( ResidueProperties.START)) { cdnaStart += 3; cdnaLength -= 3; } - if (cdnaLength == mappedLength) + if (cdnaLength != mappedLength) { - MapList map = new MapList(new int[] - { cdnaStart, cdnaEnd }, new int[] - { proteinStart, proteinEnd }, 3, 1); - return map; + return null; } - else + if (!translatesAs(cdnaSeqChars, cdnaStart - 1, aaSeqChars)) { return null; } + MapList map = new MapList(new int[] + { cdnaStart, cdnaEnd }, new int[] + { proteinStart, proteinEnd }, 3, 1); + return map; + } + + /** + * Test whether the given cdna sequence, starting at the given offset, + * translates to the given amino acid sequence, using the standard translation + * table. Designed to fail fast i.e. as soon as a mismatch position is found. + * + * @param cdnaSeqChars + * @param cdnaStart + * @param aaSeqChars + * @return + */ + protected static boolean translatesAs(char[] cdnaSeqChars, int cdnaStart, + char[] aaSeqChars) + { + int aaResidue = 0; + for (int i = cdnaStart; i < cdnaSeqChars.length - 2 + && aaResidue < aaSeqChars.length; i += 3) + { + String codon = String.valueOf(cdnaSeqChars, i, 3); + final String translated = ResidueProperties.codonTranslate( + codon); + if (translated == null + || !(aaSeqChars[aaResidue] == translated.charAt(0))) + { + return false; + } + aaResidue++; + } + // fail if we didn't match all of the aa sequence + return (aaResidue == aaSeqChars.length); } /** diff --git a/src/jalview/schemes/ResidueProperties.java b/src/jalview/schemes/ResidueProperties.java index 4bb246a..c11423c 100755 --- a/src/jalview/schemes/ResidueProperties.java +++ b/src/jalview/schemes/ResidueProperties.java @@ -675,7 +675,7 @@ public class ResidueProperties public static Vector Phe = new Vector(); - public static Vector STOP = new Vector(); + public static List STOP = new ArrayList(); public static String START = "ATG"; @@ -1099,9 +1099,9 @@ public class ResidueProperties Gly.addElement("GGC"); Gly.addElement("GGT"); - STOP.addElement("TGA"); - STOP.addElement("TAA"); - STOP.addElement("TAG"); + STOP.add("TGA"); + STOP.add("TAA"); + STOP.add("TAG"); Trp.addElement("TGG"); diff --git a/test/jalview/analysis/AlignmentUtilsTests.java b/test/jalview/analysis/AlignmentUtilsTests.java index 2711a36..f56dd5d 100644 --- a/test/jalview/analysis/AlignmentUtilsTests.java +++ b/test/jalview/analysis/AlignmentUtilsTests.java @@ -21,6 +21,7 @@ package jalview.analysis; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertSame; import static org.junit.Assert.assertTrue; import jalview.analysis.AlignmentUtils.MappingResult; @@ -568,4 +569,48 @@ public class AlignmentUtilsTests assertEquals("C--H--Y-Q", prot3.getSequenceAsString()); } + /** + * Test the method that tests whether a CDNA sequence translates to a protein + * sequence + */ + @Test + public void testTranslatesAs() + { + assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0, + "FPKG".toCharArray())); + // with start codon + assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(), + 3, "FPKG".toCharArray())); + // with stop codon1 + assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(), + 0, "FPKG".toCharArray())); + // with stop codon2 + assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtag".toCharArray(), + 0, "FPKG".toCharArray())); + // with stop codon3 + assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtga".toCharArray(), + 0, "FPKG".toCharArray())); + // with start and stop codon1 + assertTrue(AlignmentUtils.translatesAs( + "atgtttcccaaaggtaa".toCharArray(), 3, "FPKG".toCharArray())); + // with start and stop codon2 + assertTrue(AlignmentUtils.translatesAs( + "atgtttcccaaaggtag".toCharArray(), 3, "FPKG".toCharArray())); + // with start and stop codon3 + assertTrue(AlignmentUtils.translatesAs( + "atgtttcccaaaggtga".toCharArray(), 3, "FPKG".toCharArray())); + + // wrong protein + assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), + 0, + "FPMG".toCharArray())); + } + + @Test + public void testTranslatesAs_withAmbiguityCodes() + { + // R means A or G + assertTrue(AlignmentUtils.translatesAs("car".toCharArray(), 0, + "Q".toCharArray())); + } } -- 1.7.10.2