From: gmungoc Date: Fri, 9 Feb 2018 12:14:38 +0000 (+0000) Subject: JAL-2789 avoid using transcript (with no UTR) as CDS sequence X-Git-Tag: Release_2_10_4~61^2 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=refs%2Fheads%2Fbug%2FJAL-2666;p=jalview.git JAL-2789 avoid using transcript (with no UTR) as CDS sequence --- diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index dfb185a..343ebc7 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -1833,7 +1833,7 @@ public class AlignmentUtils * @param seqMappings * the set of mappings involving dnaSeq * @param aMapping - * an initial candidate from seqMappings + * a transcript-to-peptide mapping * @return */ static SequenceI findCdsForProtein(List mappings, @@ -1858,7 +1858,15 @@ public class AlignmentUtils if (mappedFromLength == dnaLength || mappedFromLength == dnaLength - CODON_LENGTH) { - return seqDss; + /* + * if sequence has CDS features, this is a transcript with no UTR + * - do not take this as the CDS sequence! (JAL-2789) + */ + if (seqDss.getFeatures().getFeaturesByOntology(SequenceOntologyI.CDS) + .isEmpty()) + { + return seqDss; + } } /* @@ -1883,10 +1891,12 @@ public class AlignmentUtils { /* * found a 3:1 mapping to the protein product which covers - * the whole dna sequence i.e. is from CDS; finally check it - * is from the dna start sequence + * the whole dna sequence i.e. is from CDS; finally check the CDS + * is mapped from the given dna start sequence */ SequenceI cdsSeq = map.getFromSeq(); + // todo this test is weak if seqMappings contains multiple mappings; + // we get away with it if transcript:cds relationship is 1:1 List dnaToCdsMaps = MappingUtils .findMappingsForSequence(cdsSeq, seqMappings); if (!dnaToCdsMaps.isEmpty()) @@ -2180,12 +2190,13 @@ public class AlignmentUtils int mappedDnaLength = MappingUtils.getLength(ranges); /* - * if not a whole number of codons, something is wrong, - * abort mapping + * if not a whole number of codons, truncate mapping */ - if (mappedDnaLength % CODON_LENGTH > 0) + int codonRemainder = mappedDnaLength % CODON_LENGTH; + if (codonRemainder > 0) { - return null; + mappedDnaLength -= codonRemainder; + MappingUtils.removeEndPositions(codonRemainder, ranges); } int proteinLength = proteinSeq.getLength(); diff --git a/test/jalview/analysis/AlignmentUtilsTests.java b/test/jalview/analysis/AlignmentUtilsTests.java index 06b51e6..35196fa 100644 --- a/test/jalview/analysis/AlignmentUtilsTests.java +++ b/test/jalview/analysis/AlignmentUtilsTests.java @@ -47,6 +47,7 @@ import jalview.io.DataSourceType; import jalview.io.FileFormat; import jalview.io.FileFormatI; import jalview.io.FormatAdapter; +import jalview.io.gff.SequenceOntologyI; import jalview.util.MapList; import jalview.util.MappingUtils; @@ -263,14 +264,14 @@ public class AlignmentUtilsTests @Test(groups = { "Functional" }) public void testMapProteinAlignmentToCdna_noXrefs() throws IOException { - List protseqs = new ArrayList(); + List protseqs = new ArrayList<>(); protseqs.add(new Sequence("UNIPROT|V12345", "EIQ")); protseqs.add(new Sequence("UNIPROT|V12346", "EIQ")); protseqs.add(new Sequence("UNIPROT|V12347", "SAR")); AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3])); protein.setDataset(null); - List dnaseqs = new ArrayList(); + List dnaseqs = new ArrayList<>(); dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAA")); // = EIQ dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ @@ -507,7 +508,7 @@ public class AlignmentUtilsTests acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map); acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map); acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map); - ArrayList acfs = new ArrayList(); + ArrayList acfs = new ArrayList<>(); acfs.add(acf); protein.setCodonFrames(acfs); @@ -605,14 +606,14 @@ public class AlignmentUtilsTests public void testMapProteinAlignmentToCdna_withStartAndStopCodons() throws IOException { - List protseqs = new ArrayList(); + List protseqs = new ArrayList<>(); protseqs.add(new Sequence("UNIPROT|V12345", "EIQ")); protseqs.add(new Sequence("UNIPROT|V12346", "EIQ")); protseqs.add(new Sequence("UNIPROT|V12347", "SAR")); AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3])); protein.setDataset(null); - List dnaseqs = new ArrayList(); + List dnaseqs = new ArrayList<>(); // start + SAR: dnaseqs.add(new Sequence("EMBL|A11111", "ATGTCAGCACGC")); // = EIQ + stop @@ -697,14 +698,14 @@ public class AlignmentUtilsTests @Test(groups = { "Functional" }) public void testMapProteinAlignmentToCdna_withXrefs() throws IOException { - List protseqs = new ArrayList(); + List protseqs = new ArrayList<>(); protseqs.add(new Sequence("UNIPROT|V12345", "EIQ")); protseqs.add(new Sequence("UNIPROT|V12346", "EIQ")); protseqs.add(new Sequence("UNIPROT|V12347", "SAR")); AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3])); protein.setDataset(null); - List dnaseqs = new ArrayList(); + List dnaseqs = new ArrayList<>(); dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR dnaseqs.add(new Sequence("EMBL|A22222", "ATGGAGATACAA")); // = start + EIQ dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ @@ -774,14 +775,14 @@ public class AlignmentUtilsTests public void testMapProteinAlignmentToCdna_prioritiseXrefs() throws IOException { - List protseqs = new ArrayList(); + List protseqs = new ArrayList<>(); protseqs.add(new Sequence("UNIPROT|V12345", "EIQ")); protseqs.add(new Sequence("UNIPROT|V12346", "EIQ")); AlignmentI protein = new Alignment( protseqs.toArray(new SequenceI[protseqs.size()])); protein.setDataset(null); - List dnaseqs = new ArrayList(); + List dnaseqs = new ArrayList<>(); dnaseqs.add(new Sequence("EMBL|A11111", "GAAATCCAG")); // = EIQ dnaseqs.add(new Sequence("EMBL|A22222", "GAAATTCAG")); // = EIQ AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[dnaseqs @@ -848,8 +849,8 @@ public class AlignmentUtilsTests al.addAnnotation(ann4); // Temp for seq1 al.addAnnotation(ann5); // Temp for seq2 al.addAnnotation(ann6); // Temp for no sequence - List types = new ArrayList(); - List scope = new ArrayList(); + List types = new ArrayList<>(); + List scope = new ArrayList<>(); /* * Set all sequence related Structure to hidden (ann1, ann2) @@ -1747,7 +1748,7 @@ public class AlignmentUtilsTests map = new MapList(new int[] { 9, 11 }, new int[] { 2, 2 }, 3, 1); acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map); - ArrayList acfs = new ArrayList(); + ArrayList acfs = new ArrayList<>(); acfs.add(acf); protein.setCodonFrames(acfs); @@ -2030,9 +2031,9 @@ public class AlignmentUtilsTests sf6.setValue("ID", "var6"); sf6.setValue("clinical_significance", "Good"); - List codon1Variants = new ArrayList(); - List codon2Variants = new ArrayList(); - List codon3Variants = new ArrayList(); + List codon1Variants = new ArrayList<>(); + List codon2Variants = new ArrayList<>(); + List codon3Variants = new ArrayList<>(); List codonVariants[] = new ArrayList[3]; codonVariants[0] = codon1Variants; codonVariants[1] = codon2Variants; @@ -2272,7 +2273,7 @@ public class AlignmentUtilsTests seq1.createDatasetSequence(); Mapping mapping = new Mapping(seq1, new MapList( new int[] { 3, 6, 9, 10 }, new int[] { 1, 6 }, 1, 1)); - Map> map = new TreeMap>(); + Map> map = new TreeMap<>(); AlignmentUtils.addMappedPositions(seq1, from, mapping, map); /* @@ -2304,7 +2305,7 @@ public class AlignmentUtilsTests seq1.createDatasetSequence(); Mapping mapping = new Mapping(seq1, new MapList( new int[] { 3, 6, 9, 10 }, new int[] { 1, 6 }, 1, 1)); - Map> map = new TreeMap>(); + Map> map = new TreeMap<>(); AlignmentUtils.addMappedPositions(seq1, from, mapping, map); /* @@ -2561,7 +2562,7 @@ public class AlignmentUtilsTests * Case 2: CDS 3 times length of peptide + stop codon * (note code does not currently check trailing codon is a stop codon) */ - dna = new Sequence("dna", "AACGacgtCTCCTTGA"); + dna = new Sequence("dna", "AACGacgtCTCCTCCC"); dna.createDatasetSequence(); dna.addSequenceFeature(new SequenceFeature("CDS", "", 1, 4, null)); dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 16, null)); @@ -2574,17 +2575,42 @@ public class AlignmentUtilsTests Arrays.deepToString(ml.getFromRanges().toArray())); /* - * Case 3: CDS not 3 times length of peptide - no mapping is made + * Case 3: CDS longer than 3 * peptide + stop codon - no mapping is made + */ + dna = new Sequence("dna", "AACGacgtCTCCTTGATCA"); + dna.createDatasetSequence(); + dna.addSequenceFeature(new SequenceFeature("CDS", "", 1, 4, null)); + dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 19, null)); + ml = AlignmentUtils.mapCdsToProtein(dna, peptide); + assertNull(ml); + + /* + * Case 4: CDS shorter than 3 * peptide - no mapping is made + */ + dna = new Sequence("dna", "AACGacgtCTCC"); + dna.createDatasetSequence(); + dna.addSequenceFeature(new SequenceFeature("CDS", "", 1, 4, null)); + dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 12, null)); + ml = AlignmentUtils.mapCdsToProtein(dna, peptide); + assertNull(ml); + + /* + * Case 5: CDS 3 times length of peptide + part codon - mapping is truncated */ dna = new Sequence("dna", "AACGacgtCTCCTTG"); dna.createDatasetSequence(); dna.addSequenceFeature(new SequenceFeature("CDS", "", 1, 4, null)); dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 15, null)); ml = AlignmentUtils.mapCdsToProtein(dna, peptide); - assertNull(ml); + assertEquals(3, ml.getFromRatio()); + assertEquals(1, ml.getToRatio()); + assertEquals("[[1, 3]]", + Arrays.deepToString(ml.getToRanges().toArray())); + assertEquals("[[1, 4], [9, 13]]", + Arrays.deepToString(ml.getFromRanges().toArray())); /* - * Case 4: incomplete start codon corresponding to X in peptide + * Case 6: incomplete start codon corresponding to X in peptide */ dna = new Sequence("dna", "ACGacgtCTCCTTGG"); dna.createDatasetSequence(); @@ -2600,4 +2626,151 @@ public class AlignmentUtilsTests Arrays.deepToString(ml.getFromRanges().toArray())); } + /** + * Tests for the method that locates the CDS sequence that has a mapping to + * the given protein. That is, given a transcript-to-peptide mapping, find the + * cds-to-peptide mapping that relates to both, and return the CDS sequence. + */ + @Test + public void testFindCdsForProtein() + { + List mappings = new ArrayList<>(); + AlignedCodonFrame acf1 = new AlignedCodonFrame(); + mappings.add(acf1); + + SequenceI dna1 = new Sequence("dna1", "cgatATcgGCTATCTATGacg"); + dna1.createDatasetSequence(); + + // NB we currently exclude STOP codon from CDS sequences + // the test would need to change if this changes in future + SequenceI cds1 = new Sequence("cds1", "ATGCTATCT"); + cds1.createDatasetSequence(); + + SequenceI pep1 = new Sequence("pep1", "MLS"); + pep1.createDatasetSequence(); + List seqMappings = new ArrayList<>(); + MapList mapList = new MapList( + new int[] + { 5, 6, 9, 15 }, new int[] { 1, 3 }, 3, 1); + Mapping dnaToPeptide = new Mapping(pep1.getDatasetSequence(), mapList); + + // add dna to peptide mapping + seqMappings.add(acf1); + acf1.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), + mapList); + + /* + * first case - no dna-to-CDS mapping exists - search fails + */ + SequenceI seq = AlignmentUtils.findCdsForProtein(mappings, dna1, + seqMappings, dnaToPeptide); + assertNull(seq); + + /* + * second case - CDS-to-peptide mapping exists but no dna-to-CDS + * - search fails + */ + // todo this test fails if the mapping is added to acf1, not acf2 + // need to tidy up use of lists of mappings in AlignedCodonFrame + AlignedCodonFrame acf2 = new AlignedCodonFrame(); + mappings.add(acf2); + MapList cdsToPeptideMapping = new MapList(new int[] + { 1, 9 }, new int[] { 1, 3 }, 3, 1); + acf2.addMap(cds1.getDatasetSequence(), pep1.getDatasetSequence(), + cdsToPeptideMapping); + assertNull(AlignmentUtils.findCdsForProtein(mappings, dna1, seqMappings, + dnaToPeptide)); + + /* + * third case - add dna-to-CDS mapping - CDS is now found! + */ + MapList dnaToCdsMapping = new MapList(new int[] { 5, 6, 9, 15 }, + new int[] + { 1, 9 }, 1, 1); + acf1.addMap(dna1.getDatasetSequence(), cds1.getDatasetSequence(), + dnaToCdsMapping); + seq = AlignmentUtils.findCdsForProtein(mappings, dna1, seqMappings, + dnaToPeptide); + assertSame(seq, cds1.getDatasetSequence()); + } + + /** + * Tests for the method that locates the CDS sequence that has a mapping to + * the given protein. That is, given a transcript-to-peptide mapping, find the + * cds-to-peptide mapping that relates to both, and return the CDS sequence. + * This test is for the case where transcript and CDS are the same length. + */ + @Test + public void testFindCdsForProtein_noUTR() + { + List mappings = new ArrayList<>(); + AlignedCodonFrame acf1 = new AlignedCodonFrame(); + mappings.add(acf1); + + SequenceI dna1 = new Sequence("dna1", "ATGCTATCTTAA"); + dna1.createDatasetSequence(); + + // NB we currently exclude STOP codon from CDS sequences + // the test would need to change if this changes in future + SequenceI cds1 = new Sequence("cds1", "ATGCTATCT"); + cds1.createDatasetSequence(); + + SequenceI pep1 = new Sequence("pep1", "MLS"); + pep1.createDatasetSequence(); + List seqMappings = new ArrayList<>(); + MapList mapList = new MapList( + new int[] + { 1, 9 }, new int[] { 1, 3 }, 3, 1); + Mapping dnaToPeptide = new Mapping(pep1.getDatasetSequence(), mapList); + + // add dna to peptide mapping + seqMappings.add(acf1); + acf1.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), + mapList); + + /* + * first case - transcript lacks CDS features - it appears to be + * the CDS sequence and is returned + */ + SequenceI seq = AlignmentUtils.findCdsForProtein(mappings, dna1, + seqMappings, dnaToPeptide); + assertSame(seq, dna1.getDatasetSequence()); + + /* + * second case - transcript has CDS feature - this means it is + * not returned as a match for CDS (CDS sequences don't have CDS features) + */ + dna1.addSequenceFeature( + new SequenceFeature(SequenceOntologyI.CDS, "cds", 1, 12, null)); + seq = AlignmentUtils.findCdsForProtein(mappings, dna1, seqMappings, + dnaToPeptide); + assertNull(seq); + + /* + * third case - CDS-to-peptide mapping exists but no dna-to-CDS + * - search fails + */ + // todo this test fails if the mapping is added to acf1, not acf2 + // need to tidy up use of lists of mappings in AlignedCodonFrame + AlignedCodonFrame acf2 = new AlignedCodonFrame(); + mappings.add(acf2); + MapList cdsToPeptideMapping = new MapList(new int[] + { 1, 9 }, new int[] { 1, 3 }, 3, 1); + acf2.addMap(cds1.getDatasetSequence(), pep1.getDatasetSequence(), + cdsToPeptideMapping); + assertNull(AlignmentUtils.findCdsForProtein(mappings, dna1, seqMappings, + dnaToPeptide)); + + /* + * fourth case - add dna-to-CDS mapping - CDS is now found! + */ + MapList dnaToCdsMapping = new MapList(new int[] { 1, 9 }, + new int[] + { 1, 9 }, 1, 1); + acf1.addMap(dna1.getDatasetSequence(), cds1.getDatasetSequence(), + dnaToCdsMapping); + seq = AlignmentUtils.findCdsForProtein(mappings, dna1, seqMappings, + dnaToPeptide); + assertSame(seq, cds1.getDatasetSequence()); + } }