From: gmungoc Date: Tue, 14 Apr 2015 19:56:29 +0000 (+0100) Subject: JAL-1693 generate multiple exon sequences for 1-many dna-protein X-Git-Tag: Jalview_2_9~67 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;ds=sidebyside;h=e29e164306d88242fe615973eeebc8c4a49d1bb0;p=jalview.git JAL-1693 generate multiple exon sequences for 1-many dna-protein mappings --- diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index a811d84..7dec4ec 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -44,7 +44,6 @@ import jalview.datamodel.FeatureProperties; import jalview.datamodel.Mapping; import jalview.datamodel.SearchResults; import jalview.datamodel.Sequence; -import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceGroup; import jalview.datamodel.SequenceI; import jalview.schemes.ResidueProperties; @@ -1287,20 +1286,16 @@ public class AlignmentUtils final SequenceI ds = dnaSeq.getDatasetSequence(); List seqMappings = MappingUtils .findMappingsForSequence(ds, mappings); - if (!seqMappings.isEmpty()) + for (AlignedCodonFrame acf : seqMappings) { - /* - * We assume here that only one protein mapping is expected per dna - * sequence. Mapping to multiple protein sequences is conceivable but - * undefined. Splitting a mapping to one protein sequence across - * multiple mappings is possible but pathological. Need closer - * constraints on the contents of AlignedCodonFrame. - */ AlignedCodonFrame newMapping = new AlignedCodonFrame(); - final SequenceI exonSequence = makeExonSequence(ds, - seqMappings.get(0), newMapping); - exonSequences.add(exonSequence); - newMappings.add(newMapping); + final List mappedExons = makeExonSequences(ds, acf, + newMapping); + if (!mappedExons.isEmpty()) + { + exonSequences.addAll(mappedExons); + newMappings.add(newMapping); + } } } AlignmentI al = new Alignment( @@ -1317,71 +1312,86 @@ public class AlignmentUtils } /** - * Helper method to make an exon-only sequence and populate its mapping to - * protein + * Helper method to make exon-only sequences and populate their mappings to + * protein products *

* For example, if ggCCaTTcGAg has mappings [3, 4, 6, 7, 9, 10] to protein * then generate a sequence CCTTGA with mapping [1, 6] to the same protein * residues + *

+ * Typically eukaryotic dna will include exons encoding for a single peptide + * sequence i.e. return a single result. Bacterial dna may have overlapping + * exon mappings coding for multiple peptides so return multiple results + * (example EMBL KF591215). * * @param dnaSeq * a dna dataset sequence * @param mapping - * the current mapping of the sequence to protein + * containing one or more mappings of the sequence to protein * @param newMapping - * the new mapping to populate, from the exon-only sequence + * the new mapping to populate, from the exon-only sequences to their + * mapped protein sequences * @return */ - protected static SequenceI makeExonSequence(SequenceI dnaSeq, - AlignedCodonFrame acf, AlignedCodonFrame newMapping) + protected static List makeExonSequences(SequenceI dnaSeq, + AlignedCodonFrame mapping, AlignedCodonFrame newMapping) { - Mapping mapping = acf.getMappingForSequence(dnaSeq); + List exonSequences = new ArrayList(); + List seqMappings = mapping.getMappingsForSequence(dnaSeq); final char[] dna = dnaSeq.getSequence(); - StringBuilder newSequence = new StringBuilder(dnaSeq.getLength()); - - /* - * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc } - */ - List exonRanges = mapping.getMap().getFromRanges(); - for (int[] range : exonRanges) + for (Mapping seqMapping : seqMappings) { - for (int pos = range[0]; pos <= range[1]; pos++) + StringBuilder newSequence = new StringBuilder(dnaSeq.getLength()); + + /* + * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc } + */ + List exonRanges = seqMapping.getMap().getFromRanges(); + for (int[] range : exonRanges) { - newSequence.append(dna[pos - 1]); + for (int pos = range[0]; pos <= range[1]; pos++) + { + newSequence.append(dna[pos - 1]); + } } - } - SequenceI exon = new Sequence(dnaSeq.getName(), newSequence.toString()); + SequenceI exon = new Sequence(dnaSeq.getName(), + newSequence.toString()); - /* - * Locate any xrefs to CDS database on the protein product and attach to the - * CDS sequence. Also add as a sub-token of the sequence name. - */ - // default to "CDS" if we can't locate an actual gene id - String cdsAccId = FeatureProperties.getCodingFeature(DBRefSource.EMBL); - DBRefEntry[] cdsRefs = DBRefUtils.selectRefs( - mapping.getTo().getDBRef(), DBRefSource.CODINGDBS); - if (cdsRefs != null) - { - for (DBRefEntry cdsRef : cdsRefs) + /* + * Locate any xrefs to CDS database on the protein product and attach to + * the CDS sequence. Also add as a sub-token of the sequence name. + */ + // default to "CDS" if we can't locate an actual gene id + String cdsAccId = FeatureProperties + .getCodingFeature(DBRefSource.EMBL); + DBRefEntry[] cdsRefs = DBRefUtils.selectRefs(seqMapping.getTo() + .getDBRef(), DBRefSource.CODINGDBS); + if (cdsRefs != null) { - exon.addDBRef(new DBRefEntry(cdsRef)); - cdsAccId = cdsRef.getAccessionId(); + for (DBRefEntry cdsRef : cdsRefs) + { + exon.addDBRef(new DBRefEntry(cdsRef)); + cdsAccId = cdsRef.getAccessionId(); + } } - } - exon.setName(exon.getName() + "|" + cdsAccId); - exon.createDatasetSequence(); + exon.setName(exon.getName() + "|" + cdsAccId); + exon.createDatasetSequence(); - /* - * Build new mappings - from the same protein regions, but now to contiguous - * exons - */ - List exonRange = new ArrayList(); - exonRange.add(new int[] - { 1, newSequence.length() }); - MapList map = new MapList(exonRange, mapping.getMap().getToRanges(), 3, 1); - newMapping.addMap(exon.getDatasetSequence(), mapping.getTo(), map); + /* + * Build new mappings - from the same protein regions, but now to + * contiguous exons + */ + List exonRange = new ArrayList(); + exonRange.add(new int[] + { 1, newSequence.length() }); + MapList map = new MapList(exonRange, seqMapping.getMap() + .getToRanges(), + 3, 1); + newMapping.addMap(exon.getDatasetSequence(), seqMapping.getTo(), map); - return exon; + exonSequences.add(exon); + } + return exonSequences; } } diff --git a/src/jalview/datamodel/AlignedCodonFrame.java b/src/jalview/datamodel/AlignedCodonFrame.java index 1f5d827..d0b2731 100644 --- a/src/jalview/datamodel/AlignedCodonFrame.java +++ b/src/jalview/datamodel/AlignedCodonFrame.java @@ -20,6 +20,9 @@ */ package jalview.datamodel; +import java.util.ArrayList; +import java.util.List; + import jalview.util.MapList; import jalview.util.MappingUtils; @@ -423,4 +426,37 @@ public class AlignedCodonFrame { dnaSeq[codonPos[0] - 1], dnaSeq[codonPos[1] - 1], dnaSeq[codonPos[2] - 1] }; } + + /** + * Returns any mappings found which are to (or from) the given sequence, and + * to distinct sequences. + * + * @param seq + * @return + */ + public List getMappingsForSequence(SequenceI seq) + { + List result = new ArrayList(); + if (dnaSeqs == null) + { + return result; + } + List related = new ArrayList(); + SequenceI seqDs = seq.getDatasetSequence(); + seqDs = seqDs != null ? seqDs : seq; + + for (int ds = 0; ds < dnaSeqs.length; ds++) + { + final Mapping mapping = dnaToProt[ds]; + if (dnaSeqs[ds] == seqDs || mapping.to == seqDs) + { + if (!related.contains(mapping.to)) + { + result.add(mapping); + related.add(mapping.to); + } + } + } + return result; + } } diff --git a/test/jalview/analysis/AlignmentUtilsTests.java b/test/jalview/analysis/AlignmentUtilsTests.java index 98d77d4..c19884e 100644 --- a/test/jalview/analysis/AlignmentUtilsTests.java +++ b/test/jalview/analysis/AlignmentUtilsTests.java @@ -30,6 +30,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -974,7 +975,7 @@ public class AlignmentUtilsTests * x-ref to the EMBLCDS record. */ @Test - public void testMakeExonSequence() + public void testMakeExonSequences() { SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa"); SequenceI pep1 = new Sequence("pep1", "GF"); @@ -996,7 +997,10 @@ public class AlignmentUtilsTests mappings.add(acf); AlignedCodonFrame newMapping = new AlignedCodonFrame(); - SequenceI exon = AlignmentUtils.makeExonSequence(dna1, acf, newMapping); + List exons = AlignmentUtils.makeExonSequences(dna1, acf, + newMapping); + assertEquals(1, exons.size()); + SequenceI exon = exons.get(0); assertEquals("GGGTTT", exon.getSequenceAsString()); assertEquals("dna1|A12345", exon.getName()); @@ -1005,6 +1009,95 @@ public class AlignmentUtilsTests assertEquals("EMBLCDS", cdsRef.getSource()); assertEquals("2", cdsRef.getVersion()); assertEquals("A12345", cdsRef.getAccessionId()); + } + + /** + * Test the method that makes an exon-only alignment from a DNA sequence and + * its product mappings, for the case where there are multiple exon mappings + * to different protein products. + */ + @Test + public void testMakeExonAlignment_multipleProteins() + { + SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa"); + SequenceI pep1 = new Sequence("pep1", "GF"); // GGGTTT + SequenceI pep2 = new Sequence("pep2", "KP"); // aaaccc + SequenceI pep3 = new Sequence("pep3", "KF"); // aaaTTT + dna1.createDatasetSequence(); + pep1.createDatasetSequence(); + pep2.createDatasetSequence(); + pep3.createDatasetSequence(); + pep1.getDatasetSequence().addDBRef( + new DBRefEntry("EMBLCDS", "2", "A12345")); + pep2.getDatasetSequence().addDBRef( + new DBRefEntry("EMBLCDS", "3", "A12346")); + pep3.getDatasetSequence().addDBRef( + new DBRefEntry("EMBLCDS", "4", "A12347")); + + /* + * Make the mappings from dna to protein. Using LinkedHashset is a + * convenience so results are in the input order. There is no assertion that + * the generated exon sequences are in any particular order. + */ + Set mappings = new LinkedHashSet(); + // map ...GGG...TTT to GF + MapList map = new MapList(new int[] + { 4, 6, 10, 12 }, new int[] + { 1, 2 }, 3, 1); + AlignedCodonFrame acf = new AlignedCodonFrame(); + acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map); + mappings.add(acf); + + // map aaa...ccc to KP + map = new MapList(new int[] + { 1, 3, 7, 9 }, new int[] + { 1, 2 }, 3, 1); + acf = new AlignedCodonFrame(); + acf.addMap(dna1.getDatasetSequence(), pep2.getDatasetSequence(), map); + mappings.add(acf); + + // map aaa......TTT to KF + map = new MapList(new int[] + { 1, 3, 10, 12 }, new int[] + { 1, 2 }, 3, 1); + acf = new AlignedCodonFrame(); + acf.addMap(dna1.getDatasetSequence(), pep3.getDatasetSequence(), map); + mappings.add(acf); + + AlignmentI exal = AlignmentUtils.makeExonAlignment(new SequenceI[] + { dna1 }, mappings); + + /* + * Verify we have 3 exon sequences, mapped to pep1/2/3 respectively + */ + List exons = exal.getSequences(); + assertEquals(3, exons.size()); + + SequenceI exon = exons.get(0); + assertEquals("GGGTTT", exon.getSequenceAsString()); + assertEquals("dna1|A12345", exon.getName()); + assertEquals(1, exon.getDBRef().length); + DBRefEntry cdsRef = exon.getDBRef()[0]; + assertEquals("EMBLCDS", cdsRef.getSource()); + assertEquals("2", cdsRef.getVersion()); + assertEquals("A12345", cdsRef.getAccessionId()); + + exon = exons.get(1); + assertEquals("aaaccc", exon.getSequenceAsString()); + assertEquals("dna1|A12346", exon.getName()); + assertEquals(1, exon.getDBRef().length); + cdsRef = exon.getDBRef()[0]; + assertEquals("EMBLCDS", cdsRef.getSource()); + assertEquals("3", cdsRef.getVersion()); + assertEquals("A12346", cdsRef.getAccessionId()); + exon = exons.get(2); + assertEquals("aaaTTT", exon.getSequenceAsString()); + assertEquals("dna1|A12347", exon.getName()); + assertEquals(1, exon.getDBRef().length); + cdsRef = exon.getDBRef()[0]; + assertEquals("EMBLCDS", cdsRef.getSource()); + assertEquals("4", cdsRef.getVersion()); + assertEquals("A12347", cdsRef.getAccessionId()); } }