X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=test%2Fjalview%2Fanalysis%2FAlignmentUtilsTests.java;fp=test%2Fjalview%2Fanalysis%2FAlignmentUtilsTests.java;h=d1300fe2976a204e5e10fac1ee7104303b691165;hb=e07fc11d5aaf4d1237d83e5f7ea0909dea09902e;hp=18b4252ef784adb8146cdba9fa6f973f568850bc;hpb=913c7db3eaf97959bcd34a7e96b64632b622edc8;p=jalview.git diff --git a/test/jalview/analysis/AlignmentUtilsTests.java b/test/jalview/analysis/AlignmentUtilsTests.java index 18b4252..d1300fe 100644 --- a/test/jalview/analysis/AlignmentUtilsTests.java +++ b/test/jalview/analysis/AlignmentUtilsTests.java @@ -20,19 +20,63 @@ */ package jalview.analysis; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; - -import org.junit.Test; - +import jalview.analysis.AlignmentUtils.MappingResult; +import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; +import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceI; import jalview.io.AppletFormatAdapter; +import jalview.io.FormatAdapter; +import jalview.util.MapList; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +import org.junit.Test; public class AlignmentUtilsTests { + // @formatter:off + private static final String TEST_DATA = + "# STOCKHOLM 1.0\n" + + "#=GS D.melanogaster.1 AC AY119185.1/838-902\n" + + "#=GS D.melanogaster.2 AC AC092237.1/57223-57161\n" + + "#=GS D.melanogaster.3 AC AY060611.1/560-627\n" + + "D.melanogaster.1 G.AGCC.CU...AUGAUCGA\n" + + "#=GR D.melanogaster.1 SS ................((((\n" + + "D.melanogaster.2 C.AUUCAACU.UAUGAGGAU\n" + + "#=GR D.melanogaster.2 SS ................((((\n" + + "D.melanogaster.3 G.UGGCGCU..UAUGACGCA\n" + + "#=GR D.melanogaster.3 SS (.(((...(....(((((((\n" + + "//"; + + private static final String AA_SEQS_1 = + ">Seq1Name\n" + + "K-QY--L\n" + + ">Seq2Name\n" + + "-R-FP-W-\n"; + + private 
static final String CDNA_SEQS_1 = + ">Seq1Name\n" + + "AC-GG--CUC-CAA-CT\n" + + ">Seq2Name\n" + + "-CG-TTA--ACG---AAGT\n"; + + private static final String CDNA_SEQS_2 = + ">Seq1Name\n" + + "GCTCGUCGTACT\n" + + ">Seq2Name\n" + + "GGGTCAGGCAGT\n"; + // @formatter:on + public static Sequence ts=new Sequence("short","ASDASDASDASDASDASDASDASDASDASDASDASDASD"); + @Test public void testExpandFlanks() { @@ -55,6 +99,386 @@ public class AlignmentUtilsTests assertTrue("Flanking sequence not the same as original dataset sequence.\n"+ung+"\n"+sq.getDatasetSequence().getSequenceAsString(),ung.equalsIgnoreCase(sq.getDatasetSequence().getSequenceAsString())); } } + } } + + /** + * Test the method that returns a map of lists of sequences keyed by sequence name; sequences sharing a name are expected in one list, in alignment order. + * + * @throws IOException + */ + @Test + public void testGetSequencesByName() throws IOException + { + final String data = ">Seq1Name\nKQYL\n" + ">Seq2Name\nRFPW\n" + + ">Seq1Name\nABCD\n"; + AlignmentI al = loadAlignment(data, "FASTA"); + Map<String, List<SequenceI>> map = AlignmentUtils + .getSequencesByName(al); + assertEquals(2, map.keySet().size()); + assertEquals(2, map.get("Seq1Name").size()); + assertEquals("KQYL", map.get("Seq1Name").get(0).getSequenceAsString()); + assertEquals("ABCD", map.get("Seq1Name").get(1).getSequenceAsString()); + assertEquals(1, map.get("Seq2Name").size()); + assertEquals("RFPW", map.get("Seq2Name").get(0).getSequenceAsString()); + } + /** + * Helper method to load an alignment and ensure dataset sequences are set up. + * + * @param data the alignment text, read by FormatAdapter as pasted data (AppletFormatAdapter.PASTE) + * @param format the file format name handed to FormatAdapter.readFile, e.g. "FASTA" + * @return the alignment that was read, after setDataset(null) has been called on it + * @throws IOException + */ + protected AlignmentI loadAlignment(final String data, String format) throws IOException + { + Alignment a = new FormatAdapter().readFile(data, + AppletFormatAdapter.PASTE, format); + a.setDataset(null); + return a; + } + /** + * Test mapping of protein to cDNA. 
+ * + * @throws IOException + */ + @Test + public void testMapProteinToCdna() throws IOException + { + // protein: Human, Mouse and Worm, 3 residues each (no cDNA is named Worm) + AlignmentI protein = loadAlignment( + ">Human\nKQY\n>Mouse\nAFP\n>Worm\nRST\n", + "FASTA"); + // cDNA: Mouse, Human, Mouse with 9 bases each, then two sequences that should not be mapped + // @formatter:off + String dnaData = + ">Mouse\nGAAATCCAG\n" + + ">Human\nTTCGATTAC\n" + + ">Mouse\nGTCGTTTGC\n" + + ">Mouse\nGTCGTTTGCgac\n" + // not mapped - wrong length + ">Fly\nGTCGTTTGC\n"; // not mapped - no name match + // @formatter:on + AlignmentI cdna1 = loadAlignment( + dnaData, + "FASTA"); + MappingResult mapped = AlignmentUtils.mapProteinToCdna(protein, cdna1); + assertEquals(MappingResult.Mapped, mapped); + + /* + * Check two mappings (one for Mouse, one for Human) + */ + assertEquals(2, protein.getCodonFrames().size()); + assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size()); + assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size()); + + /* + * Inspect mapping for Human protein + */ + AlignedCodonFrame humanMapping = protein.getCodonFrame( + protein.getSequenceAt(0)).get(0); + assertEquals(1, humanMapping.getdnaSeqs().length); + assertEquals(cdna1.getSequenceAt(1).getDatasetSequence(), + humanMapping.getdnaSeqs()[0]); + Mapping[] protMappings = humanMapping.getProtMappings(); + assertEquals(1, protMappings.length); + MapList mapList = protMappings[0].getMap(); + assertEquals(3, mapList.getFromRatio()); + assertEquals(1, mapList.getToRatio()); + assertTrue(Arrays.equals(new int[] + { 1, 9 }, mapList.getFromRanges())); + assertTrue(Arrays.equals(new int[] + { 1, 3 }, mapList.getToRanges())); + + /* + * Inspect mappings for Mouse protein + */ + AlignedCodonFrame mouseMapping1 = protein.getCodonFrame( + protein.getSequenceAt(1)).get(0); + assertEquals(2, mouseMapping1.getdnaSeqs().length); + assertEquals(cdna1.getSequenceAt(0).getDatasetSequence(), + mouseMapping1.getdnaSeqs()[0]); + assertEquals(cdna1.getSequenceAt(2).getDatasetSequence(), + 
mouseMapping1.getdnaSeqs()[1]); + protMappings = mouseMapping1.getProtMappings(); + assertEquals(2, protMappings.length); + for (int i = 0; i < 2; i++) + { + mapList = protMappings[i].getMap(); + assertEquals(3, mapList.getFromRatio()); + assertEquals(1, mapList.getToRatio()); + assertTrue(Arrays.equals(new int[] + { 1, 9 }, mapList.getFromRanges())); + assertTrue(Arrays.equals(new int[] + { 1, 3 }, mapList.getToRanges())); + } + } + + /** + * Test mapping of protein to cDNA which may include start and/or stop codons. + * + * @throws IOException + */ + @Test + public void testMapProteinToCdna_stopStartCodons() throws IOException + { + // protein: Human, Mouse and Worm, 3 residues each (no cDNA is named Worm) + AlignmentI protein = loadAlignment( + ">Human\nKQY\n>Mouse\nAFP\n>Worm\nRST\n", "FASTA"); + // @formatter:off + String dnaData = + ">Mouse\natgGAAATCCAG\n" + // Mouse with start codon + ">Human\nTTCGATtactaa\n" + // Human with stop codon TAA + ">Mouse\nGTCGTTTGctaG\n" + // Mouse with stop codon TAG + ">Human\nGTCGTTTgctGa\n" + // Human with stop codon TGA + ">Mouse\nATGGTCGTTTGCtag\n"; // Mouse with start and stop codons + // @formatter:on + AlignmentI cdna1 = loadAlignment( + dnaData, + "FASTA"); + MappingResult mapped = AlignmentUtils.mapProteinToCdna(protein, cdna1); + assertEquals(MappingResult.Mapped, mapped); + + /* + * Check two mappings (one for Mouse, one for Human) + */ + assertEquals(2, protein.getCodonFrames().size()); + assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size()); + assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size()); + + /* + * Inspect mapping for Human protein - should map to 2nd and 4th cDNA seqs + */ + AlignedCodonFrame humanMapping = protein.getCodonFrame( + protein.getSequenceAt(0)).get(0); + assertEquals(2, humanMapping.getdnaSeqs().length); + assertEquals(cdna1.getSequenceAt(1).getDatasetSequence(), + humanMapping.getdnaSeqs()[0]); + assertEquals(cdna1.getSequenceAt(3).getDatasetSequence(), + humanMapping.getdnaSeqs()[1]); + 
Mapping[] protMappings = humanMapping.getProtMappings(); + // two mappings, both to cDNA with stop codon + assertEquals(2, protMappings.length); + MapList mapList = protMappings[0].getMap(); + assertEquals(3, mapList.getFromRatio()); + assertEquals(1, mapList.getToRatio()); + assertTrue(Arrays.equals(new int[] + { 1, 9 }, mapList.getFromRanges())); + assertTrue(Arrays.equals(new int[] + { 1, 3 }, mapList.getToRanges())); + mapList = protMappings[1].getMap(); + assertEquals(3, mapList.getFromRatio()); + assertEquals(1, mapList.getToRatio()); + assertTrue(Arrays.equals(new int[] + { 1, 9 }, mapList.getFromRanges())); + assertTrue(Arrays.equals(new int[] + { 1, 3 }, mapList.getToRanges())); + + /* + * Inspect mapping for Mouse protein - should map to 1st/3rd/5th cDNA seqs + */ + AlignedCodonFrame mouseMapping = protein.getCodonFrame( + protein.getSequenceAt(1)).get(0); + assertEquals(3, mouseMapping.getdnaSeqs().length); + assertEquals(cdna1.getSequenceAt(0).getDatasetSequence(), + mouseMapping.getdnaSeqs()[0]); + assertEquals(cdna1.getSequenceAt(2).getDatasetSequence(), + mouseMapping.getdnaSeqs()[1]); + assertEquals(cdna1.getSequenceAt(4).getDatasetSequence(), + mouseMapping.getdnaSeqs()[2]); + + // three mappings + protMappings = mouseMapping.getProtMappings(); + assertEquals(3, protMappings.length); + + // first mapping to cDNA with start codon + mapList = protMappings[0].getMap(); + assertEquals(3, mapList.getFromRatio()); + assertEquals(1, mapList.getToRatio()); + assertTrue(Arrays.equals(new int[] + { 4, 12 }, mapList.getFromRanges())); + assertTrue(Arrays.equals(new int[] + { 1, 3 }, mapList.getToRanges())); + + // second mapping to cDNA with stop codon + mapList = protMappings[1].getMap(); + assertEquals(3, mapList.getFromRatio()); + assertEquals(1, mapList.getToRatio()); + assertTrue(Arrays.equals(new int[] + { 1, 9 }, mapList.getFromRanges())); + assertTrue(Arrays.equals(new int[] + { 1, 3 }, mapList.getToRanges())); + + // third mapping to cDNA with start 
and stop codon + mapList = protMappings[2].getMap(); + assertEquals(3, mapList.getFromRatio()); + assertEquals(1, mapList.getToRatio()); + assertTrue(Arrays.equals(new int[] + { 4, 12 }, mapList.getFromRanges())); + assertTrue(Arrays.equals(new int[] + { 1, 3 }, mapList.getToRanges())); + } + + /** + * Test for the alignSequenceAs method that takes two sequences and a mapping, here with all six dna bases mapped (no introns). + */ + @Test + public void testAlignSequenceAs_withMapping_noIntrons() + { + MapList map = new MapList(new int[] + { 1, 6 }, new int[] + { 1, 2 }, 3, 1); + + /* + * No existing gaps in dna: + */ + checkAlignSequenceAs("GGGAAA", "-A-L-", false, false, map, + "---GGG---AAA"); + + /* + * Now introduce gaps in dna but ignore them when realigning. + */ + checkAlignSequenceAs("-G-G-G-A-A-A-", "-A-L-", false, false, map, + "---GGG---AAA"); + + /* + * Now include gaps in dna when realigning. First retaining 'mapped' gaps + * only, i.e. those within the exon region. + */ + checkAlignSequenceAs("-G-G--G-A--A-A-", "-A-L-", true, false, map, + "---G-G--G---A--A-A"); + + /* + * Include all gaps in dna when realigning (within and without the exon + * region). The leading gap, and the gaps between codons, are subsumed by + * the protein alignment gap. + */ + checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", true, true, map, + "---G-GG---AA-A-"); + + /* + * Include only unmapped gaps in dna when realigning (outside the exon + * region). The leading gap, and the gaps between codons, are subsumed by + * the protein alignment gap. + */ + checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", false, true, map, + "---GGG---AAA-"); + } + + /** + * Test for the alignSequenceAs method that takes two sequences and a mapping, here with unmapped dna (introns) around the mapped exons. 
+ */ + @Test + public void testAlignSequenceAs_withMapping_withIntrons() + { + /* + * Exons at codon 2 (AAA) and 4 (TTT), i.e. dna positions 4-6 and 10-12 map to protein positions 1-2 + */ + MapList map = new MapList(new int[] + { 4, 6, 10, 12 }, new int[] + { 1, 2 }, 3, 1); + + /* + * Simple case: no gaps in dna + */ + checkAlignSequenceAs("GGGAAACCCTTTGGG", "--A-L-", false, false, map, + "GGG---AAACCCTTTGGG"); + + /* + * Add gaps to dna - but ignore when realigning. + */ + checkAlignSequenceAs("-G-G-G--A--A---AC-CC-T-TT-GG-G-", "--A-L-", + false, false, map, "GGG---AAACCCTTTGGG"); + + /* + * Add gaps to dna - include within exons only when realigning. + */ + checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-", + true, false, map, "GGG---A--A---ACCCT-TTGGG"); + + /* + * Include gaps outside exons only when realigning. + */ + checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-", + false, true, map, "-G-G-GAAAC-CCTTT-GG-G-"); + + /* + * Include gaps following first intron if we are 'preserving mapped gaps' (NOTE(review): arguments and expected result are identical to the 'all gaps' check that follows) + */ + checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-", + true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-"); + + /* + * Include all gaps in dna when realigning. + */ + checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-", + true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-"); + } + + /** + * Test for the case where not all of the protein sequence is mapped to cDNA. + */ + @Test + public void testAlignSequenceAs_withMapping_withUnmappedProtein() + { + + /* + * Exons at codon 2 (AAA) and 4 (TTT) mapped to A and P + */ + final MapList map = new MapList(new int[] + { 4, 6, 10, 12 }, new int[] + { 1, 1, 3, 3 }, 3, 1); + + + /* + * Expect alignment does nothing (aborts realignment). Change this test + * first if different behaviour wanted. + */ + checkAlignSequenceAs("GGGAAACCCTTTGGG", "-A-L-P-", false, + false, map, "GGGAAACCCTTTGGG"); + } + + /** + * Helper method that builds dna and protein sequences (each with a dataset sequence) linked by the given mapping, calls AlignmentUtils.alignSequenceAs on them and verifies the resulting dna sequence string. 
+ * + * @param dnaSeq the dna sequence string (may include gaps) that is to be realigned + * @param proteinSeq the protein sequence string (may include gaps) that the dna is aligned to + * @param preserveMappedGaps passed through to alignSequenceAs; the tests set it to keep gaps within mapped (exon) regions of the dna + * @param preserveUnmappedGaps passed through to alignSequenceAs; the tests set it to keep gaps in unmapped regions of the dna + * @param map the mapping added between the dna and protein dataset sequences + * @param expected the dna sequence string expected after alignSequenceAs has run + */ + protected void checkAlignSequenceAs(final String dnaSeq, + final String proteinSeq, final boolean preserveMappedGaps, + final boolean preserveUnmappedGaps, MapList map, + final String expected) + { + SequenceI dna = new Sequence("Seq1", dnaSeq); + dna.createDatasetSequence(); + SequenceI protein = new Sequence("Seq1", proteinSeq); + protein.createDatasetSequence(); + AlignedCodonFrame acf = new AlignedCodonFrame(); + acf.addMap(dna.getDatasetSequence(), protein.getDatasetSequence(), map); + + AlignmentUtils.alignSequenceAs(dna, protein, acf, "---", '-', + preserveMappedGaps, preserveUnmappedGaps); + assertEquals(expected, dna.getSequenceAsString()); + } + + /** + * Test for the alignSequenceAs method where we preserve gaps in introns only. + */ + @Test + public void testAlignSequenceAs_keepIntronGapsOnly() + { + + /* + * Intron GGGAAA followed by exon CCCTTT + */ + MapList map = new MapList(new int[] + { 7, 12 }, new int[] + { 1, 2 }, 3, 1); + + checkAlignSequenceAs("GG-G-AA-A-C-CC-T-TT", "AL", + false, true, map, "GG-G-AA-ACCCTTT"); } }