/* * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.2) * Copyright (C) 2014 The Jalview Authors * * This file is part of Jalview. * * Jalview is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * Jalview is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty * of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Jalview. If not, see . * The Jalview Authors are detailed in the 'AUTHORS' file. */ package jalview.analysis; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import jalview.analysis.AlignmentUtils.MappingResult; import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceI; import jalview.io.AppletFormatAdapter; import jalview.io.FormatAdapter; import jalview.util.MapList; import java.io.IOException; import java.util.Arrays; import java.util.List; import java.util.Map; import org.junit.Test; public class AlignmentUtilsTests { // @formatter:off private static final String TEST_DATA = "# STOCKHOLM 1.0\n" + "#=GS D.melanogaster.1 AC AY119185.1/838-902\n" + "#=GS D.melanogaster.2 AC AC092237.1/57223-57161\n" + "#=GS D.melanogaster.3 AC AY060611.1/560-627\n" + "D.melanogaster.1 G.AGCC.CU...AUGAUCGA\n" + "#=GR D.melanogaster.1 SS ................((((\n" + "D.melanogaster.2 C.AUUCAACU.UAUGAGGAU\n" + "#=GR D.melanogaster.2 SS ................((((\n" + "D.melanogaster.3 G.UGGCGCU..UAUGACGCA\n" + "#=GR D.melanogaster.3 SS (.(((...(....(((((((\n" + "//"; private static final String AA_SEQS_1 = ">Seq1Name\n" + "K-QY--L\n" + ">Seq2Name\n" + "-R-FP-W-\n"; private static final String CDNA_SEQS_1 = ">Seq1Name\n" + "AC-GG--CUC-CAA-CT\n" + ">Seq2Name\n" + "-CG-TTA--ACG---AAGT\n"; private static final String CDNA_SEQS_2 = ">Seq1Name\n" + "GCTCGUCGTACT\n" + ">Seq2Name\n" + "GGGTCAGGCAGT\n"; // @formatter:on public static Sequence ts=new Sequence("short","ASDASDASDASDASDASDASDASDASDASDASDASDASD"); @Test public void testExpandFlanks() { AlignmentI al = new Alignment(new Sequence[] {}); for (int i=4;i<14;i+=3) { SequenceI s1=ts.deriveSequence().getSubSequence(i, i+7); al.addSequence(s1); } System.out.println(new AppletFormatAdapter().formatSequences("Clustal", al, true)); for (int flnk=-1;flnk<25; flnk++) { AlignmentI exp; System.out.println("\nFlank size: "+flnk); System.out.println(new AppletFormatAdapter().formatSequences("Clustal", exp=AlignmentUtils.expandContext(al, flnk), true)); if (flnk==-1) { for (SequenceI sq:exp.getSequences()) { String ung = sq.getSequenceAsString().replaceAll("-+", ""); assertTrue("Flanking sequence not the same as original dataset sequence.\n"+ung+"\n"+sq.getDatasetSequence().getSequenceAsString(),ung.equalsIgnoreCase(sq.getDatasetSequence().getSequenceAsString())); } } } } /** * Test method that returns a map of lists of sequences by sequence name. * * @throws IOException */ @Test public void testGetSequencesByName() throws IOException { final String data = ">Seq1Name\nKQYL\n" + ">Seq2Name\nRFPW\n" + ">Seq1Name\nABCD\n"; AlignmentI al = loadAlignment(data, "FASTA"); Map> map = AlignmentUtils .getSequencesByName(al); assertEquals(2, map.keySet().size()); assertEquals(2, map.get("Seq1Name").size()); assertEquals("KQYL", map.get("Seq1Name").get(0).getSequenceAsString()); assertEquals("ABCD", map.get("Seq1Name").get(1).getSequenceAsString()); assertEquals(1, map.get("Seq2Name").size()); assertEquals("RFPW", map.get("Seq2Name").get(0).getSequenceAsString()); } /** * Helper method to load an alignment and ensure dataset sequences are set up. * * @param data * @param format TODO * @return * @throws IOException */ protected AlignmentI loadAlignment(final String data, String format) throws IOException { Alignment a = new FormatAdapter().readFile(data, AppletFormatAdapter.PASTE, format); a.setDataset(null); return a; } /** * Test mapping of protein to cDNA. * * @throws IOException */ @Test public void testMapProteinToCdna() throws IOException { // protein: Human + Mouse, 3 residues AlignmentI protein = loadAlignment( ">Human\nKQY\n>Mouse\nAFP\n>Worm\nRST\n", "FASTA"); // cDNA: Mouse, Human, Mouse, 9 bases // @formatter:off String dnaData = ">Mouse\nGAAATCCAG\n" + ">Human\nTTCGATTAC\n" + ">Mouse\nGTCGTTTGC\n" + ">Mouse\nGTCGTTTGCgac\n" + // not mapped - wrong length ">Fly\nGTCGTTTGC\n"; // not mapped - no name match // @formatter:on AlignmentI cdna1 = loadAlignment( dnaData, "FASTA"); MappingResult mapped = AlignmentUtils.mapProteinToCdna(protein, cdna1); assertEquals(mapped, MappingResult.Mapped); /* * Check two mappings (one for Mouse, one for Human) */ assertEquals(2, protein.getCodonFrames().length); assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).length); assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).length); /* * Inspect mapping for Human protein */ AlignedCodonFrame humanMapping = protein.getCodonFrame(protein .getSequenceAt(0))[0]; assertEquals(1, humanMapping.getdnaSeqs().length); assertEquals(cdna1.getSequenceAt(1).getDatasetSequence(), humanMapping.getdnaSeqs()[0]); Mapping[] protMappings = humanMapping.getProtMappings(); assertEquals(1, protMappings.length); MapList mapList = protMappings[0].getMap(); assertEquals(3, mapList.getFromRatio()); assertEquals(1, mapList.getToRatio()); assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges())); assertTrue(Arrays.equals(new int[] { 1, 3 }, mapList.getToRanges())); /* * Inspect mappings for Mouse protein */ AlignedCodonFrame mouseMapping1 = protein.getCodonFrame(protein .getSequenceAt(1))[0]; assertEquals(2, mouseMapping1.getdnaSeqs().length); assertEquals(cdna1.getSequenceAt(0).getDatasetSequence(), mouseMapping1.getdnaSeqs()[0]); assertEquals(cdna1.getSequenceAt(2).getDatasetSequence(), mouseMapping1.getdnaSeqs()[1]); protMappings = mouseMapping1.getProtMappings(); assertEquals(2, protMappings.length); for (int i = 0; i < 2; i++) { mapList = protMappings[i].getMap(); assertEquals(3, mapList.getFromRatio()); assertEquals(1, mapList.getToRatio()); assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges())); assertTrue(Arrays.equals(new int[] { 1, 3 }, mapList.getToRanges())); } } /** * Test mapping of protein to cDNA which may include start and/or stop codons. * * @throws IOException */ @Test public void testMapProteinToCdna_stopStartCodons() throws IOException { // protein: Human + Mouse, 3 residues AlignmentI protein = loadAlignment( ">Human\nKQY\n>Mouse\nAFP\n>Worm\nRST\n", "FASTA"); // @formatter:off String dnaData = ">Mouse\natgGAAATCCAG\n" + // Mouse with start codon ">Human\nTTCGATtactaa\n" + // Human with stop codon TAA ">Mouse\nGTCGTTTGctaG\n" + // Mouse with stop codon TAG ">Human\nGTCGTTTgctGa\n" + // Human with stop codon TGA ">Mouse\nATGGTCGTTTGCtag\n"; // Mouse with start and stop codons // @formatter:on AlignmentI cdna1 = loadAlignment( dnaData, "FASTA"); MappingResult mapped = AlignmentUtils.mapProteinToCdna(protein, cdna1); assertEquals(mapped, MappingResult.Mapped); /* * Check two mappings (one for Mouse, one for Human) */ assertEquals(2, protein.getCodonFrames().length); assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).length); assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).length); /* * Inspect mapping for Human protein - should map to 2nd and 4th cDNA seqs */ AlignedCodonFrame humanMapping = protein.getCodonFrame(protein .getSequenceAt(0))[0]; assertEquals(2, humanMapping.getdnaSeqs().length); assertEquals(cdna1.getSequenceAt(1).getDatasetSequence(), humanMapping.getdnaSeqs()[0]); assertEquals(cdna1.getSequenceAt(3).getDatasetSequence(), humanMapping.getdnaSeqs()[1]); Mapping[] protMappings = humanMapping.getProtMappings(); // two mappings, both to cDNA with stop codon assertEquals(2, protMappings.length); MapList mapList = protMappings[0].getMap(); assertEquals(3, mapList.getFromRatio()); assertEquals(1, mapList.getToRatio()); assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges())); assertTrue(Arrays.equals(new int[] { 1, 3 }, mapList.getToRanges())); mapList = protMappings[1].getMap(); assertEquals(3, mapList.getFromRatio()); assertEquals(1, mapList.getToRatio()); assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges())); assertTrue(Arrays.equals(new int[] { 1, 3 }, mapList.getToRanges())); /* * Inspect mapping for Mouse protein - should map to 1st/3rd/5th cDNA seqs */ AlignedCodonFrame mouseMapping = protein.getCodonFrame(protein .getSequenceAt(1))[0]; assertEquals(3, mouseMapping.getdnaSeqs().length); assertEquals(cdna1.getSequenceAt(0).getDatasetSequence(), mouseMapping.getdnaSeqs()[0]); assertEquals(cdna1.getSequenceAt(2).getDatasetSequence(), mouseMapping.getdnaSeqs()[1]); assertEquals(cdna1.getSequenceAt(4).getDatasetSequence(), mouseMapping.getdnaSeqs()[2]); // three mappings protMappings = mouseMapping.getProtMappings(); assertEquals(3, protMappings.length); // first mapping to cDNA with start codon mapList = protMappings[0].getMap(); assertEquals(3, mapList.getFromRatio()); assertEquals(1, mapList.getToRatio()); assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges())); assertTrue(Arrays.equals(new int[] { 1, 3 }, mapList.getToRanges())); // second mapping to cDNA with stop codon mapList = protMappings[1].getMap(); assertEquals(3, mapList.getFromRatio()); assertEquals(1, mapList.getToRatio()); assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges())); assertTrue(Arrays.equals(new int[] { 1, 3 }, mapList.getToRanges())); // third mapping to cDNA with start and stop codon mapList = protMappings[2].getMap(); assertEquals(3, mapList.getFromRatio()); assertEquals(1, mapList.getToRatio()); assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges())); assertTrue(Arrays.equals(new int[] { 1, 3 }, mapList.getToRanges())); } /** * Test for the alignSequenceAs method that takes two sequences and a mapping. */ @Test public void testAlignSequenceAs_withMapping_noIntrons() { /* * Simple case: no gaps in dna */ SequenceI dna = new Sequence("Seq1", "GGGAAA"); dna.createDatasetSequence(); SequenceI protein = new Sequence("Seq1", "-A-L-"); protein.createDatasetSequence(); AlignedCodonFrame acf = new AlignedCodonFrame(); MapList map = new MapList(new int[] { 1, 6 }, new int[] { 1, 2 }, 3, 1); acf.addMap(dna.getDatasetSequence(), protein.getDatasetSequence(), map); /* * No existing gaps in dna: */ AlignmentUtils.alignSequenceAs(dna, protein, acf, "---", '-', false, false); assertEquals("---GGG---AAA", dna.getSequenceAsString()); /* * Now introduce gaps in dna but ignore them when realigning. */ dna.setSequence("-G-G-G-A-A-A-"); AlignmentUtils.alignSequenceAs(dna, protein, acf, "---", '-', false, false); assertEquals("---GGG---AAA", dna.getSequenceAsString()); /* * Now include gaps in dna when realigning. First retaining 'mapped' gaps * only, i.e. those within the exon region. */ dna.setSequence("-G-G--G-A--A-A-"); AlignmentUtils.alignSequenceAs(dna, protein, acf, "---", '-', true, false); assertEquals("---G-G--G---A--A-A", dna.getSequenceAsString()); /* * Include all gaps in dna when realigning (within and without the exon * region). The leading gap, and the gaps between codons, are subsumed by * the protein alignment gap. */ dna.setSequence("-G-GG--AA-A-"); AlignmentUtils.alignSequenceAs(dna, protein, acf, "---", '-', true, true); assertEquals("---G-GG---AA-A-", dna.getSequenceAsString()); /* * Include only unmapped gaps in dna when realigning (outside the exon * region). The leading gap, and the gaps between codons, are subsumed by * the protein alignment gap. */ dna.setSequence("-G-GG--AA-A-"); AlignmentUtils.alignSequenceAs(dna, protein, acf, "---", '-', false, true); assertEquals("---GGG---AAA-", dna.getSequenceAsString()); } /** * Test for the alignSequenceAs method that takes two sequences and a mapping. */ @Test public void testAlignSequenceAs_withMapping_withIntrons() { /* * Simple case: no gaps in dna */ SequenceI dna = new Sequence("Seq1", "GGGAAACCCTTTGGG"); dna.createDatasetSequence(); SequenceI protein = new Sequence("Seq1", "-A-L-"); protein.createDatasetSequence(); AlignedCodonFrame acf = new AlignedCodonFrame(); /* * Exons at codon 2 (AAA) and 4 (TTT) */ MapList map = new MapList(new int[] { 4, 6, 10, 12 }, new int[] { 1, 2 }, 3, 1); acf.addMap(dna.getDatasetSequence(), protein.getDatasetSequence(), map); /* * Align dna as "-A-L-". The protein 'gaps' follow the introns, i.e are * placed immediately before the mapped codons. */ AlignmentUtils.alignSequenceAs(dna, protein, acf, "---", '-', false, false); assertEquals("GGG---AAACCC---TTTGGG", dna.getSequenceAsString()); /* * Add gaps to dna - but ignore when realigning. */ dna.setSequence("-G-G-G--A--A---AC-CC-T-TT-GG-G-"); AlignmentUtils.alignSequenceAs(dna, protein, acf, "---", '-', false, false); assertEquals("GGG---AAACCC---TTTGGG", dna.getSequenceAsString()); /* * Add gaps to dna - include within exons only when realigning. */ dna.setSequence("-G-G-G--A--A---A-C-CC-T-TT-GG-G-"); AlignmentUtils.alignSequenceAs(dna, protein, acf, "---", '-', true, false); assertEquals("GGG---A--A---ACCC---T-TTGGG", dna.getSequenceAsString()); /* * Include gaps outside exons only when realigning. */ dna.setSequence("-G-G-G--A--A---A-C-CC-T-TT-GG-G-"); AlignmentUtils.alignSequenceAs(dna, protein, acf, "---", '-', false, true); assertEquals("-G-G-G---AAA-C-CC---TTT-GG-G-", dna.getSequenceAsString()); /* * Include all gaps in dna when realigning. */ dna.setSequence("-G-G-G--A--A---A-C-CC-T-TT-GG-G-"); AlignmentUtils.alignSequenceAs(dna, protein, acf, "---", '-', true, true); assertEquals("-G-G-G---A--A---A-C-CC---T-TT-GG-G-", dna.getSequenceAsString()); } /** * Test for the case where not all of the protein sequence is mapped to cDNA. */ @Test public void testAlignSequenceAs_withMapping_withUnmappedProtein() { SequenceI dna = new Sequence("Seq1", "GGGAAACCCTTTGGG"); dna.createDatasetSequence(); SequenceI protein = new Sequence("Seq1", "-A-L-P-"); protein.createDatasetSequence(); AlignedCodonFrame acf = new AlignedCodonFrame(); /* * Exons at codon 2 (AAA) and 4 (TTT) mapped to A and P */ MapList map = new MapList(new int[] { 4, 6, 10, 12 }, new int[] { 1, 1, 3, 3 }, 3, 1); acf.addMap(dna.getDatasetSequence(), protein.getDatasetSequence(), map); /* * Align dna as "-A-L-P-". Currently, does nothing (aborts realignment). * Change this test first if different behaviour wanted. */ AlignmentUtils.alignSequenceAs(dna, protein, acf, "---", '-', false, false); assertEquals("GGGAAACCCTTTGGG", dna.getSequenceAsString()); } }