/*
 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.2)
 * Copyright (C) 2014 The Jalview Authors
 *
 * This file is part of Jalview.
 *
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * Jalview is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
 * The Jalview Authors are detailed in the 'AUTHORS' file.
 */
21 package jalview.analysis;
23 import static org.junit.Assert.assertEquals;
24 import static org.junit.Assert.assertFalse;
25 import static org.junit.Assert.assertSame;
26 import static org.junit.Assert.assertTrue;
27 import jalview.analysis.AlignmentUtils.MappingResult;
28 import jalview.datamodel.AlignedCodonFrame;
29 import jalview.datamodel.Alignment;
30 import jalview.datamodel.AlignmentI;
31 import jalview.datamodel.Mapping;
32 import jalview.datamodel.Sequence;
33 import jalview.datamodel.SequenceI;
34 import jalview.io.AppletFormatAdapter;
35 import jalview.io.FormatAdapter;
36 import jalview.util.MapList;
38 import java.io.IOException;
39 import java.util.Arrays;
40 import java.util.Collections;
41 import java.util.List;
44 import org.junit.Test;
46 public class AlignmentUtilsTests
49 private static final String TEST_DATA =
51 "#=GS D.melanogaster.1 AC AY119185.1/838-902\n" +
52 "#=GS D.melanogaster.2 AC AC092237.1/57223-57161\n" +
53 "#=GS D.melanogaster.3 AC AY060611.1/560-627\n" +
54 "D.melanogaster.1 G.AGCC.CU...AUGAUCGA\n" +
55 "#=GR D.melanogaster.1 SS ................((((\n" +
56 "D.melanogaster.2 C.AUUCAACU.UAUGAGGAU\n" +
57 "#=GR D.melanogaster.2 SS ................((((\n" +
58 "D.melanogaster.3 G.UGGCGCU..UAUGACGCA\n" +
59 "#=GR D.melanogaster.3 SS (.(((...(....(((((((\n" +
62 private static final String AA_SEQS_1 =
68 private static final String CDNA_SEQS_1 =
70 "AC-GG--CUC-CAA-CT\n" +
72 "-CG-TTA--ACG---AAGT\n";
74 private static final String CDNA_SEQS_2 =
81 public static Sequence ts=new Sequence("short","ASDASDASDASDASDASDASDASDASDASDASDASDASD");
84 public void testExpandFlanks()
86 AlignmentI al = new Alignment(new Sequence[] {});
87 for (int i=4;i<14;i+=3)
89 SequenceI s1=ts.deriveSequence().getSubSequence(i, i+7);
92 System.out.println(new AppletFormatAdapter().formatSequences("Clustal", al, true));
93 for (int flnk=-1;flnk<25; flnk++)
96 System.out.println("\nFlank size: "+flnk);
97 System.out.println(new AppletFormatAdapter().formatSequences("Clustal", exp=AlignmentUtils.expandContext(al, flnk), true));
99 for (SequenceI sq:exp.getSequences())
101 String ung = sq.getSequenceAsString().replaceAll("-+", "");
102 assertTrue("Flanking sequence not the same as original dataset sequence.\n"+ung+"\n"+sq.getDatasetSequence().getSequenceAsString(),ung.equalsIgnoreCase(sq.getDatasetSequence().getSequenceAsString()));
109 * Test method that returns a map of lists of sequences by sequence name.
111 * @throws IOException
114 public void testGetSequencesByName() throws IOException
116 final String data = ">Seq1Name\nKQYL\n" + ">Seq2Name\nRFPW\n"
117 + ">Seq1Name\nABCD\n";
118 AlignmentI al = loadAlignment(data, "FASTA");
119 Map<String, List<SequenceI>> map = AlignmentUtils
120 .getSequencesByName(al);
121 assertEquals(2, map.keySet().size());
122 assertEquals(2, map.get("Seq1Name").size());
123 assertEquals("KQYL", map.get("Seq1Name").get(0).getSequenceAsString());
124 assertEquals("ABCD", map.get("Seq1Name").get(1).getSequenceAsString());
125 assertEquals(1, map.get("Seq2Name").size());
126 assertEquals("RFPW", map.get("Seq2Name").get(0).getSequenceAsString());
129 * Helper method to load an alignment and ensure dataset sequences are set up.
134 * @throws IOException
136 protected AlignmentI loadAlignment(final String data, String format) throws IOException
138 Alignment a = new FormatAdapter().readFile(data,
139 AppletFormatAdapter.PASTE, format);
144 * Test mapping of protein to cDNA.
146 * @throws IOException
149 public void testMapProteinToCdna() throws IOException
// protein: Human, Mouse and Worm, 3 residues each
152 AlignmentI protein = loadAlignment(
153 ">Human\nKQY\n>Mouse\nAFP\n>Worm\nRST\n",
155 // cDNA: Mouse, Human, Mouse, 9 bases
158 ">Mouse\nGAAATCCAG\n" +
159 ">Human\nTTCGATTAC\n" +
160 ">Mouse\nGTCGTTTGC\n" +
161 ">Mouse\nGTCGTTTGCgac\n" + // not mapped - wrong length
162 ">Fly\nGTCGTTTGC\n"; // not mapped - no name match
164 AlignmentI cdna1 = loadAlignment(
167 MappingResult mapped = AlignmentUtils.mapProteinToCdna(protein, cdna1);
168 assertEquals(mapped, MappingResult.Mapped);
171 * Check two mappings (one for Mouse, one for Human)
173 assertEquals(2, protein.getCodonFrames().size());
174 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
175 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
178 * Inspect mapping for Human protein
180 AlignedCodonFrame humanMapping = protein.getCodonFrame(
181 protein.getSequenceAt(0)).get(0);
182 assertEquals(1, humanMapping.getdnaSeqs().length);
183 assertEquals(cdna1.getSequenceAt(1).getDatasetSequence(),
184 humanMapping.getdnaSeqs()[0]);
185 Mapping[] protMappings = humanMapping.getProtMappings();
186 assertEquals(1, protMappings.length);
187 MapList mapList = protMappings[0].getMap();
188 assertEquals(3, mapList.getFromRatio());
189 assertEquals(1, mapList.getToRatio());
190 assertTrue(Arrays.equals(new int[]
191 { 1, 9 }, mapList.getFromRanges().get(0)));
192 assertEquals(1, mapList.getFromRanges().size());
193 assertTrue(Arrays.equals(new int[]
194 { 1, 3 }, mapList.getToRanges().get(0)));
195 assertEquals(1, mapList.getToRanges().size());
198 * Inspect mappings for Mouse protein
200 AlignedCodonFrame mouseMapping1 = protein.getCodonFrame(
201 protein.getSequenceAt(1)).get(0);
202 assertEquals(2, mouseMapping1.getdnaSeqs().length);
203 assertEquals(cdna1.getSequenceAt(0).getDatasetSequence(),
204 mouseMapping1.getdnaSeqs()[0]);
205 assertEquals(cdna1.getSequenceAt(2).getDatasetSequence(),
206 mouseMapping1.getdnaSeqs()[1]);
207 protMappings = mouseMapping1.getProtMappings();
208 assertEquals(2, protMappings.length);
209 for (int i = 0; i < 2; i++)
211 mapList = protMappings[i].getMap();
212 assertEquals(3, mapList.getFromRatio());
213 assertEquals(1, mapList.getToRatio());
214 assertTrue(Arrays.equals(new int[]
215 { 1, 9 }, mapList.getFromRanges().get(0)));
216 assertEquals(1, mapList.getFromRanges().size());
217 assertTrue(Arrays.equals(new int[]
218 { 1, 3 }, mapList.getToRanges().get(0)));
219 assertEquals(1, mapList.getToRanges().size());
224 * Test mapping of protein to cDNA which may include start and/or stop codons.
226 * @throws IOException
229 public void testMapProteinToCdna_stopStartCodons() throws IOException
// protein: Human, Mouse and Worm, 3 residues each
232 AlignmentI protein = loadAlignment(
233 ">Human\nKQY\n>Mouse\nAFP\n>Worm\nRST\n", "FASTA");
236 ">Mouse\natgGAAATCCAG\n" + // Mouse with start codon
237 ">Human\nTTCGATtactaa\n" + // Human with stop codon TAA
238 ">Mouse\nGTCGTTTGctaG\n" + // Mouse with stop codon TAG
239 ">Human\nGTCGTTTgctGa\n" + // Human with stop codon TGA
240 ">Mouse\nATGGTCGTTTGCtag\n"; // Mouse with start and stop codons
242 AlignmentI cdna1 = loadAlignment(
245 MappingResult mapped = AlignmentUtils.mapProteinToCdna(protein, cdna1);
246 assertEquals(mapped, MappingResult.Mapped);
249 * Check two mappings (one for Mouse, one for Human)
251 assertEquals(2, protein.getCodonFrames().size());
252 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
253 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
256 * Inspect mapping for Human protein - should map to 2nd and 4th cDNA seqs
258 AlignedCodonFrame humanMapping = protein.getCodonFrame(
259 protein.getSequenceAt(0)).get(0);
260 assertEquals(2, humanMapping.getdnaSeqs().length);
261 assertEquals(cdna1.getSequenceAt(1).getDatasetSequence(),
262 humanMapping.getdnaSeqs()[0]);
263 assertEquals(cdna1.getSequenceAt(3).getDatasetSequence(),
264 humanMapping.getdnaSeqs()[1]);
265 Mapping[] protMappings = humanMapping.getProtMappings();
266 // two mappings, both to cDNA with stop codon
267 assertEquals(2, protMappings.length);
269 MapList mapList = protMappings[0].getMap();
270 assertEquals(3, mapList.getFromRatio());
271 assertEquals(1, mapList.getToRatio());
272 assertTrue(Arrays.equals(new int[]
273 { 1, 9 }, mapList.getFromRanges().get(0)));
274 assertEquals(1, mapList.getFromRanges().size());
275 assertTrue(Arrays.equals(new int[]
276 { 1, 3 }, mapList.getToRanges().get(0)));
277 assertEquals(1, mapList.getToRanges().size());
279 mapList = protMappings[1].getMap();
280 assertEquals(3, mapList.getFromRatio());
281 assertEquals(1, mapList.getToRatio());
282 assertTrue(Arrays.equals(new int[]
283 { 1, 9 }, mapList.getFromRanges().get(0)));
284 assertEquals(1, mapList.getFromRanges().size());
285 assertTrue(Arrays.equals(new int[]
286 { 1, 3 }, mapList.getToRanges().get(0)));
287 assertEquals(1, mapList.getToRanges().size());
290 * Inspect mapping for Mouse protein - should map to 1st/3rd/5th cDNA seqs
292 AlignedCodonFrame mouseMapping = protein.getCodonFrame(
293 protein.getSequenceAt(1)).get(0);
294 assertEquals(3, mouseMapping.getdnaSeqs().length);
295 assertEquals(cdna1.getSequenceAt(0).getDatasetSequence(),
296 mouseMapping.getdnaSeqs()[0]);
297 assertEquals(cdna1.getSequenceAt(2).getDatasetSequence(),
298 mouseMapping.getdnaSeqs()[1]);
299 assertEquals(cdna1.getSequenceAt(4).getDatasetSequence(),
300 mouseMapping.getdnaSeqs()[2]);
303 protMappings = mouseMapping.getProtMappings();
304 assertEquals(3, protMappings.length);
306 // first mapping to cDNA with start codon
307 mapList = protMappings[0].getMap();
308 assertEquals(3, mapList.getFromRatio());
309 assertEquals(1, mapList.getToRatio());
310 assertTrue(Arrays.equals(new int[]
311 { 4, 12 }, mapList.getFromRanges().get(0)));
312 assertEquals(1, mapList.getFromRanges().size());
313 assertTrue(Arrays.equals(new int[]
314 { 1, 3 }, mapList.getToRanges().get(0)));
315 assertEquals(1, mapList.getToRanges().size());
317 // second mapping to cDNA with stop codon
318 mapList = protMappings[1].getMap();
319 assertEquals(3, mapList.getFromRatio());
320 assertEquals(1, mapList.getToRatio());
321 assertTrue(Arrays.equals(new int[]
322 { 1, 9 }, mapList.getFromRanges().get(0)));
323 assertEquals(1, mapList.getFromRanges().size());
324 assertTrue(Arrays.equals(new int[]
325 { 1, 3 }, mapList.getToRanges().get(0)));
326 assertEquals(1, mapList.getToRanges().size());
328 // third mapping to cDNA with start and stop codon
329 mapList = protMappings[2].getMap();
330 assertEquals(3, mapList.getFromRatio());
331 assertEquals(1, mapList.getToRatio());
332 assertTrue(Arrays.equals(new int[]
333 { 4, 12 }, mapList.getFromRanges().get(0)));
334 assertEquals(1, mapList.getFromRanges().size());
335 assertTrue(Arrays.equals(new int[]
336 { 1, 3 }, mapList.getToRanges().get(0)));
337 assertEquals(1, mapList.getToRanges().size());
341 * Test for the alignSequenceAs method that takes two sequences and a mapping.
344 public void testAlignSequenceAs_withMapping_noIntrons()
346 MapList map = new MapList(new int[]
351 * No existing gaps in dna:
353 checkAlignSequenceAs("GGGAAA", "-A-L-", false, false, map,
357 * Now introduce gaps in dna but ignore them when realigning.
359 checkAlignSequenceAs("-G-G-G-A-A-A-", "-A-L-", false, false, map,
363 * Now include gaps in dna when realigning. First retaining 'mapped' gaps
364 * only, i.e. those within the exon region.
366 checkAlignSequenceAs("-G-G--G-A--A-A-", "-A-L-", true, false, map,
367 "---G-G--G---A--A-A");
370 * Include all gaps in dna when realigning (within and without the exon
371 * region). The leading gap, and the gaps between codons, are subsumed by
372 * the protein alignment gap.
374 checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", true, true, map,
378 * Include only unmapped gaps in dna when realigning (outside the exon
379 * region). The leading gap, and the gaps between codons, are subsumed by
380 * the protein alignment gap.
382 checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", false, true, map,
 * Test for the alignSequenceAs method that takes two sequences and a mapping,
 * where the mapped exons are separated by introns in the dna.
390 public void testAlignSequenceAs_withMapping_withIntrons()
393 * Exons at codon 2 (AAA) and 4 (TTT)
395 MapList map = new MapList(new int[]
396 { 4, 6, 10, 12 }, new int[]
400 * Simple case: no gaps in dna
402 checkAlignSequenceAs("GGGAAACCCTTTGGG", "--A-L-", false, false, map,
403 "GGG---AAACCCTTTGGG");
406 * Add gaps to dna - but ignore when realigning.
408 checkAlignSequenceAs("-G-G-G--A--A---AC-CC-T-TT-GG-G-", "--A-L-",
409 false, false, map, "GGG---AAACCCTTTGGG");
412 * Add gaps to dna - include within exons only when realigning.
414 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
415 true, false, map, "GGG---A--A---ACCCT-TTGGG");
418 * Include gaps outside exons only when realigning.
420 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
421 false, true, map, "-G-G-GAAAC-CCTTT-GG-G-");
424 * Include gaps following first intron if we are 'preserving mapped gaps'
426 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
427 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
430 * Include all gaps in dna when realigning.
432 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
433 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
437 * Test for the case where not all of the protein sequence is mapped to cDNA.
440 public void testAlignSequenceAs_withMapping_withUnmappedProtein()
444 * Exons at codon 2 (AAA) and 4 (TTT) mapped to A and P
446 final MapList map = new MapList(new int[]
447 { 4, 6, 10, 12 }, new int[]
448 { 1, 1, 3, 3 }, 3, 1);
452 * Expect alignment does nothing (aborts realignment). Change this test
453 * first if different behaviour wanted.
455 checkAlignSequenceAs("GGGAAACCCTTTGGG", "-A-L-P-", false,
456 false, map, "GGGAAACCCTTTGGG");
460 * Helper method that performs and verifies the method under test.
464 * @param preserveMappedGaps
465 * @param preserveUnmappedGaps
469 protected void checkAlignSequenceAs(final String dnaSeq,
470 final String proteinSeq, final boolean preserveMappedGaps,
471 final boolean preserveUnmappedGaps, MapList map,
472 final String expected)
474 SequenceI dna = new Sequence("Seq1", dnaSeq);
475 dna.createDatasetSequence();
476 SequenceI protein = new Sequence("Seq1", proteinSeq);
477 protein.createDatasetSequence();
478 AlignedCodonFrame acf = new AlignedCodonFrame();
479 acf.addMap(dna.getDatasetSequence(), protein.getDatasetSequence(), map);
481 AlignmentUtils.alignSequenceAs(dna, protein, acf, "---", '-',
482 preserveMappedGaps, preserveUnmappedGaps);
483 assertEquals(expected, dna.getSequenceAsString());
487 * Test for the alignSequenceAs method where we preserve gaps in introns only.
490 public void testAlignSequenceAs_keepIntronGapsOnly()
494 * Intron GGGAAA followed by exon CCCTTT
496 MapList map = new MapList(new int[]
500 checkAlignSequenceAs("GG-G-AA-A-C-CC-T-TT", "AL",
501 false, true, map, "GG-G-AA-ACCCTTT");
505 * Test for the method that generates an aligned translated sequence from one
509 public void testGetAlignedTranslation_dnaLikeProtein()
511 // dna alignment will be replaced
512 SequenceI dna = new Sequence("Seq1", "T-G-CC-A--T-TAC-CAG-");
513 dna.createDatasetSequence();
514 // protein alignment will be 'applied' to dna
515 SequenceI protein = new Sequence("Seq1", "-CH-Y--Q-");
516 protein.createDatasetSequence();
517 MapList map = new MapList(new int[]
520 AlignedCodonFrame acf = new AlignedCodonFrame();
521 acf.addMap(dna.getDatasetSequence(), protein.getDatasetSequence(), map);
523 final SequenceI aligned = AlignmentUtils
524 .getAlignedTranslation(protein, '-', acf);
525 assertEquals("---TGCCAT---TAC------CAG---", aligned.getSequenceAsString());
526 assertSame(aligned.getDatasetSequence(), dna.getDatasetSequence());
530 * Test the method that realigns protein to match mapped codon alignment.
533 public void testAlignProteinAsDna()
535 // seq1 codons are [1,2,3] [4,5,6] [7,8,9] [10,11,12]
536 SequenceI dna1 = new Sequence("Seq1", "TGCCATTACCAG-");
537 // seq2 codons are [1,3,4] [5,6,7] [8,9,10] [11,12,13]
538 SequenceI dna2 = new Sequence("Seq2", "T-GCCATTACCAG");
539 // seq3 codons are [1,2,3] [4,5,7] [8,9,10] [11,12,13]
540 SequenceI dna3 = new Sequence("Seq3", "TGCCA-TTACCAG");
541 AlignmentI dna = new Alignment(new SequenceI[]
542 { dna1, dna2, dna3 });
543 dna.setDataset(null);
545 // protein alignment will be realigned like dna
546 SequenceI prot1 = new Sequence("Seq1", "CHYQ");
547 SequenceI prot2 = new Sequence("Seq2", "CHYQ");
548 SequenceI prot3 = new Sequence("Seq3", "CHYQ");
549 AlignmentI protein = new Alignment(new SequenceI[]
550 { prot1, prot2, prot3 });
551 protein.setDataset(null);
553 MapList map = new MapList(new int[]
556 AlignedCodonFrame acf = new AlignedCodonFrame();
557 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
558 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
559 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
560 protein.setCodonFrames(Collections.singleton(acf));
563 * Translated codon order is [1,2,3] [1,3,4] [4,5,6] [4,5,7] [5,6,7] [7,8,9]
564 * [8,9,10] [10,11,12] [11,12,13]
566 AlignmentUtils.alignProteinAsDna(protein, dna);
567 assertEquals("C-H--Y-Q-", prot1.getSequenceAsString());
568 assertEquals("-C--H-Y-Q", prot2.getSequenceAsString());
569 assertEquals("C--H--Y-Q", prot3.getSequenceAsString());
573 * Test the method that tests whether a CDNA sequence translates to a protein
577 public void testTranslatesAs()
579 assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0,
580 "FPKG".toCharArray()));
582 assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(),
583 3, "FPKG".toCharArray()));
585 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
586 0, "FPKG".toCharArray()));
588 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtag".toCharArray(),
589 0, "FPKG".toCharArray()));
591 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtga".toCharArray(),
592 0, "FPKG".toCharArray()));
593 // with start and stop codon1
594 assertTrue(AlignmentUtils.translatesAs(
595 "atgtttcccaaaggtaa".toCharArray(), 3, "FPKG".toCharArray()));
596 // with start and stop codon2
597 assertTrue(AlignmentUtils.translatesAs(
598 "atgtttcccaaaggtag".toCharArray(), 3, "FPKG".toCharArray()));
599 // with start and stop codon3
600 assertTrue(AlignmentUtils.translatesAs(
601 "atgtttcccaaaggtga".toCharArray(), 3, "FPKG".toCharArray()));
604 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
606 "FPMG".toCharArray()));
610 public void testTranslatesAs_withAmbiguityCodes()
613 assertTrue(AlignmentUtils.translatesAs("car".toCharArray(), 0,