2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import static org.testng.AssertJUnit.assertEquals;
24 import static org.testng.AssertJUnit.assertFalse;
25 import static org.testng.AssertJUnit.assertNull;
26 import static org.testng.AssertJUnit.assertSame;
27 import static org.testng.AssertJUnit.assertTrue;
29 import jalview.analysis.AlignmentUtils.DnaVariant;
30 import jalview.datamodel.AlignedCodonFrame;
31 import jalview.datamodel.Alignment;
32 import jalview.datamodel.AlignmentAnnotation;
33 import jalview.datamodel.AlignmentI;
34 import jalview.datamodel.Annotation;
35 import jalview.datamodel.DBRefEntry;
36 import jalview.datamodel.Mapping;
37 import jalview.datamodel.SearchResults;
38 import jalview.datamodel.SearchResults.Match;
39 import jalview.datamodel.Sequence;
40 import jalview.datamodel.SequenceFeature;
41 import jalview.datamodel.SequenceI;
42 import jalview.io.AppletFormatAdapter;
43 import jalview.io.FormatAdapter;
44 import jalview.util.MapList;
45 import jalview.util.MappingUtils;
47 import java.io.IOException;
48 import java.util.ArrayList;
49 import java.util.Arrays;
50 import java.util.Iterator;
51 import java.util.LinkedHashMap;
52 import java.util.List;
54 import java.util.TreeMap;
56 import org.testng.Assert;
57 import org.testng.annotations.Test;
59 public class AlignmentUtilsTests
61 public static Sequence ts = new Sequence("short",
62 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklm");
64 @Test(groups = { "Functional" })
65 public void testExpandContext()
67 AlignmentI al = new Alignment(new Sequence[] {});
68 for (int i = 4; i < 14; i += 2)
70 SequenceI s1 = ts.deriveSequence().getSubSequence(i, i + 7);
73 System.out.println(new AppletFormatAdapter().formatSequences("Clustal",
75 for (int flnk = -1; flnk < 25; flnk++)
77 AlignmentI exp = AlignmentUtils.expandContext(al, flnk);
78 System.out.println("\nFlank size: " + flnk);
79 System.out.println(new AppletFormatAdapter().formatSequences(
80 "Clustal", exp, true));
84 * Full expansion to complete sequences
86 for (SequenceI sq : exp.getSequences())
88 String ung = sq.getSequenceAsString().replaceAll("-+", "");
89 final String errorMsg = "Flanking sequence not the same as original dataset sequence.\n"
92 + sq.getDatasetSequence().getSequenceAsString();
93 assertTrue(errorMsg, ung.equalsIgnoreCase(sq.getDatasetSequence()
94 .getSequenceAsString()));
100 * Last sequence is fully expanded, others have leading gaps to match
102 assertTrue(exp.getSequenceAt(4).getSequenceAsString()
104 assertTrue(exp.getSequenceAt(3).getSequenceAsString()
105 .startsWith("--abc"));
106 assertTrue(exp.getSequenceAt(2).getSequenceAsString()
107 .startsWith("----abc"));
108 assertTrue(exp.getSequenceAt(1).getSequenceAsString()
109 .startsWith("------abc"));
110 assertTrue(exp.getSequenceAt(0).getSequenceAsString()
111 .startsWith("--------abc"));
117 * Test that annotations are correctly adjusted by expandContext
119 @Test(groups = { "Functional" })
120 public void testExpandContext_annotation()
122 AlignmentI al = new Alignment(new Sequence[] {});
123 SequenceI ds = new Sequence("Seq1", "ABCDEFGHI");
125 SequenceI seq1 = ds.deriveSequence().getSubSequence(3, 6);
126 al.addSequence(seq1);
129 * Annotate DEF with 4/5/6 respectively
131 Annotation[] anns = new Annotation[] { new Annotation(4),
132 new Annotation(5), new Annotation(6) };
133 AlignmentAnnotation ann = new AlignmentAnnotation("SS",
134 "secondary structure", anns);
135 seq1.addAlignmentAnnotation(ann);
138 * The annotations array should match aligned positions
140 assertEquals(3, ann.annotations.length);
141 assertEquals(4, ann.annotations[0].value, 0.001);
142 assertEquals(5, ann.annotations[1].value, 0.001);
143 assertEquals(6, ann.annotations[2].value, 0.001);
146 * Check annotation to sequence position mappings before expanding the
147 * sequence; these are set up in Sequence.addAlignmentAnnotation ->
148 * Annotation.setSequenceRef -> createSequenceMappings
150 assertNull(ann.getAnnotationForPosition(1));
151 assertNull(ann.getAnnotationForPosition(2));
152 assertNull(ann.getAnnotationForPosition(3));
153 assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
154 assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
155 assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
156 assertNull(ann.getAnnotationForPosition(7));
157 assertNull(ann.getAnnotationForPosition(8));
158 assertNull(ann.getAnnotationForPosition(9));
161 * Expand the subsequence to the full sequence abcDEFghi
163 AlignmentI expanded = AlignmentUtils.expandContext(al, -1);
164 assertEquals("abcDEFghi", expanded.getSequenceAt(0)
165 .getSequenceAsString());
168 * Confirm the alignment and sequence have the same SS annotation,
169 * referencing the expanded sequence
171 ann = expanded.getSequenceAt(0).getAnnotation()[0];
172 assertSame(ann, expanded.getAlignmentAnnotation()[0]);
173 assertSame(expanded.getSequenceAt(0), ann.sequenceRef);
176 * The annotations array should have null values except for annotated
179 assertNull(ann.annotations[0]);
180 assertNull(ann.annotations[1]);
181 assertNull(ann.annotations[2]);
182 assertEquals(4, ann.annotations[3].value, 0.001);
183 assertEquals(5, ann.annotations[4].value, 0.001);
184 assertEquals(6, ann.annotations[5].value, 0.001);
185 assertNull(ann.annotations[6]);
186 assertNull(ann.annotations[7]);
187 assertNull(ann.annotations[8]);
190 * sequence position mappings should be unchanged
192 assertNull(ann.getAnnotationForPosition(1));
193 assertNull(ann.getAnnotationForPosition(2));
194 assertNull(ann.getAnnotationForPosition(3));
195 assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
196 assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
197 assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
198 assertNull(ann.getAnnotationForPosition(7));
199 assertNull(ann.getAnnotationForPosition(8));
200 assertNull(ann.getAnnotationForPosition(9));
204 * Test method that returns a map of lists of sequences by sequence name.
206 * @throws IOException
208 @Test(groups = { "Functional" })
209 public void testGetSequencesByName() throws IOException
211 final String data = ">Seq1Name\nKQYL\n" + ">Seq2Name\nRFPW\n"
212 + ">Seq1Name\nABCD\n";
213 AlignmentI al = loadAlignment(data, "FASTA");
214 Map<String, List<SequenceI>> map = AlignmentUtils
215 .getSequencesByName(al);
216 assertEquals(2, map.keySet().size());
217 assertEquals(2, map.get("Seq1Name").size());
218 assertEquals("KQYL", map.get("Seq1Name").get(0).getSequenceAsString());
219 assertEquals("ABCD", map.get("Seq1Name").get(1).getSequenceAsString());
220 assertEquals(1, map.get("Seq2Name").size());
221 assertEquals("RFPW", map.get("Seq2Name").get(0).getSequenceAsString());
225 * Helper method to load an alignment and ensure dataset sequences are set up.
231 * @throws IOException
233 protected AlignmentI loadAlignment(final String data, String format)
236 AlignmentI a = new FormatAdapter().readFile(data,
237 AppletFormatAdapter.PASTE, format);
243 * Test mapping of protein to cDNA, for the case where we have no sequence
244 * cross-references, so mappings are made first-served 1-1 where sequences
247 * @throws IOException
249 @Test(groups = { "Functional" })
250 public void testMapProteinAlignmentToCdna_noXrefs() throws IOException
252 List<SequenceI> protseqs = new ArrayList<SequenceI>();
253 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
254 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
255 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
256 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
257 protein.setDataset(null);
259 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
260 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
261 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAA")); // = EIQ
262 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
263 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
264 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
265 cdna.setDataset(null);
267 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
269 // 3 mappings made, each from 1 to 1 sequence
270 assertEquals(3, protein.getCodonFrames().size());
271 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
272 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
273 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
275 // V12345 mapped to A22222
276 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
278 assertEquals(1, acf.getdnaSeqs().length);
279 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
280 acf.getdnaSeqs()[0]);
281 Mapping[] protMappings = acf.getProtMappings();
282 assertEquals(1, protMappings.length);
283 MapList mapList = protMappings[0].getMap();
284 assertEquals(3, mapList.getFromRatio());
285 assertEquals(1, mapList.getToRatio());
286 assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
288 assertEquals(1, mapList.getFromRanges().size());
289 assertTrue(Arrays.equals(new int[] { 1, 3 },
290 mapList.getToRanges().get(0)));
291 assertEquals(1, mapList.getToRanges().size());
293 // V12346 mapped to A33333
294 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
295 assertEquals(1, acf.getdnaSeqs().length);
296 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
297 acf.getdnaSeqs()[0]);
299 // V12347 mapped to A11111
300 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
301 assertEquals(1, acf.getdnaSeqs().length);
302 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
303 acf.getdnaSeqs()[0]);
305 // no mapping involving the 'extra' A44444
306 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
310 * Test for the alignSequenceAs method that takes two sequences and a mapping.
312 @Test(groups = { "Functional" })
313 public void testAlignSequenceAs_withMapping_noIntrons()
315 MapList map = new MapList(new int[] { 1, 6 }, new int[] { 1, 2 }, 3, 1);
318 * No existing gaps in dna:
320 checkAlignSequenceAs("GGGAAA", "-A-L-", false, false, map,
324 * Now introduce gaps in dna but ignore them when realigning.
326 checkAlignSequenceAs("-G-G-G-A-A-A-", "-A-L-", false, false, map,
330 * Now include gaps in dna when realigning. First retaining 'mapped' gaps
331 * only, i.e. those within the exon region.
333 checkAlignSequenceAs("-G-G--G-A--A-A-", "-A-L-", true, false, map,
334 "---G-G--G---A--A-A");
337 * Include all gaps in dna when realigning (within and without the exon
338 * region). The leading gap, and the gaps between codons, are subsumed by
339 * the protein alignment gap.
341 checkAlignSequenceAs("-G-GG--AA-A---", "-A-L-", true, true, map,
342 "---G-GG---AA-A---");
345 * Include only unmapped gaps in dna when realigning (outside the exon
346 * region). The leading gap, and the gaps between codons, are subsumed by
347 * the protein alignment gap.
349 checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", false, true, map,
354 * Test for the alignSequenceAs method that takes two sequences and a mapping.
356 @Test(groups = { "Functional" })
357 public void testAlignSequenceAs_withMapping_withIntrons()
360 * Exons at codon 2 (AAA) and 4 (TTT)
362 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
363 new int[] { 1, 2 }, 3, 1);
366 * Simple case: no gaps in dna
368 checkAlignSequenceAs("GGGAAACCCTTTGGG", "--A-L-", false, false, map,
369 "GGG---AAACCCTTTGGG");
372 * Add gaps to dna - but ignore when realigning.
374 checkAlignSequenceAs("-G-G-G--A--A---AC-CC-T-TT-GG-G-", "--A-L-",
375 false, false, map, "GGG---AAACCCTTTGGG");
378 * Add gaps to dna - include within exons only when realigning.
380 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
381 true, false, map, "GGG---A--A---ACCCT-TTGGG");
384 * Include gaps outside exons only when realigning.
386 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
387 false, true, map, "-G-G-GAAAC-CCTTT-GG-G-");
390 * Include gaps following first intron if we are 'preserving mapped gaps'
392 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
393 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
396 * Include all gaps in dna when realigning.
398 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
399 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
403 * Test for the case where not all of the protein sequence is mapped to cDNA.
405 @Test(groups = { "Functional" })
406 public void testAlignSequenceAs_withMapping_withUnmappedProtein()
409 * Exons at codon 2 (AAA) and 4 (TTT) mapped to A and P
411 final MapList map = new MapList(new int[] { 4, 6, 10, 12 }, new int[] {
415 * -L- 'aligns' ccc------
417 checkAlignSequenceAs("gggAAAcccTTTggg", "-A-L-P-", false, false, map,
418 "gggAAAccc------TTTggg");
422 * Helper method that performs and verifies the method under test.
425 * the sequence to be realigned
427 * the sequence whose alignment is to be copied
428 * @param preserveMappedGaps
429 * @param preserveUnmappedGaps
433 protected void checkAlignSequenceAs(final String alignee,
434 final String alignModel, final boolean preserveMappedGaps,
435 final boolean preserveUnmappedGaps, MapList map,
436 final String expected)
438 SequenceI alignMe = new Sequence("Seq1", alignee);
439 alignMe.createDatasetSequence();
440 SequenceI alignFrom = new Sequence("Seq2", alignModel);
441 alignFrom.createDatasetSequence();
442 AlignedCodonFrame acf = new AlignedCodonFrame();
443 acf.addMap(alignMe.getDatasetSequence(), alignFrom.getDatasetSequence(), map);
445 AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "---", '-',
446 preserveMappedGaps, preserveUnmappedGaps);
447 assertEquals(expected, alignMe.getSequenceAsString());
451 * Test for the alignSequenceAs method where we preserve gaps in introns only.
453 @Test(groups = { "Functional" })
454 public void testAlignSequenceAs_keepIntronGapsOnly()
458 * Intron GGGAAA followed by exon CCCTTT
460 MapList map = new MapList(new int[] { 7, 12 }, new int[] { 1, 2 }, 3, 1);
462 checkAlignSequenceAs("GG-G-AA-A-C-CC-T-TT", "AL", false, true, map,
467 * Test the method that realigns protein to match mapped codon alignment.
469 @Test(groups = { "Functional" })
470 public void testAlignProteinAsDna()
472 // seq1 codons are [1,2,3] [4,5,6] [7,8,9] [10,11,12]
473 SequenceI dna1 = new Sequence("Seq1", "TGCCATTACCAG-");
474 // seq2 codons are [1,3,4] [5,6,7] [8,9,10] [11,12,13]
475 SequenceI dna2 = new Sequence("Seq2", "T-GCCATTACCAG");
476 // seq3 codons are [1,2,3] [4,5,7] [8,9,10] [11,12,13]
477 SequenceI dna3 = new Sequence("Seq3", "TGCCA-TTACCAG");
478 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
479 dna.setDataset(null);
481 // protein alignment will be realigned like dna
482 SequenceI prot1 = new Sequence("Seq1", "CHYQ");
483 SequenceI prot2 = new Sequence("Seq2", "CHYQ");
484 SequenceI prot3 = new Sequence("Seq3", "CHYQ");
485 SequenceI prot4 = new Sequence("Seq4", "R-QSV"); // unmapped, unchanged
486 AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
488 protein.setDataset(null);
490 MapList map = new MapList(new int[] { 1, 12 }, new int[] { 1, 4 }, 3, 1);
491 AlignedCodonFrame acf = new AlignedCodonFrame();
492 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
493 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
494 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
495 ArrayList<AlignedCodonFrame> acfs = new ArrayList<AlignedCodonFrame>();
497 protein.setCodonFrames(acfs);
500 * Translated codon order is [1,2,3] [1,3,4] [4,5,6] [4,5,7] [5,6,7] [7,8,9]
501 * [8,9,10] [10,11,12] [11,12,13]
503 AlignmentUtils.alignProteinAsDna(protein, dna);
504 assertEquals("C-H--Y-Q-", prot1.getSequenceAsString());
505 assertEquals("-C--H-Y-Q", prot2.getSequenceAsString());
506 assertEquals("C--H--Y-Q", prot3.getSequenceAsString());
507 assertEquals("R-QSV", prot4.getSequenceAsString());
511 * Test the method that tests whether a CDNA sequence translates to a protein
514 @Test(groups = { "Functional" })
515 public void testTranslatesAs()
517 // null arguments check
518 assertFalse(AlignmentUtils.translatesAs(null, 0, null));
519 assertFalse(AlignmentUtils.translatesAs(new char[] { 't' }, 0, null));
520 assertFalse(AlignmentUtils.translatesAs(null, 0, new char[] { 'a' }));
522 // straight translation
523 assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0,
524 "FPKG".toCharArray()));
525 // with extra start codon (not in protein)
526 assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(),
527 3, "FPKG".toCharArray()));
528 // with stop codon1 (not in protein)
529 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
530 0, "FPKG".toCharArray()));
531 // with stop codon1 (in protein as *)
532 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
533 0, "FPKG*".toCharArray()));
534 // with stop codon2 (not in protein)
535 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtag".toCharArray(),
536 0, "FPKG".toCharArray()));
537 // with stop codon3 (not in protein)
538 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtga".toCharArray(),
539 0, "FPKG".toCharArray()));
540 // with start and stop codon1
541 assertTrue(AlignmentUtils.translatesAs(
542 "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG".toCharArray()));
543 // with start and stop codon1 (in protein as *)
544 assertTrue(AlignmentUtils.translatesAs(
545 "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG*".toCharArray()));
546 // with start and stop codon2
547 assertTrue(AlignmentUtils.translatesAs(
548 "atgtttcccaaagggtag".toCharArray(), 3, "FPKG".toCharArray()));
549 // with start and stop codon3
550 assertTrue(AlignmentUtils.translatesAs(
551 "atgtttcccaaagggtga".toCharArray(), 3, "FPKG".toCharArray()));
553 // with embedded stop codons
554 assertTrue(AlignmentUtils.translatesAs(
555 "atgtttTAGcccaaaTAAgggtga".toCharArray(), 3,
556 "F*PK*G".toCharArray()));
559 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
560 0, "FPMG".toCharArray()));
563 assertFalse(AlignmentUtils.translatesAs("tttcccaaagg".toCharArray(), 0,
564 "FPKG".toCharArray()));
567 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
568 0, "FPK".toCharArray()));
570 // overlong dna (doesn't end in stop codon)
571 assertFalse(AlignmentUtils.translatesAs(
572 "tttcccaaagggttt".toCharArray(), 0, "FPKG".toCharArray()));
574 // dna + stop codon + more
575 assertFalse(AlignmentUtils.translatesAs(
576 "tttcccaaagggttaga".toCharArray(), 0, "FPKG".toCharArray()));
579 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
580 0, "FPKGQ".toCharArray()));
584 * Test mapping of protein to cDNA, for cases where the cDNA has start and/or
585 * stop codons in addition to the protein coding sequence.
587 * @throws IOException
589 @Test(groups = { "Functional" })
590 public void testMapProteinAlignmentToCdna_withStartAndStopCodons()
593 List<SequenceI> protseqs = new ArrayList<SequenceI>();
594 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
595 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
596 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
597 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
598 protein.setDataset(null);
600 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
602 dnaseqs.add(new Sequence("EMBL|A11111", "ATGTCAGCACGC"));
604 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAATAA"));
605 // = start +EIQ + stop
606 dnaseqs.add(new Sequence("EMBL|A33333", "ATGGAAATCCAGTAG"));
607 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG"));
608 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
609 cdna.setDataset(null);
611 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
613 // 3 mappings made, each from 1 to 1 sequence
614 assertEquals(3, protein.getCodonFrames().size());
615 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
616 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
617 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
619 // V12345 mapped from A22222
620 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
622 assertEquals(1, acf.getdnaSeqs().length);
623 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
624 acf.getdnaSeqs()[0]);
625 Mapping[] protMappings = acf.getProtMappings();
626 assertEquals(1, protMappings.length);
627 MapList mapList = protMappings[0].getMap();
628 assertEquals(3, mapList.getFromRatio());
629 assertEquals(1, mapList.getToRatio());
630 assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
632 assertEquals(1, mapList.getFromRanges().size());
633 assertTrue(Arrays.equals(new int[] { 1, 3 },
634 mapList.getToRanges().get(0)));
635 assertEquals(1, mapList.getToRanges().size());
637 // V12346 mapped from A33333 starting position 4
638 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
639 assertEquals(1, acf.getdnaSeqs().length);
640 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
641 acf.getdnaSeqs()[0]);
642 protMappings = acf.getProtMappings();
643 assertEquals(1, protMappings.length);
644 mapList = protMappings[0].getMap();
645 assertEquals(3, mapList.getFromRatio());
646 assertEquals(1, mapList.getToRatio());
647 assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
649 assertEquals(1, mapList.getFromRanges().size());
650 assertTrue(Arrays.equals(new int[] { 1, 3 },
651 mapList.getToRanges().get(0)));
652 assertEquals(1, mapList.getToRanges().size());
654 // V12347 mapped to A11111 starting position 4
655 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
656 assertEquals(1, acf.getdnaSeqs().length);
657 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
658 acf.getdnaSeqs()[0]);
659 protMappings = acf.getProtMappings();
660 assertEquals(1, protMappings.length);
661 mapList = protMappings[0].getMap();
662 assertEquals(3, mapList.getFromRatio());
663 assertEquals(1, mapList.getToRatio());
664 assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
666 assertEquals(1, mapList.getFromRanges().size());
667 assertTrue(Arrays.equals(new int[] { 1, 3 },
668 mapList.getToRanges().get(0)));
669 assertEquals(1, mapList.getToRanges().size());
671 // no mapping involving the 'extra' A44444
672 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
676 * Test mapping of protein to cDNA, for the case where we have some sequence
677 * cross-references. Verify that 1-to-many mappings are made where
678 * cross-references exist and sequences are mappable.
680 * @throws IOException
682 @Test(groups = { "Functional" })
683 public void testMapProteinAlignmentToCdna_withXrefs() throws IOException
685 List<SequenceI> protseqs = new ArrayList<SequenceI>();
686 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
687 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
688 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
689 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
690 protein.setDataset(null);
692 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
693 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
694 dnaseqs.add(new Sequence("EMBL|A22222", "ATGGAGATACAA")); // = start + EIQ
695 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
696 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
697 dnaseqs.add(new Sequence("EMBL|A55555", "GAGATTCAG")); // = EIQ
698 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[5]));
699 cdna.setDataset(null);
701 // Xref A22222 to V12345 (should get mapped)
702 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
703 // Xref V12345 to A44444 (should get mapped)
704 protseqs.get(0).addDBRef(new DBRefEntry("EMBL", "1", "A44444"));
705 // Xref A33333 to V12347 (sequence mismatch - should not get mapped)
706 dnaseqs.get(2).addDBRef(new DBRefEntry("UNIPROT", "1", "V12347"));
707 // as V12345 is mapped to A22222 and A44444, this leaves V12346 unmapped.
708 // it should get paired up with the unmapped A33333
709 // A11111 should be mapped to V12347
710 // A55555 is spare and has no xref so is not mapped
712 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
714 // 4 protein mappings made for 3 proteins, 2 to V12345, 1 each to V12346/7
715 assertEquals(3, protein.getCodonFrames().size());
716 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
717 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
718 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
720 // one mapping for each of the first 4 cDNA sequences
721 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
722 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
723 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(2)).size());
724 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(3)).size());
726 // V12345 mapped to A22222 and A44444
727 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
729 assertEquals(2, acf.getdnaSeqs().length);
730 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
731 acf.getdnaSeqs()[0]);
732 assertEquals(cdna.getSequenceAt(3).getDatasetSequence(),
733 acf.getdnaSeqs()[1]);
735 // V12346 mapped to A33333
736 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
737 assertEquals(1, acf.getdnaSeqs().length);
738 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
739 acf.getdnaSeqs()[0]);
741 // V12347 mapped to A11111
742 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
743 assertEquals(1, acf.getdnaSeqs().length);
744 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
745 acf.getdnaSeqs()[0]);
747 // no mapping involving the 'extra' A55555
748 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(4)).isEmpty());
752 * Test mapping of protein to cDNA, for the case where we have some sequence
753 * cross-references. Verify that once we have made an xref mapping we don't
754 * also map un-xrefd sequeces.
756 * @throws IOException
758 @Test(groups = { "Functional" })
759 public void testMapProteinAlignmentToCdna_prioritiseXrefs()
762 List<SequenceI> protseqs = new ArrayList<SequenceI>();
763 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
764 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
765 AlignmentI protein = new Alignment(
766 protseqs.toArray(new SequenceI[protseqs.size()]));
767 protein.setDataset(null);
769 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
770 dnaseqs.add(new Sequence("EMBL|A11111", "GAAATCCAG")); // = EIQ
771 dnaseqs.add(new Sequence("EMBL|A22222", "GAAATTCAG")); // = EIQ
772 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[dnaseqs
774 cdna.setDataset(null);
776 // Xref A22222 to V12345 (should get mapped)
777 // A11111 should then be mapped to the unmapped V12346
778 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
780 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
782 // 2 protein mappings made
783 assertEquals(2, protein.getCodonFrames().size());
784 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
785 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
787 // one mapping for each of the cDNA sequences
788 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
789 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
791 // V12345 mapped to A22222
792 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
794 assertEquals(1, acf.getdnaSeqs().length);
795 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
796 acf.getdnaSeqs()[0]);
798 // V12346 mapped to A11111
799 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
800 assertEquals(1, acf.getdnaSeqs().length);
801 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
802 acf.getdnaSeqs()[0]);
806 * Test the method that shows or hides sequence annotations by type(s) and
809 @Test(groups = { "Functional" })
810 public void testShowOrHideSequenceAnnotations()
812 SequenceI seq1 = new Sequence("Seq1", "AAA");
813 SequenceI seq2 = new Sequence("Seq2", "BBB");
814 SequenceI seq3 = new Sequence("Seq3", "CCC");
815 Annotation[] anns = new Annotation[] { new Annotation(2f) };
816 AlignmentAnnotation ann1 = new AlignmentAnnotation("Structure", "ann1",
818 ann1.setSequenceRef(seq1);
819 AlignmentAnnotation ann2 = new AlignmentAnnotation("Structure", "ann2",
821 ann2.setSequenceRef(seq2);
822 AlignmentAnnotation ann3 = new AlignmentAnnotation("Structure", "ann3",
824 AlignmentAnnotation ann4 = new AlignmentAnnotation("Temp", "ann4", anns);
825 ann4.setSequenceRef(seq1);
826 AlignmentAnnotation ann5 = new AlignmentAnnotation("Temp", "ann5", anns);
827 ann5.setSequenceRef(seq2);
828 AlignmentAnnotation ann6 = new AlignmentAnnotation("Temp", "ann6", anns);
829 AlignmentI al = new Alignment(new SequenceI[] { seq1, seq2, seq3 });
830 al.addAnnotation(ann1); // Structure for Seq1
831 al.addAnnotation(ann2); // Structure for Seq2
832 al.addAnnotation(ann3); // Structure for no sequence
833 al.addAnnotation(ann4); // Temp for seq1
834 al.addAnnotation(ann5); // Temp for seq2
835 al.addAnnotation(ann6); // Temp for no sequence
836 List<String> types = new ArrayList<String>();
837 List<SequenceI> scope = new ArrayList<SequenceI>();
840 * Set all sequence related Structure to hidden (ann1, ann2)
842 types.add("Structure");
843 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
845 assertFalse(ann1.visible);
846 assertFalse(ann2.visible);
847 assertTrue(ann3.visible); // not sequence-related, not affected
848 assertTrue(ann4.visible); // not Structure, not affected
849 assertTrue(ann5.visible); // "
850 assertTrue(ann6.visible); // not sequence-related, not affected
853 * Set Temp in {seq1, seq3} to hidden
859 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, false,
861 assertFalse(ann1.visible); // unchanged
862 assertFalse(ann2.visible); // unchanged
863 assertTrue(ann3.visible); // not sequence-related, not affected
864 assertFalse(ann4.visible); // Temp for seq1 hidden
865 assertTrue(ann5.visible); // not in scope, not affected
866 assertTrue(ann6.visible); // not sequence-related, not affected
869 * Set Temp in all sequences to hidden
875 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
877 assertFalse(ann1.visible); // unchanged
878 assertFalse(ann2.visible); // unchanged
879 assertTrue(ann3.visible); // not sequence-related, not affected
880 assertFalse(ann4.visible); // Temp for seq1 hidden
881 assertFalse(ann5.visible); // Temp for seq2 hidden
882 assertTrue(ann6.visible); // not sequence-related, not affected
885 * Set all types in {seq1, seq3} to visible
891 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, true,
893 assertTrue(ann1.visible); // Structure for seq1 set visible
894 assertFalse(ann2.visible); // not in scope, unchanged
895 assertTrue(ann3.visible); // not sequence-related, not affected
896 assertTrue(ann4.visible); // Temp for seq1 set visible
897 assertFalse(ann5.visible); // not in scope, unchanged
898 assertTrue(ann6.visible); // not sequence-related, not affected
901 * Set all types in all scope to hidden
903 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, true,
905 assertFalse(ann1.visible);
906 assertFalse(ann2.visible);
907 assertTrue(ann3.visible); // not sequence-related, not affected
908 assertFalse(ann4.visible);
909 assertFalse(ann5.visible);
910 assertTrue(ann6.visible); // not sequence-related, not affected
914 * Tests for the method that checks if one sequence cross-references another
916 @Test(groups = { "Functional" })
917 public void testHasCrossRef()
919 assertFalse(AlignmentUtils.hasCrossRef(null, null));
920 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
921 assertFalse(AlignmentUtils.hasCrossRef(seq1, null));
922 assertFalse(AlignmentUtils.hasCrossRef(null, seq1));
923 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
924 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
927 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20193"));
928 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
930 // case-insensitive; version number is ignored
931 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20192"));
932 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
935 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
936 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
937 // test is one-way only
938 assertFalse(AlignmentUtils.hasCrossRef(seq2, seq1));
942 * Tests for the method that checks if either sequence cross-references the
945 @Test(groups = { "Functional" })
946 public void testHaveCrossRef()
948 assertFalse(AlignmentUtils.hasCrossRef(null, null));
949 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
950 assertFalse(AlignmentUtils.haveCrossRef(seq1, null));
951 assertFalse(AlignmentUtils.haveCrossRef(null, seq1));
952 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
953 assertFalse(AlignmentUtils.haveCrossRef(seq1, seq2));
955 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
956 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
957 // next is true for haveCrossRef, false for hasCrossRef
958 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
960 // now the other way round
961 seq1.setDBRefs(null);
962 seq2.addDBRef(new DBRefEntry("EMBL", "1", "A12345"));
963 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
964 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
967 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
968 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
969 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
973 * Test the method that extracts the cds-only part of a dna alignment.
975 @Test(groups = { "Functional" })
976 public void testMakeCdsAlignment()
978 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
979 SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC");
980 SequenceI pep1 = new Sequence("pep1", "GF");
981 SequenceI pep2 = new Sequence("pep2", "GFP");
982 dna1.createDatasetSequence();
983 dna2.createDatasetSequence();
984 pep1.createDatasetSequence();
985 pep2.createDatasetSequence();
986 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 6, 0f,
988 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 10, 12, 0f,
990 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds3", 1, 3, 0f,
992 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds4", 7, 9, 0f,
994 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds5", 13, 15, 0f,
996 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
997 dna.setDataset(null);
999 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1000 new int[] { 1, 2 }, 3, 1);
1001 AlignedCodonFrame acf = new AlignedCodonFrame();
1002 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1003 dna.addCodonFrame(acf);
1004 map = new MapList(new int[] { 1, 3, 7, 9, 13, 15 }, new int[] { 1, 3 },
1006 acf = new AlignedCodonFrame();
1007 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
1008 dna.addCodonFrame(acf);
1011 * execute method under test:
1013 AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
1014 dna1, dna2 }, dna.getDataset());
1016 assertEquals(2, cds.getSequences().size());
1017 assertEquals("GGGTTT", cds.getSequenceAt(0)
1018 .getSequenceAsString());
1019 assertEquals("GGGTTTCCC", cds.getSequenceAt(1)
1020 .getSequenceAsString());
1023 * verify shared, extended alignment dataset
1025 assertSame(dna.getDataset(), cds.getDataset());
1026 assertTrue(dna.getDataset().getSequences()
1027 .contains(cds.getSequenceAt(0).getDatasetSequence()));
1028 assertTrue(dna.getDataset().getSequences()
1029 .contains(cds.getSequenceAt(1).getDatasetSequence()));
1032 * Verify mappings from CDS to peptide, cDNA to CDS, and cDNA to peptide
1033 * the mappings are on the shared alignment dataset
1035 List<AlignedCodonFrame> cdsMappings = cds.getDataset().getCodonFrames();
1037 * 6 mappings, 2*(DNA->CDS), 2*(DNA->Pep), 2*(CDS->Pep)
1039 assertEquals(6, cdsMappings.size());
1041 * verify that mapping sets for dna and cds alignments are different
1043 Assert.assertNotSame(dna.getCodonFrames(), cds.getCodonFrames());
1044 assertEquals(4, dna.getCodonFrames());
1045 assertEquals(4, cds.getCodonFrames());
1047 * Mapping from pep1 to GGGTTT in first new exon sequence
1049 List<AlignedCodonFrame> pep1Mapping = MappingUtils
1050 .findMappingsForSequence(pep1, cds.getCodonFrames());
1051 assertEquals(1, pep1Mapping.size()); // CDS->Pep
1054 SearchResults sr = MappingUtils
1055 .buildSearchResults(pep1, 1, cdsMappings);
1056 assertEquals(1, sr.getResults().size());
1057 Match m = sr.getResults().get(0);
1058 assertSame(cds.getSequenceAt(0).getDatasetSequence(),
1060 assertEquals(1, m.getStart());
1061 assertEquals(3, m.getEnd());
1063 sr = MappingUtils.buildSearchResults(pep1, 2, cdsMappings);
1064 m = sr.getResults().get(0);
1065 assertSame(cds.getSequenceAt(0).getDatasetSequence(),
1067 assertEquals(4, m.getStart());
1068 assertEquals(6, m.getEnd());
1071 * Mapping from pep2 to GGGTTTCCC in second new exon sequence
1073 List<AlignedCodonFrame> pep2Mapping = MappingUtils
1074 .findMappingsForSequence(pep2, cdsMappings);
1075 assertEquals(1, pep2Mapping.size());
1077 sr = MappingUtils.buildSearchResults(pep2, 1, cdsMappings);
1078 assertEquals(1, sr.getResults().size());
1079 m = sr.getResults().get(0);
1080 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1082 assertEquals(1, m.getStart());
1083 assertEquals(3, m.getEnd());
1085 sr = MappingUtils.buildSearchResults(pep2, 2, cdsMappings);
1086 m = sr.getResults().get(0);
1087 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1089 assertEquals(4, m.getStart());
1090 assertEquals(6, m.getEnd());
1092 sr = MappingUtils.buildSearchResults(pep2, 3, cdsMappings);
1093 m = sr.getResults().get(0);
1094 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1096 assertEquals(7, m.getStart());
1097 assertEquals(9, m.getEnd());
1101 * Test the method that makes a cds-only alignment from a DNA sequence and its
1102 * product mappings, for the case where there are multiple exon mappings to
1103 * different protein products.
1105 @Test(groups = { "Functional" })
1106 public void testMakeCdsAlignment_multipleProteins()
1108 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
1109 SequenceI pep1 = new Sequence("pep1", "GF"); // GGGTTT
1110 SequenceI pep2 = new Sequence("pep2", "KP"); // aaaccc
1111 SequenceI pep3 = new Sequence("pep3", "KF"); // aaaTTT
1112 dna1.createDatasetSequence();
1113 pep1.createDatasetSequence();
1114 pep2.createDatasetSequence();
1115 pep3.createDatasetSequence();
1116 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 6, 0f,
1118 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 10, 12, 0f,
1120 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds3", 1, 3, 0f,
1122 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds4", 7, 9, 0f,
1124 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds5", 1, 3, 0f,
1126 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds6", 10, 12, 0f,
1128 pep1.getDatasetSequence().addDBRef(
1129 new DBRefEntry("EMBLCDS", "2", "A12345"));
1130 pep2.getDatasetSequence().addDBRef(
1131 new DBRefEntry("EMBLCDS", "3", "A12346"));
1132 pep3.getDatasetSequence().addDBRef(
1133 new DBRefEntry("EMBLCDS", "4", "A12347"));
1136 * Create the CDS alignment
1138 AlignmentI dna = new Alignment(new SequenceI[] { dna1 });
1139 dna.setDataset(null);
1142 * Make the mappings from dna to protein
1144 // map ...GGG...TTT to GF
1145 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1146 new int[] { 1, 2 }, 3, 1);
1147 AlignedCodonFrame acf = new AlignedCodonFrame();
1148 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1149 dna.addCodonFrame(acf);
1151 // map aaa...ccc to KP
1152 map = new MapList(new int[] { 1, 3, 7, 9 }, new int[] { 1, 2 }, 3, 1);
1153 acf = new AlignedCodonFrame();
1154 acf.addMap(dna1.getDatasetSequence(), pep2.getDatasetSequence(), map);
1155 dna.addCodonFrame(acf);
1157 // map aaa......TTT to KF
1158 map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 2 }, 3, 1);
1159 acf = new AlignedCodonFrame();
1160 acf.addMap(dna1.getDatasetSequence(), pep3.getDatasetSequence(), map);
1161 dna.addCodonFrame(acf);
1164 * execute method under test
1166 AlignmentI cdsal = AlignmentUtils.makeCdsAlignment(
1167 new SequenceI[] { dna1 }, dna);
1170 * Verify we have 3 cds sequences, mapped to pep1/2/3 respectively
1172 List<SequenceI> cds = cdsal.getSequences();
1173 assertEquals(3, cds.size());
1176 * verify shared, extended alignment dataset
1178 assertSame(cdsal.getDataset(), dna.getDataset());
1179 assertTrue(dna.getDataset().getSequences()
1180 .contains(cds.get(0).getDatasetSequence()));
1181 assertTrue(dna.getDataset().getSequences()
1182 .contains(cds.get(1).getDatasetSequence()));
1183 assertTrue(dna.getDataset().getSequences()
1184 .contains(cds.get(2).getDatasetSequence()));
1187 * verify aligned cds sequences and their xrefs
1189 SequenceI cdsSeq = cds.get(0);
1190 assertEquals("GGGTTT", cdsSeq.getSequenceAsString());
1191 // assertEquals("dna1|A12345", cdsSeq.getName());
1192 assertEquals("dna1|pep1", cdsSeq.getName());
1193 // assertEquals(1, cdsSeq.getDBRefs().length);
1194 // DBRefEntry cdsRef = cdsSeq.getDBRefs()[0];
1195 // assertEquals("EMBLCDS", cdsRef.getSource());
1196 // assertEquals("2", cdsRef.getVersion());
1197 // assertEquals("A12345", cdsRef.getAccessionId());
1199 cdsSeq = cds.get(1);
1200 assertEquals("aaaccc", cdsSeq.getSequenceAsString());
1201 // assertEquals("dna1|A12346", cdsSeq.getName());
1202 assertEquals("dna1|pep2", cdsSeq.getName());
1203 // assertEquals(1, cdsSeq.getDBRefs().length);
1204 // cdsRef = cdsSeq.getDBRefs()[0];
1205 // assertEquals("EMBLCDS", cdsRef.getSource());
1206 // assertEquals("3", cdsRef.getVersion());
1207 // assertEquals("A12346", cdsRef.getAccessionId());
1209 cdsSeq = cds.get(2);
1210 assertEquals("aaaTTT", cdsSeq.getSequenceAsString());
1211 // assertEquals("dna1|A12347", cdsSeq.getName());
1212 assertEquals("dna1|pep3", cdsSeq.getName());
1213 // assertEquals(1, cdsSeq.getDBRefs().length);
1214 // cdsRef = cdsSeq.getDBRefs()[0];
1215 // assertEquals("EMBLCDS", cdsRef.getSource());
1216 // assertEquals("4", cdsRef.getVersion());
1217 // assertEquals("A12347", cdsRef.getAccessionId());
1220 * Verify there are mappings from each cds sequence to its protein product
1221 * and also to its dna source
1223 Iterator<AlignedCodonFrame> newMappingsIterator = cdsal
1224 .getCodonFrames().iterator();
1226 // mappings for dna1 - exon1 - pep1
1227 AlignedCodonFrame cdsMapping = newMappingsIterator.next();
1228 List<Mapping> dnaMappings = cdsMapping.getMappingsFromSequence(dna1);
1229 assertEquals(3, dnaMappings.size());
1230 assertSame(cds.get(0).getDatasetSequence(), dnaMappings.get(0)
1232 assertEquals("G(1) in CDS should map to G(4) in DNA", 4, dnaMappings
1233 .get(0).getMap().getToPosition(1));
1234 List<Mapping> peptideMappings = cdsMapping.getMappingsFromSequence(cds
1235 .get(0).getDatasetSequence());
1236 assertEquals(1, peptideMappings.size());
1237 assertSame(pep1.getDatasetSequence(), peptideMappings.get(0).getTo());
1239 // mappings for dna1 - cds2 - pep2
1240 assertSame(cds.get(1).getDatasetSequence(), dnaMappings.get(1)
1242 assertEquals("c(4) in CDS should map to c(7) in DNA", 7, dnaMappings
1243 .get(1).getMap().getToPosition(4));
1244 peptideMappings = cdsMapping.getMappingsFromSequence(cds.get(1)
1245 .getDatasetSequence());
1246 assertEquals(1, peptideMappings.size());
1247 assertSame(pep2.getDatasetSequence(), peptideMappings.get(0).getTo());
1249 // mappings for dna1 - cds3 - pep3
1250 assertSame(cds.get(2).getDatasetSequence(), dnaMappings.get(2)
1252 assertEquals("T(4) in CDS should map to T(10) in DNA", 10, dnaMappings
1253 .get(2).getMap().getToPosition(4));
1254 peptideMappings = cdsMapping.getMappingsFromSequence(cds.get(2)
1255 .getDatasetSequence());
1256 assertEquals(1, peptideMappings.size());
1257 assertSame(pep3.getDatasetSequence(), peptideMappings.get(0).getTo());
1260 @Test(groups = { "Functional" })
1261 public void testIsMappable()
1263 SequenceI dna1 = new Sequence("dna1", "cgCAGtgGT");
1264 SequenceI aa1 = new Sequence("aa1", "RSG");
1265 AlignmentI al1 = new Alignment(new SequenceI[] { dna1 });
1266 AlignmentI al2 = new Alignment(new SequenceI[] { aa1 });
1268 assertFalse(AlignmentUtils.isMappable(null, null));
1269 assertFalse(AlignmentUtils.isMappable(al1, null));
1270 assertFalse(AlignmentUtils.isMappable(null, al1));
1271 assertFalse(AlignmentUtils.isMappable(al1, al1));
1272 assertFalse(AlignmentUtils.isMappable(al2, al2));
1274 assertTrue(AlignmentUtils.isMappable(al1, al2));
1275 assertTrue(AlignmentUtils.isMappable(al2, al1));
1279 * Test creating a mapping when the sequences involved do not start at residue
1282 * @throws IOException
1284 @Test(groups = { "Functional" })
1285 public void testMapCdnaToProtein_forSubsequence()
1288 SequenceI prot = new Sequence("UNIPROT|V12345", "E-I--Q", 10, 12);
1289 prot.createDatasetSequence();
1291 SequenceI dna = new Sequence("EMBL|A33333", "GAA--AT-C-CAG", 40, 48);
1292 dna.createDatasetSequence();
1294 MapList map = AlignmentUtils.mapCdnaToProtein(prot, dna);
1295 assertEquals(10, map.getToLowest());
1296 assertEquals(12, map.getToHighest());
1297 assertEquals(40, map.getFromLowest());
1298 assertEquals(48, map.getFromHighest());
1302 * Test for the alignSequenceAs method where we have protein mapped to protein
1304 @Test(groups = { "Functional" })
1305 public void testAlignSequenceAs_mappedProteinProtein()
1308 SequenceI alignMe = new Sequence("Match", "MGAASEV");
1309 alignMe.createDatasetSequence();
1310 SequenceI alignFrom = new Sequence("Query", "LQTGYMGAASEVMFSPTRR");
1311 alignFrom.createDatasetSequence();
1313 AlignedCodonFrame acf = new AlignedCodonFrame();
1314 // this is like a domain or motif match of part of a peptide sequence
1315 MapList map = new MapList(new int[] { 6, 12 }, new int[] { 1, 7 }, 1, 1);
1316 acf.addMap(alignFrom.getDatasetSequence(),
1317 alignMe.getDatasetSequence(), map);
1319 AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "-", '-', true,
1321 assertEquals("-----MGAASEV-------", alignMe.getSequenceAsString());
1325 * Test for the alignSequenceAs method where there are trailing unmapped
1326 * residues in the model sequence
1328 @Test(groups = { "Functional" })
1329 public void testAlignSequenceAs_withTrailingPeptide()
1331 // map first 3 codons to KPF; G is a trailing unmapped residue
1332 MapList map = new MapList(new int[] { 1, 9 }, new int[] { 1, 3 }, 3, 1);
1334 checkAlignSequenceAs("AAACCCTTT", "K-PFG", true, true, map,
1339 * Tests for transferring features between mapped sequences
1341 @Test(groups = { "Functional" })
1342 public void testTransferFeatures()
1344 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1345 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1348 dna.addSequenceFeature(new SequenceFeature("type1", "desc1", 1, 2, 1f,
1350 // partial overlap - to [1, 1]
1351 dna.addSequenceFeature(new SequenceFeature("type2", "desc2", 3, 4, 2f,
1353 // exact overlap - to [1, 3]
1354 dna.addSequenceFeature(new SequenceFeature("type3", "desc3", 4, 6, 3f,
1356 // spanning overlap - to [2, 5]
1357 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1359 // exactly overlaps whole mapped range [1, 6]
1360 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1362 // no overlap (internal)
1363 dna.addSequenceFeature(new SequenceFeature("type6", "desc6", 7, 9, 6f,
1365 // no overlap (3' end)
1366 dna.addSequenceFeature(new SequenceFeature("type7", "desc7", 13, 15,
1368 // overlap (3' end) - to [6, 6]
1369 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1371 // extended overlap - to [6, +]
1372 dna.addSequenceFeature(new SequenceFeature("type9", "desc9", 12, 13,
1375 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1376 new int[] { 1, 6 }, 1, 1);
1379 * transferFeatures() will build 'partial overlap' for regions
1380 * that partially overlap 5' or 3' (start or end) of target sequence
1382 AlignmentUtils.transferFeatures(dna, cds, map, null);
1383 SequenceFeature[] sfs = cds.getSequenceFeatures();
1384 assertEquals(6, sfs.length);
1386 SequenceFeature sf = sfs[0];
1387 assertEquals("type2", sf.getType());
1388 assertEquals("desc2", sf.getDescription());
1389 assertEquals(2f, sf.getScore());
1390 assertEquals(1, sf.getBegin());
1391 assertEquals(1, sf.getEnd());
1394 assertEquals("type3", sf.getType());
1395 assertEquals("desc3", sf.getDescription());
1396 assertEquals(3f, sf.getScore());
1397 assertEquals(1, sf.getBegin());
1398 assertEquals(3, sf.getEnd());
1401 assertEquals("type4", sf.getType());
1402 assertEquals(2, sf.getBegin());
1403 assertEquals(5, sf.getEnd());
1406 assertEquals("type5", sf.getType());
1407 assertEquals(1, sf.getBegin());
1408 assertEquals(6, sf.getEnd());
1411 assertEquals("type8", sf.getType());
1412 assertEquals(6, sf.getBegin());
1413 assertEquals(6, sf.getEnd());
1416 assertEquals("type9", sf.getType());
1417 assertEquals(6, sf.getBegin());
1418 assertEquals(6, sf.getEnd());
1422 * Tests for transferring features between mapped sequences
1424 @Test(groups = { "Functional" })
1425 public void testTransferFeatures_withOmit()
1427 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1428 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1430 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1431 new int[] { 1, 6 }, 1, 1);
1433 // [5, 11] maps to [2, 5]
1434 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1436 // [4, 12] maps to [1, 6]
1437 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1439 // [12, 12] maps to [6, 6]
1440 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1443 // desc4 and desc8 are the 'omit these' varargs
1444 AlignmentUtils.transferFeatures(dna, cds, map, null, "type4", "type8");
1445 SequenceFeature[] sfs = cds.getSequenceFeatures();
1446 assertEquals(1, sfs.length);
1448 SequenceFeature sf = sfs[0];
1449 assertEquals("type5", sf.getType());
1450 assertEquals(1, sf.getBegin());
1451 assertEquals(6, sf.getEnd());
1455 * Tests for transferring features between mapped sequences
1457 @Test(groups = { "Functional" })
1458 public void testTransferFeatures_withSelect()
1460 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1461 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1463 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1464 new int[] { 1, 6 }, 1, 1);
1466 // [5, 11] maps to [2, 5]
1467 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1469 // [4, 12] maps to [1, 6]
1470 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1472 // [12, 12] maps to [6, 6]
1473 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1476 // "type5" is the 'select this type' argument
1477 AlignmentUtils.transferFeatures(dna, cds, map, "type5");
1478 SequenceFeature[] sfs = cds.getSequenceFeatures();
1479 assertEquals(1, sfs.length);
1481 SequenceFeature sf = sfs[0];
1482 assertEquals("type5", sf.getType());
1483 assertEquals(1, sf.getBegin());
1484 assertEquals(6, sf.getEnd());
1488 * Test the method that extracts the cds-only part of a dna alignment, for the
1489 * case where the cds should be aligned to match its nucleotide sequence.
1491 @Test(groups = { "Functional" })
1492 public void testMakeCdsAlignment_alternativeTranscripts()
1494 SequenceI dna1 = new Sequence("dna1", "aaaGGGCC-----CTTTaaaGGG");
1495 // alternative transcript of same dna skips CCC codon
1496 SequenceI dna2 = new Sequence("dna2", "aaaGGGCC-----cttTaaaGGG");
1497 // dna3 has no mapping (protein product) so should be ignored here
1498 SequenceI dna3 = new Sequence("dna3", "aaaGGGCCCCCGGGcttTaaaGGG");
1499 SequenceI pep1 = new Sequence("pep1", "GPFG");
1500 SequenceI pep2 = new Sequence("pep2", "GPG");
1501 dna1.createDatasetSequence();
1502 dna2.createDatasetSequence();
1503 dna3.createDatasetSequence();
1504 pep1.createDatasetSequence();
1505 pep2.createDatasetSequence();
1506 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 8, 0f,
1508 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 9, 12, 0f,
1510 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds3", 16, 18, 0f,
1512 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 4, 8, 0f,
1514 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 12, 12, 0f,
1516 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 16, 18, 0f,
1519 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
1520 dna.setDataset(null);
1522 MapList map = new MapList(new int[] { 4, 12, 16, 18 },
1523 new int[] { 1, 4 }, 3, 1);
1524 AlignedCodonFrame acf = new AlignedCodonFrame();
1525 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1526 dna.addCodonFrame(acf);
1527 map = new MapList(new int[] { 4, 8, 12, 12, 16, 18 },
1530 acf = new AlignedCodonFrame();
1531 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
1532 dna.addCodonFrame(acf);
1534 AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
1535 dna1, dna2, dna3 }, dna);
1536 List<SequenceI> cdsSeqs = cds.getSequences();
1537 assertEquals(2, cdsSeqs.size());
1538 assertEquals("GGGCCCTTTGGG", cdsSeqs.get(0).getSequenceAsString());
1539 assertEquals("GGGCCTGGG", cdsSeqs.get(1).getSequenceAsString());
1542 * verify shared, extended alignment dataset
1544 assertSame(dna.getDataset(), cds.getDataset());
1545 assertTrue(dna.getDataset().getSequences()
1546 .contains(cdsSeqs.get(0).getDatasetSequence()));
1547 assertTrue(dna.getDataset().getSequences()
1548 .contains(cdsSeqs.get(1).getDatasetSequence()));
1551 * Verify updated mappings
1553 List<AlignedCodonFrame> cdsMappings = cds.getCodonFrames();
1554 assertEquals(2, cdsMappings.size());
1557 * Mapping from pep1 to GGGTTT in first new CDS sequence
1559 List<AlignedCodonFrame> pep1Mapping = MappingUtils
1560 .findMappingsForSequence(pep1, cdsMappings);
1561 assertEquals(1, pep1Mapping.size());
1563 * maps GPFG to 1-3,4-6,7-9,10-12
1565 SearchResults sr = MappingUtils
1566 .buildSearchResults(pep1, 1, cdsMappings);
1567 assertEquals(1, sr.getResults().size());
1568 Match m = sr.getResults().get(0);
1569 assertEquals(cds.getSequenceAt(0).getDatasetSequence(),
1571 assertEquals(1, m.getStart());
1572 assertEquals(3, m.getEnd());
1573 sr = MappingUtils.buildSearchResults(pep1, 2, cdsMappings);
1574 m = sr.getResults().get(0);
1575 assertEquals(4, m.getStart());
1576 assertEquals(6, m.getEnd());
1577 sr = MappingUtils.buildSearchResults(pep1, 3, cdsMappings);
1578 m = sr.getResults().get(0);
1579 assertEquals(7, m.getStart());
1580 assertEquals(9, m.getEnd());
1581 sr = MappingUtils.buildSearchResults(pep1, 4, cdsMappings);
1582 m = sr.getResults().get(0);
1583 assertEquals(10, m.getStart());
1584 assertEquals(12, m.getEnd());
1587 * GPG in pep2 map to 1-3,4-6,7-9 in second CDS sequence
1589 List<AlignedCodonFrame> pep2Mapping = MappingUtils
1590 .findMappingsForSequence(pep2, cdsMappings);
1591 assertEquals(1, pep2Mapping.size());
1592 sr = MappingUtils.buildSearchResults(pep2, 1, cdsMappings);
1593 assertEquals(1, sr.getResults().size());
1594 m = sr.getResults().get(0);
1595 assertEquals(cds.getSequenceAt(1).getDatasetSequence(),
1597 assertEquals(1, m.getStart());
1598 assertEquals(3, m.getEnd());
1599 sr = MappingUtils.buildSearchResults(pep2, 2, cdsMappings);
1600 m = sr.getResults().get(0);
1601 assertEquals(4, m.getStart());
1602 assertEquals(6, m.getEnd());
1603 sr = MappingUtils.buildSearchResults(pep2, 3, cdsMappings);
1604 m = sr.getResults().get(0);
1605 assertEquals(7, m.getStart());
1606 assertEquals(9, m.getEnd());
1610 * Test the method that realigns protein to match mapped codon alignment.
1612 @Test(groups = { "Functional" })
1613 public void testAlignProteinAsDna_incompleteStartCodon()
1615 // seq1: incomplete start codon (not mapped), then [3, 11]
1616 SequenceI dna1 = new Sequence("Seq1", "ccAAA-TTT-GGG-");
1617 // seq2 codons are [4, 5], [8, 11]
1618 SequenceI dna2 = new Sequence("Seq2", "ccaAA-ttT-GGG-");
1619 // seq3 incomplete start codon at 'tt'
1620 SequenceI dna3 = new Sequence("Seq3", "ccaaa-ttt-GGG-");
1621 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
1622 dna.setDataset(null);
1624 // prot1 has 'X' for incomplete start codon (not mapped)
1625 SequenceI prot1 = new Sequence("Seq1", "XKFG"); // X for incomplete start
1626 SequenceI prot2 = new Sequence("Seq2", "NG");
1627 SequenceI prot3 = new Sequence("Seq3", "XG"); // X for incomplete start
1628 AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
1630 protein.setDataset(null);
1632 // map dna1 [3, 11] to prot1 [2, 4] KFG
1633 MapList map = new MapList(new int[] { 3, 11 }, new int[] { 2, 4 }, 3, 1);
1634 AlignedCodonFrame acf = new AlignedCodonFrame();
1635 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
1637 // map dna2 [4, 5] [8, 11] to prot2 [1, 2] NG
1638 map = new MapList(new int[] { 4, 5, 8, 11 }, new int[] { 1, 2 }, 3, 1);
1639 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
1641 // map dna3 [9, 11] to prot3 [2, 2] G
1642 map = new MapList(new int[] { 9, 11 }, new int[] { 2, 2 }, 3, 1);
1643 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
1645 ArrayList<AlignedCodonFrame> acfs = new ArrayList<AlignedCodonFrame>();
1647 protein.setCodonFrames(acfs);
1650 * verify X is included in the aligned proteins, and placed just
1651 * before the first mapped residue
1652 * CCT is between CCC and TTT
1654 AlignmentUtils.alignProteinAsDna(protein, dna);
1655 assertEquals("XK-FG", prot1.getSequenceAsString());
1656 assertEquals("--N-G", prot2.getSequenceAsString());
1657 assertEquals("---XG", prot3.getSequenceAsString());
1661 * Tests for the method that maps the subset of a dna sequence that has CDS
1662 * (or subtype) feature - case where the start codon is incomplete.
1664 @Test(groups = "Functional")
1665 public void testFindCdsPositions_fivePrimeIncomplete()
1667 SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
1668 dnaSeq.createDatasetSequence();
1669 SequenceI ds = dnaSeq.getDatasetSequence();
1671 // CDS for dna 5-6 (incomplete codon), 7-9
1672 SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
1673 sf.setPhase("2"); // skip 2 bases to start of next codon
1674 ds.addSequenceFeature(sf);
1675 // CDS for dna 13-15
1676 sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
1677 ds.addSequenceFeature(sf);
1679 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1682 * check the mapping starts with the first complete codon
1684 assertEquals(6, MappingUtils.getLength(ranges));
1685 assertEquals(2, ranges.size());
1686 assertEquals(7, ranges.get(0)[0]);
1687 assertEquals(9, ranges.get(0)[1]);
1688 assertEquals(13, ranges.get(1)[0]);
1689 assertEquals(15, ranges.get(1)[1]);
1693 * Tests for the method that maps the subset of a dna sequence that has CDS
1694 * (or subtype) feature.
1696 @Test(groups = "Functional")
1697 public void testFindCdsPositions()
1699 SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
1700 dnaSeq.createDatasetSequence();
1701 SequenceI ds = dnaSeq.getDatasetSequence();
1703 // CDS for dna 10-12
1704 SequenceFeature sf = new SequenceFeature("CDS_predicted", "", 10, 12,
1707 ds.addSequenceFeature(sf);
1709 sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
1711 ds.addSequenceFeature(sf);
1712 // exon feature should be ignored here
1713 sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
1714 ds.addSequenceFeature(sf);
1716 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1718 * verify ranges { [4-6], [12-10] }
1719 * note CDS ranges are ordered ascending even if the CDS
1722 assertEquals(6, MappingUtils.getLength(ranges));
1723 assertEquals(2, ranges.size());
1724 assertEquals(4, ranges.get(0)[0]);
1725 assertEquals(6, ranges.get(0)[1]);
1726 assertEquals(10, ranges.get(1)[0]);
1727 assertEquals(12, ranges.get(1)[1]);
1731 * Test the method that computes a map of codon variants for each protein
1732 * position from "sequence_variant" features on dna
1734 @Test(groups = "Functional")
1735 public void testBuildDnaVariantsMap()
1737 SequenceI dna = new Sequence("dna", "atgAAATTTGGGCCCtag");
1738 MapList map = new MapList(new int[] { 1, 18 }, new int[] { 1, 5 }, 3, 1);
1741 * first with no variants on dna
1743 LinkedHashMap<Integer, List<DnaVariant>[]> variantsMap = AlignmentUtils
1744 .buildDnaVariantsMap(dna, map);
1745 assertTrue(variantsMap.isEmpty());
1748 * single allele codon 1, on base 1
1750 SequenceFeature sf1 = new SequenceFeature("sequence_variant", "", 1, 1,
1752 sf1.setValue("alleles", "T");
1753 sf1.setValue("ID", "sequence_variant:rs758803211");
1754 dna.addSequenceFeature(sf1);
1757 * two alleles codon 2, on bases 2 and 3 (distinct variants)
1759 SequenceFeature sf2 = new SequenceFeature("sequence_variant", "", 5, 5,
1761 sf2.setValue("alleles", "T");
1762 sf2.setValue("ID", "sequence_variant:rs758803212");
1763 dna.addSequenceFeature(sf2);
1764 SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 6, 6,
1766 sf3.setValue("alleles", "G");
1767 sf3.setValue("ID", "sequence_variant:rs758803213");
1768 dna.addSequenceFeature(sf3);
1771 * two alleles codon 3, both on base 2 (one variant)
1773 SequenceFeature sf4 = new SequenceFeature("sequence_variant", "", 8, 8,
1775 sf4.setValue("alleles", "C, G");
1776 sf4.setValue("ID", "sequence_variant:rs758803214");
1777 dna.addSequenceFeature(sf4);
1779 // no alleles on codon 4
1782 * alleles on codon 5 on all 3 bases (distinct variants)
1784 SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 13,
1786 sf5.setValue("alleles", "C, G"); // (C duplicates given base value)
1787 sf5.setValue("ID", "sequence_variant:rs758803215");
1788 dna.addSequenceFeature(sf5);
1789 SequenceFeature sf6 = new SequenceFeature("sequence_variant", "", 14,
1791 sf6.setValue("alleles", "g, a"); // should force to upper-case
1792 sf6.setValue("ID", "sequence_variant:rs758803216");
1793 dna.addSequenceFeature(sf6);
1794 SequenceFeature sf7 = new SequenceFeature("sequence_variant", "", 15,
1796 sf7.setValue("alleles", "A, T");
1797 sf7.setValue("ID", "sequence_variant:rs758803217");
1798 dna.addSequenceFeature(sf7);
1801 * build map - expect variants on positions 1, 2, 3, 5
1803 variantsMap = AlignmentUtils.buildDnaVariantsMap(dna, map);
1804 assertEquals(4, variantsMap.size());
1807 * protein residue 1: variant on codon (ATG) base 1, not on 2 or 3
1809 List<DnaVariant>[] pep1Variants = variantsMap.get(1);
1810 assertEquals(3, pep1Variants.length);
1811 assertEquals(1, pep1Variants[0].size());
1812 assertEquals("A", pep1Variants[0].get(0).base); // codon[1] base
1813 assertSame(sf1, pep1Variants[0].get(0).variant); // codon[1] variant
1814 assertEquals(1, pep1Variants[1].size());
1815 assertEquals("T", pep1Variants[1].get(0).base); // codon[2] base
1816 assertNull(pep1Variants[1].get(0).variant); // no variant here
1817 assertEquals(1, pep1Variants[2].size());
1818 assertEquals("G", pep1Variants[2].get(0).base); // codon[3] base
1819 assertNull(pep1Variants[2].get(0).variant); // no variant here
1822 * protein residue 2: variants on codon (AAA) bases 2 and 3
1824 List<DnaVariant>[] pep2Variants = variantsMap.get(2);
1825 assertEquals(3, pep2Variants.length);
1826 assertEquals(1, pep2Variants[0].size());
1827 // codon[1] base recorded while processing variant on codon[2]
1828 assertEquals("A", pep2Variants[0].get(0).base);
1829 assertNull(pep2Variants[0].get(0).variant); // no variant here
1830 // codon[2] base and variant:
1831 assertEquals(1, pep2Variants[1].size());
1832 assertEquals("A", pep2Variants[1].get(0).base);
1833 assertSame(sf2, pep2Variants[1].get(0).variant);
1834 // codon[3] base was recorded when processing codon[2] variant
1835 // and then the variant for codon[3] added to it
1836 assertEquals(1, pep2Variants[2].size());
1837 assertEquals("A", pep2Variants[2].get(0).base);
1838 assertSame(sf3, pep2Variants[2].get(0).variant);
1841 * protein residue 3: variants on codon (TTT) base 2 only
1843 List<DnaVariant>[] pep3Variants = variantsMap.get(3);
1844 assertEquals(3, pep3Variants.length);
1845 assertEquals(1, pep3Variants[0].size());
1846 assertEquals("T", pep3Variants[0].get(0).base); // codon[1] base
1847 assertNull(pep3Variants[0].get(0).variant); // no variant here
1848 assertEquals(1, pep3Variants[1].size());
1849 assertEquals("T", pep3Variants[1].get(0).base); // codon[2] base
1850 assertSame(sf4, pep3Variants[1].get(0).variant); // codon[2] variant
1851 assertEquals(1, pep3Variants[2].size());
1852 assertEquals("T", pep3Variants[2].get(0).base); // codon[3] base
1853 assertNull(pep3Variants[2].get(0).variant); // no variant here
1856 * three variants on protein position 5
1858 List<DnaVariant>[] pep5Variants = variantsMap.get(5);
1859 assertEquals(3, pep5Variants.length);
1860 assertEquals(1, pep5Variants[0].size());
1861 assertEquals("C", pep5Variants[0].get(0).base); // codon[1] base
1862 assertSame(sf5, pep5Variants[0].get(0).variant); // codon[1] variant
1863 assertEquals(1, pep5Variants[1].size());
1864 assertEquals("C", pep5Variants[1].get(0).base); // codon[2] base
1865 assertSame(sf6, pep5Variants[1].get(0).variant); // codon[2] variant
1866 assertEquals(1, pep5Variants[2].size());
1867 assertEquals("C", pep5Variants[2].get(0).base); // codon[3] base
1868 assertSame(sf7, pep5Variants[2].get(0).variant); // codon[3] variant
1872 * Tests for the method that computes all peptide variants given codon
1875 @Test(groups = "Functional")
1876 public void testComputePeptideVariants()
1879 * scenario: AAATTTCCC codes for KFP, with variants
1885 * CAC,CGC -> H,R (as one variant)
1887 SequenceI peptide = new Sequence("pep/10-12", "KFP");
1890 * two distinct variants for codon 1 position 1
1891 * second one has clinical significance
1893 SequenceFeature sf1 = new SequenceFeature("sequence_variant", "", 1, 1,
1895 sf1.setValue("alleles", "A,G"); // GAA -> E
1896 sf1.setValue("ID", "var1.125A>G");
1897 SequenceFeature sf2 = new SequenceFeature("sequence_variant", "", 1, 1,
1899 sf2.setValue("alleles", "A,C"); // CAA -> Q
1900 sf2.setValue("ID", "var2");
1901 sf2.setValue("clinical_significance", "Dodgy");
1902 SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 3, 3,
1904 sf3.setValue("alleles", "A,G"); // synonymous
1905 sf3.setValue("ID", "var3");
1906 sf3.setValue("clinical_significance", "None");
1907 SequenceFeature sf4 = new SequenceFeature("sequence_variant", "", 3, 3,
1909 sf4.setValue("alleles", "A,T"); // AAT -> N
1910 sf4.setValue("ID", "sequence_variant:var4"); // prefix gets stripped off
1911 sf4.setValue("clinical_significance", "Benign");
1912 SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 6, 6,
1914 sf5.setValue("alleles", "T,C"); // synonymous
1915 sf5.setValue("ID", "var5");
1916 sf5.setValue("clinical_significance", "Bad");
1917 SequenceFeature sf6 = new SequenceFeature("sequence_variant", "", 8, 8,
1919 sf6.setValue("alleles", "C,A,G"); // CAC,CGC -> H,R
1920 sf6.setValue("ID", "var6");
1921 sf6.setValue("clinical_significance", "Good");
1923 List<DnaVariant> codon1Variants = new ArrayList<DnaVariant>();
1924 List<DnaVariant> codon2Variants = new ArrayList<DnaVariant>();
1925 List<DnaVariant> codon3Variants = new ArrayList<DnaVariant>();
1926 List<DnaVariant> codonVariants[] = new ArrayList[3];
1927 codonVariants[0] = codon1Variants;
1928 codonVariants[1] = codon2Variants;
1929 codonVariants[2] = codon3Variants;
1932 * compute variants for protein position 1
1934 codon1Variants.add(new DnaVariant("A", sf1));
1935 codon1Variants.add(new DnaVariant("A", sf2));
1936 codon2Variants.add(new DnaVariant("A"));
1937 codon2Variants.add(new DnaVariant("A"));
1938 codon3Variants.add(new DnaVariant("A", sf3));
1939 codon3Variants.add(new DnaVariant("A", sf4));
1940 AlignmentUtils.computePeptideVariants(peptide, 1, codonVariants);
1943 * compute variants for protein position 2
1945 codon1Variants.clear();
1946 codon2Variants.clear();
1947 codon3Variants.clear();
1948 codon1Variants.add(new DnaVariant("T"));
1949 codon2Variants.add(new DnaVariant("T"));
1950 codon3Variants.add(new DnaVariant("T", sf5));
1951 AlignmentUtils.computePeptideVariants(peptide, 2, codonVariants);
1954 * compute variants for protein position 3
1956 codon1Variants.clear();
1957 codon2Variants.clear();
1958 codon3Variants.clear();
1959 codon1Variants.add(new DnaVariant("C"));
1960 codon2Variants.add(new DnaVariant("C", sf6));
1961 codon3Variants.add(new DnaVariant("C"));
1962 AlignmentUtils.computePeptideVariants(peptide, 3, codonVariants);
1965 * verify added sequence features for
1972 SequenceFeature[] sfs = peptide.getSequenceFeatures();
1973 assertEquals(5, sfs.length);
1974 SequenceFeature sf = sfs[0];
1975 assertEquals(1, sf.getBegin());
1976 assertEquals(1, sf.getEnd());
1977 assertEquals("p.Lys1Glu", sf.getDescription());
1978 assertEquals("var1.125A>G", sf.getValue("ID"));
1979 assertNull(sf.getValue("clinical_significance"));
1980 assertEquals("ID=var1.125A>G", sf.getAttributes());
1981 assertEquals(1, sf.links.size());
1982 // link to variation is urlencoded
1984 "p.Lys1Glu var1.125A>G|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var1.125A%3EG",
1986 assertEquals("Jalview", sf.getFeatureGroup());
1988 assertEquals(1, sf.getBegin());
1989 assertEquals(1, sf.getEnd());
1990 assertEquals("p.Lys1Gln", sf.getDescription());
1991 assertEquals("var2", sf.getValue("ID"));
1992 assertEquals("Dodgy", sf.getValue("clinical_significance"));
1993 assertEquals("ID=var2;clinical_significance=Dodgy", sf.getAttributes());
1994 assertEquals(1, sf.links.size());
1996 "p.Lys1Gln var2|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var2",
1998 assertEquals("Jalview", sf.getFeatureGroup());
2000 assertEquals(1, sf.getBegin());
2001 assertEquals(1, sf.getEnd());
2002 assertEquals("p.Lys1Asn", sf.getDescription());
2003 assertEquals("var4", sf.getValue("ID"));
2004 assertEquals("Benign", sf.getValue("clinical_significance"));
2005 assertEquals("ID=var4;clinical_significance=Benign", sf.getAttributes());
2006 assertEquals(1, sf.links.size());
2008 "p.Lys1Asn var4|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var4",
2010 assertEquals("Jalview", sf.getFeatureGroup());
2012 assertEquals(3, sf.getBegin());
2013 assertEquals(3, sf.getEnd());
2014 assertEquals("p.Pro3His", sf.getDescription());
2015 assertEquals("var6", sf.getValue("ID"));
2016 assertEquals("Good", sf.getValue("clinical_significance"));
2017 assertEquals("ID=var6;clinical_significance=Good", sf.getAttributes());
2018 assertEquals(1, sf.links.size());
2020 "p.Pro3His var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6",
2022 // var5 generates two distinct protein variant features
2023 assertEquals("Jalview", sf.getFeatureGroup());
2025 assertEquals(3, sf.getBegin());
2026 assertEquals(3, sf.getEnd());
2027 assertEquals("p.Pro3Arg", sf.getDescription());
2028 assertEquals("var6", sf.getValue("ID"));
2029 assertEquals("Good", sf.getValue("clinical_significance"));
2030 assertEquals("ID=var6;clinical_significance=Good", sf.getAttributes());
2031 assertEquals(1, sf.links.size());
2033 "p.Pro3Arg var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6",
2035 assertEquals("Jalview", sf.getFeatureGroup());
2039 * Tests for the method that maps the subset of a dna sequence that has CDS
2040 * (or subtype) feature, with CDS strand = '-' (reverse)
2042 // test turned off as currently findCdsPositions is not strand-dependent
2043 // left in case it comes around again...
2044 @Test(groups = "Functional", enabled = false)
2045 public void testFindCdsPositions_reverseStrand()
2047 SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
2048 dnaSeq.createDatasetSequence();
2049 SequenceI ds = dnaSeq.getDatasetSequence();
2052 SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
2054 ds.addSequenceFeature(sf);
2055 // exon feature should be ignored here
2056 sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
2057 ds.addSequenceFeature(sf);
2058 // CDS for dna 10-12
2059 sf = new SequenceFeature("CDS_predicted", "", 10, 12, 0f, null);
2061 ds.addSequenceFeature(sf);
2063 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
2065 * verify ranges { [12-10], [6-4] }
2067 assertEquals(6, MappingUtils.getLength(ranges));
2068 assertEquals(2, ranges.size());
2069 assertEquals(12, ranges.get(0)[0]);
2070 assertEquals(10, ranges.get(0)[1]);
2071 assertEquals(6, ranges.get(1)[0]);
2072 assertEquals(4, ranges.get(1)[1]);
2076 * Tests for the method that maps the subset of a dna sequence that has CDS
2077 * (or subtype) feature - reverse strand case where the start codon is
2080 @Test(groups = "Functional", enabled = false)
2081 // test turned off as currently findCdsPositions is not strand-dependent
2082 // left in case it comes around again...
2083 public void testFindCdsPositions_reverseStrandThreePrimeIncomplete()
2085 SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
2086 dnaSeq.createDatasetSequence();
2087 SequenceI ds = dnaSeq.getDatasetSequence();
2090 SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
2092 ds.addSequenceFeature(sf);
2093 // CDS for dna 13-15
2094 sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
2096 sf.setPhase("2"); // skip 2 bases to start of next codon
2097 ds.addSequenceFeature(sf);
2099 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
2102 * check the mapping starts with the first complete codon
2103 * expect ranges [13, 13], [9, 5]
2105 assertEquals(6, MappingUtils.getLength(ranges));
2106 assertEquals(2, ranges.size());
2107 assertEquals(13, ranges.get(0)[0]);
2108 assertEquals(13, ranges.get(0)[1]);
2109 assertEquals(9, ranges.get(1)[0]);
2110 assertEquals(5, ranges.get(1)[1]);
2113 @Test(groups = "Functional")
2114 public void testAlignAs_alternateTranscriptsUngapped()
2116 SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa");
2117 SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA");
2118 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
2119 ((Alignment) dna).createDatasetAlignment();
2120 SequenceI cds1 = new Sequence("cds1", "GGGTTT");
2121 SequenceI cds2 = new Sequence("cds2", "CCCAAA");
2122 AlignmentI cds = new Alignment(new SequenceI[] { cds1, cds2 });
2123 ((Alignment) cds).createDatasetAlignment();
2125 AlignedCodonFrame acf = new AlignedCodonFrame();
2126 MapList map = new MapList(new int[] { 4, 9 }, new int[] { 1, 6 }, 1, 1);
2127 acf.addMap(dna1.getDatasetSequence(), cds1.getDatasetSequence(), map);
2128 map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 6 }, 1, 1);
2129 acf.addMap(dna2.getDatasetSequence(), cds2.getDatasetSequence(), map);
2132 * verify CDS alignment is as:
2133 * cccGGGTTTaaa (cdna)
2134 * CCCgggtttAAA (cdna)
2136 * ---GGGTTT--- (cds)
2137 * CCC------AAA (cds)
2139 dna.addCodonFrame(acf);
2140 AlignmentUtils.alignAs(cds, dna);
2141 assertEquals("---GGGTTT", cds.getSequenceAt(0).getSequenceAsString());
2142 assertEquals("CCC------AAA", cds.getSequenceAt(1).getSequenceAsString());
2145 @Test(groups = { "Functional" })
2146 public void testAddMappedPositions()
2148 SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g");
2149 SequenceI seq1 = new Sequence("cds", "AAATTT");
2150 from.createDatasetSequence();
2151 seq1.createDatasetSequence();
2152 Mapping mapping = new Mapping(seq1, new MapList(
2153 new int[] { 3, 6, 9, 10 },
2154 new int[] { 1, 6 }, 1, 1));
2155 Map<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
2156 AlignmentUtils.addMappedPositions(seq1, from, mapping, map);
2159 * verify map has seq1 residues in columns 3,4,6,7,11,12
2161 assertEquals(6, map.size());
2162 assertEquals('A', map.get(3).get(seq1).charValue());
2163 assertEquals('A', map.get(4).get(seq1).charValue());
2164 assertEquals('A', map.get(6).get(seq1).charValue());
2165 assertEquals('T', map.get(7).get(seq1).charValue());
2166 assertEquals('T', map.get(11).get(seq1).charValue());
2167 assertEquals('T', map.get(12).get(seq1).charValue());
2175 * Test case where the mapping 'from' range includes a stop codon which is
2176 * absent in the 'to' range
2178 @Test(groups = { "Functional" })
2179 public void testAddMappedPositions_withStopCodon()
2181 SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g");
2182 SequenceI seq1 = new Sequence("cds", "AAATTT");
2183 from.createDatasetSequence();
2184 seq1.createDatasetSequence();
2185 Mapping mapping = new Mapping(seq1, new MapList(
2186 new int[] { 3, 6, 9, 10 },
2187 new int[] { 1, 6 }, 1, 1));
2188 Map<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
2189 AlignmentUtils.addMappedPositions(seq1, from, mapping, map);
2192 * verify map has seq1 residues in columns 3,4,6,7,11,12
2194 assertEquals(6, map.size());
2195 assertEquals('A', map.get(3).get(seq1).charValue());
2196 assertEquals('A', map.get(4).get(seq1).charValue());
2197 assertEquals('A', map.get(6).get(seq1).charValue());
2198 assertEquals('T', map.get(7).get(seq1).charValue());
2199 assertEquals('T', map.get(11).get(seq1).charValue());
2200 assertEquals('T', map.get(12).get(seq1).charValue());