2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import static org.testng.AssertJUnit.assertEquals;
24 import static org.testng.AssertJUnit.assertFalse;
25 import static org.testng.AssertJUnit.assertNull;
26 import static org.testng.AssertJUnit.assertSame;
27 import static org.testng.AssertJUnit.assertTrue;
29 import jalview.analysis.AlignmentUtils.DnaVariant;
30 import jalview.datamodel.AlignedCodonFrame;
31 import jalview.datamodel.Alignment;
32 import jalview.datamodel.AlignmentAnnotation;
33 import jalview.datamodel.AlignmentI;
34 import jalview.datamodel.Annotation;
35 import jalview.datamodel.DBRefEntry;
36 import jalview.datamodel.Mapping;
37 import jalview.datamodel.SearchResults;
38 import jalview.datamodel.SearchResults.Match;
39 import jalview.datamodel.Sequence;
40 import jalview.datamodel.SequenceFeature;
41 import jalview.datamodel.SequenceI;
42 import jalview.io.AppletFormatAdapter;
43 import jalview.io.FormatAdapter;
44 import jalview.util.MapList;
45 import jalview.util.MappingUtils;
47 import java.io.IOException;
48 import java.util.ArrayList;
49 import java.util.Arrays;
50 import java.util.Iterator;
51 import java.util.LinkedHashMap;
52 import java.util.List;
54 import java.util.TreeMap;
56 import org.testng.annotations.Test;
58 public class AlignmentUtilsTests
60 public static Sequence ts = new Sequence("short",
61 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklm");
63 @Test(groups = { "Functional" })
64 public void testExpandContext()
66 AlignmentI al = new Alignment(new Sequence[] {});
67 for (int i = 4; i < 14; i += 2)
69 SequenceI s1 = ts.deriveSequence().getSubSequence(i, i + 7);
72 System.out.println(new AppletFormatAdapter().formatSequences("Clustal",
74 for (int flnk = -1; flnk < 25; flnk++)
76 AlignmentI exp = AlignmentUtils.expandContext(al, flnk);
77 System.out.println("\nFlank size: " + flnk);
78 System.out.println(new AppletFormatAdapter().formatSequences(
79 "Clustal", exp, true));
83 * Full expansion to complete sequences
85 for (SequenceI sq : exp.getSequences())
87 String ung = sq.getSequenceAsString().replaceAll("-+", "");
88 final String errorMsg = "Flanking sequence not the same as original dataset sequence.\n"
91 + sq.getDatasetSequence().getSequenceAsString();
92 assertTrue(errorMsg, ung.equalsIgnoreCase(sq.getDatasetSequence()
93 .getSequenceAsString()));
99 * Last sequence is fully expanded, others have leading gaps to match
101 assertTrue(exp.getSequenceAt(4).getSequenceAsString()
103 assertTrue(exp.getSequenceAt(3).getSequenceAsString()
104 .startsWith("--abc"));
105 assertTrue(exp.getSequenceAt(2).getSequenceAsString()
106 .startsWith("----abc"));
107 assertTrue(exp.getSequenceAt(1).getSequenceAsString()
108 .startsWith("------abc"));
109 assertTrue(exp.getSequenceAt(0).getSequenceAsString()
110 .startsWith("--------abc"));
116 * Test that annotations are correctly adjusted by expandContext
118 @Test(groups = { "Functional" })
119 public void testExpandContext_annotation()
121 AlignmentI al = new Alignment(new Sequence[] {});
122 SequenceI ds = new Sequence("Seq1", "ABCDEFGHI");
124 SequenceI seq1 = ds.deriveSequence().getSubSequence(3, 6);
125 al.addSequence(seq1);
128 * Annotate DEF with 4/5/6 respectively
130 Annotation[] anns = new Annotation[] { new Annotation(4),
131 new Annotation(5), new Annotation(6) };
132 AlignmentAnnotation ann = new AlignmentAnnotation("SS",
133 "secondary structure", anns);
134 seq1.addAlignmentAnnotation(ann);
137 * The annotations array should match aligned positions
139 assertEquals(3, ann.annotations.length);
140 assertEquals(4, ann.annotations[0].value, 0.001);
141 assertEquals(5, ann.annotations[1].value, 0.001);
142 assertEquals(6, ann.annotations[2].value, 0.001);
145 * Check annotation to sequence position mappings before expanding the
146 * sequence; these are set up in Sequence.addAlignmentAnnotation ->
147 * Annotation.setSequenceRef -> createSequenceMappings
149 assertNull(ann.getAnnotationForPosition(1));
150 assertNull(ann.getAnnotationForPosition(2));
151 assertNull(ann.getAnnotationForPosition(3));
152 assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
153 assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
154 assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
155 assertNull(ann.getAnnotationForPosition(7));
156 assertNull(ann.getAnnotationForPosition(8));
157 assertNull(ann.getAnnotationForPosition(9));
160 * Expand the subsequence to the full sequence abcDEFghi
162 AlignmentI expanded = AlignmentUtils.expandContext(al, -1);
163 assertEquals("abcDEFghi", expanded.getSequenceAt(0)
164 .getSequenceAsString());
167 * Confirm the alignment and sequence have the same SS annotation,
168 * referencing the expanded sequence
170 ann = expanded.getSequenceAt(0).getAnnotation()[0];
171 assertSame(ann, expanded.getAlignmentAnnotation()[0]);
172 assertSame(expanded.getSequenceAt(0), ann.sequenceRef);
175 * The annotations array should have null values except for annotated
178 assertNull(ann.annotations[0]);
179 assertNull(ann.annotations[1]);
180 assertNull(ann.annotations[2]);
181 assertEquals(4, ann.annotations[3].value, 0.001);
182 assertEquals(5, ann.annotations[4].value, 0.001);
183 assertEquals(6, ann.annotations[5].value, 0.001);
184 assertNull(ann.annotations[6]);
185 assertNull(ann.annotations[7]);
186 assertNull(ann.annotations[8]);
189 * sequence position mappings should be unchanged
191 assertNull(ann.getAnnotationForPosition(1));
192 assertNull(ann.getAnnotationForPosition(2));
193 assertNull(ann.getAnnotationForPosition(3));
194 assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
195 assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
196 assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
197 assertNull(ann.getAnnotationForPosition(7));
198 assertNull(ann.getAnnotationForPosition(8));
199 assertNull(ann.getAnnotationForPosition(9));
203 * Test method that returns a map of lists of sequences by sequence name.
205 * @throws IOException
207 @Test(groups = { "Functional" })
208 public void testGetSequencesByName() throws IOException
210 final String data = ">Seq1Name\nKQYL\n" + ">Seq2Name\nRFPW\n"
211 + ">Seq1Name\nABCD\n";
212 AlignmentI al = loadAlignment(data, "FASTA");
213 Map<String, List<SequenceI>> map = AlignmentUtils
214 .getSequencesByName(al);
215 assertEquals(2, map.keySet().size());
216 assertEquals(2, map.get("Seq1Name").size());
217 assertEquals("KQYL", map.get("Seq1Name").get(0).getSequenceAsString());
218 assertEquals("ABCD", map.get("Seq1Name").get(1).getSequenceAsString());
219 assertEquals(1, map.get("Seq2Name").size());
220 assertEquals("RFPW", map.get("Seq2Name").get(0).getSequenceAsString());
224 * Helper method to load an alignment and ensure dataset sequences are set up.
230 * @throws IOException
232 protected AlignmentI loadAlignment(final String data, String format)
235 AlignmentI a = new FormatAdapter().readFile(data,
236 AppletFormatAdapter.PASTE, format);
242 * Test mapping of protein to cDNA, for the case where we have no sequence
243 * cross-references, so mappings are made first-served 1-1 where sequences
246 * @throws IOException
248 @Test(groups = { "Functional" })
249 public void testMapProteinAlignmentToCdna_noXrefs() throws IOException
251 List<SequenceI> protseqs = new ArrayList<SequenceI>();
252 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
253 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
254 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
255 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
256 protein.setDataset(null);
258 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
259 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
260 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAA")); // = EIQ
261 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
262 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
263 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
264 cdna.setDataset(null);
266 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
268 // 3 mappings made, each from 1 to 1 sequence
269 assertEquals(3, protein.getCodonFrames().size());
270 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
271 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
272 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
274 // V12345 mapped to A22222
275 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
277 assertEquals(1, acf.getdnaSeqs().length);
278 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
279 acf.getdnaSeqs()[0]);
280 Mapping[] protMappings = acf.getProtMappings();
281 assertEquals(1, protMappings.length);
282 MapList mapList = protMappings[0].getMap();
283 assertEquals(3, mapList.getFromRatio());
284 assertEquals(1, mapList.getToRatio());
285 assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
287 assertEquals(1, mapList.getFromRanges().size());
288 assertTrue(Arrays.equals(new int[] { 1, 3 },
289 mapList.getToRanges().get(0)));
290 assertEquals(1, mapList.getToRanges().size());
292 // V12346 mapped to A33333
293 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
294 assertEquals(1, acf.getdnaSeqs().length);
295 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
296 acf.getdnaSeqs()[0]);
298 // V12347 mapped to A11111
299 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
300 assertEquals(1, acf.getdnaSeqs().length);
301 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
302 acf.getdnaSeqs()[0]);
304 // no mapping involving the 'extra' A44444
305 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
309 * Test for the alignSequenceAs method that takes two sequences and a mapping.
311 @Test(groups = { "Functional" })
312 public void testAlignSequenceAs_withMapping_noIntrons()
314 MapList map = new MapList(new int[] { 1, 6 }, new int[] { 1, 2 }, 3, 1);
317 * No existing gaps in dna:
319 checkAlignSequenceAs("GGGAAA", "-A-L-", false, false, map,
323 * Now introduce gaps in dna but ignore them when realigning.
325 checkAlignSequenceAs("-G-G-G-A-A-A-", "-A-L-", false, false, map,
329 * Now include gaps in dna when realigning. First retaining 'mapped' gaps
330 * only, i.e. those within the exon region.
332 checkAlignSequenceAs("-G-G--G-A--A-A-", "-A-L-", true, false, map,
333 "---G-G--G---A--A-A");
336 * Include all gaps in dna when realigning (within and without the exon
337 * region). The leading gap, and the gaps between codons, are subsumed by
338 * the protein alignment gap.
340 checkAlignSequenceAs("-G-GG--AA-A---", "-A-L-", true, true, map,
341 "---G-GG---AA-A---");
344 * Include only unmapped gaps in dna when realigning (outside the exon
345 * region). The leading gap, and the gaps between codons, are subsumed by
346 * the protein alignment gap.
348 checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", false, true, map,
353 * Test for the alignSequenceAs method that takes two sequences and a mapping.
355 @Test(groups = { "Functional" })
356 public void testAlignSequenceAs_withMapping_withIntrons()
359 * Exons at codon 2 (AAA) and 4 (TTT)
361 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
362 new int[] { 1, 2 }, 3, 1);
365 * Simple case: no gaps in dna
367 checkAlignSequenceAs("GGGAAACCCTTTGGG", "--A-L-", false, false, map,
368 "GGG---AAACCCTTTGGG");
371 * Add gaps to dna - but ignore when realigning.
373 checkAlignSequenceAs("-G-G-G--A--A---AC-CC-T-TT-GG-G-", "--A-L-",
374 false, false, map, "GGG---AAACCCTTTGGG");
377 * Add gaps to dna - include within exons only when realigning.
379 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
380 true, false, map, "GGG---A--A---ACCCT-TTGGG");
383 * Include gaps outside exons only when realigning.
385 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
386 false, true, map, "-G-G-GAAAC-CCTTT-GG-G-");
389 * Include gaps following first intron if we are 'preserving mapped gaps'
391 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
392 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
395 * Include all gaps in dna when realigning.
397 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
398 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
402 * Test for the case where not all of the protein sequence is mapped to cDNA.
404 @Test(groups = { "Functional" })
405 public void testAlignSequenceAs_withMapping_withUnmappedProtein()
408 * Exons at codon 2 (AAA) and 4 (TTT) mapped to A and P
410 final MapList map = new MapList(new int[] { 4, 6, 10, 12 }, new int[] {
414 * -L- 'aligns' ccc------
416 checkAlignSequenceAs("gggAAAcccTTTggg", "-A-L-P-", false, false, map,
417 "gggAAAccc------TTTggg");
421 * Helper method that performs and verifies the method under test.
424 * the sequence to be realigned
426 * the sequence whose alignment is to be copied
427 * @param preserveMappedGaps
428 * @param preserveUnmappedGaps
432 protected void checkAlignSequenceAs(final String alignee,
433 final String alignModel, final boolean preserveMappedGaps,
434 final boolean preserveUnmappedGaps, MapList map,
435 final String expected)
437 SequenceI alignMe = new Sequence("Seq1", alignee);
438 alignMe.createDatasetSequence();
439 SequenceI alignFrom = new Sequence("Seq2", alignModel);
440 alignFrom.createDatasetSequence();
441 AlignedCodonFrame acf = new AlignedCodonFrame();
442 acf.addMap(alignMe.getDatasetSequence(), alignFrom.getDatasetSequence(), map);
444 AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "---", '-',
445 preserveMappedGaps, preserveUnmappedGaps);
446 assertEquals(expected, alignMe.getSequenceAsString());
450 * Test for the alignSequenceAs method where we preserve gaps in introns only.
452 @Test(groups = { "Functional" })
453 public void testAlignSequenceAs_keepIntronGapsOnly()
457 * Intron GGGAAA followed by exon CCCTTT
459 MapList map = new MapList(new int[] { 7, 12 }, new int[] { 1, 2 }, 3, 1);
461 checkAlignSequenceAs("GG-G-AA-A-C-CC-T-TT", "AL", false, true, map,
466 * Test the method that realigns protein to match mapped codon alignment.
468 @Test(groups = { "Functional" })
469 public void testAlignProteinAsDna()
471 // seq1 codons are [1,2,3] [4,5,6] [7,8,9] [10,11,12]
472 SequenceI dna1 = new Sequence("Seq1", "TGCCATTACCAG-");
473 // seq2 codons are [1,3,4] [5,6,7] [8,9,10] [11,12,13]
474 SequenceI dna2 = new Sequence("Seq2", "T-GCCATTACCAG");
475 // seq3 codons are [1,2,3] [4,5,7] [8,9,10] [11,12,13]
476 SequenceI dna3 = new Sequence("Seq3", "TGCCA-TTACCAG");
477 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
478 dna.setDataset(null);
480 // protein alignment will be realigned like dna
481 SequenceI prot1 = new Sequence("Seq1", "CHYQ");
482 SequenceI prot2 = new Sequence("Seq2", "CHYQ");
483 SequenceI prot3 = new Sequence("Seq3", "CHYQ");
484 SequenceI prot4 = new Sequence("Seq4", "R-QSV"); // unmapped, unchanged
485 AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
487 protein.setDataset(null);
489 MapList map = new MapList(new int[] { 1, 12 }, new int[] { 1, 4 }, 3, 1);
490 AlignedCodonFrame acf = new AlignedCodonFrame();
491 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
492 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
493 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
494 ArrayList<AlignedCodonFrame> acfs = new ArrayList<AlignedCodonFrame>();
496 protein.setCodonFrames(acfs);
499 * Translated codon order is [1,2,3] [1,3,4] [4,5,6] [4,5,7] [5,6,7] [7,8,9]
500 * [8,9,10] [10,11,12] [11,12,13]
502 AlignmentUtils.alignProteinAsDna(protein, dna);
503 assertEquals("C-H--Y-Q-", prot1.getSequenceAsString());
504 assertEquals("-C--H-Y-Q", prot2.getSequenceAsString());
505 assertEquals("C--H--Y-Q", prot3.getSequenceAsString());
506 assertEquals("R-QSV", prot4.getSequenceAsString());
510 * Test the method that tests whether a CDNA sequence translates to a protein
513 @Test(groups = { "Functional" })
514 public void testTranslatesAs()
516 // null arguments check
517 assertFalse(AlignmentUtils.translatesAs(null, 0, null));
518 assertFalse(AlignmentUtils.translatesAs(new char[] { 't' }, 0, null));
519 assertFalse(AlignmentUtils.translatesAs(null, 0, new char[] { 'a' }));
521 // straight translation
522 assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0,
523 "FPKG".toCharArray()));
524 // with extra start codon (not in protein)
525 assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(),
526 3, "FPKG".toCharArray()));
527 // with stop codon1 (not in protein)
528 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
529 0, "FPKG".toCharArray()));
530 // with stop codon1 (in protein as *)
531 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
532 0, "FPKG*".toCharArray()));
533 // with stop codon2 (not in protein)
534 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtag".toCharArray(),
535 0, "FPKG".toCharArray()));
536 // with stop codon3 (not in protein)
537 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtga".toCharArray(),
538 0, "FPKG".toCharArray()));
539 // with start and stop codon1
540 assertTrue(AlignmentUtils.translatesAs(
541 "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG".toCharArray()));
542 // with start and stop codon1 (in protein as *)
543 assertTrue(AlignmentUtils.translatesAs(
544 "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG*".toCharArray()));
545 // with start and stop codon2
546 assertTrue(AlignmentUtils.translatesAs(
547 "atgtttcccaaagggtag".toCharArray(), 3, "FPKG".toCharArray()));
548 // with start and stop codon3
549 assertTrue(AlignmentUtils.translatesAs(
550 "atgtttcccaaagggtga".toCharArray(), 3, "FPKG".toCharArray()));
552 // with embedded stop codons
553 assertTrue(AlignmentUtils.translatesAs(
554 "atgtttTAGcccaaaTAAgggtga".toCharArray(), 3,
555 "F*PK*G".toCharArray()));
558 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
559 0, "FPMG".toCharArray()));
562 assertFalse(AlignmentUtils.translatesAs("tttcccaaagg".toCharArray(), 0,
563 "FPKG".toCharArray()));
566 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
567 0, "FPK".toCharArray()));
569 // overlong dna (doesn't end in stop codon)
570 assertFalse(AlignmentUtils.translatesAs(
571 "tttcccaaagggttt".toCharArray(), 0, "FPKG".toCharArray()));
573 // dna + stop codon + more
574 assertFalse(AlignmentUtils.translatesAs(
575 "tttcccaaagggttaga".toCharArray(), 0, "FPKG".toCharArray()));
578 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
579 0, "FPKGQ".toCharArray()));
583 * Test mapping of protein to cDNA, for cases where the cDNA has start and/or
584 * stop codons in addition to the protein coding sequence.
586 * @throws IOException
588 @Test(groups = { "Functional" })
589 public void testMapProteinAlignmentToCdna_withStartAndStopCodons()
592 List<SequenceI> protseqs = new ArrayList<SequenceI>();
593 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
594 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
595 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
596 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
597 protein.setDataset(null);
599 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
601 dnaseqs.add(new Sequence("EMBL|A11111", "ATGTCAGCACGC"));
603 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAATAA"));
604 // = start +EIQ + stop
605 dnaseqs.add(new Sequence("EMBL|A33333", "ATGGAAATCCAGTAG"));
606 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG"));
607 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
608 cdna.setDataset(null);
610 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
612 // 3 mappings made, each from 1 to 1 sequence
613 assertEquals(3, protein.getCodonFrames().size());
614 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
615 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
616 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
618 // V12345 mapped from A22222
619 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
621 assertEquals(1, acf.getdnaSeqs().length);
622 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
623 acf.getdnaSeqs()[0]);
624 Mapping[] protMappings = acf.getProtMappings();
625 assertEquals(1, protMappings.length);
626 MapList mapList = protMappings[0].getMap();
627 assertEquals(3, mapList.getFromRatio());
628 assertEquals(1, mapList.getToRatio());
629 assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
631 assertEquals(1, mapList.getFromRanges().size());
632 assertTrue(Arrays.equals(new int[] { 1, 3 },
633 mapList.getToRanges().get(0)));
634 assertEquals(1, mapList.getToRanges().size());
636 // V12346 mapped from A33333 starting position 4
637 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
638 assertEquals(1, acf.getdnaSeqs().length);
639 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
640 acf.getdnaSeqs()[0]);
641 protMappings = acf.getProtMappings();
642 assertEquals(1, protMappings.length);
643 mapList = protMappings[0].getMap();
644 assertEquals(3, mapList.getFromRatio());
645 assertEquals(1, mapList.getToRatio());
646 assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
648 assertEquals(1, mapList.getFromRanges().size());
649 assertTrue(Arrays.equals(new int[] { 1, 3 },
650 mapList.getToRanges().get(0)));
651 assertEquals(1, mapList.getToRanges().size());
653 // V12347 mapped to A11111 starting position 4
654 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
655 assertEquals(1, acf.getdnaSeqs().length);
656 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
657 acf.getdnaSeqs()[0]);
658 protMappings = acf.getProtMappings();
659 assertEquals(1, protMappings.length);
660 mapList = protMappings[0].getMap();
661 assertEquals(3, mapList.getFromRatio());
662 assertEquals(1, mapList.getToRatio());
663 assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
665 assertEquals(1, mapList.getFromRanges().size());
666 assertTrue(Arrays.equals(new int[] { 1, 3 },
667 mapList.getToRanges().get(0)));
668 assertEquals(1, mapList.getToRanges().size());
670 // no mapping involving the 'extra' A44444
671 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
675 * Test mapping of protein to cDNA, for the case where we have some sequence
676 * cross-references. Verify that 1-to-many mappings are made where
677 * cross-references exist and sequences are mappable.
679 * @throws IOException
681 @Test(groups = { "Functional" })
682 public void testMapProteinAlignmentToCdna_withXrefs() throws IOException
684 List<SequenceI> protseqs = new ArrayList<SequenceI>();
685 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
686 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
687 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
688 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
689 protein.setDataset(null);
691 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
692 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
693 dnaseqs.add(new Sequence("EMBL|A22222", "ATGGAGATACAA")); // = start + EIQ
694 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
695 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
696 dnaseqs.add(new Sequence("EMBL|A55555", "GAGATTCAG")); // = EIQ
697 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[5]));
698 cdna.setDataset(null);
700 // Xref A22222 to V12345 (should get mapped)
701 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
702 // Xref V12345 to A44444 (should get mapped)
703 protseqs.get(0).addDBRef(new DBRefEntry("EMBL", "1", "A44444"));
704 // Xref A33333 to V12347 (sequence mismatch - should not get mapped)
705 dnaseqs.get(2).addDBRef(new DBRefEntry("UNIPROT", "1", "V12347"));
706 // as V12345 is mapped to A22222 and A44444, this leaves V12346 unmapped.
707 // it should get paired up with the unmapped A33333
708 // A11111 should be mapped to V12347
709 // A55555 is spare and has no xref so is not mapped
711 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
713 // 4 protein mappings made for 3 proteins, 2 to V12345, 1 each to V12346/7
714 assertEquals(3, protein.getCodonFrames().size());
715 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
716 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
717 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
719 // one mapping for each of the first 4 cDNA sequences
720 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
721 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
722 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(2)).size());
723 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(3)).size());
725 // V12345 mapped to A22222 and A44444
726 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
728 assertEquals(2, acf.getdnaSeqs().length);
729 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
730 acf.getdnaSeqs()[0]);
731 assertEquals(cdna.getSequenceAt(3).getDatasetSequence(),
732 acf.getdnaSeqs()[1]);
734 // V12346 mapped to A33333
735 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
736 assertEquals(1, acf.getdnaSeqs().length);
737 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
738 acf.getdnaSeqs()[0]);
740 // V12347 mapped to A11111
741 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
742 assertEquals(1, acf.getdnaSeqs().length);
743 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
744 acf.getdnaSeqs()[0]);
746 // no mapping involving the 'extra' A55555
747 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(4)).isEmpty());
751 * Test mapping of protein to cDNA, for the case where we have some sequence
752 * cross-references. Verify that once we have made an xref mapping we don't
753 * also map un-xrefd sequeces.
755 * @throws IOException
757 @Test(groups = { "Functional" })
758 public void testMapProteinAlignmentToCdna_prioritiseXrefs()
761 List<SequenceI> protseqs = new ArrayList<SequenceI>();
762 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
763 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
764 AlignmentI protein = new Alignment(
765 protseqs.toArray(new SequenceI[protseqs.size()]));
766 protein.setDataset(null);
768 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
769 dnaseqs.add(new Sequence("EMBL|A11111", "GAAATCCAG")); // = EIQ
770 dnaseqs.add(new Sequence("EMBL|A22222", "GAAATTCAG")); // = EIQ
771 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[dnaseqs
773 cdna.setDataset(null);
775 // Xref A22222 to V12345 (should get mapped)
776 // A11111 should then be mapped to the unmapped V12346
777 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
779 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
781 // 2 protein mappings made
782 assertEquals(2, protein.getCodonFrames().size());
783 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
784 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
786 // one mapping for each of the cDNA sequences
787 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
788 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
790 // V12345 mapped to A22222
791 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
793 assertEquals(1, acf.getdnaSeqs().length);
794 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
795 acf.getdnaSeqs()[0]);
797 // V12346 mapped to A11111
798 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
799 assertEquals(1, acf.getdnaSeqs().length);
800 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
801 acf.getdnaSeqs()[0]);
805 * Test the method that shows or hides sequence annotations by type(s) and
808 @Test(groups = { "Functional" })
809 public void testShowOrHideSequenceAnnotations()
811 SequenceI seq1 = new Sequence("Seq1", "AAA");
812 SequenceI seq2 = new Sequence("Seq2", "BBB");
813 SequenceI seq3 = new Sequence("Seq3", "CCC");
814 Annotation[] anns = new Annotation[] { new Annotation(2f) };
815 AlignmentAnnotation ann1 = new AlignmentAnnotation("Structure", "ann1",
817 ann1.setSequenceRef(seq1);
818 AlignmentAnnotation ann2 = new AlignmentAnnotation("Structure", "ann2",
820 ann2.setSequenceRef(seq2);
821 AlignmentAnnotation ann3 = new AlignmentAnnotation("Structure", "ann3",
823 AlignmentAnnotation ann4 = new AlignmentAnnotation("Temp", "ann4", anns);
824 ann4.setSequenceRef(seq1);
825 AlignmentAnnotation ann5 = new AlignmentAnnotation("Temp", "ann5", anns);
826 ann5.setSequenceRef(seq2);
827 AlignmentAnnotation ann6 = new AlignmentAnnotation("Temp", "ann6", anns);
828 AlignmentI al = new Alignment(new SequenceI[] { seq1, seq2, seq3 });
829 al.addAnnotation(ann1); // Structure for Seq1
830 al.addAnnotation(ann2); // Structure for Seq2
831 al.addAnnotation(ann3); // Structure for no sequence
832 al.addAnnotation(ann4); // Temp for seq1
833 al.addAnnotation(ann5); // Temp for seq2
834 al.addAnnotation(ann6); // Temp for no sequence
835 List<String> types = new ArrayList<String>();
836 List<SequenceI> scope = new ArrayList<SequenceI>();
839 * Set all sequence related Structure to hidden (ann1, ann2)
841 types.add("Structure");
842 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
844 assertFalse(ann1.visible);
845 assertFalse(ann2.visible);
846 assertTrue(ann3.visible); // not sequence-related, not affected
847 assertTrue(ann4.visible); // not Structure, not affected
848 assertTrue(ann5.visible); // "
849 assertTrue(ann6.visible); // not sequence-related, not affected
852 * Set Temp in {seq1, seq3} to hidden
858 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, false,
860 assertFalse(ann1.visible); // unchanged
861 assertFalse(ann2.visible); // unchanged
862 assertTrue(ann3.visible); // not sequence-related, not affected
863 assertFalse(ann4.visible); // Temp for seq1 hidden
864 assertTrue(ann5.visible); // not in scope, not affected
865 assertTrue(ann6.visible); // not sequence-related, not affected
868 * Set Temp in all sequences to hidden
874 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
876 assertFalse(ann1.visible); // unchanged
877 assertFalse(ann2.visible); // unchanged
878 assertTrue(ann3.visible); // not sequence-related, not affected
879 assertFalse(ann4.visible); // Temp for seq1 hidden
880 assertFalse(ann5.visible); // Temp for seq2 hidden
881 assertTrue(ann6.visible); // not sequence-related, not affected
884 * Set all types in {seq1, seq3} to visible
890 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, true,
892 assertTrue(ann1.visible); // Structure for seq1 set visible
893 assertFalse(ann2.visible); // not in scope, unchanged
894 assertTrue(ann3.visible); // not sequence-related, not affected
895 assertTrue(ann4.visible); // Temp for seq1 set visible
896 assertFalse(ann5.visible); // not in scope, unchanged
897 assertTrue(ann6.visible); // not sequence-related, not affected
900 * Set all types in all scope to hidden
902 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, true,
904 assertFalse(ann1.visible);
905 assertFalse(ann2.visible);
906 assertTrue(ann3.visible); // not sequence-related, not affected
907 assertFalse(ann4.visible);
908 assertFalse(ann5.visible);
909 assertTrue(ann6.visible); // not sequence-related, not affected
913 * Tests for the method that checks if one sequence cross-references another
915 @Test(groups = { "Functional" })
916 public void testHasCrossRef()
918 assertFalse(AlignmentUtils.hasCrossRef(null, null));
919 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
920 assertFalse(AlignmentUtils.hasCrossRef(seq1, null));
921 assertFalse(AlignmentUtils.hasCrossRef(null, seq1));
922 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
923 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
926 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20193"));
927 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
929 // case-insensitive; version number is ignored
930 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20192"));
931 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
934 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
935 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
936 // test is one-way only
937 assertFalse(AlignmentUtils.hasCrossRef(seq2, seq1));
941 * Tests for the method that checks if either sequence cross-references the
944 @Test(groups = { "Functional" })
945 public void testHaveCrossRef()
947 assertFalse(AlignmentUtils.hasCrossRef(null, null));
948 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
949 assertFalse(AlignmentUtils.haveCrossRef(seq1, null));
950 assertFalse(AlignmentUtils.haveCrossRef(null, seq1));
951 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
952 assertFalse(AlignmentUtils.haveCrossRef(seq1, seq2));
954 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
955 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
956 // next is true for haveCrossRef, false for hasCrossRef
957 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
959 // now the other way round
960 seq1.setDBRefs(null);
961 seq2.addDBRef(new DBRefEntry("EMBL", "1", "A12345"));
962 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
963 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
966 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
967 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
968 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
972 * Test the method that extracts the cds-only part of a dna alignment.
974 @Test(groups = { "Functional" })
975 public void testMakeCdsAlignment()
977 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
978 SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC");
979 SequenceI pep1 = new Sequence("pep1", "GF");
980 SequenceI pep2 = new Sequence("pep2", "GFP");
981 dna1.createDatasetSequence();
982 dna2.createDatasetSequence();
983 pep1.createDatasetSequence();
984 pep2.createDatasetSequence();
985 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 6, 0f,
987 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 10, 12, 0f,
989 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds3", 1, 3, 0f,
991 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds4", 7, 9, 0f,
993 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds5", 13, 15, 0f,
995 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
996 dna.setDataset(null);
998 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
999 new int[] { 1, 2 }, 3, 1);
1000 AlignedCodonFrame acf = new AlignedCodonFrame();
1001 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1002 dna.addCodonFrame(acf);
1003 map = new MapList(new int[] { 1, 3, 7, 9, 13, 15 }, new int[] { 1, 3 },
1005 acf = new AlignedCodonFrame();
1006 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
1007 dna.addCodonFrame(acf);
1010 * execute method under test:
1012 AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
1013 dna1, dna2 }, dna.getDataset());
1015 assertEquals(2, cds.getSequences().size());
1016 assertEquals("GGGTTT", cds.getSequenceAt(0)
1017 .getSequenceAsString());
1018 assertEquals("GGGTTTCCC", cds.getSequenceAt(1)
1019 .getSequenceAsString());
1022 * verify shared, extended alignment dataset
1024 assertSame(dna.getDataset(), cds.getDataset());
1025 assertTrue(dna.getDataset().getSequences()
1026 .contains(cds.getSequenceAt(0).getDatasetSequence()));
1027 assertTrue(dna.getDataset().getSequences()
1028 .contains(cds.getSequenceAt(1).getDatasetSequence()));
1031 * Verify mappings from CDS to peptide and cDNA to CDS
1032 * the mappings are on the shared alignment dataset
1034 assertSame(dna.getCodonFrames(), cds.getCodonFrames());
1035 List<AlignedCodonFrame> cdsMappings = cds.getCodonFrames();
1036 assertEquals(2, cdsMappings.size());
1039 * Mapping from pep1 to GGGTTT in first new exon sequence
1041 List<AlignedCodonFrame> pep1Mapping = MappingUtils
1042 .findMappingsForSequence(pep1, cdsMappings);
1043 assertEquals(1, pep1Mapping.size());
1045 SearchResults sr = MappingUtils
1046 .buildSearchResults(pep1, 1, cdsMappings);
1047 assertEquals(1, sr.getResults().size());
1048 Match m = sr.getResults().get(0);
1049 assertSame(cds.getSequenceAt(0).getDatasetSequence(),
1051 assertEquals(1, m.getStart());
1052 assertEquals(3, m.getEnd());
1054 sr = MappingUtils.buildSearchResults(pep1, 2, cdsMappings);
1055 m = sr.getResults().get(0);
1056 assertSame(cds.getSequenceAt(0).getDatasetSequence(),
1058 assertEquals(4, m.getStart());
1059 assertEquals(6, m.getEnd());
1062 * Mapping from pep2 to GGGTTTCCC in second new exon sequence
1064 List<AlignedCodonFrame> pep2Mapping = MappingUtils
1065 .findMappingsForSequence(pep2, cdsMappings);
1066 assertEquals(1, pep2Mapping.size());
1068 sr = MappingUtils.buildSearchResults(pep2, 1, cdsMappings);
1069 assertEquals(1, sr.getResults().size());
1070 m = sr.getResults().get(0);
1071 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1073 assertEquals(1, m.getStart());
1074 assertEquals(3, m.getEnd());
1076 sr = MappingUtils.buildSearchResults(pep2, 2, cdsMappings);
1077 m = sr.getResults().get(0);
1078 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1080 assertEquals(4, m.getStart());
1081 assertEquals(6, m.getEnd());
1083 sr = MappingUtils.buildSearchResults(pep2, 3, cdsMappings);
1084 m = sr.getResults().get(0);
1085 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1087 assertEquals(7, m.getStart());
1088 assertEquals(9, m.getEnd());
1092 * Test the method that makes a cds-only alignment from a DNA sequence and its
1093 * product mappings, for the case where there are multiple exon mappings to
1094 * different protein products.
1096 @Test(groups = { "Functional" })
1097 public void testMakeCdsAlignment_multipleProteins()
1099 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
1100 SequenceI pep1 = new Sequence("pep1", "GF"); // GGGTTT
1101 SequenceI pep2 = new Sequence("pep2", "KP"); // aaaccc
1102 SequenceI pep3 = new Sequence("pep3", "KF"); // aaaTTT
1103 dna1.createDatasetSequence();
1104 pep1.createDatasetSequence();
1105 pep2.createDatasetSequence();
1106 pep3.createDatasetSequence();
1107 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 6, 0f,
1109 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 10, 12, 0f,
1111 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds3", 1, 3, 0f,
1113 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds4", 7, 9, 0f,
1115 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds5", 1, 3, 0f,
1117 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds6", 10, 12, 0f,
1119 pep1.getDatasetSequence().addDBRef(
1120 new DBRefEntry("EMBLCDS", "2", "A12345"));
1121 pep2.getDatasetSequence().addDBRef(
1122 new DBRefEntry("EMBLCDS", "3", "A12346"));
1123 pep3.getDatasetSequence().addDBRef(
1124 new DBRefEntry("EMBLCDS", "4", "A12347"));
1127 * Create the CDS alignment
1129 AlignmentI dna = new Alignment(new SequenceI[] { dna1 });
1130 dna.setDataset(null);
1133 * Make the mappings from dna to protein
1135 // map ...GGG...TTT to GF
1136 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1137 new int[] { 1, 2 }, 3, 1);
1138 AlignedCodonFrame acf = new AlignedCodonFrame();
1139 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1140 dna.addCodonFrame(acf);
1142 // map aaa...ccc to KP
1143 map = new MapList(new int[] { 1, 3, 7, 9 }, new int[] { 1, 2 }, 3, 1);
1144 acf = new AlignedCodonFrame();
1145 acf.addMap(dna1.getDatasetSequence(), pep2.getDatasetSequence(), map);
1146 dna.addCodonFrame(acf);
1148 // map aaa......TTT to KF
1149 map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 2 }, 3, 1);
1150 acf = new AlignedCodonFrame();
1151 acf.addMap(dna1.getDatasetSequence(), pep3.getDatasetSequence(), map);
1152 dna.addCodonFrame(acf);
1155 * execute method under test
1157 AlignmentI cdsal = AlignmentUtils.makeCdsAlignment(
1158 new SequenceI[] { dna1 }, dna);
1161 * Verify we have 3 cds sequences, mapped to pep1/2/3 respectively
1163 List<SequenceI> cds = cdsal.getSequences();
1164 assertEquals(3, cds.size());
1167 * verify shared, extended alignment dataset
1169 assertSame(cdsal.getDataset(), dna.getDataset());
1170 assertTrue(dna.getDataset().getSequences()
1171 .contains(cds.get(0).getDatasetSequence()));
1172 assertTrue(dna.getDataset().getSequences()
1173 .contains(cds.get(1).getDatasetSequence()));
1174 assertTrue(dna.getDataset().getSequences()
1175 .contains(cds.get(2).getDatasetSequence()));
1178 * verify aligned cds sequences and their xrefs
1180 SequenceI cdsSeq = cds.get(0);
1181 assertEquals("GGGTTT", cdsSeq.getSequenceAsString());
1182 // assertEquals("dna1|A12345", cdsSeq.getName());
1183 assertEquals("dna1|pep1", cdsSeq.getName());
1184 // assertEquals(1, cdsSeq.getDBRefs().length);
1185 // DBRefEntry cdsRef = cdsSeq.getDBRefs()[0];
1186 // assertEquals("EMBLCDS", cdsRef.getSource());
1187 // assertEquals("2", cdsRef.getVersion());
1188 // assertEquals("A12345", cdsRef.getAccessionId());
1190 cdsSeq = cds.get(1);
1191 assertEquals("aaaccc", cdsSeq.getSequenceAsString());
1192 // assertEquals("dna1|A12346", cdsSeq.getName());
1193 assertEquals("dna1|pep2", cdsSeq.getName());
1194 // assertEquals(1, cdsSeq.getDBRefs().length);
1195 // cdsRef = cdsSeq.getDBRefs()[0];
1196 // assertEquals("EMBLCDS", cdsRef.getSource());
1197 // assertEquals("3", cdsRef.getVersion());
1198 // assertEquals("A12346", cdsRef.getAccessionId());
1200 cdsSeq = cds.get(2);
1201 assertEquals("aaaTTT", cdsSeq.getSequenceAsString());
1202 // assertEquals("dna1|A12347", cdsSeq.getName());
1203 assertEquals("dna1|pep3", cdsSeq.getName());
1204 // assertEquals(1, cdsSeq.getDBRefs().length);
1205 // cdsRef = cdsSeq.getDBRefs()[0];
1206 // assertEquals("EMBLCDS", cdsRef.getSource());
1207 // assertEquals("4", cdsRef.getVersion());
1208 // assertEquals("A12347", cdsRef.getAccessionId());
1211 * Verify there are mappings from each cds sequence to its protein product
1212 * and also to its dna source
1214 Iterator<AlignedCodonFrame> newMappingsIterator = cdsal
1215 .getCodonFrames().iterator();
1217 // mappings for dna1 - exon1 - pep1
1218 AlignedCodonFrame cdsMapping = newMappingsIterator.next();
1219 List<Mapping> dnaMappings = cdsMapping.getMappingsFromSequence(dna1);
1220 assertEquals(3, dnaMappings.size());
1221 assertSame(cds.get(0).getDatasetSequence(), dnaMappings.get(0)
1223 assertEquals("G(1) in CDS should map to G(4) in DNA", 4, dnaMappings
1224 .get(0).getMap().getToPosition(1));
1225 List<Mapping> peptideMappings = cdsMapping.getMappingsFromSequence(cds
1226 .get(0).getDatasetSequence());
1227 assertEquals(1, peptideMappings.size());
1228 assertSame(pep1.getDatasetSequence(), peptideMappings.get(0).getTo());
1230 // mappings for dna1 - cds2 - pep2
1231 assertSame(cds.get(1).getDatasetSequence(), dnaMappings.get(1)
1233 assertEquals("c(4) in CDS should map to c(7) in DNA", 7, dnaMappings
1234 .get(1).getMap().getToPosition(4));
1235 peptideMappings = cdsMapping.getMappingsFromSequence(cds.get(1)
1236 .getDatasetSequence());
1237 assertEquals(1, peptideMappings.size());
1238 assertSame(pep2.getDatasetSequence(), peptideMappings.get(0).getTo());
1240 // mappings for dna1 - cds3 - pep3
1241 assertSame(cds.get(2).getDatasetSequence(), dnaMappings.get(2)
1243 assertEquals("T(4) in CDS should map to T(10) in DNA", 10, dnaMappings
1244 .get(2).getMap().getToPosition(4));
1245 peptideMappings = cdsMapping.getMappingsFromSequence(cds.get(2)
1246 .getDatasetSequence());
1247 assertEquals(1, peptideMappings.size());
1248 assertSame(pep3.getDatasetSequence(), peptideMappings.get(0).getTo());
1251 @Test(groups = { "Functional" })
1252 public void testIsMappable()
1254 SequenceI dna1 = new Sequence("dna1", "cgCAGtgGT");
1255 SequenceI aa1 = new Sequence("aa1", "RSG");
1256 AlignmentI al1 = new Alignment(new SequenceI[] { dna1 });
1257 AlignmentI al2 = new Alignment(new SequenceI[] { aa1 });
1259 assertFalse(AlignmentUtils.isMappable(null, null));
1260 assertFalse(AlignmentUtils.isMappable(al1, null));
1261 assertFalse(AlignmentUtils.isMappable(null, al1));
1262 assertFalse(AlignmentUtils.isMappable(al1, al1));
1263 assertFalse(AlignmentUtils.isMappable(al2, al2));
1265 assertTrue(AlignmentUtils.isMappable(al1, al2));
1266 assertTrue(AlignmentUtils.isMappable(al2, al1));
1270 * Test creating a mapping when the sequences involved do not start at residue
1273 * @throws IOException
1275 @Test(groups = { "Functional" })
1276 public void testMapCdnaToProtein_forSubsequence()
1279 SequenceI prot = new Sequence("UNIPROT|V12345", "E-I--Q", 10, 12);
1280 prot.createDatasetSequence();
1282 SequenceI dna = new Sequence("EMBL|A33333", "GAA--AT-C-CAG", 40, 48);
1283 dna.createDatasetSequence();
1285 MapList map = AlignmentUtils.mapCdnaToProtein(prot, dna);
1286 assertEquals(10, map.getToLowest());
1287 assertEquals(12, map.getToHighest());
1288 assertEquals(40, map.getFromLowest());
1289 assertEquals(48, map.getFromHighest());
1293 * Test for the alignSequenceAs method where we have protein mapped to protein
1295 @Test(groups = { "Functional" })
1296 public void testAlignSequenceAs_mappedProteinProtein()
1299 SequenceI alignMe = new Sequence("Match", "MGAASEV");
1300 alignMe.createDatasetSequence();
1301 SequenceI alignFrom = new Sequence("Query", "LQTGYMGAASEVMFSPTRR");
1302 alignFrom.createDatasetSequence();
1304 AlignedCodonFrame acf = new AlignedCodonFrame();
1305 // this is like a domain or motif match of part of a peptide sequence
1306 MapList map = new MapList(new int[] { 6, 12 }, new int[] { 1, 7 }, 1, 1);
1307 acf.addMap(alignFrom.getDatasetSequence(),
1308 alignMe.getDatasetSequence(), map);
1310 AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "-", '-', true,
1312 assertEquals("-----MGAASEV-------", alignMe.getSequenceAsString());
1316 * Test for the alignSequenceAs method where there are trailing unmapped
1317 * residues in the model sequence
1319 @Test(groups = { "Functional" })
1320 public void testAlignSequenceAs_withTrailingPeptide()
1322 // map first 3 codons to KPF; G is a trailing unmapped residue
1323 MapList map = new MapList(new int[] { 1, 9 }, new int[] { 1, 3 }, 3, 1);
1325 checkAlignSequenceAs("AAACCCTTT", "K-PFG", true, true, map,
1330 * Tests for transferring features between mapped sequences
1332 @Test(groups = { "Functional" })
1333 public void testTransferFeatures()
1335 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1336 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1339 dna.addSequenceFeature(new SequenceFeature("type1", "desc1", 1, 2, 1f,
1341 // partial overlap - to [1, 1]
1342 dna.addSequenceFeature(new SequenceFeature("type2", "desc2", 3, 4, 2f,
1344 // exact overlap - to [1, 3]
1345 dna.addSequenceFeature(new SequenceFeature("type3", "desc3", 4, 6, 3f,
1347 // spanning overlap - to [2, 5]
1348 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1350 // exactly overlaps whole mapped range [1, 6]
1351 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1353 // no overlap (internal)
1354 dna.addSequenceFeature(new SequenceFeature("type6", "desc6", 7, 9, 6f,
1356 // no overlap (3' end)
1357 dna.addSequenceFeature(new SequenceFeature("type7", "desc7", 13, 15,
1359 // overlap (3' end) - to [6, 6]
1360 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1362 // extended overlap - to [6, +]
1363 dna.addSequenceFeature(new SequenceFeature("type9", "desc9", 12, 13,
1366 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1367 new int[] { 1, 6 }, 1, 1);
1370 * transferFeatures() will build 'partial overlap' for regions
1371 * that partially overlap 5' or 3' (start or end) of target sequence
1373 AlignmentUtils.transferFeatures(dna, cds, map, null);
1374 SequenceFeature[] sfs = cds.getSequenceFeatures();
1375 assertEquals(6, sfs.length);
1377 SequenceFeature sf = sfs[0];
1378 assertEquals("type2", sf.getType());
1379 assertEquals("desc2", sf.getDescription());
1380 assertEquals(2f, sf.getScore());
1381 assertEquals(1, sf.getBegin());
1382 assertEquals(1, sf.getEnd());
1385 assertEquals("type3", sf.getType());
1386 assertEquals("desc3", sf.getDescription());
1387 assertEquals(3f, sf.getScore());
1388 assertEquals(1, sf.getBegin());
1389 assertEquals(3, sf.getEnd());
1392 assertEquals("type4", sf.getType());
1393 assertEquals(2, sf.getBegin());
1394 assertEquals(5, sf.getEnd());
1397 assertEquals("type5", sf.getType());
1398 assertEquals(1, sf.getBegin());
1399 assertEquals(6, sf.getEnd());
1402 assertEquals("type8", sf.getType());
1403 assertEquals(6, sf.getBegin());
1404 assertEquals(6, sf.getEnd());
1407 assertEquals("type9", sf.getType());
1408 assertEquals(6, sf.getBegin());
1409 assertEquals(6, sf.getEnd());
1413 * Tests for transferring features between mapped sequences
1415 @Test(groups = { "Functional" })
1416 public void testTransferFeatures_withOmit()
1418 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1419 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1421 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1422 new int[] { 1, 6 }, 1, 1);
1424 // [5, 11] maps to [2, 5]
1425 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1427 // [4, 12] maps to [1, 6]
1428 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1430 // [12, 12] maps to [6, 6]
1431 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1434 // desc4 and desc8 are the 'omit these' varargs
1435 AlignmentUtils.transferFeatures(dna, cds, map, null, "type4", "type8");
1436 SequenceFeature[] sfs = cds.getSequenceFeatures();
1437 assertEquals(1, sfs.length);
1439 SequenceFeature sf = sfs[0];
1440 assertEquals("type5", sf.getType());
1441 assertEquals(1, sf.getBegin());
1442 assertEquals(6, sf.getEnd());
1446 * Tests for transferring features between mapped sequences
1448 @Test(groups = { "Functional" })
1449 public void testTransferFeatures_withSelect()
1451 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1452 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1454 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1455 new int[] { 1, 6 }, 1, 1);
1457 // [5, 11] maps to [2, 5]
1458 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1460 // [4, 12] maps to [1, 6]
1461 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1463 // [12, 12] maps to [6, 6]
1464 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1467 // "type5" is the 'select this type' argument
1468 AlignmentUtils.transferFeatures(dna, cds, map, "type5");
1469 SequenceFeature[] sfs = cds.getSequenceFeatures();
1470 assertEquals(1, sfs.length);
1472 SequenceFeature sf = sfs[0];
1473 assertEquals("type5", sf.getType());
1474 assertEquals(1, sf.getBegin());
1475 assertEquals(6, sf.getEnd());
1479 * Test the method that extracts the cds-only part of a dna alignment, for the
1480 * case where the cds should be aligned to match its nucleotide sequence.
1482 @Test(groups = { "Functional" })
1483 public void testMakeCdsAlignment_alternativeTranscripts()
1485 SequenceI dna1 = new Sequence("dna1", "aaaGGGCC-----CTTTaaaGGG");
1486 // alternative transcript of same dna skips CCC codon
1487 SequenceI dna2 = new Sequence("dna2", "aaaGGGCC-----cttTaaaGGG");
1488 // dna3 has no mapping (protein product) so should be ignored here
1489 SequenceI dna3 = new Sequence("dna3", "aaaGGGCCCCCGGGcttTaaaGGG");
1490 SequenceI pep1 = new Sequence("pep1", "GPFG");
1491 SequenceI pep2 = new Sequence("pep2", "GPG");
1492 dna1.createDatasetSequence();
1493 dna2.createDatasetSequence();
1494 dna3.createDatasetSequence();
1495 pep1.createDatasetSequence();
1496 pep2.createDatasetSequence();
1497 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 8, 0f,
1499 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 9, 12, 0f,
1501 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds3", 16, 18, 0f,
1503 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 4, 8, 0f,
1505 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 12, 12, 0f,
1507 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 16, 18, 0f,
1510 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
1511 dna.setDataset(null);
1513 MapList map = new MapList(new int[] { 4, 12, 16, 18 },
1514 new int[] { 1, 4 }, 3, 1);
1515 AlignedCodonFrame acf = new AlignedCodonFrame();
1516 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1517 dna.addCodonFrame(acf);
1518 map = new MapList(new int[] { 4, 8, 12, 12, 16, 18 },
1521 acf = new AlignedCodonFrame();
1522 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
1523 dna.addCodonFrame(acf);
1525 AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
1526 dna1, dna2, dna3 }, dna);
1527 List<SequenceI> cdsSeqs = cds.getSequences();
1528 assertEquals(2, cdsSeqs.size());
1529 assertEquals("GGGCCCTTTGGG", cdsSeqs.get(0).getSequenceAsString());
1530 assertEquals("GGGCCTGGG", cdsSeqs.get(1).getSequenceAsString());
1533 * verify shared, extended alignment dataset
1535 assertSame(dna.getDataset(), cds.getDataset());
1536 assertTrue(dna.getDataset().getSequences()
1537 .contains(cdsSeqs.get(0).getDatasetSequence()));
1538 assertTrue(dna.getDataset().getSequences()
1539 .contains(cdsSeqs.get(1).getDatasetSequence()));
1542 * Verify updated mappings
1544 List<AlignedCodonFrame> cdsMappings = cds.getCodonFrames();
1545 assertEquals(2, cdsMappings.size());
1548 * Mapping from pep1 to GGGTTT in first new CDS sequence
1550 List<AlignedCodonFrame> pep1Mapping = MappingUtils
1551 .findMappingsForSequence(pep1, cdsMappings);
1552 assertEquals(1, pep1Mapping.size());
1554 * maps GPFG to 1-3,4-6,7-9,10-12
1556 SearchResults sr = MappingUtils
1557 .buildSearchResults(pep1, 1, cdsMappings);
1558 assertEquals(1, sr.getResults().size());
1559 Match m = sr.getResults().get(0);
1560 assertEquals(cds.getSequenceAt(0).getDatasetSequence(),
1562 assertEquals(1, m.getStart());
1563 assertEquals(3, m.getEnd());
1564 sr = MappingUtils.buildSearchResults(pep1, 2, cdsMappings);
1565 m = sr.getResults().get(0);
1566 assertEquals(4, m.getStart());
1567 assertEquals(6, m.getEnd());
1568 sr = MappingUtils.buildSearchResults(pep1, 3, cdsMappings);
1569 m = sr.getResults().get(0);
1570 assertEquals(7, m.getStart());
1571 assertEquals(9, m.getEnd());
1572 sr = MappingUtils.buildSearchResults(pep1, 4, cdsMappings);
1573 m = sr.getResults().get(0);
1574 assertEquals(10, m.getStart());
1575 assertEquals(12, m.getEnd());
1578 * GPG in pep2 map to 1-3,4-6,7-9 in second CDS sequence
1580 List<AlignedCodonFrame> pep2Mapping = MappingUtils
1581 .findMappingsForSequence(pep2, cdsMappings);
1582 assertEquals(1, pep2Mapping.size());
1583 sr = MappingUtils.buildSearchResults(pep2, 1, cdsMappings);
1584 assertEquals(1, sr.getResults().size());
1585 m = sr.getResults().get(0);
1586 assertEquals(cds.getSequenceAt(1).getDatasetSequence(),
1588 assertEquals(1, m.getStart());
1589 assertEquals(3, m.getEnd());
1590 sr = MappingUtils.buildSearchResults(pep2, 2, cdsMappings);
1591 m = sr.getResults().get(0);
1592 assertEquals(4, m.getStart());
1593 assertEquals(6, m.getEnd());
1594 sr = MappingUtils.buildSearchResults(pep2, 3, cdsMappings);
1595 m = sr.getResults().get(0);
1596 assertEquals(7, m.getStart());
1597 assertEquals(9, m.getEnd());
1601 * Test the method that realigns protein to match mapped codon alignment.
1603 @Test(groups = { "Functional" })
1604 public void testAlignProteinAsDna_incompleteStartCodon()
1606 // seq1: incomplete start codon (not mapped), then [3, 11]
1607 SequenceI dna1 = new Sequence("Seq1", "ccAAA-TTT-GGG-");
1608 // seq2 codons are [4, 5], [8, 11]
1609 SequenceI dna2 = new Sequence("Seq2", "ccaAA-ttT-GGG-");
1610 // seq3 incomplete start codon at 'tt'
1611 SequenceI dna3 = new Sequence("Seq3", "ccaaa-ttt-GGG-");
1612 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
1613 dna.setDataset(null);
1615 // prot1 has 'X' for incomplete start codon (not mapped)
1616 SequenceI prot1 = new Sequence("Seq1", "XKFG"); // X for incomplete start
1617 SequenceI prot2 = new Sequence("Seq2", "NG");
1618 SequenceI prot3 = new Sequence("Seq3", "XG"); // X for incomplete start
1619 AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
1621 protein.setDataset(null);
1623 // map dna1 [3, 11] to prot1 [2, 4] KFG
1624 MapList map = new MapList(new int[] { 3, 11 }, new int[] { 2, 4 }, 3, 1);
1625 AlignedCodonFrame acf = new AlignedCodonFrame();
1626 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
1628 // map dna2 [4, 5] [8, 11] to prot2 [1, 2] NG
1629 map = new MapList(new int[] { 4, 5, 8, 11 }, new int[] { 1, 2 }, 3, 1);
1630 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
1632 // map dna3 [9, 11] to prot3 [2, 2] G
1633 map = new MapList(new int[] { 9, 11 }, new int[] { 2, 2 }, 3, 1);
1634 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
1636 ArrayList<AlignedCodonFrame> acfs = new ArrayList<AlignedCodonFrame>();
1638 protein.setCodonFrames(acfs);
1641 * verify X is included in the aligned proteins, and placed just
1642 * before the first mapped residue
1643 * CCT is between CCC and TTT
1645 AlignmentUtils.alignProteinAsDna(protein, dna);
1646 assertEquals("XK-FG", prot1.getSequenceAsString());
1647 assertEquals("--N-G", prot2.getSequenceAsString());
1648 assertEquals("---XG", prot3.getSequenceAsString());
1652 * Tests for the method that maps the subset of a dna sequence that has CDS
1653 * (or subtype) feature - case where the start codon is incomplete.
1655 @Test(groups = "Functional")
1656 public void testFindCdsPositions_fivePrimeIncomplete()
1658 SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
1659 dnaSeq.createDatasetSequence();
1660 SequenceI ds = dnaSeq.getDatasetSequence();
1662 // CDS for dna 5-6 (incomplete codon), 7-9
1663 SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
1664 sf.setPhase("2"); // skip 2 bases to start of next codon
1665 ds.addSequenceFeature(sf);
1666 // CDS for dna 13-15
1667 sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
1668 ds.addSequenceFeature(sf);
1670 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1673 * check the mapping starts with the first complete codon
1675 assertEquals(6, MappingUtils.getLength(ranges));
1676 assertEquals(2, ranges.size());
1677 assertEquals(7, ranges.get(0)[0]);
1678 assertEquals(9, ranges.get(0)[1]);
1679 assertEquals(13, ranges.get(1)[0]);
1680 assertEquals(15, ranges.get(1)[1]);
1684 * Tests for the method that maps the subset of a dna sequence that has CDS
1685 * (or subtype) feature.
1687 @Test(groups = "Functional")
1688 public void testFindCdsPositions()
1690 SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
1691 dnaSeq.createDatasetSequence();
1692 SequenceI ds = dnaSeq.getDatasetSequence();
1694 // CDS for dna 10-12
1695 SequenceFeature sf = new SequenceFeature("CDS_predicted", "", 10, 12,
1698 ds.addSequenceFeature(sf);
1700 sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
1702 ds.addSequenceFeature(sf);
1703 // exon feature should be ignored here
1704 sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
1705 ds.addSequenceFeature(sf);
1707 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1709 * verify ranges { [4-6], [12-10] }
1710 * note CDS ranges are ordered ascending even if the CDS
1713 assertEquals(6, MappingUtils.getLength(ranges));
1714 assertEquals(2, ranges.size());
1715 assertEquals(4, ranges.get(0)[0]);
1716 assertEquals(6, ranges.get(0)[1]);
1717 assertEquals(10, ranges.get(1)[0]);
1718 assertEquals(12, ranges.get(1)[1]);
1722 * Test the method that computes a map of codon variants for each protein
1723 * position from "sequence_variant" features on dna
1725 @Test(groups = "Functional")
1726 public void testBuildDnaVariantsMap()
1728 SequenceI dna = new Sequence("dna", "atgAAATTTGGGCCCtag");
1729 MapList map = new MapList(new int[] { 1, 18 }, new int[] { 1, 5 }, 3, 1);
1732 * first with no variants on dna
1734 LinkedHashMap<Integer, List<DnaVariant>[]> variantsMap = AlignmentUtils
1735 .buildDnaVariantsMap(dna, map);
1736 assertTrue(variantsMap.isEmpty());
1739 * single allele codon 1, on base 1
1741 SequenceFeature sf1 = new SequenceFeature("sequence_variant", "", 1, 1,
1743 sf1.setValue("alleles", "T");
1744 sf1.setValue("ID", "sequence_variant:rs758803211");
1745 dna.addSequenceFeature(sf1);
1748 * two alleles codon 2, on bases 2 and 3 (distinct variants)
1750 SequenceFeature sf2 = new SequenceFeature("sequence_variant", "", 5, 5,
1752 sf2.setValue("alleles", "T");
1753 sf2.setValue("ID", "sequence_variant:rs758803212");
1754 dna.addSequenceFeature(sf2);
1755 SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 6, 6,
1757 sf3.setValue("alleles", "G");
1758 sf3.setValue("ID", "sequence_variant:rs758803213");
1759 dna.addSequenceFeature(sf3);
1762 * two alleles codon 3, both on base 2 (one variant)
1764 SequenceFeature sf4 = new SequenceFeature("sequence_variant", "", 8, 8,
1766 sf4.setValue("alleles", "C, G");
1767 sf4.setValue("ID", "sequence_variant:rs758803214");
1768 dna.addSequenceFeature(sf4);
1770 // no alleles on codon 4
1773 * alleles on codon 5 on all 3 bases (distinct variants)
1775 SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 13,
1777 sf5.setValue("alleles", "C, G"); // (C duplicates given base value)
1778 sf5.setValue("ID", "sequence_variant:rs758803215");
1779 dna.addSequenceFeature(sf5);
1780 SequenceFeature sf6 = new SequenceFeature("sequence_variant", "", 14,
1782 sf6.setValue("alleles", "g, a"); // should force to upper-case
1783 sf6.setValue("ID", "sequence_variant:rs758803216");
1784 dna.addSequenceFeature(sf6);
1785 SequenceFeature sf7 = new SequenceFeature("sequence_variant", "", 15,
1787 sf7.setValue("alleles", "A, T");
1788 sf7.setValue("ID", "sequence_variant:rs758803217");
1789 dna.addSequenceFeature(sf7);
1792 * build map - expect variants on positions 1, 2, 3, 5
1794 variantsMap = AlignmentUtils.buildDnaVariantsMap(dna, map);
1795 assertEquals(4, variantsMap.size());
1798 * protein residue 1: variant on codon (ATG) base 1, not on 2 or 3
1800 List<DnaVariant>[] pep1Variants = variantsMap.get(1);
1801 assertEquals(3, pep1Variants.length);
1802 assertEquals(1, pep1Variants[0].size());
1803 assertEquals("A", pep1Variants[0].get(0).base); // codon[1] base
1804 assertSame(sf1, pep1Variants[0].get(0).variant); // codon[1] variant
1805 assertEquals(1, pep1Variants[1].size());
1806 assertEquals("T", pep1Variants[1].get(0).base); // codon[2] base
1807 assertNull(pep1Variants[1].get(0).variant); // no variant here
1808 assertEquals(1, pep1Variants[2].size());
1809 assertEquals("G", pep1Variants[2].get(0).base); // codon[3] base
1810 assertNull(pep1Variants[2].get(0).variant); // no variant here
1813 * protein residue 2: variants on codon (AAA) bases 2 and 3
1815 List<DnaVariant>[] pep2Variants = variantsMap.get(2);
1816 assertEquals(3, pep2Variants.length);
1817 assertEquals(1, pep2Variants[0].size());
1818 // codon[1] base recorded while processing variant on codon[2]
1819 assertEquals("A", pep2Variants[0].get(0).base);
1820 assertNull(pep2Variants[0].get(0).variant); // no variant here
1821 // codon[2] base and variant:
1822 assertEquals(1, pep2Variants[1].size());
1823 assertEquals("A", pep2Variants[1].get(0).base);
1824 assertSame(sf2, pep2Variants[1].get(0).variant);
1825 // codon[3] base was recorded when processing codon[2] variant
1826 // and then the variant for codon[3] added to it
1827 assertEquals(1, pep2Variants[2].size());
1828 assertEquals("A", pep2Variants[2].get(0).base);
1829 assertSame(sf3, pep2Variants[2].get(0).variant);
1832 * protein residue 3: variants on codon (TTT) base 2 only
1834 List<DnaVariant>[] pep3Variants = variantsMap.get(3);
1835 assertEquals(3, pep3Variants.length);
1836 assertEquals(1, pep3Variants[0].size());
1837 assertEquals("T", pep3Variants[0].get(0).base); // codon[1] base
1838 assertNull(pep3Variants[0].get(0).variant); // no variant here
1839 assertEquals(1, pep3Variants[1].size());
1840 assertEquals("T", pep3Variants[1].get(0).base); // codon[2] base
1841 assertSame(sf4, pep3Variants[1].get(0).variant); // codon[2] variant
1842 assertEquals(1, pep3Variants[2].size());
1843 assertEquals("T", pep3Variants[2].get(0).base); // codon[3] base
1844 assertNull(pep3Variants[2].get(0).variant); // no variant here
1847 * three variants on protein position 5
1849 List<DnaVariant>[] pep5Variants = variantsMap.get(5);
1850 assertEquals(3, pep5Variants.length);
1851 assertEquals(1, pep5Variants[0].size());
1852 assertEquals("C", pep5Variants[0].get(0).base); // codon[1] base
1853 assertSame(sf5, pep5Variants[0].get(0).variant); // codon[1] variant
1854 assertEquals(1, pep5Variants[1].size());
1855 assertEquals("C", pep5Variants[1].get(0).base); // codon[2] base
1856 assertSame(sf6, pep5Variants[1].get(0).variant); // codon[2] variant
1857 assertEquals(1, pep5Variants[2].size());
1858 assertEquals("C", pep5Variants[2].get(0).base); // codon[3] base
1859 assertSame(sf7, pep5Variants[2].get(0).variant); // codon[3] variant
1863 * Tests for the method that computes all peptide variants given codon
1866 @Test(groups = "Functional")
1867 public void testComputePeptideVariants()
1870 * scenario: AAATTTCCC codes for KFP, with variants
1876 * CAC,CGC -> H,R (as one variant)
1878 SequenceI peptide = new Sequence("pep/10-12", "KFP");
1881 * two distinct variants for codon 1 position 1
1882 * second one has clinical significance
1884 SequenceFeature sf1 = new SequenceFeature("sequence_variant", "", 1, 1,
1886 sf1.setValue("alleles", "A,G"); // GAA -> E
1887 sf1.setValue("ID", "var1.125A>G");
1888 SequenceFeature sf2 = new SequenceFeature("sequence_variant", "", 1, 1,
1890 sf2.setValue("alleles", "A,C"); // CAA -> Q
1891 sf2.setValue("ID", "var2");
1892 sf2.setValue("clinical_significance", "Dodgy");
1893 SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 3, 3,
1895 sf3.setValue("alleles", "A,G"); // synonymous
1896 sf3.setValue("ID", "var3");
1897 sf3.setValue("clinical_significance", "None");
1898 SequenceFeature sf4 = new SequenceFeature("sequence_variant", "", 3, 3,
1900 sf4.setValue("alleles", "A,T"); // AAT -> N
1901 sf4.setValue("ID", "sequence_variant:var4"); // prefix gets stripped off
1902 sf4.setValue("clinical_significance", "Benign");
1903 SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 6, 6,
1905 sf5.setValue("alleles", "T,C"); // synonymous
1906 sf5.setValue("ID", "var5");
1907 sf5.setValue("clinical_significance", "Bad");
1908 SequenceFeature sf6 = new SequenceFeature("sequence_variant", "", 8, 8,
1910 sf6.setValue("alleles", "C,A,G"); // CAC,CGC -> H,R
1911 sf6.setValue("ID", "var6");
1912 sf6.setValue("clinical_significance", "Good");
1914 List<DnaVariant> codon1Variants = new ArrayList<DnaVariant>();
1915 List<DnaVariant> codon2Variants = new ArrayList<DnaVariant>();
1916 List<DnaVariant> codon3Variants = new ArrayList<DnaVariant>();
1917 List<DnaVariant> codonVariants[] = new ArrayList[3];
1918 codonVariants[0] = codon1Variants;
1919 codonVariants[1] = codon2Variants;
1920 codonVariants[2] = codon3Variants;
1923 * compute variants for protein position 1
1925 codon1Variants.add(new DnaVariant("A", sf1));
1926 codon1Variants.add(new DnaVariant("A", sf2));
1927 codon2Variants.add(new DnaVariant("A"));
1928 codon2Variants.add(new DnaVariant("A"));
1929 codon3Variants.add(new DnaVariant("A", sf3));
1930 codon3Variants.add(new DnaVariant("A", sf4));
1931 AlignmentUtils.computePeptideVariants(peptide, 1, codonVariants);
1934 * compute variants for protein position 2
1936 codon1Variants.clear();
1937 codon2Variants.clear();
1938 codon3Variants.clear();
1939 codon1Variants.add(new DnaVariant("T"));
1940 codon2Variants.add(new DnaVariant("T"));
1941 codon3Variants.add(new DnaVariant("T", sf5));
1942 AlignmentUtils.computePeptideVariants(peptide, 2, codonVariants);
1945 * compute variants for protein position 3
1947 codon1Variants.clear();
1948 codon2Variants.clear();
1949 codon3Variants.clear();
1950 codon1Variants.add(new DnaVariant("C"));
1951 codon2Variants.add(new DnaVariant("C", sf6));
1952 codon3Variants.add(new DnaVariant("C"));
1953 AlignmentUtils.computePeptideVariants(peptide, 3, codonVariants);
1956 * verify added sequence features for
1963 SequenceFeature[] sfs = peptide.getSequenceFeatures();
1964 assertEquals(5, sfs.length);
1965 SequenceFeature sf = sfs[0];
1966 assertEquals(1, sf.getBegin());
1967 assertEquals(1, sf.getEnd());
1968 assertEquals("p.Lys1Glu", sf.getDescription());
1969 assertEquals("var1.125A>G", sf.getValue("ID"));
1970 assertNull(sf.getValue("clinical_significance"));
1971 assertEquals("ID=var1.125A>G", sf.getAttributes());
1972 assertEquals(1, sf.links.size());
1973 // link to variation is urlencoded
1975 "p.Lys1Glu var1.125A>G|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var1.125A%3EG",
1977 assertEquals("Jalview", sf.getFeatureGroup());
1979 assertEquals(1, sf.getBegin());
1980 assertEquals(1, sf.getEnd());
1981 assertEquals("p.Lys1Gln", sf.getDescription());
1982 assertEquals("var2", sf.getValue("ID"));
1983 assertEquals("Dodgy", sf.getValue("clinical_significance"));
1984 assertEquals("ID=var2;clinical_significance=Dodgy", sf.getAttributes());
1985 assertEquals(1, sf.links.size());
1987 "p.Lys1Gln var2|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var2",
1989 assertEquals("Jalview", sf.getFeatureGroup());
1991 assertEquals(1, sf.getBegin());
1992 assertEquals(1, sf.getEnd());
1993 assertEquals("p.Lys1Asn", sf.getDescription());
1994 assertEquals("var4", sf.getValue("ID"));
1995 assertEquals("Benign", sf.getValue("clinical_significance"));
1996 assertEquals("ID=var4;clinical_significance=Benign", sf.getAttributes());
1997 assertEquals(1, sf.links.size());
1999 "p.Lys1Asn var4|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var4",
2001 assertEquals("Jalview", sf.getFeatureGroup());
2003 assertEquals(3, sf.getBegin());
2004 assertEquals(3, sf.getEnd());
2005 assertEquals("p.Pro3His", sf.getDescription());
2006 assertEquals("var6", sf.getValue("ID"));
2007 assertEquals("Good", sf.getValue("clinical_significance"));
2008 assertEquals("ID=var6;clinical_significance=Good", sf.getAttributes());
2009 assertEquals(1, sf.links.size());
2011 "p.Pro3His var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6",
2013 // var5 generates two distinct protein variant features
2014 assertEquals("Jalview", sf.getFeatureGroup());
2016 assertEquals(3, sf.getBegin());
2017 assertEquals(3, sf.getEnd());
2018 assertEquals("p.Pro3Arg", sf.getDescription());
2019 assertEquals("var6", sf.getValue("ID"));
2020 assertEquals("Good", sf.getValue("clinical_significance"));
2021 assertEquals("ID=var6;clinical_significance=Good", sf.getAttributes());
2022 assertEquals(1, sf.links.size());
2024 "p.Pro3Arg var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6",
2026 assertEquals("Jalview", sf.getFeatureGroup());
2030 * Tests for the method that maps the subset of a dna sequence that has CDS
2031 * (or subtype) feature, with CDS strand = '-' (reverse)
2033 // test turned off as currently findCdsPositions is not strand-dependent
2034 // left in case it comes around again...
2035 @Test(groups = "Functional", enabled = false)
2036 public void testFindCdsPositions_reverseStrand()
2038 SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
2039 dnaSeq.createDatasetSequence();
2040 SequenceI ds = dnaSeq.getDatasetSequence();
2043 SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
2045 ds.addSequenceFeature(sf);
2046 // exon feature should be ignored here
2047 sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
2048 ds.addSequenceFeature(sf);
2049 // CDS for dna 10-12
2050 sf = new SequenceFeature("CDS_predicted", "", 10, 12, 0f, null);
2052 ds.addSequenceFeature(sf);
2054 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
2056 * verify ranges { [12-10], [6-4] }
2058 assertEquals(6, MappingUtils.getLength(ranges));
2059 assertEquals(2, ranges.size());
2060 assertEquals(12, ranges.get(0)[0]);
2061 assertEquals(10, ranges.get(0)[1]);
2062 assertEquals(6, ranges.get(1)[0]);
2063 assertEquals(4, ranges.get(1)[1]);
2067 * Tests for the method that maps the subset of a dna sequence that has CDS
2068 * (or subtype) feature - reverse strand case where the start codon is
2071 @Test(groups = "Functional", enabled = false)
2072 // test turned off as currently findCdsPositions is not strand-dependent
2073 // left in case it comes around again...
2074 public void testFindCdsPositions_reverseStrandThreePrimeIncomplete()
2076 SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
2077 dnaSeq.createDatasetSequence();
2078 SequenceI ds = dnaSeq.getDatasetSequence();
2081 SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
2083 ds.addSequenceFeature(sf);
2084 // CDS for dna 13-15
2085 sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
2087 sf.setPhase("2"); // skip 2 bases to start of next codon
2088 ds.addSequenceFeature(sf);
2090 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
2093 * check the mapping starts with the first complete codon
2094 * expect ranges [13, 13], [9, 5]
2096 assertEquals(6, MappingUtils.getLength(ranges));
2097 assertEquals(2, ranges.size());
2098 assertEquals(13, ranges.get(0)[0]);
2099 assertEquals(13, ranges.get(0)[1]);
2100 assertEquals(9, ranges.get(1)[0]);
2101 assertEquals(5, ranges.get(1)[1]);
2104 @Test(groups = "Functional")
2105 public void testAlignAs_alternateTranscriptsUngapped()
2107 SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa");
2108 SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA");
2109 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
2110 ((Alignment) dna).createDatasetAlignment();
2111 SequenceI cds1 = new Sequence("cds1", "GGGTTT");
2112 SequenceI cds2 = new Sequence("cds2", "CCCAAA");
2113 AlignmentI cds = new Alignment(new SequenceI[] { cds1, cds2 });
2114 ((Alignment) cds).createDatasetAlignment();
2116 AlignedCodonFrame acf = new AlignedCodonFrame();
2117 MapList map = new MapList(new int[] { 4, 9 }, new int[] { 1, 6 }, 1, 1);
2118 acf.addMap(dna1.getDatasetSequence(), cds1.getDatasetSequence(), map);
2119 map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 6 }, 1, 1);
2120 acf.addMap(dna2.getDatasetSequence(), cds2.getDatasetSequence(), map);
2123 * verify CDS alignment is as:
2124 * cccGGGTTTaaa (cdna)
2125 * CCCgggtttAAA (cdna)
2127 * ---GGGTTT--- (cds)
2128 * CCC------AAA (cds)
2130 dna.addCodonFrame(acf);
2131 AlignmentUtils.alignAs(cds, dna);
2132 assertEquals("---GGGTTT", cds.getSequenceAt(0).getSequenceAsString());
2133 assertEquals("CCC------AAA", cds.getSequenceAt(1).getSequenceAsString());
2136 @Test(groups = { "Functional" })
2137 public void testAddMappedPositions()
2139 SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g");
2140 SequenceI seq1 = new Sequence("cds", "AAATTT");
2141 from.createDatasetSequence();
2142 seq1.createDatasetSequence();
2143 Mapping mapping = new Mapping(seq1, new MapList(
2144 new int[] { 3, 6, 9, 10 },
2145 new int[] { 1, 6 }, 1, 1));
2146 Map<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
2147 AlignmentUtils.addMappedPositions(seq1, from, mapping, map);
2150 * verify map has seq1 residues in columns 3,4,6,7,11,12
2152 assertEquals(6, map.size());
2153 assertEquals('A', map.get(3).get(seq1).charValue());
2154 assertEquals('A', map.get(4).get(seq1).charValue());
2155 assertEquals('A', map.get(6).get(seq1).charValue());
2156 assertEquals('T', map.get(7).get(seq1).charValue());
2157 assertEquals('T', map.get(11).get(seq1).charValue());
2158 assertEquals('T', map.get(12).get(seq1).charValue());
2166 * Test case where the mapping 'from' range includes a stop codon which is
2167 * absent in the 'to' range
2169 @Test(groups = { "Functional" })
2170 public void testAddMappedPositions_withStopCodon()
2172 SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g");
2173 SequenceI seq1 = new Sequence("cds", "AAATTT");
2174 from.createDatasetSequence();
2175 seq1.createDatasetSequence();
2176 Mapping mapping = new Mapping(seq1, new MapList(
2177 new int[] { 3, 6, 9, 10 },
2178 new int[] { 1, 6 }, 1, 1));
2179 Map<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
2180 AlignmentUtils.addMappedPositions(seq1, from, mapping, map);
2183 * verify map has seq1 residues in columns 3,4,6,7,11,12
2185 assertEquals(6, map.size());
2186 assertEquals('A', map.get(3).get(seq1).charValue());
2187 assertEquals('A', map.get(4).get(seq1).charValue());
2188 assertEquals('A', map.get(6).get(seq1).charValue());
2189 assertEquals('T', map.get(7).get(seq1).charValue());
2190 assertEquals('T', map.get(11).get(seq1).charValue());
2191 assertEquals('T', map.get(12).get(seq1).charValue());