2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import static org.testng.AssertJUnit.assertEquals;
24 import static org.testng.AssertJUnit.assertFalse;
25 import static org.testng.AssertJUnit.assertNull;
26 import static org.testng.AssertJUnit.assertSame;
27 import static org.testng.AssertJUnit.assertTrue;
29 import jalview.analysis.AlignmentUtils.DnaVariant;
30 import jalview.datamodel.AlignedCodonFrame;
31 import jalview.datamodel.Alignment;
32 import jalview.datamodel.AlignmentAnnotation;
33 import jalview.datamodel.AlignmentI;
34 import jalview.datamodel.Annotation;
35 import jalview.datamodel.DBRefEntry;
36 import jalview.datamodel.Mapping;
37 import jalview.datamodel.SearchResults;
38 import jalview.datamodel.SearchResults.Match;
39 import jalview.datamodel.Sequence;
40 import jalview.datamodel.SequenceFeature;
41 import jalview.datamodel.SequenceI;
42 import jalview.io.AppletFormatAdapter;
43 import jalview.io.FormatAdapter;
44 import jalview.util.MapList;
45 import jalview.util.MappingUtils;
47 import java.io.IOException;
48 import java.util.ArrayList;
49 import java.util.Arrays;
50 import java.util.Iterator;
51 import java.util.LinkedHashMap;
52 import java.util.List;
54 import java.util.TreeMap;
56 import org.testng.annotations.Test;
58 public class AlignmentUtilsTests
60 public static Sequence ts = new Sequence("short",
61 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklm");
63 @Test(groups = { "Functional" })
64 public void testExpandContext()
66 AlignmentI al = new Alignment(new Sequence[] {});
67 for (int i = 4; i < 14; i += 2)
69 SequenceI s1 = ts.deriveSequence().getSubSequence(i, i + 7);
72 System.out.println(new AppletFormatAdapter().formatSequences("Clustal",
74 for (int flnk = -1; flnk < 25; flnk++)
76 AlignmentI exp = AlignmentUtils.expandContext(al, flnk);
77 System.out.println("\nFlank size: " + flnk);
78 System.out.println(new AppletFormatAdapter().formatSequences(
79 "Clustal", exp, true));
83 * Full expansion to complete sequences
85 for (SequenceI sq : exp.getSequences())
87 String ung = sq.getSequenceAsString().replaceAll("-+", "");
88 final String errorMsg = "Flanking sequence not the same as original dataset sequence.\n"
91 + sq.getDatasetSequence().getSequenceAsString();
92 assertTrue(errorMsg, ung.equalsIgnoreCase(sq.getDatasetSequence()
93 .getSequenceAsString()));
99 * Last sequence is fully expanded, others have leading gaps to match
101 assertTrue(exp.getSequenceAt(4).getSequenceAsString()
103 assertTrue(exp.getSequenceAt(3).getSequenceAsString()
104 .startsWith("--abc"));
105 assertTrue(exp.getSequenceAt(2).getSequenceAsString()
106 .startsWith("----abc"));
107 assertTrue(exp.getSequenceAt(1).getSequenceAsString()
108 .startsWith("------abc"));
109 assertTrue(exp.getSequenceAt(0).getSequenceAsString()
110 .startsWith("--------abc"));
116 * Test that annotations are correctly adjusted by expandContext
118 @Test(groups = { "Functional" })
119 public void testExpandContext_annotation()
121 AlignmentI al = new Alignment(new Sequence[] {});
122 SequenceI ds = new Sequence("Seq1", "ABCDEFGHI");
124 SequenceI seq1 = ds.deriveSequence().getSubSequence(3, 6);
125 al.addSequence(seq1);
128 * Annotate DEF with 4/5/6 respectively
130 Annotation[] anns = new Annotation[] { new Annotation(4),
131 new Annotation(5), new Annotation(6) };
132 AlignmentAnnotation ann = new AlignmentAnnotation("SS",
133 "secondary structure", anns);
134 seq1.addAlignmentAnnotation(ann);
137 * The annotations array should match aligned positions
139 assertEquals(3, ann.annotations.length);
140 assertEquals(4, ann.annotations[0].value, 0.001);
141 assertEquals(5, ann.annotations[1].value, 0.001);
142 assertEquals(6, ann.annotations[2].value, 0.001);
145 * Check annotation to sequence position mappings before expanding the
146 * sequence; these are set up in Sequence.addAlignmentAnnotation ->
147 * Annotation.setSequenceRef -> createSequenceMappings
149 assertNull(ann.getAnnotationForPosition(1));
150 assertNull(ann.getAnnotationForPosition(2));
151 assertNull(ann.getAnnotationForPosition(3));
152 assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
153 assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
154 assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
155 assertNull(ann.getAnnotationForPosition(7));
156 assertNull(ann.getAnnotationForPosition(8));
157 assertNull(ann.getAnnotationForPosition(9));
160 * Expand the subsequence to the full sequence abcDEFghi
162 AlignmentI expanded = AlignmentUtils.expandContext(al, -1);
163 assertEquals("abcDEFghi", expanded.getSequenceAt(0)
164 .getSequenceAsString());
167 * Confirm the alignment and sequence have the same SS annotation,
168 * referencing the expanded sequence
170 ann = expanded.getSequenceAt(0).getAnnotation()[0];
171 assertSame(ann, expanded.getAlignmentAnnotation()[0]);
172 assertSame(expanded.getSequenceAt(0), ann.sequenceRef);
175 * The annotations array should have null values except for annotated
178 assertNull(ann.annotations[0]);
179 assertNull(ann.annotations[1]);
180 assertNull(ann.annotations[2]);
181 assertEquals(4, ann.annotations[3].value, 0.001);
182 assertEquals(5, ann.annotations[4].value, 0.001);
183 assertEquals(6, ann.annotations[5].value, 0.001);
184 assertNull(ann.annotations[6]);
185 assertNull(ann.annotations[7]);
186 assertNull(ann.annotations[8]);
189 * sequence position mappings should be unchanged
191 assertNull(ann.getAnnotationForPosition(1));
192 assertNull(ann.getAnnotationForPosition(2));
193 assertNull(ann.getAnnotationForPosition(3));
194 assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
195 assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
196 assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
197 assertNull(ann.getAnnotationForPosition(7));
198 assertNull(ann.getAnnotationForPosition(8));
199 assertNull(ann.getAnnotationForPosition(9));
203 * Test method that returns a map of lists of sequences by sequence name.
205 * @throws IOException
207 @Test(groups = { "Functional" })
208 public void testGetSequencesByName() throws IOException
210 final String data = ">Seq1Name\nKQYL\n" + ">Seq2Name\nRFPW\n"
211 + ">Seq1Name\nABCD\n";
212 AlignmentI al = loadAlignment(data, "FASTA");
213 Map<String, List<SequenceI>> map = AlignmentUtils
214 .getSequencesByName(al);
215 assertEquals(2, map.keySet().size());
216 assertEquals(2, map.get("Seq1Name").size());
217 assertEquals("KQYL", map.get("Seq1Name").get(0).getSequenceAsString());
218 assertEquals("ABCD", map.get("Seq1Name").get(1).getSequenceAsString());
219 assertEquals(1, map.get("Seq2Name").size());
220 assertEquals("RFPW", map.get("Seq2Name").get(0).getSequenceAsString());
224 * Helper method to load an alignment and ensure dataset sequences are set up.
230 * @throws IOException
232 protected AlignmentI loadAlignment(final String data, String format)
235 AlignmentI a = new FormatAdapter().readFile(data,
236 AppletFormatAdapter.PASTE, format);
242 * Test mapping of protein to cDNA, for the case where we have no sequence
243 * cross-references, so mappings are made first-served 1-1 where sequences
246 * @throws IOException
248 @Test(groups = { "Functional" })
249 public void testMapProteinAlignmentToCdna_noXrefs() throws IOException
251 List<SequenceI> protseqs = new ArrayList<SequenceI>();
252 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
253 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
254 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
255 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
256 protein.setDataset(null);
258 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
259 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
260 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAA")); // = EIQ
261 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
262 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
263 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
264 cdna.setDataset(null);
266 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
268 // 3 mappings made, each from 1 to 1 sequence
269 assertEquals(3, protein.getCodonFrames().size());
270 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
271 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
272 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
274 // V12345 mapped to A22222
275 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
277 assertEquals(1, acf.getdnaSeqs().length);
278 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
279 acf.getdnaSeqs()[0]);
280 Mapping[] protMappings = acf.getProtMappings();
281 assertEquals(1, protMappings.length);
282 MapList mapList = protMappings[0].getMap();
283 assertEquals(3, mapList.getFromRatio());
284 assertEquals(1, mapList.getToRatio());
285 assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
287 assertEquals(1, mapList.getFromRanges().size());
288 assertTrue(Arrays.equals(new int[] { 1, 3 },
289 mapList.getToRanges().get(0)));
290 assertEquals(1, mapList.getToRanges().size());
292 // V12346 mapped to A33333
293 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
294 assertEquals(1, acf.getdnaSeqs().length);
295 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
296 acf.getdnaSeqs()[0]);
298 // V12347 mapped to A11111
299 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
300 assertEquals(1, acf.getdnaSeqs().length);
301 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
302 acf.getdnaSeqs()[0]);
304 // no mapping involving the 'extra' A44444
305 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
309 * Test for the alignSequenceAs method that takes two sequences and a mapping.
311 @Test(groups = { "Functional" })
312 public void testAlignSequenceAs_withMapping_noIntrons()
314 MapList map = new MapList(new int[] { 1, 6 }, new int[] { 1, 2 }, 3, 1);
317 * No existing gaps in dna:
319 checkAlignSequenceAs("GGGAAA", "-A-L-", false, false, map,
323 * Now introduce gaps in dna but ignore them when realigning.
325 checkAlignSequenceAs("-G-G-G-A-A-A-", "-A-L-", false, false, map,
329 * Now include gaps in dna when realigning. First retaining 'mapped' gaps
330 * only, i.e. those within the exon region.
332 checkAlignSequenceAs("-G-G--G-A--A-A-", "-A-L-", true, false, map,
333 "---G-G--G---A--A-A");
336 * Include all gaps in dna when realigning (within and without the exon
337 * region). The leading gap, and the gaps between codons, are subsumed by
338 * the protein alignment gap.
340 checkAlignSequenceAs("-G-GG--AA-A---", "-A-L-", true, true, map,
341 "---G-GG---AA-A---");
344 * Include only unmapped gaps in dna when realigning (outside the exon
345 * region). The leading gap, and the gaps between codons, are subsumed by
346 * the protein alignment gap.
348 checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", false, true, map,
353 * Test for the alignSequenceAs method that takes two sequences and a mapping.
355 @Test(groups = { "Functional" })
356 public void testAlignSequenceAs_withMapping_withIntrons()
359 * Exons at codon 2 (AAA) and 4 (TTT)
361 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
362 new int[] { 1, 2 }, 3, 1);
365 * Simple case: no gaps in dna
367 checkAlignSequenceAs("GGGAAACCCTTTGGG", "--A-L-", false, false, map,
368 "GGG---AAACCCTTTGGG");
371 * Add gaps to dna - but ignore when realigning.
373 checkAlignSequenceAs("-G-G-G--A--A---AC-CC-T-TT-GG-G-", "--A-L-",
374 false, false, map, "GGG---AAACCCTTTGGG");
377 * Add gaps to dna - include within exons only when realigning.
379 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
380 true, false, map, "GGG---A--A---ACCCT-TTGGG");
383 * Include gaps outside exons only when realigning.
385 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
386 false, true, map, "-G-G-GAAAC-CCTTT-GG-G-");
389 * Include gaps following first intron if we are 'preserving mapped gaps'
391 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
392 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
395 * Include all gaps in dna when realigning.
397 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
398 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
402 * Test for the case where not all of the protein sequence is mapped to cDNA.
404 @Test(groups = { "Functional" })
405 public void testAlignSequenceAs_withMapping_withUnmappedProtein()
408 * Exons at codon 2 (AAA) and 4 (TTT) mapped to A and P
410 final MapList map = new MapList(new int[] { 4, 6, 10, 12 }, new int[] {
414 * -L- 'aligns' ccc------
416 checkAlignSequenceAs("gggAAAcccTTTggg", "-A-L-P-", false, false, map,
417 "gggAAAccc------TTTggg");
421 * Helper method that performs and verifies the method under test.
424 * the sequence to be realigned
426 * the sequence whose alignment is to be copied
427 * @param preserveMappedGaps
428 * @param preserveUnmappedGaps
432 protected void checkAlignSequenceAs(final String alignee,
433 final String alignModel, final boolean preserveMappedGaps,
434 final boolean preserveUnmappedGaps, MapList map,
435 final String expected)
437 SequenceI alignMe = new Sequence("Seq1", alignee);
438 alignMe.createDatasetSequence();
439 SequenceI alignFrom = new Sequence("Seq2", alignModel);
440 alignFrom.createDatasetSequence();
441 AlignedCodonFrame acf = new AlignedCodonFrame();
442 acf.addMap(alignMe.getDatasetSequence(), alignFrom.getDatasetSequence(), map);
444 AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "---", '-',
445 preserveMappedGaps, preserveUnmappedGaps);
446 assertEquals(expected, alignMe.getSequenceAsString());
450 * Test for the alignSequenceAs method where we preserve gaps in introns only.
452 @Test(groups = { "Functional" })
453 public void testAlignSequenceAs_keepIntronGapsOnly()
457 * Intron GGGAAA followed by exon CCCTTT
459 MapList map = new MapList(new int[] { 7, 12 }, new int[] { 1, 2 }, 3, 1);
461 checkAlignSequenceAs("GG-G-AA-A-C-CC-T-TT", "AL", false, true, map,
466 * Test the method that realigns protein to match mapped codon alignment.
468 @Test(groups = { "Functional" })
469 public void testAlignProteinAsDna()
471 // seq1 codons are [1,2,3] [4,5,6] [7,8,9] [10,11,12]
472 SequenceI dna1 = new Sequence("Seq1", "TGCCATTACCAG-");
473 // seq2 codons are [1,3,4] [5,6,7] [8,9,10] [11,12,13]
474 SequenceI dna2 = new Sequence("Seq2", "T-GCCATTACCAG");
475 // seq3 codons are [1,2,3] [4,5,7] [8,9,10] [11,12,13]
476 SequenceI dna3 = new Sequence("Seq3", "TGCCA-TTACCAG");
477 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
478 dna.setDataset(null);
480 // protein alignment will be realigned like dna
481 SequenceI prot1 = new Sequence("Seq1", "CHYQ");
482 SequenceI prot2 = new Sequence("Seq2", "CHYQ");
483 SequenceI prot3 = new Sequence("Seq3", "CHYQ");
484 SequenceI prot4 = new Sequence("Seq4", "R-QSV"); // unmapped, unchanged
485 AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
487 protein.setDataset(null);
489 MapList map = new MapList(new int[] { 1, 12 }, new int[] { 1, 4 }, 3, 1);
490 AlignedCodonFrame acf = new AlignedCodonFrame();
491 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
492 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
493 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
494 ArrayList<AlignedCodonFrame> acfs = new ArrayList<AlignedCodonFrame>();
496 protein.setCodonFrames(acfs);
499 * Translated codon order is [1,2,3] [1,3,4] [4,5,6] [4,5,7] [5,6,7] [7,8,9]
500 * [8,9,10] [10,11,12] [11,12,13]
502 AlignmentUtils.alignProteinAsDna(protein, dna);
503 assertEquals("C-H--Y-Q-", prot1.getSequenceAsString());
504 assertEquals("-C--H-Y-Q", prot2.getSequenceAsString());
505 assertEquals("C--H--Y-Q", prot3.getSequenceAsString());
506 assertEquals("R-QSV", prot4.getSequenceAsString());
510 * Test the method that tests whether a CDNA sequence translates to a protein
513 @Test(groups = { "Functional" })
514 public void testTranslatesAs()
516 // null arguments check
517 assertFalse(AlignmentUtils.translatesAs(null, 0, null));
518 assertFalse(AlignmentUtils.translatesAs(new char[] { 't' }, 0, null));
519 assertFalse(AlignmentUtils.translatesAs(null, 0, new char[] { 'a' }));
521 // straight translation
522 assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0,
523 "FPKG".toCharArray()));
524 // with extra start codon (not in protein)
525 assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(),
526 3, "FPKG".toCharArray()));
527 // with stop codon1 (not in protein)
528 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
529 0, "FPKG".toCharArray()));
530 // with stop codon1 (in protein as *)
531 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
532 0, "FPKG*".toCharArray()));
533 // with stop codon2 (not in protein)
534 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtag".toCharArray(),
535 0, "FPKG".toCharArray()));
536 // with stop codon3 (not in protein)
537 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtga".toCharArray(),
538 0, "FPKG".toCharArray()));
539 // with start and stop codon1
540 assertTrue(AlignmentUtils.translatesAs(
541 "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG".toCharArray()));
542 // with start and stop codon1 (in protein as *)
543 assertTrue(AlignmentUtils.translatesAs(
544 "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG*".toCharArray()));
545 // with start and stop codon2
546 assertTrue(AlignmentUtils.translatesAs(
547 "atgtttcccaaagggtag".toCharArray(), 3, "FPKG".toCharArray()));
548 // with start and stop codon3
549 assertTrue(AlignmentUtils.translatesAs(
550 "atgtttcccaaagggtga".toCharArray(), 3, "FPKG".toCharArray()));
552 // with embedded stop codons
553 assertTrue(AlignmentUtils.translatesAs(
554 "atgtttTAGcccaaaTAAgggtga".toCharArray(), 3,
555 "F*PK*G".toCharArray()));
558 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
559 0, "FPMG".toCharArray()));
562 assertFalse(AlignmentUtils.translatesAs("tttcccaaagg".toCharArray(), 0,
563 "FPKG".toCharArray()));
566 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
567 0, "FPK".toCharArray()));
569 // overlong dna (doesn't end in stop codon)
570 assertFalse(AlignmentUtils.translatesAs(
571 "tttcccaaagggttt".toCharArray(), 0, "FPKG".toCharArray()));
573 // dna + stop codon + more
574 assertFalse(AlignmentUtils.translatesAs(
575 "tttcccaaagggttaga".toCharArray(), 0, "FPKG".toCharArray()));
578 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
579 0, "FPKGQ".toCharArray()));
583 * Test mapping of protein to cDNA, for cases where the cDNA has start and/or
584 * stop codons in addition to the protein coding sequence.
586 * @throws IOException
588 @Test(groups = { "Functional" })
589 public void testMapProteinAlignmentToCdna_withStartAndStopCodons()
592 List<SequenceI> protseqs = new ArrayList<SequenceI>();
593 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
594 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
595 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
596 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
597 protein.setDataset(null);
599 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
601 dnaseqs.add(new Sequence("EMBL|A11111", "ATGTCAGCACGC"));
603 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAATAA"));
604 // = start +EIQ + stop
605 dnaseqs.add(new Sequence("EMBL|A33333", "ATGGAAATCCAGTAG"));
606 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG"));
607 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
608 cdna.setDataset(null);
610 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
612 // 3 mappings made, each from 1 to 1 sequence
613 assertEquals(3, protein.getCodonFrames().size());
614 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
615 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
616 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
618 // V12345 mapped from A22222
619 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
621 assertEquals(1, acf.getdnaSeqs().length);
622 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
623 acf.getdnaSeqs()[0]);
624 Mapping[] protMappings = acf.getProtMappings();
625 assertEquals(1, protMappings.length);
626 MapList mapList = protMappings[0].getMap();
627 assertEquals(3, mapList.getFromRatio());
628 assertEquals(1, mapList.getToRatio());
629 assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
631 assertEquals(1, mapList.getFromRanges().size());
632 assertTrue(Arrays.equals(new int[] { 1, 3 },
633 mapList.getToRanges().get(0)));
634 assertEquals(1, mapList.getToRanges().size());
636 // V12346 mapped from A33333 starting position 4
637 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
638 assertEquals(1, acf.getdnaSeqs().length);
639 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
640 acf.getdnaSeqs()[0]);
641 protMappings = acf.getProtMappings();
642 assertEquals(1, protMappings.length);
643 mapList = protMappings[0].getMap();
644 assertEquals(3, mapList.getFromRatio());
645 assertEquals(1, mapList.getToRatio());
646 assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
648 assertEquals(1, mapList.getFromRanges().size());
649 assertTrue(Arrays.equals(new int[] { 1, 3 },
650 mapList.getToRanges().get(0)));
651 assertEquals(1, mapList.getToRanges().size());
653 // V12347 mapped to A11111 starting position 4
654 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
655 assertEquals(1, acf.getdnaSeqs().length);
656 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
657 acf.getdnaSeqs()[0]);
658 protMappings = acf.getProtMappings();
659 assertEquals(1, protMappings.length);
660 mapList = protMappings[0].getMap();
661 assertEquals(3, mapList.getFromRatio());
662 assertEquals(1, mapList.getToRatio());
663 assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
665 assertEquals(1, mapList.getFromRanges().size());
666 assertTrue(Arrays.equals(new int[] { 1, 3 },
667 mapList.getToRanges().get(0)));
668 assertEquals(1, mapList.getToRanges().size());
670 // no mapping involving the 'extra' A44444
671 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
675 * Test mapping of protein to cDNA, for the case where we have some sequence
676 * cross-references. Verify that 1-to-many mappings are made where
677 * cross-references exist and sequences are mappable.
679 * @throws IOException
681 @Test(groups = { "Functional" })
682 public void testMapProteinAlignmentToCdna_withXrefs() throws IOException
684 List<SequenceI> protseqs = new ArrayList<SequenceI>();
685 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
686 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
687 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
688 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
689 protein.setDataset(null);
691 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
692 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
693 dnaseqs.add(new Sequence("EMBL|A22222", "ATGGAGATACAA")); // = start + EIQ
694 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
695 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
696 dnaseqs.add(new Sequence("EMBL|A55555", "GAGATTCAG")); // = EIQ
697 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[5]));
698 cdna.setDataset(null);
700 // Xref A22222 to V12345 (should get mapped)
701 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
702 // Xref V12345 to A44444 (should get mapped)
703 protseqs.get(0).addDBRef(new DBRefEntry("EMBL", "1", "A44444"));
704 // Xref A33333 to V12347 (sequence mismatch - should not get mapped)
705 dnaseqs.get(2).addDBRef(new DBRefEntry("UNIPROT", "1", "V12347"));
706 // as V12345 is mapped to A22222 and A44444, this leaves V12346 unmapped.
707 // it should get paired up with the unmapped A33333
708 // A11111 should be mapped to V12347
709 // A55555 is spare and has no xref so is not mapped
711 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
713 // 4 protein mappings made for 3 proteins, 2 to V12345, 1 each to V12346/7
714 assertEquals(3, protein.getCodonFrames().size());
715 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
716 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
717 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
719 // one mapping for each of the first 4 cDNA sequences
720 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
721 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
722 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(2)).size());
723 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(3)).size());
725 // V12345 mapped to A22222 and A44444
726 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
728 assertEquals(2, acf.getdnaSeqs().length);
729 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
730 acf.getdnaSeqs()[0]);
731 assertEquals(cdna.getSequenceAt(3).getDatasetSequence(),
732 acf.getdnaSeqs()[1]);
734 // V12346 mapped to A33333
735 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
736 assertEquals(1, acf.getdnaSeqs().length);
737 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
738 acf.getdnaSeqs()[0]);
740 // V12347 mapped to A11111
741 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
742 assertEquals(1, acf.getdnaSeqs().length);
743 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
744 acf.getdnaSeqs()[0]);
746 // no mapping involving the 'extra' A55555
747 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(4)).isEmpty());
751 * Test mapping of protein to cDNA, for the case where we have some sequence
752 * cross-references. Verify that once we have made an xref mapping we don't
753 * also map un-xrefd sequeces.
755 * @throws IOException
757 @Test(groups = { "Functional" })
758 public void testMapProteinAlignmentToCdna_prioritiseXrefs()
761 List<SequenceI> protseqs = new ArrayList<SequenceI>();
762 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
763 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
764 AlignmentI protein = new Alignment(
765 protseqs.toArray(new SequenceI[protseqs.size()]));
766 protein.setDataset(null);
768 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
769 dnaseqs.add(new Sequence("EMBL|A11111", "GAAATCCAG")); // = EIQ
770 dnaseqs.add(new Sequence("EMBL|A22222", "GAAATTCAG")); // = EIQ
771 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[dnaseqs
773 cdna.setDataset(null);
775 // Xref A22222 to V12345 (should get mapped)
776 // A11111 should then be mapped to the unmapped V12346
777 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
779 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
781 // 2 protein mappings made
782 assertEquals(2, protein.getCodonFrames().size());
783 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
784 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
786 // one mapping for each of the cDNA sequences
787 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
788 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
790 // V12345 mapped to A22222
791 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
793 assertEquals(1, acf.getdnaSeqs().length);
794 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
795 acf.getdnaSeqs()[0]);
797 // V12346 mapped to A11111
798 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
799 assertEquals(1, acf.getdnaSeqs().length);
800 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
801 acf.getdnaSeqs()[0]);
805 * Test the method that shows or hides sequence annotations by type(s) and
808 @Test(groups = { "Functional" })
809 public void testShowOrHideSequenceAnnotations()
811 SequenceI seq1 = new Sequence("Seq1", "AAA");
812 SequenceI seq2 = new Sequence("Seq2", "BBB");
813 SequenceI seq3 = new Sequence("Seq3", "CCC");
814 Annotation[] anns = new Annotation[] { new Annotation(2f) };
815 AlignmentAnnotation ann1 = new AlignmentAnnotation("Structure", "ann1",
817 ann1.setSequenceRef(seq1);
818 AlignmentAnnotation ann2 = new AlignmentAnnotation("Structure", "ann2",
820 ann2.setSequenceRef(seq2);
821 AlignmentAnnotation ann3 = new AlignmentAnnotation("Structure", "ann3",
823 AlignmentAnnotation ann4 = new AlignmentAnnotation("Temp", "ann4", anns);
824 ann4.setSequenceRef(seq1);
825 AlignmentAnnotation ann5 = new AlignmentAnnotation("Temp", "ann5", anns);
826 ann5.setSequenceRef(seq2);
827 AlignmentAnnotation ann6 = new AlignmentAnnotation("Temp", "ann6", anns);
828 AlignmentI al = new Alignment(new SequenceI[] { seq1, seq2, seq3 });
829 al.addAnnotation(ann1); // Structure for Seq1
830 al.addAnnotation(ann2); // Structure for Seq2
831 al.addAnnotation(ann3); // Structure for no sequence
832 al.addAnnotation(ann4); // Temp for seq1
833 al.addAnnotation(ann5); // Temp for seq2
834 al.addAnnotation(ann6); // Temp for no sequence
835 List<String> types = new ArrayList<String>();
836 List<SequenceI> scope = new ArrayList<SequenceI>();
839 * Set all sequence related Structure to hidden (ann1, ann2)
841 types.add("Structure");
842 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
844 assertFalse(ann1.visible);
845 assertFalse(ann2.visible);
846 assertTrue(ann3.visible); // not sequence-related, not affected
847 assertTrue(ann4.visible); // not Structure, not affected
848 assertTrue(ann5.visible); // "
849 assertTrue(ann6.visible); // not sequence-related, not affected
852 * Set Temp in {seq1, seq3} to hidden
858 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, false,
860 assertFalse(ann1.visible); // unchanged
861 assertFalse(ann2.visible); // unchanged
862 assertTrue(ann3.visible); // not sequence-related, not affected
863 assertFalse(ann4.visible); // Temp for seq1 hidden
864 assertTrue(ann5.visible); // not in scope, not affected
865 assertTrue(ann6.visible); // not sequence-related, not affected
868 * Set Temp in all sequences to hidden
874 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
876 assertFalse(ann1.visible); // unchanged
877 assertFalse(ann2.visible); // unchanged
878 assertTrue(ann3.visible); // not sequence-related, not affected
879 assertFalse(ann4.visible); // Temp for seq1 hidden
880 assertFalse(ann5.visible); // Temp for seq2 hidden
881 assertTrue(ann6.visible); // not sequence-related, not affected
884 * Set all types in {seq1, seq3} to visible
890 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, true,
892 assertTrue(ann1.visible); // Structure for seq1 set visible
893 assertFalse(ann2.visible); // not in scope, unchanged
894 assertTrue(ann3.visible); // not sequence-related, not affected
895 assertTrue(ann4.visible); // Temp for seq1 set visible
896 assertFalse(ann5.visible); // not in scope, unchanged
897 assertTrue(ann6.visible); // not sequence-related, not affected
900 * Set all types in all scope to hidden
902 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, true,
904 assertFalse(ann1.visible);
905 assertFalse(ann2.visible);
906 assertTrue(ann3.visible); // not sequence-related, not affected
907 assertFalse(ann4.visible);
908 assertFalse(ann5.visible);
909 assertTrue(ann6.visible); // not sequence-related, not affected
913 * Tests for the method that checks if one sequence cross-references another
915 @Test(groups = { "Functional" })
916 public void testHasCrossRef()
918 assertFalse(AlignmentUtils.hasCrossRef(null, null));
919 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
920 assertFalse(AlignmentUtils.hasCrossRef(seq1, null));
921 assertFalse(AlignmentUtils.hasCrossRef(null, seq1));
922 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
923 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
926 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20193"));
927 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
929 // case-insensitive; version number is ignored
930 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20192"));
931 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
934 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
935 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
936 // test is one-way only
937 assertFalse(AlignmentUtils.hasCrossRef(seq2, seq1));
941 * Tests for the method that checks if either sequence cross-references the
944 @Test(groups = { "Functional" })
945 public void testHaveCrossRef()
947 assertFalse(AlignmentUtils.hasCrossRef(null, null));
948 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
949 assertFalse(AlignmentUtils.haveCrossRef(seq1, null));
950 assertFalse(AlignmentUtils.haveCrossRef(null, seq1));
951 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
952 assertFalse(AlignmentUtils.haveCrossRef(seq1, seq2));
954 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
955 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
956 // next is true for haveCrossRef, false for hasCrossRef
957 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
959 // now the other way round
960 seq1.setDBRefs(null);
961 seq2.addDBRef(new DBRefEntry("EMBL", "1", "A12345"));
962 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
963 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
966 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
967 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
968 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
972 * Test the method that extracts the cds-only part of a dna alignment.
974 @Test(groups = { "Functional" })
975 public void testMakeCdsAlignment()
977 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
978 SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC");
979 SequenceI pep1 = new Sequence("pep1", "GF");
980 SequenceI pep2 = new Sequence("pep2", "GFP");
981 dna1.createDatasetSequence();
982 dna2.createDatasetSequence();
983 pep1.createDatasetSequence();
984 pep2.createDatasetSequence();
985 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 6, 0f,
987 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 10, 12, 0f,
989 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds3", 1, 3, 0f,
991 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds4", 7, 9, 0f,
993 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds5", 13, 15, 0f,
995 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
996 dna.setDataset(null);
998 List<AlignedCodonFrame> mappings = new ArrayList<AlignedCodonFrame>();
999 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1000 new int[] { 1, 2 }, 3, 1);
1001 AlignedCodonFrame acf = new AlignedCodonFrame();
1002 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1004 map = new MapList(new int[] { 1, 3, 7, 9, 13, 15 }, new int[] { 1, 3 },
1006 acf = new AlignedCodonFrame();
1007 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
1011 * execute method under test:
1013 AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
1014 dna1, dna2 }, mappings, dna);
1016 assertEquals(2, cds.getSequences().size());
1017 assertEquals("GGGTTT", cds.getSequenceAt(0)
1018 .getSequenceAsString());
1019 assertEquals("GGGTTTCCC", cds.getSequenceAt(1)
1020 .getSequenceAsString());
1023 * verify shared, extended alignment dataset
1025 assertSame(dna.getDataset(), cds.getDataset());
1026 assertTrue(dna.getDataset().getSequences()
1027 .contains(cds.getSequenceAt(0).getDatasetSequence()));
1028 assertTrue(dna.getDataset().getSequences()
1029 .contains(cds.getSequenceAt(1).getDatasetSequence()));
1032 * Verify mappings from CDS to peptide and cDNA to CDS
1033 * the mappings are on the shared alignment dataset
1035 assertSame(dna.getCodonFrames(), cds.getCodonFrames());
1036 List<AlignedCodonFrame> cdsMappings = cds.getCodonFrames();
1037 assertEquals(2, cdsMappings.size());
1040 * Mapping from pep1 to GGGTTT in first new exon sequence
1042 List<AlignedCodonFrame> pep1Mapping = MappingUtils
1043 .findMappingsForSequence(pep1, cdsMappings);
1044 assertEquals(1, pep1Mapping.size());
1046 SearchResults sr = MappingUtils
1047 .buildSearchResults(pep1, 1, cdsMappings);
1048 assertEquals(1, sr.getResults().size());
1049 Match m = sr.getResults().get(0);
1050 assertSame(cds.getSequenceAt(0).getDatasetSequence(),
1052 assertEquals(1, m.getStart());
1053 assertEquals(3, m.getEnd());
1055 sr = MappingUtils.buildSearchResults(pep1, 2, cdsMappings);
1056 m = sr.getResults().get(0);
1057 assertSame(cds.getSequenceAt(0).getDatasetSequence(),
1059 assertEquals(4, m.getStart());
1060 assertEquals(6, m.getEnd());
1063 * Mapping from pep2 to GGGTTTCCC in second new exon sequence
1065 List<AlignedCodonFrame> pep2Mapping = MappingUtils
1066 .findMappingsForSequence(pep2, cdsMappings);
1067 assertEquals(1, pep2Mapping.size());
1069 sr = MappingUtils.buildSearchResults(pep2, 1, cdsMappings);
1070 assertEquals(1, sr.getResults().size());
1071 m = sr.getResults().get(0);
1072 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1074 assertEquals(1, m.getStart());
1075 assertEquals(3, m.getEnd());
1077 sr = MappingUtils.buildSearchResults(pep2, 2, cdsMappings);
1078 m = sr.getResults().get(0);
1079 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1081 assertEquals(4, m.getStart());
1082 assertEquals(6, m.getEnd());
1084 sr = MappingUtils.buildSearchResults(pep2, 3, cdsMappings);
1085 m = sr.getResults().get(0);
1086 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1088 assertEquals(7, m.getStart());
1089 assertEquals(9, m.getEnd());
1093 * Test the method that makes a cds-only alignment from a DNA sequence and its
1094 * product mappings, for the case where there are multiple exon mappings to
1095 * different protein products.
1097 @Test(groups = { "Functional" })
1098 public void testMakeCdsAlignment_multipleProteins()
1100 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
1101 SequenceI pep1 = new Sequence("pep1", "GF"); // GGGTTT
1102 SequenceI pep2 = new Sequence("pep2", "KP"); // aaaccc
1103 SequenceI pep3 = new Sequence("pep3", "KF"); // aaaTTT
1104 dna1.createDatasetSequence();
1105 pep1.createDatasetSequence();
1106 pep2.createDatasetSequence();
1107 pep3.createDatasetSequence();
1108 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 6, 0f,
1110 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 10, 12, 0f,
1112 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds3", 1, 3, 0f,
1114 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds4", 7, 9, 0f,
1116 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds5", 1, 3, 0f,
1118 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds6", 10, 12, 0f,
1120 pep1.getDatasetSequence().addDBRef(
1121 new DBRefEntry("EMBLCDS", "2", "A12345"));
1122 pep2.getDatasetSequence().addDBRef(
1123 new DBRefEntry("EMBLCDS", "3", "A12346"));
1124 pep3.getDatasetSequence().addDBRef(
1125 new DBRefEntry("EMBLCDS", "4", "A12347"));
1128 * Make the mappings from dna to protein
1130 List<AlignedCodonFrame> mappings = new ArrayList<AlignedCodonFrame>();
1131 // map ...GGG...TTT to GF
1132 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1133 new int[] { 1, 2 }, 3, 1);
1134 AlignedCodonFrame acf = new AlignedCodonFrame();
1135 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1138 // map aaa...ccc to KP
1139 map = new MapList(new int[] { 1, 3, 7, 9 }, new int[] { 1, 2 }, 3, 1);
1140 acf = new AlignedCodonFrame();
1141 acf.addMap(dna1.getDatasetSequence(), pep2.getDatasetSequence(), map);
1144 // map aaa......TTT to KF
1145 map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 2 }, 3, 1);
1146 acf = new AlignedCodonFrame();
1147 acf.addMap(dna1.getDatasetSequence(), pep3.getDatasetSequence(), map);
1151 * Create the CDS alignment; also augments the dna-to-protein mappings with
1152 * exon-to-protein and exon-to-dna mappings
1154 AlignmentI dna = new Alignment(new SequenceI[] { dna1 });
1155 dna.setDataset(null);
1158 * execute method under test
1160 AlignmentI cdsal = AlignmentUtils.makeCdsAlignment(
1161 new SequenceI[] { dna1 }, mappings, dna);
1164 * Verify we have 3 cds sequences, mapped to pep1/2/3 respectively
1166 List<SequenceI> cds = cdsal.getSequences();
1167 assertEquals(3, cds.size());
1170 * verify shared, extended alignment dataset
1172 assertSame(cdsal.getDataset(), dna.getDataset());
1173 assertTrue(dna.getDataset().getSequences()
1174 .contains(cds.get(0).getDatasetSequence()));
1175 assertTrue(dna.getDataset().getSequences()
1176 .contains(cds.get(1).getDatasetSequence()));
1177 assertTrue(dna.getDataset().getSequences()
1178 .contains(cds.get(2).getDatasetSequence()));
1181 * verify aligned cds sequences and their xrefs
1183 SequenceI cdsSeq = cds.get(0);
1184 assertEquals("GGGTTT", cdsSeq.getSequenceAsString());
1185 // assertEquals("dna1|A12345", cdsSeq.getName());
1186 assertEquals("dna1|pep1", cdsSeq.getName());
1187 // assertEquals(1, cdsSeq.getDBRefs().length);
1188 // DBRefEntry cdsRef = cdsSeq.getDBRefs()[0];
1189 // assertEquals("EMBLCDS", cdsRef.getSource());
1190 // assertEquals("2", cdsRef.getVersion());
1191 // assertEquals("A12345", cdsRef.getAccessionId());
1193 cdsSeq = cds.get(1);
1194 assertEquals("aaaccc", cdsSeq.getSequenceAsString());
1195 // assertEquals("dna1|A12346", cdsSeq.getName());
1196 assertEquals("dna1|pep2", cdsSeq.getName());
1197 // assertEquals(1, cdsSeq.getDBRefs().length);
1198 // cdsRef = cdsSeq.getDBRefs()[0];
1199 // assertEquals("EMBLCDS", cdsRef.getSource());
1200 // assertEquals("3", cdsRef.getVersion());
1201 // assertEquals("A12346", cdsRef.getAccessionId());
1203 cdsSeq = cds.get(2);
1204 assertEquals("aaaTTT", cdsSeq.getSequenceAsString());
1205 // assertEquals("dna1|A12347", cdsSeq.getName());
1206 assertEquals("dna1|pep3", cdsSeq.getName());
1207 // assertEquals(1, cdsSeq.getDBRefs().length);
1208 // cdsRef = cdsSeq.getDBRefs()[0];
1209 // assertEquals("EMBLCDS", cdsRef.getSource());
1210 // assertEquals("4", cdsRef.getVersion());
1211 // assertEquals("A12347", cdsRef.getAccessionId());
1214 * Verify there are mappings from each cds sequence to its protein product
1215 * and also to its dna source
1217 Iterator<AlignedCodonFrame> newMappingsIterator = cdsal
1218 .getCodonFrames().iterator();
1220 // mappings for dna1 - exon1 - pep1
1221 AlignedCodonFrame cdsMapping = newMappingsIterator.next();
1222 List<Mapping> dnaMappings = cdsMapping.getMappingsFromSequence(dna1);
1223 assertEquals(3, dnaMappings.size());
1224 assertSame(cds.get(0).getDatasetSequence(), dnaMappings.get(0)
1226 assertEquals("G(1) in CDS should map to G(4) in DNA", 4, dnaMappings
1227 .get(0).getMap().getToPosition(1));
1228 List<Mapping> peptideMappings = cdsMapping.getMappingsFromSequence(cds
1229 .get(0).getDatasetSequence());
1230 assertEquals(1, peptideMappings.size());
1231 assertSame(pep1.getDatasetSequence(), peptideMappings.get(0).getTo());
1233 // mappings for dna1 - cds2 - pep2
1234 assertSame(cds.get(1).getDatasetSequence(), dnaMappings.get(1)
1236 assertEquals("c(4) in CDS should map to c(7) in DNA", 7, dnaMappings
1237 .get(1).getMap().getToPosition(4));
1238 peptideMappings = cdsMapping.getMappingsFromSequence(cds.get(1)
1239 .getDatasetSequence());
1240 assertEquals(1, peptideMappings.size());
1241 assertSame(pep2.getDatasetSequence(), peptideMappings.get(0).getTo());
1243 // mappings for dna1 - cds3 - pep3
1244 assertSame(cds.get(2).getDatasetSequence(), dnaMappings.get(2)
1246 assertEquals("T(4) in CDS should map to T(10) in DNA", 10, dnaMappings
1247 .get(2).getMap().getToPosition(4));
1248 peptideMappings = cdsMapping.getMappingsFromSequence(cds.get(2)
1249 .getDatasetSequence());
1250 assertEquals(1, peptideMappings.size());
1251 assertSame(pep3.getDatasetSequence(), peptideMappings.get(0).getTo());
1254 @Test(groups = { "Functional" })
1255 public void testIsMappable()
1257 SequenceI dna1 = new Sequence("dna1", "cgCAGtgGT");
1258 SequenceI aa1 = new Sequence("aa1", "RSG");
1259 AlignmentI al1 = new Alignment(new SequenceI[] { dna1 });
1260 AlignmentI al2 = new Alignment(new SequenceI[] { aa1 });
1262 assertFalse(AlignmentUtils.isMappable(null, null));
1263 assertFalse(AlignmentUtils.isMappable(al1, null));
1264 assertFalse(AlignmentUtils.isMappable(null, al1));
1265 assertFalse(AlignmentUtils.isMappable(al1, al1));
1266 assertFalse(AlignmentUtils.isMappable(al2, al2));
1268 assertTrue(AlignmentUtils.isMappable(al1, al2));
1269 assertTrue(AlignmentUtils.isMappable(al2, al1));
1273 * Test creating a mapping when the sequences involved do not start at residue
1276 * @throws IOException
1278 @Test(groups = { "Functional" })
1279 public void testMapCdnaToProtein_forSubsequence()
1282 SequenceI prot = new Sequence("UNIPROT|V12345", "E-I--Q", 10, 12);
1283 prot.createDatasetSequence();
1285 SequenceI dna = new Sequence("EMBL|A33333", "GAA--AT-C-CAG", 40, 48);
1286 dna.createDatasetSequence();
1288 MapList map = AlignmentUtils.mapCdnaToProtein(prot, dna);
1289 assertEquals(10, map.getToLowest());
1290 assertEquals(12, map.getToHighest());
1291 assertEquals(40, map.getFromLowest());
1292 assertEquals(48, map.getFromHighest());
1296 * Test for the alignSequenceAs method where we have protein mapped to protein
1298 @Test(groups = { "Functional" })
1299 public void testAlignSequenceAs_mappedProteinProtein()
1302 SequenceI alignMe = new Sequence("Match", "MGAASEV");
1303 alignMe.createDatasetSequence();
1304 SequenceI alignFrom = new Sequence("Query", "LQTGYMGAASEVMFSPTRR");
1305 alignFrom.createDatasetSequence();
1307 AlignedCodonFrame acf = new AlignedCodonFrame();
1308 // this is like a domain or motif match of part of a peptide sequence
1309 MapList map = new MapList(new int[] { 6, 12 }, new int[] { 1, 7 }, 1, 1);
1310 acf.addMap(alignFrom.getDatasetSequence(),
1311 alignMe.getDatasetSequence(), map);
1313 AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "-", '-', true,
1315 assertEquals("-----MGAASEV-------", alignMe.getSequenceAsString());
1319 * Test for the alignSequenceAs method where there are trailing unmapped
1320 * residues in the model sequence
1322 @Test(groups = { "Functional" })
1323 public void testAlignSequenceAs_withTrailingPeptide()
1325 // map first 3 codons to KPF; G is a trailing unmapped residue
1326 MapList map = new MapList(new int[] { 1, 9 }, new int[] { 1, 3 }, 3, 1);
1328 checkAlignSequenceAs("AAACCCTTT", "K-PFG", true, true, map,
1333 * Tests for transferring features between mapped sequences
1335 @Test(groups = { "Functional" })
1336 public void testTransferFeatures()
1338 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1339 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1342 dna.addSequenceFeature(new SequenceFeature("type1", "desc1", 1, 2, 1f,
1344 // partial overlap - to [1, 1]
1345 dna.addSequenceFeature(new SequenceFeature("type2", "desc2", 3, 4, 2f,
1347 // exact overlap - to [1, 3]
1348 dna.addSequenceFeature(new SequenceFeature("type3", "desc3", 4, 6, 3f,
1350 // spanning overlap - to [2, 5]
1351 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1353 // exactly overlaps whole mapped range [1, 6]
1354 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1356 // no overlap (internal)
1357 dna.addSequenceFeature(new SequenceFeature("type6", "desc6", 7, 9, 6f,
1359 // no overlap (3' end)
1360 dna.addSequenceFeature(new SequenceFeature("type7", "desc7", 13, 15,
1362 // overlap (3' end) - to [6, 6]
1363 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1365 // extended overlap - to [6, +]
1366 dna.addSequenceFeature(new SequenceFeature("type9", "desc9", 12, 13,
1369 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1370 new int[] { 1, 6 }, 1, 1);
1373 * transferFeatures() will build 'partial overlap' for regions
1374 * that partially overlap 5' or 3' (start or end) of target sequence
1376 AlignmentUtils.transferFeatures(dna, cds, map, null);
1377 SequenceFeature[] sfs = cds.getSequenceFeatures();
1378 assertEquals(6, sfs.length);
1380 SequenceFeature sf = sfs[0];
1381 assertEquals("type2", sf.getType());
1382 assertEquals("desc2", sf.getDescription());
1383 assertEquals(2f, sf.getScore());
1384 assertEquals(1, sf.getBegin());
1385 assertEquals(1, sf.getEnd());
1388 assertEquals("type3", sf.getType());
1389 assertEquals("desc3", sf.getDescription());
1390 assertEquals(3f, sf.getScore());
1391 assertEquals(1, sf.getBegin());
1392 assertEquals(3, sf.getEnd());
1395 assertEquals("type4", sf.getType());
1396 assertEquals(2, sf.getBegin());
1397 assertEquals(5, sf.getEnd());
1400 assertEquals("type5", sf.getType());
1401 assertEquals(1, sf.getBegin());
1402 assertEquals(6, sf.getEnd());
1405 assertEquals("type8", sf.getType());
1406 assertEquals(6, sf.getBegin());
1407 assertEquals(6, sf.getEnd());
1410 assertEquals("type9", sf.getType());
1411 assertEquals(6, sf.getBegin());
1412 assertEquals(6, sf.getEnd());
1416 * Tests for transferring features between mapped sequences
1418 @Test(groups = { "Functional" })
1419 public void testTransferFeatures_withOmit()
1421 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1422 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1424 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1425 new int[] { 1, 6 }, 1, 1);
1427 // [5, 11] maps to [2, 5]
1428 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1430 // [4, 12] maps to [1, 6]
1431 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1433 // [12, 12] maps to [6, 6]
1434 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1437 // desc4 and desc8 are the 'omit these' varargs
1438 AlignmentUtils.transferFeatures(dna, cds, map, null, "type4", "type8");
1439 SequenceFeature[] sfs = cds.getSequenceFeatures();
1440 assertEquals(1, sfs.length);
1442 SequenceFeature sf = sfs[0];
1443 assertEquals("type5", sf.getType());
1444 assertEquals(1, sf.getBegin());
1445 assertEquals(6, sf.getEnd());
1449 * Tests for transferring features between mapped sequences
1451 @Test(groups = { "Functional" })
1452 public void testTransferFeatures_withSelect()
1454 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1455 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1457 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1458 new int[] { 1, 6 }, 1, 1);
1460 // [5, 11] maps to [2, 5]
1461 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1463 // [4, 12] maps to [1, 6]
1464 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1466 // [12, 12] maps to [6, 6]
1467 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1470 // "type5" is the 'select this type' argument
1471 AlignmentUtils.transferFeatures(dna, cds, map, "type5");
1472 SequenceFeature[] sfs = cds.getSequenceFeatures();
1473 assertEquals(1, sfs.length);
1475 SequenceFeature sf = sfs[0];
1476 assertEquals("type5", sf.getType());
1477 assertEquals(1, sf.getBegin());
1478 assertEquals(6, sf.getEnd());
1482 * Test the method that extracts the cds-only part of a dna alignment, for the
1483 * case where the cds should be aligned to match its nucleotide sequence.
1485 @Test(groups = { "Functional" })
1486 public void testMakeCdsAlignment_alternativeTranscripts()
1488 SequenceI dna1 = new Sequence("dna1", "aaaGGGCC-----CTTTaaaGGG");
1489 // alternative transcript of same dna skips CCC codon
1490 SequenceI dna2 = new Sequence("dna2", "aaaGGGCC-----cttTaaaGGG");
1491 // dna3 has no mapping (protein product) so should be ignored here
1492 SequenceI dna3 = new Sequence("dna3", "aaaGGGCCCCCGGGcttTaaaGGG");
1493 SequenceI pep1 = new Sequence("pep1", "GPFG");
1494 SequenceI pep2 = new Sequence("pep2", "GPG");
1495 dna1.createDatasetSequence();
1496 dna2.createDatasetSequence();
1497 dna3.createDatasetSequence();
1498 pep1.createDatasetSequence();
1499 pep2.createDatasetSequence();
1500 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 8, 0f,
1502 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 9, 12, 0f,
1504 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds3", 16, 18, 0f,
1506 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 4, 8, 0f,
1508 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 12, 12, 0f,
1510 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 16, 18, 0f,
1513 List<AlignedCodonFrame> mappings = new ArrayList<AlignedCodonFrame>();
1514 MapList map = new MapList(new int[] { 4, 12, 16, 18 },
1515 new int[] { 1, 4 }, 3, 1);
1516 AlignedCodonFrame acf = new AlignedCodonFrame();
1517 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1519 map = new MapList(new int[] { 4, 8, 12, 12, 16, 18 },
1522 acf = new AlignedCodonFrame();
1523 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
1526 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
1527 dna.setDataset(null);
1528 AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
1529 dna1, dna2, dna3 }, mappings, dna);
1530 List<SequenceI> cdsSeqs = cds.getSequences();
1531 assertEquals(2, cdsSeqs.size());
1532 assertEquals("GGGCCCTTTGGG", cdsSeqs.get(0).getSequenceAsString());
1533 assertEquals("GGGCCTGGG", cdsSeqs.get(1).getSequenceAsString());
1536 * verify shared, extended alignment dataset
1538 assertSame(dna.getDataset(), cds.getDataset());
1539 assertTrue(dna.getDataset().getSequences()
1540 .contains(cdsSeqs.get(0).getDatasetSequence()));
1541 assertTrue(dna.getDataset().getSequences()
1542 .contains(cdsSeqs.get(1).getDatasetSequence()));
1545 * Verify updated mappings
1547 List<AlignedCodonFrame> cdsMappings = cds.getCodonFrames();
1548 assertEquals(2, cdsMappings.size());
1551 * Mapping from pep1 to GGGTTT in first new CDS sequence
1553 List<AlignedCodonFrame> pep1Mapping = MappingUtils
1554 .findMappingsForSequence(pep1, cdsMappings);
1555 assertEquals(1, pep1Mapping.size());
1557 * maps GPFG to 1-3,4-6,7-9,10-12
1559 SearchResults sr = MappingUtils
1560 .buildSearchResults(pep1, 1, cdsMappings);
1561 assertEquals(1, sr.getResults().size());
1562 Match m = sr.getResults().get(0);
1563 assertEquals(cds.getSequenceAt(0).getDatasetSequence(),
1565 assertEquals(1, m.getStart());
1566 assertEquals(3, m.getEnd());
1567 sr = MappingUtils.buildSearchResults(pep1, 2, cdsMappings);
1568 m = sr.getResults().get(0);
1569 assertEquals(4, m.getStart());
1570 assertEquals(6, m.getEnd());
1571 sr = MappingUtils.buildSearchResults(pep1, 3, cdsMappings);
1572 m = sr.getResults().get(0);
1573 assertEquals(7, m.getStart());
1574 assertEquals(9, m.getEnd());
1575 sr = MappingUtils.buildSearchResults(pep1, 4, cdsMappings);
1576 m = sr.getResults().get(0);
1577 assertEquals(10, m.getStart());
1578 assertEquals(12, m.getEnd());
1581 * GPG in pep2 map to 1-3,4-6,7-9 in second CDS sequence
1583 List<AlignedCodonFrame> pep2Mapping = MappingUtils
1584 .findMappingsForSequence(pep2, cdsMappings);
1585 assertEquals(1, pep2Mapping.size());
1586 sr = MappingUtils.buildSearchResults(pep2, 1, cdsMappings);
1587 assertEquals(1, sr.getResults().size());
1588 m = sr.getResults().get(0);
1589 assertEquals(cds.getSequenceAt(1).getDatasetSequence(),
1591 assertEquals(1, m.getStart());
1592 assertEquals(3, m.getEnd());
1593 sr = MappingUtils.buildSearchResults(pep2, 2, cdsMappings);
1594 m = sr.getResults().get(0);
1595 assertEquals(4, m.getStart());
1596 assertEquals(6, m.getEnd());
1597 sr = MappingUtils.buildSearchResults(pep2, 3, cdsMappings);
1598 m = sr.getResults().get(0);
1599 assertEquals(7, m.getStart());
1600 assertEquals(9, m.getEnd());
1604 * Test the method that realigns protein to match mapped codon alignment.
1606 @Test(groups = { "Functional" })
1607 public void testAlignProteinAsDna_incompleteStartCodon()
1609 // seq1: incomplete start codon (not mapped), then [3, 11]
1610 SequenceI dna1 = new Sequence("Seq1", "ccAAA-TTT-GGG-");
1611 // seq2 codons are [4, 5], [8, 11]
1612 SequenceI dna2 = new Sequence("Seq2", "ccaAA-ttT-GGG-");
1613 // seq3 incomplete start codon at 'tt'
1614 SequenceI dna3 = new Sequence("Seq3", "ccaaa-ttt-GGG-");
1615 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
1616 dna.setDataset(null);
1618 // prot1 has 'X' for incomplete start codon (not mapped)
1619 SequenceI prot1 = new Sequence("Seq1", "XKFG"); // X for incomplete start
1620 SequenceI prot2 = new Sequence("Seq2", "NG");
1621 SequenceI prot3 = new Sequence("Seq3", "XG"); // X for incomplete start
1622 AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
1624 protein.setDataset(null);
1626 // map dna1 [3, 11] to prot1 [2, 4] KFG
1627 MapList map = new MapList(new int[] { 3, 11 }, new int[] { 2, 4 }, 3, 1);
1628 AlignedCodonFrame acf = new AlignedCodonFrame();
1629 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
1631 // map dna2 [4, 5] [8, 11] to prot2 [1, 2] NG
1632 map = new MapList(new int[] { 4, 5, 8, 11 }, new int[] { 1, 2 }, 3, 1);
1633 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
1635 // map dna3 [9, 11] to prot3 [2, 2] G
1636 map = new MapList(new int[] { 9, 11 }, new int[] { 2, 2 }, 3, 1);
1637 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
1639 ArrayList<AlignedCodonFrame> acfs = new ArrayList<AlignedCodonFrame>();
1641 protein.setCodonFrames(acfs);
1644 * verify X is included in the aligned proteins, and placed just
1645 * before the first mapped residue
1646 * CCT is between CCC and TTT
1648 AlignmentUtils.alignProteinAsDna(protein, dna);
1649 assertEquals("XK-FG", prot1.getSequenceAsString());
1650 assertEquals("--N-G", prot2.getSequenceAsString());
1651 assertEquals("---XG", prot3.getSequenceAsString());
1655 * Tests for the method that maps the subset of a dna sequence that has CDS
1656 * (or subtype) feature - case where the start codon is incomplete.
1658 @Test(groups = "Functional")
1659 public void testFindCdsPositions_fivePrimeIncomplete()
1661 SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
1662 dnaSeq.createDatasetSequence();
1663 SequenceI ds = dnaSeq.getDatasetSequence();
1665 // CDS for dna 5-6 (incomplete codon), 7-9
1666 SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
1667 sf.setPhase("2"); // skip 2 bases to start of next codon
1668 ds.addSequenceFeature(sf);
1669 // CDS for dna 13-15
1670 sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
1671 ds.addSequenceFeature(sf);
1673 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1676 * check the mapping starts with the first complete codon
1678 assertEquals(6, MappingUtils.getLength(ranges));
1679 assertEquals(2, ranges.size());
1680 assertEquals(7, ranges.get(0)[0]);
1681 assertEquals(9, ranges.get(0)[1]);
1682 assertEquals(13, ranges.get(1)[0]);
1683 assertEquals(15, ranges.get(1)[1]);
1687 * Tests for the method that maps the subset of a dna sequence that has CDS
1688 * (or subtype) feature.
1690 @Test(groups = "Functional")
1691 public void testFindCdsPositions()
1693 SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
1694 dnaSeq.createDatasetSequence();
1695 SequenceI ds = dnaSeq.getDatasetSequence();
1697 // CDS for dna 10-12
1698 SequenceFeature sf = new SequenceFeature("CDS_predicted", "", 10, 12,
1701 ds.addSequenceFeature(sf);
1703 sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
1705 ds.addSequenceFeature(sf);
1706 // exon feature should be ignored here
1707 sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
1708 ds.addSequenceFeature(sf);
1710 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1712 * verify ranges { [4-6], [12-10] }
1713 * note CDS ranges are ordered ascending even if the CDS
1716 assertEquals(6, MappingUtils.getLength(ranges));
1717 assertEquals(2, ranges.size());
1718 assertEquals(4, ranges.get(0)[0]);
1719 assertEquals(6, ranges.get(0)[1]);
1720 assertEquals(10, ranges.get(1)[0]);
1721 assertEquals(12, ranges.get(1)[1]);
1725 * Test the method that computes a map of codon variants for each protein
1726 * position from "sequence_variant" features on dna
1728 @Test(groups = "Functional")
1729 public void testBuildDnaVariantsMap()
1731 SequenceI dna = new Sequence("dna", "atgAAATTTGGGCCCtag");
1732 MapList map = new MapList(new int[] { 1, 18 }, new int[] { 1, 5 }, 3, 1);
1735 * first with no variants on dna
1737 LinkedHashMap<Integer, List<DnaVariant>[]> variantsMap = AlignmentUtils
1738 .buildDnaVariantsMap(dna, map);
1739 assertTrue(variantsMap.isEmpty());
1742 * single allele codon 1, on base 1
1744 SequenceFeature sf1 = new SequenceFeature("sequence_variant", "", 1, 1,
1746 sf1.setValue("alleles", "T");
1747 sf1.setValue("ID", "sequence_variant:rs758803211");
1748 dna.addSequenceFeature(sf1);
1751 * two alleles codon 2, on bases 2 and 3 (distinct variants)
1753 SequenceFeature sf2 = new SequenceFeature("sequence_variant", "", 5, 5,
1755 sf2.setValue("alleles", "T");
1756 sf2.setValue("ID", "sequence_variant:rs758803212");
1757 dna.addSequenceFeature(sf2);
1758 SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 6, 6,
1760 sf3.setValue("alleles", "G");
1761 sf3.setValue("ID", "sequence_variant:rs758803213");
1762 dna.addSequenceFeature(sf3);
1765 * two alleles codon 3, both on base 2 (one variant)
1767 SequenceFeature sf4 = new SequenceFeature("sequence_variant", "", 8, 8,
1769 sf4.setValue("alleles", "C, G");
1770 sf4.setValue("ID", "sequence_variant:rs758803214");
1771 dna.addSequenceFeature(sf4);
1773 // no alleles on codon 4
1776 * alleles on codon 5 on all 3 bases (distinct variants)
1778 SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 13,
1780 sf5.setValue("alleles", "C, G"); // (C duplicates given base value)
1781 sf5.setValue("ID", "sequence_variant:rs758803215");
1782 dna.addSequenceFeature(sf5);
1783 SequenceFeature sf6 = new SequenceFeature("sequence_variant", "", 14,
1785 sf6.setValue("alleles", "g, a"); // should force to upper-case
1786 sf6.setValue("ID", "sequence_variant:rs758803216");
1787 dna.addSequenceFeature(sf6);
1788 SequenceFeature sf7 = new SequenceFeature("sequence_variant", "", 15,
1790 sf7.setValue("alleles", "A, T");
1791 sf7.setValue("ID", "sequence_variant:rs758803217");
1792 dna.addSequenceFeature(sf7);
1795 * build map - expect variants on positions 1, 2, 3, 5
1797 variantsMap = AlignmentUtils.buildDnaVariantsMap(dna, map);
1798 assertEquals(4, variantsMap.size());
1801 * protein residue 1: variant on codon (ATG) base 1, not on 2 or 3
1803 List<DnaVariant>[] pep1Variants = variantsMap.get(1);
1804 assertEquals(3, pep1Variants.length);
1805 assertEquals(1, pep1Variants[0].size());
1806 assertEquals("A", pep1Variants[0].get(0).base); // codon[1] base
1807 assertSame(sf1, pep1Variants[0].get(0).variant); // codon[1] variant
1808 assertEquals(1, pep1Variants[1].size());
1809 assertEquals("T", pep1Variants[1].get(0).base); // codon[2] base
1810 assertNull(pep1Variants[1].get(0).variant); // no variant here
1811 assertEquals(1, pep1Variants[2].size());
1812 assertEquals("G", pep1Variants[2].get(0).base); // codon[3] base
1813 assertNull(pep1Variants[2].get(0).variant); // no variant here
1816 * protein residue 2: variants on codon (AAA) bases 2 and 3
1818 List<DnaVariant>[] pep2Variants = variantsMap.get(2);
1819 assertEquals(3, pep2Variants.length);
1820 assertEquals(1, pep2Variants[0].size());
1821 // codon[1] base recorded while processing variant on codon[2]
1822 assertEquals("A", pep2Variants[0].get(0).base);
1823 assertNull(pep2Variants[0].get(0).variant); // no variant here
1824 // codon[2] base and variant:
1825 assertEquals(1, pep2Variants[1].size());
1826 assertEquals("A", pep2Variants[1].get(0).base);
1827 assertSame(sf2, pep2Variants[1].get(0).variant);
1828 // codon[3] base was recorded when processing codon[2] variant
1829 // and then the variant for codon[3] added to it
1830 assertEquals(1, pep2Variants[2].size());
1831 assertEquals("A", pep2Variants[2].get(0).base);
1832 assertSame(sf3, pep2Variants[2].get(0).variant);
1835 * protein residue 3: variants on codon (TTT) base 2 only
1837 List<DnaVariant>[] pep3Variants = variantsMap.get(3);
1838 assertEquals(3, pep3Variants.length);
1839 assertEquals(1, pep3Variants[0].size());
1840 assertEquals("T", pep3Variants[0].get(0).base); // codon[1] base
1841 assertNull(pep3Variants[0].get(0).variant); // no variant here
1842 assertEquals(1, pep3Variants[1].size());
1843 assertEquals("T", pep3Variants[1].get(0).base); // codon[2] base
1844 assertSame(sf4, pep3Variants[1].get(0).variant); // codon[2] variant
1845 assertEquals(1, pep3Variants[2].size());
1846 assertEquals("T", pep3Variants[2].get(0).base); // codon[3] base
1847 assertNull(pep3Variants[2].get(0).variant); // no variant here
1850 * three variants on protein position 5
1852 List<DnaVariant>[] pep5Variants = variantsMap.get(5);
1853 assertEquals(3, pep5Variants.length);
1854 assertEquals(1, pep5Variants[0].size());
1855 assertEquals("C", pep5Variants[0].get(0).base); // codon[1] base
1856 assertSame(sf5, pep5Variants[0].get(0).variant); // codon[1] variant
1857 assertEquals(1, pep5Variants[1].size());
1858 assertEquals("C", pep5Variants[1].get(0).base); // codon[2] base
1859 assertSame(sf6, pep5Variants[1].get(0).variant); // codon[2] variant
1860 assertEquals(1, pep5Variants[2].size());
1861 assertEquals("C", pep5Variants[2].get(0).base); // codon[3] base
1862 assertSame(sf7, pep5Variants[2].get(0).variant); // codon[3] variant
1866 * Tests for the method that computes all peptide variants given codon
1869 @Test(groups = "Functional")
1870 public void testComputePeptideVariants()
1873 * scenario: AAATTTCCC codes for KFP, with variants
1879 * CAC,CGC -> H,R (as one variant)
1881 SequenceI peptide = new Sequence("pep/10-12", "KFP");
1884 * two distinct variants for codon 1 position 1
1885 * second one has clinical significance
1887 SequenceFeature sf1 = new SequenceFeature("sequence_variant", "", 1, 1,
1889 sf1.setValue("alleles", "A,G"); // GAA -> E
1890 sf1.setValue("ID", "var1.125A>G");
1891 SequenceFeature sf2 = new SequenceFeature("sequence_variant", "", 1, 1,
1893 sf2.setValue("alleles", "A,C"); // CAA -> Q
1894 sf2.setValue("ID", "var2");
1895 sf2.setValue("clinical_significance", "Dodgy");
1896 SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 3, 3,
1898 sf3.setValue("alleles", "A,G"); // synonymous
1899 sf3.setValue("ID", "var3");
1900 sf3.setValue("clinical_significance", "None");
1901 SequenceFeature sf4 = new SequenceFeature("sequence_variant", "", 3, 3,
1903 sf4.setValue("alleles", "A,T"); // AAT -> N
1904 sf4.setValue("ID", "sequence_variant:var4"); // prefix gets stripped off
1905 sf4.setValue("clinical_significance", "Benign");
1906 SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 6, 6,
1908 sf5.setValue("alleles", "T,C"); // synonymous
1909 sf5.setValue("ID", "var5");
1910 sf5.setValue("clinical_significance", "Bad");
1911 SequenceFeature sf6 = new SequenceFeature("sequence_variant", "", 8, 8,
1913 sf6.setValue("alleles", "C,A,G"); // CAC,CGC -> H,R
1914 sf6.setValue("ID", "var6");
1915 sf6.setValue("clinical_significance", "Good");
1917 List<DnaVariant> codon1Variants = new ArrayList<DnaVariant>();
1918 List<DnaVariant> codon2Variants = new ArrayList<DnaVariant>();
1919 List<DnaVariant> codon3Variants = new ArrayList<DnaVariant>();
1920 List<DnaVariant> codonVariants[] = new ArrayList[3];
1921 codonVariants[0] = codon1Variants;
1922 codonVariants[1] = codon2Variants;
1923 codonVariants[2] = codon3Variants;
1926 * compute variants for protein position 1
1928 codon1Variants.add(new DnaVariant("A", sf1));
1929 codon1Variants.add(new DnaVariant("A", sf2));
1930 codon2Variants.add(new DnaVariant("A"));
1931 codon2Variants.add(new DnaVariant("A"));
1932 codon3Variants.add(new DnaVariant("A", sf3));
1933 codon3Variants.add(new DnaVariant("A", sf4));
1934 AlignmentUtils.computePeptideVariants(peptide, 1, codonVariants);
1937 * compute variants for protein position 2
1939 codon1Variants.clear();
1940 codon2Variants.clear();
1941 codon3Variants.clear();
1942 codon1Variants.add(new DnaVariant("T"));
1943 codon2Variants.add(new DnaVariant("T"));
1944 codon3Variants.add(new DnaVariant("T", sf5));
1945 AlignmentUtils.computePeptideVariants(peptide, 2, codonVariants);
1948 * compute variants for protein position 3
1950 codon1Variants.clear();
1951 codon2Variants.clear();
1952 codon3Variants.clear();
1953 codon1Variants.add(new DnaVariant("C"));
1954 codon2Variants.add(new DnaVariant("C", sf6));
1955 codon3Variants.add(new DnaVariant("C"));
1956 AlignmentUtils.computePeptideVariants(peptide, 3, codonVariants);
1959 * verify added sequence features for
1966 SequenceFeature[] sfs = peptide.getSequenceFeatures();
1967 assertEquals(5, sfs.length);
1968 SequenceFeature sf = sfs[0];
1969 assertEquals(1, sf.getBegin());
1970 assertEquals(1, sf.getEnd());
1971 assertEquals("p.Lys1Glu", sf.getDescription());
1972 assertEquals("var1.125A>G", sf.getValue("ID"));
1973 assertNull(sf.getValue("clinical_significance"));
1974 assertEquals("ID=var1.125A>G", sf.getAttributes());
1975 assertEquals(1, sf.links.size());
1976 // link to variation is urlencoded
1978 "p.Lys1Glu var1.125A>G|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var1.125A%3EG",
1980 assertEquals("Jalview", sf.getFeatureGroup());
1982 assertEquals(1, sf.getBegin());
1983 assertEquals(1, sf.getEnd());
1984 assertEquals("p.Lys1Gln", sf.getDescription());
1985 assertEquals("var2", sf.getValue("ID"));
1986 assertEquals("Dodgy", sf.getValue("clinical_significance"));
1987 assertEquals("ID=var2;clinical_significance=Dodgy", sf.getAttributes());
1988 assertEquals(1, sf.links.size());
1990 "p.Lys1Gln var2|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var2",
1992 assertEquals("Jalview", sf.getFeatureGroup());
1994 assertEquals(1, sf.getBegin());
1995 assertEquals(1, sf.getEnd());
1996 assertEquals("p.Lys1Asn", sf.getDescription());
1997 assertEquals("var4", sf.getValue("ID"));
1998 assertEquals("Benign", sf.getValue("clinical_significance"));
1999 assertEquals("ID=var4;clinical_significance=Benign", sf.getAttributes());
2000 assertEquals(1, sf.links.size());
2002 "p.Lys1Asn var4|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var4",
2004 assertEquals("Jalview", sf.getFeatureGroup());
2006 assertEquals(3, sf.getBegin());
2007 assertEquals(3, sf.getEnd());
2008 assertEquals("p.Pro3His", sf.getDescription());
2009 assertEquals("var6", sf.getValue("ID"));
2010 assertEquals("Good", sf.getValue("clinical_significance"));
2011 assertEquals("ID=var6;clinical_significance=Good", sf.getAttributes());
2012 assertEquals(1, sf.links.size());
2014 "p.Pro3His var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6",
2016 // var5 generates two distinct protein variant features
2017 assertEquals("Jalview", sf.getFeatureGroup());
2019 assertEquals(3, sf.getBegin());
2020 assertEquals(3, sf.getEnd());
2021 assertEquals("p.Pro3Arg", sf.getDescription());
2022 assertEquals("var6", sf.getValue("ID"));
2023 assertEquals("Good", sf.getValue("clinical_significance"));
2024 assertEquals("ID=var6;clinical_significance=Good", sf.getAttributes());
2025 assertEquals(1, sf.links.size());
2027 "p.Pro3Arg var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6",
2029 assertEquals("Jalview", sf.getFeatureGroup());
2033 * Tests for the method that maps the subset of a dna sequence that has CDS
2034 * (or subtype) feature, with CDS strand = '-' (reverse)
2036 // test turned off as currently findCdsPositions is not strand-dependent
2037 // left in case it comes around again...
2038 @Test(groups = "Functional", enabled = false)
2039 public void testFindCdsPositions_reverseStrand()
2041 SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
2042 dnaSeq.createDatasetSequence();
2043 SequenceI ds = dnaSeq.getDatasetSequence();
2046 SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
2048 ds.addSequenceFeature(sf);
2049 // exon feature should be ignored here
2050 sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
2051 ds.addSequenceFeature(sf);
2052 // CDS for dna 10-12
2053 sf = new SequenceFeature("CDS_predicted", "", 10, 12, 0f, null);
2055 ds.addSequenceFeature(sf);
2057 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
2059 * verify ranges { [12-10], [6-4] }
2061 assertEquals(6, MappingUtils.getLength(ranges));
2062 assertEquals(2, ranges.size());
2063 assertEquals(12, ranges.get(0)[0]);
2064 assertEquals(10, ranges.get(0)[1]);
2065 assertEquals(6, ranges.get(1)[0]);
2066 assertEquals(4, ranges.get(1)[1]);
2070 * Tests for the method that maps the subset of a dna sequence that has CDS
2071 * (or subtype) feature - reverse strand case where the start codon is
2074 @Test(groups = "Functional", enabled = false)
2075 // test turned off as currently findCdsPositions is not strand-dependent
2076 // left in case it comes around again...
2077 public void testFindCdsPositions_reverseStrandThreePrimeIncomplete()
2079 SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
2080 dnaSeq.createDatasetSequence();
2081 SequenceI ds = dnaSeq.getDatasetSequence();
2084 SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
2086 ds.addSequenceFeature(sf);
2087 // CDS for dna 13-15
2088 sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
2090 sf.setPhase("2"); // skip 2 bases to start of next codon
2091 ds.addSequenceFeature(sf);
2093 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
2096 * check the mapping starts with the first complete codon
2097 * expect ranges [13, 13], [9, 5]
2099 assertEquals(6, MappingUtils.getLength(ranges));
2100 assertEquals(2, ranges.size());
2101 assertEquals(13, ranges.get(0)[0]);
2102 assertEquals(13, ranges.get(0)[1]);
2103 assertEquals(9, ranges.get(1)[0]);
2104 assertEquals(5, ranges.get(1)[1]);
2107 @Test(groups = "Functional")
2108 public void testAlignAs_alternateTranscriptsUngapped()
2110 SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa");
2111 SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA");
2112 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
2113 ((Alignment) dna).createDatasetAlignment();
2114 SequenceI cds1 = new Sequence("cds1", "GGGTTT");
2115 SequenceI cds2 = new Sequence("cds2", "CCCAAA");
2116 AlignmentI cds = new Alignment(new SequenceI[] { cds1, cds2 });
2117 ((Alignment) cds).createDatasetAlignment();
2119 AlignedCodonFrame acf = new AlignedCodonFrame();
2120 MapList map = new MapList(new int[] { 4, 9 }, new int[] { 1, 6 }, 1, 1);
2121 acf.addMap(dna1.getDatasetSequence(), cds1.getDatasetSequence(), map);
2122 map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 6 }, 1, 1);
2123 acf.addMap(dna2.getDatasetSequence(), cds2.getDatasetSequence(), map);
2126 * verify CDS alignment is as:
2127 * cccGGGTTTaaa (cdna)
2128 * CCCgggtttAAA (cdna)
2130 * ---GGGTTT--- (cds)
2131 * CCC------AAA (cds)
2133 dna.addCodonFrame(acf);
2134 AlignmentUtils.alignAs(cds, dna);
2135 assertEquals("---GGGTTT", cds.getSequenceAt(0).getSequenceAsString());
2136 assertEquals("CCC------AAA", cds.getSequenceAt(1).getSequenceAsString());
2139 @Test(groups = { "Functional" })
2140 public void testAddMappedPositions()
2142 SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g");
2143 SequenceI seq1 = new Sequence("cds", "AAATTT");
2144 from.createDatasetSequence();
2145 seq1.createDatasetSequence();
2146 Mapping mapping = new Mapping(seq1, new MapList(
2147 new int[] { 3, 6, 9, 10 },
2148 new int[] { 1, 6 }, 1, 1));
2149 Map<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
2150 AlignmentUtils.addMappedPositions(seq1, from, mapping, map);
2153 * verify map has seq1 residues in columns 3,4,6,7,11,12
2155 assertEquals(6, map.size());
2156 assertEquals('A', map.get(3).get(seq1).charValue());
2157 assertEquals('A', map.get(4).get(seq1).charValue());
2158 assertEquals('A', map.get(6).get(seq1).charValue());
2159 assertEquals('T', map.get(7).get(seq1).charValue());
2160 assertEquals('T', map.get(11).get(seq1).charValue());
2161 assertEquals('T', map.get(12).get(seq1).charValue());
2169 * Test case where the mapping 'from' range includes a stop codon which is
2170 * absent in the 'to' range
2172 @Test(groups = { "Functional" })
2173 public void testAddMappedPositions_withStopCodon()
2175 SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g");
2176 SequenceI seq1 = new Sequence("cds", "AAATTT");
2177 from.createDatasetSequence();
2178 seq1.createDatasetSequence();
2179 Mapping mapping = new Mapping(seq1, new MapList(
2180 new int[] { 3, 6, 9, 10 },
2181 new int[] { 1, 6 }, 1, 1));
2182 Map<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
2183 AlignmentUtils.addMappedPositions(seq1, from, mapping, map);
2186 * verify map has seq1 residues in columns 3,4,6,7,11,12
2188 assertEquals(6, map.size());
2189 assertEquals('A', map.get(3).get(seq1).charValue());
2190 assertEquals('A', map.get(4).get(seq1).charValue());
2191 assertEquals('A', map.get(6).get(seq1).charValue());
2192 assertEquals('T', map.get(7).get(seq1).charValue());
2193 assertEquals('T', map.get(11).get(seq1).charValue());
2194 assertEquals('T', map.get(12).get(seq1).charValue());