2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import static org.testng.AssertJUnit.assertEquals;
24 import static org.testng.AssertJUnit.assertFalse;
25 import static org.testng.AssertJUnit.assertNull;
26 import static org.testng.AssertJUnit.assertSame;
27 import static org.testng.AssertJUnit.assertTrue;
29 import jalview.analysis.AlignmentUtils.DnaVariant;
30 import jalview.datamodel.AlignedCodonFrame;
31 import jalview.datamodel.Alignment;
32 import jalview.datamodel.AlignmentAnnotation;
33 import jalview.datamodel.AlignmentI;
34 import jalview.datamodel.Annotation;
35 import jalview.datamodel.DBRefEntry;
36 import jalview.datamodel.Mapping;
37 import jalview.datamodel.SearchResults;
38 import jalview.datamodel.SearchResults.Match;
39 import jalview.datamodel.Sequence;
40 import jalview.datamodel.SequenceFeature;
41 import jalview.datamodel.SequenceI;
42 import jalview.io.AppletFormatAdapter;
43 import jalview.io.FormatAdapter;
44 import jalview.util.MapList;
45 import jalview.util.MappingUtils;
47 import java.io.IOException;
48 import java.util.ArrayList;
49 import java.util.Arrays;
50 import java.util.Iterator;
51 import java.util.LinkedHashMap;
52 import java.util.List;
54 import java.util.TreeMap;
56 import org.testng.Assert;
57 import org.testng.annotations.Test;
59 public class AlignmentUtilsTests
61 public static Sequence ts = new Sequence("short",
62 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklm");
64 @Test(groups = { "Functional" })
65 public void testExpandContext()
67 AlignmentI al = new Alignment(new Sequence[] {});
68 for (int i = 4; i < 14; i += 2)
70 SequenceI s1 = ts.deriveSequence().getSubSequence(i, i + 7);
73 System.out.println(new AppletFormatAdapter().formatSequences("Clustal",
75 for (int flnk = -1; flnk < 25; flnk++)
77 AlignmentI exp = AlignmentUtils.expandContext(al, flnk);
78 System.out.println("\nFlank size: " + flnk);
79 System.out.println(new AppletFormatAdapter().formatSequences(
80 "Clustal", exp, true));
84 * Full expansion to complete sequences
86 for (SequenceI sq : exp.getSequences())
88 String ung = sq.getSequenceAsString().replaceAll("-+", "");
89 final String errorMsg = "Flanking sequence not the same as original dataset sequence.\n"
92 + sq.getDatasetSequence().getSequenceAsString();
93 assertTrue(errorMsg, ung.equalsIgnoreCase(sq.getDatasetSequence()
94 .getSequenceAsString()));
100 * Last sequence is fully expanded, others have leading gaps to match
102 assertTrue(exp.getSequenceAt(4).getSequenceAsString()
104 assertTrue(exp.getSequenceAt(3).getSequenceAsString()
105 .startsWith("--abc"));
106 assertTrue(exp.getSequenceAt(2).getSequenceAsString()
107 .startsWith("----abc"));
108 assertTrue(exp.getSequenceAt(1).getSequenceAsString()
109 .startsWith("------abc"));
110 assertTrue(exp.getSequenceAt(0).getSequenceAsString()
111 .startsWith("--------abc"));
117 * Test that annotations are correctly adjusted by expandContext
119 @Test(groups = { "Functional" })
120 public void testExpandContext_annotation()
122 AlignmentI al = new Alignment(new Sequence[] {});
123 SequenceI ds = new Sequence("Seq1", "ABCDEFGHI");
125 SequenceI seq1 = ds.deriveSequence().getSubSequence(3, 6);
126 al.addSequence(seq1);
129 * Annotate DEF with 4/5/6 respectively
131 Annotation[] anns = new Annotation[] { new Annotation(4),
132 new Annotation(5), new Annotation(6) };
133 AlignmentAnnotation ann = new AlignmentAnnotation("SS",
134 "secondary structure", anns);
135 seq1.addAlignmentAnnotation(ann);
138 * The annotations array should match aligned positions
140 assertEquals(3, ann.annotations.length);
141 assertEquals(4, ann.annotations[0].value, 0.001);
142 assertEquals(5, ann.annotations[1].value, 0.001);
143 assertEquals(6, ann.annotations[2].value, 0.001);
146 * Check annotation to sequence position mappings before expanding the
147 * sequence; these are set up in Sequence.addAlignmentAnnotation ->
148 * Annotation.setSequenceRef -> createSequenceMappings
150 assertNull(ann.getAnnotationForPosition(1));
151 assertNull(ann.getAnnotationForPosition(2));
152 assertNull(ann.getAnnotationForPosition(3));
153 assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
154 assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
155 assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
156 assertNull(ann.getAnnotationForPosition(7));
157 assertNull(ann.getAnnotationForPosition(8));
158 assertNull(ann.getAnnotationForPosition(9));
161 * Expand the subsequence to the full sequence abcDEFghi
163 AlignmentI expanded = AlignmentUtils.expandContext(al, -1);
164 assertEquals("abcDEFghi", expanded.getSequenceAt(0)
165 .getSequenceAsString());
168 * Confirm the alignment and sequence have the same SS annotation,
169 * referencing the expanded sequence
171 ann = expanded.getSequenceAt(0).getAnnotation()[0];
172 assertSame(ann, expanded.getAlignmentAnnotation()[0]);
173 assertSame(expanded.getSequenceAt(0), ann.sequenceRef);
176 * The annotations array should have null values except for annotated
179 assertNull(ann.annotations[0]);
180 assertNull(ann.annotations[1]);
181 assertNull(ann.annotations[2]);
182 assertEquals(4, ann.annotations[3].value, 0.001);
183 assertEquals(5, ann.annotations[4].value, 0.001);
184 assertEquals(6, ann.annotations[5].value, 0.001);
185 assertNull(ann.annotations[6]);
186 assertNull(ann.annotations[7]);
187 assertNull(ann.annotations[8]);
190 * sequence position mappings should be unchanged
192 assertNull(ann.getAnnotationForPosition(1));
193 assertNull(ann.getAnnotationForPosition(2));
194 assertNull(ann.getAnnotationForPosition(3));
195 assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
196 assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
197 assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
198 assertNull(ann.getAnnotationForPosition(7));
199 assertNull(ann.getAnnotationForPosition(8));
200 assertNull(ann.getAnnotationForPosition(9));
204 * Test method that returns a map of lists of sequences by sequence name.
206 * @throws IOException
208 @Test(groups = { "Functional" })
209 public void testGetSequencesByName() throws IOException
211 final String data = ">Seq1Name\nKQYL\n" + ">Seq2Name\nRFPW\n"
212 + ">Seq1Name\nABCD\n";
213 AlignmentI al = loadAlignment(data, "FASTA");
214 Map<String, List<SequenceI>> map = AlignmentUtils
215 .getSequencesByName(al);
216 assertEquals(2, map.keySet().size());
217 assertEquals(2, map.get("Seq1Name").size());
218 assertEquals("KQYL", map.get("Seq1Name").get(0).getSequenceAsString());
219 assertEquals("ABCD", map.get("Seq1Name").get(1).getSequenceAsString());
220 assertEquals(1, map.get("Seq2Name").size());
221 assertEquals("RFPW", map.get("Seq2Name").get(0).getSequenceAsString());
225 * Helper method to load an alignment and ensure dataset sequences are set up.
231 * @throws IOException
233 protected AlignmentI loadAlignment(final String data, String format)
236 AlignmentI a = new FormatAdapter().readFile(data,
237 AppletFormatAdapter.PASTE, format);
243 * Test mapping of protein to cDNA, for the case where we have no sequence
244 * cross-references, so mappings are made first-served 1-1 where sequences
247 * @throws IOException
249 @Test(groups = { "Functional" })
250 public void testMapProteinAlignmentToCdna_noXrefs() throws IOException
252 List<SequenceI> protseqs = new ArrayList<SequenceI>();
253 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
254 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
255 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
256 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
257 protein.setDataset(null);
259 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
260 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
261 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAA")); // = EIQ
262 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
263 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
264 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
265 cdna.setDataset(null);
267 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
269 // 3 mappings made, each from 1 to 1 sequence
270 assertEquals(3, protein.getCodonFrames().size());
271 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
272 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
273 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
275 // V12345 mapped to A22222
276 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
278 assertEquals(1, acf.getdnaSeqs().length);
279 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
280 acf.getdnaSeqs()[0]);
281 Mapping[] protMappings = acf.getProtMappings();
282 assertEquals(1, protMappings.length);
283 MapList mapList = protMappings[0].getMap();
284 assertEquals(3, mapList.getFromRatio());
285 assertEquals(1, mapList.getToRatio());
286 assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
288 assertEquals(1, mapList.getFromRanges().size());
289 assertTrue(Arrays.equals(new int[] { 1, 3 },
290 mapList.getToRanges().get(0)));
291 assertEquals(1, mapList.getToRanges().size());
293 // V12346 mapped to A33333
294 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
295 assertEquals(1, acf.getdnaSeqs().length);
296 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
297 acf.getdnaSeqs()[0]);
299 // V12347 mapped to A11111
300 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
301 assertEquals(1, acf.getdnaSeqs().length);
302 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
303 acf.getdnaSeqs()[0]);
305 // no mapping involving the 'extra' A44444
306 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
310 * Test for the alignSequenceAs method that takes two sequences and a mapping.
312 @Test(groups = { "Functional" })
313 public void testAlignSequenceAs_withMapping_noIntrons()
315 MapList map = new MapList(new int[] { 1, 6 }, new int[] { 1, 2 }, 3, 1);
318 * No existing gaps in dna:
320 checkAlignSequenceAs("GGGAAA", "-A-L-", false, false, map,
324 * Now introduce gaps in dna but ignore them when realigning.
326 checkAlignSequenceAs("-G-G-G-A-A-A-", "-A-L-", false, false, map,
330 * Now include gaps in dna when realigning. First retaining 'mapped' gaps
331 * only, i.e. those within the exon region.
333 checkAlignSequenceAs("-G-G--G-A--A-A-", "-A-L-", true, false, map,
334 "---G-G--G---A--A-A");
337 * Include all gaps in dna when realigning (within and without the exon
338 * region). The leading gap, and the gaps between codons, are subsumed by
339 * the protein alignment gap.
341 checkAlignSequenceAs("-G-GG--AA-A---", "-A-L-", true, true, map,
342 "---G-GG---AA-A---");
345 * Include only unmapped gaps in dna when realigning (outside the exon
346 * region). The leading gap, and the gaps between codons, are subsumed by
347 * the protein alignment gap.
349 checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", false, true, map,
354 * Test for the alignSequenceAs method that takes two sequences and a mapping.
356 @Test(groups = { "Functional" })
357 public void testAlignSequenceAs_withMapping_withIntrons()
360 * Exons at codon 2 (AAA) and 4 (TTT)
362 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
363 new int[] { 1, 2 }, 3, 1);
366 * Simple case: no gaps in dna
368 checkAlignSequenceAs("GGGAAACCCTTTGGG", "--A-L-", false, false, map,
369 "GGG---AAACCCTTTGGG");
372 * Add gaps to dna - but ignore when realigning.
374 checkAlignSequenceAs("-G-G-G--A--A---AC-CC-T-TT-GG-G-", "--A-L-",
375 false, false, map, "GGG---AAACCCTTTGGG");
378 * Add gaps to dna - include within exons only when realigning.
380 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
381 true, false, map, "GGG---A--A---ACCCT-TTGGG");
384 * Include gaps outside exons only when realigning.
386 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
387 false, true, map, "-G-G-GAAAC-CCTTT-GG-G-");
390 * Include gaps following first intron if we are 'preserving mapped gaps'
392 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
393 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
396 * Include all gaps in dna when realigning.
398 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
399 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
403 * Test for the case where not all of the protein sequence is mapped to cDNA.
405 @Test(groups = { "Functional" })
406 public void testAlignSequenceAs_withMapping_withUnmappedProtein()
409 * Exons at codon 2 (AAA) and 4 (TTT) mapped to A and P
411 final MapList map = new MapList(new int[] { 4, 6, 10, 12 }, new int[] {
415 * -L- 'aligns' ccc------
417 checkAlignSequenceAs("gggAAAcccTTTggg", "-A-L-P-", false, false, map,
418 "gggAAAccc------TTTggg");
422 * Helper method that performs and verifies the method under test.
425 * the sequence to be realigned
427 * the sequence whose alignment is to be copied
428 * @param preserveMappedGaps
429 * @param preserveUnmappedGaps
433 protected void checkAlignSequenceAs(final String alignee,
434 final String alignModel, final boolean preserveMappedGaps,
435 final boolean preserveUnmappedGaps, MapList map,
436 final String expected)
438 SequenceI alignMe = new Sequence("Seq1", alignee);
439 alignMe.createDatasetSequence();
440 SequenceI alignFrom = new Sequence("Seq2", alignModel);
441 alignFrom.createDatasetSequence();
442 AlignedCodonFrame acf = new AlignedCodonFrame();
443 acf.addMap(alignMe.getDatasetSequence(), alignFrom.getDatasetSequence(), map);
445 AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "---", '-',
446 preserveMappedGaps, preserveUnmappedGaps);
447 assertEquals(expected, alignMe.getSequenceAsString());
451 * Test for the alignSequenceAs method where we preserve gaps in introns only.
453 @Test(groups = { "Functional" })
454 public void testAlignSequenceAs_keepIntronGapsOnly()
458 * Intron GGGAAA followed by exon CCCTTT
460 MapList map = new MapList(new int[] { 7, 12 }, new int[] { 1, 2 }, 3, 1);
462 checkAlignSequenceAs("GG-G-AA-A-C-CC-T-TT", "AL", false, true, map,
467 * Test the method that realigns protein to match mapped codon alignment.
469 @Test(groups = { "Functional" })
470 public void testAlignProteinAsDna()
472 // seq1 codons are [1,2,3] [4,5,6] [7,8,9] [10,11,12]
473 SequenceI dna1 = new Sequence("Seq1", "TGCCATTACCAG-");
474 // seq2 codons are [1,3,4] [5,6,7] [8,9,10] [11,12,13]
475 SequenceI dna2 = new Sequence("Seq2", "T-GCCATTACCAG");
476 // seq3 codons are [1,2,3] [4,5,7] [8,9,10] [11,12,13]
477 SequenceI dna3 = new Sequence("Seq3", "TGCCA-TTACCAG");
478 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
479 dna.setDataset(null);
481 // protein alignment will be realigned like dna
482 SequenceI prot1 = new Sequence("Seq1", "CHYQ");
483 SequenceI prot2 = new Sequence("Seq2", "CHYQ");
484 SequenceI prot3 = new Sequence("Seq3", "CHYQ");
485 SequenceI prot4 = new Sequence("Seq4", "R-QSV"); // unmapped, unchanged
486 AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
488 protein.setDataset(null);
490 MapList map = new MapList(new int[] { 1, 12 }, new int[] { 1, 4 }, 3, 1);
491 AlignedCodonFrame acf = new AlignedCodonFrame();
492 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
493 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
494 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
495 ArrayList<AlignedCodonFrame> acfs = new ArrayList<AlignedCodonFrame>();
497 protein.setCodonFrames(acfs);
500 * Translated codon order is [1,2,3] [1,3,4] [4,5,6] [4,5,7] [5,6,7] [7,8,9]
501 * [8,9,10] [10,11,12] [11,12,13]
503 AlignmentUtils.alignProteinAsDna(protein, dna);
504 assertEquals("C-H--Y-Q-", prot1.getSequenceAsString());
505 assertEquals("-C--H-Y-Q", prot2.getSequenceAsString());
506 assertEquals("C--H--Y-Q", prot3.getSequenceAsString());
507 assertEquals("R-QSV", prot4.getSequenceAsString());
511 * Test the method that tests whether a CDNA sequence translates to a protein
514 @Test(groups = { "Functional" })
515 public void testTranslatesAs()
517 // null arguments check
518 assertFalse(AlignmentUtils.translatesAs(null, 0, null));
519 assertFalse(AlignmentUtils.translatesAs(new char[] { 't' }, 0, null));
520 assertFalse(AlignmentUtils.translatesAs(null, 0, new char[] { 'a' }));
522 // straight translation
523 assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0,
524 "FPKG".toCharArray()));
525 // with extra start codon (not in protein)
526 assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(),
527 3, "FPKG".toCharArray()));
528 // with stop codon1 (not in protein)
529 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
530 0, "FPKG".toCharArray()));
531 // with stop codon1 (in protein as *)
532 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
533 0, "FPKG*".toCharArray()));
534 // with stop codon2 (not in protein)
535 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtag".toCharArray(),
536 0, "FPKG".toCharArray()));
537 // with stop codon3 (not in protein)
538 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtga".toCharArray(),
539 0, "FPKG".toCharArray()));
540 // with start and stop codon1
541 assertTrue(AlignmentUtils.translatesAs(
542 "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG".toCharArray()));
543 // with start and stop codon1 (in protein as *)
544 assertTrue(AlignmentUtils.translatesAs(
545 "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG*".toCharArray()));
546 // with start and stop codon2
547 assertTrue(AlignmentUtils.translatesAs(
548 "atgtttcccaaagggtag".toCharArray(), 3, "FPKG".toCharArray()));
549 // with start and stop codon3
550 assertTrue(AlignmentUtils.translatesAs(
551 "atgtttcccaaagggtga".toCharArray(), 3, "FPKG".toCharArray()));
553 // with embedded stop codons
554 assertTrue(AlignmentUtils.translatesAs(
555 "atgtttTAGcccaaaTAAgggtga".toCharArray(), 3,
556 "F*PK*G".toCharArray()));
559 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
560 0, "FPMG".toCharArray()));
563 assertFalse(AlignmentUtils.translatesAs("tttcccaaagg".toCharArray(), 0,
564 "FPKG".toCharArray()));
567 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
568 0, "FPK".toCharArray()));
570 // overlong dna (doesn't end in stop codon)
571 assertFalse(AlignmentUtils.translatesAs(
572 "tttcccaaagggttt".toCharArray(), 0, "FPKG".toCharArray()));
574 // dna + stop codon + more
575 assertFalse(AlignmentUtils.translatesAs(
576 "tttcccaaagggttaga".toCharArray(), 0, "FPKG".toCharArray()));
579 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
580 0, "FPKGQ".toCharArray()));
584 * Test mapping of protein to cDNA, for cases where the cDNA has start and/or
585 * stop codons in addition to the protein coding sequence.
587 * @throws IOException
589 @Test(groups = { "Functional" })
590 public void testMapProteinAlignmentToCdna_withStartAndStopCodons()
593 List<SequenceI> protseqs = new ArrayList<SequenceI>();
594 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
595 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
596 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
597 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
598 protein.setDataset(null);
600 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
602 dnaseqs.add(new Sequence("EMBL|A11111", "ATGTCAGCACGC"));
604 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAATAA"));
605 // = start +EIQ + stop
606 dnaseqs.add(new Sequence("EMBL|A33333", "ATGGAAATCCAGTAG"));
607 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG"));
608 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
609 cdna.setDataset(null);
611 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
613 // 3 mappings made, each from 1 to 1 sequence
614 assertEquals(3, protein.getCodonFrames().size());
615 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
616 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
617 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
619 // V12345 mapped from A22222
620 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
622 assertEquals(1, acf.getdnaSeqs().length);
623 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
624 acf.getdnaSeqs()[0]);
625 Mapping[] protMappings = acf.getProtMappings();
626 assertEquals(1, protMappings.length);
627 MapList mapList = protMappings[0].getMap();
628 assertEquals(3, mapList.getFromRatio());
629 assertEquals(1, mapList.getToRatio());
630 assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
632 assertEquals(1, mapList.getFromRanges().size());
633 assertTrue(Arrays.equals(new int[] { 1, 3 },
634 mapList.getToRanges().get(0)));
635 assertEquals(1, mapList.getToRanges().size());
637 // V12346 mapped from A33333 starting position 4
638 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
639 assertEquals(1, acf.getdnaSeqs().length);
640 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
641 acf.getdnaSeqs()[0]);
642 protMappings = acf.getProtMappings();
643 assertEquals(1, protMappings.length);
644 mapList = protMappings[0].getMap();
645 assertEquals(3, mapList.getFromRatio());
646 assertEquals(1, mapList.getToRatio());
647 assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
649 assertEquals(1, mapList.getFromRanges().size());
650 assertTrue(Arrays.equals(new int[] { 1, 3 },
651 mapList.getToRanges().get(0)));
652 assertEquals(1, mapList.getToRanges().size());
654 // V12347 mapped to A11111 starting position 4
655 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
656 assertEquals(1, acf.getdnaSeqs().length);
657 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
658 acf.getdnaSeqs()[0]);
659 protMappings = acf.getProtMappings();
660 assertEquals(1, protMappings.length);
661 mapList = protMappings[0].getMap();
662 assertEquals(3, mapList.getFromRatio());
663 assertEquals(1, mapList.getToRatio());
664 assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
666 assertEquals(1, mapList.getFromRanges().size());
667 assertTrue(Arrays.equals(new int[] { 1, 3 },
668 mapList.getToRanges().get(0)));
669 assertEquals(1, mapList.getToRanges().size());
671 // no mapping involving the 'extra' A44444
672 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
676 * Test mapping of protein to cDNA, for the case where we have some sequence
677 * cross-references. Verify that 1-to-many mappings are made where
678 * cross-references exist and sequences are mappable.
680 * @throws IOException
682 @Test(groups = { "Functional" })
683 public void testMapProteinAlignmentToCdna_withXrefs() throws IOException
685 List<SequenceI> protseqs = new ArrayList<SequenceI>();
686 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
687 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
688 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
689 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
690 protein.setDataset(null);
692 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
693 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
694 dnaseqs.add(new Sequence("EMBL|A22222", "ATGGAGATACAA")); // = start + EIQ
695 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
696 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
697 dnaseqs.add(new Sequence("EMBL|A55555", "GAGATTCAG")); // = EIQ
698 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[5]));
699 cdna.setDataset(null);
701 // Xref A22222 to V12345 (should get mapped)
702 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
703 // Xref V12345 to A44444 (should get mapped)
704 protseqs.get(0).addDBRef(new DBRefEntry("EMBL", "1", "A44444"));
705 // Xref A33333 to V12347 (sequence mismatch - should not get mapped)
706 dnaseqs.get(2).addDBRef(new DBRefEntry("UNIPROT", "1", "V12347"));
707 // as V12345 is mapped to A22222 and A44444, this leaves V12346 unmapped.
708 // it should get paired up with the unmapped A33333
709 // A11111 should be mapped to V12347
710 // A55555 is spare and has no xref so is not mapped
712 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
714 // 4 protein mappings made for 3 proteins, 2 to V12345, 1 each to V12346/7
715 assertEquals(3, protein.getCodonFrames().size());
716 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
717 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
718 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
720 // one mapping for each of the first 4 cDNA sequences
721 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
722 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
723 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(2)).size());
724 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(3)).size());
726 // V12345 mapped to A22222 and A44444
727 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
729 assertEquals(2, acf.getdnaSeqs().length);
730 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
731 acf.getdnaSeqs()[0]);
732 assertEquals(cdna.getSequenceAt(3).getDatasetSequence(),
733 acf.getdnaSeqs()[1]);
735 // V12346 mapped to A33333
736 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
737 assertEquals(1, acf.getdnaSeqs().length);
738 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
739 acf.getdnaSeqs()[0]);
741 // V12347 mapped to A11111
742 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
743 assertEquals(1, acf.getdnaSeqs().length);
744 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
745 acf.getdnaSeqs()[0]);
747 // no mapping involving the 'extra' A55555
748 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(4)).isEmpty());
752 * Test mapping of protein to cDNA, for the case where we have some sequence
753 * cross-references. Verify that once we have made an xref mapping we don't
754 * also map un-xrefd sequeces.
756 * @throws IOException
758 @Test(groups = { "Functional" })
759 public void testMapProteinAlignmentToCdna_prioritiseXrefs()
762 List<SequenceI> protseqs = new ArrayList<SequenceI>();
763 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
764 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
765 AlignmentI protein = new Alignment(
766 protseqs.toArray(new SequenceI[protseqs.size()]));
767 protein.setDataset(null);
769 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
770 dnaseqs.add(new Sequence("EMBL|A11111", "GAAATCCAG")); // = EIQ
771 dnaseqs.add(new Sequence("EMBL|A22222", "GAAATTCAG")); // = EIQ
772 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[dnaseqs
774 cdna.setDataset(null);
776 // Xref A22222 to V12345 (should get mapped)
777 // A11111 should then be mapped to the unmapped V12346
778 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
780 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
782 // 2 protein mappings made
783 assertEquals(2, protein.getCodonFrames().size());
784 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
785 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
787 // one mapping for each of the cDNA sequences
788 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
789 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
791 // V12345 mapped to A22222
792 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
794 assertEquals(1, acf.getdnaSeqs().length);
795 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
796 acf.getdnaSeqs()[0]);
798 // V12346 mapped to A11111
799 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
800 assertEquals(1, acf.getdnaSeqs().length);
801 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
802 acf.getdnaSeqs()[0]);
806 * Test the method that shows or hides sequence annotations by type(s) and
809 @Test(groups = { "Functional" })
810 public void testShowOrHideSequenceAnnotations()
812 SequenceI seq1 = new Sequence("Seq1", "AAA");
813 SequenceI seq2 = new Sequence("Seq2", "BBB");
814 SequenceI seq3 = new Sequence("Seq3", "CCC");
815 Annotation[] anns = new Annotation[] { new Annotation(2f) };
816 AlignmentAnnotation ann1 = new AlignmentAnnotation("Structure", "ann1",
818 ann1.setSequenceRef(seq1);
819 AlignmentAnnotation ann2 = new AlignmentAnnotation("Structure", "ann2",
821 ann2.setSequenceRef(seq2);
822 AlignmentAnnotation ann3 = new AlignmentAnnotation("Structure", "ann3",
824 AlignmentAnnotation ann4 = new AlignmentAnnotation("Temp", "ann4", anns);
825 ann4.setSequenceRef(seq1);
826 AlignmentAnnotation ann5 = new AlignmentAnnotation("Temp", "ann5", anns);
827 ann5.setSequenceRef(seq2);
828 AlignmentAnnotation ann6 = new AlignmentAnnotation("Temp", "ann6", anns);
829 AlignmentI al = new Alignment(new SequenceI[] { seq1, seq2, seq3 });
830 al.addAnnotation(ann1); // Structure for Seq1
831 al.addAnnotation(ann2); // Structure for Seq2
832 al.addAnnotation(ann3); // Structure for no sequence
833 al.addAnnotation(ann4); // Temp for seq1
834 al.addAnnotation(ann5); // Temp for seq2
835 al.addAnnotation(ann6); // Temp for no sequence
836 List<String> types = new ArrayList<String>();
837 List<SequenceI> scope = new ArrayList<SequenceI>();
840 * Set all sequence related Structure to hidden (ann1, ann2)
842 types.add("Structure");
843 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
845 assertFalse(ann1.visible);
846 assertFalse(ann2.visible);
847 assertTrue(ann3.visible); // not sequence-related, not affected
848 assertTrue(ann4.visible); // not Structure, not affected
849 assertTrue(ann5.visible); // "
850 assertTrue(ann6.visible); // not sequence-related, not affected
853 * Set Temp in {seq1, seq3} to hidden
859 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, false,
861 assertFalse(ann1.visible); // unchanged
862 assertFalse(ann2.visible); // unchanged
863 assertTrue(ann3.visible); // not sequence-related, not affected
864 assertFalse(ann4.visible); // Temp for seq1 hidden
865 assertTrue(ann5.visible); // not in scope, not affected
866 assertTrue(ann6.visible); // not sequence-related, not affected
869 * Set Temp in all sequences to hidden
875 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
877 assertFalse(ann1.visible); // unchanged
878 assertFalse(ann2.visible); // unchanged
879 assertTrue(ann3.visible); // not sequence-related, not affected
880 assertFalse(ann4.visible); // Temp for seq1 hidden
881 assertFalse(ann5.visible); // Temp for seq2 hidden
882 assertTrue(ann6.visible); // not sequence-related, not affected
885 * Set all types in {seq1, seq3} to visible
891 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, true,
893 assertTrue(ann1.visible); // Structure for seq1 set visible
894 assertFalse(ann2.visible); // not in scope, unchanged
895 assertTrue(ann3.visible); // not sequence-related, not affected
896 assertTrue(ann4.visible); // Temp for seq1 set visible
897 assertFalse(ann5.visible); // not in scope, unchanged
898 assertTrue(ann6.visible); // not sequence-related, not affected
901 * Set all types in all scope to hidden
903 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, true,
905 assertFalse(ann1.visible);
906 assertFalse(ann2.visible);
907 assertTrue(ann3.visible); // not sequence-related, not affected
908 assertFalse(ann4.visible);
909 assertFalse(ann5.visible);
910 assertTrue(ann6.visible); // not sequence-related, not affected
914 * Tests for the method that checks if one sequence cross-references another
916 @Test(groups = { "Functional" })
917 public void testHasCrossRef()
919 assertFalse(AlignmentUtils.hasCrossRef(null, null));
920 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
921 assertFalse(AlignmentUtils.hasCrossRef(seq1, null));
922 assertFalse(AlignmentUtils.hasCrossRef(null, seq1));
923 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
924 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
927 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20193"));
928 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
930 // case-insensitive; version number is ignored
931 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20192"));
932 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
935 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
936 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
937 // test is one-way only
938 assertFalse(AlignmentUtils.hasCrossRef(seq2, seq1));
942 * Tests for the method that checks if either sequence cross-references the
945 @Test(groups = { "Functional" })
946 public void testHaveCrossRef()
948 assertFalse(AlignmentUtils.hasCrossRef(null, null));
949 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
950 assertFalse(AlignmentUtils.haveCrossRef(seq1, null));
951 assertFalse(AlignmentUtils.haveCrossRef(null, seq1));
952 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
953 assertFalse(AlignmentUtils.haveCrossRef(seq1, seq2));
955 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
956 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
957 // next is true for haveCrossRef, false for hasCrossRef
958 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
960 // now the other way round
961 seq1.setDBRefs(null);
962 seq2.addDBRef(new DBRefEntry("EMBL", "1", "A12345"));
963 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
964 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
967 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
968 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
969 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
973 * Test the method that extracts the cds-only part of a dna alignment.
975 @Test(groups = { "Functional" })
976 public void testMakeCdsAlignment()
978 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
979 SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC");
980 SequenceI pep1 = new Sequence("pep1", "GF");
981 SequenceI pep2 = new Sequence("pep2", "GFP");
982 dna1.createDatasetSequence();
983 dna2.createDatasetSequence();
984 pep1.createDatasetSequence();
985 pep2.createDatasetSequence();
986 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 6, 0f,
988 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 10, 12, 0f,
990 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds3", 1, 3, 0f,
992 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds4", 7, 9, 0f,
994 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds5", 13, 15, 0f,
996 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
997 dna.setDataset(null);
999 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1000 new int[] { 1, 2 }, 3, 1);
1001 AlignedCodonFrame acf = new AlignedCodonFrame();
1002 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1003 dna.addCodonFrame(acf);
1004 map = new MapList(new int[] { 1, 3, 7, 9, 13, 15 }, new int[] { 1, 3 },
1006 acf = new AlignedCodonFrame();
1007 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
1008 dna.addCodonFrame(acf);
1011 * execute method under test:
1013 AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
1014 dna1, dna2 }, dna.getDataset());
1016 assertEquals(2, cds.getSequences().size());
1017 assertEquals("GGGTTT", cds.getSequenceAt(0)
1018 .getSequenceAsString());
1019 assertEquals("GGGTTTCCC", cds.getSequenceAt(1)
1020 .getSequenceAsString());
1023 * verify shared, extended alignment dataset
1025 assertSame(dna.getDataset(), cds.getDataset());
1026 assertTrue(dna.getDataset().getSequences()
1027 .contains(cds.getSequenceAt(0).getDatasetSequence()));
1028 assertTrue(dna.getDataset().getSequences()
1029 .contains(cds.getSequenceAt(1).getDatasetSequence()));
1032 * Verify mappings from CDS to peptide, cDNA to CDS, and cDNA to peptide
1033 * the mappings are on the shared alignment dataset
1035 List<AlignedCodonFrame> cdsMappings = cds.getDataset().getCodonFrames();
1037 * 6 mappings, 2*(DNA->CDS), 2*(DNA->Pep), 2*(CDS->Pep)
1039 assertEquals(6, cdsMappings.size());
1041 * verify that mapping sets for dna and cds alignments are different
1043 // select -> subselect type to test.
1044 Assert.assertNotSame(dna.getCodonFrames(), cds.getCodonFrames());
1045 assertEquals(4, dna.getCodonFrames());
1046 assertEquals(4, cds.getCodonFrames());
1048 * Mapping from pep1 to GGGTTT in first new exon sequence
1050 List<AlignedCodonFrame> pep1Mapping = MappingUtils
1051 .findMappingsForSequence(pep1, cds.getCodonFrames());
1052 assertEquals(1, pep1Mapping.size()); // CDS->Pep
1055 SearchResults sr = MappingUtils
1056 .buildSearchResults(pep1, 1, cdsMappings);
1057 assertEquals(1, sr.getResults().size());
1058 Match m = sr.getResults().get(0);
1059 assertSame(cds.getSequenceAt(0).getDatasetSequence(),
1061 assertEquals(1, m.getStart());
1062 assertEquals(3, m.getEnd());
1064 sr = MappingUtils.buildSearchResults(pep1, 2, cdsMappings);
1065 m = sr.getResults().get(0);
1066 assertSame(cds.getSequenceAt(0).getDatasetSequence(),
1068 assertEquals(4, m.getStart());
1069 assertEquals(6, m.getEnd());
1072 * Mapping from pep2 to GGGTTTCCC in second new exon sequence
1074 List<AlignedCodonFrame> pep2Mapping = MappingUtils
1075 .findMappingsForSequence(pep2, cdsMappings);
1076 assertEquals(1, pep2Mapping.size());
1078 sr = MappingUtils.buildSearchResults(pep2, 1, cdsMappings);
1079 assertEquals(1, sr.getResults().size());
1080 m = sr.getResults().get(0);
1081 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1083 assertEquals(1, m.getStart());
1084 assertEquals(3, m.getEnd());
1086 sr = MappingUtils.buildSearchResults(pep2, 2, cdsMappings);
1087 m = sr.getResults().get(0);
1088 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1090 assertEquals(4, m.getStart());
1091 assertEquals(6, m.getEnd());
1093 sr = MappingUtils.buildSearchResults(pep2, 3, cdsMappings);
1094 m = sr.getResults().get(0);
1095 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1097 assertEquals(7, m.getStart());
1098 assertEquals(9, m.getEnd());
1102 * Test the method that makes a cds-only alignment from a DNA sequence and its
1103 * product mappings, for the case where there are multiple exon mappings to
1104 * different protein products.
1106 @Test(groups = { "Functional" })
1107 public void testMakeCdsAlignment_multipleProteins()
1109 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
1110 SequenceI pep1 = new Sequence("pep1", "GF"); // GGGTTT
1111 SequenceI pep2 = new Sequence("pep2", "KP"); // aaaccc
1112 SequenceI pep3 = new Sequence("pep3", "KF"); // aaaTTT
1113 dna1.createDatasetSequence();
1114 pep1.createDatasetSequence();
1115 pep2.createDatasetSequence();
1116 pep3.createDatasetSequence();
1117 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 6, 0f,
1119 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 10, 12, 0f,
1121 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds3", 1, 3, 0f,
1123 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds4", 7, 9, 0f,
1125 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds5", 1, 3, 0f,
1127 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds6", 10, 12, 0f,
1129 pep1.getDatasetSequence().addDBRef(
1130 new DBRefEntry("EMBLCDS", "2", "A12345"));
1131 pep2.getDatasetSequence().addDBRef(
1132 new DBRefEntry("EMBLCDS", "3", "A12346"));
1133 pep3.getDatasetSequence().addDBRef(
1134 new DBRefEntry("EMBLCDS", "4", "A12347"));
1137 * Create the CDS alignment
1139 AlignmentI dna = new Alignment(new SequenceI[] { dna1 });
1140 dna.setDataset(null);
1143 * Make the mappings from dna to protein
1145 // map ...GGG...TTT to GF
1146 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1147 new int[] { 1, 2 }, 3, 1);
1148 AlignedCodonFrame acf = new AlignedCodonFrame();
1149 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1150 dna.addCodonFrame(acf);
1152 // map aaa...ccc to KP
1153 map = new MapList(new int[] { 1, 3, 7, 9 }, new int[] { 1, 2 }, 3, 1);
1154 acf = new AlignedCodonFrame();
1155 acf.addMap(dna1.getDatasetSequence(), pep2.getDatasetSequence(), map);
1156 dna.addCodonFrame(acf);
1158 // map aaa......TTT to KF
1159 map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 2 }, 3, 1);
1160 acf = new AlignedCodonFrame();
1161 acf.addMap(dna1.getDatasetSequence(), pep3.getDatasetSequence(), map);
1162 dna.addCodonFrame(acf);
1165 * execute method under test
1167 AlignmentI cdsal = AlignmentUtils.makeCdsAlignment(
1168 new SequenceI[] { dna1 }, dna.getDataset());
1171 * Verify we have 3 cds sequences, mapped to pep1/2/3 respectively
1173 List<SequenceI> cds = cdsal.getSequences();
1174 assertEquals(3, cds.size());
1177 * verify shared, extended alignment dataset
1179 assertSame(cdsal.getDataset(), dna.getDataset());
1180 assertTrue(dna.getDataset().getSequences()
1181 .contains(cds.get(0).getDatasetSequence()));
1182 assertTrue(dna.getDataset().getSequences()
1183 .contains(cds.get(1).getDatasetSequence()));
1184 assertTrue(dna.getDataset().getSequences()
1185 .contains(cds.get(2).getDatasetSequence()));
1188 * verify aligned cds sequences and their xrefs
1190 SequenceI cdsSeq = cds.get(0);
1191 assertEquals("GGGTTT", cdsSeq.getSequenceAsString());
1192 // assertEquals("dna1|A12345", cdsSeq.getName());
1193 assertEquals("dna1|pep1", cdsSeq.getName());
1194 // assertEquals(1, cdsSeq.getDBRefs().length);
1195 // DBRefEntry cdsRef = cdsSeq.getDBRefs()[0];
1196 // assertEquals("EMBLCDS", cdsRef.getSource());
1197 // assertEquals("2", cdsRef.getVersion());
1198 // assertEquals("A12345", cdsRef.getAccessionId());
1200 cdsSeq = cds.get(1);
1201 assertEquals("aaaccc", cdsSeq.getSequenceAsString());
1202 // assertEquals("dna1|A12346", cdsSeq.getName());
1203 assertEquals("dna1|pep2", cdsSeq.getName());
1204 // assertEquals(1, cdsSeq.getDBRefs().length);
1205 // cdsRef = cdsSeq.getDBRefs()[0];
1206 // assertEquals("EMBLCDS", cdsRef.getSource());
1207 // assertEquals("3", cdsRef.getVersion());
1208 // assertEquals("A12346", cdsRef.getAccessionId());
1210 cdsSeq = cds.get(2);
1211 assertEquals("aaaTTT", cdsSeq.getSequenceAsString());
1212 // assertEquals("dna1|A12347", cdsSeq.getName());
1213 assertEquals("dna1|pep3", cdsSeq.getName());
1214 // assertEquals(1, cdsSeq.getDBRefs().length);
1215 // cdsRef = cdsSeq.getDBRefs()[0];
1216 // assertEquals("EMBLCDS", cdsRef.getSource());
1217 // assertEquals("4", cdsRef.getVersion());
1218 // assertEquals("A12347", cdsRef.getAccessionId());
1221 * Verify there are mappings from each cds sequence to its protein product
1222 * and also to its dna source
1224 Iterator<AlignedCodonFrame> newMappingsIterator = cdsal
1225 .getCodonFrames().iterator();
1227 // mappings for dna1 - exon1 - pep1
1228 AlignedCodonFrame cdsMapping = newMappingsIterator.next();
1229 List<Mapping> dnaMappings = cdsMapping.getMappingsFromSequence(dna1);
1230 assertEquals(3, dnaMappings.size());
1231 assertSame(cds.get(0).getDatasetSequence(), dnaMappings.get(0)
1233 assertEquals("G(1) in CDS should map to G(4) in DNA", 4, dnaMappings
1234 .get(0).getMap().getToPosition(1));
1235 List<Mapping> peptideMappings = cdsMapping.getMappingsFromSequence(cds
1236 .get(0).getDatasetSequence());
1237 assertEquals(1, peptideMappings.size());
1238 assertSame(pep1.getDatasetSequence(), peptideMappings.get(0).getTo());
1240 // mappings for dna1 - cds2 - pep2
1241 assertSame(cds.get(1).getDatasetSequence(), dnaMappings.get(1)
1243 assertEquals("c(4) in CDS should map to c(7) in DNA", 7, dnaMappings
1244 .get(1).getMap().getToPosition(4));
1245 peptideMappings = cdsMapping.getMappingsFromSequence(cds.get(1)
1246 .getDatasetSequence());
1247 assertEquals(1, peptideMappings.size());
1248 assertSame(pep2.getDatasetSequence(), peptideMappings.get(0).getTo());
1250 // mappings for dna1 - cds3 - pep3
1251 assertSame(cds.get(2).getDatasetSequence(), dnaMappings.get(2)
1253 assertEquals("T(4) in CDS should map to T(10) in DNA", 10, dnaMappings
1254 .get(2).getMap().getToPosition(4));
1255 peptideMappings = cdsMapping.getMappingsFromSequence(cds.get(2)
1256 .getDatasetSequence());
1257 assertEquals(1, peptideMappings.size());
1258 assertSame(pep3.getDatasetSequence(), peptideMappings.get(0).getTo());
1261 @Test(groups = { "Functional" })
1262 public void testIsMappable()
1264 SequenceI dna1 = new Sequence("dna1", "cgCAGtgGT");
1265 SequenceI aa1 = new Sequence("aa1", "RSG");
1266 AlignmentI al1 = new Alignment(new SequenceI[] { dna1 });
1267 AlignmentI al2 = new Alignment(new SequenceI[] { aa1 });
1269 assertFalse(AlignmentUtils.isMappable(null, null));
1270 assertFalse(AlignmentUtils.isMappable(al1, null));
1271 assertFalse(AlignmentUtils.isMappable(null, al1));
1272 assertFalse(AlignmentUtils.isMappable(al1, al1));
1273 assertFalse(AlignmentUtils.isMappable(al2, al2));
1275 assertTrue(AlignmentUtils.isMappable(al1, al2));
1276 assertTrue(AlignmentUtils.isMappable(al2, al1));
1280 * Test creating a mapping when the sequences involved do not start at residue
1283 * @throws IOException
1285 @Test(groups = { "Functional" })
1286 public void testMapCdnaToProtein_forSubsequence()
1289 SequenceI prot = new Sequence("UNIPROT|V12345", "E-I--Q", 10, 12);
1290 prot.createDatasetSequence();
1292 SequenceI dna = new Sequence("EMBL|A33333", "GAA--AT-C-CAG", 40, 48);
1293 dna.createDatasetSequence();
1295 MapList map = AlignmentUtils.mapCdnaToProtein(prot, dna);
1296 assertEquals(10, map.getToLowest());
1297 assertEquals(12, map.getToHighest());
1298 assertEquals(40, map.getFromLowest());
1299 assertEquals(48, map.getFromHighest());
1303 * Test for the alignSequenceAs method where we have protein mapped to protein
1305 @Test(groups = { "Functional" })
1306 public void testAlignSequenceAs_mappedProteinProtein()
1309 SequenceI alignMe = new Sequence("Match", "MGAASEV");
1310 alignMe.createDatasetSequence();
1311 SequenceI alignFrom = new Sequence("Query", "LQTGYMGAASEVMFSPTRR");
1312 alignFrom.createDatasetSequence();
1314 AlignedCodonFrame acf = new AlignedCodonFrame();
1315 // this is like a domain or motif match of part of a peptide sequence
1316 MapList map = new MapList(new int[] { 6, 12 }, new int[] { 1, 7 }, 1, 1);
1317 acf.addMap(alignFrom.getDatasetSequence(),
1318 alignMe.getDatasetSequence(), map);
1320 AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "-", '-', true,
1322 assertEquals("-----MGAASEV-------", alignMe.getSequenceAsString());
1326 * Test for the alignSequenceAs method where there are trailing unmapped
1327 * residues in the model sequence
1329 @Test(groups = { "Functional" })
1330 public void testAlignSequenceAs_withTrailingPeptide()
1332 // map first 3 codons to KPF; G is a trailing unmapped residue
1333 MapList map = new MapList(new int[] { 1, 9 }, new int[] { 1, 3 }, 3, 1);
1335 checkAlignSequenceAs("AAACCCTTT", "K-PFG", true, true, map,
1340 * Tests for transferring features between mapped sequences
1342 @Test(groups = { "Functional" })
1343 public void testTransferFeatures()
1345 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1346 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1349 dna.addSequenceFeature(new SequenceFeature("type1", "desc1", 1, 2, 1f,
1351 // partial overlap - to [1, 1]
1352 dna.addSequenceFeature(new SequenceFeature("type2", "desc2", 3, 4, 2f,
1354 // exact overlap - to [1, 3]
1355 dna.addSequenceFeature(new SequenceFeature("type3", "desc3", 4, 6, 3f,
1357 // spanning overlap - to [2, 5]
1358 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1360 // exactly overlaps whole mapped range [1, 6]
1361 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1363 // no overlap (internal)
1364 dna.addSequenceFeature(new SequenceFeature("type6", "desc6", 7, 9, 6f,
1366 // no overlap (3' end)
1367 dna.addSequenceFeature(new SequenceFeature("type7", "desc7", 13, 15,
1369 // overlap (3' end) - to [6, 6]
1370 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1372 // extended overlap - to [6, +]
1373 dna.addSequenceFeature(new SequenceFeature("type9", "desc9", 12, 13,
1376 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1377 new int[] { 1, 6 }, 1, 1);
1380 * transferFeatures() will build 'partial overlap' for regions
1381 * that partially overlap 5' or 3' (start or end) of target sequence
1383 AlignmentUtils.transferFeatures(dna, cds, map, null);
1384 SequenceFeature[] sfs = cds.getSequenceFeatures();
1385 assertEquals(6, sfs.length);
1387 SequenceFeature sf = sfs[0];
1388 assertEquals("type2", sf.getType());
1389 assertEquals("desc2", sf.getDescription());
1390 assertEquals(2f, sf.getScore());
1391 assertEquals(1, sf.getBegin());
1392 assertEquals(1, sf.getEnd());
1395 assertEquals("type3", sf.getType());
1396 assertEquals("desc3", sf.getDescription());
1397 assertEquals(3f, sf.getScore());
1398 assertEquals(1, sf.getBegin());
1399 assertEquals(3, sf.getEnd());
1402 assertEquals("type4", sf.getType());
1403 assertEquals(2, sf.getBegin());
1404 assertEquals(5, sf.getEnd());
1407 assertEquals("type5", sf.getType());
1408 assertEquals(1, sf.getBegin());
1409 assertEquals(6, sf.getEnd());
1412 assertEquals("type8", sf.getType());
1413 assertEquals(6, sf.getBegin());
1414 assertEquals(6, sf.getEnd());
1417 assertEquals("type9", sf.getType());
1418 assertEquals(6, sf.getBegin());
1419 assertEquals(6, sf.getEnd());
1423 * Tests for transferring features between mapped sequences
1425 @Test(groups = { "Functional" })
1426 public void testTransferFeatures_withOmit()
1428 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1429 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1431 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1432 new int[] { 1, 6 }, 1, 1);
1434 // [5, 11] maps to [2, 5]
1435 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1437 // [4, 12] maps to [1, 6]
1438 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1440 // [12, 12] maps to [6, 6]
1441 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1444 // desc4 and desc8 are the 'omit these' varargs
1445 AlignmentUtils.transferFeatures(dna, cds, map, null, "type4", "type8");
1446 SequenceFeature[] sfs = cds.getSequenceFeatures();
1447 assertEquals(1, sfs.length);
1449 SequenceFeature sf = sfs[0];
1450 assertEquals("type5", sf.getType());
1451 assertEquals(1, sf.getBegin());
1452 assertEquals(6, sf.getEnd());
1456 * Tests for transferring features between mapped sequences
1458 @Test(groups = { "Functional" })
1459 public void testTransferFeatures_withSelect()
1461 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1462 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1464 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1465 new int[] { 1, 6 }, 1, 1);
1467 // [5, 11] maps to [2, 5]
1468 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1470 // [4, 12] maps to [1, 6]
1471 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1473 // [12, 12] maps to [6, 6]
1474 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1477 // "type5" is the 'select this type' argument
1478 AlignmentUtils.transferFeatures(dna, cds, map, "type5");
1479 SequenceFeature[] sfs = cds.getSequenceFeatures();
1480 assertEquals(1, sfs.length);
1482 SequenceFeature sf = sfs[0];
1483 assertEquals("type5", sf.getType());
1484 assertEquals(1, sf.getBegin());
1485 assertEquals(6, sf.getEnd());
1489 * Test the method that extracts the cds-only part of a dna alignment, for the
1490 * case where the cds should be aligned to match its nucleotide sequence.
1492 @Test(groups = { "Functional" })
1493 public void testMakeCdsAlignment_alternativeTranscripts()
1495 SequenceI dna1 = new Sequence("dna1", "aaaGGGCC-----CTTTaaaGGG");
1496 // alternative transcript of same dna skips CCC codon
1497 SequenceI dna2 = new Sequence("dna2", "aaaGGGCC-----cttTaaaGGG");
1498 // dna3 has no mapping (protein product) so should be ignored here
1499 SequenceI dna3 = new Sequence("dna3", "aaaGGGCCCCCGGGcttTaaaGGG");
1500 SequenceI pep1 = new Sequence("pep1", "GPFG");
1501 SequenceI pep2 = new Sequence("pep2", "GPG");
1502 dna1.createDatasetSequence();
1503 dna2.createDatasetSequence();
1504 dna3.createDatasetSequence();
1505 pep1.createDatasetSequence();
1506 pep2.createDatasetSequence();
1507 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 8, 0f,
1509 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 9, 12, 0f,
1511 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds3", 16, 18, 0f,
1513 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 4, 8, 0f,
1515 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 12, 12, 0f,
1517 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 16, 18, 0f,
1520 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
1521 dna.setDataset(null);
1523 MapList map = new MapList(new int[] { 4, 12, 16, 18 },
1524 new int[] { 1, 4 }, 3, 1);
1525 AlignedCodonFrame acf = new AlignedCodonFrame();
1526 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1527 dna.addCodonFrame(acf);
1528 map = new MapList(new int[] { 4, 8, 12, 12, 16, 18 },
1531 acf = new AlignedCodonFrame();
1532 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
1533 dna.addCodonFrame(acf);
1535 AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
1536 dna1, dna2, dna3 }, dna.getDataset());
1537 List<SequenceI> cdsSeqs = cds.getSequences();
1538 assertEquals(2, cdsSeqs.size());
1539 assertEquals("GGGCCCTTTGGG", cdsSeqs.get(0).getSequenceAsString());
1540 assertEquals("GGGCCTGGG", cdsSeqs.get(1).getSequenceAsString());
1543 * verify shared, extended alignment dataset
1545 assertSame(dna.getDataset(), cds.getDataset());
1546 assertTrue(dna.getDataset().getSequences()
1547 .contains(cdsSeqs.get(0).getDatasetSequence()));
1548 assertTrue(dna.getDataset().getSequences()
1549 .contains(cdsSeqs.get(1).getDatasetSequence()));
1552 * Verify updated mappings
1554 List<AlignedCodonFrame> cdsMappings = cds.getCodonFrames();
1555 assertEquals(2, cdsMappings.size());
1558 * Mapping from pep1 to GGGTTT in first new CDS sequence
1560 List<AlignedCodonFrame> pep1Mapping = MappingUtils
1561 .findMappingsForSequence(pep1, cdsMappings);
1562 assertEquals(1, pep1Mapping.size());
1564 * maps GPFG to 1-3,4-6,7-9,10-12
1566 SearchResults sr = MappingUtils
1567 .buildSearchResults(pep1, 1, cdsMappings);
1568 assertEquals(1, sr.getResults().size());
1569 Match m = sr.getResults().get(0);
1570 assertEquals(cds.getSequenceAt(0).getDatasetSequence(),
1572 assertEquals(1, m.getStart());
1573 assertEquals(3, m.getEnd());
1574 sr = MappingUtils.buildSearchResults(pep1, 2, cdsMappings);
1575 m = sr.getResults().get(0);
1576 assertEquals(4, m.getStart());
1577 assertEquals(6, m.getEnd());
1578 sr = MappingUtils.buildSearchResults(pep1, 3, cdsMappings);
1579 m = sr.getResults().get(0);
1580 assertEquals(7, m.getStart());
1581 assertEquals(9, m.getEnd());
1582 sr = MappingUtils.buildSearchResults(pep1, 4, cdsMappings);
1583 m = sr.getResults().get(0);
1584 assertEquals(10, m.getStart());
1585 assertEquals(12, m.getEnd());
1588 * GPG in pep2 map to 1-3,4-6,7-9 in second CDS sequence
1590 List<AlignedCodonFrame> pep2Mapping = MappingUtils
1591 .findMappingsForSequence(pep2, cdsMappings);
1592 assertEquals(1, pep2Mapping.size());
1593 sr = MappingUtils.buildSearchResults(pep2, 1, cdsMappings);
1594 assertEquals(1, sr.getResults().size());
1595 m = sr.getResults().get(0);
1596 assertEquals(cds.getSequenceAt(1).getDatasetSequence(),
1598 assertEquals(1, m.getStart());
1599 assertEquals(3, m.getEnd());
1600 sr = MappingUtils.buildSearchResults(pep2, 2, cdsMappings);
1601 m = sr.getResults().get(0);
1602 assertEquals(4, m.getStart());
1603 assertEquals(6, m.getEnd());
1604 sr = MappingUtils.buildSearchResults(pep2, 3, cdsMappings);
1605 m = sr.getResults().get(0);
1606 assertEquals(7, m.getStart());
1607 assertEquals(9, m.getEnd());
1611 * Test the method that realigns protein to match mapped codon alignment.
1613 @Test(groups = { "Functional" })
1614 public void testAlignProteinAsDna_incompleteStartCodon()
1616 // seq1: incomplete start codon (not mapped), then [3, 11]
1617 SequenceI dna1 = new Sequence("Seq1", "ccAAA-TTT-GGG-");
1618 // seq2 codons are [4, 5], [8, 11]
1619 SequenceI dna2 = new Sequence("Seq2", "ccaAA-ttT-GGG-");
1620 // seq3 incomplete start codon at 'tt'
1621 SequenceI dna3 = new Sequence("Seq3", "ccaaa-ttt-GGG-");
1622 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
1623 dna.setDataset(null);
1625 // prot1 has 'X' for incomplete start codon (not mapped)
1626 SequenceI prot1 = new Sequence("Seq1", "XKFG"); // X for incomplete start
1627 SequenceI prot2 = new Sequence("Seq2", "NG");
1628 SequenceI prot3 = new Sequence("Seq3", "XG"); // X for incomplete start
1629 AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
1631 protein.setDataset(null);
1633 // map dna1 [3, 11] to prot1 [2, 4] KFG
1634 MapList map = new MapList(new int[] { 3, 11 }, new int[] { 2, 4 }, 3, 1);
1635 AlignedCodonFrame acf = new AlignedCodonFrame();
1636 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
1638 // map dna2 [4, 5] [8, 11] to prot2 [1, 2] NG
1639 map = new MapList(new int[] { 4, 5, 8, 11 }, new int[] { 1, 2 }, 3, 1);
1640 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
1642 // map dna3 [9, 11] to prot3 [2, 2] G
1643 map = new MapList(new int[] { 9, 11 }, new int[] { 2, 2 }, 3, 1);
1644 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
1646 ArrayList<AlignedCodonFrame> acfs = new ArrayList<AlignedCodonFrame>();
1648 protein.setCodonFrames(acfs);
1651 * verify X is included in the aligned proteins, and placed just
1652 * before the first mapped residue
1653 * CCT is between CCC and TTT
1655 AlignmentUtils.alignProteinAsDna(protein, dna);
1656 assertEquals("XK-FG", prot1.getSequenceAsString());
1657 assertEquals("--N-G", prot2.getSequenceAsString());
1658 assertEquals("---XG", prot3.getSequenceAsString());
1662 * Tests for the method that maps the subset of a dna sequence that has CDS
1663 * (or subtype) feature - case where the start codon is incomplete.
1665 @Test(groups = "Functional")
1666 public void testFindCdsPositions_fivePrimeIncomplete()
1668 SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
1669 dnaSeq.createDatasetSequence();
1670 SequenceI ds = dnaSeq.getDatasetSequence();
1672 // CDS for dna 5-6 (incomplete codon), 7-9
1673 SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
1674 sf.setPhase("2"); // skip 2 bases to start of next codon
1675 ds.addSequenceFeature(sf);
1676 // CDS for dna 13-15
1677 sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
1678 ds.addSequenceFeature(sf);
1680 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1683 * check the mapping starts with the first complete codon
1685 assertEquals(6, MappingUtils.getLength(ranges));
1686 assertEquals(2, ranges.size());
1687 assertEquals(7, ranges.get(0)[0]);
1688 assertEquals(9, ranges.get(0)[1]);
1689 assertEquals(13, ranges.get(1)[0]);
1690 assertEquals(15, ranges.get(1)[1]);
1694 * Tests for the method that maps the subset of a dna sequence that has CDS
1695 * (or subtype) feature.
1697 @Test(groups = "Functional")
1698 public void testFindCdsPositions()
1700 SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
1701 dnaSeq.createDatasetSequence();
1702 SequenceI ds = dnaSeq.getDatasetSequence();
1704 // CDS for dna 10-12
1705 SequenceFeature sf = new SequenceFeature("CDS_predicted", "", 10, 12,
1708 ds.addSequenceFeature(sf);
1710 sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
1712 ds.addSequenceFeature(sf);
1713 // exon feature should be ignored here
1714 sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
1715 ds.addSequenceFeature(sf);
1717 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1719 * verify ranges { [4-6], [12-10] }
1720 * note CDS ranges are ordered ascending even if the CDS
1723 assertEquals(6, MappingUtils.getLength(ranges));
1724 assertEquals(2, ranges.size());
1725 assertEquals(4, ranges.get(0)[0]);
1726 assertEquals(6, ranges.get(0)[1]);
1727 assertEquals(10, ranges.get(1)[0]);
1728 assertEquals(12, ranges.get(1)[1]);
1732 * Test the method that computes a map of codon variants for each protein
1733 * position from "sequence_variant" features on dna
1735 @Test(groups = "Functional")
1736 public void testBuildDnaVariantsMap()
1738 SequenceI dna = new Sequence("dna", "atgAAATTTGGGCCCtag");
1739 MapList map = new MapList(new int[] { 1, 18 }, new int[] { 1, 5 }, 3, 1);
1742 * first with no variants on dna
1744 LinkedHashMap<Integer, List<DnaVariant>[]> variantsMap = AlignmentUtils
1745 .buildDnaVariantsMap(dna, map);
1746 assertTrue(variantsMap.isEmpty());
1749 * single allele codon 1, on base 1
1751 SequenceFeature sf1 = new SequenceFeature("sequence_variant", "", 1, 1,
1753 sf1.setValue("alleles", "T");
1754 sf1.setValue("ID", "sequence_variant:rs758803211");
1755 dna.addSequenceFeature(sf1);
1758 * two alleles codon 2, on bases 2 and 3 (distinct variants)
1760 SequenceFeature sf2 = new SequenceFeature("sequence_variant", "", 5, 5,
1762 sf2.setValue("alleles", "T");
1763 sf2.setValue("ID", "sequence_variant:rs758803212");
1764 dna.addSequenceFeature(sf2);
1765 SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 6, 6,
1767 sf3.setValue("alleles", "G");
1768 sf3.setValue("ID", "sequence_variant:rs758803213");
1769 dna.addSequenceFeature(sf3);
1772 * two alleles codon 3, both on base 2 (one variant)
1774 SequenceFeature sf4 = new SequenceFeature("sequence_variant", "", 8, 8,
1776 sf4.setValue("alleles", "C, G");
1777 sf4.setValue("ID", "sequence_variant:rs758803214");
1778 dna.addSequenceFeature(sf4);
1780 // no alleles on codon 4
1783 * alleles on codon 5 on all 3 bases (distinct variants)
1785 SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 13,
1787 sf5.setValue("alleles", "C, G"); // (C duplicates given base value)
1788 sf5.setValue("ID", "sequence_variant:rs758803215");
1789 dna.addSequenceFeature(sf5);
1790 SequenceFeature sf6 = new SequenceFeature("sequence_variant", "", 14,
1792 sf6.setValue("alleles", "g, a"); // should force to upper-case
1793 sf6.setValue("ID", "sequence_variant:rs758803216");
1794 dna.addSequenceFeature(sf6);
1795 SequenceFeature sf7 = new SequenceFeature("sequence_variant", "", 15,
1797 sf7.setValue("alleles", "A, T");
1798 sf7.setValue("ID", "sequence_variant:rs758803217");
1799 dna.addSequenceFeature(sf7);
1802 * build map - expect variants on positions 1, 2, 3, 5
1804 variantsMap = AlignmentUtils.buildDnaVariantsMap(dna, map);
1805 assertEquals(4, variantsMap.size());
1808 * protein residue 1: variant on codon (ATG) base 1, not on 2 or 3
1810 List<DnaVariant>[] pep1Variants = variantsMap.get(1);
1811 assertEquals(3, pep1Variants.length);
1812 assertEquals(1, pep1Variants[0].size());
1813 assertEquals("A", pep1Variants[0].get(0).base); // codon[1] base
1814 assertSame(sf1, pep1Variants[0].get(0).variant); // codon[1] variant
1815 assertEquals(1, pep1Variants[1].size());
1816 assertEquals("T", pep1Variants[1].get(0).base); // codon[2] base
1817 assertNull(pep1Variants[1].get(0).variant); // no variant here
1818 assertEquals(1, pep1Variants[2].size());
1819 assertEquals("G", pep1Variants[2].get(0).base); // codon[3] base
1820 assertNull(pep1Variants[2].get(0).variant); // no variant here
1823 * protein residue 2: variants on codon (AAA) bases 2 and 3
1825 List<DnaVariant>[] pep2Variants = variantsMap.get(2);
1826 assertEquals(3, pep2Variants.length);
1827 assertEquals(1, pep2Variants[0].size());
1828 // codon[1] base recorded while processing variant on codon[2]
1829 assertEquals("A", pep2Variants[0].get(0).base);
1830 assertNull(pep2Variants[0].get(0).variant); // no variant here
1831 // codon[2] base and variant:
1832 assertEquals(1, pep2Variants[1].size());
1833 assertEquals("A", pep2Variants[1].get(0).base);
1834 assertSame(sf2, pep2Variants[1].get(0).variant);
1835 // codon[3] base was recorded when processing codon[2] variant
1836 // and then the variant for codon[3] added to it
1837 assertEquals(1, pep2Variants[2].size());
1838 assertEquals("A", pep2Variants[2].get(0).base);
1839 assertSame(sf3, pep2Variants[2].get(0).variant);
1842 * protein residue 3: variants on codon (TTT) base 2 only
1844 List<DnaVariant>[] pep3Variants = variantsMap.get(3);
1845 assertEquals(3, pep3Variants.length);
1846 assertEquals(1, pep3Variants[0].size());
1847 assertEquals("T", pep3Variants[0].get(0).base); // codon[1] base
1848 assertNull(pep3Variants[0].get(0).variant); // no variant here
1849 assertEquals(1, pep3Variants[1].size());
1850 assertEquals("T", pep3Variants[1].get(0).base); // codon[2] base
1851 assertSame(sf4, pep3Variants[1].get(0).variant); // codon[2] variant
1852 assertEquals(1, pep3Variants[2].size());
1853 assertEquals("T", pep3Variants[2].get(0).base); // codon[3] base
1854 assertNull(pep3Variants[2].get(0).variant); // no variant here
1857 * three variants on protein position 5
1859 List<DnaVariant>[] pep5Variants = variantsMap.get(5);
1860 assertEquals(3, pep5Variants.length);
1861 assertEquals(1, pep5Variants[0].size());
1862 assertEquals("C", pep5Variants[0].get(0).base); // codon[1] base
1863 assertSame(sf5, pep5Variants[0].get(0).variant); // codon[1] variant
1864 assertEquals(1, pep5Variants[1].size());
1865 assertEquals("C", pep5Variants[1].get(0).base); // codon[2] base
1866 assertSame(sf6, pep5Variants[1].get(0).variant); // codon[2] variant
1867 assertEquals(1, pep5Variants[2].size());
1868 assertEquals("C", pep5Variants[2].get(0).base); // codon[3] base
1869 assertSame(sf7, pep5Variants[2].get(0).variant); // codon[3] variant
1873 * Tests for the method that computes all peptide variants given codon
1876 @Test(groups = "Functional")
1877 public void testComputePeptideVariants()
1880 * scenario: AAATTTCCC codes for KFP, with variants
1886 * CAC,CGC -> H,R (as one variant)
1888 SequenceI peptide = new Sequence("pep/10-12", "KFP");
1891 * two distinct variants for codon 1 position 1
1892 * second one has clinical significance
1894 SequenceFeature sf1 = new SequenceFeature("sequence_variant", "", 1, 1,
1896 sf1.setValue("alleles", "A,G"); // GAA -> E
1897 sf1.setValue("ID", "var1.125A>G");
1898 SequenceFeature sf2 = new SequenceFeature("sequence_variant", "", 1, 1,
1900 sf2.setValue("alleles", "A,C"); // CAA -> Q
1901 sf2.setValue("ID", "var2");
1902 sf2.setValue("clinical_significance", "Dodgy");
1903 SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 3, 3,
1905 sf3.setValue("alleles", "A,G"); // synonymous
1906 sf3.setValue("ID", "var3");
1907 sf3.setValue("clinical_significance", "None");
1908 SequenceFeature sf4 = new SequenceFeature("sequence_variant", "", 3, 3,
1910 sf4.setValue("alleles", "A,T"); // AAT -> N
1911 sf4.setValue("ID", "sequence_variant:var4"); // prefix gets stripped off
1912 sf4.setValue("clinical_significance", "Benign");
1913 SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 6, 6,
1915 sf5.setValue("alleles", "T,C"); // synonymous
1916 sf5.setValue("ID", "var5");
1917 sf5.setValue("clinical_significance", "Bad");
1918 SequenceFeature sf6 = new SequenceFeature("sequence_variant", "", 8, 8,
1920 sf6.setValue("alleles", "C,A,G"); // CAC,CGC -> H,R
1921 sf6.setValue("ID", "var6");
1922 sf6.setValue("clinical_significance", "Good");
1924 List<DnaVariant> codon1Variants = new ArrayList<DnaVariant>();
1925 List<DnaVariant> codon2Variants = new ArrayList<DnaVariant>();
1926 List<DnaVariant> codon3Variants = new ArrayList<DnaVariant>();
1927 List<DnaVariant> codonVariants[] = new ArrayList[3];
1928 codonVariants[0] = codon1Variants;
1929 codonVariants[1] = codon2Variants;
1930 codonVariants[2] = codon3Variants;
1933 * compute variants for protein position 1
1935 codon1Variants.add(new DnaVariant("A", sf1));
1936 codon1Variants.add(new DnaVariant("A", sf2));
1937 codon2Variants.add(new DnaVariant("A"));
1938 codon2Variants.add(new DnaVariant("A"));
1939 codon3Variants.add(new DnaVariant("A", sf3));
1940 codon3Variants.add(new DnaVariant("A", sf4));
1941 AlignmentUtils.computePeptideVariants(peptide, 1, codonVariants);
1944 * compute variants for protein position 2
1946 codon1Variants.clear();
1947 codon2Variants.clear();
1948 codon3Variants.clear();
1949 codon1Variants.add(new DnaVariant("T"));
1950 codon2Variants.add(new DnaVariant("T"));
1951 codon3Variants.add(new DnaVariant("T", sf5));
1952 AlignmentUtils.computePeptideVariants(peptide, 2, codonVariants);
1955 * compute variants for protein position 3
1957 codon1Variants.clear();
1958 codon2Variants.clear();
1959 codon3Variants.clear();
1960 codon1Variants.add(new DnaVariant("C"));
1961 codon2Variants.add(new DnaVariant("C", sf6));
1962 codon3Variants.add(new DnaVariant("C"));
1963 AlignmentUtils.computePeptideVariants(peptide, 3, codonVariants);
1966 * verify added sequence features for
1973 SequenceFeature[] sfs = peptide.getSequenceFeatures();
1974 assertEquals(5, sfs.length);
1975 SequenceFeature sf = sfs[0];
1976 assertEquals(1, sf.getBegin());
1977 assertEquals(1, sf.getEnd());
1978 assertEquals("p.Lys1Glu", sf.getDescription());
1979 assertEquals("var1.125A>G", sf.getValue("ID"));
1980 assertNull(sf.getValue("clinical_significance"));
1981 assertEquals("ID=var1.125A>G", sf.getAttributes());
1982 assertEquals(1, sf.links.size());
1983 // link to variation is urlencoded
1985 "p.Lys1Glu var1.125A>G|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var1.125A%3EG",
1987 assertEquals("Jalview", sf.getFeatureGroup());
1989 assertEquals(1, sf.getBegin());
1990 assertEquals(1, sf.getEnd());
1991 assertEquals("p.Lys1Gln", sf.getDescription());
1992 assertEquals("var2", sf.getValue("ID"));
1993 assertEquals("Dodgy", sf.getValue("clinical_significance"));
1994 assertEquals("ID=var2;clinical_significance=Dodgy", sf.getAttributes());
1995 assertEquals(1, sf.links.size());
1997 "p.Lys1Gln var2|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var2",
1999 assertEquals("Jalview", sf.getFeatureGroup());
2001 assertEquals(1, sf.getBegin());
2002 assertEquals(1, sf.getEnd());
2003 assertEquals("p.Lys1Asn", sf.getDescription());
2004 assertEquals("var4", sf.getValue("ID"));
2005 assertEquals("Benign", sf.getValue("clinical_significance"));
2006 assertEquals("ID=var4;clinical_significance=Benign", sf.getAttributes());
2007 assertEquals(1, sf.links.size());
2009 "p.Lys1Asn var4|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var4",
2011 assertEquals("Jalview", sf.getFeatureGroup());
2013 assertEquals(3, sf.getBegin());
2014 assertEquals(3, sf.getEnd());
2015 assertEquals("p.Pro3His", sf.getDescription());
2016 assertEquals("var6", sf.getValue("ID"));
2017 assertEquals("Good", sf.getValue("clinical_significance"));
2018 assertEquals("ID=var6;clinical_significance=Good", sf.getAttributes());
2019 assertEquals(1, sf.links.size());
2021 "p.Pro3His var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6",
2023 // var5 generates two distinct protein variant features
2024 assertEquals("Jalview", sf.getFeatureGroup());
2026 assertEquals(3, sf.getBegin());
2027 assertEquals(3, sf.getEnd());
2028 assertEquals("p.Pro3Arg", sf.getDescription());
2029 assertEquals("var6", sf.getValue("ID"));
2030 assertEquals("Good", sf.getValue("clinical_significance"));
2031 assertEquals("ID=var6;clinical_significance=Good", sf.getAttributes());
2032 assertEquals(1, sf.links.size());
2034 "p.Pro3Arg var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6",
2036 assertEquals("Jalview", sf.getFeatureGroup());
2040 * Tests for the method that maps the subset of a dna sequence that has CDS
2041 * (or subtype) feature, with CDS strand = '-' (reverse)
2043 // test turned off as currently findCdsPositions is not strand-dependent
2044 // left in case it comes around again...
2045 @Test(groups = "Functional", enabled = false)
2046 public void testFindCdsPositions_reverseStrand()
2048 SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
2049 dnaSeq.createDatasetSequence();
2050 SequenceI ds = dnaSeq.getDatasetSequence();
2053 SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
2055 ds.addSequenceFeature(sf);
2056 // exon feature should be ignored here
2057 sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
2058 ds.addSequenceFeature(sf);
2059 // CDS for dna 10-12
2060 sf = new SequenceFeature("CDS_predicted", "", 10, 12, 0f, null);
2062 ds.addSequenceFeature(sf);
2064 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
2066 * verify ranges { [12-10], [6-4] }
2068 assertEquals(6, MappingUtils.getLength(ranges));
2069 assertEquals(2, ranges.size());
2070 assertEquals(12, ranges.get(0)[0]);
2071 assertEquals(10, ranges.get(0)[1]);
2072 assertEquals(6, ranges.get(1)[0]);
2073 assertEquals(4, ranges.get(1)[1]);
2077 * Tests for the method that maps the subset of a dna sequence that has CDS
2078 * (or subtype) feature - reverse strand case where the start codon is
2081 @Test(groups = "Functional", enabled = false)
2082 // test turned off as currently findCdsPositions is not strand-dependent
2083 // left in case it comes around again...
2084 public void testFindCdsPositions_reverseStrandThreePrimeIncomplete()
2086 SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
2087 dnaSeq.createDatasetSequence();
2088 SequenceI ds = dnaSeq.getDatasetSequence();
2091 SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
2093 ds.addSequenceFeature(sf);
2094 // CDS for dna 13-15
2095 sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
2097 sf.setPhase("2"); // skip 2 bases to start of next codon
2098 ds.addSequenceFeature(sf);
2100 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
2103 * check the mapping starts with the first complete codon
2104 * expect ranges [13, 13], [9, 5]
2106 assertEquals(6, MappingUtils.getLength(ranges));
2107 assertEquals(2, ranges.size());
2108 assertEquals(13, ranges.get(0)[0]);
2109 assertEquals(13, ranges.get(0)[1]);
2110 assertEquals(9, ranges.get(1)[0]);
2111 assertEquals(5, ranges.get(1)[1]);
2114 @Test(groups = "Functional")
2115 public void testAlignAs_alternateTranscriptsUngapped()
2117 SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa");
2118 SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA");
2119 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
2120 ((Alignment) dna).createDatasetAlignment();
2121 SequenceI cds1 = new Sequence("cds1", "GGGTTT");
2122 SequenceI cds2 = new Sequence("cds2", "CCCAAA");
2123 AlignmentI cds = new Alignment(new SequenceI[] { cds1, cds2 });
2124 ((Alignment) cds).createDatasetAlignment();
2126 AlignedCodonFrame acf = new AlignedCodonFrame();
2127 MapList map = new MapList(new int[] { 4, 9 }, new int[] { 1, 6 }, 1, 1);
2128 acf.addMap(dna1.getDatasetSequence(), cds1.getDatasetSequence(), map);
2129 map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 6 }, 1, 1);
2130 acf.addMap(dna2.getDatasetSequence(), cds2.getDatasetSequence(), map);
2133 * verify CDS alignment is as:
2134 * cccGGGTTTaaa (cdna)
2135 * CCCgggtttAAA (cdna)
2137 * ---GGGTTT--- (cds)
2138 * CCC------AAA (cds)
2140 dna.addCodonFrame(acf);
2141 AlignmentUtils.alignAs(cds, dna);
2142 assertEquals("---GGGTTT", cds.getSequenceAt(0).getSequenceAsString());
2143 assertEquals("CCC------AAA", cds.getSequenceAt(1).getSequenceAsString());
2146 @Test(groups = { "Functional" })
2147 public void testAddMappedPositions()
2149 SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g");
2150 SequenceI seq1 = new Sequence("cds", "AAATTT");
2151 from.createDatasetSequence();
2152 seq1.createDatasetSequence();
2153 Mapping mapping = new Mapping(seq1, new MapList(
2154 new int[] { 3, 6, 9, 10 },
2155 new int[] { 1, 6 }, 1, 1));
2156 Map<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
2157 AlignmentUtils.addMappedPositions(seq1, from, mapping, map);
2160 * verify map has seq1 residues in columns 3,4,6,7,11,12
2162 assertEquals(6, map.size());
2163 assertEquals('A', map.get(3).get(seq1).charValue());
2164 assertEquals('A', map.get(4).get(seq1).charValue());
2165 assertEquals('A', map.get(6).get(seq1).charValue());
2166 assertEquals('T', map.get(7).get(seq1).charValue());
2167 assertEquals('T', map.get(11).get(seq1).charValue());
2168 assertEquals('T', map.get(12).get(seq1).charValue());
2176 * Test case where the mapping 'from' range includes a stop codon which is
2177 * absent in the 'to' range
2179 @Test(groups = { "Functional" })
2180 public void testAddMappedPositions_withStopCodon()
2182 SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g");
2183 SequenceI seq1 = new Sequence("cds", "AAATTT");
2184 from.createDatasetSequence();
2185 seq1.createDatasetSequence();
2186 Mapping mapping = new Mapping(seq1, new MapList(
2187 new int[] { 3, 6, 9, 10 },
2188 new int[] { 1, 6 }, 1, 1));
2189 Map<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
2190 AlignmentUtils.addMappedPositions(seq1, from, mapping, map);
2193 * verify map has seq1 residues in columns 3,4,6,7,11,12
2195 assertEquals(6, map.size());
2196 assertEquals('A', map.get(3).get(seq1).charValue());
2197 assertEquals('A', map.get(4).get(seq1).charValue());
2198 assertEquals('A', map.get(6).get(seq1).charValue());
2199 assertEquals('T', map.get(7).get(seq1).charValue());
2200 assertEquals('T', map.get(11).get(seq1).charValue());
2201 assertEquals('T', map.get(12).get(seq1).charValue());