2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import static org.testng.AssertJUnit.assertEquals;
24 import static org.testng.AssertJUnit.assertFalse;
25 import static org.testng.AssertJUnit.assertNull;
26 import static org.testng.AssertJUnit.assertSame;
27 import static org.testng.AssertJUnit.assertTrue;
29 import jalview.datamodel.AlignedCodonFrame;
30 import jalview.datamodel.Alignment;
31 import jalview.datamodel.AlignmentAnnotation;
32 import jalview.datamodel.AlignmentI;
33 import jalview.datamodel.Annotation;
34 import jalview.datamodel.DBRefEntry;
35 import jalview.datamodel.Mapping;
36 import jalview.datamodel.SearchResults;
37 import jalview.datamodel.SearchResults.Match;
38 import jalview.datamodel.Sequence;
39 import jalview.datamodel.SequenceFeature;
40 import jalview.datamodel.SequenceI;
41 import jalview.io.AppletFormatAdapter;
42 import jalview.io.FormatAdapter;
43 import jalview.util.MapList;
44 import jalview.util.MappingUtils;
46 import java.io.IOException;
47 import java.util.ArrayList;
48 import java.util.Arrays;
49 import java.util.Iterator;
50 import java.util.LinkedHashMap;
51 import java.util.List;
54 import org.testng.annotations.Test;
56 public class AlignmentUtilsTests
58 public static Sequence ts = new Sequence("short",
59 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklm");
61 @Test(groups = { "Functional" })
62 public void testExpandContext()
64 AlignmentI al = new Alignment(new Sequence[] {});
65 for (int i = 4; i < 14; i += 2)
67 SequenceI s1 = ts.deriveSequence().getSubSequence(i, i + 7);
70 System.out.println(new AppletFormatAdapter().formatSequences("Clustal",
72 for (int flnk = -1; flnk < 25; flnk++)
74 AlignmentI exp = AlignmentUtils.expandContext(al, flnk);
75 System.out.println("\nFlank size: " + flnk);
76 System.out.println(new AppletFormatAdapter().formatSequences(
77 "Clustal", exp, true));
81 * Full expansion to complete sequences
83 for (SequenceI sq : exp.getSequences())
85 String ung = sq.getSequenceAsString().replaceAll("-+", "");
86 final String errorMsg = "Flanking sequence not the same as original dataset sequence.\n"
89 + sq.getDatasetSequence().getSequenceAsString();
90 assertTrue(errorMsg, ung.equalsIgnoreCase(sq.getDatasetSequence()
91 .getSequenceAsString()));
97 * Last sequence is fully expanded, others have leading gaps to match
99 assertTrue(exp.getSequenceAt(4).getSequenceAsString()
101 assertTrue(exp.getSequenceAt(3).getSequenceAsString()
102 .startsWith("--abc"));
103 assertTrue(exp.getSequenceAt(2).getSequenceAsString()
104 .startsWith("----abc"));
105 assertTrue(exp.getSequenceAt(1).getSequenceAsString()
106 .startsWith("------abc"));
107 assertTrue(exp.getSequenceAt(0).getSequenceAsString()
108 .startsWith("--------abc"));
114 * Test that annotations are correctly adjusted by expandContext
116 @Test(groups = { "Functional" })
117 public void testExpandContext_annotation()
119 AlignmentI al = new Alignment(new Sequence[] {});
120 SequenceI ds = new Sequence("Seq1", "ABCDEFGHI");
122 SequenceI seq1 = ds.deriveSequence().getSubSequence(3, 6);
123 al.addSequence(seq1);
126 * Annotate DEF with 4/5/6 respectively
128 Annotation[] anns = new Annotation[] { new Annotation(4),
129 new Annotation(5), new Annotation(6) };
130 AlignmentAnnotation ann = new AlignmentAnnotation("SS",
131 "secondary structure", anns);
132 seq1.addAlignmentAnnotation(ann);
135 * The annotations array should match aligned positions
137 assertEquals(3, ann.annotations.length);
138 assertEquals(4, ann.annotations[0].value, 0.001);
139 assertEquals(5, ann.annotations[1].value, 0.001);
140 assertEquals(6, ann.annotations[2].value, 0.001);
143 * Check annotation to sequence position mappings before expanding the
144 * sequence; these are set up in Sequence.addAlignmentAnnotation ->
145 * Annotation.setSequenceRef -> createSequenceMappings
147 assertNull(ann.getAnnotationForPosition(1));
148 assertNull(ann.getAnnotationForPosition(2));
149 assertNull(ann.getAnnotationForPosition(3));
150 assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
151 assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
152 assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
153 assertNull(ann.getAnnotationForPosition(7));
154 assertNull(ann.getAnnotationForPosition(8));
155 assertNull(ann.getAnnotationForPosition(9));
158 * Expand the subsequence to the full sequence abcDEFghi
160 AlignmentI expanded = AlignmentUtils.expandContext(al, -1);
161 assertEquals("abcDEFghi", expanded.getSequenceAt(0)
162 .getSequenceAsString());
165 * Confirm the alignment and sequence have the same SS annotation,
166 * referencing the expanded sequence
168 ann = expanded.getSequenceAt(0).getAnnotation()[0];
169 assertSame(ann, expanded.getAlignmentAnnotation()[0]);
170 assertSame(expanded.getSequenceAt(0), ann.sequenceRef);
173 * The annotations array should have null values except for annotated
176 assertNull(ann.annotations[0]);
177 assertNull(ann.annotations[1]);
178 assertNull(ann.annotations[2]);
179 assertEquals(4, ann.annotations[3].value, 0.001);
180 assertEquals(5, ann.annotations[4].value, 0.001);
181 assertEquals(6, ann.annotations[5].value, 0.001);
182 assertNull(ann.annotations[6]);
183 assertNull(ann.annotations[7]);
184 assertNull(ann.annotations[8]);
187 * sequence position mappings should be unchanged
189 assertNull(ann.getAnnotationForPosition(1));
190 assertNull(ann.getAnnotationForPosition(2));
191 assertNull(ann.getAnnotationForPosition(3));
192 assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
193 assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
194 assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
195 assertNull(ann.getAnnotationForPosition(7));
196 assertNull(ann.getAnnotationForPosition(8));
197 assertNull(ann.getAnnotationForPosition(9));
201 * Test method that returns a map of lists of sequences by sequence name.
203 * @throws IOException
205 @Test(groups = { "Functional" })
206 public void testGetSequencesByName() throws IOException
208 final String data = ">Seq1Name\nKQYL\n" + ">Seq2Name\nRFPW\n"
209 + ">Seq1Name\nABCD\n";
210 AlignmentI al = loadAlignment(data, "FASTA");
211 Map<String, List<SequenceI>> map = AlignmentUtils
212 .getSequencesByName(al);
213 assertEquals(2, map.keySet().size());
214 assertEquals(2, map.get("Seq1Name").size());
215 assertEquals("KQYL", map.get("Seq1Name").get(0).getSequenceAsString());
216 assertEquals("ABCD", map.get("Seq1Name").get(1).getSequenceAsString());
217 assertEquals(1, map.get("Seq2Name").size());
218 assertEquals("RFPW", map.get("Seq2Name").get(0).getSequenceAsString());
222 * Helper method to load an alignment and ensure dataset sequences are set up.
228 * @throws IOException
230 protected AlignmentI loadAlignment(final String data, String format)
233 AlignmentI a = new FormatAdapter().readFile(data,
234 AppletFormatAdapter.PASTE, format);
240 * Test mapping of protein to cDNA, for the case where we have no sequence
241 * cross-references, so mappings are made first-served 1-1 where sequences
244 * @throws IOException
246 @Test(groups = { "Functional" })
247 public void testMapProteinAlignmentToCdna_noXrefs() throws IOException
249 List<SequenceI> protseqs = new ArrayList<SequenceI>();
250 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
251 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
252 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
253 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
254 protein.setDataset(null);
256 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
257 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
258 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAA")); // = EIQ
259 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
260 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
261 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
262 cdna.setDataset(null);
264 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
266 // 3 mappings made, each from 1 to 1 sequence
267 assertEquals(3, protein.getCodonFrames().size());
268 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
269 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
270 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
272 // V12345 mapped to A22222
273 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
275 assertEquals(1, acf.getdnaSeqs().length);
276 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
277 acf.getdnaSeqs()[0]);
278 Mapping[] protMappings = acf.getProtMappings();
279 assertEquals(1, protMappings.length);
280 MapList mapList = protMappings[0].getMap();
281 assertEquals(3, mapList.getFromRatio());
282 assertEquals(1, mapList.getToRatio());
283 assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
285 assertEquals(1, mapList.getFromRanges().size());
286 assertTrue(Arrays.equals(new int[] { 1, 3 },
287 mapList.getToRanges().get(0)));
288 assertEquals(1, mapList.getToRanges().size());
290 // V12346 mapped to A33333
291 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
292 assertEquals(1, acf.getdnaSeqs().length);
293 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
294 acf.getdnaSeqs()[0]);
296 // V12347 mapped to A11111
297 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
298 assertEquals(1, acf.getdnaSeqs().length);
299 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
300 acf.getdnaSeqs()[0]);
302 // no mapping involving the 'extra' A44444
303 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
307 * Test for the alignSequenceAs method that takes two sequences and a mapping.
309 @Test(groups = { "Functional" })
310 public void testAlignSequenceAs_withMapping_noIntrons()
312 MapList map = new MapList(new int[] { 1, 6 }, new int[] { 1, 2 }, 3, 1);
315 * No existing gaps in dna:
317 checkAlignSequenceAs("GGGAAA", "-A-L-", false, false, map,
321 * Now introduce gaps in dna but ignore them when realigning.
323 checkAlignSequenceAs("-G-G-G-A-A-A-", "-A-L-", false, false, map,
327 * Now include gaps in dna when realigning. First retaining 'mapped' gaps
328 * only, i.e. those within the exon region.
330 checkAlignSequenceAs("-G-G--G-A--A-A-", "-A-L-", true, false, map,
331 "---G-G--G---A--A-A");
334 * Include all gaps in dna when realigning (within and without the exon
335 * region). The leading gap, and the gaps between codons, are subsumed by
336 * the protein alignment gap.
338 checkAlignSequenceAs("-G-GG--AA-A---", "-A-L-", true, true, map,
339 "---G-GG---AA-A---");
342 * Include only unmapped gaps in dna when realigning (outside the exon
343 * region). The leading gap, and the gaps between codons, are subsumed by
344 * the protein alignment gap.
346 checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", false, true, map,
351 * Test for the alignSequenceAs method that takes two sequences and a mapping.
353 @Test(groups = { "Functional" })
354 public void testAlignSequenceAs_withMapping_withIntrons()
357 * Exons at codon 2 (AAA) and 4 (TTT)
359 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
360 new int[] { 1, 2 }, 3, 1);
363 * Simple case: no gaps in dna
365 checkAlignSequenceAs("GGGAAACCCTTTGGG", "--A-L-", false, false, map,
366 "GGG---AAACCCTTTGGG");
369 * Add gaps to dna - but ignore when realigning.
371 checkAlignSequenceAs("-G-G-G--A--A---AC-CC-T-TT-GG-G-", "--A-L-",
372 false, false, map, "GGG---AAACCCTTTGGG");
375 * Add gaps to dna - include within exons only when realigning.
377 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
378 true, false, map, "GGG---A--A---ACCCT-TTGGG");
381 * Include gaps outside exons only when realigning.
383 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
384 false, true, map, "-G-G-GAAAC-CCTTT-GG-G-");
387 * Include gaps following first intron if we are 'preserving mapped gaps'
389 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
390 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
393 * Include all gaps in dna when realigning.
395 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
396 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
400 * Test for the case where not all of the protein sequence is mapped to cDNA.
402 @Test(groups = { "Functional" })
403 public void testAlignSequenceAs_withMapping_withUnmappedProtein()
406 * Exons at codon 2 (AAA) and 4 (TTT) mapped to A and P
408 final MapList map = new MapList(new int[] { 4, 6, 10, 12 }, new int[] {
412 * -L- 'aligns' ccc------
414 checkAlignSequenceAs("gggAAAcccTTTggg", "-A-L-P-", false, false, map,
415 "gggAAAccc------TTTggg");
419 * Helper method that performs and verifies the method under test.
422 * the sequence to be realigned
424 * the sequence whose alignment is to be copied
425 * @param preserveMappedGaps
426 * @param preserveUnmappedGaps
430 protected void checkAlignSequenceAs(final String alignee,
431 final String alignModel, final boolean preserveMappedGaps,
432 final boolean preserveUnmappedGaps, MapList map,
433 final String expected)
435 SequenceI alignMe = new Sequence("Seq1", alignee);
436 alignMe.createDatasetSequence();
437 SequenceI alignFrom = new Sequence("Seq2", alignModel);
438 alignFrom.createDatasetSequence();
439 AlignedCodonFrame acf = new AlignedCodonFrame();
440 acf.addMap(alignMe.getDatasetSequence(), alignFrom.getDatasetSequence(), map);
442 AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "---", '-',
443 preserveMappedGaps, preserveUnmappedGaps);
444 assertEquals(expected, alignMe.getSequenceAsString());
448 * Test for the alignSequenceAs method where we preserve gaps in introns only.
450 @Test(groups = { "Functional" })
451 public void testAlignSequenceAs_keepIntronGapsOnly()
455 * Intron GGGAAA followed by exon CCCTTT
457 MapList map = new MapList(new int[] { 7, 12 }, new int[] { 1, 2 }, 3, 1);
459 checkAlignSequenceAs("GG-G-AA-A-C-CC-T-TT", "AL", false, true, map,
464 * Test the method that realigns protein to match mapped codon alignment.
466 @Test(groups = { "Functional" })
467 public void testAlignProteinAsDna()
469 // seq1 codons are [1,2,3] [4,5,6] [7,8,9] [10,11,12]
470 SequenceI dna1 = new Sequence("Seq1", "TGCCATTACCAG-");
471 // seq2 codons are [1,3,4] [5,6,7] [8,9,10] [11,12,13]
472 SequenceI dna2 = new Sequence("Seq2", "T-GCCATTACCAG");
473 // seq3 codons are [1,2,3] [4,5,7] [8,9,10] [11,12,13]
474 SequenceI dna3 = new Sequence("Seq3", "TGCCA-TTACCAG");
475 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
476 dna.setDataset(null);
478 // protein alignment will be realigned like dna
479 SequenceI prot1 = new Sequence("Seq1", "CHYQ");
480 SequenceI prot2 = new Sequence("Seq2", "CHYQ");
481 SequenceI prot3 = new Sequence("Seq3", "CHYQ");
482 SequenceI prot4 = new Sequence("Seq4", "R-QSV"); // unmapped, unchanged
483 AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
485 protein.setDataset(null);
487 MapList map = new MapList(new int[] { 1, 12 }, new int[] { 1, 4 }, 3, 1);
488 AlignedCodonFrame acf = new AlignedCodonFrame();
489 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
490 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
491 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
492 ArrayList<AlignedCodonFrame> acfs = new ArrayList<AlignedCodonFrame>();
494 protein.setCodonFrames(acfs);
497 * Translated codon order is [1,2,3] [1,3,4] [4,5,6] [4,5,7] [5,6,7] [7,8,9]
498 * [8,9,10] [10,11,12] [11,12,13]
500 AlignmentUtils.alignProteinAsDna(protein, dna);
501 assertEquals("C-H--Y-Q-", prot1.getSequenceAsString());
502 assertEquals("-C--H-Y-Q", prot2.getSequenceAsString());
503 assertEquals("C--H--Y-Q", prot3.getSequenceAsString());
504 assertEquals("R-QSV", prot4.getSequenceAsString());
508 * Test the method that tests whether a CDNA sequence translates to a protein
511 @Test(groups = { "Functional" })
512 public void testTranslatesAs()
514 // null arguments check
515 assertFalse(AlignmentUtils.translatesAs(null, 0, null));
516 assertFalse(AlignmentUtils.translatesAs(new char[] { 't' }, 0, null));
517 assertFalse(AlignmentUtils.translatesAs(null, 0, new char[] { 'a' }));
519 // straight translation
520 assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0,
521 "FPKG".toCharArray()));
522 // with extra start codon (not in protein)
523 assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(),
524 3, "FPKG".toCharArray()));
525 // with stop codon1 (not in protein)
526 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
527 0, "FPKG".toCharArray()));
528 // with stop codon1 (in protein as *)
529 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
530 0, "FPKG*".toCharArray()));
531 // with stop codon2 (not in protein)
532 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtag".toCharArray(),
533 0, "FPKG".toCharArray()));
534 // with stop codon3 (not in protein)
535 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtga".toCharArray(),
536 0, "FPKG".toCharArray()));
537 // with start and stop codon1
538 assertTrue(AlignmentUtils.translatesAs(
539 "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG".toCharArray()));
540 // with start and stop codon1 (in protein as *)
541 assertTrue(AlignmentUtils.translatesAs(
542 "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG*".toCharArray()));
543 // with start and stop codon2
544 assertTrue(AlignmentUtils.translatesAs(
545 "atgtttcccaaagggtag".toCharArray(), 3, "FPKG".toCharArray()));
546 // with start and stop codon3
547 assertTrue(AlignmentUtils.translatesAs(
548 "atgtttcccaaagggtga".toCharArray(), 3, "FPKG".toCharArray()));
550 // with embedded stop codons
551 assertTrue(AlignmentUtils.translatesAs(
552 "atgtttTAGcccaaaTAAgggtga".toCharArray(), 3,
553 "F*PK*G".toCharArray()));
556 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
557 0, "FPMG".toCharArray()));
560 assertFalse(AlignmentUtils.translatesAs("tttcccaaagg".toCharArray(), 0,
561 "FPKG".toCharArray()));
564 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
565 0, "FPK".toCharArray()));
567 // overlong dna (doesn't end in stop codon)
568 assertFalse(AlignmentUtils.translatesAs(
569 "tttcccaaagggttt".toCharArray(), 0, "FPKG".toCharArray()));
571 // dna + stop codon + more
572 assertFalse(AlignmentUtils.translatesAs(
573 "tttcccaaagggttaga".toCharArray(), 0, "FPKG".toCharArray()));
576 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
577 0, "FPKGQ".toCharArray()));
581 * Test mapping of protein to cDNA, for cases where the cDNA has start and/or
582 * stop codons in addition to the protein coding sequence.
584 * @throws IOException
586 @Test(groups = { "Functional" })
587 public void testMapProteinAlignmentToCdna_withStartAndStopCodons()
590 List<SequenceI> protseqs = new ArrayList<SequenceI>();
591 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
592 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
593 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
594 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
595 protein.setDataset(null);
597 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
599 dnaseqs.add(new Sequence("EMBL|A11111", "ATGTCAGCACGC"));
601 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAATAA"));
602 // = start +EIQ + stop
603 dnaseqs.add(new Sequence("EMBL|A33333", "ATGGAAATCCAGTAG"));
604 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG"));
605 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
606 cdna.setDataset(null);
608 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
610 // 3 mappings made, each from 1 to 1 sequence
611 assertEquals(3, protein.getCodonFrames().size());
612 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
613 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
614 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
616 // V12345 mapped from A22222
617 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
619 assertEquals(1, acf.getdnaSeqs().length);
620 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
621 acf.getdnaSeqs()[0]);
622 Mapping[] protMappings = acf.getProtMappings();
623 assertEquals(1, protMappings.length);
624 MapList mapList = protMappings[0].getMap();
625 assertEquals(3, mapList.getFromRatio());
626 assertEquals(1, mapList.getToRatio());
627 assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
629 assertEquals(1, mapList.getFromRanges().size());
630 assertTrue(Arrays.equals(new int[] { 1, 3 },
631 mapList.getToRanges().get(0)));
632 assertEquals(1, mapList.getToRanges().size());
634 // V12346 mapped from A33333 starting position 4
635 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
636 assertEquals(1, acf.getdnaSeqs().length);
637 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
638 acf.getdnaSeqs()[0]);
639 protMappings = acf.getProtMappings();
640 assertEquals(1, protMappings.length);
641 mapList = protMappings[0].getMap();
642 assertEquals(3, mapList.getFromRatio());
643 assertEquals(1, mapList.getToRatio());
644 assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
646 assertEquals(1, mapList.getFromRanges().size());
647 assertTrue(Arrays.equals(new int[] { 1, 3 },
648 mapList.getToRanges().get(0)));
649 assertEquals(1, mapList.getToRanges().size());
651 // V12347 mapped to A11111 starting position 4
652 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
653 assertEquals(1, acf.getdnaSeqs().length);
654 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
655 acf.getdnaSeqs()[0]);
656 protMappings = acf.getProtMappings();
657 assertEquals(1, protMappings.length);
658 mapList = protMappings[0].getMap();
659 assertEquals(3, mapList.getFromRatio());
660 assertEquals(1, mapList.getToRatio());
661 assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
663 assertEquals(1, mapList.getFromRanges().size());
664 assertTrue(Arrays.equals(new int[] { 1, 3 },
665 mapList.getToRanges().get(0)));
666 assertEquals(1, mapList.getToRanges().size());
668 // no mapping involving the 'extra' A44444
669 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
673 * Test mapping of protein to cDNA, for the case where we have some sequence
674 * cross-references. Verify that 1-to-many mappings are made where
675 * cross-references exist and sequences are mappable.
677 * @throws IOException
679 @Test(groups = { "Functional" })
680 public void testMapProteinAlignmentToCdna_withXrefs() throws IOException
682 List<SequenceI> protseqs = new ArrayList<SequenceI>();
683 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
684 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
685 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
686 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
687 protein.setDataset(null);
689 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
690 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
691 dnaseqs.add(new Sequence("EMBL|A22222", "ATGGAGATACAA")); // = start + EIQ
692 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
693 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
694 dnaseqs.add(new Sequence("EMBL|A55555", "GAGATTCAG")); // = EIQ
695 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[5]));
696 cdna.setDataset(null);
698 // Xref A22222 to V12345 (should get mapped)
699 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
700 // Xref V12345 to A44444 (should get mapped)
701 protseqs.get(0).addDBRef(new DBRefEntry("EMBL", "1", "A44444"));
702 // Xref A33333 to V12347 (sequence mismatch - should not get mapped)
703 dnaseqs.get(2).addDBRef(new DBRefEntry("UNIPROT", "1", "V12347"));
704 // as V12345 is mapped to A22222 and A44444, this leaves V12346 unmapped.
705 // it should get paired up with the unmapped A33333
706 // A11111 should be mapped to V12347
707 // A55555 is spare and has no xref so is not mapped
709 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
711 // 4 protein mappings made for 3 proteins, 2 to V12345, 1 each to V12346/7
712 assertEquals(3, protein.getCodonFrames().size());
713 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
714 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
715 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
717 // one mapping for each of the first 4 cDNA sequences
718 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
719 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
720 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(2)).size());
721 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(3)).size());
723 // V12345 mapped to A22222 and A44444
724 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
726 assertEquals(2, acf.getdnaSeqs().length);
727 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
728 acf.getdnaSeqs()[0]);
729 assertEquals(cdna.getSequenceAt(3).getDatasetSequence(),
730 acf.getdnaSeqs()[1]);
732 // V12346 mapped to A33333
733 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
734 assertEquals(1, acf.getdnaSeqs().length);
735 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
736 acf.getdnaSeqs()[0]);
738 // V12347 mapped to A11111
739 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
740 assertEquals(1, acf.getdnaSeqs().length);
741 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
742 acf.getdnaSeqs()[0]);
744 // no mapping involving the 'extra' A55555
745 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(4)).isEmpty());
749 * Test mapping of protein to cDNA, for the case where we have some sequence
750 * cross-references. Verify that once we have made an xref mapping we don't
751 * also map un-xrefd sequences.
753 * @throws IOException
755 @Test(groups = { "Functional" })
756 public void testMapProteinAlignmentToCdna_prioritiseXrefs()
759 List<SequenceI> protseqs = new ArrayList<SequenceI>();
760 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
761 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
762 AlignmentI protein = new Alignment(
763 protseqs.toArray(new SequenceI[protseqs.size()]));
764 protein.setDataset(null);
766 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
767 dnaseqs.add(new Sequence("EMBL|A11111", "GAAATCCAG")); // = EIQ
768 dnaseqs.add(new Sequence("EMBL|A22222", "GAAATTCAG")); // = EIQ
769 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[dnaseqs
771 cdna.setDataset(null);
773 // Xref A22222 to V12345 (should get mapped)
774 // A11111 should then be mapped to the unmapped V12346
775 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
777 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
779 // 2 protein mappings made
780 assertEquals(2, protein.getCodonFrames().size());
781 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
782 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
784 // one mapping for each of the cDNA sequences
785 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
786 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
788 // V12345 mapped to A22222
789 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
791 assertEquals(1, acf.getdnaSeqs().length);
792 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
793 acf.getdnaSeqs()[0]);
795 // V12346 mapped to A11111
796 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
797 assertEquals(1, acf.getdnaSeqs().length);
798 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
799 acf.getdnaSeqs()[0]);
803 * Test the method that shows or hides sequence annotations by type(s) and
806 @Test(groups = { "Functional" })
807 public void testShowOrHideSequenceAnnotations()
809 SequenceI seq1 = new Sequence("Seq1", "AAA");
810 SequenceI seq2 = new Sequence("Seq2", "BBB");
811 SequenceI seq3 = new Sequence("Seq3", "CCC");
812 Annotation[] anns = new Annotation[] { new Annotation(2f) };
813 AlignmentAnnotation ann1 = new AlignmentAnnotation("Structure", "ann1",
815 ann1.setSequenceRef(seq1);
816 AlignmentAnnotation ann2 = new AlignmentAnnotation("Structure", "ann2",
818 ann2.setSequenceRef(seq2);
819 AlignmentAnnotation ann3 = new AlignmentAnnotation("Structure", "ann3",
821 AlignmentAnnotation ann4 = new AlignmentAnnotation("Temp", "ann4", anns);
822 ann4.setSequenceRef(seq1);
823 AlignmentAnnotation ann5 = new AlignmentAnnotation("Temp", "ann5", anns);
824 ann5.setSequenceRef(seq2);
825 AlignmentAnnotation ann6 = new AlignmentAnnotation("Temp", "ann6", anns);
826 AlignmentI al = new Alignment(new SequenceI[] { seq1, seq2, seq3 });
827 al.addAnnotation(ann1); // Structure for Seq1
828 al.addAnnotation(ann2); // Structure for Seq2
829 al.addAnnotation(ann3); // Structure for no sequence
830 al.addAnnotation(ann4); // Temp for seq1
831 al.addAnnotation(ann5); // Temp for seq2
832 al.addAnnotation(ann6); // Temp for no sequence
833 List<String> types = new ArrayList<String>();
834 List<SequenceI> scope = new ArrayList<SequenceI>();
837 * Set all sequence related Structure to hidden (ann1, ann2)
839 types.add("Structure");
840 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
842 assertFalse(ann1.visible);
843 assertFalse(ann2.visible);
844 assertTrue(ann3.visible); // not sequence-related, not affected
845 assertTrue(ann4.visible); // not Structure, not affected
846 assertTrue(ann5.visible); // "
847 assertTrue(ann6.visible); // not sequence-related, not affected
850 * Set Temp in {seq1, seq3} to hidden
856 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, false,
858 assertFalse(ann1.visible); // unchanged
859 assertFalse(ann2.visible); // unchanged
860 assertTrue(ann3.visible); // not sequence-related, not affected
861 assertFalse(ann4.visible); // Temp for seq1 hidden
862 assertTrue(ann5.visible); // not in scope, not affected
863 assertTrue(ann6.visible); // not sequence-related, not affected
866 * Set Temp in all sequences to hidden
872 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
874 assertFalse(ann1.visible); // unchanged
875 assertFalse(ann2.visible); // unchanged
876 assertTrue(ann3.visible); // not sequence-related, not affected
877 assertFalse(ann4.visible); // Temp for seq1 hidden
878 assertFalse(ann5.visible); // Temp for seq2 hidden
879 assertTrue(ann6.visible); // not sequence-related, not affected
882 * Set all types in {seq1, seq3} to visible
888 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, true,
890 assertTrue(ann1.visible); // Structure for seq1 set visible
891 assertFalse(ann2.visible); // not in scope, unchanged
892 assertTrue(ann3.visible); // not sequence-related, not affected
893 assertTrue(ann4.visible); // Temp for seq1 set visible
894 assertFalse(ann5.visible); // not in scope, unchanged
895 assertTrue(ann6.visible); // not sequence-related, not affected
898 * Set all types in all scope to hidden
900 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, true,
902 assertFalse(ann1.visible);
903 assertFalse(ann2.visible);
904 assertTrue(ann3.visible); // not sequence-related, not affected
905 assertFalse(ann4.visible);
906 assertFalse(ann5.visible);
907 assertTrue(ann6.visible); // not sequence-related, not affected
911 * Tests for the method that checks if one sequence cross-references another
// Covers AlignmentUtils.hasCrossRef(seq1, seq2) for null arguments, for
// sequences to which no DBRef has been added, for a non-matching DBRef and
// for matching DBRefs, and finally checks that the result is directional.
913 @Test(groups = { "Functional" })
914 public void testHasCrossRef()
// a null on either side (or both) is expected to give false
916 assertFalse(AlignmentUtils.hasCrossRef(null, null));
917 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
918 assertFalse(AlignmentUtils.hasCrossRef(seq1, null));
919 assertFalse(AlignmentUtils.hasCrossRef(null, seq1));
920 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
// no DBRef has been added to either sequence at this point
921 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
// a UNIPROT ref whose accession (v20193) differs from seq2's name: no match
924 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20193"));
925 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
927 // case-insensitive; version number is ignored
928 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20192"));
929 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
// the accession written in the same case as in seq2's name matches as well
932 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
933 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
934 // test is one-way only
935 assertFalse(AlignmentUtils.hasCrossRef(seq2, seq1));
939 * Tests for the method that checks if either sequence cross-references the
// Covers AlignmentUtils.haveCrossRef(seq1, seq2), the symmetric counterpart
// of hasCrossRef: a DBRef held by either sequence is expected to make the
// result true in both argument orders.
942 @Test(groups = { "Functional" })
943 public void testHaveCrossRef()
// a null on either side (or both) is expected to give false; the null/null
// case exercises haveCrossRef, the method this test is about
945 assertFalse(AlignmentUtils.haveCrossRef(null, null));
946 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
947 assertFalse(AlignmentUtils.haveCrossRef(seq1, null));
948 assertFalse(AlignmentUtils.haveCrossRef(null, seq1));
949 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
// no DBRef has been added to either sequence at this point
950 assertFalse(AlignmentUtils.haveCrossRef(seq1, seq2));
// seq1 refers to seq2 only
952 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
953 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
954 // next is true for haveCrossRef, false for hasCrossRef
955 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
957 // now the other way round
958 seq1.setDBRefs(null);
959 seq2.addDBRef(new DBRefEntry("EMBL", "1", "A12345"));
960 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
961 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
// restore seq1's ref so that each sequence now refers to the other
964 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
965 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
966 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
970 * Test the method that extracts the cds-only part of a dna alignment.
// Two dna sequences carry CDS features and are mapped to one peptide each.
// The test checks the CDS sequences produced, that the CDS alignment shares
// the dna alignment's dataset and codon frames, and that each peptide residue
// maps to the expected three-base range of its CDS sequence.
// NOTE(review): several statements below end in a trailing comma; their
// continuation lines are not visible here - confirm against the full file.
972 @Test(groups = { "Functional" })
973 public void testMakeCdsAlignment()
975 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
976 SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC");
977 SequenceI pep1 = new Sequence("pep1", "GF");
978 SequenceI pep2 = new Sequence("pep2", "GFP");
979 dna1.createDatasetSequence();
980 dna2.createDatasetSequence();
981 pep1.createDatasetSequence();
982 pep2.createDatasetSequence();
// CDS features cover the upper-case codons: dna1 4-6 (GGG) and 10-12 (TTT)
983 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 6, 0f,
985 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 10, 12, 0f,
// dna2 1-3 (GGG), 7-9 (TTT) and 13-15 (CCC)
987 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds3", 1, 3, 0f,
989 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds4", 7, 9, 0f,
991 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds5", 13, 15, 0f,
993 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
994 dna.setDataset(null);
// dna1 ranges [4-6], [10-12] are mapped to pep1 residues 1-2 (3 bases : 1 residue)
996 List<AlignedCodonFrame> mappings = new ArrayList<AlignedCodonFrame>();
997 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
998 new int[] { 1, 2 }, 3, 1);
999 AlignedCodonFrame acf = new AlignedCodonFrame();
1000 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
// dna2 ranges [1-3], [7-9], [13-15] are mapped to pep2 residues 1-3
1002 map = new MapList(new int[] { 1, 3, 7, 9, 13, 15 }, new int[] { 1, 3 },
1004 acf = new AlignedCodonFrame();
1005 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
1009 * execute method under test:
1011 AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
1012 dna1, dna2 }, mappings, dna);
// one CDS sequence per mapped dna sequence, holding only the CDS bases
1014 assertEquals(2, cds.getSequences().size());
1015 assertEquals("GGGTTT", cds.getSequenceAt(0)
1016 .getSequenceAsString());
1017 assertEquals("GGGTTTCCC", cds.getSequenceAt(1)
1018 .getSequenceAsString());
1021 * verify shared, extended alignment dataset
1023 assertSame(dna.getDataset(), cds.getDataset());
1024 assertTrue(dna.getDataset().getSequences()
1025 .contains(cds.getSequenceAt(0).getDatasetSequence()));
1026 assertTrue(dna.getDataset().getSequences()
1027 .contains(cds.getSequenceAt(1).getDatasetSequence()));
1030 * Verify mappings from CDS to peptide and cDNA to CDS
1031 * the mappings are on the shared alignment dataset
1033 assertSame(dna.getCodonFrames(), cds.getCodonFrames());
1034 List<AlignedCodonFrame> cdsMappings = cds.getCodonFrames();
1035 assertEquals(2, cdsMappings.size());
1038 * Mapping from pep1 to GGGTTT in first new exon sequence
1040 List<AlignedCodonFrame> pep1Mapping = MappingUtils
1041 .findMappingsForSequence(pep1, cdsMappings);
1042 assertEquals(1, pep1Mapping.size());
// pep1 residue 1 is expected at CDS positions 1-3
1044 SearchResults sr = MappingUtils
1045 .buildSearchResults(pep1, 1, cdsMappings);
1046 assertEquals(1, sr.getResults().size());
1047 Match m = sr.getResults().get(0);
1048 assertSame(cds.getSequenceAt(0).getDatasetSequence(),
1050 assertEquals(1, m.getStart());
1051 assertEquals(3, m.getEnd());
// pep1 residue 2 is expected at CDS positions 4-6
1053 sr = MappingUtils.buildSearchResults(pep1, 2, cdsMappings);
1054 m = sr.getResults().get(0);
1055 assertSame(cds.getSequenceAt(0).getDatasetSequence(),
1057 assertEquals(4, m.getStart());
1058 assertEquals(6, m.getEnd());
1061 * Mapping from pep2 to GGGTTTCCC in second new exon sequence
1063 List<AlignedCodonFrame> pep2Mapping = MappingUtils
1064 .findMappingsForSequence(pep2, cdsMappings);
1065 assertEquals(1, pep2Mapping.size());
// pep2 residues 1, 2 and 3 are expected at CDS positions 1-3, 4-6 and 7-9
1067 sr = MappingUtils.buildSearchResults(pep2, 1, cdsMappings);
1068 assertEquals(1, sr.getResults().size());
1069 m = sr.getResults().get(0);
1070 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1072 assertEquals(1, m.getStart());
1073 assertEquals(3, m.getEnd());
1075 sr = MappingUtils.buildSearchResults(pep2, 2, cdsMappings);
1076 m = sr.getResults().get(0);
1077 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1079 assertEquals(4, m.getStart());
1080 assertEquals(6, m.getEnd());
1082 sr = MappingUtils.buildSearchResults(pep2, 3, cdsMappings);
1083 m = sr.getResults().get(0);
1084 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1086 assertEquals(7, m.getStart());
1087 assertEquals(9, m.getEnd());
1091 * Test the method that makes a cds-only alignment from a DNA sequence and its
1092 * product mappings, for the case where there are multiple exon mappings to
1093 * different protein products.
// A single dna sequence is mapped to three peptides through three different
// pairs of codons. The test expects three CDS sequences, added to the shared
// dataset, named dna1|pepN, and mapped both to their peptide and to the dna.
// NOTE(review): several statements below end in a trailing comma; their
// continuation lines are not visible here - confirm against the full file.
1095 @Test(groups = { "Functional" })
1096 public void testMakeCdsAlignment_multipleProteins()
1098 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
1099 SequenceI pep1 = new Sequence("pep1", "GF"); // GGGTTT
1100 SequenceI pep2 = new Sequence("pep2", "KP"); // aaaccc
1101 SequenceI pep3 = new Sequence("pep3", "KF"); // aaaTTT
1102 dna1.createDatasetSequence();
1103 pep1.createDatasetSequence();
1104 pep2.createDatasetSequence();
1105 pep3.createDatasetSequence();
// six CDS features: one pair of codon ranges per peptide product
1106 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 6, 0f,
1108 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 10, 12, 0f,
1110 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds3", 1, 3, 0f,
1112 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds4", 7, 9, 0f,
1114 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds5", 1, 3, 0f,
1116 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds6", 10, 12, 0f,
// each peptide dataset sequence gets an EMBLCDS ref (source, version, accession)
1118 pep1.getDatasetSequence().addDBRef(
1119 new DBRefEntry("EMBLCDS", "2", "A12345"));
1120 pep2.getDatasetSequence().addDBRef(
1121 new DBRefEntry("EMBLCDS", "3", "A12346"));
1122 pep3.getDatasetSequence().addDBRef(
1123 new DBRefEntry("EMBLCDS", "4", "A12347"));
1126 * Make the mappings from dna to protein
1128 List<AlignedCodonFrame> mappings = new ArrayList<AlignedCodonFrame>();
1129 // map ...GGG...TTT to GF
1130 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1131 new int[] { 1, 2 }, 3, 1);
1132 AlignedCodonFrame acf = new AlignedCodonFrame();
1133 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1136 // map aaa...ccc to KP
1137 map = new MapList(new int[] { 1, 3, 7, 9 }, new int[] { 1, 2 }, 3, 1);
1138 acf = new AlignedCodonFrame();
1139 acf.addMap(dna1.getDatasetSequence(), pep2.getDatasetSequence(), map);
1142 // map aaa......TTT to KF
1143 map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 2 }, 3, 1);
1144 acf = new AlignedCodonFrame();
1145 acf.addMap(dna1.getDatasetSequence(), pep3.getDatasetSequence(), map);
1149 * Create the CDS alignment; also augments the dna-to-protein mappings with
1150 * exon-to-protein and exon-to-dna mappings
1152 AlignmentI dna = new Alignment(new SequenceI[] { dna1 });
1153 dna.setDataset(null);
1156 * execute method under test
1158 AlignmentI cdsal = AlignmentUtils.makeCdsAlignment(
1159 new SequenceI[] { dna1 }, mappings, dna);
1162 * Verify we have 3 cds sequences, mapped to pep1/2/3 respectively
1164 List<SequenceI> cds = cdsal.getSequences();
1165 assertEquals(3, cds.size());
1168 * verify shared, extended alignment dataset
1170 assertSame(cdsal.getDataset(), dna.getDataset());
1171 assertTrue(dna.getDataset().getSequences()
1172 .contains(cds.get(0).getDatasetSequence()));
1173 assertTrue(dna.getDataset().getSequences()
1174 .contains(cds.get(1).getDatasetSequence()));
1175 assertTrue(dna.getDataset().getSequences()
1176 .contains(cds.get(2).getDatasetSequence()));
1179 * verify aligned cds sequences and their xrefs
// NOTE(review): the accession-based name checks and all DBRef checks below
// are commented out; only the sequence string and the dna1|pepN name are
// actually asserted for each CDS sequence.
1181 SequenceI cdsSeq = cds.get(0);
1182 assertEquals("GGGTTT", cdsSeq.getSequenceAsString());
1183 // assertEquals("dna1|A12345", cdsSeq.getName());
1184 assertEquals("dna1|pep1", cdsSeq.getName());
1185 // assertEquals(1, cdsSeq.getDBRefs().length);
1186 // DBRefEntry cdsRef = cdsSeq.getDBRefs()[0];
1187 // assertEquals("EMBLCDS", cdsRef.getSource());
1188 // assertEquals("2", cdsRef.getVersion());
1189 // assertEquals("A12345", cdsRef.getAccessionId());
1191 cdsSeq = cds.get(1);
1192 assertEquals("aaaccc", cdsSeq.getSequenceAsString());
1193 // assertEquals("dna1|A12346", cdsSeq.getName());
1194 assertEquals("dna1|pep2", cdsSeq.getName());
1195 // assertEquals(1, cdsSeq.getDBRefs().length);
1196 // cdsRef = cdsSeq.getDBRefs()[0];
1197 // assertEquals("EMBLCDS", cdsRef.getSource());
1198 // assertEquals("3", cdsRef.getVersion());
1199 // assertEquals("A12346", cdsRef.getAccessionId());
1201 cdsSeq = cds.get(2);
1202 assertEquals("aaaTTT", cdsSeq.getSequenceAsString());
1203 // assertEquals("dna1|A12347", cdsSeq.getName());
1204 assertEquals("dna1|pep3", cdsSeq.getName());
1205 // assertEquals(1, cdsSeq.getDBRefs().length);
1206 // cdsRef = cdsSeq.getDBRefs()[0];
1207 // assertEquals("EMBLCDS", cdsRef.getSource());
1208 // assertEquals("4", cdsRef.getVersion());
1209 // assertEquals("A12347", cdsRef.getAccessionId());
1212 * Verify there are mappings from each cds sequence to its protein product
1213 * and also to its dna source
// only the first AlignedCodonFrame returned by the iterator is inspected below
1215 Iterator<AlignedCodonFrame> newMappingsIterator = cdsal
1216 .getCodonFrames().iterator();
1218 // mappings for dna1 - cds1 - pep1
1219 AlignedCodonFrame cdsMapping = newMappingsIterator.next();
1220 List<Mapping> dnaMappings = cdsMapping.getMappingsFromSequence(dna1);
1221 assertEquals(3, dnaMappings.size());
1222 assertSame(cds.get(0).getDatasetSequence(), dnaMappings.get(0)
1224 assertEquals("G(1) in CDS should map to G(4) in DNA", 4, dnaMappings
1225 .get(0).getMap().getToPosition(1));
1226 List<Mapping> peptideMappings = cdsMapping.getMappingsFromSequence(cds
1227 .get(0).getDatasetSequence());
1228 assertEquals(1, peptideMappings.size());
1229 assertSame(pep1.getDatasetSequence(), peptideMappings.get(0).getTo());
1231 // mappings for dna1 - cds2 - pep2
1232 assertSame(cds.get(1).getDatasetSequence(), dnaMappings.get(1)
1234 assertEquals("c(4) in CDS should map to c(7) in DNA", 7, dnaMappings
1235 .get(1).getMap().getToPosition(4));
1236 peptideMappings = cdsMapping.getMappingsFromSequence(cds.get(1)
1237 .getDatasetSequence());
1238 assertEquals(1, peptideMappings.size());
1239 assertSame(pep2.getDatasetSequence(), peptideMappings.get(0).getTo());
1241 // mappings for dna1 - cds3 - pep3
1242 assertSame(cds.get(2).getDatasetSequence(), dnaMappings.get(2)
1244 assertEquals("T(4) in CDS should map to T(10) in DNA", 10, dnaMappings
1245 .get(2).getMap().getToPosition(4));
1246 peptideMappings = cdsMapping.getMappingsFromSequence(cds.get(2)
1247 .getDatasetSequence());
1248 assertEquals(1, peptideMappings.size());
1249 assertSame(pep3.getDatasetSequence(), peptideMappings.get(0).getTo());
// Checks AlignmentUtils.isMappable for a one-sequence alignment built from
// "dna1" (al1) and a one-sequence alignment built from "aa1" (al2).
1252 @Test(groups = { "Functional" })
1253 public void testIsMappable()
1255 SequenceI dna1 = new Sequence("dna1", "cgCAGtgGT");
1256 SequenceI aa1 = new Sequence("aa1", "RSG");
1257 AlignmentI al1 = new Alignment(new SequenceI[] { dna1 });
1258 AlignmentI al2 = new Alignment(new SequenceI[] { aa1 });
// null arguments, and an alignment paired with itself, are expected to be
// reported as not mappable
1260 assertFalse(AlignmentUtils.isMappable(null, null));
1261 assertFalse(AlignmentUtils.isMappable(al1, null));
1262 assertFalse(AlignmentUtils.isMappable(null, al1));
1263 assertFalse(AlignmentUtils.isMappable(al1, al1));
1264 assertFalse(AlignmentUtils.isMappable(al2, al2));
// the dna/peptide pair is expected to be mappable in either argument order
1266 assertTrue(AlignmentUtils.isMappable(al1, al2));
1267 assertTrue(AlignmentUtils.isMappable(al2, al1));
1271 * Test creating a mapping when the sequences involved do not start at residue
1274 * @throws IOException
// Both sequences are gapped and are constructed with explicit trailing
// numbers (10, 12 and 40, 48); the assertions expect the returned MapList to
// span exactly those positions on its "to" and "from" sides respectively.
1276 @Test(groups = { "Functional" })
1277 public void testMapCdnaToProtein_forSubsequence()
// presumably the last two constructor arguments are start and end - confirm
1280 SequenceI prot = new Sequence("UNIPROT|V12345", "E-I--Q", 10, 12);
1281 prot.createDatasetSequence();
1283 SequenceI dna = new Sequence("EMBL|A33333", "GAA--AT-C-CAG", 40, 48);
1284 dna.createDatasetSequence();
// protein positions are on the "to" side, dna positions on the "from" side
1286 MapList map = AlignmentUtils.mapCdnaToProtein(prot, dna);
1287 assertEquals(10, map.getToLowest());
1288 assertEquals(12, map.getToHighest());
1289 assertEquals(40, map.getFromLowest());
1290 assertEquals(48, map.getFromHighest());
1294 * Test for the alignSequenceAs method where we have protein mapped to protein
// "Match" (7 residues) is mapped 1:1 onto residues 6-12 of "Query"
// (19 residues); after alignSequenceAs it is expected to be gap-padded to the
// full 19 columns with its residues in columns 6-12.
1296 @Test(groups = { "Functional" })
1297 public void testAlignSequenceAs_mappedProteinProtein()
1300 SequenceI alignMe = new Sequence("Match", "MGAASEV");
1301 alignMe.createDatasetSequence();
1302 SequenceI alignFrom = new Sequence("Query", "LQTGYMGAASEVMFSPTRR");
1303 alignFrom.createDatasetSequence();
1305 AlignedCodonFrame acf = new AlignedCodonFrame();
1306 // this is like a domain or motif match of part of a peptide sequence
// Query 6-12 (MGAASEV) maps to Match 1-7, one residue to one residue
1307 MapList map = new MapList(new int[] { 6, 12 }, new int[] { 1, 7 }, 1, 1);
1308 acf.addMap(alignFrom.getDatasetSequence(),
1309 alignMe.getDatasetSequence(), map);
// NOTE(review): the final argument(s) of this call are on a line not visible here
1311 AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "-", '-', true,
// 5 leading gaps + 7 residues + 7 trailing gaps = 19 columns
1313 assertEquals("-----MGAASEV-------", alignMe.getSequenceAsString());
1317 * Test for the alignSequenceAs method where there are trailing unmapped
1318 * residues in the model sequence
// Delegates to the checkAlignSequenceAs helper with a dna string, a gapped
// peptide model and a 3:1 map covering only the first three peptide residues.
1320 @Test(groups = { "Functional" })
1321 public void testAlignSequenceAs_withTrailingPeptide()
1323 // map first 3 codons to KPF; G is a trailing unmapped residue
1324 MapList map = new MapList(new int[] { 1, 9 }, new int[] { 1, 3 }, 3, 1);
// NOTE(review): the remaining argument(s), presumably the expected aligned
// dna, are on a line not visible here - confirm against the full file
1326 checkAlignSequenceAs("AAACCCTTT", "K-PFG", true, true, map,
1331 * Tests for transferring features between mapped sequences
// Nine features are added to dna and transferred to cds through a map of dna
// [4-6], [10-12] onto cds [1-6]; six transferred features are expected, with
// start/end converted to cds positions and truncated to the mapped range.
// NOTE(review): the feature-constructor calls end in a trailing comma (their
// continuation lines are not visible here), and each group of assertions
// after the first presumably follows a reassignment of sf to the next element
// of sfs that is likewise not visible - confirm against the full file.
1333 @Test(groups = { "Functional" })
1334 public void testTransferFeatures()
1336 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1337 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
// type1 (1-2) lies wholly before the first mapped range
1340 dna.addSequenceFeature(new SequenceFeature("type1", "desc1", 1, 2, 1f,
1342 // partial overlap - to [1, 1]
1343 dna.addSequenceFeature(new SequenceFeature("type2", "desc2", 3, 4, 2f,
1345 // exact overlap - to [1, 3]
1346 dna.addSequenceFeature(new SequenceFeature("type3", "desc3", 4, 6, 3f,
1348 // spanning overlap - to [2, 5]
1349 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1351 // exactly overlaps whole mapped range [1, 6]
1352 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1354 // no overlap (internal)
1355 dna.addSequenceFeature(new SequenceFeature("type6", "desc6", 7, 9, 6f,
1357 // no overlap (3' end)
1358 dna.addSequenceFeature(new SequenceFeature("type7", "desc7", 13, 15,
1360 // overlap (3' end) - to [6, 6]
1361 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1363 // extended overlap - to [6, +]
1364 dna.addSequenceFeature(new SequenceFeature("type9", "desc9", 12, 13,
// dna [4-6] and [10-12] (the upper-case TAG and GCC) map 1:1 to cds [1-6]
1367 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1368 new int[] { 1, 6 }, 1, 1);
1371 * transferFeatures() will build 'partial overlap' for regions
1372 * that partially overlap 5' or 3' (start or end) of target sequence
1374 AlignmentUtils.transferFeatures(dna, cds, map, null);
1375 SequenceFeature[] sfs = cds.getSequenceFeatures();
// type1, type6 and type7 do not overlap the mapped ranges, leaving six
1376 assertEquals(6, sfs.length);
1378 SequenceFeature sf = sfs[0];
1379 assertEquals("type2", sf.getType());
1380 assertEquals("desc2", sf.getDescription());
1381 assertEquals(2f, sf.getScore());
1382 assertEquals(1, sf.getBegin());
1383 assertEquals(1, sf.getEnd());
1386 assertEquals("type3", sf.getType());
1387 assertEquals("desc3", sf.getDescription());
1388 assertEquals(3f, sf.getScore());
1389 assertEquals(1, sf.getBegin());
1390 assertEquals(3, sf.getEnd());
1393 assertEquals("type4", sf.getType());
1394 assertEquals(2, sf.getBegin());
1395 assertEquals(5, sf.getEnd());
1398 assertEquals("type5", sf.getType());
1399 assertEquals(1, sf.getBegin());
1400 assertEquals(6, sf.getEnd());
1403 assertEquals("type8", sf.getType());
1404 assertEquals(6, sf.getBegin());
1405 assertEquals(6, sf.getEnd());
// type9 (dna 12-13) is expected to be cut back to the last mapped position
1408 assertEquals("type9", sf.getType());
1409 assertEquals(6, sf.getBegin());
1410 assertEquals(6, sf.getEnd());
1414 * Tests for transferring features between mapped sequences
// Variant of the transfer test in which feature types are passed as trailing
// arguments to be left out: of three overlapping features only the one whose
// type is not listed is expected on cds.
// NOTE(review): the feature-constructor calls end in a trailing comma; their
// continuation lines are not visible here - confirm against the full file.
1416 @Test(groups = { "Functional" })
1417 public void testTransferFeatures_withOmit()
1419 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1420 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
// dna [4-6] and [10-12] map 1:1 to cds [1-6]
1422 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1423 new int[] { 1, 6 }, 1, 1);
1425 // [5, 11] maps to [2, 5]
1426 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1428 // [4, 12] maps to [1, 6]
1429 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1431 // [12, 12] maps to [6, 6]
1432 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1435 // type4 and type8 are the 'omit these' varargs
1436 AlignmentUtils.transferFeatures(dna, cds, map, null, "type4", "type8");
1437 SequenceFeature[] sfs = cds.getSequenceFeatures();
1438 assertEquals(1, sfs.length);
// only type5 remains, transferred to the whole mapped range
1440 SequenceFeature sf = sfs[0];
1441 assertEquals("type5", sf.getType());
1442 assertEquals(1, sf.getBegin());
1443 assertEquals(6, sf.getEnd());
1447 * Tests for transferring features between mapped sequences
// Variant of the transfer test in which a single feature type is passed as
// the fourth argument: of three overlapping features only the one of that
// type is expected on cds.
// NOTE(review): the feature-constructor calls end in a trailing comma; their
// continuation lines are not visible here - confirm against the full file.
1449 @Test(groups = { "Functional" })
1450 public void testTransferFeatures_withSelect()
1452 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1453 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
// dna [4-6] and [10-12] map 1:1 to cds [1-6]
1455 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1456 new int[] { 1, 6 }, 1, 1);
1458 // [5, 11] maps to [2, 5]
1459 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1461 // [4, 12] maps to [1, 6]
1462 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1464 // [12, 12] maps to [6, 6]
1465 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1468 // "type5" is the 'select this type' argument
1469 AlignmentUtils.transferFeatures(dna, cds, map, "type5");
1470 SequenceFeature[] sfs = cds.getSequenceFeatures();
1471 assertEquals(1, sfs.length);
// only type5 is transferred, covering the whole mapped range
1473 SequenceFeature sf = sfs[0];
1474 assertEquals("type5", sf.getType());
1475 assertEquals(1, sf.getBegin());
1476 assertEquals(6, sf.getEnd());
1480 * Test the method that extracts the cds-only part of a dna alignment, for the
1481 * case where the cds should be aligned to match its nucleotide sequence.
// Three gapped dna sequences are supplied, two of them mapped to a peptide.
// The test expects one CDS sequence per mapped dna sequence, added to the
// shared dataset, with each peptide residue mapped to consecutive three-base
// ranges of its CDS sequence.
// NOTE(review): several statements below end in a trailing comma; their
// continuation lines are not visible here - confirm against the full file.
1483 @Test(groups = { "Functional" })
1484 public void testMakeCdsAlignment_alternativeTranscripts()
1486 SequenceI dna1 = new Sequence("dna1", "aaaGGGCC-----CTTTaaaGGG");
1487 // alternative transcript of same dna skips CCC codon
1488 SequenceI dna2 = new Sequence("dna2", "aaaGGGCC-----cttTaaaGGG");
1489 // dna3 has no mapping (protein product) so should be ignored here
1490 SequenceI dna3 = new Sequence("dna3", "aaaGGGCCCCCGGGcttTaaaGGG");
1491 SequenceI pep1 = new Sequence("pep1", "GPFG");
1492 SequenceI pep2 = new Sequence("pep2", "GPG");
1493 dna1.createDatasetSequence();
1494 dna2.createDatasetSequence();
1495 dna3.createDatasetSequence();
1496 pep1.createDatasetSequence();
1497 pep2.createDatasetSequence();
// feature positions count residues only (gaps excluded), matching the map
// ranges used further down
1498 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 8, 0f,
1500 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 9, 12, 0f,
1502 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds3", 16, 18, 0f,
1504 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 4, 8, 0f,
1506 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 12, 12, 0f,
1508 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 16, 18, 0f,
// dna1 [4-12], [16-18] are mapped to pep1 residues 1-4 (3 bases : 1 residue)
1511 List<AlignedCodonFrame> mappings = new ArrayList<AlignedCodonFrame>();
1512 MapList map = new MapList(new int[] { 4, 12, 16, 18 },
1513 new int[] { 1, 4 }, 3, 1);
1514 AlignedCodonFrame acf = new AlignedCodonFrame();
1515 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
// dna2 [4-8], [12-12], [16-18] are mapped to pep2
1517 map = new MapList(new int[] { 4, 8, 12, 12, 16, 18 },
1520 acf = new AlignedCodonFrame();
1521 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
1524 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
1525 dna.setDataset(null);
1526 AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
1527 dna1, dna2, dna3 }, mappings, dna);
// unmapped dna3 contributes no CDS sequence
1528 List<SequenceI> cdsSeqs = cds.getSequences();
1529 assertEquals(2, cdsSeqs.size());
1530 assertEquals("GGGCCCTTTGGG", cdsSeqs.get(0).getSequenceAsString());
1531 assertEquals("GGGCCTGGG", cdsSeqs.get(1).getSequenceAsString());
1534 * verify shared, extended alignment dataset
1536 assertSame(dna.getDataset(), cds.getDataset());
1537 assertTrue(dna.getDataset().getSequences()
1538 .contains(cdsSeqs.get(0).getDatasetSequence()));
1539 assertTrue(dna.getDataset().getSequences()
1540 .contains(cdsSeqs.get(1).getDatasetSequence()));
1543 * Verify updated mappings
1545 List<AlignedCodonFrame> cdsMappings = cds.getCodonFrames();
1546 assertEquals(2, cdsMappings.size());
// NOTE(review): the next remark says "GGGTTT", but the first CDS sequence
// asserted above is GGGCCCTTTGGG - the remark looks stale
1549 * Mapping from pep1 to GGGTTT in first new CDS sequence
1551 List<AlignedCodonFrame> pep1Mapping = MappingUtils
1552 .findMappingsForSequence(pep1, cdsMappings);
1553 assertEquals(1, pep1Mapping.size());
1555 * maps GPFG to 1-3,4-6,7-9,10-12
1557 SearchResults sr = MappingUtils
1558 .buildSearchResults(pep1, 1, cdsMappings);
1559 assertEquals(1, sr.getResults().size());
1560 Match m = sr.getResults().get(0);
1561 assertEquals(cds.getSequenceAt(0).getDatasetSequence(),
1563 assertEquals(1, m.getStart());
1564 assertEquals(3, m.getEnd());
1565 sr = MappingUtils.buildSearchResults(pep1, 2, cdsMappings);
1566 m = sr.getResults().get(0);
1567 assertEquals(4, m.getStart());
1568 assertEquals(6, m.getEnd());
1569 sr = MappingUtils.buildSearchResults(pep1, 3, cdsMappings);
1570 m = sr.getResults().get(0);
1571 assertEquals(7, m.getStart());
1572 assertEquals(9, m.getEnd());
1573 sr = MappingUtils.buildSearchResults(pep1, 4, cdsMappings);
1574 m = sr.getResults().get(0);
1575 assertEquals(10, m.getStart());
1576 assertEquals(12, m.getEnd());
1579 * GPG in pep2 map to 1-3,4-6,7-9 in second CDS sequence
1581 List<AlignedCodonFrame> pep2Mapping = MappingUtils
1582 .findMappingsForSequence(pep2, cdsMappings);
1583 assertEquals(1, pep2Mapping.size());
1584 sr = MappingUtils.buildSearchResults(pep2, 1, cdsMappings);
1585 assertEquals(1, sr.getResults().size());
1586 m = sr.getResults().get(0);
1587 assertEquals(cds.getSequenceAt(1).getDatasetSequence(),
1589 assertEquals(1, m.getStart());
1590 assertEquals(3, m.getEnd());
1591 sr = MappingUtils.buildSearchResults(pep2, 2, cdsMappings);
1592 m = sr.getResults().get(0);
1593 assertEquals(4, m.getStart());
1594 assertEquals(6, m.getEnd());
1595 sr = MappingUtils.buildSearchResults(pep2, 3, cdsMappings);
1596 m = sr.getResults().get(0);
1597 assertEquals(7, m.getStart());
1598 assertEquals(9, m.getEnd());
1602 * Test the method that realigns protein to match mapped codon alignment.
// Three dna sequences are mapped to three peptides, two of which begin with
// an unmapped 'X'. After alignProteinAsDna the peptides are expected to be
// gapped into five columns, with each X immediately before that peptide's
// first mapped residue.
// NOTE(review): the statement that builds the protein alignment ends in a
// trailing comma, and nothing visible here adds acf to acfs before
// setCodonFrames is called - confirm both against the full file.
1604 @Test(groups = { "Functional" })
1605 public void testAlignProteinAsDna_incompleteStartCodon()
1607 // seq1: incomplete start codon (not mapped), then [3, 11]
1608 SequenceI dna1 = new Sequence("Seq1", "ccAAA-TTT-GGG-");
1609 // seq2 codons are [4, 5], [8, 11]
1610 SequenceI dna2 = new Sequence("Seq2", "ccaAA-ttT-GGG-");
1611 // seq3 incomplete start codon at 'tt'
1612 SequenceI dna3 = new Sequence("Seq3", "ccaaa-ttt-GGG-");
1613 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
1614 dna.setDataset(null);
1616 // prot1 has 'X' for incomplete start codon (not mapped)
1617 SequenceI prot1 = new Sequence("Seq1", "XKFG"); // X for incomplete start
1618 SequenceI prot2 = new Sequence("Seq2", "NG");
1619 SequenceI prot3 = new Sequence("Seq3", "XG"); // X for incomplete start
1620 AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
1622 protein.setDataset(null);
// all three dna-to-protein maps are added to the same AlignedCodonFrame
1624 // map dna1 [3, 11] to prot1 [2, 4] KFG
1625 MapList map = new MapList(new int[] { 3, 11 }, new int[] { 2, 4 }, 3, 1);
1626 AlignedCodonFrame acf = new AlignedCodonFrame();
1627 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
1629 // map dna2 [4, 5] [8, 11] to prot2 [1, 2] NG
1630 map = new MapList(new int[] { 4, 5, 8, 11 }, new int[] { 1, 2 }, 3, 1);
1631 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
1633 // map dna3 [9, 11] to prot3 [2, 2] G
1634 map = new MapList(new int[] { 9, 11 }, new int[] { 2, 2 }, 3, 1);
1635 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
1637 ArrayList<AlignedCodonFrame> acfs = new ArrayList<AlignedCodonFrame>();
1639 protein.setCodonFrames(acfs);
// NOTE(review): the "CCT is between CCC and TTT" remark below does not match
// any codon in this test's sequences (AAA, TTT, GGG) - looks copied from
// another test
1642 * verify X is included in the aligned proteins, and placed just
1643 * before the first mapped residue
1644 * CCT is between CCC and TTT
1646 AlignmentUtils.alignProteinAsDna(protein, dna);
1647 assertEquals("XK-FG", prot1.getSequenceAsString());
1648 assertEquals("--N-G", prot2.getSequenceAsString());
1649 assertEquals("---XG", prot3.getSequenceAsString());
1653 * Tests for the method that maps the subset of a dna sequence that has CDS
1654 * (or subtype) feature - case where the start codon is incomplete.
// The first CDS feature (5-9) carries phase "2", so the first two bases are
// expected to be dropped and the returned ranges to start at position 7.
1656 @Test(groups = "Functional")
1657 public void testFindCdsPositions_fivePrimeIncomplete()
1659 SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
1660 dnaSeq.createDatasetSequence();
// features are added to the dataset sequence, while the lookup below is
// given the aligned sequence dnaSeq
1661 SequenceI ds = dnaSeq.getDatasetSequence();
1663 // CDS for dna 5-6 (incomplete codon), 7-9
1664 SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
1665 sf.setPhase("2"); // skip 2 bases to start of next codon
1666 ds.addSequenceFeature(sf);
1667 // CDS for dna 13-15
// a "CDS_predicted" feature is treated like CDS here (the "or subtype" case)
1668 sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
1669 ds.addSequenceFeature(sf);
1671 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1674 * check the mapping starts with the first complete codon
// two complete codons in total: [7-9] and [13-15]
1676 assertEquals(6, MappingUtils.getLength(ranges));
1677 assertEquals(2, ranges.size());
1678 assertEquals(7, ranges.get(0)[0]);
1679 assertEquals(9, ranges.get(0)[1]);
1680 assertEquals(13, ranges.get(1)[0]);
1681 assertEquals(15, ranges.get(1)[1]);
1685 * Tests for the method that maps the subset of a dna sequence that has CDS
1686 * (or subtype) feature.
// CDS-type features are added out of positional order, together with an exon
// feature; the returned ranges are expected to cover only the CDS-type
// features, in ascending order.
// NOTE(review): the first feature constructor ends in a trailing comma and
// there are gaps in the original numbering around the addSequenceFeature
// calls, so further statements may exist that are not visible here - confirm.
1688 @Test(groups = "Functional")
1689 public void testFindCdsPositions()
1691 SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
1692 dnaSeq.createDatasetSequence();
1693 SequenceI ds = dnaSeq.getDatasetSequence();
1695 // CDS for dna 10-12
1696 SequenceFeature sf = new SequenceFeature("CDS_predicted", "", 10, 12,
1699 ds.addSequenceFeature(sf);
// CDS for dna 4-6, added after the 10-12 feature
1701 sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
1703 ds.addSequenceFeature(sf);
1704 // exon feature should be ignored here
1705 sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
1706 ds.addSequenceFeature(sf);
1708 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
// NOTE(review): the remark below writes the second range as [12-10], but the
// assertions expect it as [10, 12] (start 10, end 12)
1710 * verify ranges { [4-6], [12-10] }
1711 * note CDS ranges are ordered ascending even if the CDS
1714 assertEquals(6, MappingUtils.getLength(ranges));
1715 assertEquals(2, ranges.size());
1716 assertEquals(4, ranges.get(0)[0]);
1717 assertEquals(6, ranges.get(0)[1]);
1718 assertEquals(10, ranges.get(1)[0]);
1719 assertEquals(12, ranges.get(1)[1]);
1723 * Test the method that computes a map of codon variants for each protein
1724 * position from "sequence_variant" features on dna
// "sequence_variant" features with an "alleles" value are placed on
// individual bases of codons 1, 2, 3 and 5. The expected map is keyed by
// peptide position and holds, for each of the codon's three bases, the base
// itself in upper case followed by any alleles given for it.
// NOTE(review): the first feature constructor ends in a trailing comma; its
// continuation line is not visible here - confirm against the full file.
1726 @Test(groups = "Functional")
1727 public void testBuildDnaVariantsMap()
1729 SequenceI dna = new Sequence("dna", "atgAAATTTGGGCCCtag");
// three dna bases per peptide position
1730 MapList map = new MapList(new int[] { 1, 18 }, new int[] { 1, 5 }, 3, 1);
1733 * first with no variants on dna
1735 LinkedHashMap<Integer, String[][]> variantsMap = AlignmentUtils
1736 .buildDnaVariantsMap(dna, map);
1737 assertTrue(variantsMap.isEmpty());
1739 // single allele codon 1, on base 1
1740 SequenceFeature sf = new SequenceFeature("sequence_variant", "", 1, 1,
1742 sf.setValue("alleles", "T");
1743 dna.addSequenceFeature(sf);
1745 // two alleles codon 2, on bases 2 and 3
1746 sf = new SequenceFeature("sequence_variant", "", 5, 5, 0f, null);
1747 sf.setValue("alleles", "T");
1748 dna.addSequenceFeature(sf);
1749 sf = new SequenceFeature("sequence_variant", "", 6, 6, 0f, null);
1750 sf.setValue("alleles", "G");
1751 dna.addSequenceFeature(sf);
1753 // two alleles codon 3, both on base 2
1754 sf = new SequenceFeature("sequence_variant", "", 8, 8, 0f, null);
1755 sf.setValue("alleles", "C, G");
1756 dna.addSequenceFeature(sf);
1758 // no alleles on codon 4
1759 // alleles on codon 5 on all 3 bases
1760 sf = new SequenceFeature("sequence_variant", "", 13, 13, 0f, null);
1761 sf.setValue("alleles", "C, G"); // (C duplicates given base value)
1762 dna.addSequenceFeature(sf);
1763 sf = new SequenceFeature("sequence_variant", "", 14, 14, 0f, null);
1764 sf.setValue("alleles", "g, a"); // should force to upper-case
1765 dna.addSequenceFeature(sf);
1766 sf = new SequenceFeature("sequence_variant", "", 15, 15, 0f, null);
1767 sf.setValue("alleles", "A, T");
1768 dna.addSequenceFeature(sf);
// entries are expected for codons 1, 2, 3 and 5 only (codon 4 has no variant)
1770 variantsMap = AlignmentUtils.buildDnaVariantsMap(dna, map);
1771 assertEquals(4, variantsMap.size());
// codon 1 is "atg" in lower case on dna; expected values are upper case
1772 assertTrue(Arrays.deepEquals(new String[][] { { "A", "T" }, { "T" },
1773 { "G" } }, variantsMap.get(1)));
1774 assertTrue(Arrays.deepEquals(new String[][] { { "A" }, { "A", "T" },
1775 { "A", "G" } }, variantsMap.get(2)));
1776 assertTrue(Arrays.deepEquals(new String[][] { { "T" },
1777 { "T", "C", "G" }, { "T" } }, variantsMap.get(3)));
1778 // duplicated bases are not removed here, handled in computePeptideVariants
1779 assertTrue(Arrays.deepEquals(new String[][] { { "C", "C", "G" },
1780 { "C", "G", "A" }, { "C", "A", "T" } }, variantsMap.get(5)));
1784 * Tests for the method that computes all peptide variants given codon
// codonVariants holds the possible values for each of the three bases of one
// codon; the second argument is the current residue. The expected result
// lists the distinct translated residues that differ from the current one,
// in the order shown by each assertion.
// NOTE(review): the last codonVariants initialiser ends in a trailing comma;
// its continuation line is not visible here - confirm against the full file.
1787 @Test(groups = "Functional")
1788 public void testComputePeptideVariants()
1790 String[][] codonVariants = new String[][] { { "A" }, { "G" }, { "T" } };
1793 * AGT codes for S - this is not included in the variants returned
1795 List<String> variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
1796 assertEquals("[]", variants.toString());
1798 // S is reported if it differs from the current value (A):
1799 variants = AlignmentUtils.computePeptideVariants(codonVariants, "A");
1800 assertEquals("[S]", variants.toString());
1803 * synonymous variant is not reported
1805 codonVariants = new String[][] { { "A" }, { "G" }, { "C", "T" } };
1806 // AGC and AGT both code for S
// the current residue is passed in lower case here and no variant is expected
1807 variants = AlignmentUtils.computePeptideVariants(codonVariants, "s");
1808 assertEquals("[]", variants.toString());
1811 * equivalent variants are only reported once
1813 codonVariants = new String[][] { { "C" }, { "T" },
1814 { "A", "C", "G", "T" } };
1815 // CTA CTC CTG CTT all code for L
1816 variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
1817 assertEquals("[L]", variants.toString());
1820 * vary codons 1 and 2; variant products are sorted and non-redundant
// mixed-case bases are supplied in this and the following cases
1822 codonVariants = new String[][] { { "a", "C" }, { "g", "T" }, { "A" } };
1823 // aga ata cga cta code for R, I, R, L
1824 variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
1825 assertEquals("[I, L, R]", variants.toString());
1828 * vary codons 2 and 3
1830 codonVariants = new String[][] { { "a" }, { "g", "T" }, { "A", "c" } };
1831 // aga agc ata atc code for R, S, I, I
1832 variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
1833 assertEquals("[I, R]", variants.toString());
1836 * vary codons 1 and 3
1838 codonVariants = new String[][] { { "a", "t" }, { "a" }, { "t", "g" } };
1839 // aat aag tat tag code for N, K, Y, STOP - STOP sorted to end
1840 variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
1841 assertEquals("[K, N, Y, STOP]", variants.toString());
1844 * vary codons 1, 2 and 3
1846 codonVariants = new String[][] { { "a", "t" }, { "G", "C" },
1848 // agt agg act acg tgt tgg tct tcg code for S, R, T, T, C, W, S, S
1849 variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
1850 assertEquals("[C, R, T, W]", variants.toString());
1854 * Tests for the method that maps the subset of a dna sequence that has CDS
1855 * (or subtype) feature, with CDS strand = '-' (reverse)
1857 // test turned off as currently findCdsPositions is not strand-dependent
1858 // left in case it comes around again...
1859 @Test(groups = "Functional", enabled = false)
1860 public void testFindCdsPositions_reverseStrand()
1862 SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
1863 dnaSeq.createDatasetSequence();
1864 SequenceI ds = dnaSeq.getDatasetSequence();
1867 SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
1869 ds.addSequenceFeature(sf);
1870 // exon feature should be ignored here
1871 sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
1872 ds.addSequenceFeature(sf);
1873 // CDS for dna 10-12
1874 sf = new SequenceFeature("CDS_predicted", "", 10, 12, 0f, null);
1876 ds.addSequenceFeature(sf);
1878 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1880 * verify ranges { [12-10], [6-4] }
1882 assertEquals(6, MappingUtils.getLength(ranges));
1883 assertEquals(2, ranges.size());
1884 assertEquals(12, ranges.get(0)[0]);
1885 assertEquals(10, ranges.get(0)[1]);
1886 assertEquals(6, ranges.get(1)[0]);
1887 assertEquals(4, ranges.get(1)[1]);
1891 * Tests for the method that maps the subset of a dna sequence that has CDS
1892 * (or subtype) feature - reverse strand case where the start codon is
1895 @Test(groups = "Functional", enabled = false)
1896 // test turned off as currently findCdsPositions is not strand-dependent
1897 // left in case it comes around again...
1898 public void testFindCdsPositions_reverseStrandThreePrimeIncomplete()
1900 SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
1901 dnaSeq.createDatasetSequence();
1902 SequenceI ds = dnaSeq.getDatasetSequence();
1905 SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
1907 ds.addSequenceFeature(sf);
1908 // CDS for dna 13-15
1909 sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
1911 sf.setPhase("2"); // skip 2 bases to start of next codon
1912 ds.addSequenceFeature(sf);
1914 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1917 * check the mapping starts with the first complete codon
1918 * expect ranges [13, 13], [9, 5]
1920 assertEquals(6, MappingUtils.getLength(ranges));
1921 assertEquals(2, ranges.size());
1922 assertEquals(13, ranges.get(0)[0]);
1923 assertEquals(13, ranges.get(0)[1]);
1924 assertEquals(9, ranges.get(1)[0]);
1925 assertEquals(5, ranges.get(1)[1]);
1928 @Test(groups = "Functional")
1929 public void testAlignAs_alternateTranscriptsUngapped()
1931 SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa");
1932 SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA");
1933 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
1934 ((Alignment) dna).createDatasetAlignment();
1935 SequenceI cds1 = new Sequence("cds1", "GGGTTT");
1936 SequenceI cds2 = new Sequence("cds2", "CCCAAA");
1937 AlignmentI cds = new Alignment(new SequenceI[] { cds1, cds2 });
1938 ((Alignment) cds).createDatasetAlignment();
1940 AlignedCodonFrame acf = new AlignedCodonFrame();
1941 MapList map = new MapList(new int[] { 4, 9 }, new int[] { 1, 6 }, 1, 1);
1942 acf.addMap(dna1.getDatasetSequence(), cds1.getDatasetSequence(), map);
1943 map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 6 }, 1, 1);
1944 acf.addMap(dna2.getDatasetSequence(), cds2.getDatasetSequence(), map);
1947 * verify CDS alignment is as:
1948 * cccGGGTTTaaa (cdna)
1949 * CCCgggtttAAA (cdna)
1951 * ---GGGTTT--- (cds)
1952 * CCC------AAA (cds)
1954 dna.addCodonFrame(acf);
1955 AlignmentUtils.alignAs(cds, dna);
1956 assertEquals("---GGGTTT---", cds.getSequenceAt(0).getSequenceAsString());
1957 assertEquals("CCC------AAA", cds.getSequenceAt(1).getSequenceAsString());