2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import static org.testng.AssertJUnit.assertEquals;
24 import static org.testng.AssertJUnit.assertFalse;
25 import static org.testng.AssertJUnit.assertNull;
26 import static org.testng.AssertJUnit.assertSame;
27 import static org.testng.AssertJUnit.assertTrue;
29 import jalview.analysis.AlignmentUtils.DnaVariant;
30 import jalview.datamodel.AlignedCodonFrame;
31 import jalview.datamodel.Alignment;
32 import jalview.datamodel.AlignmentAnnotation;
33 import jalview.datamodel.AlignmentI;
34 import jalview.datamodel.Annotation;
35 import jalview.datamodel.DBRefEntry;
36 import jalview.datamodel.Mapping;
37 import jalview.datamodel.SearchResults;
38 import jalview.datamodel.SearchResults.Match;
39 import jalview.datamodel.Sequence;
40 import jalview.datamodel.SequenceFeature;
41 import jalview.datamodel.SequenceI;
42 import jalview.io.AppletFormatAdapter;
43 import jalview.io.DataSourceType;
44 import jalview.io.FileFormat;
45 import jalview.io.FileFormatI;
46 import jalview.io.FormatAdapter;
47 import jalview.util.MapList;
48 import jalview.util.MappingUtils;
50 import java.io.IOException;
51 import java.util.ArrayList;
52 import java.util.Arrays;
53 import java.util.Iterator;
54 import java.util.LinkedHashMap;
55 import java.util.List;
57 import java.util.TreeMap;
59 import org.testng.annotations.Test;
61 public class AlignmentUtilsTests
63 public static Sequence ts = new Sequence("short",
64 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklm");
66 @Test(groups = { "Functional" })
67 public void testExpandContext()
69 AlignmentI al = new Alignment(new Sequence[] {});
70 for (int i = 4; i < 14; i += 2)
72 SequenceI s1 = ts.deriveSequence().getSubSequence(i, i + 7);
75 System.out.println(new AppletFormatAdapter().formatSequences(
78 for (int flnk = -1; flnk < 25; flnk++)
80 AlignmentI exp = AlignmentUtils.expandContext(al, flnk);
81 System.out.println("\nFlank size: " + flnk);
82 System.out.println(new AppletFormatAdapter().formatSequences(
83 FileFormat.Clustal, exp, true));
87 * Full expansion to complete sequences
89 for (SequenceI sq : exp.getSequences())
91 String ung = sq.getSequenceAsString().replaceAll("-+", "");
92 final String errorMsg = "Flanking sequence not the same as original dataset sequence.\n"
95 + sq.getDatasetSequence().getSequenceAsString();
96 assertTrue(errorMsg, ung.equalsIgnoreCase(sq.getDatasetSequence()
97 .getSequenceAsString()));
103 * Last sequence is fully expanded, others have leading gaps to match
105 assertTrue(exp.getSequenceAt(4).getSequenceAsString()
107 assertTrue(exp.getSequenceAt(3).getSequenceAsString()
108 .startsWith("--abc"));
109 assertTrue(exp.getSequenceAt(2).getSequenceAsString()
110 .startsWith("----abc"));
111 assertTrue(exp.getSequenceAt(1).getSequenceAsString()
112 .startsWith("------abc"));
113 assertTrue(exp.getSequenceAt(0).getSequenceAsString()
114 .startsWith("--------abc"));
120 * Test that annotations are correctly adjusted by expandContext
122 @Test(groups = { "Functional" })
123 public void testExpandContext_annotation()
125 AlignmentI al = new Alignment(new Sequence[] {});
126 SequenceI ds = new Sequence("Seq1", "ABCDEFGHI");
128 SequenceI seq1 = ds.deriveSequence().getSubSequence(3, 6);
129 al.addSequence(seq1);
132 * Annotate DEF with 4/5/6 respectively
134 Annotation[] anns = new Annotation[] { new Annotation(4),
135 new Annotation(5), new Annotation(6) };
136 AlignmentAnnotation ann = new AlignmentAnnotation("SS",
137 "secondary structure", anns);
138 seq1.addAlignmentAnnotation(ann);
141 * The annotations array should match aligned positions
143 assertEquals(3, ann.annotations.length);
144 assertEquals(4, ann.annotations[0].value, 0.001);
145 assertEquals(5, ann.annotations[1].value, 0.001);
146 assertEquals(6, ann.annotations[2].value, 0.001);
149 * Check annotation to sequence position mappings before expanding the
150 * sequence; these are set up in Sequence.addAlignmentAnnotation ->
151 * Annotation.setSequenceRef -> createSequenceMappings
153 assertNull(ann.getAnnotationForPosition(1));
154 assertNull(ann.getAnnotationForPosition(2));
155 assertNull(ann.getAnnotationForPosition(3));
156 assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
157 assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
158 assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
159 assertNull(ann.getAnnotationForPosition(7));
160 assertNull(ann.getAnnotationForPosition(8));
161 assertNull(ann.getAnnotationForPosition(9));
164 * Expand the subsequence to the full sequence abcDEFghi
166 AlignmentI expanded = AlignmentUtils.expandContext(al, -1);
167 assertEquals("abcDEFghi", expanded.getSequenceAt(0)
168 .getSequenceAsString());
171 * Confirm the alignment and sequence have the same SS annotation,
172 * referencing the expanded sequence
174 ann = expanded.getSequenceAt(0).getAnnotation()[0];
175 assertSame(ann, expanded.getAlignmentAnnotation()[0]);
176 assertSame(expanded.getSequenceAt(0), ann.sequenceRef);
179 * The annotations array should have null values except for annotated
182 assertNull(ann.annotations[0]);
183 assertNull(ann.annotations[1]);
184 assertNull(ann.annotations[2]);
185 assertEquals(4, ann.annotations[3].value, 0.001);
186 assertEquals(5, ann.annotations[4].value, 0.001);
187 assertEquals(6, ann.annotations[5].value, 0.001);
188 assertNull(ann.annotations[6]);
189 assertNull(ann.annotations[7]);
190 assertNull(ann.annotations[8]);
193 * sequence position mappings should be unchanged
195 assertNull(ann.getAnnotationForPosition(1));
196 assertNull(ann.getAnnotationForPosition(2));
197 assertNull(ann.getAnnotationForPosition(3));
198 assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
199 assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
200 assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
201 assertNull(ann.getAnnotationForPosition(7));
202 assertNull(ann.getAnnotationForPosition(8));
203 assertNull(ann.getAnnotationForPosition(9));
207 * Test method that returns a map of lists of sequences by sequence name.
209 * @throws IOException
211 @Test(groups = { "Functional" })
212 public void testGetSequencesByName() throws IOException
214 final String data = ">Seq1Name\nKQYL\n" + ">Seq2Name\nRFPW\n"
215 + ">Seq1Name\nABCD\n";
216 AlignmentI al = loadAlignment(data, FileFormat.Fasta);
217 Map<String, List<SequenceI>> map = AlignmentUtils
218 .getSequencesByName(al);
219 assertEquals(2, map.keySet().size());
220 assertEquals(2, map.get("Seq1Name").size());
221 assertEquals("KQYL", map.get("Seq1Name").get(0).getSequenceAsString());
222 assertEquals("ABCD", map.get("Seq1Name").get(1).getSequenceAsString());
223 assertEquals(1, map.get("Seq2Name").size());
224 assertEquals("RFPW", map.get("Seq2Name").get(0).getSequenceAsString());
228 * Helper method to load an alignment and ensure dataset sequences are set up.
234 * @throws IOException
236 protected AlignmentI loadAlignment(final String data, FileFormatI format)
239 AlignmentI a = new FormatAdapter().readFile(data,
240 DataSourceType.PASTE, format);
246 * Test mapping of protein to cDNA, for the case where we have no sequence
247 * cross-references, so mappings are made first-served 1-1 where sequences
250 * @throws IOException
252 @Test(groups = { "Functional" })
253 public void testMapProteinAlignmentToCdna_noXrefs() throws IOException
255 List<SequenceI> protseqs = new ArrayList<SequenceI>();
256 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
257 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
258 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
259 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
260 protein.setDataset(null);
262 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
263 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
264 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAA")); // = EIQ
265 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
266 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
267 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
268 cdna.setDataset(null);
270 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
272 // 3 mappings made, each from 1 to 1 sequence
273 assertEquals(3, protein.getCodonFrames().size());
274 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
275 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
276 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
278 // V12345 mapped to A22222
279 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
281 assertEquals(1, acf.getdnaSeqs().length);
282 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
283 acf.getdnaSeqs()[0]);
284 Mapping[] protMappings = acf.getProtMappings();
285 assertEquals(1, protMappings.length);
286 MapList mapList = protMappings[0].getMap();
287 assertEquals(3, mapList.getFromRatio());
288 assertEquals(1, mapList.getToRatio());
289 assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
291 assertEquals(1, mapList.getFromRanges().size());
292 assertTrue(Arrays.equals(new int[] { 1, 3 },
293 mapList.getToRanges().get(0)));
294 assertEquals(1, mapList.getToRanges().size());
296 // V12346 mapped to A33333
297 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
298 assertEquals(1, acf.getdnaSeqs().length);
299 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
300 acf.getdnaSeqs()[0]);
302 // V12347 mapped to A11111
303 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
304 assertEquals(1, acf.getdnaSeqs().length);
305 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
306 acf.getdnaSeqs()[0]);
308 // no mapping involving the 'extra' A44444
309 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
313 * Test for the alignSequenceAs method that takes two sequences and a mapping.
315 @Test(groups = { "Functional" })
316 public void testAlignSequenceAs_withMapping_noIntrons()
318 MapList map = new MapList(new int[] { 1, 6 }, new int[] { 1, 2 }, 3, 1);
321 * No existing gaps in dna:
323 checkAlignSequenceAs("GGGAAA", "-A-L-", false, false, map,
327 * Now introduce gaps in dna but ignore them when realigning.
329 checkAlignSequenceAs("-G-G-G-A-A-A-", "-A-L-", false, false, map,
333 * Now include gaps in dna when realigning. First retaining 'mapped' gaps
334 * only, i.e. those within the exon region.
336 checkAlignSequenceAs("-G-G--G-A--A-A-", "-A-L-", true, false, map,
337 "---G-G--G---A--A-A");
340 * Include all gaps in dna when realigning (within and without the exon
341 * region). The leading gap, and the gaps between codons, are subsumed by
342 * the protein alignment gap.
344 checkAlignSequenceAs("-G-GG--AA-A---", "-A-L-", true, true, map,
345 "---G-GG---AA-A---");
348 * Include only unmapped gaps in dna when realigning (outside the exon
349 * region). The leading gap, and the gaps between codons, are subsumed by
350 * the protein alignment gap.
352 checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", false, true, map,
357 * Test for the alignSequenceAs method that takes two sequences and a mapping.
359 @Test(groups = { "Functional" })
360 public void testAlignSequenceAs_withMapping_withIntrons()
363 * Exons at codon 2 (AAA) and 4 (TTT)
365 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
366 new int[] { 1, 2 }, 3, 1);
369 * Simple case: no gaps in dna
371 checkAlignSequenceAs("GGGAAACCCTTTGGG", "--A-L-", false, false, map,
372 "GGG---AAACCCTTTGGG");
375 * Add gaps to dna - but ignore when realigning.
377 checkAlignSequenceAs("-G-G-G--A--A---AC-CC-T-TT-GG-G-", "--A-L-",
378 false, false, map, "GGG---AAACCCTTTGGG");
381 * Add gaps to dna - include within exons only when realigning.
383 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
384 true, false, map, "GGG---A--A---ACCCT-TTGGG");
387 * Include gaps outside exons only when realigning.
389 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
390 false, true, map, "-G-G-GAAAC-CCTTT-GG-G-");
393 * Include gaps following first intron if we are 'preserving mapped gaps'
395 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
396 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
399 * Include all gaps in dna when realigning.
401 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
402 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
406 * Test for the case where not all of the protein sequence is mapped to cDNA.
408 @Test(groups = { "Functional" })
409 public void testAlignSequenceAs_withMapping_withUnmappedProtein()
412 * Exons at codon 2 (AAA) and 4 (TTT) mapped to A and P
414 final MapList map = new MapList(new int[] { 4, 6, 10, 12 }, new int[] {
418 * -L- 'aligns' ccc------
420 checkAlignSequenceAs("gggAAAcccTTTggg", "-A-L-P-", false, false, map,
421 "gggAAAccc------TTTggg");
425 * Helper method that performs and verifies the method under test.
428 * the sequence to be realigned
430 * the sequence whose alignment is to be copied
431 * @param preserveMappedGaps
432 * @param preserveUnmappedGaps
436 protected void checkAlignSequenceAs(final String alignee,
437 final String alignModel, final boolean preserveMappedGaps,
438 final boolean preserveUnmappedGaps, MapList map,
439 final String expected)
441 SequenceI alignMe = new Sequence("Seq1", alignee);
442 alignMe.createDatasetSequence();
443 SequenceI alignFrom = new Sequence("Seq2", alignModel);
444 alignFrom.createDatasetSequence();
445 AlignedCodonFrame acf = new AlignedCodonFrame();
446 acf.addMap(alignMe.getDatasetSequence(), alignFrom.getDatasetSequence(), map);
448 AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "---", '-',
449 preserveMappedGaps, preserveUnmappedGaps);
450 assertEquals(expected, alignMe.getSequenceAsString());
454 * Test for the alignSequenceAs method where we preserve gaps in introns only.
456 @Test(groups = { "Functional" })
457 public void testAlignSequenceAs_keepIntronGapsOnly()
461 * Intron GGGAAA followed by exon CCCTTT
463 MapList map = new MapList(new int[] { 7, 12 }, new int[] { 1, 2 }, 3, 1);
465 checkAlignSequenceAs("GG-G-AA-A-C-CC-T-TT", "AL", false, true, map,
470 * Test the method that realigns protein to match mapped codon alignment.
472 @Test(groups = { "Functional" })
473 public void testAlignProteinAsDna()
475 // seq1 codons are [1,2,3] [4,5,6] [7,8,9] [10,11,12]
476 SequenceI dna1 = new Sequence("Seq1", "TGCCATTACCAG-");
477 // seq2 codons are [1,3,4] [5,6,7] [8,9,10] [11,12,13]
478 SequenceI dna2 = new Sequence("Seq2", "T-GCCATTACCAG");
479 // seq3 codons are [1,2,3] [4,5,7] [8,9,10] [11,12,13]
480 SequenceI dna3 = new Sequence("Seq3", "TGCCA-TTACCAG");
481 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
482 dna.setDataset(null);
484 // protein alignment will be realigned like dna
485 SequenceI prot1 = new Sequence("Seq1", "CHYQ");
486 SequenceI prot2 = new Sequence("Seq2", "CHYQ");
487 SequenceI prot3 = new Sequence("Seq3", "CHYQ");
488 SequenceI prot4 = new Sequence("Seq4", "R-QSV"); // unmapped, unchanged
489 AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
491 protein.setDataset(null);
493 MapList map = new MapList(new int[] { 1, 12 }, new int[] { 1, 4 }, 3, 1);
494 AlignedCodonFrame acf = new AlignedCodonFrame();
495 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
496 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
497 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
498 ArrayList<AlignedCodonFrame> acfs = new ArrayList<AlignedCodonFrame>();
500 protein.setCodonFrames(acfs);
503 * Translated codon order is [1,2,3] [1,3,4] [4,5,6] [4,5,7] [5,6,7] [7,8,9]
504 * [8,9,10] [10,11,12] [11,12,13]
506 AlignmentUtils.alignProteinAsDna(protein, dna);
507 assertEquals("C-H--Y-Q-", prot1.getSequenceAsString());
508 assertEquals("-C--H-Y-Q", prot2.getSequenceAsString());
509 assertEquals("C--H--Y-Q", prot3.getSequenceAsString());
510 assertEquals("R-QSV", prot4.getSequenceAsString());
514 * Test the method that tests whether a CDNA sequence translates to a protein
517 @Test(groups = { "Functional" })
518 public void testTranslatesAs()
520 // null arguments check
521 assertFalse(AlignmentUtils.translatesAs(null, 0, null));
522 assertFalse(AlignmentUtils.translatesAs(new char[] { 't' }, 0, null));
523 assertFalse(AlignmentUtils.translatesAs(null, 0, new char[] { 'a' }));
525 // straight translation
526 assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0,
527 "FPKG".toCharArray()));
528 // with extra start codon (not in protein)
529 assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(),
530 3, "FPKG".toCharArray()));
531 // with stop codon1 (not in protein)
532 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
533 0, "FPKG".toCharArray()));
534 // with stop codon1 (in protein as *)
535 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
536 0, "FPKG*".toCharArray()));
537 // with stop codon2 (not in protein)
538 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtag".toCharArray(),
539 0, "FPKG".toCharArray()));
540 // with stop codon3 (not in protein)
541 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtga".toCharArray(),
542 0, "FPKG".toCharArray()));
543 // with start and stop codon1
544 assertTrue(AlignmentUtils.translatesAs(
545 "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG".toCharArray()));
546 // with start and stop codon1 (in protein as *)
547 assertTrue(AlignmentUtils.translatesAs(
548 "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG*".toCharArray()));
549 // with start and stop codon2
550 assertTrue(AlignmentUtils.translatesAs(
551 "atgtttcccaaagggtag".toCharArray(), 3, "FPKG".toCharArray()));
552 // with start and stop codon3
553 assertTrue(AlignmentUtils.translatesAs(
554 "atgtttcccaaagggtga".toCharArray(), 3, "FPKG".toCharArray()));
556 // with embedded stop codons
557 assertTrue(AlignmentUtils.translatesAs(
558 "atgtttTAGcccaaaTAAgggtga".toCharArray(), 3,
559 "F*PK*G".toCharArray()));
562 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
563 0, "FPMG".toCharArray()));
566 assertFalse(AlignmentUtils.translatesAs("tttcccaaagg".toCharArray(), 0,
567 "FPKG".toCharArray()));
570 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
571 0, "FPK".toCharArray()));
573 // overlong dna (doesn't end in stop codon)
574 assertFalse(AlignmentUtils.translatesAs(
575 "tttcccaaagggttt".toCharArray(), 0, "FPKG".toCharArray()));
577 // dna + stop codon + more
578 assertFalse(AlignmentUtils.translatesAs(
579 "tttcccaaagggttaga".toCharArray(), 0, "FPKG".toCharArray()));
582 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
583 0, "FPKGQ".toCharArray()));
587 * Test mapping of protein to cDNA, for cases where the cDNA has start and/or
588 * stop codons in addition to the protein coding sequence.
590 * @throws IOException
592 @Test(groups = { "Functional" })
593 public void testMapProteinAlignmentToCdna_withStartAndStopCodons()
596 List<SequenceI> protseqs = new ArrayList<SequenceI>();
597 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
598 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
599 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
600 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
601 protein.setDataset(null);
603 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
605 dnaseqs.add(new Sequence("EMBL|A11111", "ATGTCAGCACGC"));
607 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAATAA"));
608 // = start +EIQ + stop
609 dnaseqs.add(new Sequence("EMBL|A33333", "ATGGAAATCCAGTAG"));
610 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG"));
611 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
612 cdna.setDataset(null);
614 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
616 // 3 mappings made, each from 1 to 1 sequence
617 assertEquals(3, protein.getCodonFrames().size());
618 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
619 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
620 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
622 // V12345 mapped from A22222
623 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
625 assertEquals(1, acf.getdnaSeqs().length);
626 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
627 acf.getdnaSeqs()[0]);
628 Mapping[] protMappings = acf.getProtMappings();
629 assertEquals(1, protMappings.length);
630 MapList mapList = protMappings[0].getMap();
631 assertEquals(3, mapList.getFromRatio());
632 assertEquals(1, mapList.getToRatio());
633 assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
635 assertEquals(1, mapList.getFromRanges().size());
636 assertTrue(Arrays.equals(new int[] { 1, 3 },
637 mapList.getToRanges().get(0)));
638 assertEquals(1, mapList.getToRanges().size());
640 // V12346 mapped from A33333 starting position 4
641 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
642 assertEquals(1, acf.getdnaSeqs().length);
643 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
644 acf.getdnaSeqs()[0]);
645 protMappings = acf.getProtMappings();
646 assertEquals(1, protMappings.length);
647 mapList = protMappings[0].getMap();
648 assertEquals(3, mapList.getFromRatio());
649 assertEquals(1, mapList.getToRatio());
650 assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
652 assertEquals(1, mapList.getFromRanges().size());
653 assertTrue(Arrays.equals(new int[] { 1, 3 },
654 mapList.getToRanges().get(0)));
655 assertEquals(1, mapList.getToRanges().size());
657 // V12347 mapped to A11111 starting position 4
658 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
659 assertEquals(1, acf.getdnaSeqs().length);
660 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
661 acf.getdnaSeqs()[0]);
662 protMappings = acf.getProtMappings();
663 assertEquals(1, protMappings.length);
664 mapList = protMappings[0].getMap();
665 assertEquals(3, mapList.getFromRatio());
666 assertEquals(1, mapList.getToRatio());
667 assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
669 assertEquals(1, mapList.getFromRanges().size());
670 assertTrue(Arrays.equals(new int[] { 1, 3 },
671 mapList.getToRanges().get(0)));
672 assertEquals(1, mapList.getToRanges().size());
674 // no mapping involving the 'extra' A44444
675 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
679 * Test mapping of protein to cDNA, for the case where we have some sequence
680 * cross-references. Verify that 1-to-many mappings are made where
681 * cross-references exist and sequences are mappable.
683 * @throws IOException
685 @Test(groups = { "Functional" })
686 public void testMapProteinAlignmentToCdna_withXrefs() throws IOException
688 List<SequenceI> protseqs = new ArrayList<SequenceI>();
689 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
690 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
691 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
692 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
693 protein.setDataset(null);
695 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
696 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
697 dnaseqs.add(new Sequence("EMBL|A22222", "ATGGAGATACAA")); // = start + EIQ
698 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
699 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
700 dnaseqs.add(new Sequence("EMBL|A55555", "GAGATTCAG")); // = EIQ
701 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[5]));
702 cdna.setDataset(null);
704 // Xref A22222 to V12345 (should get mapped)
705 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
706 // Xref V12345 to A44444 (should get mapped)
707 protseqs.get(0).addDBRef(new DBRefEntry("EMBL", "1", "A44444"));
708 // Xref A33333 to V12347 (sequence mismatch - should not get mapped)
709 dnaseqs.get(2).addDBRef(new DBRefEntry("UNIPROT", "1", "V12347"));
710 // as V12345 is mapped to A22222 and A44444, this leaves V12346 unmapped.
711 // it should get paired up with the unmapped A33333
712 // A11111 should be mapped to V12347
713 // A55555 is spare and has no xref so is not mapped
715 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
717 // 4 protein mappings made for 3 proteins, 2 to V12345, 1 each to V12346/7
718 assertEquals(3, protein.getCodonFrames().size());
719 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
720 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
721 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
723 // one mapping for each of the first 4 cDNA sequences
724 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
725 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
726 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(2)).size());
727 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(3)).size());
729 // V12345 mapped to A22222 and A44444
730 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
732 assertEquals(2, acf.getdnaSeqs().length);
733 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
734 acf.getdnaSeqs()[0]);
735 assertEquals(cdna.getSequenceAt(3).getDatasetSequence(),
736 acf.getdnaSeqs()[1]);
738 // V12346 mapped to A33333
739 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
740 assertEquals(1, acf.getdnaSeqs().length);
741 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
742 acf.getdnaSeqs()[0]);
744 // V12347 mapped to A11111
745 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
746 assertEquals(1, acf.getdnaSeqs().length);
747 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
748 acf.getdnaSeqs()[0]);
750 // no mapping involving the 'extra' A55555
751 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(4)).isEmpty());
755 * Test mapping of protein to cDNA, for the case where we have some sequence
756 * cross-references. Verify that once we have made an xref mapping we don't
757 * also map un-xrefd sequeces.
759 * @throws IOException
761 @Test(groups = { "Functional" })
762 public void testMapProteinAlignmentToCdna_prioritiseXrefs()
765 List<SequenceI> protseqs = new ArrayList<SequenceI>();
766 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
767 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
768 AlignmentI protein = new Alignment(
769 protseqs.toArray(new SequenceI[protseqs.size()]));
770 protein.setDataset(null);
772 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
773 dnaseqs.add(new Sequence("EMBL|A11111", "GAAATCCAG")); // = EIQ
774 dnaseqs.add(new Sequence("EMBL|A22222", "GAAATTCAG")); // = EIQ
775 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[dnaseqs
777 cdna.setDataset(null);
779 // Xref A22222 to V12345 (should get mapped)
780 // A11111 should then be mapped to the unmapped V12346
781 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
783 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
785 // 2 protein mappings made
786 assertEquals(2, protein.getCodonFrames().size());
787 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
788 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
790 // one mapping for each of the cDNA sequences
791 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
792 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
794 // V12345 mapped to A22222
795 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
797 assertEquals(1, acf.getdnaSeqs().length);
798 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
799 acf.getdnaSeqs()[0]);
801 // V12346 mapped to A11111
802 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
803 assertEquals(1, acf.getdnaSeqs().length);
804 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
805 acf.getdnaSeqs()[0]);
809 * Test the method that shows or hides sequence annotations by type(s) and
812 @Test(groups = { "Functional" })
813 public void testShowOrHideSequenceAnnotations()
815 SequenceI seq1 = new Sequence("Seq1", "AAA");
816 SequenceI seq2 = new Sequence("Seq2", "BBB");
817 SequenceI seq3 = new Sequence("Seq3", "CCC");
818 Annotation[] anns = new Annotation[] { new Annotation(2f) };
819 AlignmentAnnotation ann1 = new AlignmentAnnotation("Structure", "ann1",
821 ann1.setSequenceRef(seq1);
822 AlignmentAnnotation ann2 = new AlignmentAnnotation("Structure", "ann2",
824 ann2.setSequenceRef(seq2);
825 AlignmentAnnotation ann3 = new AlignmentAnnotation("Structure", "ann3",
827 AlignmentAnnotation ann4 = new AlignmentAnnotation("Temp", "ann4", anns);
828 ann4.setSequenceRef(seq1);
829 AlignmentAnnotation ann5 = new AlignmentAnnotation("Temp", "ann5", anns);
830 ann5.setSequenceRef(seq2);
831 AlignmentAnnotation ann6 = new AlignmentAnnotation("Temp", "ann6", anns);
832 AlignmentI al = new Alignment(new SequenceI[] { seq1, seq2, seq3 });
833 al.addAnnotation(ann1); // Structure for Seq1
834 al.addAnnotation(ann2); // Structure for Seq2
835 al.addAnnotation(ann3); // Structure for no sequence
836 al.addAnnotation(ann4); // Temp for seq1
837 al.addAnnotation(ann5); // Temp for seq2
838 al.addAnnotation(ann6); // Temp for no sequence
839 List<String> types = new ArrayList<String>();
840 List<SequenceI> scope = new ArrayList<SequenceI>();
843 * Set all sequence related Structure to hidden (ann1, ann2)
845 types.add("Structure");
846 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
848 assertFalse(ann1.visible);
849 assertFalse(ann2.visible);
850 assertTrue(ann3.visible); // not sequence-related, not affected
851 assertTrue(ann4.visible); // not Structure, not affected
852 assertTrue(ann5.visible); // "
853 assertTrue(ann6.visible); // not sequence-related, not affected
856 * Set Temp in {seq1, seq3} to hidden
862 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, false,
864 assertFalse(ann1.visible); // unchanged
865 assertFalse(ann2.visible); // unchanged
866 assertTrue(ann3.visible); // not sequence-related, not affected
867 assertFalse(ann4.visible); // Temp for seq1 hidden
868 assertTrue(ann5.visible); // not in scope, not affected
869 assertTrue(ann6.visible); // not sequence-related, not affected
872 * Set Temp in all sequences to hidden
878 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
880 assertFalse(ann1.visible); // unchanged
881 assertFalse(ann2.visible); // unchanged
882 assertTrue(ann3.visible); // not sequence-related, not affected
883 assertFalse(ann4.visible); // Temp for seq1 hidden
884 assertFalse(ann5.visible); // Temp for seq2 hidden
885 assertTrue(ann6.visible); // not sequence-related, not affected
888 * Set all types in {seq1, seq3} to visible
894 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, true,
896 assertTrue(ann1.visible); // Structure for seq1 set visible
897 assertFalse(ann2.visible); // not in scope, unchanged
898 assertTrue(ann3.visible); // not sequence-related, not affected
899 assertTrue(ann4.visible); // Temp for seq1 set visible
900 assertFalse(ann5.visible); // not in scope, unchanged
901 assertTrue(ann6.visible); // not sequence-related, not affected
904 * Set all types in all scope to hidden
906 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, true,
908 assertFalse(ann1.visible);
909 assertFalse(ann2.visible);
910 assertTrue(ann3.visible); // not sequence-related, not affected
911 assertFalse(ann4.visible);
912 assertFalse(ann5.visible);
913 assertTrue(ann6.visible); // not sequence-related, not affected
917 * Tests for the method that checks if one sequence cross-references another
919 @Test(groups = { "Functional" })
920 public void testHasCrossRef()
922 assertFalse(AlignmentUtils.hasCrossRef(null, null));
923 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
924 assertFalse(AlignmentUtils.hasCrossRef(seq1, null));
925 assertFalse(AlignmentUtils.hasCrossRef(null, seq1));
926 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
927 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
930 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20193"));
931 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
933 // case-insensitive; version number is ignored
934 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20192"));
935 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
938 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
939 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
940 // test is one-way only
941 assertFalse(AlignmentUtils.hasCrossRef(seq2, seq1));
945 * Tests for the method that checks if either sequence cross-references the
948 @Test(groups = { "Functional" })
949 public void testHaveCrossRef()
951 assertFalse(AlignmentUtils.hasCrossRef(null, null));
952 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
953 assertFalse(AlignmentUtils.haveCrossRef(seq1, null));
954 assertFalse(AlignmentUtils.haveCrossRef(null, seq1));
955 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
956 assertFalse(AlignmentUtils.haveCrossRef(seq1, seq2));
958 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
959 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
960 // next is true for haveCrossRef, false for hasCrossRef
961 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
963 // now the other way round
964 seq1.setDBRefs(null);
965 seq2.addDBRef(new DBRefEntry("EMBL", "1", "A12345"));
966 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
967 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
970 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
971 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
972 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
976 * Test the method that extracts the cds-only part of a dna alignment.
978 @Test(groups = { "Functional" })
979 public void testMakeCdsAlignment()
981 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
982 SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC");
983 SequenceI pep1 = new Sequence("pep1", "GF");
984 SequenceI pep2 = new Sequence("pep2", "GFP");
985 dna1.createDatasetSequence();
986 dna2.createDatasetSequence();
987 pep1.createDatasetSequence();
988 pep2.createDatasetSequence();
989 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 6, 0f,
991 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 10, 12, 0f,
993 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds3", 1, 3, 0f,
995 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds4", 7, 9, 0f,
997 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds5", 13, 15, 0f,
999 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
1000 dna.setDataset(null);
1002 List<AlignedCodonFrame> mappings = new ArrayList<AlignedCodonFrame>();
1003 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1004 new int[] { 1, 2 }, 3, 1);
1005 AlignedCodonFrame acf = new AlignedCodonFrame();
1006 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1008 map = new MapList(new int[] { 1, 3, 7, 9, 13, 15 }, new int[] { 1, 3 },
1010 acf = new AlignedCodonFrame();
1011 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
1015 * execute method under test:
1017 AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
1018 dna1, dna2 }, mappings, dna);
1020 assertEquals(2, cds.getSequences().size());
1021 assertEquals("GGGTTT", cds.getSequenceAt(0)
1022 .getSequenceAsString());
1023 assertEquals("GGGTTTCCC", cds.getSequenceAt(1)
1024 .getSequenceAsString());
1027 * verify shared, extended alignment dataset
1029 assertSame(dna.getDataset(), cds.getDataset());
1030 assertTrue(dna.getDataset().getSequences()
1031 .contains(cds.getSequenceAt(0).getDatasetSequence()));
1032 assertTrue(dna.getDataset().getSequences()
1033 .contains(cds.getSequenceAt(1).getDatasetSequence()));
1036 * Verify mappings from CDS to peptide and cDNA to CDS
1037 * the mappings are on the shared alignment dataset
1039 assertSame(dna.getCodonFrames(), cds.getCodonFrames());
1040 List<AlignedCodonFrame> cdsMappings = cds.getCodonFrames();
1041 assertEquals(2, cdsMappings.size());
1044 * Mapping from pep1 to GGGTTT in first new exon sequence
1046 List<AlignedCodonFrame> pep1Mapping = MappingUtils
1047 .findMappingsForSequence(pep1, cdsMappings);
1048 assertEquals(1, pep1Mapping.size());
1050 SearchResults sr = MappingUtils
1051 .buildSearchResults(pep1, 1, cdsMappings);
1052 assertEquals(1, sr.getResults().size());
1053 Match m = sr.getResults().get(0);
1054 assertSame(cds.getSequenceAt(0).getDatasetSequence(),
1056 assertEquals(1, m.getStart());
1057 assertEquals(3, m.getEnd());
1059 sr = MappingUtils.buildSearchResults(pep1, 2, cdsMappings);
1060 m = sr.getResults().get(0);
1061 assertSame(cds.getSequenceAt(0).getDatasetSequence(),
1063 assertEquals(4, m.getStart());
1064 assertEquals(6, m.getEnd());
1067 * Mapping from pep2 to GGGTTTCCC in second new exon sequence
1069 List<AlignedCodonFrame> pep2Mapping = MappingUtils
1070 .findMappingsForSequence(pep2, cdsMappings);
1071 assertEquals(1, pep2Mapping.size());
1073 sr = MappingUtils.buildSearchResults(pep2, 1, cdsMappings);
1074 assertEquals(1, sr.getResults().size());
1075 m = sr.getResults().get(0);
1076 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1078 assertEquals(1, m.getStart());
1079 assertEquals(3, m.getEnd());
1081 sr = MappingUtils.buildSearchResults(pep2, 2, cdsMappings);
1082 m = sr.getResults().get(0);
1083 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1085 assertEquals(4, m.getStart());
1086 assertEquals(6, m.getEnd());
1088 sr = MappingUtils.buildSearchResults(pep2, 3, cdsMappings);
1089 m = sr.getResults().get(0);
1090 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1092 assertEquals(7, m.getStart());
1093 assertEquals(9, m.getEnd());
1097 * Test the method that makes a cds-only alignment from a DNA sequence and its
1098 * product mappings, for the case where there are multiple exon mappings to
1099 * different protein products.
1101 @Test(groups = { "Functional" })
1102 public void testMakeCdsAlignment_multipleProteins()
1104 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
1105 SequenceI pep1 = new Sequence("pep1", "GF"); // GGGTTT
1106 SequenceI pep2 = new Sequence("pep2", "KP"); // aaaccc
1107 SequenceI pep3 = new Sequence("pep3", "KF"); // aaaTTT
1108 dna1.createDatasetSequence();
1109 pep1.createDatasetSequence();
1110 pep2.createDatasetSequence();
1111 pep3.createDatasetSequence();
1112 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 6, 0f,
1114 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 10, 12, 0f,
1116 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds3", 1, 3, 0f,
1118 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds4", 7, 9, 0f,
1120 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds5", 1, 3, 0f,
1122 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds6", 10, 12, 0f,
1124 pep1.getDatasetSequence().addDBRef(
1125 new DBRefEntry("EMBLCDS", "2", "A12345"));
1126 pep2.getDatasetSequence().addDBRef(
1127 new DBRefEntry("EMBLCDS", "3", "A12346"));
1128 pep3.getDatasetSequence().addDBRef(
1129 new DBRefEntry("EMBLCDS", "4", "A12347"));
1132 * Make the mappings from dna to protein
1134 List<AlignedCodonFrame> mappings = new ArrayList<AlignedCodonFrame>();
1135 // map ...GGG...TTT to GF
1136 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1137 new int[] { 1, 2 }, 3, 1);
1138 AlignedCodonFrame acf = new AlignedCodonFrame();
1139 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1142 // map aaa...ccc to KP
1143 map = new MapList(new int[] { 1, 3, 7, 9 }, new int[] { 1, 2 }, 3, 1);
1144 acf = new AlignedCodonFrame();
1145 acf.addMap(dna1.getDatasetSequence(), pep2.getDatasetSequence(), map);
1148 // map aaa......TTT to KF
1149 map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 2 }, 3, 1);
1150 acf = new AlignedCodonFrame();
1151 acf.addMap(dna1.getDatasetSequence(), pep3.getDatasetSequence(), map);
1155 * Create the CDS alignment; also augments the dna-to-protein mappings with
1156 * exon-to-protein and exon-to-dna mappings
1158 AlignmentI dna = new Alignment(new SequenceI[] { dna1 });
1159 dna.setDataset(null);
1162 * execute method under test
1164 AlignmentI cdsal = AlignmentUtils.makeCdsAlignment(
1165 new SequenceI[] { dna1 }, mappings, dna);
1168 * Verify we have 3 cds sequences, mapped to pep1/2/3 respectively
1170 List<SequenceI> cds = cdsal.getSequences();
1171 assertEquals(3, cds.size());
1174 * verify shared, extended alignment dataset
1176 assertSame(cdsal.getDataset(), dna.getDataset());
1177 assertTrue(dna.getDataset().getSequences()
1178 .contains(cds.get(0).getDatasetSequence()));
1179 assertTrue(dna.getDataset().getSequences()
1180 .contains(cds.get(1).getDatasetSequence()));
1181 assertTrue(dna.getDataset().getSequences()
1182 .contains(cds.get(2).getDatasetSequence()));
1185 * verify aligned cds sequences and their xrefs
1187 SequenceI cdsSeq = cds.get(0);
1188 assertEquals("GGGTTT", cdsSeq.getSequenceAsString());
1189 // assertEquals("dna1|A12345", cdsSeq.getName());
1190 assertEquals("dna1|pep1", cdsSeq.getName());
1191 // assertEquals(1, cdsSeq.getDBRefs().length);
1192 // DBRefEntry cdsRef = cdsSeq.getDBRefs()[0];
1193 // assertEquals("EMBLCDS", cdsRef.getSource());
1194 // assertEquals("2", cdsRef.getVersion());
1195 // assertEquals("A12345", cdsRef.getAccessionId());
1197 cdsSeq = cds.get(1);
1198 assertEquals("aaaccc", cdsSeq.getSequenceAsString());
1199 // assertEquals("dna1|A12346", cdsSeq.getName());
1200 assertEquals("dna1|pep2", cdsSeq.getName());
1201 // assertEquals(1, cdsSeq.getDBRefs().length);
1202 // cdsRef = cdsSeq.getDBRefs()[0];
1203 // assertEquals("EMBLCDS", cdsRef.getSource());
1204 // assertEquals("3", cdsRef.getVersion());
1205 // assertEquals("A12346", cdsRef.getAccessionId());
1207 cdsSeq = cds.get(2);
1208 assertEquals("aaaTTT", cdsSeq.getSequenceAsString());
1209 // assertEquals("dna1|A12347", cdsSeq.getName());
1210 assertEquals("dna1|pep3", cdsSeq.getName());
1211 // assertEquals(1, cdsSeq.getDBRefs().length);
1212 // cdsRef = cdsSeq.getDBRefs()[0];
1213 // assertEquals("EMBLCDS", cdsRef.getSource());
1214 // assertEquals("4", cdsRef.getVersion());
1215 // assertEquals("A12347", cdsRef.getAccessionId());
1218 * Verify there are mappings from each cds sequence to its protein product
1219 * and also to its dna source
1221 Iterator<AlignedCodonFrame> newMappingsIterator = cdsal
1222 .getCodonFrames().iterator();
1224 // mappings for dna1 - exon1 - pep1
1225 AlignedCodonFrame cdsMapping = newMappingsIterator.next();
1226 List<Mapping> dnaMappings = cdsMapping.getMappingsFromSequence(dna1);
1227 assertEquals(3, dnaMappings.size());
1228 assertSame(cds.get(0).getDatasetSequence(), dnaMappings.get(0)
1230 assertEquals("G(1) in CDS should map to G(4) in DNA", 4, dnaMappings
1231 .get(0).getMap().getToPosition(1));
1232 List<Mapping> peptideMappings = cdsMapping.getMappingsFromSequence(cds
1233 .get(0).getDatasetSequence());
1234 assertEquals(1, peptideMappings.size());
1235 assertSame(pep1.getDatasetSequence(), peptideMappings.get(0).getTo());
1237 // mappings for dna1 - cds2 - pep2
1238 assertSame(cds.get(1).getDatasetSequence(), dnaMappings.get(1)
1240 assertEquals("c(4) in CDS should map to c(7) in DNA", 7, dnaMappings
1241 .get(1).getMap().getToPosition(4));
1242 peptideMappings = cdsMapping.getMappingsFromSequence(cds.get(1)
1243 .getDatasetSequence());
1244 assertEquals(1, peptideMappings.size());
1245 assertSame(pep2.getDatasetSequence(), peptideMappings.get(0).getTo());
1247 // mappings for dna1 - cds3 - pep3
1248 assertSame(cds.get(2).getDatasetSequence(), dnaMappings.get(2)
1250 assertEquals("T(4) in CDS should map to T(10) in DNA", 10, dnaMappings
1251 .get(2).getMap().getToPosition(4));
1252 peptideMappings = cdsMapping.getMappingsFromSequence(cds.get(2)
1253 .getDatasetSequence());
1254 assertEquals(1, peptideMappings.size());
1255 assertSame(pep3.getDatasetSequence(), peptideMappings.get(0).getTo());
1258 @Test(groups = { "Functional" })
1259 public void testIsMappable()
1261 SequenceI dna1 = new Sequence("dna1", "cgCAGtgGT");
1262 SequenceI aa1 = new Sequence("aa1", "RSG");
1263 AlignmentI al1 = new Alignment(new SequenceI[] { dna1 });
1264 AlignmentI al2 = new Alignment(new SequenceI[] { aa1 });
1266 assertFalse(AlignmentUtils.isMappable(null, null));
1267 assertFalse(AlignmentUtils.isMappable(al1, null));
1268 assertFalse(AlignmentUtils.isMappable(null, al1));
1269 assertFalse(AlignmentUtils.isMappable(al1, al1));
1270 assertFalse(AlignmentUtils.isMappable(al2, al2));
1272 assertTrue(AlignmentUtils.isMappable(al1, al2));
1273 assertTrue(AlignmentUtils.isMappable(al2, al1));
1277 * Test creating a mapping when the sequences involved do not start at residue
1280 * @throws IOException
1282 @Test(groups = { "Functional" })
1283 public void testMapCdnaToProtein_forSubsequence()
1286 SequenceI prot = new Sequence("UNIPROT|V12345", "E-I--Q", 10, 12);
1287 prot.createDatasetSequence();
1289 SequenceI dna = new Sequence("EMBL|A33333", "GAA--AT-C-CAG", 40, 48);
1290 dna.createDatasetSequence();
1292 MapList map = AlignmentUtils.mapCdnaToProtein(prot, dna);
1293 assertEquals(10, map.getToLowest());
1294 assertEquals(12, map.getToHighest());
1295 assertEquals(40, map.getFromLowest());
1296 assertEquals(48, map.getFromHighest());
1300 * Test for the alignSequenceAs method where we have protein mapped to protein
1302 @Test(groups = { "Functional" })
1303 public void testAlignSequenceAs_mappedProteinProtein()
1306 SequenceI alignMe = new Sequence("Match", "MGAASEV");
1307 alignMe.createDatasetSequence();
1308 SequenceI alignFrom = new Sequence("Query", "LQTGYMGAASEVMFSPTRR");
1309 alignFrom.createDatasetSequence();
1311 AlignedCodonFrame acf = new AlignedCodonFrame();
1312 // this is like a domain or motif match of part of a peptide sequence
1313 MapList map = new MapList(new int[] { 6, 12 }, new int[] { 1, 7 }, 1, 1);
1314 acf.addMap(alignFrom.getDatasetSequence(),
1315 alignMe.getDatasetSequence(), map);
1317 AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "-", '-', true,
1319 assertEquals("-----MGAASEV-------", alignMe.getSequenceAsString());
1323 * Test for the alignSequenceAs method where there are trailing unmapped
1324 * residues in the model sequence
1326 @Test(groups = { "Functional" })
1327 public void testAlignSequenceAs_withTrailingPeptide()
1329 // map first 3 codons to KPF; G is a trailing unmapped residue
1330 MapList map = new MapList(new int[] { 1, 9 }, new int[] { 1, 3 }, 3, 1);
1332 checkAlignSequenceAs("AAACCCTTT", "K-PFG", true, true, map,
1337 * Tests for transferring features between mapped sequences
1339 @Test(groups = { "Functional" })
1340 public void testTransferFeatures()
1342 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1343 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1346 dna.addSequenceFeature(new SequenceFeature("type1", "desc1", 1, 2, 1f,
1348 // partial overlap - to [1, 1]
1349 dna.addSequenceFeature(new SequenceFeature("type2", "desc2", 3, 4, 2f,
1351 // exact overlap - to [1, 3]
1352 dna.addSequenceFeature(new SequenceFeature("type3", "desc3", 4, 6, 3f,
1354 // spanning overlap - to [2, 5]
1355 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1357 // exactly overlaps whole mapped range [1, 6]
1358 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1360 // no overlap (internal)
1361 dna.addSequenceFeature(new SequenceFeature("type6", "desc6", 7, 9, 6f,
1363 // no overlap (3' end)
1364 dna.addSequenceFeature(new SequenceFeature("type7", "desc7", 13, 15,
1366 // overlap (3' end) - to [6, 6]
1367 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1369 // extended overlap - to [6, +]
1370 dna.addSequenceFeature(new SequenceFeature("type9", "desc9", 12, 13,
1373 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1374 new int[] { 1, 6 }, 1, 1);
1377 * transferFeatures() will build 'partial overlap' for regions
1378 * that partially overlap 5' or 3' (start or end) of target sequence
1380 AlignmentUtils.transferFeatures(dna, cds, map, null);
1381 SequenceFeature[] sfs = cds.getSequenceFeatures();
1382 assertEquals(6, sfs.length);
1384 SequenceFeature sf = sfs[0];
1385 assertEquals("type2", sf.getType());
1386 assertEquals("desc2", sf.getDescription());
1387 assertEquals(2f, sf.getScore());
1388 assertEquals(1, sf.getBegin());
1389 assertEquals(1, sf.getEnd());
1392 assertEquals("type3", sf.getType());
1393 assertEquals("desc3", sf.getDescription());
1394 assertEquals(3f, sf.getScore());
1395 assertEquals(1, sf.getBegin());
1396 assertEquals(3, sf.getEnd());
1399 assertEquals("type4", sf.getType());
1400 assertEquals(2, sf.getBegin());
1401 assertEquals(5, sf.getEnd());
1404 assertEquals("type5", sf.getType());
1405 assertEquals(1, sf.getBegin());
1406 assertEquals(6, sf.getEnd());
1409 assertEquals("type8", sf.getType());
1410 assertEquals(6, sf.getBegin());
1411 assertEquals(6, sf.getEnd());
1414 assertEquals("type9", sf.getType());
1415 assertEquals(6, sf.getBegin());
1416 assertEquals(6, sf.getEnd());
1420 * Tests for transferring features between mapped sequences
1422 @Test(groups = { "Functional" })
1423 public void testTransferFeatures_withOmit()
1425 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1426 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1428 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1429 new int[] { 1, 6 }, 1, 1);
1431 // [5, 11] maps to [2, 5]
1432 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1434 // [4, 12] maps to [1, 6]
1435 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1437 // [12, 12] maps to [6, 6]
1438 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1441 // desc4 and desc8 are the 'omit these' varargs
1442 AlignmentUtils.transferFeatures(dna, cds, map, null, "type4", "type8");
1443 SequenceFeature[] sfs = cds.getSequenceFeatures();
1444 assertEquals(1, sfs.length);
1446 SequenceFeature sf = sfs[0];
1447 assertEquals("type5", sf.getType());
1448 assertEquals(1, sf.getBegin());
1449 assertEquals(6, sf.getEnd());
1453 * Tests for transferring features between mapped sequences
1455 @Test(groups = { "Functional" })
1456 public void testTransferFeatures_withSelect()
1458 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1459 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1461 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1462 new int[] { 1, 6 }, 1, 1);
1464 // [5, 11] maps to [2, 5]
1465 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1467 // [4, 12] maps to [1, 6]
1468 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1470 // [12, 12] maps to [6, 6]
1471 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1474 // "type5" is the 'select this type' argument
1475 AlignmentUtils.transferFeatures(dna, cds, map, "type5");
1476 SequenceFeature[] sfs = cds.getSequenceFeatures();
1477 assertEquals(1, sfs.length);
1479 SequenceFeature sf = sfs[0];
1480 assertEquals("type5", sf.getType());
1481 assertEquals(1, sf.getBegin());
1482 assertEquals(6, sf.getEnd());
1486 * Test the method that extracts the cds-only part of a dna alignment, for the
1487 * case where the cds should be aligned to match its nucleotide sequence.
1489 @Test(groups = { "Functional" })
1490 public void testMakeCdsAlignment_alternativeTranscripts()
1492 SequenceI dna1 = new Sequence("dna1", "aaaGGGCC-----CTTTaaaGGG");
1493 // alternative transcript of same dna skips CCC codon
1494 SequenceI dna2 = new Sequence("dna2", "aaaGGGCC-----cttTaaaGGG");
1495 // dna3 has no mapping (protein product) so should be ignored here
1496 SequenceI dna3 = new Sequence("dna3", "aaaGGGCCCCCGGGcttTaaaGGG");
1497 SequenceI pep1 = new Sequence("pep1", "GPFG");
1498 SequenceI pep2 = new Sequence("pep2", "GPG");
1499 dna1.createDatasetSequence();
1500 dna2.createDatasetSequence();
1501 dna3.createDatasetSequence();
1502 pep1.createDatasetSequence();
1503 pep2.createDatasetSequence();
1504 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 8, 0f,
1506 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 9, 12, 0f,
1508 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds3", 16, 18, 0f,
1510 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 4, 8, 0f,
1512 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 12, 12, 0f,
1514 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 16, 18, 0f,
1517 List<AlignedCodonFrame> mappings = new ArrayList<AlignedCodonFrame>();
1518 MapList map = new MapList(new int[] { 4, 12, 16, 18 },
1519 new int[] { 1, 4 }, 3, 1);
1520 AlignedCodonFrame acf = new AlignedCodonFrame();
1521 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1523 map = new MapList(new int[] { 4, 8, 12, 12, 16, 18 },
1526 acf = new AlignedCodonFrame();
1527 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
1530 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
1531 dna.setDataset(null);
1532 AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
1533 dna1, dna2, dna3 }, mappings, dna);
1534 List<SequenceI> cdsSeqs = cds.getSequences();
1535 assertEquals(2, cdsSeqs.size());
1536 assertEquals("GGGCCCTTTGGG", cdsSeqs.get(0).getSequenceAsString());
1537 assertEquals("GGGCCTGGG", cdsSeqs.get(1).getSequenceAsString());
1540 * verify shared, extended alignment dataset
1542 assertSame(dna.getDataset(), cds.getDataset());
1543 assertTrue(dna.getDataset().getSequences()
1544 .contains(cdsSeqs.get(0).getDatasetSequence()));
1545 assertTrue(dna.getDataset().getSequences()
1546 .contains(cdsSeqs.get(1).getDatasetSequence()));
1549 * Verify updated mappings
1551 List<AlignedCodonFrame> cdsMappings = cds.getCodonFrames();
1552 assertEquals(2, cdsMappings.size());
1555 * Mapping from pep1 to GGGTTT in first new CDS sequence
1557 List<AlignedCodonFrame> pep1Mapping = MappingUtils
1558 .findMappingsForSequence(pep1, cdsMappings);
1559 assertEquals(1, pep1Mapping.size());
1561 * maps GPFG to 1-3,4-6,7-9,10-12
1563 SearchResults sr = MappingUtils
1564 .buildSearchResults(pep1, 1, cdsMappings);
1565 assertEquals(1, sr.getResults().size());
1566 Match m = sr.getResults().get(0);
1567 assertEquals(cds.getSequenceAt(0).getDatasetSequence(),
1569 assertEquals(1, m.getStart());
1570 assertEquals(3, m.getEnd());
1571 sr = MappingUtils.buildSearchResults(pep1, 2, cdsMappings);
1572 m = sr.getResults().get(0);
1573 assertEquals(4, m.getStart());
1574 assertEquals(6, m.getEnd());
1575 sr = MappingUtils.buildSearchResults(pep1, 3, cdsMappings);
1576 m = sr.getResults().get(0);
1577 assertEquals(7, m.getStart());
1578 assertEquals(9, m.getEnd());
1579 sr = MappingUtils.buildSearchResults(pep1, 4, cdsMappings);
1580 m = sr.getResults().get(0);
1581 assertEquals(10, m.getStart());
1582 assertEquals(12, m.getEnd());
1585 * GPG in pep2 map to 1-3,4-6,7-9 in second CDS sequence
1587 List<AlignedCodonFrame> pep2Mapping = MappingUtils
1588 .findMappingsForSequence(pep2, cdsMappings);
1589 assertEquals(1, pep2Mapping.size());
1590 sr = MappingUtils.buildSearchResults(pep2, 1, cdsMappings);
1591 assertEquals(1, sr.getResults().size());
1592 m = sr.getResults().get(0);
1593 assertEquals(cds.getSequenceAt(1).getDatasetSequence(),
1595 assertEquals(1, m.getStart());
1596 assertEquals(3, m.getEnd());
1597 sr = MappingUtils.buildSearchResults(pep2, 2, cdsMappings);
1598 m = sr.getResults().get(0);
1599 assertEquals(4, m.getStart());
1600 assertEquals(6, m.getEnd());
1601 sr = MappingUtils.buildSearchResults(pep2, 3, cdsMappings);
1602 m = sr.getResults().get(0);
1603 assertEquals(7, m.getStart());
1604 assertEquals(9, m.getEnd());
1608 * Test the method that realigns protein to match mapped codon alignment.
1610 @Test(groups = { "Functional" })
1611 public void testAlignProteinAsDna_incompleteStartCodon()
1613 // seq1: incomplete start codon (not mapped), then [3, 11]
1614 SequenceI dna1 = new Sequence("Seq1", "ccAAA-TTT-GGG-");
1615 // seq2 codons are [4, 5], [8, 11]
1616 SequenceI dna2 = new Sequence("Seq2", "ccaAA-ttT-GGG-");
1617 // seq3 incomplete start codon at 'tt'
1618 SequenceI dna3 = new Sequence("Seq3", "ccaaa-ttt-GGG-");
1619 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
1620 dna.setDataset(null);
1622 // prot1 has 'X' for incomplete start codon (not mapped)
1623 SequenceI prot1 = new Sequence("Seq1", "XKFG"); // X for incomplete start
1624 SequenceI prot2 = new Sequence("Seq2", "NG");
1625 SequenceI prot3 = new Sequence("Seq3", "XG"); // X for incomplete start
1626 AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
1628 protein.setDataset(null);
1630 // map dna1 [3, 11] to prot1 [2, 4] KFG
1631 MapList map = new MapList(new int[] { 3, 11 }, new int[] { 2, 4 }, 3, 1);
1632 AlignedCodonFrame acf = new AlignedCodonFrame();
1633 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
1635 // map dna2 [4, 5] [8, 11] to prot2 [1, 2] NG
1636 map = new MapList(new int[] { 4, 5, 8, 11 }, new int[] { 1, 2 }, 3, 1);
1637 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
1639 // map dna3 [9, 11] to prot3 [2, 2] G
1640 map = new MapList(new int[] { 9, 11 }, new int[] { 2, 2 }, 3, 1);
1641 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
1643 ArrayList<AlignedCodonFrame> acfs = new ArrayList<AlignedCodonFrame>();
1645 protein.setCodonFrames(acfs);
1648 * verify X is included in the aligned proteins, and placed just
1649 * before the first mapped residue
1650 * CCT is between CCC and TTT
1652 AlignmentUtils.alignProteinAsDna(protein, dna);
1653 assertEquals("XK-FG", prot1.getSequenceAsString());
1654 assertEquals("--N-G", prot2.getSequenceAsString());
1655 assertEquals("---XG", prot3.getSequenceAsString());
1659 * Tests for the method that maps the subset of a dna sequence that has CDS
1660 * (or subtype) feature - case where the start codon is incomplete.
1662 @Test(groups = "Functional")
1663 public void testFindCdsPositions_fivePrimeIncomplete()
1665 SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
1666 dnaSeq.createDatasetSequence();
1667 SequenceI ds = dnaSeq.getDatasetSequence();
1669 // CDS for dna 5-6 (incomplete codon), 7-9
1670 SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
1671 sf.setPhase("2"); // skip 2 bases to start of next codon
1672 ds.addSequenceFeature(sf);
1673 // CDS for dna 13-15
1674 sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
1675 ds.addSequenceFeature(sf);
1677 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1680 * check the mapping starts with the first complete codon
1682 assertEquals(6, MappingUtils.getLength(ranges));
1683 assertEquals(2, ranges.size());
1684 assertEquals(7, ranges.get(0)[0]);
1685 assertEquals(9, ranges.get(0)[1]);
1686 assertEquals(13, ranges.get(1)[0]);
1687 assertEquals(15, ranges.get(1)[1]);
1691 * Tests for the method that maps the subset of a dna sequence that has CDS
1692 * (or subtype) feature.
1694 @Test(groups = "Functional")
1695 public void testFindCdsPositions()
1697 SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
1698 dnaSeq.createDatasetSequence();
1699 SequenceI ds = dnaSeq.getDatasetSequence();
1701 // CDS for dna 10-12
1702 SequenceFeature sf = new SequenceFeature("CDS_predicted", "", 10, 12,
1705 ds.addSequenceFeature(sf);
1707 sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
1709 ds.addSequenceFeature(sf);
1710 // exon feature should be ignored here
1711 sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
1712 ds.addSequenceFeature(sf);
1714 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1716 * verify ranges { [4-6], [12-10] }
1717 * note CDS ranges are ordered ascending even if the CDS
1720 assertEquals(6, MappingUtils.getLength(ranges));
1721 assertEquals(2, ranges.size());
1722 assertEquals(4, ranges.get(0)[0]);
1723 assertEquals(6, ranges.get(0)[1]);
1724 assertEquals(10, ranges.get(1)[0]);
1725 assertEquals(12, ranges.get(1)[1]);
1729 * Test the method that computes a map of codon variants for each protein
1730 * position from "sequence_variant" features on dna
1732 @Test(groups = "Functional")
1733 public void testBuildDnaVariantsMap()
1735 SequenceI dna = new Sequence("dna", "atgAAATTTGGGCCCtag");
1736 MapList map = new MapList(new int[] { 1, 18 }, new int[] { 1, 5 }, 3, 1);
1739 * first with no variants on dna
1741 LinkedHashMap<Integer, List<DnaVariant>[]> variantsMap = AlignmentUtils
1742 .buildDnaVariantsMap(dna, map);
1743 assertTrue(variantsMap.isEmpty());
1746 * single allele codon 1, on base 1
1748 SequenceFeature sf1 = new SequenceFeature("sequence_variant", "", 1, 1,
1750 sf1.setValue("alleles", "T");
1751 sf1.setValue("ID", "sequence_variant:rs758803211");
1752 dna.addSequenceFeature(sf1);
1755 * two alleles codon 2, on bases 2 and 3 (distinct variants)
1757 SequenceFeature sf2 = new SequenceFeature("sequence_variant", "", 5, 5,
1759 sf2.setValue("alleles", "T");
1760 sf2.setValue("ID", "sequence_variant:rs758803212");
1761 dna.addSequenceFeature(sf2);
1762 SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 6, 6,
1764 sf3.setValue("alleles", "G");
1765 sf3.setValue("ID", "sequence_variant:rs758803213");
1766 dna.addSequenceFeature(sf3);
1769 * two alleles codon 3, both on base 2 (one variant)
1771 SequenceFeature sf4 = new SequenceFeature("sequence_variant", "", 8, 8,
1773 sf4.setValue("alleles", "C, G");
1774 sf4.setValue("ID", "sequence_variant:rs758803214");
1775 dna.addSequenceFeature(sf4);
1777 // no alleles on codon 4
1780 * alleles on codon 5 on all 3 bases (distinct variants)
1782 SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 13,
1784 sf5.setValue("alleles", "C, G"); // (C duplicates given base value)
1785 sf5.setValue("ID", "sequence_variant:rs758803215");
1786 dna.addSequenceFeature(sf5);
1787 SequenceFeature sf6 = new SequenceFeature("sequence_variant", "", 14,
1789 sf6.setValue("alleles", "g, a"); // should force to upper-case
1790 sf6.setValue("ID", "sequence_variant:rs758803216");
1791 dna.addSequenceFeature(sf6);
1792 SequenceFeature sf7 = new SequenceFeature("sequence_variant", "", 15,
1794 sf7.setValue("alleles", "A, T");
1795 sf7.setValue("ID", "sequence_variant:rs758803217");
1796 dna.addSequenceFeature(sf7);
1799 * build map - expect variants on positions 1, 2, 3, 5
1801 variantsMap = AlignmentUtils.buildDnaVariantsMap(dna, map);
1802 assertEquals(4, variantsMap.size());
1805 * protein residue 1: variant on codon (ATG) base 1, not on 2 or 3
1807 List<DnaVariant>[] pep1Variants = variantsMap.get(1);
1808 assertEquals(3, pep1Variants.length);
1809 assertEquals(1, pep1Variants[0].size());
1810 assertEquals("A", pep1Variants[0].get(0).base); // codon[1] base
1811 assertSame(sf1, pep1Variants[0].get(0).variant); // codon[1] variant
1812 assertEquals(1, pep1Variants[1].size());
1813 assertEquals("T", pep1Variants[1].get(0).base); // codon[2] base
1814 assertNull(pep1Variants[1].get(0).variant); // no variant here
1815 assertEquals(1, pep1Variants[2].size());
1816 assertEquals("G", pep1Variants[2].get(0).base); // codon[3] base
1817 assertNull(pep1Variants[2].get(0).variant); // no variant here
1820 * protein residue 2: variants on codon (AAA) bases 2 and 3
1822 List<DnaVariant>[] pep2Variants = variantsMap.get(2);
1823 assertEquals(3, pep2Variants.length);
1824 assertEquals(1, pep2Variants[0].size());
1825 // codon[1] base recorded while processing variant on codon[2]
1826 assertEquals("A", pep2Variants[0].get(0).base);
1827 assertNull(pep2Variants[0].get(0).variant); // no variant here
1828 // codon[2] base and variant:
1829 assertEquals(1, pep2Variants[1].size());
1830 assertEquals("A", pep2Variants[1].get(0).base);
1831 assertSame(sf2, pep2Variants[1].get(0).variant);
1832 // codon[3] base was recorded when processing codon[2] variant
1833 // and then the variant for codon[3] added to it
1834 assertEquals(1, pep2Variants[2].size());
1835 assertEquals("A", pep2Variants[2].get(0).base);
1836 assertSame(sf3, pep2Variants[2].get(0).variant);
1839 * protein residue 3: variants on codon (TTT) base 2 only
1841 List<DnaVariant>[] pep3Variants = variantsMap.get(3);
1842 assertEquals(3, pep3Variants.length);
1843 assertEquals(1, pep3Variants[0].size());
1844 assertEquals("T", pep3Variants[0].get(0).base); // codon[1] base
1845 assertNull(pep3Variants[0].get(0).variant); // no variant here
1846 assertEquals(1, pep3Variants[1].size());
1847 assertEquals("T", pep3Variants[1].get(0).base); // codon[2] base
1848 assertSame(sf4, pep3Variants[1].get(0).variant); // codon[2] variant
1849 assertEquals(1, pep3Variants[2].size());
1850 assertEquals("T", pep3Variants[2].get(0).base); // codon[3] base
1851 assertNull(pep3Variants[2].get(0).variant); // no variant here
1854 * three variants on protein position 5
1856 List<DnaVariant>[] pep5Variants = variantsMap.get(5);
1857 assertEquals(3, pep5Variants.length);
1858 assertEquals(1, pep5Variants[0].size());
1859 assertEquals("C", pep5Variants[0].get(0).base); // codon[1] base
1860 assertSame(sf5, pep5Variants[0].get(0).variant); // codon[1] variant
1861 assertEquals(1, pep5Variants[1].size());
1862 assertEquals("C", pep5Variants[1].get(0).base); // codon[2] base
1863 assertSame(sf6, pep5Variants[1].get(0).variant); // codon[2] variant
1864 assertEquals(1, pep5Variants[2].size());
1865 assertEquals("C", pep5Variants[2].get(0).base); // codon[3] base
1866 assertSame(sf7, pep5Variants[2].get(0).variant); // codon[3] variant
1870 * Tests for the method that computes all peptide variants given codon
1873 @Test(groups = "Functional")
1874 public void testComputePeptideVariants()
1877 * scenario: AAATTTCCC codes for KFP, with variants
1883 * CAC,CGC -> H,R (as one variant)
1885 SequenceI peptide = new Sequence("pep/10-12", "KFP");
1888 * two distinct variants for codon 1 position 1
1889 * second one has clinical significance
1891 SequenceFeature sf1 = new SequenceFeature("sequence_variant", "", 1, 1,
1893 sf1.setValue("alleles", "A,G"); // GAA -> E
1894 sf1.setValue("ID", "var1.125A>G");
1895 SequenceFeature sf2 = new SequenceFeature("sequence_variant", "", 1, 1,
1897 sf2.setValue("alleles", "A,C"); // CAA -> Q
1898 sf2.setValue("ID", "var2");
1899 sf2.setValue("clinical_significance", "Dodgy");
1900 SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 3, 3,
1902 sf3.setValue("alleles", "A,G"); // synonymous
1903 sf3.setValue("ID", "var3");
1904 sf3.setValue("clinical_significance", "None");
1905 SequenceFeature sf4 = new SequenceFeature("sequence_variant", "", 3, 3,
1907 sf4.setValue("alleles", "A,T"); // AAT -> N
1908 sf4.setValue("ID", "sequence_variant:var4"); // prefix gets stripped off
1909 sf4.setValue("clinical_significance", "Benign");
1910 SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 6, 6,
1912 sf5.setValue("alleles", "T,C"); // synonymous
1913 sf5.setValue("ID", "var5");
1914 sf5.setValue("clinical_significance", "Bad");
1915 SequenceFeature sf6 = new SequenceFeature("sequence_variant", "", 8, 8,
1917 sf6.setValue("alleles", "C,A,G"); // CAC,CGC -> H,R
1918 sf6.setValue("ID", "var6");
1919 sf6.setValue("clinical_significance", "Good");
1921 List<DnaVariant> codon1Variants = new ArrayList<DnaVariant>();
1922 List<DnaVariant> codon2Variants = new ArrayList<DnaVariant>();
1923 List<DnaVariant> codon3Variants = new ArrayList<DnaVariant>();
1924 List<DnaVariant> codonVariants[] = new ArrayList[3];
1925 codonVariants[0] = codon1Variants;
1926 codonVariants[1] = codon2Variants;
1927 codonVariants[2] = codon3Variants;
1930 * compute variants for protein position 1
1932 codon1Variants.add(new DnaVariant("A", sf1));
1933 codon1Variants.add(new DnaVariant("A", sf2));
1934 codon2Variants.add(new DnaVariant("A"));
1935 codon2Variants.add(new DnaVariant("A"));
1936 codon3Variants.add(new DnaVariant("A", sf3));
1937 codon3Variants.add(new DnaVariant("A", sf4));
1938 AlignmentUtils.computePeptideVariants(peptide, 1, codonVariants);
1941 * compute variants for protein position 2
1943 codon1Variants.clear();
1944 codon2Variants.clear();
1945 codon3Variants.clear();
1946 codon1Variants.add(new DnaVariant("T"));
1947 codon2Variants.add(new DnaVariant("T"));
1948 codon3Variants.add(new DnaVariant("T", sf5));
1949 AlignmentUtils.computePeptideVariants(peptide, 2, codonVariants);
1952 * compute variants for protein position 3
1954 codon1Variants.clear();
1955 codon2Variants.clear();
1956 codon3Variants.clear();
1957 codon1Variants.add(new DnaVariant("C"));
1958 codon2Variants.add(new DnaVariant("C", sf6));
1959 codon3Variants.add(new DnaVariant("C"));
1960 AlignmentUtils.computePeptideVariants(peptide, 3, codonVariants);
1963 * verify added sequence features for
1970 SequenceFeature[] sfs = peptide.getSequenceFeatures();
1971 assertEquals(5, sfs.length);
1972 SequenceFeature sf = sfs[0];
1973 assertEquals(1, sf.getBegin());
1974 assertEquals(1, sf.getEnd());
1975 assertEquals("p.Lys1Glu", sf.getDescription());
1976 assertEquals("var1.125A>G", sf.getValue("ID"));
1977 assertNull(sf.getValue("clinical_significance"));
1978 assertEquals("ID=var1.125A>G", sf.getAttributes());
1979 assertEquals(1, sf.links.size());
1980 // link to variation is urlencoded
1982 "p.Lys1Glu var1.125A>G|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var1.125A%3EG",
1984 assertEquals("Jalview", sf.getFeatureGroup());
1986 assertEquals(1, sf.getBegin());
1987 assertEquals(1, sf.getEnd());
1988 assertEquals("p.Lys1Gln", sf.getDescription());
1989 assertEquals("var2", sf.getValue("ID"));
1990 assertEquals("Dodgy", sf.getValue("clinical_significance"));
1991 assertEquals("ID=var2;clinical_significance=Dodgy", sf.getAttributes());
1992 assertEquals(1, sf.links.size());
1994 "p.Lys1Gln var2|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var2",
1996 assertEquals("Jalview", sf.getFeatureGroup());
1998 assertEquals(1, sf.getBegin());
1999 assertEquals(1, sf.getEnd());
2000 assertEquals("p.Lys1Asn", sf.getDescription());
2001 assertEquals("var4", sf.getValue("ID"));
2002 assertEquals("Benign", sf.getValue("clinical_significance"));
2003 assertEquals("ID=var4;clinical_significance=Benign", sf.getAttributes());
2004 assertEquals(1, sf.links.size());
2006 "p.Lys1Asn var4|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var4",
2008 assertEquals("Jalview", sf.getFeatureGroup());
2010 assertEquals(3, sf.getBegin());
2011 assertEquals(3, sf.getEnd());
2012 assertEquals("p.Pro3His", sf.getDescription());
2013 assertEquals("var6", sf.getValue("ID"));
2014 assertEquals("Good", sf.getValue("clinical_significance"));
2015 assertEquals("ID=var6;clinical_significance=Good", sf.getAttributes());
2016 assertEquals(1, sf.links.size());
2018 "p.Pro3His var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6",
2020 // var5 generates two distinct protein variant features
2021 assertEquals("Jalview", sf.getFeatureGroup());
2023 assertEquals(3, sf.getBegin());
2024 assertEquals(3, sf.getEnd());
2025 assertEquals("p.Pro3Arg", sf.getDescription());
2026 assertEquals("var6", sf.getValue("ID"));
2027 assertEquals("Good", sf.getValue("clinical_significance"));
2028 assertEquals("ID=var6;clinical_significance=Good", sf.getAttributes());
2029 assertEquals(1, sf.links.size());
2031 "p.Pro3Arg var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6",
2033 assertEquals("Jalview", sf.getFeatureGroup());
2037 * Tests for the method that maps the subset of a dna sequence that has CDS
2038 * (or subtype) feature, with CDS strand = '-' (reverse)
2040 // test turned off as currently findCdsPositions is not strand-dependent
2041 // left in case it comes around again...
2042 @Test(groups = "Functional", enabled = false)
2043 public void testFindCdsPositions_reverseStrand()
2045 SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
2046 dnaSeq.createDatasetSequence();
2047 SequenceI ds = dnaSeq.getDatasetSequence();
2050 SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
2052 ds.addSequenceFeature(sf);
2053 // exon feature should be ignored here
2054 sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
2055 ds.addSequenceFeature(sf);
2056 // CDS for dna 10-12
2057 sf = new SequenceFeature("CDS_predicted", "", 10, 12, 0f, null);
2059 ds.addSequenceFeature(sf);
2061 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
2063 * verify ranges { [12-10], [6-4] }
2065 assertEquals(6, MappingUtils.getLength(ranges));
2066 assertEquals(2, ranges.size());
2067 assertEquals(12, ranges.get(0)[0]);
2068 assertEquals(10, ranges.get(0)[1]);
2069 assertEquals(6, ranges.get(1)[0]);
2070 assertEquals(4, ranges.get(1)[1]);
2074 * Tests for the method that maps the subset of a dna sequence that has CDS
2075 * (or subtype) feature - reverse strand case where the start codon is
2078 @Test(groups = "Functional", enabled = false)
2079 // test turned off as currently findCdsPositions is not strand-dependent
2080 // left in case it comes around again...
2081 public void testFindCdsPositions_reverseStrandThreePrimeIncomplete()
2083 SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
2084 dnaSeq.createDatasetSequence();
2085 SequenceI ds = dnaSeq.getDatasetSequence();
2088 SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
2090 ds.addSequenceFeature(sf);
2091 // CDS for dna 13-15
2092 sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
2094 sf.setPhase("2"); // skip 2 bases to start of next codon
2095 ds.addSequenceFeature(sf);
2097 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
2100 * check the mapping starts with the first complete codon
2101 * expect ranges [13, 13], [9, 5]
2103 assertEquals(6, MappingUtils.getLength(ranges));
2104 assertEquals(2, ranges.size());
2105 assertEquals(13, ranges.get(0)[0]);
2106 assertEquals(13, ranges.get(0)[1]);
2107 assertEquals(9, ranges.get(1)[0]);
2108 assertEquals(5, ranges.get(1)[1]);
2111 @Test(groups = "Functional")
2112 public void testAlignAs_alternateTranscriptsUngapped()
2114 SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa");
2115 SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA");
2116 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
2117 ((Alignment) dna).createDatasetAlignment();
2118 SequenceI cds1 = new Sequence("cds1", "GGGTTT");
2119 SequenceI cds2 = new Sequence("cds2", "CCCAAA");
2120 AlignmentI cds = new Alignment(new SequenceI[] { cds1, cds2 });
2121 ((Alignment) cds).createDatasetAlignment();
2123 AlignedCodonFrame acf = new AlignedCodonFrame();
2124 MapList map = new MapList(new int[] { 4, 9 }, new int[] { 1, 6 }, 1, 1);
2125 acf.addMap(dna1.getDatasetSequence(), cds1.getDatasetSequence(), map);
2126 map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 6 }, 1, 1);
2127 acf.addMap(dna2.getDatasetSequence(), cds2.getDatasetSequence(), map);
2130 * verify CDS alignment is as:
2131 * cccGGGTTTaaa (cdna)
2132 * CCCgggtttAAA (cdna)
2134 * ---GGGTTT--- (cds)
2135 * CCC------AAA (cds)
2137 dna.addCodonFrame(acf);
2138 AlignmentUtils.alignAs(cds, dna);
2139 assertEquals("---GGGTTT", cds.getSequenceAt(0).getSequenceAsString());
2140 assertEquals("CCC------AAA", cds.getSequenceAt(1).getSequenceAsString());
2143 @Test(groups = { "Functional" })
2144 public void testAddMappedPositions()
2146 SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g");
2147 SequenceI seq1 = new Sequence("cds", "AAATTT");
2148 from.createDatasetSequence();
2149 seq1.createDatasetSequence();
2150 Mapping mapping = new Mapping(seq1, new MapList(
2151 new int[] { 3, 6, 9, 10 },
2152 new int[] { 1, 6 }, 1, 1));
2153 Map<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
2154 AlignmentUtils.addMappedPositions(seq1, from, mapping, map);
2157 * verify map has seq1 residues in columns 3,4,6,7,11,12
2159 assertEquals(6, map.size());
2160 assertEquals('A', map.get(3).get(seq1).charValue());
2161 assertEquals('A', map.get(4).get(seq1).charValue());
2162 assertEquals('A', map.get(6).get(seq1).charValue());
2163 assertEquals('T', map.get(7).get(seq1).charValue());
2164 assertEquals('T', map.get(11).get(seq1).charValue());
2165 assertEquals('T', map.get(12).get(seq1).charValue());
2173 * Test case where the mapping 'from' range includes a stop codon which is
2174 * absent in the 'to' range
2176 @Test(groups = { "Functional" })
2177 public void testAddMappedPositions_withStopCodon()
2179 SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g");
2180 SequenceI seq1 = new Sequence("cds", "AAATTT");
2181 from.createDatasetSequence();
2182 seq1.createDatasetSequence();
2183 Mapping mapping = new Mapping(seq1, new MapList(
2184 new int[] { 3, 6, 9, 10 },
2185 new int[] { 1, 6 }, 1, 1));
2186 Map<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
2187 AlignmentUtils.addMappedPositions(seq1, from, mapping, map);
2190 * verify map has seq1 residues in columns 3,4,6,7,11,12
2192 assertEquals(6, map.size());
2193 assertEquals('A', map.get(3).get(seq1).charValue());
2194 assertEquals('A', map.get(4).get(seq1).charValue());
2195 assertEquals('A', map.get(6).get(seq1).charValue());
2196 assertEquals('T', map.get(7).get(seq1).charValue());
2197 assertEquals('T', map.get(11).get(seq1).charValue());
2198 assertEquals('T', map.get(12).get(seq1).charValue());