2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import static org.testng.AssertJUnit.assertEquals;
24 import static org.testng.AssertJUnit.assertFalse;
25 import static org.testng.AssertJUnit.assertNull;
26 import static org.testng.AssertJUnit.assertSame;
27 import static org.testng.AssertJUnit.assertTrue;
29 import jalview.analysis.AlignmentUtils.DnaVariant;
30 import jalview.datamodel.AlignedCodonFrame;
31 import jalview.datamodel.Alignment;
32 import jalview.datamodel.AlignmentAnnotation;
33 import jalview.datamodel.AlignmentI;
34 import jalview.datamodel.Annotation;
35 import jalview.datamodel.DBRefEntry;
36 import jalview.datamodel.Mapping;
37 import jalview.datamodel.SearchResults;
38 import jalview.datamodel.SearchResults.Match;
39 import jalview.datamodel.Sequence;
40 import jalview.datamodel.SequenceFeature;
41 import jalview.datamodel.SequenceI;
42 import jalview.io.AppletFormatAdapter;
43 import jalview.io.FormatAdapter;
44 import jalview.util.MapList;
45 import jalview.util.MappingUtils;
47 import java.io.IOException;
48 import java.util.ArrayList;
49 import java.util.Arrays;
50 import java.util.LinkedHashMap;
51 import java.util.List;
53 import java.util.TreeMap;
55 import org.testng.annotations.Test;
57 public class AlignmentUtilsTests
59 public static Sequence ts = new Sequence("short",
60 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklm");
62 @Test(groups = { "Functional" })
63 public void testExpandContext()
65 AlignmentI al = new Alignment(new Sequence[] {});
66 for (int i = 4; i < 14; i += 2)
68 SequenceI s1 = ts.deriveSequence().getSubSequence(i, i + 7);
71 System.out.println(new AppletFormatAdapter().formatSequences("Clustal",
73 for (int flnk = -1; flnk < 25; flnk++)
75 AlignmentI exp = AlignmentUtils.expandContext(al, flnk);
76 System.out.println("\nFlank size: " + flnk);
77 System.out.println(new AppletFormatAdapter().formatSequences(
78 "Clustal", exp, true));
82 * Full expansion to complete sequences
84 for (SequenceI sq : exp.getSequences())
86 String ung = sq.getSequenceAsString().replaceAll("-+", "");
87 final String errorMsg = "Flanking sequence not the same as original dataset sequence.\n"
90 + sq.getDatasetSequence().getSequenceAsString();
91 assertTrue(errorMsg, ung.equalsIgnoreCase(sq.getDatasetSequence()
92 .getSequenceAsString()));
98 * Last sequence is fully expanded, others have leading gaps to match
100 assertTrue(exp.getSequenceAt(4).getSequenceAsString()
102 assertTrue(exp.getSequenceAt(3).getSequenceAsString()
103 .startsWith("--abc"));
104 assertTrue(exp.getSequenceAt(2).getSequenceAsString()
105 .startsWith("----abc"));
106 assertTrue(exp.getSequenceAt(1).getSequenceAsString()
107 .startsWith("------abc"));
108 assertTrue(exp.getSequenceAt(0).getSequenceAsString()
109 .startsWith("--------abc"));
115 * Test that annotations are correctly adjusted by expandContext
117 @Test(groups = { "Functional" })
118 public void testExpandContext_annotation()
120 AlignmentI al = new Alignment(new Sequence[] {});
121 SequenceI ds = new Sequence("Seq1", "ABCDEFGHI");
123 SequenceI seq1 = ds.deriveSequence().getSubSequence(3, 6);
124 al.addSequence(seq1);
127 * Annotate DEF with 4/5/6 respectively
129 Annotation[] anns = new Annotation[] { new Annotation(4),
130 new Annotation(5), new Annotation(6) };
131 AlignmentAnnotation ann = new AlignmentAnnotation("SS",
132 "secondary structure", anns);
133 seq1.addAlignmentAnnotation(ann);
136 * The annotations array should match aligned positions
138 assertEquals(3, ann.annotations.length);
139 assertEquals(4, ann.annotations[0].value, 0.001);
140 assertEquals(5, ann.annotations[1].value, 0.001);
141 assertEquals(6, ann.annotations[2].value, 0.001);
144 * Check annotation to sequence position mappings before expanding the
145 * sequence; these are set up in Sequence.addAlignmentAnnotation ->
146 * Annotation.setSequenceRef -> createSequenceMappings
148 assertNull(ann.getAnnotationForPosition(1));
149 assertNull(ann.getAnnotationForPosition(2));
150 assertNull(ann.getAnnotationForPosition(3));
151 assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
152 assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
153 assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
154 assertNull(ann.getAnnotationForPosition(7));
155 assertNull(ann.getAnnotationForPosition(8));
156 assertNull(ann.getAnnotationForPosition(9));
159 * Expand the subsequence to the full sequence abcDEFghi
161 AlignmentI expanded = AlignmentUtils.expandContext(al, -1);
162 assertEquals("abcDEFghi", expanded.getSequenceAt(0)
163 .getSequenceAsString());
166 * Confirm the alignment and sequence have the same SS annotation,
167 * referencing the expanded sequence
169 ann = expanded.getSequenceAt(0).getAnnotation()[0];
170 assertSame(ann, expanded.getAlignmentAnnotation()[0]);
171 assertSame(expanded.getSequenceAt(0), ann.sequenceRef);
174 * The annotations array should have null values except for annotated
177 assertNull(ann.annotations[0]);
178 assertNull(ann.annotations[1]);
179 assertNull(ann.annotations[2]);
180 assertEquals(4, ann.annotations[3].value, 0.001);
181 assertEquals(5, ann.annotations[4].value, 0.001);
182 assertEquals(6, ann.annotations[5].value, 0.001);
183 assertNull(ann.annotations[6]);
184 assertNull(ann.annotations[7]);
185 assertNull(ann.annotations[8]);
188 * sequence position mappings should be unchanged
190 assertNull(ann.getAnnotationForPosition(1));
191 assertNull(ann.getAnnotationForPosition(2));
192 assertNull(ann.getAnnotationForPosition(3));
193 assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
194 assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
195 assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
196 assertNull(ann.getAnnotationForPosition(7));
197 assertNull(ann.getAnnotationForPosition(8));
198 assertNull(ann.getAnnotationForPosition(9));
202 * Test method that returns a map of lists of sequences by sequence name.
204 * @throws IOException
206 @Test(groups = { "Functional" })
207 public void testGetSequencesByName() throws IOException
209 final String data = ">Seq1Name\nKQYL\n" + ">Seq2Name\nRFPW\n"
210 + ">Seq1Name\nABCD\n";
211 AlignmentI al = loadAlignment(data, "FASTA");
212 Map<String, List<SequenceI>> map = AlignmentUtils
213 .getSequencesByName(al);
214 assertEquals(2, map.keySet().size());
215 assertEquals(2, map.get("Seq1Name").size());
216 assertEquals("KQYL", map.get("Seq1Name").get(0).getSequenceAsString());
217 assertEquals("ABCD", map.get("Seq1Name").get(1).getSequenceAsString());
218 assertEquals(1, map.get("Seq2Name").size());
219 assertEquals("RFPW", map.get("Seq2Name").get(0).getSequenceAsString());
223 * Helper method to load an alignment and ensure dataset sequences are set up.
229 * @throws IOException
231 protected AlignmentI loadAlignment(final String data, String format)
234 AlignmentI a = new FormatAdapter().readFile(data,
235 AppletFormatAdapter.PASTE, format);
241 * Test mapping of protein to cDNA, for the case where we have no sequence
242 * cross-references, so mappings are made first-served 1-1 where sequences
245 * @throws IOException
247 @Test(groups = { "Functional" })
248 public void testMapProteinAlignmentToCdna_noXrefs() throws IOException
250 List<SequenceI> protseqs = new ArrayList<SequenceI>();
251 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
252 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
253 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
254 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
255 protein.setDataset(null);
257 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
258 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
259 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAA")); // = EIQ
260 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
261 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
262 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
263 cdna.setDataset(null);
265 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
267 // 3 mappings made, each from 1 to 1 sequence
268 assertEquals(3, protein.getCodonFrames().size());
269 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
270 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
271 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
273 // V12345 mapped to A22222
274 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
276 assertEquals(1, acf.getdnaSeqs().length);
277 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
278 acf.getdnaSeqs()[0]);
279 Mapping[] protMappings = acf.getProtMappings();
280 assertEquals(1, protMappings.length);
281 MapList mapList = protMappings[0].getMap();
282 assertEquals(3, mapList.getFromRatio());
283 assertEquals(1, mapList.getToRatio());
284 assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
286 assertEquals(1, mapList.getFromRanges().size());
287 assertTrue(Arrays.equals(new int[] { 1, 3 },
288 mapList.getToRanges().get(0)));
289 assertEquals(1, mapList.getToRanges().size());
291 // V12346 mapped to A33333
292 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
293 assertEquals(1, acf.getdnaSeqs().length);
294 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
295 acf.getdnaSeqs()[0]);
297 // V12347 mapped to A11111
298 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
299 assertEquals(1, acf.getdnaSeqs().length);
300 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
301 acf.getdnaSeqs()[0]);
303 // no mapping involving the 'extra' A44444
304 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
308 * Test for the alignSequenceAs method that takes two sequences and a mapping.
310 @Test(groups = { "Functional" })
311 public void testAlignSequenceAs_withMapping_noIntrons()
313 MapList map = new MapList(new int[] { 1, 6 }, new int[] { 1, 2 }, 3, 1);
316 * No existing gaps in dna:
318 checkAlignSequenceAs("GGGAAA", "-A-L-", false, false, map,
322 * Now introduce gaps in dna but ignore them when realigning.
324 checkAlignSequenceAs("-G-G-G-A-A-A-", "-A-L-", false, false, map,
328 * Now include gaps in dna when realigning. First retaining 'mapped' gaps
329 * only, i.e. those within the exon region.
331 checkAlignSequenceAs("-G-G--G-A--A-A-", "-A-L-", true, false, map,
332 "---G-G--G---A--A-A");
335 * Include all gaps in dna when realigning (within and without the exon
336 * region). The leading gap, and the gaps between codons, are subsumed by
337 * the protein alignment gap.
339 checkAlignSequenceAs("-G-GG--AA-A---", "-A-L-", true, true, map,
340 "---G-GG---AA-A---");
343 * Include only unmapped gaps in dna when realigning (outside the exon
344 * region). The leading gap, and the gaps between codons, are subsumed by
345 * the protein alignment gap.
347 checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", false, true, map,
352 * Test for the alignSequenceAs method that takes two sequences and a mapping.
354 @Test(groups = { "Functional" })
355 public void testAlignSequenceAs_withMapping_withIntrons()
358 * Exons at codon 2 (AAA) and 4 (TTT)
360 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
361 new int[] { 1, 2 }, 3, 1);
364 * Simple case: no gaps in dna
366 checkAlignSequenceAs("GGGAAACCCTTTGGG", "--A-L-", false, false, map,
367 "GGG---AAACCCTTTGGG");
370 * Add gaps to dna - but ignore when realigning.
372 checkAlignSequenceAs("-G-G-G--A--A---AC-CC-T-TT-GG-G-", "--A-L-",
373 false, false, map, "GGG---AAACCCTTTGGG");
376 * Add gaps to dna - include within exons only when realigning.
378 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
379 true, false, map, "GGG---A--A---ACCCT-TTGGG");
382 * Include gaps outside exons only when realigning.
384 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
385 false, true, map, "-G-G-GAAAC-CCTTT-GG-G-");
388 * Include gaps following first intron if we are 'preserving mapped gaps'
390 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
391 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
394 * Include all gaps in dna when realigning.
396 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
397 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
401 * Test for the case where not all of the protein sequence is mapped to cDNA.
403 @Test(groups = { "Functional" })
404 public void testAlignSequenceAs_withMapping_withUnmappedProtein()
407 * Exons at codon 2 (AAA) and 4 (TTT) mapped to A and P
409 final MapList map = new MapList(new int[] { 4, 6, 10, 12 }, new int[] {
413 * -L- 'aligns' ccc------
415 checkAlignSequenceAs("gggAAAcccTTTggg", "-A-L-P-", false, false, map,
416 "gggAAAccc------TTTggg");
420 * Helper method that performs and verifies the method under test.
423 * the sequence to be realigned
425 * the sequence whose alignment is to be copied
426 * @param preserveMappedGaps
427 * @param preserveUnmappedGaps
431 protected void checkAlignSequenceAs(final String alignee,
432 final String alignModel, final boolean preserveMappedGaps,
433 final boolean preserveUnmappedGaps, MapList map,
434 final String expected)
436 SequenceI alignMe = new Sequence("Seq1", alignee);
437 alignMe.createDatasetSequence();
438 SequenceI alignFrom = new Sequence("Seq2", alignModel);
439 alignFrom.createDatasetSequence();
440 AlignedCodonFrame acf = new AlignedCodonFrame();
441 acf.addMap(alignMe.getDatasetSequence(), alignFrom.getDatasetSequence(), map);
443 AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "---", '-',
444 preserveMappedGaps, preserveUnmappedGaps);
445 assertEquals(expected, alignMe.getSequenceAsString());
449 * Test for the alignSequenceAs method where we preserve gaps in introns only.
451 @Test(groups = { "Functional" })
452 public void testAlignSequenceAs_keepIntronGapsOnly()
456 * Intron GGGAAA followed by exon CCCTTT
458 MapList map = new MapList(new int[] { 7, 12 }, new int[] { 1, 2 }, 3, 1);
460 checkAlignSequenceAs("GG-G-AA-A-C-CC-T-TT", "AL", false, true, map,
465 * Test the method that realigns protein to match mapped codon alignment.
467 @Test(groups = { "Functional" })
468 public void testAlignProteinAsDna()
470 // seq1 codons are [1,2,3] [4,5,6] [7,8,9] [10,11,12]
471 SequenceI dna1 = new Sequence("Seq1", "TGCCATTACCAG-");
472 // seq2 codons are [1,3,4] [5,6,7] [8,9,10] [11,12,13]
473 SequenceI dna2 = new Sequence("Seq2", "T-GCCATTACCAG");
474 // seq3 codons are [1,2,3] [4,5,7] [8,9,10] [11,12,13]
475 SequenceI dna3 = new Sequence("Seq3", "TGCCA-TTACCAG");
476 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
477 dna.setDataset(null);
479 // protein alignment will be realigned like dna
480 SequenceI prot1 = new Sequence("Seq1", "CHYQ");
481 SequenceI prot2 = new Sequence("Seq2", "CHYQ");
482 SequenceI prot3 = new Sequence("Seq3", "CHYQ");
483 SequenceI prot4 = new Sequence("Seq4", "R-QSV"); // unmapped, unchanged
484 AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
486 protein.setDataset(null);
488 MapList map = new MapList(new int[] { 1, 12 }, new int[] { 1, 4 }, 3, 1);
489 AlignedCodonFrame acf = new AlignedCodonFrame();
490 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
491 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
492 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
493 ArrayList<AlignedCodonFrame> acfs = new ArrayList<AlignedCodonFrame>();
495 protein.setCodonFrames(acfs);
498 * Translated codon order is [1,2,3] [1,3,4] [4,5,6] [4,5,7] [5,6,7] [7,8,9]
499 * [8,9,10] [10,11,12] [11,12,13]
501 AlignmentUtils.alignProteinAsDna(protein, dna);
502 assertEquals("C-H--Y-Q-", prot1.getSequenceAsString());
503 assertEquals("-C--H-Y-Q", prot2.getSequenceAsString());
504 assertEquals("C--H--Y-Q", prot3.getSequenceAsString());
505 assertEquals("R-QSV", prot4.getSequenceAsString());
509 * Test the method that tests whether a CDNA sequence translates to a protein
512 @Test(groups = { "Functional" })
513 public void testTranslatesAs()
515 // null arguments check
516 assertFalse(AlignmentUtils.translatesAs(null, 0, null));
517 assertFalse(AlignmentUtils.translatesAs(new char[] { 't' }, 0, null));
518 assertFalse(AlignmentUtils.translatesAs(null, 0, new char[] { 'a' }));
520 // straight translation
521 assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0,
522 "FPKG".toCharArray()));
523 // with extra start codon (not in protein)
524 assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(),
525 3, "FPKG".toCharArray()));
526 // with stop codon1 (not in protein)
527 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
528 0, "FPKG".toCharArray()));
529 // with stop codon1 (in protein as *)
530 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
531 0, "FPKG*".toCharArray()));
532 // with stop codon2 (not in protein)
533 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtag".toCharArray(),
534 0, "FPKG".toCharArray()));
535 // with stop codon3 (not in protein)
536 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtga".toCharArray(),
537 0, "FPKG".toCharArray()));
538 // with start and stop codon1
539 assertTrue(AlignmentUtils.translatesAs(
540 "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG".toCharArray()));
541 // with start and stop codon1 (in protein as *)
542 assertTrue(AlignmentUtils.translatesAs(
543 "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG*".toCharArray()));
544 // with start and stop codon2
545 assertTrue(AlignmentUtils.translatesAs(
546 "atgtttcccaaagggtag".toCharArray(), 3, "FPKG".toCharArray()));
547 // with start and stop codon3
548 assertTrue(AlignmentUtils.translatesAs(
549 "atgtttcccaaagggtga".toCharArray(), 3, "FPKG".toCharArray()));
551 // with embedded stop codons
552 assertTrue(AlignmentUtils.translatesAs(
553 "atgtttTAGcccaaaTAAgggtga".toCharArray(), 3,
554 "F*PK*G".toCharArray()));
557 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
558 0, "FPMG".toCharArray()));
561 assertFalse(AlignmentUtils.translatesAs("tttcccaaagg".toCharArray(), 0,
562 "FPKG".toCharArray()));
565 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
566 0, "FPK".toCharArray()));
568 // overlong dna (doesn't end in stop codon)
569 assertFalse(AlignmentUtils.translatesAs(
570 "tttcccaaagggttt".toCharArray(), 0, "FPKG".toCharArray()));
572 // dna + stop codon + more
573 assertFalse(AlignmentUtils.translatesAs(
574 "tttcccaaagggttaga".toCharArray(), 0, "FPKG".toCharArray()));
577 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
578 0, "FPKGQ".toCharArray()));
582 * Test mapping of protein to cDNA, for cases where the cDNA has start and/or
583 * stop codons in addition to the protein coding sequence.
585 * @throws IOException
587 @Test(groups = { "Functional" })
588 public void testMapProteinAlignmentToCdna_withStartAndStopCodons()
591 List<SequenceI> protseqs = new ArrayList<SequenceI>();
592 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
593 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
594 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
595 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
596 protein.setDataset(null);
598 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
600 dnaseqs.add(new Sequence("EMBL|A11111", "ATGTCAGCACGC"));
602 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAATAA"));
603 // = start +EIQ + stop
604 dnaseqs.add(new Sequence("EMBL|A33333", "ATGGAAATCCAGTAG"));
605 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG"));
606 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
607 cdna.setDataset(null);
609 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
611 // 3 mappings made, each from 1 to 1 sequence
612 assertEquals(3, protein.getCodonFrames().size());
613 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
614 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
615 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
617 // V12345 mapped from A22222
618 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
620 assertEquals(1, acf.getdnaSeqs().length);
621 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
622 acf.getdnaSeqs()[0]);
623 Mapping[] protMappings = acf.getProtMappings();
624 assertEquals(1, protMappings.length);
625 MapList mapList = protMappings[0].getMap();
626 assertEquals(3, mapList.getFromRatio());
627 assertEquals(1, mapList.getToRatio());
628 assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
630 assertEquals(1, mapList.getFromRanges().size());
631 assertTrue(Arrays.equals(new int[] { 1, 3 },
632 mapList.getToRanges().get(0)));
633 assertEquals(1, mapList.getToRanges().size());
635 // V12346 mapped from A33333 starting position 4
636 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
637 assertEquals(1, acf.getdnaSeqs().length);
638 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
639 acf.getdnaSeqs()[0]);
640 protMappings = acf.getProtMappings();
641 assertEquals(1, protMappings.length);
642 mapList = protMappings[0].getMap();
643 assertEquals(3, mapList.getFromRatio());
644 assertEquals(1, mapList.getToRatio());
645 assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
647 assertEquals(1, mapList.getFromRanges().size());
648 assertTrue(Arrays.equals(new int[] { 1, 3 },
649 mapList.getToRanges().get(0)));
650 assertEquals(1, mapList.getToRanges().size());
652 // V12347 mapped to A11111 starting position 4
653 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
654 assertEquals(1, acf.getdnaSeqs().length);
655 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
656 acf.getdnaSeqs()[0]);
657 protMappings = acf.getProtMappings();
658 assertEquals(1, protMappings.length);
659 mapList = protMappings[0].getMap();
660 assertEquals(3, mapList.getFromRatio());
661 assertEquals(1, mapList.getToRatio());
662 assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
664 assertEquals(1, mapList.getFromRanges().size());
665 assertTrue(Arrays.equals(new int[] { 1, 3 },
666 mapList.getToRanges().get(0)));
667 assertEquals(1, mapList.getToRanges().size());
669 // no mapping involving the 'extra' A44444
670 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
674 * Test mapping of protein to cDNA, for the case where we have some sequence
675 * cross-references. Verify that 1-to-many mappings are made where
676 * cross-references exist and sequences are mappable.
678 * @throws IOException
680 @Test(groups = { "Functional" })
681 public void testMapProteinAlignmentToCdna_withXrefs() throws IOException
683 List<SequenceI> protseqs = new ArrayList<SequenceI>();
684 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
685 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
686 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
687 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
688 protein.setDataset(null);
690 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
691 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
692 dnaseqs.add(new Sequence("EMBL|A22222", "ATGGAGATACAA")); // = start + EIQ
693 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
694 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
695 dnaseqs.add(new Sequence("EMBL|A55555", "GAGATTCAG")); // = EIQ
696 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[5]));
697 cdna.setDataset(null);
699 // Xref A22222 to V12345 (should get mapped)
700 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
701 // Xref V12345 to A44444 (should get mapped)
702 protseqs.get(0).addDBRef(new DBRefEntry("EMBL", "1", "A44444"));
703 // Xref A33333 to V12347 (sequence mismatch - should not get mapped)
704 dnaseqs.get(2).addDBRef(new DBRefEntry("UNIPROT", "1", "V12347"));
705 // as V12345 is mapped to A22222 and A44444, this leaves V12346 unmapped.
706 // it should get paired up with the unmapped A33333
707 // A11111 should be mapped to V12347
708 // A55555 is spare and has no xref so is not mapped
710 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
712 // 4 protein mappings made for 3 proteins, 2 to V12345, 1 each to V12346/7
713 assertEquals(3, protein.getCodonFrames().size());
714 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
715 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
716 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
718 // one mapping for each of the first 4 cDNA sequences
719 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
720 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
721 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(2)).size());
722 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(3)).size());
724 // V12345 mapped to A22222 and A44444
725 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
727 assertEquals(2, acf.getdnaSeqs().length);
728 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
729 acf.getdnaSeqs()[0]);
730 assertEquals(cdna.getSequenceAt(3).getDatasetSequence(),
731 acf.getdnaSeqs()[1]);
733 // V12346 mapped to A33333
734 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
735 assertEquals(1, acf.getdnaSeqs().length);
736 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
737 acf.getdnaSeqs()[0]);
739 // V12347 mapped to A11111
740 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
741 assertEquals(1, acf.getdnaSeqs().length);
742 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
743 acf.getdnaSeqs()[0]);
745 // no mapping involving the 'extra' A55555
746 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(4)).isEmpty());
750 * Test mapping of protein to cDNA, for the case where we have some sequence
751 * cross-references. Verify that once we have made an xref mapping we don't
752 * also map un-xrefd sequeces.
754 * @throws IOException
756 @Test(groups = { "Functional" })
757 public void testMapProteinAlignmentToCdna_prioritiseXrefs()
760 List<SequenceI> protseqs = new ArrayList<SequenceI>();
761 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
762 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
763 AlignmentI protein = new Alignment(
764 protseqs.toArray(new SequenceI[protseqs.size()]));
765 protein.setDataset(null);
767 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
768 dnaseqs.add(new Sequence("EMBL|A11111", "GAAATCCAG")); // = EIQ
769 dnaseqs.add(new Sequence("EMBL|A22222", "GAAATTCAG")); // = EIQ
770 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[dnaseqs
772 cdna.setDataset(null);
774 // Xref A22222 to V12345 (should get mapped)
775 // A11111 should then be mapped to the unmapped V12346
776 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
778 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
780 // 2 protein mappings made
781 assertEquals(2, protein.getCodonFrames().size());
782 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
783 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
785 // one mapping for each of the cDNA sequences
786 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
787 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
789 // V12345 mapped to A22222
790 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
792 assertEquals(1, acf.getdnaSeqs().length);
793 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
794 acf.getdnaSeqs()[0]);
796 // V12346 mapped to A11111
797 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
798 assertEquals(1, acf.getdnaSeqs().length);
799 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
800 acf.getdnaSeqs()[0]);
804 * Test the method that shows or hides sequence annotations by type(s) and
807 @Test(groups = { "Functional" })
808 public void testShowOrHideSequenceAnnotations()
810 SequenceI seq1 = new Sequence("Seq1", "AAA");
811 SequenceI seq2 = new Sequence("Seq2", "BBB");
812 SequenceI seq3 = new Sequence("Seq3", "CCC");
813 Annotation[] anns = new Annotation[] { new Annotation(2f) };
814 AlignmentAnnotation ann1 = new AlignmentAnnotation("Structure", "ann1",
816 ann1.setSequenceRef(seq1);
817 AlignmentAnnotation ann2 = new AlignmentAnnotation("Structure", "ann2",
819 ann2.setSequenceRef(seq2);
820 AlignmentAnnotation ann3 = new AlignmentAnnotation("Structure", "ann3",
822 AlignmentAnnotation ann4 = new AlignmentAnnotation("Temp", "ann4", anns);
823 ann4.setSequenceRef(seq1);
824 AlignmentAnnotation ann5 = new AlignmentAnnotation("Temp", "ann5", anns);
825 ann5.setSequenceRef(seq2);
826 AlignmentAnnotation ann6 = new AlignmentAnnotation("Temp", "ann6", anns);
827 AlignmentI al = new Alignment(new SequenceI[] { seq1, seq2, seq3 });
828 al.addAnnotation(ann1); // Structure for Seq1
829 al.addAnnotation(ann2); // Structure for Seq2
830 al.addAnnotation(ann3); // Structure for no sequence
831 al.addAnnotation(ann4); // Temp for seq1
832 al.addAnnotation(ann5); // Temp for seq2
833 al.addAnnotation(ann6); // Temp for no sequence
834 List<String> types = new ArrayList<String>();
835 List<SequenceI> scope = new ArrayList<SequenceI>();
838 * Set all sequence related Structure to hidden (ann1, ann2)
840 types.add("Structure");
841 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
843 assertFalse(ann1.visible);
844 assertFalse(ann2.visible);
845 assertTrue(ann3.visible); // not sequence-related, not affected
846 assertTrue(ann4.visible); // not Structure, not affected
847 assertTrue(ann5.visible); // "
848 assertTrue(ann6.visible); // not sequence-related, not affected
851 * Set Temp in {seq1, seq3} to hidden
857 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, false,
859 assertFalse(ann1.visible); // unchanged
860 assertFalse(ann2.visible); // unchanged
861 assertTrue(ann3.visible); // not sequence-related, not affected
862 assertFalse(ann4.visible); // Temp for seq1 hidden
863 assertTrue(ann5.visible); // not in scope, not affected
864 assertTrue(ann6.visible); // not sequence-related, not affected
867 * Set Temp in all sequences to hidden
873 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
875 assertFalse(ann1.visible); // unchanged
876 assertFalse(ann2.visible); // unchanged
877 assertTrue(ann3.visible); // not sequence-related, not affected
878 assertFalse(ann4.visible); // Temp for seq1 hidden
879 assertFalse(ann5.visible); // Temp for seq2 hidden
880 assertTrue(ann6.visible); // not sequence-related, not affected
883 * Set all types in {seq1, seq3} to visible
889 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, true,
891 assertTrue(ann1.visible); // Structure for seq1 set visible
892 assertFalse(ann2.visible); // not in scope, unchanged
893 assertTrue(ann3.visible); // not sequence-related, not affected
894 assertTrue(ann4.visible); // Temp for seq1 set visible
895 assertFalse(ann5.visible); // not in scope, unchanged
896 assertTrue(ann6.visible); // not sequence-related, not affected
899 * Set all types in all scope to hidden
901 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, true,
903 assertFalse(ann1.visible);
904 assertFalse(ann2.visible);
905 assertTrue(ann3.visible); // not sequence-related, not affected
906 assertFalse(ann4.visible);
907 assertFalse(ann5.visible);
908 assertTrue(ann6.visible); // not sequence-related, not affected
912 * Tests for the method that checks if one sequence cross-references another
914 @Test(groups = { "Functional" })
915 public void testHasCrossRef()
917 assertFalse(AlignmentUtils.hasCrossRef(null, null));
918 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
919 assertFalse(AlignmentUtils.hasCrossRef(seq1, null));
920 assertFalse(AlignmentUtils.hasCrossRef(null, seq1));
921 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
922 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
925 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20193"));
926 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
928 // case-insensitive; version number is ignored
929 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20192"));
930 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
933 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
934 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
935 // test is one-way only
936 assertFalse(AlignmentUtils.hasCrossRef(seq2, seq1));
940 * Tests for the method that checks if either sequence cross-references the
943 @Test(groups = { "Functional" })
944 public void testHaveCrossRef()
946 assertFalse(AlignmentUtils.hasCrossRef(null, null));
947 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
948 assertFalse(AlignmentUtils.haveCrossRef(seq1, null));
949 assertFalse(AlignmentUtils.haveCrossRef(null, seq1));
950 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
951 assertFalse(AlignmentUtils.haveCrossRef(seq1, seq2));
953 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
954 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
955 // next is true for haveCrossRef, false for hasCrossRef
956 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
958 // now the other way round
959 seq1.setDBRefs(null);
960 seq2.addDBRef(new DBRefEntry("EMBL", "1", "A12345"));
961 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
962 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
965 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
966 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
967 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
971 * Test the method that extracts the cds-only part of a dna alignment.
973 @Test(groups = { "Functional" })
974 public void testMakeCdsAlignment()
976 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
977 SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC");
978 SequenceI pep1 = new Sequence("pep1", "GF");
979 SequenceI pep2 = new Sequence("pep2", "GFP");
980 dna1.createDatasetSequence();
981 dna2.createDatasetSequence();
982 pep1.createDatasetSequence();
983 pep2.createDatasetSequence();
984 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 6, 0f,
986 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 10, 12, 0f,
988 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds3", 1, 3, 0f,
990 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds4", 7, 9, 0f,
992 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds5", 13, 15, 0f,
994 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
995 dna.setDataset(null);
997 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
998 new int[] { 1, 2 }, 3, 1);
999 AlignedCodonFrame acf = new AlignedCodonFrame();
1000 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1001 dna.addCodonFrame(acf);
1002 map = new MapList(new int[] { 1, 3, 7, 9, 13, 15 }, new int[] { 1, 3 },
1004 acf = new AlignedCodonFrame();
1005 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
1006 dna.addCodonFrame(acf);
1009 * execute method under test:
1011 AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
1012 dna1, dna2 }, dna.getDataset());
1014 assertEquals(2, cds.getSequences().size());
1015 assertEquals("GGGTTT", cds.getSequenceAt(0).getSequenceAsString());
1016 assertEquals("GGGTTTCCC", cds.getSequenceAt(1).getSequenceAsString());
1019 * verify shared, extended alignment dataset
1021 assertSame(dna.getDataset(), cds.getDataset());
1022 assertTrue(dna.getDataset().getSequences()
1023 .contains(cds.getSequenceAt(0).getDatasetSequence()));
1024 assertTrue(dna.getDataset().getSequences()
1025 .contains(cds.getSequenceAt(1).getDatasetSequence()));
1028 * Verify mappings from CDS to peptide, cDNA to CDS, and cDNA to peptide
1029 * the mappings are on the shared alignment dataset
1031 List<AlignedCodonFrame> cdsMappings = cds.getDataset().getCodonFrames();
1033 * 6 mappings, 2*(DNA->CDS), 2*(DNA->Pep), 2*(CDS->Pep)
1035 assertEquals(6, cdsMappings.size());
1038 * verify that mapping sets for dna and cds alignments are different
1039 * [not current behaviour - all mappings are on the alignment dataset]
1041 // select -> subselect type to test.
1042 // Assert.assertNotSame(dna.getCodonFrames(), cds.getCodonFrames());
1043 // assertEquals(4, dna.getCodonFrames().size());
1044 // assertEquals(4, cds.getCodonFrames().size());
1047 * Two mappings involve pep1 (dna to pep1, cds to pep1)
1048 * Mapping from pep1 to GGGTTT in first new exon sequence
1050 List<AlignedCodonFrame> pep1Mappings = MappingUtils
1051 .findMappingsForSequence(pep1, cdsMappings);
1052 assertEquals(2, pep1Mappings.size());
1053 List<AlignedCodonFrame> mappings = MappingUtils
1054 .findMappingsForSequence(cds.getSequenceAt(0), pep1Mappings);
1055 assertEquals(1, mappings.size());
1058 SearchResults sr = MappingUtils.buildSearchResults(pep1, 1, mappings);
1059 assertEquals(1, sr.getResults().size());
1060 Match m = sr.getResults().get(0);
1061 assertSame(cds.getSequenceAt(0).getDatasetSequence(), m.getSequence());
1062 assertEquals(1, m.getStart());
1063 assertEquals(3, m.getEnd());
1065 sr = MappingUtils.buildSearchResults(pep1, 2, mappings);
1066 m = sr.getResults().get(0);
1067 assertSame(cds.getSequenceAt(0).getDatasetSequence(), m.getSequence());
1068 assertEquals(4, m.getStart());
1069 assertEquals(6, m.getEnd());
1072 * Two mappings involve pep2 (dna to pep2, cds to pep2)
1073 * Verify mapping from pep2 to GGGTTTCCC in second new exon sequence
1075 List<AlignedCodonFrame> pep2Mappings = MappingUtils
1076 .findMappingsForSequence(pep2, cdsMappings);
1077 assertEquals(2, pep2Mappings.size());
1078 mappings = MappingUtils.findMappingsForSequence(cds.getSequenceAt(1),
1080 assertEquals(1, mappings.size());
1082 sr = MappingUtils.buildSearchResults(pep2, 1, mappings);
1083 assertEquals(1, sr.getResults().size());
1084 m = sr.getResults().get(0);
1085 assertSame(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence());
1086 assertEquals(1, m.getStart());
1087 assertEquals(3, m.getEnd());
1089 sr = MappingUtils.buildSearchResults(pep2, 2, mappings);
1090 m = sr.getResults().get(0);
1091 assertSame(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence());
1092 assertEquals(4, m.getStart());
1093 assertEquals(6, m.getEnd());
1095 sr = MappingUtils.buildSearchResults(pep2, 3, mappings);
1096 m = sr.getResults().get(0);
1097 assertSame(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence());
1098 assertEquals(7, m.getStart());
1099 assertEquals(9, m.getEnd());
1103 * Test the method that makes a cds-only alignment from a DNA sequence and its
1104 * product mappings, for the case where there are multiple exon mappings to
1105 * different protein products.
1107 @Test(groups = { "Functional" })
1108 public void testMakeCdsAlignment_multipleProteins()
1110 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
1111 SequenceI pep1 = new Sequence("pep1", "GF"); // GGGTTT
1112 SequenceI pep2 = new Sequence("pep2", "KP"); // aaaccc
1113 SequenceI pep3 = new Sequence("pep3", "KF"); // aaaTTT
1114 dna1.createDatasetSequence();
1115 pep1.createDatasetSequence();
1116 pep2.createDatasetSequence();
1117 pep3.createDatasetSequence();
1118 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 6, 0f,
1120 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 10, 12, 0f,
1122 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds3", 1, 3, 0f,
1124 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds4", 7, 9, 0f,
1126 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds5", 1, 3, 0f,
1128 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds6", 10, 12, 0f,
1130 pep1.getDatasetSequence().addDBRef(
1131 new DBRefEntry("EMBLCDS", "2", "A12345"));
1132 pep2.getDatasetSequence().addDBRef(
1133 new DBRefEntry("EMBLCDS", "3", "A12346"));
1134 pep3.getDatasetSequence().addDBRef(
1135 new DBRefEntry("EMBLCDS", "4", "A12347"));
1138 * Create the CDS alignment
1140 AlignmentI dna = new Alignment(new SequenceI[] { dna1 });
1141 dna.setDataset(null);
1144 * Make the mappings from dna to protein
1146 // map ...GGG...TTT to GF
1147 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1148 new int[] { 1, 2 }, 3, 1);
1149 AlignedCodonFrame acf = new AlignedCodonFrame();
1150 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1151 dna.addCodonFrame(acf);
1153 // map aaa...ccc to KP
1154 map = new MapList(new int[] { 1, 3, 7, 9 }, new int[] { 1, 2 }, 3, 1);
1155 acf = new AlignedCodonFrame();
1156 acf.addMap(dna1.getDatasetSequence(), pep2.getDatasetSequence(), map);
1157 dna.addCodonFrame(acf);
1159 // map aaa......TTT to KF
1160 map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 2 }, 3, 1);
1161 acf = new AlignedCodonFrame();
1162 acf.addMap(dna1.getDatasetSequence(), pep3.getDatasetSequence(), map);
1163 dna.addCodonFrame(acf);
1166 * execute method under test
1168 AlignmentI cdsal = AlignmentUtils.makeCdsAlignment(
1169 new SequenceI[] { dna1 }, dna.getDataset());
1172 * Verify we have 3 cds sequences, mapped to pep1/2/3 respectively
1174 List<SequenceI> cds = cdsal.getSequences();
1175 assertEquals(3, cds.size());
1178 * verify shared, extended alignment dataset
1180 assertSame(cdsal.getDataset(), dna.getDataset());
1181 assertTrue(dna.getDataset().getSequences()
1182 .contains(cds.get(0).getDatasetSequence()));
1183 assertTrue(dna.getDataset().getSequences()
1184 .contains(cds.get(1).getDatasetSequence()));
1185 assertTrue(dna.getDataset().getSequences()
1186 .contains(cds.get(2).getDatasetSequence()));
1189 * verify aligned cds sequences and their xrefs
1191 SequenceI cdsSeq = cds.get(0);
1192 assertEquals("GGGTTT", cdsSeq.getSequenceAsString());
1193 // assertEquals("dna1|A12345", cdsSeq.getName());
1194 assertEquals("dna1|pep1", cdsSeq.getName());
1195 // assertEquals(1, cdsSeq.getDBRefs().length);
1196 // DBRefEntry cdsRef = cdsSeq.getDBRefs()[0];
1197 // assertEquals("EMBLCDS", cdsRef.getSource());
1198 // assertEquals("2", cdsRef.getVersion());
1199 // assertEquals("A12345", cdsRef.getAccessionId());
1201 cdsSeq = cds.get(1);
1202 assertEquals("aaaccc", cdsSeq.getSequenceAsString());
1203 // assertEquals("dna1|A12346", cdsSeq.getName());
1204 assertEquals("dna1|pep2", cdsSeq.getName());
1205 // assertEquals(1, cdsSeq.getDBRefs().length);
1206 // cdsRef = cdsSeq.getDBRefs()[0];
1207 // assertEquals("EMBLCDS", cdsRef.getSource());
1208 // assertEquals("3", cdsRef.getVersion());
1209 // assertEquals("A12346", cdsRef.getAccessionId());
1211 cdsSeq = cds.get(2);
1212 assertEquals("aaaTTT", cdsSeq.getSequenceAsString());
1213 // assertEquals("dna1|A12347", cdsSeq.getName());
1214 assertEquals("dna1|pep3", cdsSeq.getName());
1215 // assertEquals(1, cdsSeq.getDBRefs().length);
1216 // cdsRef = cdsSeq.getDBRefs()[0];
1217 // assertEquals("EMBLCDS", cdsRef.getSource());
1218 // assertEquals("4", cdsRef.getVersion());
1219 // assertEquals("A12347", cdsRef.getAccessionId());
1222 * Verify there are mappings from each cds sequence to its protein product
1223 * and also to its dna source
1225 List<AlignedCodonFrame> newMappings = cdsal.getCodonFrames();
1228 * 6 mappings involve dna1 (to pep1/2/3, cds1/2/3)
1230 List<AlignedCodonFrame> dnaMappings = MappingUtils
1231 .findMappingsForSequence(dna1, newMappings);
1232 assertEquals(6, dnaMappings.size());
1237 List<AlignedCodonFrame> mappings = MappingUtils
1238 .findMappingsForSequence(pep1, dnaMappings);
1239 assertEquals(1, mappings.size());
1240 assertEquals(1, mappings.get(0).getMappings().size());
1241 assertSame(pep1.getDatasetSequence(), mappings.get(0).getMappings()
1242 .get(0).getMapping().getTo());
1247 List<AlignedCodonFrame> dnaToCds1Mappings = MappingUtils
1248 .findMappingsForSequence(cds.get(0), dnaMappings);
1249 Mapping mapping = dnaToCds1Mappings.get(0).getMappings().get(0)
1251 assertSame(cds.get(0).getDatasetSequence(), mapping
1253 assertEquals("G(1) in CDS should map to G(4) in DNA", 4, mapping
1254 .getMap().getToPosition(1));
1259 mappings = MappingUtils.findMappingsForSequence(pep2, dnaMappings);
1260 assertEquals(1, mappings.size());
1261 assertEquals(1, mappings.get(0).getMappings().size());
1262 assertSame(pep2.getDatasetSequence(), mappings.get(0).getMappings()
1263 .get(0).getMapping().getTo());
1268 List<AlignedCodonFrame> dnaToCds2Mappings = MappingUtils
1269 .findMappingsForSequence(cds.get(1), dnaMappings);
1270 mapping = dnaToCds2Mappings.get(0).getMappings().get(0).getMapping();
1271 assertSame(cds.get(1).getDatasetSequence(), mapping.getTo());
1272 assertEquals("c(4) in CDS should map to c(7) in DNA", 7, mapping
1273 .getMap().getToPosition(4));
1278 mappings = MappingUtils.findMappingsForSequence(pep3, dnaMappings);
1279 assertEquals(1, mappings.size());
1280 assertEquals(1, mappings.get(0).getMappings().size());
1281 assertSame(pep3.getDatasetSequence(), mappings.get(0).getMappings()
1282 .get(0).getMapping().getTo());
1287 List<AlignedCodonFrame> dnaToCds3Mappings = MappingUtils
1288 .findMappingsForSequence(cds.get(2), dnaMappings);
1289 mapping = dnaToCds3Mappings.get(0).getMappings().get(0).getMapping();
1290 assertSame(cds.get(2).getDatasetSequence(), mapping.getTo());
1291 assertEquals("T(4) in CDS should map to T(10) in DNA", 10, mapping
1292 .getMap().getToPosition(4));
1295 @Test(groups = { "Functional" })
1296 public void testIsMappable()
1298 SequenceI dna1 = new Sequence("dna1", "cgCAGtgGT");
1299 SequenceI aa1 = new Sequence("aa1", "RSG");
1300 AlignmentI al1 = new Alignment(new SequenceI[] { dna1 });
1301 AlignmentI al2 = new Alignment(new SequenceI[] { aa1 });
1303 assertFalse(AlignmentUtils.isMappable(null, null));
1304 assertFalse(AlignmentUtils.isMappable(al1, null));
1305 assertFalse(AlignmentUtils.isMappable(null, al1));
1306 assertFalse(AlignmentUtils.isMappable(al1, al1));
1307 assertFalse(AlignmentUtils.isMappable(al2, al2));
1309 assertTrue(AlignmentUtils.isMappable(al1, al2));
1310 assertTrue(AlignmentUtils.isMappable(al2, al1));
1314 * Test creating a mapping when the sequences involved do not start at residue
1317 * @throws IOException
1319 @Test(groups = { "Functional" })
1320 public void testMapCdnaToProtein_forSubsequence()
1323 SequenceI prot = new Sequence("UNIPROT|V12345", "E-I--Q", 10, 12);
1324 prot.createDatasetSequence();
1326 SequenceI dna = new Sequence("EMBL|A33333", "GAA--AT-C-CAG", 40, 48);
1327 dna.createDatasetSequence();
1329 MapList map = AlignmentUtils.mapCdnaToProtein(prot, dna);
1330 assertEquals(10, map.getToLowest());
1331 assertEquals(12, map.getToHighest());
1332 assertEquals(40, map.getFromLowest());
1333 assertEquals(48, map.getFromHighest());
1337 * Test for the alignSequenceAs method where we have protein mapped to protein
1339 @Test(groups = { "Functional" })
1340 public void testAlignSequenceAs_mappedProteinProtein()
1343 SequenceI alignMe = new Sequence("Match", "MGAASEV");
1344 alignMe.createDatasetSequence();
1345 SequenceI alignFrom = new Sequence("Query", "LQTGYMGAASEVMFSPTRR");
1346 alignFrom.createDatasetSequence();
1348 AlignedCodonFrame acf = new AlignedCodonFrame();
1349 // this is like a domain or motif match of part of a peptide sequence
1350 MapList map = new MapList(new int[] { 6, 12 }, new int[] { 1, 7 }, 1, 1);
1351 acf.addMap(alignFrom.getDatasetSequence(),
1352 alignMe.getDatasetSequence(), map);
1354 AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "-", '-', true,
1356 assertEquals("-----MGAASEV-------", alignMe.getSequenceAsString());
1360 * Test for the alignSequenceAs method where there are trailing unmapped
1361 * residues in the model sequence
1363 @Test(groups = { "Functional" })
1364 public void testAlignSequenceAs_withTrailingPeptide()
1366 // map first 3 codons to KPF; G is a trailing unmapped residue
1367 MapList map = new MapList(new int[] { 1, 9 }, new int[] { 1, 3 }, 3, 1);
1369 checkAlignSequenceAs("AAACCCTTT", "K-PFG", true, true, map,
1374 * Tests for transferring features between mapped sequences
1376 @Test(groups = { "Functional" })
1377 public void testTransferFeatures()
1379 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1380 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1383 dna.addSequenceFeature(new SequenceFeature("type1", "desc1", 1, 2, 1f,
1385 // partial overlap - to [1, 1]
1386 dna.addSequenceFeature(new SequenceFeature("type2", "desc2", 3, 4, 2f,
1388 // exact overlap - to [1, 3]
1389 dna.addSequenceFeature(new SequenceFeature("type3", "desc3", 4, 6, 3f,
1391 // spanning overlap - to [2, 5]
1392 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1394 // exactly overlaps whole mapped range [1, 6]
1395 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1397 // no overlap (internal)
1398 dna.addSequenceFeature(new SequenceFeature("type6", "desc6", 7, 9, 6f,
1400 // no overlap (3' end)
1401 dna.addSequenceFeature(new SequenceFeature("type7", "desc7", 13, 15,
1403 // overlap (3' end) - to [6, 6]
1404 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1406 // extended overlap - to [6, +]
1407 dna.addSequenceFeature(new SequenceFeature("type9", "desc9", 12, 13,
1410 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1411 new int[] { 1, 6 }, 1, 1);
1414 * transferFeatures() will build 'partial overlap' for regions
1415 * that partially overlap 5' or 3' (start or end) of target sequence
1417 AlignmentUtils.transferFeatures(dna, cds, map, null);
1418 SequenceFeature[] sfs = cds.getSequenceFeatures();
1419 assertEquals(6, sfs.length);
1421 SequenceFeature sf = sfs[0];
1422 assertEquals("type2", sf.getType());
1423 assertEquals("desc2", sf.getDescription());
1424 assertEquals(2f, sf.getScore());
1425 assertEquals(1, sf.getBegin());
1426 assertEquals(1, sf.getEnd());
1429 assertEquals("type3", sf.getType());
1430 assertEquals("desc3", sf.getDescription());
1431 assertEquals(3f, sf.getScore());
1432 assertEquals(1, sf.getBegin());
1433 assertEquals(3, sf.getEnd());
1436 assertEquals("type4", sf.getType());
1437 assertEquals(2, sf.getBegin());
1438 assertEquals(5, sf.getEnd());
1441 assertEquals("type5", sf.getType());
1442 assertEquals(1, sf.getBegin());
1443 assertEquals(6, sf.getEnd());
1446 assertEquals("type8", sf.getType());
1447 assertEquals(6, sf.getBegin());
1448 assertEquals(6, sf.getEnd());
1451 assertEquals("type9", sf.getType());
1452 assertEquals(6, sf.getBegin());
1453 assertEquals(6, sf.getEnd());
1457 * Tests for transferring features between mapped sequences
1459 @Test(groups = { "Functional" })
1460 public void testTransferFeatures_withOmit()
1462 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1463 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1465 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1466 new int[] { 1, 6 }, 1, 1);
1468 // [5, 11] maps to [2, 5]
1469 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1471 // [4, 12] maps to [1, 6]
1472 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1474 // [12, 12] maps to [6, 6]
1475 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1478 // desc4 and desc8 are the 'omit these' varargs
1479 AlignmentUtils.transferFeatures(dna, cds, map, null, "type4", "type8");
1480 SequenceFeature[] sfs = cds.getSequenceFeatures();
1481 assertEquals(1, sfs.length);
1483 SequenceFeature sf = sfs[0];
1484 assertEquals("type5", sf.getType());
1485 assertEquals(1, sf.getBegin());
1486 assertEquals(6, sf.getEnd());
1490 * Tests for transferring features between mapped sequences
1492 @Test(groups = { "Functional" })
1493 public void testTransferFeatures_withSelect()
1495 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1496 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1498 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1499 new int[] { 1, 6 }, 1, 1);
1501 // [5, 11] maps to [2, 5]
1502 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1504 // [4, 12] maps to [1, 6]
1505 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1507 // [12, 12] maps to [6, 6]
1508 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1511 // "type5" is the 'select this type' argument
1512 AlignmentUtils.transferFeatures(dna, cds, map, "type5");
1513 SequenceFeature[] sfs = cds.getSequenceFeatures();
1514 assertEquals(1, sfs.length);
1516 SequenceFeature sf = sfs[0];
1517 assertEquals("type5", sf.getType());
1518 assertEquals(1, sf.getBegin());
1519 assertEquals(6, sf.getEnd());
1523 * Test the method that extracts the cds-only part of a dna alignment, for the
1524 * case where the cds should be aligned to match its nucleotide sequence.
1526 @Test(groups = { "Functional" })
1527 public void testMakeCdsAlignment_alternativeTranscripts()
1529 SequenceI dna1 = new Sequence("dna1", "aaaGGGCC-----CTTTaaaGGG");
1530 // alternative transcript of same dna skips CCC codon
1531 SequenceI dna2 = new Sequence("dna2", "aaaGGGCC-----cttTaaaGGG");
1532 // dna3 has no mapping (protein product) so should be ignored here
1533 SequenceI dna3 = new Sequence("dna3", "aaaGGGCCCCCGGGcttTaaaGGG");
1534 SequenceI pep1 = new Sequence("pep1", "GPFG");
1535 SequenceI pep2 = new Sequence("pep2", "GPG");
1536 dna1.createDatasetSequence();
1537 dna2.createDatasetSequence();
1538 dna3.createDatasetSequence();
1539 pep1.createDatasetSequence();
1540 pep2.createDatasetSequence();
1541 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 8, 0f,
1543 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 9, 12, 0f,
1545 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds3", 16, 18, 0f,
1547 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 4, 8, 0f,
1549 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 12, 12, 0f,
1551 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 16, 18, 0f,
1554 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
1555 dna.setDataset(null);
1557 MapList map = new MapList(new int[] { 4, 12, 16, 18 },
1558 new int[] { 1, 4 }, 3, 1);
1559 AlignedCodonFrame acf = new AlignedCodonFrame();
1560 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1561 dna.addCodonFrame(acf);
1562 map = new MapList(new int[] { 4, 8, 12, 12, 16, 18 },
1565 acf = new AlignedCodonFrame();
1566 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
1567 dna.addCodonFrame(acf);
1569 AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
1570 dna1, dna2, dna3 }, dna.getDataset());
1571 List<SequenceI> cdsSeqs = cds.getSequences();
1572 assertEquals(2, cdsSeqs.size());
1573 assertEquals("GGGCCCTTTGGG", cdsSeqs.get(0).getSequenceAsString());
1574 assertEquals("GGGCCTGGG", cdsSeqs.get(1).getSequenceAsString());
1577 * verify shared, extended alignment dataset
1579 assertSame(dna.getDataset(), cds.getDataset());
1580 assertTrue(dna.getDataset().getSequences()
1581 .contains(cdsSeqs.get(0).getDatasetSequence()));
1582 assertTrue(dna.getDataset().getSequences()
1583 .contains(cdsSeqs.get(1).getDatasetSequence()));
1586 * Verify 6 mappings: dna1 to cds1, cds1 to pep1, dna1 to pep1
1587 * and the same for dna2/cds2/pep2
1589 List<AlignedCodonFrame> mappings = cds.getCodonFrames();
1590 assertEquals(6, mappings.size());
1593 * 2 mappings involve pep1
1595 List<AlignedCodonFrame> pep1Mappings = MappingUtils
1596 .findMappingsForSequence(pep1, mappings);
1597 assertEquals(2, pep1Mappings.size());
1600 * Get mapping of pep1 to cds1 and verify it
1601 * maps GPFG to 1-3,4-6,7-9,10-12
1603 List<AlignedCodonFrame> pep1CdsMappings = MappingUtils
1604 .findMappingsForSequence(cds.getSequenceAt(0), pep1Mappings);
1605 assertEquals(1, pep1CdsMappings.size());
1606 SearchResults sr = MappingUtils.buildSearchResults(pep1, 1,
1608 assertEquals(1, sr.getResults().size());
1609 Match m = sr.getResults().get(0);
1610 assertEquals(cds.getSequenceAt(0).getDatasetSequence(),
1612 assertEquals(1, m.getStart());
1613 assertEquals(3, m.getEnd());
1614 sr = MappingUtils.buildSearchResults(pep1, 2, pep1CdsMappings);
1615 m = sr.getResults().get(0);
1616 assertEquals(4, m.getStart());
1617 assertEquals(6, m.getEnd());
1618 sr = MappingUtils.buildSearchResults(pep1, 3, pep1CdsMappings);
1619 m = sr.getResults().get(0);
1620 assertEquals(7, m.getStart());
1621 assertEquals(9, m.getEnd());
1622 sr = MappingUtils.buildSearchResults(pep1, 4, pep1CdsMappings);
1623 m = sr.getResults().get(0);
1624 assertEquals(10, m.getStart());
1625 assertEquals(12, m.getEnd());
1628 * Get mapping of pep2 to cds2 and verify it
1629 * maps GPG in pep2 to 1-3,4-6,7-9 in second CDS sequence
1631 List<AlignedCodonFrame> pep2Mappings = MappingUtils
1632 .findMappingsForSequence(pep2, mappings);
1633 assertEquals(2, pep2Mappings.size());
1634 List<AlignedCodonFrame> pep2CdsMappings = MappingUtils
1635 .findMappingsForSequence(cds.getSequenceAt(1), pep2Mappings);
1636 assertEquals(1, pep2CdsMappings.size());
1637 sr = MappingUtils.buildSearchResults(pep2, 1, pep2CdsMappings);
1638 assertEquals(1, sr.getResults().size());
1639 m = sr.getResults().get(0);
1640 assertEquals(cds.getSequenceAt(1).getDatasetSequence(),
1642 assertEquals(1, m.getStart());
1643 assertEquals(3, m.getEnd());
1644 sr = MappingUtils.buildSearchResults(pep2, 2, pep2CdsMappings);
1645 m = sr.getResults().get(0);
1646 assertEquals(4, m.getStart());
1647 assertEquals(6, m.getEnd());
1648 sr = MappingUtils.buildSearchResults(pep2, 3, pep2CdsMappings);
1649 m = sr.getResults().get(0);
1650 assertEquals(7, m.getStart());
1651 assertEquals(9, m.getEnd());
1655 * Test the method that realigns protein to match mapped codon alignment.
1657 @Test(groups = { "Functional" })
1658 public void testAlignProteinAsDna_incompleteStartCodon()
1660 // seq1: incomplete start codon (not mapped), then [3, 11]
1661 SequenceI dna1 = new Sequence("Seq1", "ccAAA-TTT-GGG-");
1662 // seq2 codons are [4, 5], [8, 11]
1663 SequenceI dna2 = new Sequence("Seq2", "ccaAA-ttT-GGG-");
1664 // seq3 incomplete start codon at 'tt'
1665 SequenceI dna3 = new Sequence("Seq3", "ccaaa-ttt-GGG-");
1666 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
1667 dna.setDataset(null);
1669 // prot1 has 'X' for incomplete start codon (not mapped)
1670 SequenceI prot1 = new Sequence("Seq1", "XKFG"); // X for incomplete start
1671 SequenceI prot2 = new Sequence("Seq2", "NG");
1672 SequenceI prot3 = new Sequence("Seq3", "XG"); // X for incomplete start
1673 AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
1675 protein.setDataset(null);
1677 // map dna1 [3, 11] to prot1 [2, 4] KFG
1678 MapList map = new MapList(new int[] { 3, 11 }, new int[] { 2, 4 }, 3, 1);
1679 AlignedCodonFrame acf = new AlignedCodonFrame();
1680 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
1682 // map dna2 [4, 5] [8, 11] to prot2 [1, 2] NG
1683 map = new MapList(new int[] { 4, 5, 8, 11 }, new int[] { 1, 2 }, 3, 1);
1684 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
1686 // map dna3 [9, 11] to prot3 [2, 2] G
1687 map = new MapList(new int[] { 9, 11 }, new int[] { 2, 2 }, 3, 1);
1688 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
1690 ArrayList<AlignedCodonFrame> acfs = new ArrayList<AlignedCodonFrame>();
1692 protein.setCodonFrames(acfs);
1695 * verify X is included in the aligned proteins, and placed just
1696 * before the first mapped residue
1697 * CCT is between CCC and TTT
1699 AlignmentUtils.alignProteinAsDna(protein, dna);
1700 assertEquals("XK-FG", prot1.getSequenceAsString());
1701 assertEquals("--N-G", prot2.getSequenceAsString());
1702 assertEquals("---XG", prot3.getSequenceAsString());
1706 * Tests for the method that maps the subset of a dna sequence that has CDS
1707 * (or subtype) feature - case where the start codon is incomplete.
1709 @Test(groups = "Functional")
1710 public void testFindCdsPositions_fivePrimeIncomplete()
1712 SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
1713 dnaSeq.createDatasetSequence();
1714 SequenceI ds = dnaSeq.getDatasetSequence();
1716 // CDS for dna 5-6 (incomplete codon), 7-9
1717 SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
1718 sf.setPhase("2"); // skip 2 bases to start of next codon
1719 ds.addSequenceFeature(sf);
1720 // CDS for dna 13-15
1721 sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
1722 ds.addSequenceFeature(sf);
1724 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1727 * check the mapping starts with the first complete codon
1729 assertEquals(6, MappingUtils.getLength(ranges));
1730 assertEquals(2, ranges.size());
1731 assertEquals(7, ranges.get(0)[0]);
1732 assertEquals(9, ranges.get(0)[1]);
1733 assertEquals(13, ranges.get(1)[0]);
1734 assertEquals(15, ranges.get(1)[1]);
1738 * Tests for the method that maps the subset of a dna sequence that has CDS
1739 * (or subtype) feature.
1741 @Test(groups = "Functional")
1742 public void testFindCdsPositions()
1744 SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
1745 dnaSeq.createDatasetSequence();
1746 SequenceI ds = dnaSeq.getDatasetSequence();
1748 // CDS for dna 10-12
1749 SequenceFeature sf = new SequenceFeature("CDS_predicted", "", 10, 12,
1752 ds.addSequenceFeature(sf);
1754 sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
1756 ds.addSequenceFeature(sf);
1757 // exon feature should be ignored here
1758 sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
1759 ds.addSequenceFeature(sf);
1761 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1763 * verify ranges { [4-6], [12-10] }
1764 * note CDS ranges are ordered ascending even if the CDS
1767 assertEquals(6, MappingUtils.getLength(ranges));
1768 assertEquals(2, ranges.size());
1769 assertEquals(4, ranges.get(0)[0]);
1770 assertEquals(6, ranges.get(0)[1]);
1771 assertEquals(10, ranges.get(1)[0]);
1772 assertEquals(12, ranges.get(1)[1]);
1776 * Test the method that computes a map of codon variants for each protein
1777 * position from "sequence_variant" features on dna
1779 @Test(groups = "Functional")
1780 public void testBuildDnaVariantsMap()
1782 SequenceI dna = new Sequence("dna", "atgAAATTTGGGCCCtag");
1783 MapList map = new MapList(new int[] { 1, 18 }, new int[] { 1, 5 }, 3, 1);
1786 * first with no variants on dna
1788 LinkedHashMap<Integer, List<DnaVariant>[]> variantsMap = AlignmentUtils
1789 .buildDnaVariantsMap(dna, map);
1790 assertTrue(variantsMap.isEmpty());
1793 * single allele codon 1, on base 1
1795 SequenceFeature sf1 = new SequenceFeature("sequence_variant", "", 1, 1,
1797 sf1.setValue("alleles", "T");
1798 sf1.setValue("ID", "sequence_variant:rs758803211");
1799 dna.addSequenceFeature(sf1);
1802 * two alleles codon 2, on bases 2 and 3 (distinct variants)
1804 SequenceFeature sf2 = new SequenceFeature("sequence_variant", "", 5, 5,
1806 sf2.setValue("alleles", "T");
1807 sf2.setValue("ID", "sequence_variant:rs758803212");
1808 dna.addSequenceFeature(sf2);
1809 SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 6, 6,
1811 sf3.setValue("alleles", "G");
1812 sf3.setValue("ID", "sequence_variant:rs758803213");
1813 dna.addSequenceFeature(sf3);
1816 * two alleles codon 3, both on base 2 (one variant)
1818 SequenceFeature sf4 = new SequenceFeature("sequence_variant", "", 8, 8,
1820 sf4.setValue("alleles", "C, G");
1821 sf4.setValue("ID", "sequence_variant:rs758803214");
1822 dna.addSequenceFeature(sf4);
1824 // no alleles on codon 4
1827 * alleles on codon 5 on all 3 bases (distinct variants)
1829 SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 13,
1831 sf5.setValue("alleles", "C, G"); // (C duplicates given base value)
1832 sf5.setValue("ID", "sequence_variant:rs758803215");
1833 dna.addSequenceFeature(sf5);
1834 SequenceFeature sf6 = new SequenceFeature("sequence_variant", "", 14,
1836 sf6.setValue("alleles", "g, a"); // should force to upper-case
1837 sf6.setValue("ID", "sequence_variant:rs758803216");
1838 dna.addSequenceFeature(sf6);
1839 SequenceFeature sf7 = new SequenceFeature("sequence_variant", "", 15,
1841 sf7.setValue("alleles", "A, T");
1842 sf7.setValue("ID", "sequence_variant:rs758803217");
1843 dna.addSequenceFeature(sf7);
1846 * build map - expect variants on positions 1, 2, 3, 5
1848 variantsMap = AlignmentUtils.buildDnaVariantsMap(dna, map);
1849 assertEquals(4, variantsMap.size());
1852 * protein residue 1: variant on codon (ATG) base 1, not on 2 or 3
1854 List<DnaVariant>[] pep1Variants = variantsMap.get(1);
1855 assertEquals(3, pep1Variants.length);
1856 assertEquals(1, pep1Variants[0].size());
1857 assertEquals("A", pep1Variants[0].get(0).base); // codon[1] base
1858 assertSame(sf1, pep1Variants[0].get(0).variant); // codon[1] variant
1859 assertEquals(1, pep1Variants[1].size());
1860 assertEquals("T", pep1Variants[1].get(0).base); // codon[2] base
1861 assertNull(pep1Variants[1].get(0).variant); // no variant here
1862 assertEquals(1, pep1Variants[2].size());
1863 assertEquals("G", pep1Variants[2].get(0).base); // codon[3] base
1864 assertNull(pep1Variants[2].get(0).variant); // no variant here
1867 * protein residue 2: variants on codon (AAA) bases 2 and 3
1869 List<DnaVariant>[] pep2Variants = variantsMap.get(2);
1870 assertEquals(3, pep2Variants.length);
1871 assertEquals(1, pep2Variants[0].size());
1872 // codon[1] base recorded while processing variant on codon[2]
1873 assertEquals("A", pep2Variants[0].get(0).base);
1874 assertNull(pep2Variants[0].get(0).variant); // no variant here
1875 // codon[2] base and variant:
1876 assertEquals(1, pep2Variants[1].size());
1877 assertEquals("A", pep2Variants[1].get(0).base);
1878 assertSame(sf2, pep2Variants[1].get(0).variant);
1879 // codon[3] base was recorded when processing codon[2] variant
1880 // and then the variant for codon[3] added to it
1881 assertEquals(1, pep2Variants[2].size());
1882 assertEquals("A", pep2Variants[2].get(0).base);
1883 assertSame(sf3, pep2Variants[2].get(0).variant);
1886 * protein residue 3: variants on codon (TTT) base 2 only
1888 List<DnaVariant>[] pep3Variants = variantsMap.get(3);
1889 assertEquals(3, pep3Variants.length);
1890 assertEquals(1, pep3Variants[0].size());
1891 assertEquals("T", pep3Variants[0].get(0).base); // codon[1] base
1892 assertNull(pep3Variants[0].get(0).variant); // no variant here
1893 assertEquals(1, pep3Variants[1].size());
1894 assertEquals("T", pep3Variants[1].get(0).base); // codon[2] base
1895 assertSame(sf4, pep3Variants[1].get(0).variant); // codon[2] variant
1896 assertEquals(1, pep3Variants[2].size());
1897 assertEquals("T", pep3Variants[2].get(0).base); // codon[3] base
1898 assertNull(pep3Variants[2].get(0).variant); // no variant here
1901 * three variants on protein position 5
1903 List<DnaVariant>[] pep5Variants = variantsMap.get(5);
1904 assertEquals(3, pep5Variants.length);
1905 assertEquals(1, pep5Variants[0].size());
1906 assertEquals("C", pep5Variants[0].get(0).base); // codon[1] base
1907 assertSame(sf5, pep5Variants[0].get(0).variant); // codon[1] variant
1908 assertEquals(1, pep5Variants[1].size());
1909 assertEquals("C", pep5Variants[1].get(0).base); // codon[2] base
1910 assertSame(sf6, pep5Variants[1].get(0).variant); // codon[2] variant
1911 assertEquals(1, pep5Variants[2].size());
1912 assertEquals("C", pep5Variants[2].get(0).base); // codon[3] base
1913 assertSame(sf7, pep5Variants[2].get(0).variant); // codon[3] variant
1917 * Tests for the method that computes all peptide variants given codon
1920 @Test(groups = "Functional")
1921 public void testComputePeptideVariants()
1924 * scenario: AAATTTCCC codes for KFP, with variants
1930 * CAC,CGC -> H,R (as one variant)
1932 SequenceI peptide = new Sequence("pep/10-12", "KFP");
1935 * two distinct variants for codon 1 position 1
1936 * second one has clinical significance
1938 SequenceFeature sf1 = new SequenceFeature("sequence_variant", "", 1, 1,
1940 sf1.setValue("alleles", "A,G"); // GAA -> E
1941 sf1.setValue("ID", "var1.125A>G");
1942 SequenceFeature sf2 = new SequenceFeature("sequence_variant", "", 1, 1,
1944 sf2.setValue("alleles", "A,C"); // CAA -> Q
1945 sf2.setValue("ID", "var2");
1946 sf2.setValue("clinical_significance", "Dodgy");
1947 SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 3, 3,
1949 sf3.setValue("alleles", "A,G"); // synonymous
1950 sf3.setValue("ID", "var3");
1951 sf3.setValue("clinical_significance", "None");
1952 SequenceFeature sf4 = new SequenceFeature("sequence_variant", "", 3, 3,
1954 sf4.setValue("alleles", "A,T"); // AAT -> N
1955 sf4.setValue("ID", "sequence_variant:var4"); // prefix gets stripped off
1956 sf4.setValue("clinical_significance", "Benign");
1957 SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 6, 6,
1959 sf5.setValue("alleles", "T,C"); // synonymous
1960 sf5.setValue("ID", "var5");
1961 sf5.setValue("clinical_significance", "Bad");
1962 SequenceFeature sf6 = new SequenceFeature("sequence_variant", "", 8, 8,
1964 sf6.setValue("alleles", "C,A,G"); // CAC,CGC -> H,R
1965 sf6.setValue("ID", "var6");
1966 sf6.setValue("clinical_significance", "Good");
1968 List<DnaVariant> codon1Variants = new ArrayList<DnaVariant>();
1969 List<DnaVariant> codon2Variants = new ArrayList<DnaVariant>();
1970 List<DnaVariant> codon3Variants = new ArrayList<DnaVariant>();
1971 List<DnaVariant> codonVariants[] = new ArrayList[3];
1972 codonVariants[0] = codon1Variants;
1973 codonVariants[1] = codon2Variants;
1974 codonVariants[2] = codon3Variants;
1977 * compute variants for protein position 1
1979 codon1Variants.add(new DnaVariant("A", sf1));
1980 codon1Variants.add(new DnaVariant("A", sf2));
1981 codon2Variants.add(new DnaVariant("A"));
1982 codon2Variants.add(new DnaVariant("A"));
1983 codon3Variants.add(new DnaVariant("A", sf3));
1984 codon3Variants.add(new DnaVariant("A", sf4));
1985 AlignmentUtils.computePeptideVariants(peptide, 1, codonVariants);
1988 * compute variants for protein position 2
1990 codon1Variants.clear();
1991 codon2Variants.clear();
1992 codon3Variants.clear();
1993 codon1Variants.add(new DnaVariant("T"));
1994 codon2Variants.add(new DnaVariant("T"));
1995 codon3Variants.add(new DnaVariant("T", sf5));
1996 AlignmentUtils.computePeptideVariants(peptide, 2, codonVariants);
1999 * compute variants for protein position 3
2001 codon1Variants.clear();
2002 codon2Variants.clear();
2003 codon3Variants.clear();
2004 codon1Variants.add(new DnaVariant("C"));
2005 codon2Variants.add(new DnaVariant("C", sf6));
2006 codon3Variants.add(new DnaVariant("C"));
2007 AlignmentUtils.computePeptideVariants(peptide, 3, codonVariants);
2010 * verify added sequence features for
2017 SequenceFeature[] sfs = peptide.getSequenceFeatures();
2018 assertEquals(5, sfs.length);
2019 SequenceFeature sf = sfs[0];
2020 assertEquals(1, sf.getBegin());
2021 assertEquals(1, sf.getEnd());
2022 assertEquals("p.Lys1Glu", sf.getDescription());
2023 assertEquals("var1.125A>G", sf.getValue("ID"));
2024 assertNull(sf.getValue("clinical_significance"));
2025 assertEquals("ID=var1.125A>G", sf.getAttributes());
2026 assertEquals(1, sf.links.size());
2027 // link to variation is urlencoded
2029 "p.Lys1Glu var1.125A>G|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var1.125A%3EG",
2031 assertEquals("Jalview", sf.getFeatureGroup());
2033 assertEquals(1, sf.getBegin());
2034 assertEquals(1, sf.getEnd());
2035 assertEquals("p.Lys1Gln", sf.getDescription());
2036 assertEquals("var2", sf.getValue("ID"));
2037 assertEquals("Dodgy", sf.getValue("clinical_significance"));
2038 assertEquals("ID=var2;clinical_significance=Dodgy", sf.getAttributes());
2039 assertEquals(1, sf.links.size());
2041 "p.Lys1Gln var2|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var2",
2043 assertEquals("Jalview", sf.getFeatureGroup());
2045 assertEquals(1, sf.getBegin());
2046 assertEquals(1, sf.getEnd());
2047 assertEquals("p.Lys1Asn", sf.getDescription());
2048 assertEquals("var4", sf.getValue("ID"));
2049 assertEquals("Benign", sf.getValue("clinical_significance"));
2050 assertEquals("ID=var4;clinical_significance=Benign", sf.getAttributes());
2051 assertEquals(1, sf.links.size());
2053 "p.Lys1Asn var4|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var4",
2055 assertEquals("Jalview", sf.getFeatureGroup());
2057 assertEquals(3, sf.getBegin());
2058 assertEquals(3, sf.getEnd());
2059 assertEquals("p.Pro3His", sf.getDescription());
2060 assertEquals("var6", sf.getValue("ID"));
2061 assertEquals("Good", sf.getValue("clinical_significance"));
2062 assertEquals("ID=var6;clinical_significance=Good", sf.getAttributes());
2063 assertEquals(1, sf.links.size());
2065 "p.Pro3His var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6",
2067 // var5 generates two distinct protein variant features
2068 assertEquals("Jalview", sf.getFeatureGroup());
2070 assertEquals(3, sf.getBegin());
2071 assertEquals(3, sf.getEnd());
2072 assertEquals("p.Pro3Arg", sf.getDescription());
2073 assertEquals("var6", sf.getValue("ID"));
2074 assertEquals("Good", sf.getValue("clinical_significance"));
2075 assertEquals("ID=var6;clinical_significance=Good", sf.getAttributes());
2076 assertEquals(1, sf.links.size());
2078 "p.Pro3Arg var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6",
2080 assertEquals("Jalview", sf.getFeatureGroup());
2084 * Tests for the method that maps the subset of a dna sequence that has CDS
2085 * (or subtype) feature, with CDS strand = '-' (reverse)
2087 // test turned off as currently findCdsPositions is not strand-dependent
2088 // left in case it comes around again...
2089 @Test(groups = "Functional", enabled = false)
2090 public void testFindCdsPositions_reverseStrand()
2092 SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
2093 dnaSeq.createDatasetSequence();
2094 SequenceI ds = dnaSeq.getDatasetSequence();
2097 SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
2099 ds.addSequenceFeature(sf);
2100 // exon feature should be ignored here
2101 sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
2102 ds.addSequenceFeature(sf);
2103 // CDS for dna 10-12
2104 sf = new SequenceFeature("CDS_predicted", "", 10, 12, 0f, null);
2106 ds.addSequenceFeature(sf);
2108 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
2110 * verify ranges { [12-10], [6-4] }
2112 assertEquals(6, MappingUtils.getLength(ranges));
2113 assertEquals(2, ranges.size());
2114 assertEquals(12, ranges.get(0)[0]);
2115 assertEquals(10, ranges.get(0)[1]);
2116 assertEquals(6, ranges.get(1)[0]);
2117 assertEquals(4, ranges.get(1)[1]);
2121 * Tests for the method that maps the subset of a dna sequence that has CDS
2122 * (or subtype) feature - reverse strand case where the start codon is
2125 @Test(groups = "Functional", enabled = false)
2126 // test turned off as currently findCdsPositions is not strand-dependent
2127 // left in case it comes around again...
2128 public void testFindCdsPositions_reverseStrandThreePrimeIncomplete()
2130 SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
2131 dnaSeq.createDatasetSequence();
2132 SequenceI ds = dnaSeq.getDatasetSequence();
2135 SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
2137 ds.addSequenceFeature(sf);
2138 // CDS for dna 13-15
2139 sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
2141 sf.setPhase("2"); // skip 2 bases to start of next codon
2142 ds.addSequenceFeature(sf);
2144 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
2147 * check the mapping starts with the first complete codon
2148 * expect ranges [13, 13], [9, 5]
2150 assertEquals(6, MappingUtils.getLength(ranges));
2151 assertEquals(2, ranges.size());
2152 assertEquals(13, ranges.get(0)[0]);
2153 assertEquals(13, ranges.get(0)[1]);
2154 assertEquals(9, ranges.get(1)[0]);
2155 assertEquals(5, ranges.get(1)[1]);
2158 @Test(groups = "Functional")
2159 public void testAlignAs_alternateTranscriptsUngapped()
2161 SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa");
2162 SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA");
2163 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
2164 ((Alignment) dna).createDatasetAlignment();
2165 SequenceI cds1 = new Sequence("cds1", "GGGTTT");
2166 SequenceI cds2 = new Sequence("cds2", "CCCAAA");
2167 AlignmentI cds = new Alignment(new SequenceI[] { cds1, cds2 });
2168 ((Alignment) cds).createDatasetAlignment();
2170 AlignedCodonFrame acf = new AlignedCodonFrame();
2171 MapList map = new MapList(new int[] { 4, 9 }, new int[] { 1, 6 }, 1, 1);
2172 acf.addMap(dna1.getDatasetSequence(), cds1.getDatasetSequence(), map);
2173 map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 6 }, 1, 1);
2174 acf.addMap(dna2.getDatasetSequence(), cds2.getDatasetSequence(), map);
2177 * verify CDS alignment is as:
2178 * cccGGGTTTaaa (cdna)
2179 * CCCgggtttAAA (cdna)
2181 * ---GGGTTT--- (cds)
2182 * CCC------AAA (cds)
2184 dna.addCodonFrame(acf);
2185 AlignmentUtils.alignAs(cds, dna);
2186 assertEquals("---GGGTTT", cds.getSequenceAt(0).getSequenceAsString());
2187 assertEquals("CCC------AAA", cds.getSequenceAt(1).getSequenceAsString());
2190 @Test(groups = { "Functional" })
2191 public void testAddMappedPositions()
2193 SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g");
2194 SequenceI seq1 = new Sequence("cds", "AAATTT");
2195 from.createDatasetSequence();
2196 seq1.createDatasetSequence();
2197 Mapping mapping = new Mapping(seq1, new MapList(
2198 new int[] { 3, 6, 9, 10 },
2199 new int[] { 1, 6 }, 1, 1));
2200 Map<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
2201 AlignmentUtils.addMappedPositions(seq1, from, mapping, map);
2204 * verify map has seq1 residues in columns 3,4,6,7,11,12
2206 assertEquals(6, map.size());
2207 assertEquals('A', map.get(3).get(seq1).charValue());
2208 assertEquals('A', map.get(4).get(seq1).charValue());
2209 assertEquals('A', map.get(6).get(seq1).charValue());
2210 assertEquals('T', map.get(7).get(seq1).charValue());
2211 assertEquals('T', map.get(11).get(seq1).charValue());
2212 assertEquals('T', map.get(12).get(seq1).charValue());
2220 * Test case where the mapping 'from' range includes a stop codon which is
2221 * absent in the 'to' range
2223 @Test(groups = { "Functional" })
2224 public void testAddMappedPositions_withStopCodon()
2226 SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g");
2227 SequenceI seq1 = new Sequence("cds", "AAATTT");
2228 from.createDatasetSequence();
2229 seq1.createDatasetSequence();
2230 Mapping mapping = new Mapping(seq1, new MapList(
2231 new int[] { 3, 6, 9, 10 },
2232 new int[] { 1, 6 }, 1, 1));
2233 Map<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
2234 AlignmentUtils.addMappedPositions(seq1, from, mapping, map);
2237 * verify map has seq1 residues in columns 3,4,6,7,11,12
2239 assertEquals(6, map.size());
2240 assertEquals('A', map.get(3).get(seq1).charValue());
2241 assertEquals('A', map.get(4).get(seq1).charValue());
2242 assertEquals('A', map.get(6).get(seq1).charValue());
2243 assertEquals('T', map.get(7).get(seq1).charValue());
2244 assertEquals('T', map.get(11).get(seq1).charValue());
2245 assertEquals('T', map.get(12).get(seq1).charValue());