/*
 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
 * Copyright (C) $$Year-Rel$$ The Jalview Authors
 * 
 * This file is part of Jalview.
 * 
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 * 
 * Jalview is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
 * The Jalview Authors are detailed in the 'AUTHORS' file.
 */
package jalview.analysis;

import static org.testng.AssertJUnit.assertEquals;
import static org.testng.AssertJUnit.assertFalse;
import static org.testng.AssertJUnit.assertNull;
import static org.testng.AssertJUnit.assertSame;
import static org.testng.AssertJUnit.assertTrue;

import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentAnnotation;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.Annotation;
import jalview.datamodel.DBRefEntry;
import jalview.datamodel.Mapping;
import jalview.datamodel.SearchResults;
import jalview.datamodel.SearchResults.Match;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.io.AppletFormatAdapter;
import jalview.io.FormatAdapter;
import jalview.util.MapList;
import jalview.util.MappingUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.testng.annotations.Test;
57 public class AlignmentUtilsTests
59 public static Sequence ts = new Sequence("short",
60 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklm");
62 @Test(groups = { "Functional" })
63 public void testExpandContext()
65 AlignmentI al = new Alignment(new Sequence[] {});
66 for (int i = 4; i < 14; i += 2)
68 SequenceI s1 = ts.deriveSequence().getSubSequence(i, i + 7);
71 System.out.println(new AppletFormatAdapter().formatSequences("Clustal",
73 for (int flnk = -1; flnk < 25; flnk++)
75 AlignmentI exp = AlignmentUtils.expandContext(al, flnk);
76 System.out.println("\nFlank size: " + flnk);
77 System.out.println(new AppletFormatAdapter().formatSequences(
78 "Clustal", exp, true));
82 * Full expansion to complete sequences
84 for (SequenceI sq : exp.getSequences())
86 String ung = sq.getSequenceAsString().replaceAll("-+", "");
87 final String errorMsg = "Flanking sequence not the same as original dataset sequence.\n"
90 + sq.getDatasetSequence().getSequenceAsString();
91 assertTrue(errorMsg, ung.equalsIgnoreCase(sq.getDatasetSequence()
92 .getSequenceAsString()));
98 * Last sequence is fully expanded, others have leading gaps to match
100 assertTrue(exp.getSequenceAt(4).getSequenceAsString()
102 assertTrue(exp.getSequenceAt(3).getSequenceAsString()
103 .startsWith("--abc"));
104 assertTrue(exp.getSequenceAt(2).getSequenceAsString()
105 .startsWith("----abc"));
106 assertTrue(exp.getSequenceAt(1).getSequenceAsString()
107 .startsWith("------abc"));
108 assertTrue(exp.getSequenceAt(0).getSequenceAsString()
109 .startsWith("--------abc"));
115 * Test that annotations are correctly adjusted by expandContext
117 @Test(groups = { "Functional" })
118 public void testExpandContext_annotation()
120 AlignmentI al = new Alignment(new Sequence[] {});
121 SequenceI ds = new Sequence("Seq1", "ABCDEFGHI");
123 SequenceI seq1 = ds.deriveSequence().getSubSequence(3, 6);
124 al.addSequence(seq1);
127 * Annotate DEF with 4/5/6 respectively
129 Annotation[] anns = new Annotation[] { new Annotation(4),
130 new Annotation(5), new Annotation(6) };
131 AlignmentAnnotation ann = new AlignmentAnnotation("SS",
132 "secondary structure", anns);
133 seq1.addAlignmentAnnotation(ann);
136 * The annotations array should match aligned positions
138 assertEquals(3, ann.annotations.length);
139 assertEquals(4, ann.annotations[0].value, 0.001);
140 assertEquals(5, ann.annotations[1].value, 0.001);
141 assertEquals(6, ann.annotations[2].value, 0.001);
144 * Check annotation to sequence position mappings before expanding the
145 * sequence; these are set up in Sequence.addAlignmentAnnotation ->
146 * Annotation.setSequenceRef -> createSequenceMappings
148 assertNull(ann.getAnnotationForPosition(1));
149 assertNull(ann.getAnnotationForPosition(2));
150 assertNull(ann.getAnnotationForPosition(3));
151 assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
152 assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
153 assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
154 assertNull(ann.getAnnotationForPosition(7));
155 assertNull(ann.getAnnotationForPosition(8));
156 assertNull(ann.getAnnotationForPosition(9));
159 * Expand the subsequence to the full sequence abcDEFghi
161 AlignmentI expanded = AlignmentUtils.expandContext(al, -1);
162 assertEquals("abcDEFghi", expanded.getSequenceAt(0)
163 .getSequenceAsString());
166 * Confirm the alignment and sequence have the same SS annotation,
167 * referencing the expanded sequence
169 ann = expanded.getSequenceAt(0).getAnnotation()[0];
170 assertSame(ann, expanded.getAlignmentAnnotation()[0]);
171 assertSame(expanded.getSequenceAt(0), ann.sequenceRef);
174 * The annotations array should have null values except for annotated
177 assertNull(ann.annotations[0]);
178 assertNull(ann.annotations[1]);
179 assertNull(ann.annotations[2]);
180 assertEquals(4, ann.annotations[3].value, 0.001);
181 assertEquals(5, ann.annotations[4].value, 0.001);
182 assertEquals(6, ann.annotations[5].value, 0.001);
183 assertNull(ann.annotations[6]);
184 assertNull(ann.annotations[7]);
185 assertNull(ann.annotations[8]);
188 * sequence position mappings should be unchanged
190 assertNull(ann.getAnnotationForPosition(1));
191 assertNull(ann.getAnnotationForPosition(2));
192 assertNull(ann.getAnnotationForPosition(3));
193 assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
194 assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
195 assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
196 assertNull(ann.getAnnotationForPosition(7));
197 assertNull(ann.getAnnotationForPosition(8));
198 assertNull(ann.getAnnotationForPosition(9));
202 * Test method that returns a map of lists of sequences by sequence name.
204 * @throws IOException
206 @Test(groups = { "Functional" })
207 public void testGetSequencesByName() throws IOException
209 final String data = ">Seq1Name\nKQYL\n" + ">Seq2Name\nRFPW\n"
210 + ">Seq1Name\nABCD\n";
211 AlignmentI al = loadAlignment(data, "FASTA");
212 Map<String, List<SequenceI>> map = AlignmentUtils
213 .getSequencesByName(al);
214 assertEquals(2, map.keySet().size());
215 assertEquals(2, map.get("Seq1Name").size());
216 assertEquals("KQYL", map.get("Seq1Name").get(0).getSequenceAsString());
217 assertEquals("ABCD", map.get("Seq1Name").get(1).getSequenceAsString());
218 assertEquals(1, map.get("Seq2Name").size());
219 assertEquals("RFPW", map.get("Seq2Name").get(0).getSequenceAsString());
223 * Helper method to load an alignment and ensure dataset sequences are set up.
229 * @throws IOException
231 protected AlignmentI loadAlignment(final String data, String format)
234 AlignmentI a = new FormatAdapter().readFile(data,
235 AppletFormatAdapter.PASTE, format);
241 * Test mapping of protein to cDNA, for the case where we have no sequence
242 * cross-references, so mappings are made first-served 1-1 where sequences
245 * @throws IOException
247 @Test(groups = { "Functional" })
248 public void testMapProteinAlignmentToCdna_noXrefs() throws IOException
250 List<SequenceI> protseqs = new ArrayList<SequenceI>();
251 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
252 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
253 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
254 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
255 protein.setDataset(null);
257 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
258 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
259 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAA")); // = EIQ
260 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
261 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
262 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
263 cdna.setDataset(null);
265 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
267 // 3 mappings made, each from 1 to 1 sequence
268 assertEquals(3, protein.getCodonFrames().size());
269 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
270 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
271 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
273 // V12345 mapped to A22222
274 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
276 assertEquals(1, acf.getdnaSeqs().length);
277 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
278 acf.getdnaSeqs()[0]);
279 Mapping[] protMappings = acf.getProtMappings();
280 assertEquals(1, protMappings.length);
281 MapList mapList = protMappings[0].getMap();
282 assertEquals(3, mapList.getFromRatio());
283 assertEquals(1, mapList.getToRatio());
284 assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
286 assertEquals(1, mapList.getFromRanges().size());
287 assertTrue(Arrays.equals(new int[] { 1, 3 },
288 mapList.getToRanges().get(0)));
289 assertEquals(1, mapList.getToRanges().size());
291 // V12346 mapped to A33333
292 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
293 assertEquals(1, acf.getdnaSeqs().length);
294 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
295 acf.getdnaSeqs()[0]);
297 // V12347 mapped to A11111
298 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
299 assertEquals(1, acf.getdnaSeqs().length);
300 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
301 acf.getdnaSeqs()[0]);
303 // no mapping involving the 'extra' A44444
304 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
// NOTE(review): this span comes from a numbered listing. Each line still
// carries its listing number, and the lines holding braces, comment
// delimiters and some call continuations are absent. The text is kept
// verbatim because the absent expected-value arguments cannot be derived
// from what is visible here.
308 * Test for the alignSequenceAs method that takes two sequences and a mapping.
310 @Test(groups = { "Functional" })
311 public void testAlignSequenceAs_withMapping_noIntrons()
// One mapping (ranges {1,6} and {1,2}, ratios 3 and 1) is shared by every
// case below; presumably dna positions 1-6 to protein positions 1-2.
313 MapList map = new MapList(new int[] { 1, 6 }, new int[] { 1, 2 }, 3, 1);
316 * No existing gaps in dna:
// The two booleans are preserveMappedGaps and preserveUnmappedGaps (see the
// checkAlignSequenceAs helper); the final argument is the expected result.
// NOTE(review): the expected-result argument of the next call is missing.
318 checkAlignSequenceAs("GGGAAA", "-A-L-", false, false, map,
322 * Now introduce gaps in dna but ignore them when realigning.
// NOTE(review): the expected-result argument of the next call is missing.
324 checkAlignSequenceAs("-G-G-G-A-A-A-", "-A-L-", false, false, map,
328 * Now include gaps in dna when realigning. First retaining 'mapped' gaps
329 * only, i.e. those within the exon region.
331 checkAlignSequenceAs("-G-G--G-A--A-A-", "-A-L-", true, false, map,
332 "---G-G--G---A--A-A");
335 * Include all gaps in dna when realigning (within and without the exon
336 * region). The leading gap, and the gaps between codons, are subsumed by
337 * the protein alignment gap.
339 checkAlignSequenceAs("-G-GG--AA-A---", "-A-L-", true, true, map,
340 "---G-GG---AA-A---");
343 * Include only unmapped gaps in dna when realigning (outside the exon
344 * region). The leading gap, and the gaps between codons, are subsumed by
345 * the protein alignment gap.
// NOTE(review): the expected-result argument of the next call is missing.
347 checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", false, true, map,
352 * Test for the alignSequenceAs method that takes two sequences and a mapping.
354 @Test(groups = { "Functional" })
355 public void testAlignSequenceAs_withMapping_withIntrons()
358 * Exons at codon 2 (AAA) and 4 (TTT)
360 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
361 new int[] { 1, 2 }, 3, 1);
364 * Simple case: no gaps in dna
366 checkAlignSequenceAs("GGGAAACCCTTTGGG", "--A-L-", false, false, map,
367 "GGG---AAACCCTTTGGG");
370 * Add gaps to dna - but ignore when realigning.
372 checkAlignSequenceAs("-G-G-G--A--A---AC-CC-T-TT-GG-G-", "--A-L-",
373 false, false, map, "GGG---AAACCCTTTGGG");
376 * Add gaps to dna - include within exons only when realigning.
378 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
379 true, false, map, "GGG---A--A---ACCCT-TTGGG");
382 * Include gaps outside exons only when realigning.
384 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
385 false, true, map, "-G-G-GAAAC-CCTTT-GG-G-");
388 * Include gaps following first intron if we are 'preserving mapped gaps'
390 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
391 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
394 * Include all gaps in dna when realigning.
396 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
397 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
401 * Test for the case where not all of the protein sequence is mapped to cDNA.
403 @Test(groups = { "Functional" })
404 public void testAlignSequenceAs_withMapping_withUnmappedProtein()
407 * Exons at codon 2 (AAA) and 4 (TTT) mapped to A and P
409 final MapList map = new MapList(new int[] { 4, 6, 10, 12 }, new int[] {
413 * -L- 'aligns' ccc------
415 checkAlignSequenceAs("gggAAAcccTTTggg", "-A-L-P-", false, false, map,
416 "gggAAAccc------TTTggg");
420 * Helper method that performs and verifies the method under test.
423 * the sequence to be realigned
425 * the sequence whose alignment is to be copied
426 * @param preserveMappedGaps
427 * @param preserveUnmappedGaps
431 protected void checkAlignSequenceAs(final String alignee,
432 final String alignModel, final boolean preserveMappedGaps,
433 final boolean preserveUnmappedGaps, MapList map,
434 final String expected)
436 SequenceI alignMe = new Sequence("Seq1", alignee);
437 alignMe.createDatasetSequence();
438 SequenceI alignFrom = new Sequence("Seq2", alignModel);
439 alignFrom.createDatasetSequence();
440 AlignedCodonFrame acf = new AlignedCodonFrame();
441 acf.addMap(alignMe.getDatasetSequence(), alignFrom.getDatasetSequence(), map);
443 AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "---", '-',
444 preserveMappedGaps, preserveUnmappedGaps);
445 assertEquals(expected, alignMe.getSequenceAsString());
// NOTE(review): this span comes from a numbered listing. Each line still
// carries its listing number, and the brace and comment-delimiter lines are
// absent. The text is kept verbatim because the expected-value argument of
// the call below is not visible.
449 * Test for the alignSequenceAs method where we preserve gaps in introns only.
451 @Test(groups = { "Functional" })
452 public void testAlignSequenceAs_keepIntronGapsOnly()
456 * Intron GGGAAA followed by exon CCCTTT
458 MapList map = new MapList(new int[] { 7, 12 }, new int[] { 1, 2 }, 3, 1);
// preserveMappedGaps = false, preserveUnmappedGaps = true (parameter order
// of the checkAlignSequenceAs helper).
// NOTE(review): the expected-result argument of this call is missing.
460 checkAlignSequenceAs("GG-G-AA-A-C-CC-T-TT", "AL", false, true, map,
465 * Test the method that realigns protein to match mapped codon alignment.
467 @Test(groups = { "Functional" })
468 public void testAlignProteinAsDna()
470 // seq1 codons are [1,2,3] [4,5,6] [7,8,9] [10,11,12]
471 SequenceI dna1 = new Sequence("Seq1", "TGCCATTACCAG-");
472 // seq2 codons are [1,3,4] [5,6,7] [8,9,10] [11,12,13]
473 SequenceI dna2 = new Sequence("Seq2", "T-GCCATTACCAG");
474 // seq3 codons are [1,2,3] [4,5,7] [8,9,10] [11,12,13]
475 SequenceI dna3 = new Sequence("Seq3", "TGCCA-TTACCAG");
476 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
477 dna.setDataset(null);
479 // protein alignment will be realigned like dna
480 SequenceI prot1 = new Sequence("Seq1", "CHYQ");
481 SequenceI prot2 = new Sequence("Seq2", "CHYQ");
482 SequenceI prot3 = new Sequence("Seq3", "CHYQ");
483 SequenceI prot4 = new Sequence("Seq4", "R-QSV"); // unmapped, unchanged
484 AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
486 protein.setDataset(null);
488 MapList map = new MapList(new int[] { 1, 12 }, new int[] { 1, 4 }, 3, 1);
489 AlignedCodonFrame acf = new AlignedCodonFrame();
490 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
491 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
492 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
493 ArrayList<AlignedCodonFrame> acfs = new ArrayList<AlignedCodonFrame>();
495 protein.setCodonFrames(acfs);
498 * Translated codon order is [1,2,3] [1,3,4] [4,5,6] [4,5,7] [5,6,7] [7,8,9]
499 * [8,9,10] [10,11,12] [11,12,13]
501 AlignmentUtils.alignProteinAsDna(protein, dna);
502 assertEquals("C-H--Y-Q-", prot1.getSequenceAsString());
503 assertEquals("-C--H-Y-Q", prot2.getSequenceAsString());
504 assertEquals("C--H--Y-Q", prot3.getSequenceAsString());
505 assertEquals("R-QSV", prot4.getSequenceAsString());
509 * Test the method that tests whether a CDNA sequence translates to a protein
512 @Test(groups = { "Functional" })
513 public void testTranslatesAs()
515 // null arguments check
516 assertFalse(AlignmentUtils.translatesAs(null, 0, null));
517 assertFalse(AlignmentUtils.translatesAs(new char[] { 't' }, 0, null));
518 assertFalse(AlignmentUtils.translatesAs(null, 0, new char[] { 'a' }));
520 // straight translation
521 assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0,
522 "FPKG".toCharArray()));
523 // with extra start codon (not in protein)
524 assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(),
525 3, "FPKG".toCharArray()));
526 // with stop codon1 (not in protein)
527 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
528 0, "FPKG".toCharArray()));
529 // with stop codon1 (in protein as *)
530 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
531 0, "FPKG*".toCharArray()));
532 // with stop codon2 (not in protein)
533 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtag".toCharArray(),
534 0, "FPKG".toCharArray()));
535 // with stop codon3 (not in protein)
536 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtga".toCharArray(),
537 0, "FPKG".toCharArray()));
538 // with start and stop codon1
539 assertTrue(AlignmentUtils.translatesAs(
540 "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG".toCharArray()));
541 // with start and stop codon1 (in protein as *)
542 assertTrue(AlignmentUtils.translatesAs(
543 "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG*".toCharArray()));
544 // with start and stop codon2
545 assertTrue(AlignmentUtils.translatesAs(
546 "atgtttcccaaagggtag".toCharArray(), 3, "FPKG".toCharArray()));
547 // with start and stop codon3
548 assertTrue(AlignmentUtils.translatesAs(
549 "atgtttcccaaagggtga".toCharArray(), 3, "FPKG".toCharArray()));
551 // with embedded stop codons
552 assertTrue(AlignmentUtils.translatesAs(
553 "atgtttTAGcccaaaTAAgggtga".toCharArray(), 3,
554 "F*PK*G".toCharArray()));
557 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
558 0, "FPMG".toCharArray()));
561 assertFalse(AlignmentUtils.translatesAs("tttcccaaagg".toCharArray(), 0,
562 "FPKG".toCharArray()));
565 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
566 0, "FPK".toCharArray()));
568 // overlong dna (doesn't end in stop codon)
569 assertFalse(AlignmentUtils.translatesAs(
570 "tttcccaaagggttt".toCharArray(), 0, "FPKG".toCharArray()));
572 // dna + stop codon + more
573 assertFalse(AlignmentUtils.translatesAs(
574 "tttcccaaagggttaga".toCharArray(), 0, "FPKG".toCharArray()));
577 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
578 0, "FPKGQ".toCharArray()));
582 * Test mapping of protein to cDNA, for cases where the cDNA has start and/or
583 * stop codons in addition to the protein coding sequence.
585 * @throws IOException
587 @Test(groups = { "Functional" })
588 public void testMapProteinAlignmentToCdna_withStartAndStopCodons()
591 List<SequenceI> protseqs = new ArrayList<SequenceI>();
592 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
593 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
594 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
595 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
596 protein.setDataset(null);
598 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
600 dnaseqs.add(new Sequence("EMBL|A11111", "ATGTCAGCACGC"));
602 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAATAA"));
603 // = start +EIQ + stop
604 dnaseqs.add(new Sequence("EMBL|A33333", "ATGGAAATCCAGTAG"));
605 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG"));
606 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
607 cdna.setDataset(null);
609 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
611 // 3 mappings made, each from 1 to 1 sequence
612 assertEquals(3, protein.getCodonFrames().size());
613 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
614 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
615 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
617 // V12345 mapped from A22222
618 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
620 assertEquals(1, acf.getdnaSeqs().length);
621 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
622 acf.getdnaSeqs()[0]);
623 Mapping[] protMappings = acf.getProtMappings();
624 assertEquals(1, protMappings.length);
625 MapList mapList = protMappings[0].getMap();
626 assertEquals(3, mapList.getFromRatio());
627 assertEquals(1, mapList.getToRatio());
628 assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
630 assertEquals(1, mapList.getFromRanges().size());
631 assertTrue(Arrays.equals(new int[] { 1, 3 },
632 mapList.getToRanges().get(0)));
633 assertEquals(1, mapList.getToRanges().size());
635 // V12346 mapped from A33333 starting position 4
636 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
637 assertEquals(1, acf.getdnaSeqs().length);
638 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
639 acf.getdnaSeqs()[0]);
640 protMappings = acf.getProtMappings();
641 assertEquals(1, protMappings.length);
642 mapList = protMappings[0].getMap();
643 assertEquals(3, mapList.getFromRatio());
644 assertEquals(1, mapList.getToRatio());
645 assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
647 assertEquals(1, mapList.getFromRanges().size());
648 assertTrue(Arrays.equals(new int[] { 1, 3 },
649 mapList.getToRanges().get(0)));
650 assertEquals(1, mapList.getToRanges().size());
652 // V12347 mapped to A11111 starting position 4
653 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
654 assertEquals(1, acf.getdnaSeqs().length);
655 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
656 acf.getdnaSeqs()[0]);
657 protMappings = acf.getProtMappings();
658 assertEquals(1, protMappings.length);
659 mapList = protMappings[0].getMap();
660 assertEquals(3, mapList.getFromRatio());
661 assertEquals(1, mapList.getToRatio());
662 assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
664 assertEquals(1, mapList.getFromRanges().size());
665 assertTrue(Arrays.equals(new int[] { 1, 3 },
666 mapList.getToRanges().get(0)));
667 assertEquals(1, mapList.getToRanges().size());
669 // no mapping involving the 'extra' A44444
670 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
674 * Test mapping of protein to cDNA, for the case where we have some sequence
675 * cross-references. Verify that 1-to-many mappings are made where
676 * cross-references exist and sequences are mappable.
678 * @throws IOException
680 @Test(groups = { "Functional" })
681 public void testMapProteinAlignmentToCdna_withXrefs() throws IOException
683 List<SequenceI> protseqs = new ArrayList<SequenceI>();
684 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
685 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
686 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
687 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
688 protein.setDataset(null);
690 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
691 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
692 dnaseqs.add(new Sequence("EMBL|A22222", "ATGGAGATACAA")); // = start + EIQ
693 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
694 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
695 dnaseqs.add(new Sequence("EMBL|A55555", "GAGATTCAG")); // = EIQ
696 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[5]));
697 cdna.setDataset(null);
699 // Xref A22222 to V12345 (should get mapped)
700 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
701 // Xref V12345 to A44444 (should get mapped)
702 protseqs.get(0).addDBRef(new DBRefEntry("EMBL", "1", "A44444"));
703 // Xref A33333 to V12347 (sequence mismatch - should not get mapped)
704 dnaseqs.get(2).addDBRef(new DBRefEntry("UNIPROT", "1", "V12347"));
705 // as V12345 is mapped to A22222 and A44444, this leaves V12346 unmapped.
706 // it should get paired up with the unmapped A33333
707 // A11111 should be mapped to V12347
708 // A55555 is spare and has no xref so is not mapped
710 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
712 // 4 protein mappings made for 3 proteins, 2 to V12345, 1 each to V12346/7
713 assertEquals(3, protein.getCodonFrames().size());
714 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
715 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
716 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
718 // one mapping for each of the first 4 cDNA sequences
719 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
720 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
721 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(2)).size());
722 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(3)).size());
724 // V12345 mapped to A22222 and A44444
725 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
727 assertEquals(2, acf.getdnaSeqs().length);
728 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
729 acf.getdnaSeqs()[0]);
730 assertEquals(cdna.getSequenceAt(3).getDatasetSequence(),
731 acf.getdnaSeqs()[1]);
733 // V12346 mapped to A33333
734 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
735 assertEquals(1, acf.getdnaSeqs().length);
736 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
737 acf.getdnaSeqs()[0]);
739 // V12347 mapped to A11111
740 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
741 assertEquals(1, acf.getdnaSeqs().length);
742 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
743 acf.getdnaSeqs()[0]);
745 // no mapping involving the 'extra' A55555
746 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(4)).isEmpty());
750 * Test mapping of protein to cDNA, for the case where we have some sequence
751 * cross-references. Verify that once we have made an xref mapping we don't
752 * also map un-xrefd sequeces.
754 * @throws IOException
756 @Test(groups = { "Functional" })
757 public void testMapProteinAlignmentToCdna_prioritiseXrefs()
760 List<SequenceI> protseqs = new ArrayList<SequenceI>();
761 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
762 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
763 AlignmentI protein = new Alignment(
764 protseqs.toArray(new SequenceI[protseqs.size()]));
765 protein.setDataset(null);
767 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
768 dnaseqs.add(new Sequence("EMBL|A11111", "GAAATCCAG")); // = EIQ
769 dnaseqs.add(new Sequence("EMBL|A22222", "GAAATTCAG")); // = EIQ
770 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[dnaseqs
772 cdna.setDataset(null);
774 // Xref A22222 to V12345 (should get mapped)
775 // A11111 should then be mapped to the unmapped V12346
776 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
778 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
780 // 2 protein mappings made
781 assertEquals(2, protein.getCodonFrames().size());
782 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
783 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
785 // one mapping for each of the cDNA sequences
786 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
787 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
789 // V12345 mapped to A22222
790 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
792 assertEquals(1, acf.getdnaSeqs().length);
793 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
794 acf.getdnaSeqs()[0]);
796 // V12346 mapped to A11111
797 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
798 assertEquals(1, acf.getdnaSeqs().length);
799 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
800 acf.getdnaSeqs()[0]);
804 * Test the method that shows or hides sequence annotations by type(s) and
807 @Test(groups = { "Functional" })
808 public void testShowOrHideSequenceAnnotations()
810 SequenceI seq1 = new Sequence("Seq1", "AAA");
811 SequenceI seq2 = new Sequence("Seq2", "BBB");
812 SequenceI seq3 = new Sequence("Seq3", "CCC");
813 Annotation[] anns = new Annotation[] { new Annotation(2f) };
814 AlignmentAnnotation ann1 = new AlignmentAnnotation("Structure", "ann1",
816 ann1.setSequenceRef(seq1);
817 AlignmentAnnotation ann2 = new AlignmentAnnotation("Structure", "ann2",
819 ann2.setSequenceRef(seq2);
820 AlignmentAnnotation ann3 = new AlignmentAnnotation("Structure", "ann3",
822 AlignmentAnnotation ann4 = new AlignmentAnnotation("Temp", "ann4", anns);
823 ann4.setSequenceRef(seq1);
824 AlignmentAnnotation ann5 = new AlignmentAnnotation("Temp", "ann5", anns);
825 ann5.setSequenceRef(seq2);
826 AlignmentAnnotation ann6 = new AlignmentAnnotation("Temp", "ann6", anns);
827 AlignmentI al = new Alignment(new SequenceI[] { seq1, seq2, seq3 });
828 al.addAnnotation(ann1); // Structure for Seq1
829 al.addAnnotation(ann2); // Structure for Seq2
830 al.addAnnotation(ann3); // Structure for no sequence
831 al.addAnnotation(ann4); // Temp for seq1
832 al.addAnnotation(ann5); // Temp for seq2
833 al.addAnnotation(ann6); // Temp for no sequence
834 List<String> types = new ArrayList<String>();
835 List<SequenceI> scope = new ArrayList<SequenceI>();
838 * Set all sequence related Structure to hidden (ann1, ann2)
840 types.add("Structure");
841 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
843 assertFalse(ann1.visible);
844 assertFalse(ann2.visible);
845 assertTrue(ann3.visible); // not sequence-related, not affected
846 assertTrue(ann4.visible); // not Structure, not affected
847 assertTrue(ann5.visible); // "
848 assertTrue(ann6.visible); // not sequence-related, not affected
851 * Set Temp in {seq1, seq3} to hidden
857 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, false,
859 assertFalse(ann1.visible); // unchanged
860 assertFalse(ann2.visible); // unchanged
861 assertTrue(ann3.visible); // not sequence-related, not affected
862 assertFalse(ann4.visible); // Temp for seq1 hidden
863 assertTrue(ann5.visible); // not in scope, not affected
864 assertTrue(ann6.visible); // not sequence-related, not affected
867 * Set Temp in all sequences to hidden
873 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
875 assertFalse(ann1.visible); // unchanged
876 assertFalse(ann2.visible); // unchanged
877 assertTrue(ann3.visible); // not sequence-related, not affected
878 assertFalse(ann4.visible); // Temp for seq1 hidden
879 assertFalse(ann5.visible); // Temp for seq2 hidden
880 assertTrue(ann6.visible); // not sequence-related, not affected
883 * Set all types in {seq1, seq3} to visible
889 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, true,
891 assertTrue(ann1.visible); // Structure for seq1 set visible
892 assertFalse(ann2.visible); // not in scope, unchanged
893 assertTrue(ann3.visible); // not sequence-related, not affected
894 assertTrue(ann4.visible); // Temp for seq1 set visible
895 assertFalse(ann5.visible); // not in scope, unchanged
896 assertTrue(ann6.visible); // not sequence-related, not affected
899 * Set all types in all scope to hidden
901 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, true,
903 assertFalse(ann1.visible);
904 assertFalse(ann2.visible);
905 assertTrue(ann3.visible); // not sequence-related, not affected
906 assertFalse(ann4.visible);
907 assertFalse(ann5.visible);
908 assertTrue(ann6.visible); // not sequence-related, not affected
912 * Tests for the method that checks if one sequence cross-references another
914 @Test(groups = { "Functional" })
915 public void testHasCrossRef()
917 assertFalse(AlignmentUtils.hasCrossRef(null, null));
918 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
919 assertFalse(AlignmentUtils.hasCrossRef(seq1, null));
920 assertFalse(AlignmentUtils.hasCrossRef(null, seq1));
921 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
922 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
925 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20193"));
926 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
928 // case-insensitive; version number is ignored
929 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20192"));
930 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
933 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
934 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
935 // test is one-way only
936 assertFalse(AlignmentUtils.hasCrossRef(seq2, seq1));
940 * Tests for the method that checks if either sequence cross-references the
943 @Test(groups = { "Functional" })
944 public void testHaveCrossRef()
946 assertFalse(AlignmentUtils.hasCrossRef(null, null));
947 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
948 assertFalse(AlignmentUtils.haveCrossRef(seq1, null));
949 assertFalse(AlignmentUtils.haveCrossRef(null, seq1));
950 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
951 assertFalse(AlignmentUtils.haveCrossRef(seq1, seq2));
953 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
954 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
955 // next is true for haveCrossRef, false for hasCrossRef
956 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
958 // now the other way round
959 seq1.setDBRefs(null);
960 seq2.addDBRef(new DBRefEntry("EMBL", "1", "A12345"));
961 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
962 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
965 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
966 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
967 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
// Purpose: builds two dna sequences carrying CDS features plus dna-to-peptide
// MapLists, calls AlignmentUtils.makeCdsAlignment, and then checks the CDS
// sequences produced, the shared dataset, and the peptide-to-CDS search results.
// NOTE(review): in this copy several statements end in a trailing comma with no
// continuation line, and braces/comment delimiters are absent - short lines
// appear to have been lost. Confirm against the upstream file before editing.
971 * Test the method that extracts the cds-only part of a dna alignment.
973 @Test(groups = { "Functional" })
974 public void testMakeCdsAlignment()
976 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
977 SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC");
978 SequenceI pep1 = new Sequence("pep1", "GF");
979 SequenceI pep2 = new Sequence("pep2", "GFP");
980 dna1.createDatasetSequence();
981 dna2.createDatasetSequence();
982 pep1.createDatasetSequence();
983 pep2.createDatasetSequence();
// CDS features mark the upper-case codons: GGG, TTT on dna1; GGG, TTT, CCC on dna2
// NOTE(review): the last constructor argument of each feature is not visible here
984 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 6, 0f,
986 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 10, 12, 0f,
988 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds3", 1, 3, 0f,
990 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds4", 7, 9, 0f,
992 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds5", 13, 15, 0f,
994 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
995 dna.setDataset(null);
// dna1 positions 4-6 and 10-12 map to pep1 residues 1-2 (3 bases per residue)
997 List<AlignedCodonFrame> mappings = new ArrayList<AlignedCodonFrame>();
998 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
999 new int[] { 1, 2 }, 3, 1);
1000 AlignedCodonFrame acf = new AlignedCodonFrame();
1001 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
// NOTE(review): no visible statement adds acf to 'mappings', yet two codon
// frames are asserted later - presumably a lost line; verify.
// dna2 positions 1-3, 7-9 and 13-15 map to pep2 residues 1-3
1003 map = new MapList(new int[] { 1, 3, 7, 9, 13, 15 }, new int[] { 1, 3 },
1005 acf = new AlignedCodonFrame();
1006 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
1010 * execute method under test:
1012 AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
1013 dna1, dna2 }, mappings, dna);
// expect one CDS sequence per dna sequence, holding only the mapped codons
1015 assertEquals(2, cds.getSequences().size());
1016 assertEquals("GGGTTT", cds.getSequenceAt(0)
1017 .getSequenceAsString());
1018 assertEquals("GGGTTTCCC", cds.getSequenceAt(1)
1019 .getSequenceAsString());
1022 * verify shared, extended alignment dataset
1024 assertSame(dna.getDataset(), cds.getDataset());
1025 assertTrue(dna.getDataset().getSequences()
1026 .contains(cds.getSequenceAt(0).getDatasetSequence()));
1027 assertTrue(dna.getDataset().getSequences()
1028 .contains(cds.getSequenceAt(1).getDatasetSequence()));
1031 * Verify mappings from CDS to peptide and cDNA to CDS
1032 * the mappings are on the shared alignment dataset
1034 assertSame(dna.getCodonFrames(), cds.getCodonFrames());
1035 List<AlignedCodonFrame> cdsMappings = cds.getCodonFrames();
1036 assertEquals(2, cdsMappings.size());
1039 * Mapping from pep1 to GGGTTT in first new exon sequence
1041 List<AlignedCodonFrame> pep1Mapping = MappingUtils
1042 .findMappingsForSequence(pep1, cdsMappings);
1043 assertEquals(1, pep1Mapping.size());
// pep1 residue 1 should resolve to CDS positions 1-3, residue 2 to 4-6
// NOTE(review): the second argument of each assertSame below is not visible here
1045 SearchResults sr = MappingUtils
1046 .buildSearchResults(pep1, 1, cdsMappings);
1047 assertEquals(1, sr.getResults().size());
1048 Match m = sr.getResults().get(0);
1049 assertSame(cds.getSequenceAt(0).getDatasetSequence(),
1051 assertEquals(1, m.getStart());
1052 assertEquals(3, m.getEnd());
1054 sr = MappingUtils.buildSearchResults(pep1, 2, cdsMappings);
1055 m = sr.getResults().get(0);
1056 assertSame(cds.getSequenceAt(0).getDatasetSequence(),
1058 assertEquals(4, m.getStart());
1059 assertEquals(6, m.getEnd());
1062 * Mapping from pep2 to GGGTTTCCC in second new exon sequence
1064 List<AlignedCodonFrame> pep2Mapping = MappingUtils
1065 .findMappingsForSequence(pep2, cdsMappings);
1066 assertEquals(1, pep2Mapping.size());
// pep2 residues 1, 2, 3 should resolve to CDS positions 1-3, 4-6, 7-9
1068 sr = MappingUtils.buildSearchResults(pep2, 1, cdsMappings);
1069 assertEquals(1, sr.getResults().size());
1070 m = sr.getResults().get(0);
1071 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1073 assertEquals(1, m.getStart());
1074 assertEquals(3, m.getEnd());
1076 sr = MappingUtils.buildSearchResults(pep2, 2, cdsMappings);
1077 m = sr.getResults().get(0);
1078 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1080 assertEquals(4, m.getStart());
1081 assertEquals(6, m.getEnd());
1083 sr = MappingUtils.buildSearchResults(pep2, 3, cdsMappings);
1084 m = sr.getResults().get(0);
1085 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1087 assertEquals(7, m.getStart());
1088 assertEquals(9, m.getEnd());
// Purpose: one dna sequence is mapped to three different peptides through
// three MapLists; the test expects makeCdsAlignment to yield three CDS
// sequences (GGGTTT, aaaccc, aaaTTT) named dna1|pep1..3, each mapped both to
// its peptide and back to the dna.
// NOTE(review): trailing-comma statements without continuation lines, and the
// absence of braces, suggest short lines were lost from this copy - confirm
// against the upstream file before editing.
1092 * Test the method that makes a cds-only alignment from a DNA sequence and its
1093 * product mappings, for the case where there are multiple exon mappings to
1094 * different protein products.
1096 @Test(groups = { "Functional" })
1097 public void testMakeCdsAlignment_multipleProteins()
1099 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
1100 SequenceI pep1 = new Sequence("pep1", "GF"); // GGGTTT
1101 SequenceI pep2 = new Sequence("pep2", "KP"); // aaaccc
1102 SequenceI pep3 = new Sequence("pep3", "KF"); // aaaTTT
1103 dna1.createDatasetSequence();
1104 pep1.createDatasetSequence();
1105 pep2.createDatasetSequence();
1106 pep3.createDatasetSequence();
// NOTE(review): the last constructor argument of each feature is not visible here
1107 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 6, 0f,
1109 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 10, 12, 0f,
1111 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds3", 1, 3, 0f,
1113 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds4", 7, 9, 0f,
1115 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds5", 1, 3, 0f,
1117 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds6", 10, 12, 0f,
// each peptide dataset sequence gets its own EMBLCDS dbref (versions 2, 3, 4)
1119 pep1.getDatasetSequence().addDBRef(
1120 new DBRefEntry("EMBLCDS", "2", "A12345"));
1121 pep2.getDatasetSequence().addDBRef(
1122 new DBRefEntry("EMBLCDS", "3", "A12346"));
1123 pep3.getDatasetSequence().addDBRef(
1124 new DBRefEntry("EMBLCDS", "4", "A12347"));
1127 * Make the mappings from dna to protein
// NOTE(review): no visible statement adds any acf to 'mappings' - presumably
// lost lines after each addMap; verify.
1129 List<AlignedCodonFrame> mappings = new ArrayList<AlignedCodonFrame>();
1130 // map ...GGG...TTT to GF
1131 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1132 new int[] { 1, 2 }, 3, 1);
1133 AlignedCodonFrame acf = new AlignedCodonFrame();
1134 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1137 // map aaa...ccc to KP
1138 map = new MapList(new int[] { 1, 3, 7, 9 }, new int[] { 1, 2 }, 3, 1);
1139 acf = new AlignedCodonFrame();
1140 acf.addMap(dna1.getDatasetSequence(), pep2.getDatasetSequence(), map);
1143 // map aaa......TTT to KF
1144 map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 2 }, 3, 1);
1145 acf = new AlignedCodonFrame();
1146 acf.addMap(dna1.getDatasetSequence(), pep3.getDatasetSequence(), map);
1150 * Create the CDS alignment; also augments the dna-to-protein mappings with
1151 * exon-to-protein and exon-to-dna mappings
1153 AlignmentI dna = new Alignment(new SequenceI[] { dna1 });
1154 dna.setDataset(null);
1157 * execute method under test
1159 AlignmentI cdsal = AlignmentUtils.makeCdsAlignment(
1160 new SequenceI[] { dna1 }, mappings, dna);
1163 * Verify we have 3 cds sequences, mapped to pep1/2/3 respectively
1165 List<SequenceI> cds = cdsal.getSequences();
1166 assertEquals(3, cds.size());
1169 * verify shared, extended alignment dataset
1171 assertSame(cdsal.getDataset(), dna.getDataset());
1172 assertTrue(dna.getDataset().getSequences()
1173 .contains(cds.get(0).getDatasetSequence()));
1174 assertTrue(dna.getDataset().getSequences()
1175 .contains(cds.get(1).getDatasetSequence()));
1176 assertTrue(dna.getDataset().getSequences()
1177 .contains(cds.get(2).getDatasetSequence()));
1180 * verify aligned cds sequences and their xrefs
// the commented-out assertions below checked accession-based names and
// EMBLCDS dbrefs; only the "dna1|pepN" name form is currently asserted
1182 SequenceI cdsSeq = cds.get(0);
1183 assertEquals("GGGTTT", cdsSeq.getSequenceAsString());
1184 // assertEquals("dna1|A12345", cdsSeq.getName());
1185 assertEquals("dna1|pep1", cdsSeq.getName());
1186 // assertEquals(1, cdsSeq.getDBRefs().length);
1187 // DBRefEntry cdsRef = cdsSeq.getDBRefs()[0];
1188 // assertEquals("EMBLCDS", cdsRef.getSource());
1189 // assertEquals("2", cdsRef.getVersion());
1190 // assertEquals("A12345", cdsRef.getAccessionId());
1192 cdsSeq = cds.get(1);
1193 assertEquals("aaaccc", cdsSeq.getSequenceAsString());
1194 // assertEquals("dna1|A12346", cdsSeq.getName());
1195 assertEquals("dna1|pep2", cdsSeq.getName());
1196 // assertEquals(1, cdsSeq.getDBRefs().length);
1197 // cdsRef = cdsSeq.getDBRefs()[0];
1198 // assertEquals("EMBLCDS", cdsRef.getSource());
1199 // assertEquals("3", cdsRef.getVersion());
1200 // assertEquals("A12346", cdsRef.getAccessionId());
1202 cdsSeq = cds.get(2);
1203 assertEquals("aaaTTT", cdsSeq.getSequenceAsString());
1204 // assertEquals("dna1|A12347", cdsSeq.getName());
1205 assertEquals("dna1|pep3", cdsSeq.getName());
1206 // assertEquals(1, cdsSeq.getDBRefs().length);
1207 // cdsRef = cdsSeq.getDBRefs()[0];
1208 // assertEquals("EMBLCDS", cdsRef.getSource());
1209 // assertEquals("4", cdsRef.getVersion());
1210 // assertEquals("A12347", cdsRef.getAccessionId());
1213 * Verify there are mappings from each cds sequence to its protein product
1214 * and also to its dna source
// only the first codon frame from the iterator is inspected; all three
// dna-to-cds mappings are expected to be found on it
1216 Iterator<AlignedCodonFrame> newMappingsIterator = cdsal
1217 .getCodonFrames().iterator();
1219 // mappings for dna1 - cds1 - pep1
1220 AlignedCodonFrame cdsMapping = newMappingsIterator.next();
1221 List<Mapping> dnaMappings = cdsMapping.getMappingsFromSequence(dna1);
1222 assertEquals(3, dnaMappings.size());
// NOTE(review): the second argument of each assertSame(cds.get(n)...) call
// below is not visible in this copy
1223 assertSame(cds.get(0).getDatasetSequence(), dnaMappings.get(0)
1225 assertEquals("G(1) in CDS should map to G(4) in DNA", 4, dnaMappings
1226 .get(0).getMap().getToPosition(1));
1227 List<Mapping> peptideMappings = cdsMapping.getMappingsFromSequence(cds
1228 .get(0).getDatasetSequence());
1229 assertEquals(1, peptideMappings.size());
1230 assertSame(pep1.getDatasetSequence(), peptideMappings.get(0).getTo());
1232 // mappings for dna1 - cds2 - pep2
1233 assertSame(cds.get(1).getDatasetSequence(), dnaMappings.get(1)
1235 assertEquals("c(4) in CDS should map to c(7) in DNA", 7, dnaMappings
1236 .get(1).getMap().getToPosition(4));
1237 peptideMappings = cdsMapping.getMappingsFromSequence(cds.get(1)
1238 .getDatasetSequence());
1239 assertEquals(1, peptideMappings.size());
1240 assertSame(pep2.getDatasetSequence(), peptideMappings.get(0).getTo());
1242 // mappings for dna1 - cds3 - pep3
1243 assertSame(cds.get(2).getDatasetSequence(), dnaMappings.get(2)
1245 assertEquals("T(4) in CDS should map to T(10) in DNA", 10, dnaMappings
1246 .get(2).getMap().getToPosition(4));
1247 peptideMappings = cdsMapping.getMappingsFromSequence(cds.get(2)
1248 .getDatasetSequence());
1249 assertEquals(1, peptideMappings.size());
1250 assertSame(pep3.getDatasetSequence(), peptideMappings.get(0).getTo());
1253 @Test(groups = { "Functional" })
1254 public void testIsMappable()
1256 SequenceI dna1 = new Sequence("dna1", "cgCAGtgGT");
1257 SequenceI aa1 = new Sequence("aa1", "RSG");
1258 AlignmentI al1 = new Alignment(new SequenceI[] { dna1 });
1259 AlignmentI al2 = new Alignment(new SequenceI[] { aa1 });
1261 assertFalse(AlignmentUtils.isMappable(null, null));
1262 assertFalse(AlignmentUtils.isMappable(al1, null));
1263 assertFalse(AlignmentUtils.isMappable(null, al1));
1264 assertFalse(AlignmentUtils.isMappable(al1, al1));
1265 assertFalse(AlignmentUtils.isMappable(al2, al2));
1267 assertTrue(AlignmentUtils.isMappable(al1, al2));
1268 assertTrue(AlignmentUtils.isMappable(al2, al1));
1272 * Test creating a mapping when the sequences involved do not start at residue
1275 * @throws IOException
1277 @Test(groups = { "Functional" })
1278 public void testMapCdnaToProtein_forSubsequence()
1281 SequenceI prot = new Sequence("UNIPROT|V12345", "E-I--Q", 10, 12);
1282 prot.createDatasetSequence();
1284 SequenceI dna = new Sequence("EMBL|A33333", "GAA--AT-C-CAG", 40, 48);
1285 dna.createDatasetSequence();
1287 MapList map = AlignmentUtils.mapCdnaToProtein(prot, dna);
1288 assertEquals(10, map.getToLowest());
1289 assertEquals(12, map.getToHighest());
1290 assertEquals(40, map.getFromLowest());
1291 assertEquals(48, map.getFromHighest());
// Purpose: maps residues 6-12 of "Query" (MGAASEV) one-to-one onto residues
// 1-7 of "Match", realigns Match against Query, and expects Match to be padded
// with gaps to Query's full width (5 leading, 7 trailing).
1295 * Test for the alignSequenceAs method where we have protein mapped to protein
1297 @Test(groups = { "Functional" })
1298 public void testAlignSequenceAs_mappedProteinProtein()
1301 SequenceI alignMe = new Sequence("Match", "MGAASEV");
1302 alignMe.createDatasetSequence();
1303 SequenceI alignFrom = new Sequence("Query", "LQTGYMGAASEVMFSPTRR");
1304 alignFrom.createDatasetSequence();
1306 AlignedCodonFrame acf = new AlignedCodonFrame();
1307 // this is like a domain or motif match of part of a peptide sequence
1308 MapList map = new MapList(new int[] { 6, 12 }, new int[] { 1, 7 }, 1, 1);
1309 acf.addMap(alignFrom.getDatasetSequence(),
1310 alignMe.getDatasetSequence(), map);
// NOTE(review): the call below ends in a trailing comma; its remaining
// argument(s) are not visible in this copy - confirm against upstream.
1312 AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "-", '-', true,
1314 assertEquals("-----MGAASEV-------", alignMe.getSequenceAsString());
// Purpose: aligns nine bases against a gapped peptide whose last residue (G)
// has no codon mapped to it, delegating the work to checkAlignSequenceAs - a
// helper that is called unqualified and is not defined in the visible section.
1318 * Test for the alignSequenceAs method where there are trailing unmapped
1319 * residues in the model sequence
1321 @Test(groups = { "Functional" })
1322 public void testAlignSequenceAs_withTrailingPeptide()
1324 // map first 3 codons to KPF; G is a trailing unmapped residue
1325 MapList map = new MapList(new int[] { 1, 9 }, new int[] { 1, 3 }, 3, 1);
// NOTE(review): the call below ends in a trailing comma; the final argument
// (presumably the expected aligned dna string) is not visible in this copy.
1327 checkAlignSequenceAs("AAACCCTTT", "K-PFG", true, true, map,
// Purpose: adds nine features to a dna sequence, transfers them to a cds
// sequence through a 1:1 MapList (dna 4-6 and 10-12 onto cds 1-6), and checks
// that only the six features overlapping the mapped ranges arrive, with their
// positions translated to cds coordinates.
// NOTE(review): every addSequenceFeature call below ends in a trailing comma -
// the remaining constructor arguments are not visible in this copy.
1332 * Tests for transferring features between mapped sequences
1334 @Test(groups = { "Functional" })
1335 public void testTransferFeatures()
1337 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1338 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
// type1 at 1-2 lies wholly before the mapped region and is not expected on cds
1341 dna.addSequenceFeature(new SequenceFeature("type1", "desc1", 1, 2, 1f,
1343 // partial overlap - to [1, 1]
1344 dna.addSequenceFeature(new SequenceFeature("type2", "desc2", 3, 4, 2f,
1346 // exact overlap - to [1, 3]
1347 dna.addSequenceFeature(new SequenceFeature("type3", "desc3", 4, 6, 3f,
1349 // spanning overlap - to [2, 5]
1350 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1352 // exactly overlaps whole mapped range [1, 6]
1353 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1355 // no overlap (internal)
1356 dna.addSequenceFeature(new SequenceFeature("type6", "desc6", 7, 9, 6f,
1358 // no overlap (3' end)
1359 dna.addSequenceFeature(new SequenceFeature("type7", "desc7", 13, 15,
1361 // overlap (3' end) - to [6, 6]
1362 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1364 // extended overlap - to [6, +]
1365 dna.addSequenceFeature(new SequenceFeature("type9", "desc9", 12, 13,
1368 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1369 new int[] { 1, 6 }, 1, 1);
1372 * transferFeatures() will build 'partial overlap' for regions
1373 * that partially overlap 5' or 3' (start or end) of target sequence
1375 AlignmentUtils.transferFeatures(dna, cds, map, null);
1376 SequenceFeature[] sfs = cds.getSequenceFeatures();
1377 assertEquals(6, sfs.length);
// NOTE(review): the assertion groups below check types type2..type9 in turn,
// but no statement re-assigning sf from sfs[1]..sfs[5] is visible between
// them - presumably lost lines; verify.
1379 SequenceFeature sf = sfs[0];
1380 assertEquals("type2", sf.getType());
1381 assertEquals("desc2", sf.getDescription());
1382 assertEquals(2f, sf.getScore());
1383 assertEquals(1, sf.getBegin());
1384 assertEquals(1, sf.getEnd());
1387 assertEquals("type3", sf.getType());
1388 assertEquals("desc3", sf.getDescription());
1389 assertEquals(3f, sf.getScore());
1390 assertEquals(1, sf.getBegin());
1391 assertEquals(3, sf.getEnd());
1394 assertEquals("type4", sf.getType());
1395 assertEquals(2, sf.getBegin());
1396 assertEquals(5, sf.getEnd());
1399 assertEquals("type5", sf.getType());
1400 assertEquals(1, sf.getBegin());
1401 assertEquals(6, sf.getEnd());
1404 assertEquals("type8", sf.getType());
1405 assertEquals(6, sf.getBegin());
1406 assertEquals(6, sf.getEnd());
// type9 extends past the mapped region and is expected truncated to [6, 6]
1409 assertEquals("type9", sf.getType());
1410 assertEquals(6, sf.getBegin());
1411 assertEquals(6, sf.getEnd());
// Purpose: as testTransferFeatures, but passes feature types to leave out as
// trailing arguments; of three overlapping features only the one whose type is
// not listed ("type5") is expected on the cds sequence.
// NOTE(review): the addSequenceFeature calls below end in a trailing comma -
// the remaining constructor arguments are not visible in this copy.
1415 * Tests for transferring features between mapped sequences
1417 @Test(groups = { "Functional" })
1418 public void testTransferFeatures_withOmit()
1420 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1421 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1423 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1424 new int[] { 1, 6 }, 1, 1);
1426 // [5, 11] maps to [2, 5]
1427 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1429 // [4, 12] maps to [1, 6]
1430 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1432 // [12, 12] maps to [6, 6]
1433 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1436 // "type4" and "type8" are the 'omit these' varargs (feature types, not descriptions)
1437 AlignmentUtils.transferFeatures(dna, cds, map, null, "type4", "type8");
1438 SequenceFeature[] sfs = cds.getSequenceFeatures();
1439 assertEquals(1, sfs.length);
1441 SequenceFeature sf = sfs[0];
1442 assertEquals("type5", sf.getType());
1443 assertEquals(1, sf.getBegin());
1444 assertEquals(6, sf.getEnd());
// Purpose: as testTransferFeatures, but passes a single feature type to
// select as the fourth argument; of three overlapping features only the one
// of that type ("type5") is expected on the cds sequence.
// NOTE(review): the addSequenceFeature calls below end in a trailing comma -
// the remaining constructor arguments are not visible in this copy.
1448 * Tests for transferring features between mapped sequences
1450 @Test(groups = { "Functional" })
1451 public void testTransferFeatures_withSelect()
1453 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1454 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1456 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1457 new int[] { 1, 6 }, 1, 1);
1459 // [5, 11] maps to [2, 5]
1460 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1462 // [4, 12] maps to [1, 6]
1463 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1465 // [12, 12] maps to [6, 6]
1466 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1469 // "type5" is the 'select this type' argument
1470 AlignmentUtils.transferFeatures(dna, cds, map, "type5");
1471 SequenceFeature[] sfs = cds.getSequenceFeatures();
1472 assertEquals(1, sfs.length);
1474 SequenceFeature sf = sfs[0];
1475 assertEquals("type5", sf.getType());
1476 assertEquals(1, sf.getBegin());
1477 assertEquals(6, sf.getEnd());
// Purpose: three gapped dna sequences, of which only dna1 and dna2 are mapped
// to peptides; makeCdsAlignment is expected to return two CDS sequences
// (GGGCCCTTTGGG and GGGCCTGGG) and mappings that resolve each peptide residue
// to a consecutive 3-base range of its CDS.
// NOTE(review): trailing-comma statements without continuation lines, and the
// absence of braces, suggest short lines were lost from this copy - confirm
// against the upstream file before editing.
1481 * Test the method that extracts the cds-only part of a dna alignment, for the
1482 * case where the cds should be aligned to match its nucleotide sequence.
1484 @Test(groups = { "Functional" })
1485 public void testMakeCdsAlignment_alternativeTranscripts()
1487 SequenceI dna1 = new Sequence("dna1", "aaaGGGCC-----CTTTaaaGGG");
1488 // alternative transcript of same dna skips CCC codon
1489 SequenceI dna2 = new Sequence("dna2", "aaaGGGCC-----cttTaaaGGG");
1490 // dna3 has no mapping (protein product) so should be ignored here
1491 SequenceI dna3 = new Sequence("dna3", "aaaGGGCCCCCGGGcttTaaaGGG");
1492 SequenceI pep1 = new Sequence("pep1", "GPFG");
1493 SequenceI pep2 = new Sequence("pep2", "GPG");
1494 dna1.createDatasetSequence();
1495 dna2.createDatasetSequence();
1496 dna3.createDatasetSequence();
1497 pep1.createDatasetSequence();
1498 pep2.createDatasetSequence();
// feature positions are ungapped sequence positions
// NOTE(review): the last constructor argument of each feature is not visible here
1499 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 8, 0f,
1501 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 9, 12, 0f,
1503 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds3", 16, 18, 0f,
1505 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 4, 8, 0f,
1507 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 12, 12, 0f,
1509 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 16, 18, 0f,
// NOTE(review): no visible statement adds either acf to 'mappings', and the
// second MapList constructor call is cut off after its first argument.
1512 List<AlignedCodonFrame> mappings = new ArrayList<AlignedCodonFrame>();
1513 MapList map = new MapList(new int[] { 4, 12, 16, 18 },
1514 new int[] { 1, 4 }, 3, 1);
1515 AlignedCodonFrame acf = new AlignedCodonFrame();
1516 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1518 map = new MapList(new int[] { 4, 8, 12, 12, 16, 18 },
1521 acf = new AlignedCodonFrame();
1522 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
1525 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
1526 dna.setDataset(null);
1527 AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
1528 dna1, dna2, dna3 }, mappings, dna);
// dna3 contributes no CDS sequence, so two are expected
1529 List<SequenceI> cdsSeqs = cds.getSequences();
1530 assertEquals(2, cdsSeqs.size());
1531 assertEquals("GGGCCCTTTGGG", cdsSeqs.get(0).getSequenceAsString());
1532 assertEquals("GGGCCTGGG", cdsSeqs.get(1).getSequenceAsString());
1535 * verify shared, extended alignment dataset
1537 assertSame(dna.getDataset(), cds.getDataset());
1538 assertTrue(dna.getDataset().getSequences()
1539 .contains(cdsSeqs.get(0).getDatasetSequence()));
1540 assertTrue(dna.getDataset().getSequences()
1541 .contains(cdsSeqs.get(1).getDatasetSequence()));
1544 * Verify updated mappings
1546 List<AlignedCodonFrame> cdsMappings = cds.getCodonFrames();
1547 assertEquals(2, cdsMappings.size());
// NOTE(review): the first CDS is asserted above to be GGGCCCTTTGGG, so the
// "GGGTTT" in the next line looks like a stale description.
1550 * Mapping from pep1 to GGGTTT in first new CDS sequence
1552 List<AlignedCodonFrame> pep1Mapping = MappingUtils
1553 .findMappingsForSequence(pep1, cdsMappings);
1554 assertEquals(1, pep1Mapping.size());
1556 * maps GPFG to 1-3,4-6,7-9,10-12
// NOTE(review): the second argument of the two assertEquals(cds.getSequenceAt(n)...)
// calls below is not visible in this copy
1558 SearchResults sr = MappingUtils
1559 .buildSearchResults(pep1, 1, cdsMappings);
1560 assertEquals(1, sr.getResults().size());
1561 Match m = sr.getResults().get(0);
1562 assertEquals(cds.getSequenceAt(0).getDatasetSequence(),
1564 assertEquals(1, m.getStart());
1565 assertEquals(3, m.getEnd());
1566 sr = MappingUtils.buildSearchResults(pep1, 2, cdsMappings);
1567 m = sr.getResults().get(0);
1568 assertEquals(4, m.getStart());
1569 assertEquals(6, m.getEnd());
1570 sr = MappingUtils.buildSearchResults(pep1, 3, cdsMappings);
1571 m = sr.getResults().get(0);
1572 assertEquals(7, m.getStart());
1573 assertEquals(9, m.getEnd());
1574 sr = MappingUtils.buildSearchResults(pep1, 4, cdsMappings);
1575 m = sr.getResults().get(0);
1576 assertEquals(10, m.getStart());
1577 assertEquals(12, m.getEnd());
1580 * GPG in pep2 map to 1-3,4-6,7-9 in second CDS sequence
1582 List<AlignedCodonFrame> pep2Mapping = MappingUtils
1583 .findMappingsForSequence(pep2, cdsMappings);
1584 assertEquals(1, pep2Mapping.size());
1585 sr = MappingUtils.buildSearchResults(pep2, 1, cdsMappings);
1586 assertEquals(1, sr.getResults().size());
1587 m = sr.getResults().get(0);
1588 assertEquals(cds.getSequenceAt(1).getDatasetSequence(),
1590 assertEquals(1, m.getStart());
1591 assertEquals(3, m.getEnd());
1592 sr = MappingUtils.buildSearchResults(pep2, 2, cdsMappings);
1593 m = sr.getResults().get(0);
1594 assertEquals(4, m.getStart());
1595 assertEquals(6, m.getEnd());
1596 sr = MappingUtils.buildSearchResults(pep2, 3, cdsMappings);
1597 m = sr.getResults().get(0);
1598 assertEquals(7, m.getStart());
1599 assertEquals(9, m.getEnd());
1603 * Test the method that realigns protein to match mapped codon alignment.
1605 @Test(groups = { "Functional" })
1606 public void testAlignProteinAsDna_incompleteStartCodon()
1608 // seq1: incomplete start codon (not mapped), then [3, 11]
1609 SequenceI dna1 = new Sequence("Seq1", "ccAAA-TTT-GGG-");
1610 // seq2 codons are [4, 5], [8, 11]
1611 SequenceI dna2 = new Sequence("Seq2", "ccaAA-ttT-GGG-");
1612 // seq3 incomplete start codon at 'tt'
1613 SequenceI dna3 = new Sequence("Seq3", "ccaaa-ttt-GGG-");
1614 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
1615 dna.setDataset(null);
1617 // prot1 has 'X' for incomplete start codon (not mapped)
1618 SequenceI prot1 = new Sequence("Seq1", "XKFG"); // X for incomplete start
1619 SequenceI prot2 = new Sequence("Seq2", "NG");
1620 SequenceI prot3 = new Sequence("Seq3", "XG"); // X for incomplete start
1621 AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
1623 protein.setDataset(null);
1625 // map dna1 [3, 11] to prot1 [2, 4] KFG
1626 MapList map = new MapList(new int[] { 3, 11 }, new int[] { 2, 4 }, 3, 1);
1627 AlignedCodonFrame acf = new AlignedCodonFrame();
1628 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
1630 // map dna2 [4, 5] [8, 11] to prot2 [1, 2] NG
1631 map = new MapList(new int[] { 4, 5, 8, 11 }, new int[] { 1, 2 }, 3, 1);
1632 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
1634 // map dna3 [9, 11] to prot3 [2, 2] G
1635 map = new MapList(new int[] { 9, 11 }, new int[] { 2, 2 }, 3, 1);
1636 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
1638 ArrayList<AlignedCodonFrame> acfs = new ArrayList<AlignedCodonFrame>();
1640 protein.setCodonFrames(acfs);
1643 * verify X is included in the aligned proteins, and placed just
1644 * before the first mapped residue
1645 * CCT is between CCC and TTT
1647 AlignmentUtils.alignProteinAsDna(protein, dna);
1648 assertEquals("XK-FG", prot1.getSequenceAsString());
1649 assertEquals("--N-G", prot2.getSequenceAsString());
1650 assertEquals("---XG", prot3.getSequenceAsString());
1654 * Tests for the method that maps the subset of a dna sequence that has CDS
1655 * (or subtype) feature - case where the start codon is incomplete.
1657 @Test(groups = "Functional")
1658 public void testFindCdsPositions_fivePrimeIncomplete()
1660 SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
1661 dnaSeq.createDatasetSequence();
1662 SequenceI ds = dnaSeq.getDatasetSequence();
1664 // CDS for dna 5-6 (incomplete codon), 7-9
1665 SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
1666 sf.setPhase("2"); // skip 2 bases to start of next codon
1667 ds.addSequenceFeature(sf);
1668 // CDS for dna 13-15
1669 sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
1670 ds.addSequenceFeature(sf);
1672 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1675 * check the mapping starts with the first complete codon
1677 assertEquals(6, MappingUtils.getLength(ranges));
1678 assertEquals(2, ranges.size());
1679 assertEquals(7, ranges.get(0)[0]);
1680 assertEquals(9, ranges.get(0)[1]);
1681 assertEquals(13, ranges.get(1)[0]);
1682 assertEquals(15, ranges.get(1)[1]);
// Purpose: adds a "CDS_predicted" feature (10-12), a "CDS" feature (4-6) and an
// "exon" feature (7-9) to a dataset sequence - deliberately out of positional
// order - and expects findCdsPositions to return the two CDS ranges in
// ascending order, ignoring the exon.
1686 * Tests for the method that maps the subset of a dna sequence that has CDS
1687 * (or subtype) feature.
1689 @Test(groups = "Functional")
1690 public void testFindCdsPositions()
1692 SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
1693 dnaSeq.createDatasetSequence();
1694 SequenceI ds = dnaSeq.getDatasetSequence();
1696 // CDS for dna 10-12
// NOTE(review): the constructor call below ends in a trailing comma, and
// original line numbers are skipped before each addSequenceFeature - the
// remaining arguments (and possibly further setup on sf) are not visible here.
1697 SequenceFeature sf = new SequenceFeature("CDS_predicted", "", 10, 12,
1700 ds.addSequenceFeature(sf);
1702 sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
1704 ds.addSequenceFeature(sf);
1705 // exon feature should be ignored here
1706 sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
1707 ds.addSequenceFeature(sf);
1709 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
// NOTE(review): the assertions below expect the second range as [10, 12], so
// "[12-10]" in the next line looks stale.
1711 * verify ranges { [4-6], [12-10] }
1712 * note CDS ranges are ordered ascending even if the CDS
1715 assertEquals(6, MappingUtils.getLength(ranges));
1716 assertEquals(2, ranges.size());
1717 assertEquals(4, ranges.get(0)[0]);
1718 assertEquals(6, ranges.get(0)[1]);
1719 assertEquals(10, ranges.get(1)[0]);
1720 assertEquals(12, ranges.get(1)[1]);
1724 * Test the method that computes a map of codon variants for each protein
1725 * position from "sequence_variant" features on dna
1727 @Test(groups = "Functional")
1728 public void testBuildDnaVariantsMap()
1730 SequenceI dna = new Sequence("dna", "atgAAATTTGGGCCCtag");
1731 MapList map = new MapList(new int[] { 1, 18 }, new int[] { 1, 5 }, 3, 1);
1734 * first with no variants on dna
1736 LinkedHashMap<Integer, String[][]> variantsMap = AlignmentUtils
1737 .buildDnaVariantsMap(dna, map);
1738 assertTrue(variantsMap.isEmpty());
1740 // single allele codon 1, on base 1
1741 SequenceFeature sf = new SequenceFeature("sequence_variant", "", 1, 1,
1743 sf.setValue("alleles", "T");
1744 dna.addSequenceFeature(sf);
1746 // two alleles codon 2, on bases 2 and 3
1747 sf = new SequenceFeature("sequence_variant", "", 5, 5, 0f, null);
1748 sf.setValue("alleles", "T");
1749 dna.addSequenceFeature(sf);
1750 sf = new SequenceFeature("sequence_variant", "", 6, 6, 0f, null);
1751 sf.setValue("alleles", "G");
1752 dna.addSequenceFeature(sf);
1754 // two alleles codon 3, both on base 2
1755 sf = new SequenceFeature("sequence_variant", "", 8, 8, 0f, null);
1756 sf.setValue("alleles", "C, G");
1757 dna.addSequenceFeature(sf);
1759 // no alleles on codon 4
1760 // alleles on codon 5 on all 3 bases
1761 sf = new SequenceFeature("sequence_variant", "", 13, 13, 0f, null);
1762 sf.setValue("alleles", "C, G"); // (C duplicates given base value)
1763 dna.addSequenceFeature(sf);
1764 sf = new SequenceFeature("sequence_variant", "", 14, 14, 0f, null);
1765 sf.setValue("alleles", "g, a"); // should force to upper-case
1766 dna.addSequenceFeature(sf);
1767 sf = new SequenceFeature("sequence_variant", "", 15, 15, 0f, null);
1768 sf.setValue("alleles", "A, T");
1769 dna.addSequenceFeature(sf);
1771 variantsMap = AlignmentUtils.buildDnaVariantsMap(dna, map);
1772 assertEquals(4, variantsMap.size());
1773 assertTrue(Arrays.deepEquals(new String[][] { { "A", "T" }, { "T" },
1774 { "G" } }, variantsMap.get(1)));
1775 assertTrue(Arrays.deepEquals(new String[][] { { "A" }, { "A", "T" },
1776 { "A", "G" } }, variantsMap.get(2)));
1777 assertTrue(Arrays.deepEquals(new String[][] { { "T" },
1778 { "T", "C", "G" }, { "T" } }, variantsMap.get(3)));
1779 // duplicated bases are not removed here, handled in computePeptideVariants
1780 assertTrue(Arrays.deepEquals(new String[][] { { "C", "C", "G" },
1781 { "C", "G", "A" }, { "C", "A", "T" } }, variantsMap.get(5)));
1785 * Tests for the method that computes all peptide variants given codon
1788 @Test(groups = "Functional")
1789 public void testComputePeptideVariants()
1791 String[][] codonVariants = new String[][] { { "A" }, { "G" }, { "T" } };
1794 * AGT codes for S - this is not included in the variants returned
1796 List<String> variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
1797 assertEquals("[]", variants.toString());
1799 // S is reported if it differs from the current value (A):
1800 variants = AlignmentUtils.computePeptideVariants(codonVariants, "A");
1801 assertEquals("[S]", variants.toString());
1804 * synonymous variant is not reported
1806 codonVariants = new String[][] { { "A" }, { "G" }, { "C", "T" } };
1807 // AGC and AGT both code for S
1808 variants = AlignmentUtils.computePeptideVariants(codonVariants, "s");
1809 assertEquals("[]", variants.toString());
1812 * equivalent variants are only reported once
1814 codonVariants = new String[][] { { "C" }, { "T" },
1815 { "A", "C", "G", "T" } };
1816 // CTA CTC CTG CTT all code for L
1817 variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
1818 assertEquals("[L]", variants.toString());
1821 * vary codons 1 and 2; variant products are sorted and non-redundant
1823 codonVariants = new String[][] { { "a", "C" }, { "g", "T" }, { "A" } };
1824 // aga ata cga cta code for R, I, R, L
1825 variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
1826 assertEquals("[I, L, R]", variants.toString());
1829 * vary codons 2 and 3
1831 codonVariants = new String[][] { { "a" }, { "g", "T" }, { "A", "c" } };
1832 // aga agc ata atc code for R, S, I, I
1833 variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
1834 assertEquals("[I, R]", variants.toString());
1837 * vary codons 1 and 3
1839 codonVariants = new String[][] { { "a", "t" }, { "a" }, { "t", "g" } };
1840 // aat aag tat tag code for N, K, Y, STOP - STOP sorted to end
1841 variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
1842 assertEquals("[K, N, Y, STOP]", variants.toString());
1845 * vary codons 1, 2 and 3
1847 codonVariants = new String[][] { { "a", "t" }, { "G", "C" },
1849 // agt agg act acg tgt tgg tct tcg code for S, R, T, T, C, W, S, S
1850 variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
1851 assertEquals("[C, R, T, W]", variants.toString());
1855 * Tests for the method that maps the subset of a dna sequence that has CDS
1856 * (or subtype) feature, with CDS strand = '-' (reverse)
1858 // test turned off as currently findCdsPositions is not strand-dependent
1859 // left in case it comes around again...
1860 @Test(groups = "Functional", enabled = false)
1861 public void testFindCdsPositions_reverseStrand()
1863 SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
1864 dnaSeq.createDatasetSequence();
1865 SequenceI ds = dnaSeq.getDatasetSequence();
1868 SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
1870 ds.addSequenceFeature(sf);
1871 // exon feature should be ignored here
1872 sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
1873 ds.addSequenceFeature(sf);
1874 // CDS for dna 10-12
1875 sf = new SequenceFeature("CDS_predicted", "", 10, 12, 0f, null);
1877 ds.addSequenceFeature(sf);
1879 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1881 * verify ranges { [12-10], [6-4] }
1883 assertEquals(6, MappingUtils.getLength(ranges));
1884 assertEquals(2, ranges.size());
1885 assertEquals(12, ranges.get(0)[0]);
1886 assertEquals(10, ranges.get(0)[1]);
1887 assertEquals(6, ranges.get(1)[0]);
1888 assertEquals(4, ranges.get(1)[1]);
1892 * Tests for the method that maps the subset of a dna sequence that has CDS
1893 * (or subtype) feature - reverse strand case where the start codon is
1896 @Test(groups = "Functional", enabled = false)
1897 // test turned off as currently findCdsPositions is not strand-dependent
1898 // left in case it comes around again...
1899 public void testFindCdsPositions_reverseStrandThreePrimeIncomplete()
1901 SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
1902 dnaSeq.createDatasetSequence();
1903 SequenceI ds = dnaSeq.getDatasetSequence();
1906 SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
1908 ds.addSequenceFeature(sf);
1909 // CDS for dna 13-15
1910 sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
1912 sf.setPhase("2"); // skip 2 bases to start of next codon
1913 ds.addSequenceFeature(sf);
1915 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1918 * check the mapping starts with the first complete codon
1919 * expect ranges [13, 13], [9, 5]
1921 assertEquals(6, MappingUtils.getLength(ranges));
1922 assertEquals(2, ranges.size());
1923 assertEquals(13, ranges.get(0)[0]);
1924 assertEquals(13, ranges.get(0)[1]);
1925 assertEquals(9, ranges.get(1)[0]);
1926 assertEquals(5, ranges.get(1)[1]);
1929 @Test(groups = "Functional")
1930 public void testAlignAs_alternateTranscriptsUngapped()
1932 SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa");
1933 SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA");
1934 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
1935 ((Alignment) dna).createDatasetAlignment();
1936 SequenceI cds1 = new Sequence("cds1", "GGGTTT");
1937 SequenceI cds2 = new Sequence("cds2", "CCCAAA");
1938 AlignmentI cds = new Alignment(new SequenceI[] { cds1, cds2 });
1939 ((Alignment) cds).createDatasetAlignment();
1941 AlignedCodonFrame acf = new AlignedCodonFrame();
1942 MapList map = new MapList(new int[] { 4, 9 }, new int[] { 1, 6 }, 1, 1);
1943 acf.addMap(dna1.getDatasetSequence(), cds1.getDatasetSequence(), map);
1944 map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 6 }, 1, 1);
1945 acf.addMap(dna2.getDatasetSequence(), cds2.getDatasetSequence(), map);
1948 * verify CDS alignment is as:
1949 * cccGGGTTTaaa (cdna)
1950 * CCCgggtttAAA (cdna)
1952 * ---GGGTTT--- (cds)
1953 * CCC------AAA (cds)
1955 dna.addCodonFrame(acf);
1956 AlignmentUtils.alignAs(cds, dna);
1957 assertEquals("---GGGTTT", cds.getSequenceAt(0).getSequenceAsString());
1958 assertEquals("CCC------AAA", cds.getSequenceAt(1).getSequenceAsString());
1961 @Test(groups = { "Functional" })
1962 public void testAddMappedPositions()
1964 SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g");
1965 SequenceI seq1 = new Sequence("cds", "AAATTT");
1966 from.createDatasetSequence();
1967 seq1.createDatasetSequence();
1968 Mapping mapping = new Mapping(seq1, new MapList(
1969 new int[] { 3, 6, 9, 10 },
1970 new int[] { 1, 6 }, 1, 1));
1971 Map<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
1972 AlignmentUtils.addMappedPositions(seq1, from, mapping, map);
1975 * verify map has seq1 residues in columns 3,4,6,7,11,12
1977 assertEquals(6, map.size());
1978 assertEquals('A', map.get(3).get(seq1).charValue());
1979 assertEquals('A', map.get(4).get(seq1).charValue());
1980 assertEquals('A', map.get(6).get(seq1).charValue());
1981 assertEquals('T', map.get(7).get(seq1).charValue());
1982 assertEquals('T', map.get(11).get(seq1).charValue());
1983 assertEquals('T', map.get(12).get(seq1).charValue());
1991 * Test case where the mapping 'from' range includes a stop codon which is
1992 * absent in the 'to' range
1994 @Test(groups = { "Functional" })
1995 public void testAddMappedPositions_withStopCodon()
1997 SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g");
1998 SequenceI seq1 = new Sequence("cds", "AAATTT");
1999 from.createDatasetSequence();
2000 seq1.createDatasetSequence();
2001 Mapping mapping = new Mapping(seq1, new MapList(
2002 new int[] { 3, 6, 9, 10 },
2003 new int[] { 1, 6 }, 1, 1));
2004 Map<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
2005 AlignmentUtils.addMappedPositions(seq1, from, mapping, map);
2008 * verify map has seq1 residues in columns 3,4,6,7,11,12
2010 assertEquals(6, map.size());
2011 assertEquals('A', map.get(3).get(seq1).charValue());
2012 assertEquals('A', map.get(4).get(seq1).charValue());
2013 assertEquals('A', map.get(6).get(seq1).charValue());
2014 assertEquals('T', map.get(7).get(seq1).charValue());
2015 assertEquals('T', map.get(11).get(seq1).charValue());
2016 assertEquals('T', map.get(12).get(seq1).charValue());