2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import static org.testng.AssertJUnit.assertEquals;
24 import static org.testng.AssertJUnit.assertFalse;
25 import static org.testng.AssertJUnit.assertNull;
26 import static org.testng.AssertJUnit.assertSame;
27 import static org.testng.AssertJUnit.assertTrue;
29 import jalview.datamodel.AlignedCodonFrame;
30 import jalview.datamodel.Alignment;
31 import jalview.datamodel.AlignmentAnnotation;
32 import jalview.datamodel.AlignmentI;
33 import jalview.datamodel.Annotation;
34 import jalview.datamodel.DBRefEntry;
35 import jalview.datamodel.Mapping;
36 import jalview.datamodel.SearchResults;
37 import jalview.datamodel.SearchResults.Match;
38 import jalview.datamodel.Sequence;
39 import jalview.datamodel.SequenceFeature;
40 import jalview.datamodel.SequenceI;
41 import jalview.io.AppletFormatAdapter;
42 import jalview.io.FormatAdapter;
43 import jalview.util.MapList;
44 import jalview.util.MappingUtils;
46 import java.io.IOException;
47 import java.util.ArrayList;
48 import java.util.Arrays;
49 import java.util.HashSet;
50 import java.util.Iterator;
51 import java.util.LinkedHashMap;
52 import java.util.List;
56 import org.testng.annotations.Test;
58 public class AlignmentUtilsTests
61 private static final String TEST_DATA =
63 "#=GS D.melanogaster.1 AC AY119185.1/838-902\n" +
64 "#=GS D.melanogaster.2 AC AC092237.1/57223-57161\n" +
65 "#=GS D.melanogaster.3 AC AY060611.1/560-627\n" +
66 "D.melanogaster.1 G.AGCC.CU...AUGAUCGA\n" +
67 "#=GR D.melanogaster.1 SS ................((((\n" +
68 "D.melanogaster.2 C.AUUCAACU.UAUGAGGAU\n" +
69 "#=GR D.melanogaster.2 SS ................((((\n" +
70 "D.melanogaster.3 G.UGGCGCU..UAUGACGCA\n" +
71 "#=GR D.melanogaster.3 SS (.(((...(....(((((((\n" +
74 private static final String AA_SEQS_1 =
80 private static final String CDNA_SEQS_1 =
82 "AC-GG--CUC-CAA-CT\n" +
84 "-CG-TTA--ACG---AAGT\n";
86 private static final String CDNA_SEQS_2 =
93 // public static Sequence ts=new
94 // Sequence("short","ASDASDASDASDASDASDASDASDASDASDASDASDASD");
95 public static Sequence ts = new Sequence("short",
96 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklm");
98 @Test(groups = { "Functional" })
99 public void testExpandContext()
101 AlignmentI al = new Alignment(new Sequence[] {});
102 for (int i = 4; i < 14; i += 2)
104 SequenceI s1 = ts.deriveSequence().getSubSequence(i, i + 7);
107 System.out.println(new AppletFormatAdapter().formatSequences("Clustal",
109 for (int flnk = -1; flnk < 25; flnk++)
111 AlignmentI exp = AlignmentUtils.expandContext(al, flnk);
112 System.out.println("\nFlank size: " + flnk);
113 System.out.println(new AppletFormatAdapter().formatSequences(
114 "Clustal", exp, true));
118 * Full expansion to complete sequences
120 for (SequenceI sq : exp.getSequences())
122 String ung = sq.getSequenceAsString().replaceAll("-+", "");
123 final String errorMsg = "Flanking sequence not the same as original dataset sequence.\n"
126 + sq.getDatasetSequence().getSequenceAsString();
127 assertTrue(errorMsg, ung.equalsIgnoreCase(sq.getDatasetSequence()
128 .getSequenceAsString()));
134 * Last sequence is fully expanded, others have leading gaps to match
136 assertTrue(exp.getSequenceAt(4).getSequenceAsString()
138 assertTrue(exp.getSequenceAt(3).getSequenceAsString()
139 .startsWith("--abc"));
140 assertTrue(exp.getSequenceAt(2).getSequenceAsString()
141 .startsWith("----abc"));
142 assertTrue(exp.getSequenceAt(1).getSequenceAsString()
143 .startsWith("------abc"));
144 assertTrue(exp.getSequenceAt(0).getSequenceAsString()
145 .startsWith("--------abc"));
151 * Test that annotations are correctly adjusted by expandContext
153 @Test(groups = { "Functional" })
154 public void testExpandContext_annotation()
156 AlignmentI al = new Alignment(new Sequence[] {});
157 SequenceI ds = new Sequence("Seq1", "ABCDEFGHI");
159 SequenceI seq1 = ds.deriveSequence().getSubSequence(3, 6);
160 al.addSequence(seq1);
163 * Annotate DEF with 4/5/6 respectively
165 Annotation[] anns = new Annotation[] { new Annotation(4),
166 new Annotation(5), new Annotation(6) };
167 AlignmentAnnotation ann = new AlignmentAnnotation("SS",
168 "secondary structure", anns);
169 seq1.addAlignmentAnnotation(ann);
172 * The annotations array should match aligned positions
174 assertEquals(3, ann.annotations.length);
175 assertEquals(4, ann.annotations[0].value, 0.001);
176 assertEquals(5, ann.annotations[1].value, 0.001);
177 assertEquals(6, ann.annotations[2].value, 0.001);
180 * Check annotation to sequence position mappings before expanding the
181 * sequence; these are set up in Sequence.addAlignmentAnnotation ->
182 * Annotation.setSequenceRef -> createSequenceMappings
184 assertNull(ann.getAnnotationForPosition(1));
185 assertNull(ann.getAnnotationForPosition(2));
186 assertNull(ann.getAnnotationForPosition(3));
187 assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
188 assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
189 assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
190 assertNull(ann.getAnnotationForPosition(7));
191 assertNull(ann.getAnnotationForPosition(8));
192 assertNull(ann.getAnnotationForPosition(9));
195 * Expand the subsequence to the full sequence abcDEFghi
197 AlignmentI expanded = AlignmentUtils.expandContext(al, -1);
198 assertEquals("abcDEFghi", expanded.getSequenceAt(0)
199 .getSequenceAsString());
202 * Confirm the alignment and sequence have the same SS annotation,
203 * referencing the expanded sequence
205 ann = expanded.getSequenceAt(0).getAnnotation()[0];
206 assertSame(ann, expanded.getAlignmentAnnotation()[0]);
207 assertSame(expanded.getSequenceAt(0), ann.sequenceRef);
210 * The annotations array should have null values except for annotated
213 assertNull(ann.annotations[0]);
214 assertNull(ann.annotations[1]);
215 assertNull(ann.annotations[2]);
216 assertEquals(4, ann.annotations[3].value, 0.001);
217 assertEquals(5, ann.annotations[4].value, 0.001);
218 assertEquals(6, ann.annotations[5].value, 0.001);
219 assertNull(ann.annotations[6]);
220 assertNull(ann.annotations[7]);
221 assertNull(ann.annotations[8]);
224 * sequence position mappings should be unchanged
226 assertNull(ann.getAnnotationForPosition(1));
227 assertNull(ann.getAnnotationForPosition(2));
228 assertNull(ann.getAnnotationForPosition(3));
229 assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
230 assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
231 assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
232 assertNull(ann.getAnnotationForPosition(7));
233 assertNull(ann.getAnnotationForPosition(8));
234 assertNull(ann.getAnnotationForPosition(9));
238 * Test method that returns a map of lists of sequences by sequence name.
240 * @throws IOException
242 @Test(groups = { "Functional" })
243 public void testGetSequencesByName() throws IOException
245 final String data = ">Seq1Name\nKQYL\n" + ">Seq2Name\nRFPW\n"
246 + ">Seq1Name\nABCD\n";
247 AlignmentI al = loadAlignment(data, "FASTA");
248 Map<String, List<SequenceI>> map = AlignmentUtils
249 .getSequencesByName(al);
250 assertEquals(2, map.keySet().size());
251 assertEquals(2, map.get("Seq1Name").size());
252 assertEquals("KQYL", map.get("Seq1Name").get(0).getSequenceAsString());
253 assertEquals("ABCD", map.get("Seq1Name").get(1).getSequenceAsString());
254 assertEquals(1, map.get("Seq2Name").size());
255 assertEquals("RFPW", map.get("Seq2Name").get(0).getSequenceAsString());
259 * Helper method to load an alignment and ensure dataset sequences are set up.
265 * @throws IOException
267 protected AlignmentI loadAlignment(final String data, String format)
270 AlignmentI a = new FormatAdapter().readFile(data,
271 AppletFormatAdapter.PASTE, format);
277 * Test mapping of protein to cDNA, for the case where we have no sequence
278 * cross-references, so mappings are made first-served 1-1 where sequences
281 * @throws IOException
283 @Test(groups = { "Functional" })
284 public void testMapProteinAlignmentToCdna_noXrefs() throws IOException
286 List<SequenceI> protseqs = new ArrayList<SequenceI>();
287 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
288 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
289 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
290 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
291 protein.setDataset(null);
293 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
294 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
295 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAA")); // = EIQ
296 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
297 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
298 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
299 cdna.setDataset(null);
301 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
303 // 3 mappings made, each from 1 to 1 sequence
304 assertEquals(3, protein.getCodonFrames().size());
305 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
306 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
307 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
309 // V12345 mapped to A22222
310 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
312 assertEquals(1, acf.getdnaSeqs().length);
313 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
314 acf.getdnaSeqs()[0]);
315 Mapping[] protMappings = acf.getProtMappings();
316 assertEquals(1, protMappings.length);
317 MapList mapList = protMappings[0].getMap();
318 assertEquals(3, mapList.getFromRatio());
319 assertEquals(1, mapList.getToRatio());
320 assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
322 assertEquals(1, mapList.getFromRanges().size());
323 assertTrue(Arrays.equals(new int[] { 1, 3 },
324 mapList.getToRanges().get(0)));
325 assertEquals(1, mapList.getToRanges().size());
327 // V12346 mapped to A33333
328 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
329 assertEquals(1, acf.getdnaSeqs().length);
330 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
331 acf.getdnaSeqs()[0]);
333 // V12347 mapped to A11111
334 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
335 assertEquals(1, acf.getdnaSeqs().length);
336 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
337 acf.getdnaSeqs()[0]);
339 // no mapping involving the 'extra' A44444
340 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
344 * Test for the alignSequenceAs method that takes two sequences and a mapping.
346 @Test(groups = { "Functional" })
347 public void testAlignSequenceAs_withMapping_noIntrons()
349 MapList map = new MapList(new int[] { 1, 6 }, new int[] { 1, 2 }, 3, 1);
352 * No existing gaps in dna:
354 checkAlignSequenceAs("GGGAAA", "-A-L-", false, false, map,
358 * Now introduce gaps in dna but ignore them when realigning.
360 checkAlignSequenceAs("-G-G-G-A-A-A-", "-A-L-", false, false, map,
364 * Now include gaps in dna when realigning. First retaining 'mapped' gaps
365 * only, i.e. those within the exon region.
367 checkAlignSequenceAs("-G-G--G-A--A-A-", "-A-L-", true, false, map,
368 "---G-G--G---A--A-A");
371 * Include all gaps in dna when realigning (within and without the exon
372 * region). The leading gap, and the gaps between codons, are subsumed by
373 * the protein alignment gap.
375 checkAlignSequenceAs("-G-GG--AA-A---", "-A-L-", true, true, map,
376 "---G-GG---AA-A---");
379 * Include only unmapped gaps in dna when realigning (outside the exon
380 * region). The leading gap, and the gaps between codons, are subsumed by
381 * the protein alignment gap.
383 checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", false, true, map,
388 * Test for the alignSequenceAs method that takes two sequences and a mapping.
390 @Test(groups = { "Functional" })
391 public void testAlignSequenceAs_withMapping_withIntrons()
394 * Exons at codon 2 (AAA) and 4 (TTT)
396 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
397 new int[] { 1, 2 }, 3, 1);
400 * Simple case: no gaps in dna
402 checkAlignSequenceAs("GGGAAACCCTTTGGG", "--A-L-", false, false, map,
403 "GGG---AAACCCTTTGGG");
406 * Add gaps to dna - but ignore when realigning.
408 checkAlignSequenceAs("-G-G-G--A--A---AC-CC-T-TT-GG-G-", "--A-L-",
409 false, false, map, "GGG---AAACCCTTTGGG");
412 * Add gaps to dna - include within exons only when realigning.
414 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
415 true, false, map, "GGG---A--A---ACCCT-TTGGG");
418 * Include gaps outside exons only when realigning.
420 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
421 false, true, map, "-G-G-GAAAC-CCTTT-GG-G-");
424 * Include gaps following first intron if we are 'preserving mapped gaps'
426 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
427 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
430 * Include all gaps in dna when realigning.
432 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
433 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
437 * Test for the case where not all of the protein sequence is mapped to cDNA.
439 @Test(groups = { "Functional" })
440 public void testAlignSequenceAs_withMapping_withUnmappedProtein()
443 * Exons at codon 2 (AAA) and 4 (TTT) mapped to A and P
445 final MapList map = new MapList(new int[] { 4, 6, 10, 12 }, new int[] {
449 * -L- 'aligns' ccc------
451 checkAlignSequenceAs("gggAAAcccTTTggg", "-A-L-P-", false, false, map,
452 "gggAAAccc------TTTggg");
456 * Helper method that performs and verifies the method under test.
459 * the sequence to be realigned
461 * the sequence whose alignment is to be copied
462 * @param preserveMappedGaps
463 * @param preserveUnmappedGaps
467 protected void checkAlignSequenceAs(final String alignee,
468 final String alignModel, final boolean preserveMappedGaps,
469 final boolean preserveUnmappedGaps, MapList map,
470 final String expected)
472 SequenceI alignMe = new Sequence("Seq1", alignee);
473 alignMe.createDatasetSequence();
474 SequenceI alignFrom = new Sequence("Seq2", alignModel);
475 alignFrom.createDatasetSequence();
476 AlignedCodonFrame acf = new AlignedCodonFrame();
477 acf.addMap(alignMe.getDatasetSequence(), alignFrom.getDatasetSequence(), map);
479 AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "---", '-',
480 preserveMappedGaps, preserveUnmappedGaps);
481 assertEquals(expected, alignMe.getSequenceAsString());
485 * Test for the alignSequenceAs method where we preserve gaps in introns only.
487 @Test(groups = { "Functional" })
488 public void testAlignSequenceAs_keepIntronGapsOnly()
492 * Intron GGGAAA followed by exon CCCTTT
494 MapList map = new MapList(new int[] { 7, 12 }, new int[] { 1, 2 }, 3, 1);
496 checkAlignSequenceAs("GG-G-AA-A-C-CC-T-TT", "AL", false, true, map,
501 * Test for the method that generates an aligned translated sequence from one
504 @Test(groups = { "Functional" })
505 public void testGetAlignedTranslation_dnaLikeProtein()
507 // dna alignment will be replaced
508 SequenceI dna = new Sequence("Seq1", "T-G-CC-A--T-TAC-CAG-");
509 dna.createDatasetSequence();
510 // protein alignment will be 'applied' to dna
511 SequenceI protein = new Sequence("Seq1", "-CH-Y--Q-");
512 protein.createDatasetSequence();
513 MapList map = new MapList(new int[] { 1, 12 }, new int[] { 1, 4 }, 3, 1);
514 AlignedCodonFrame acf = new AlignedCodonFrame();
515 acf.addMap(dna.getDatasetSequence(), protein.getDatasetSequence(), map);
517 final SequenceI aligned = AlignmentUtils.getAlignedTranslation(protein,
519 assertEquals("---TGCCAT---TAC------CAG---",
520 aligned.getSequenceAsString());
521 assertSame(aligned.getDatasetSequence(), dna.getDatasetSequence());
525 * Test the method that realigns protein to match mapped codon alignment.
527 @Test(groups = { "Functional" })
528 public void testAlignProteinAsDna()
530 // seq1 codons are [1,2,3] [4,5,6] [7,8,9] [10,11,12]
531 SequenceI dna1 = new Sequence("Seq1", "TGCCATTACCAG-");
532 // seq2 codons are [1,3,4] [5,6,7] [8,9,10] [11,12,13]
533 SequenceI dna2 = new Sequence("Seq2", "T-GCCATTACCAG");
534 // seq3 codons are [1,2,3] [4,5,7] [8,9,10] [11,12,13]
535 SequenceI dna3 = new Sequence("Seq3", "TGCCA-TTACCAG");
536 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
537 dna.setDataset(null);
539 // protein alignment will be realigned like dna
540 SequenceI prot1 = new Sequence("Seq1", "CHYQ");
541 SequenceI prot2 = new Sequence("Seq2", "CHYQ");
542 SequenceI prot3 = new Sequence("Seq3", "CHYQ");
543 SequenceI prot4 = new Sequence("Seq4", "R-QSV"); // unmapped, unchanged
544 AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
546 protein.setDataset(null);
548 MapList map = new MapList(new int[] { 1, 12 }, new int[] { 1, 4 }, 3, 1);
549 AlignedCodonFrame acf = new AlignedCodonFrame();
550 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
551 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
552 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
553 ArrayList<AlignedCodonFrame> acfs = new ArrayList<AlignedCodonFrame>();
555 protein.setCodonFrames(acfs);
558 * Translated codon order is [1,2,3] [1,3,4] [4,5,6] [4,5,7] [5,6,7] [7,8,9]
559 * [8,9,10] [10,11,12] [11,12,13]
561 AlignmentUtils.alignProteinAsDna(protein, dna);
562 assertEquals("C-H--Y-Q-", prot1.getSequenceAsString());
563 assertEquals("-C--H-Y-Q", prot2.getSequenceAsString());
564 assertEquals("C--H--Y-Q", prot3.getSequenceAsString());
565 assertEquals("R-QSV", prot4.getSequenceAsString());
569 * Test the method that tests whether a CDNA sequence translates to a protein
572 @Test(groups = { "Functional" })
573 public void testTranslatesAs()
575 // null arguments check
576 assertFalse(AlignmentUtils.translatesAs(null, 0, null));
577 assertFalse(AlignmentUtils.translatesAs(new char[] { 't' }, 0, null));
578 assertFalse(AlignmentUtils.translatesAs(null, 0, new char[] { 'a' }));
580 // straight translation
581 assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0,
582 "FPKG".toCharArray()));
583 // with extra start codon (not in protein)
584 assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(),
585 3, "FPKG".toCharArray()));
586 // with stop codon1 (not in protein)
587 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
588 0, "FPKG".toCharArray()));
589 // with stop codon1 (in protein as *)
590 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
591 0, "FPKG*".toCharArray()));
592 // with stop codon2 (not in protein)
593 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtag".toCharArray(),
594 0, "FPKG".toCharArray()));
595 // with stop codon3 (not in protein)
596 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtga".toCharArray(),
597 0, "FPKG".toCharArray()));
598 // with start and stop codon1
599 assertTrue(AlignmentUtils.translatesAs(
600 "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG".toCharArray()));
601 // with start and stop codon1 (in protein as *)
602 assertTrue(AlignmentUtils.translatesAs(
603 "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG*".toCharArray()));
604 // with start and stop codon2
605 assertTrue(AlignmentUtils.translatesAs(
606 "atgtttcccaaagggtag".toCharArray(), 3, "FPKG".toCharArray()));
607 // with start and stop codon3
608 assertTrue(AlignmentUtils.translatesAs(
609 "atgtttcccaaagggtga".toCharArray(), 3, "FPKG".toCharArray()));
611 // with embedded stop codons
612 assertTrue(AlignmentUtils.translatesAs(
613 "atgtttTAGcccaaaTAAgggtga".toCharArray(), 3,
614 "F*PK*G".toCharArray()));
617 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
618 0, "FPMG".toCharArray()));
621 assertFalse(AlignmentUtils.translatesAs("tttcccaaagg".toCharArray(), 0,
622 "FPKG".toCharArray()));
625 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
626 0, "FPK".toCharArray()));
628 // overlong dna (doesn't end in stop codon)
629 assertFalse(AlignmentUtils.translatesAs(
630 "tttcccaaagggttt".toCharArray(), 0, "FPKG".toCharArray()));
632 // dna + stop codon + more
633 assertFalse(AlignmentUtils.translatesAs(
634 "tttcccaaagggttaga".toCharArray(), 0, "FPKG".toCharArray()));
637 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
638 0, "FPKGQ".toCharArray()));
642 * Test mapping of protein to cDNA, for cases where the cDNA has start and/or
643 * stop codons in addition to the protein coding sequence.
645 * @throws IOException
647 @Test(groups = { "Functional" })
648 public void testMapProteinAlignmentToCdna_withStartAndStopCodons()
651 List<SequenceI> protseqs = new ArrayList<SequenceI>();
652 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
653 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
654 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
655 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
656 protein.setDataset(null);
658 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
660 dnaseqs.add(new Sequence("EMBL|A11111", "ATGTCAGCACGC"));
662 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAATAA"));
663 // = start +EIQ + stop
664 dnaseqs.add(new Sequence("EMBL|A33333", "ATGGAAATCCAGTAG"));
665 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG"));
666 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
667 cdna.setDataset(null);
669 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
671 // 3 mappings made, each from 1 to 1 sequence
672 assertEquals(3, protein.getCodonFrames().size());
673 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
674 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
675 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
677 // V12345 mapped from A22222
678 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
680 assertEquals(1, acf.getdnaSeqs().length);
681 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
682 acf.getdnaSeqs()[0]);
683 Mapping[] protMappings = acf.getProtMappings();
684 assertEquals(1, protMappings.length);
685 MapList mapList = protMappings[0].getMap();
686 assertEquals(3, mapList.getFromRatio());
687 assertEquals(1, mapList.getToRatio());
688 assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
690 assertEquals(1, mapList.getFromRanges().size());
691 assertTrue(Arrays.equals(new int[] { 1, 3 },
692 mapList.getToRanges().get(0)));
693 assertEquals(1, mapList.getToRanges().size());
695 // V12346 mapped from A33333 starting position 4
696 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
697 assertEquals(1, acf.getdnaSeqs().length);
698 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
699 acf.getdnaSeqs()[0]);
700 protMappings = acf.getProtMappings();
701 assertEquals(1, protMappings.length);
702 mapList = protMappings[0].getMap();
703 assertEquals(3, mapList.getFromRatio());
704 assertEquals(1, mapList.getToRatio());
705 assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
707 assertEquals(1, mapList.getFromRanges().size());
708 assertTrue(Arrays.equals(new int[] { 1, 3 },
709 mapList.getToRanges().get(0)));
710 assertEquals(1, mapList.getToRanges().size());
712 // V12347 mapped to A11111 starting position 4
713 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
714 assertEquals(1, acf.getdnaSeqs().length);
715 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
716 acf.getdnaSeqs()[0]);
717 protMappings = acf.getProtMappings();
718 assertEquals(1, protMappings.length);
719 mapList = protMappings[0].getMap();
720 assertEquals(3, mapList.getFromRatio());
721 assertEquals(1, mapList.getToRatio());
722 assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
724 assertEquals(1, mapList.getFromRanges().size());
725 assertTrue(Arrays.equals(new int[] { 1, 3 },
726 mapList.getToRanges().get(0)));
727 assertEquals(1, mapList.getToRanges().size());
729 // no mapping involving the 'extra' A44444
730 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
734 * Test mapping of protein to cDNA, for the case where we have some sequence
735 * cross-references. Verify that 1-to-many mappings are made where
736 * cross-references exist and sequences are mappable.
738 * @throws IOException
740 @Test(groups = { "Functional" })
741 public void testMapProteinAlignmentToCdna_withXrefs() throws IOException
743 List<SequenceI> protseqs = new ArrayList<SequenceI>();
744 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
745 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
746 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
747 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
748 protein.setDataset(null);
750 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
751 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
752 dnaseqs.add(new Sequence("EMBL|A22222", "ATGGAGATACAA")); // = start + EIQ
753 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
754 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
755 dnaseqs.add(new Sequence("EMBL|A55555", "GAGATTCAG")); // = EIQ
756 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[5]));
757 cdna.setDataset(null);
759 // Xref A22222 to V12345 (should get mapped)
760 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
761 // Xref V12345 to A44444 (should get mapped)
762 protseqs.get(0).addDBRef(new DBRefEntry("EMBL", "1", "A44444"));
763 // Xref A33333 to V12347 (sequence mismatch - should not get mapped)
764 dnaseqs.get(2).addDBRef(new DBRefEntry("UNIPROT", "1", "V12347"));
765 // as V12345 is mapped to A22222 and A44444, this leaves V12346 unmapped.
766 // it should get paired up with the unmapped A33333
767 // A11111 should be mapped to V12347
768 // A55555 is spare and has no xref so is not mapped
770 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
772 // 4 protein mappings made for 3 proteins, 2 to V12345, 1 each to V12346/7
773 assertEquals(3, protein.getCodonFrames().size());
774 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
775 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
776 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
778 // one mapping for each of the first 4 cDNA sequences
779 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
780 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
781 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(2)).size());
782 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(3)).size());
784 // V12345 mapped to A22222 and A44444
785 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
787 assertEquals(2, acf.getdnaSeqs().length);
788 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
789 acf.getdnaSeqs()[0]);
790 assertEquals(cdna.getSequenceAt(3).getDatasetSequence(),
791 acf.getdnaSeqs()[1]);
793 // V12346 mapped to A33333
794 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
795 assertEquals(1, acf.getdnaSeqs().length);
796 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
797 acf.getdnaSeqs()[0]);
799 // V12347 mapped to A11111
800 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
801 assertEquals(1, acf.getdnaSeqs().length);
802 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
803 acf.getdnaSeqs()[0]);
805 // no mapping involving the 'extra' A55555
806 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(4)).isEmpty());
810 * Test mapping of protein to cDNA, for the case where we have some sequence
811 * cross-references. Verify that once we have made an xref mapping we don't
812 * also map un-xrefd sequeces.
814 * @throws IOException
816 @Test(groups = { "Functional" })
817 public void testMapProteinAlignmentToCdna_prioritiseXrefs()
820 List<SequenceI> protseqs = new ArrayList<SequenceI>();
821 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
822 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
823 AlignmentI protein = new Alignment(
824 protseqs.toArray(new SequenceI[protseqs.size()]));
825 protein.setDataset(null);
827 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
828 dnaseqs.add(new Sequence("EMBL|A11111", "GAAATCCAG")); // = EIQ
829 dnaseqs.add(new Sequence("EMBL|A22222", "GAAATTCAG")); // = EIQ
830 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[dnaseqs
832 cdna.setDataset(null);
834 // Xref A22222 to V12345 (should get mapped)
835 // A11111 should then be mapped to the unmapped V12346
836 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
838 assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
840 // 2 protein mappings made
841 assertEquals(2, protein.getCodonFrames().size());
842 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
843 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
845 // one mapping for each of the cDNA sequences
846 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
847 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
849 // V12345 mapped to A22222
850 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
852 assertEquals(1, acf.getdnaSeqs().length);
853 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
854 acf.getdnaSeqs()[0]);
856 // V12346 mapped to A11111
857 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
858 assertEquals(1, acf.getdnaSeqs().length);
859 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
860 acf.getdnaSeqs()[0]);
864 * Test the method that shows or hides sequence annotations by type(s) and
867 @Test(groups = { "Functional" })
868 public void testShowOrHideSequenceAnnotations()
870 SequenceI seq1 = new Sequence("Seq1", "AAA");
871 SequenceI seq2 = new Sequence("Seq2", "BBB");
872 SequenceI seq3 = new Sequence("Seq3", "CCC");
873 Annotation[] anns = new Annotation[] { new Annotation(2f) };
874 AlignmentAnnotation ann1 = new AlignmentAnnotation("Structure", "ann1",
876 ann1.setSequenceRef(seq1);
877 AlignmentAnnotation ann2 = new AlignmentAnnotation("Structure", "ann2",
879 ann2.setSequenceRef(seq2);
880 AlignmentAnnotation ann3 = new AlignmentAnnotation("Structure", "ann3",
882 AlignmentAnnotation ann4 = new AlignmentAnnotation("Temp", "ann4", anns);
883 ann4.setSequenceRef(seq1);
884 AlignmentAnnotation ann5 = new AlignmentAnnotation("Temp", "ann5", anns);
885 ann5.setSequenceRef(seq2);
886 AlignmentAnnotation ann6 = new AlignmentAnnotation("Temp", "ann6", anns);
887 AlignmentI al = new Alignment(new SequenceI[] { seq1, seq2, seq3 });
888 al.addAnnotation(ann1); // Structure for Seq1
889 al.addAnnotation(ann2); // Structure for Seq2
890 al.addAnnotation(ann3); // Structure for no sequence
891 al.addAnnotation(ann4); // Temp for seq1
892 al.addAnnotation(ann5); // Temp for seq2
893 al.addAnnotation(ann6); // Temp for no sequence
894 List<String> types = new ArrayList<String>();
895 List<SequenceI> scope = new ArrayList<SequenceI>();
898 * Set all sequence related Structure to hidden (ann1, ann2)
900 types.add("Structure");
901 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
903 assertFalse(ann1.visible);
904 assertFalse(ann2.visible);
905 assertTrue(ann3.visible); // not sequence-related, not affected
906 assertTrue(ann4.visible); // not Structure, not affected
907 assertTrue(ann5.visible); // "
908 assertTrue(ann6.visible); // not sequence-related, not affected
911 * Set Temp in {seq1, seq3} to hidden
917 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, false,
919 assertFalse(ann1.visible); // unchanged
920 assertFalse(ann2.visible); // unchanged
921 assertTrue(ann3.visible); // not sequence-related, not affected
922 assertFalse(ann4.visible); // Temp for seq1 hidden
923 assertTrue(ann5.visible); // not in scope, not affected
924 assertTrue(ann6.visible); // not sequence-related, not affected
927 * Set Temp in all sequences to hidden
933 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
935 assertFalse(ann1.visible); // unchanged
936 assertFalse(ann2.visible); // unchanged
937 assertTrue(ann3.visible); // not sequence-related, not affected
938 assertFalse(ann4.visible); // Temp for seq1 hidden
939 assertFalse(ann5.visible); // Temp for seq2 hidden
940 assertTrue(ann6.visible); // not sequence-related, not affected
943 * Set all types in {seq1, seq3} to visible
949 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, true,
951 assertTrue(ann1.visible); // Structure for seq1 set visible
952 assertFalse(ann2.visible); // not in scope, unchanged
953 assertTrue(ann3.visible); // not sequence-related, not affected
954 assertTrue(ann4.visible); // Temp for seq1 set visible
955 assertFalse(ann5.visible); // not in scope, unchanged
956 assertTrue(ann6.visible); // not sequence-related, not affected
959 * Set all types in all scope to hidden
961 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, true,
963 assertFalse(ann1.visible);
964 assertFalse(ann2.visible);
965 assertTrue(ann3.visible); // not sequence-related, not affected
966 assertFalse(ann4.visible);
967 assertFalse(ann5.visible);
968 assertTrue(ann6.visible); // not sequence-related, not affected
972 * Tests for the method that checks if one sequence cross-references another
974 @Test(groups = { "Functional" })
975 public void testHasCrossRef()
977 assertFalse(AlignmentUtils.hasCrossRef(null, null));
978 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
979 assertFalse(AlignmentUtils.hasCrossRef(seq1, null));
980 assertFalse(AlignmentUtils.hasCrossRef(null, seq1));
981 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
982 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
985 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20193"));
986 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
988 // case-insensitive; version number is ignored
989 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20192"));
990 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
993 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
994 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
995 // test is one-way only
996 assertFalse(AlignmentUtils.hasCrossRef(seq2, seq1));
1000 * Tests for the method that checks if either sequence cross-references the
1003 @Test(groups = { "Functional" })
1004 public void testHaveCrossRef()
1006 assertFalse(AlignmentUtils.hasCrossRef(null, null));
1007 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
1008 assertFalse(AlignmentUtils.haveCrossRef(seq1, null));
1009 assertFalse(AlignmentUtils.haveCrossRef(null, seq1));
1010 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
1011 assertFalse(AlignmentUtils.haveCrossRef(seq1, seq2));
1013 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
1014 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
1015 // next is true for haveCrossRef, false for hasCrossRef
1016 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
1018 // now the other way round
1019 seq1.setDBRefs(null);
1020 seq2.addDBRef(new DBRefEntry("EMBL", "1", "A12345"));
1021 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
1022 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
1025 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
1026 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
1027 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
1031 * Test the method that extracts the cds-only part of a dna alignment.
1033 @Test(groups = { "Functional" })
1034 public void testMakeCdsAlignment()
1036 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
1037 SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC");
1038 SequenceI pep1 = new Sequence("pep1", "GF");
1039 SequenceI pep2 = new Sequence("pep2", "GFP");
1040 dna1.createDatasetSequence();
1041 dna2.createDatasetSequence();
1042 pep1.createDatasetSequence();
1043 pep2.createDatasetSequence();
1044 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 6, 0f,
1046 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 10, 12, 0f,
1048 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds3", 1, 3, 0f,
1050 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds4", 7, 9, 0f,
1052 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds5", 13, 15, 0f,
1054 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
1055 dna.setDataset(null);
1057 List<AlignedCodonFrame> mappings = new ArrayList<AlignedCodonFrame>();
1058 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1059 new int[] { 1, 2 }, 3, 1);
1060 AlignedCodonFrame acf = new AlignedCodonFrame();
1061 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1063 map = new MapList(new int[] { 1, 3, 7, 9, 13, 15 }, new int[] { 1, 3 },
1065 acf = new AlignedCodonFrame();
1066 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
1069 AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
1070 dna1, dna2 }, mappings, dna);
1071 assertEquals(2, cds.getSequences().size());
1072 assertEquals("---GGG---TTT---", cds.getSequenceAt(0)
1073 .getSequenceAsString());
1074 assertEquals("GGG---TTT---CCC", cds.getSequenceAt(1)
1075 .getSequenceAsString());
1078 * verify shared, extended alignment dataset
1080 assertSame(dna.getDataset(), cds.getDataset());
1081 assertTrue(dna.getDataset().getSequences()
1082 .contains(cds.getSequenceAt(0).getDatasetSequence()));
1083 assertTrue(dna.getDataset().getSequences()
1084 .contains(cds.getSequenceAt(1).getDatasetSequence()));
1087 * Verify updated mappings
1089 assertEquals(2, mappings.size());
1092 * Mapping from pep1 to GGGTTT in first new exon sequence
1094 List<AlignedCodonFrame> pep1Mapping = MappingUtils
1095 .findMappingsForSequence(pep1, mappings);
1096 assertEquals(1, pep1Mapping.size());
1098 SearchResults sr = MappingUtils.buildSearchResults(pep1, 1, mappings);
1099 assertEquals(1, sr.getResults().size());
1100 Match m = sr.getResults().get(0);
1101 assertSame(cds.getSequenceAt(0).getDatasetSequence(),
1103 assertEquals(1, m.getStart());
1104 assertEquals(3, m.getEnd());
1106 sr = MappingUtils.buildSearchResults(pep1, 2, mappings);
1107 m = sr.getResults().get(0);
1108 assertSame(cds.getSequenceAt(0).getDatasetSequence(),
1110 assertEquals(4, m.getStart());
1111 assertEquals(6, m.getEnd());
1114 * Mapping from pep2 to GGGTTTCCC in second new exon sequence
1116 List<AlignedCodonFrame> pep2Mapping = MappingUtils
1117 .findMappingsForSequence(pep2, mappings);
1118 assertEquals(1, pep2Mapping.size());
1120 sr = MappingUtils.buildSearchResults(pep2, 1, mappings);
1121 assertEquals(1, sr.getResults().size());
1122 m = sr.getResults().get(0);
1123 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1125 assertEquals(1, m.getStart());
1126 assertEquals(3, m.getEnd());
1128 sr = MappingUtils.buildSearchResults(pep2, 2, mappings);
1129 m = sr.getResults().get(0);
1130 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1132 assertEquals(4, m.getStart());
1133 assertEquals(6, m.getEnd());
1135 sr = MappingUtils.buildSearchResults(pep2, 3, mappings);
1136 m = sr.getResults().get(0);
1137 assertSame(cds.getSequenceAt(1).getDatasetSequence(),
1139 assertEquals(7, m.getStart());
1140 assertEquals(9, m.getEnd());
1144 * Test the method that makes a cds-only sequence from a DNA sequence and its
1145 * product mapping. Test includes the expected case that the DNA sequence
1146 * already has a protein product (Uniprot translation) which in turn has an
1147 * x-ref to the EMBLCDS record.
1149 @Test(groups = { "Functional" })
1150 public void testMakeCdsSequences()
1152 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
1153 SequenceI pep1 = new Sequence("pep1", "GF");
1154 dna1.createDatasetSequence();
1155 pep1.createDatasetSequence();
1156 pep1.getDatasetSequence().addDBRef(
1157 new DBRefEntry("EMBLCDS", "2", "A12345"));
1160 * Make the mapping from dna to protein. The protein sequence has a DBRef to
1163 Set<AlignedCodonFrame> mappings = new HashSet<AlignedCodonFrame>();
1164 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1165 new int[] { 1, 2 }, 3, 1);
1166 AlignedCodonFrame acf = new AlignedCodonFrame();
1167 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1170 AlignedCodonFrame newMapping = new AlignedCodonFrame();
1171 List<int[]> ungappedColumns = new ArrayList<int[]>();
1172 ungappedColumns.add(new int[] { 4, 6 });
1173 ungappedColumns.add(new int[] { 10, 12 });
1174 List<SequenceI> cdsSeqs = AlignmentUtils.makeCdsSequences(dna1, acf,
1177 assertEquals(1, cdsSeqs.size());
1178 SequenceI cdsSeq = cdsSeqs.get(0);
1180 assertEquals("GGGTTT", cdsSeq.getSequenceAsString());
1181 assertEquals("dna1|A12345", cdsSeq.getName());
1182 assertEquals(1, cdsSeq.getDBRefs().length);
1183 DBRefEntry cdsRef = cdsSeq.getDBRefs()[0];
1184 assertEquals("EMBLCDS", cdsRef.getSource());
1185 assertEquals("2", cdsRef.getVersion());
1186 assertEquals("A12345", cdsRef.getAccessionId());
1190 * Test the method that makes a cds-only alignment from a DNA sequence and its
1191 * product mappings, for the case where there are multiple exon mappings to
1192 * different protein products.
1194 @Test(groups = { "Functional" })
1195 public void testMakeCdsAlignment_multipleProteins()
1197 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
1198 SequenceI pep1 = new Sequence("pep1", "GF"); // GGGTTT
1199 SequenceI pep2 = new Sequence("pep2", "KP"); // aaaccc
1200 SequenceI pep3 = new Sequence("pep3", "KF"); // aaaTTT
1201 dna1.createDatasetSequence();
1202 pep1.createDatasetSequence();
1203 pep2.createDatasetSequence();
1204 pep3.createDatasetSequence();
1205 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 6, 0f,
1207 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 10, 12, 0f,
1209 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds3", 1, 3, 0f,
1211 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds4", 7, 9, 0f,
1213 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds5", 1, 3, 0f,
1215 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds6", 10, 12, 0f,
1217 pep1.getDatasetSequence().addDBRef(
1218 new DBRefEntry("EMBLCDS", "2", "A12345"));
1219 pep2.getDatasetSequence().addDBRef(
1220 new DBRefEntry("EMBLCDS", "3", "A12346"));
1221 pep3.getDatasetSequence().addDBRef(
1222 new DBRefEntry("EMBLCDS", "4", "A12347"));
1225 * Make the mappings from dna to protein
1227 List<AlignedCodonFrame> mappings = new ArrayList<AlignedCodonFrame>();
1228 // map ...GGG...TTT to GF
1229 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1230 new int[] { 1, 2 }, 3, 1);
1231 AlignedCodonFrame acf = new AlignedCodonFrame();
1232 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1235 // map aaa...ccc to KP
1236 map = new MapList(new int[] { 1, 3, 7, 9 }, new int[] { 1, 2 }, 3, 1);
1237 acf = new AlignedCodonFrame();
1238 acf.addMap(dna1.getDatasetSequence(), pep2.getDatasetSequence(), map);
1241 // map aaa......TTT to KF
1242 map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 2 }, 3, 1);
1243 acf = new AlignedCodonFrame();
1244 acf.addMap(dna1.getDatasetSequence(), pep3.getDatasetSequence(), map);
1248 * Create the Exon alignment; also replaces the dna-to-protein mappings with
1249 * exon-to-protein and exon-to-dna mappings
1251 AlignmentI dna = new Alignment(new SequenceI[] { dna1 });
1252 dna.setDataset(null);
1253 AlignmentI exal = AlignmentUtils.makeCdsAlignment(
1254 new SequenceI[] { dna1 }, mappings, dna);
1257 * Verify we have 3 cds sequences, mapped to pep1/2/3 respectively
1259 List<SequenceI> cds = exal.getSequences();
1260 assertEquals(3, cds.size());
1263 * verify shared, extended alignment dataset
1265 assertSame(exal.getDataset(), dna.getDataset());
1266 assertTrue(dna.getDataset().getSequences()
1267 .contains(cds.get(0).getDatasetSequence()));
1268 assertTrue(dna.getDataset().getSequences()
1269 .contains(cds.get(1).getDatasetSequence()));
1270 assertTrue(dna.getDataset().getSequences()
1271 .contains(cds.get(2).getDatasetSequence()));
1274 * verify aligned cds sequences and their xrefs
1276 SequenceI cdsSeq = cds.get(0);
1277 assertEquals("---GGG---TTT", cdsSeq.getSequenceAsString());
1278 assertEquals("dna1|A12345", cdsSeq.getName());
1279 assertEquals(1, cdsSeq.getDBRefs().length);
1280 DBRefEntry cdsRef = cdsSeq.getDBRefs()[0];
1281 assertEquals("EMBLCDS", cdsRef.getSource());
1282 assertEquals("2", cdsRef.getVersion());
1283 assertEquals("A12345", cdsRef.getAccessionId());
1285 cdsSeq = cds.get(1);
1286 assertEquals("aaa---ccc---", cdsSeq.getSequenceAsString());
1287 assertEquals("dna1|A12346", cdsSeq.getName());
1288 assertEquals(1, cdsSeq.getDBRefs().length);
1289 cdsRef = cdsSeq.getDBRefs()[0];
1290 assertEquals("EMBLCDS", cdsRef.getSource());
1291 assertEquals("3", cdsRef.getVersion());
1292 assertEquals("A12346", cdsRef.getAccessionId());
1294 cdsSeq = cds.get(2);
1295 assertEquals("aaa------TTT", cdsSeq.getSequenceAsString());
1296 assertEquals("dna1|A12347", cdsSeq.getName());
1297 assertEquals(1, cdsSeq.getDBRefs().length);
1298 cdsRef = cdsSeq.getDBRefs()[0];
1299 assertEquals("EMBLCDS", cdsRef.getSource());
1300 assertEquals("4", cdsRef.getVersion());
1301 assertEquals("A12347", cdsRef.getAccessionId());
1304 * Verify there are mappings from each cds sequence to its protein product
1305 * and also to its dna source
1307 Iterator<AlignedCodonFrame> newMappingsIterator = mappings.iterator();
1309 // mappings for dna1 - exon1 - pep1
1310 AlignedCodonFrame cdsMapping = newMappingsIterator.next();
1311 List<Mapping> dnaMappings = cdsMapping.getMappingsForSequence(dna1);
1312 assertEquals(1, dnaMappings.size());
1313 assertSame(cds.get(0).getDatasetSequence(), dnaMappings.get(0)
1315 assertEquals("G(1) in CDS should map to G(4) in DNA", 4, dnaMappings
1316 .get(0).getMap().getToPosition(1));
1317 List<Mapping> peptideMappings = cdsMapping
1318 .getMappingsForSequence(pep1);
1319 assertEquals(1, peptideMappings.size());
1320 assertSame(pep1.getDatasetSequence(), peptideMappings.get(0).getTo());
1322 // mappings for dna1 - cds2 - pep2
1323 cdsMapping = newMappingsIterator.next();
1324 dnaMappings = cdsMapping.getMappingsForSequence(dna1);
1325 assertEquals(1, dnaMappings.size());
1326 assertSame(cds.get(1).getDatasetSequence(), dnaMappings.get(0)
1328 assertEquals("c(4) in CDS should map to c(7) in DNA", 7, dnaMappings
1329 .get(0).getMap().getToPosition(4));
1330 peptideMappings = cdsMapping.getMappingsForSequence(pep2);
1331 assertEquals(1, peptideMappings.size());
1332 assertSame(pep2.getDatasetSequence(), peptideMappings.get(0).getTo());
1334 // mappings for dna1 - cds3 - pep3
1335 cdsMapping = newMappingsIterator.next();
1336 dnaMappings = cdsMapping.getMappingsForSequence(dna1);
1337 assertEquals(1, dnaMappings.size());
1338 assertSame(cds.get(2).getDatasetSequence(), dnaMappings.get(0)
1340 assertEquals("T(4) in CDS should map to T(10) in DNA", 10, dnaMappings
1341 .get(0).getMap().getToPosition(4));
1342 peptideMappings = cdsMapping.getMappingsForSequence(pep3);
1343 assertEquals(1, peptideMappings.size());
1344 assertSame(pep3.getDatasetSequence(), peptideMappings.get(0).getTo());
1347 @Test(groups = { "Functional" })
1348 public void testIsMappable()
1350 SequenceI dna1 = new Sequence("dna1", "cgCAGtgGT");
1351 SequenceI aa1 = new Sequence("aa1", "RSG");
1352 AlignmentI al1 = new Alignment(new SequenceI[] { dna1 });
1353 AlignmentI al2 = new Alignment(new SequenceI[] { aa1 });
1355 assertFalse(AlignmentUtils.isMappable(null, null));
1356 assertFalse(AlignmentUtils.isMappable(al1, null));
1357 assertFalse(AlignmentUtils.isMappable(null, al1));
1358 assertFalse(AlignmentUtils.isMappable(al1, al1));
1359 assertFalse(AlignmentUtils.isMappable(al2, al2));
1361 assertTrue(AlignmentUtils.isMappable(al1, al2));
1362 assertTrue(AlignmentUtils.isMappable(al2, al1));
1366 * Test creating a mapping when the sequences involved do not start at residue
1369 * @throws IOException
1371 @Test(groups = { "Functional" })
1372 public void testMapCdnaToProtein_forSubsequence()
1375 SequenceI prot = new Sequence("UNIPROT|V12345", "E-I--Q", 10, 12);
1376 prot.createDatasetSequence();
1378 SequenceI dna = new Sequence("EMBL|A33333", "GAA--AT-C-CAG", 40, 48);
1379 dna.createDatasetSequence();
1381 MapList map = AlignmentUtils.mapCdnaToProtein(prot, dna);
1382 assertEquals(10, map.getToLowest());
1383 assertEquals(12, map.getToHighest());
1384 assertEquals(40, map.getFromLowest());
1385 assertEquals(48, map.getFromHighest());
1389 * Test for the alignSequenceAs method where we have protein mapped to protein
1391 @Test(groups = { "Functional" })
1392 public void testAlignSequenceAs_mappedProteinProtein()
1395 SequenceI alignMe = new Sequence("Match", "MGAASEV");
1396 alignMe.createDatasetSequence();
1397 SequenceI alignFrom = new Sequence("Query", "LQTGYMGAASEVMFSPTRR");
1398 alignFrom.createDatasetSequence();
1400 AlignedCodonFrame acf = new AlignedCodonFrame();
1401 // this is like a domain or motif match of part of a peptide sequence
1402 MapList map = new MapList(new int[] { 6, 12 }, new int[] { 1, 7 }, 1, 1);
1403 acf.addMap(alignFrom.getDatasetSequence(),
1404 alignMe.getDatasetSequence(), map);
1406 AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "-", '-', true,
1408 assertEquals("-----MGAASEV-------", alignMe.getSequenceAsString());
1412 * Test for the alignSequenceAs method where there are trailing unmapped
1413 * residues in the model sequence
1415 @Test(groups = { "Functional" })
1416 public void testAlignSequenceAs_withTrailingPeptide()
1418 // map first 3 codons to KPF; G is a trailing unmapped residue
1419 MapList map = new MapList(new int[] { 1, 9 }, new int[] { 1, 3 }, 3, 1);
1421 checkAlignSequenceAs("AAACCCTTT", "K-PFG", true, true, map,
1426 * Tests for transferring features between mapped sequences
1428 @Test(groups = { "Functional" })
1429 public void testTransferFeatures()
1431 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1432 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1435 dna.addSequenceFeature(new SequenceFeature("type1", "desc1", 1, 2, 1f,
1437 // partial overlap - to [1, 1]
1438 dna.addSequenceFeature(new SequenceFeature("type2", "desc2", 3, 4, 2f,
1440 // exact overlap - to [1, 3]
1441 dna.addSequenceFeature(new SequenceFeature("type3", "desc3", 4, 6, 3f,
1443 // spanning overlap - to [2, 5]
1444 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1446 // exactly overlaps whole mapped range [1, 6]
1447 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1449 // no overlap (internal)
1450 dna.addSequenceFeature(new SequenceFeature("type6", "desc6", 7, 9, 6f,
1452 // no overlap (3' end)
1453 dna.addSequenceFeature(new SequenceFeature("type7", "desc7", 13, 15,
1455 // overlap (3' end) - to [6, 6]
1456 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1458 // extended overlap - to [6, +]
1459 dna.addSequenceFeature(new SequenceFeature("type9", "desc9", 12, 13,
1462 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1463 new int[] { 1, 6 }, 1, 1);
1466 * transferFeatures() will build 'partial overlap' for regions
1467 * that partially overlap 5' or 3' (start or end) of target sequence
1469 AlignmentUtils.transferFeatures(dna, cds, map, null);
1470 SequenceFeature[] sfs = cds.getSequenceFeatures();
1471 assertEquals(6, sfs.length);
1473 SequenceFeature sf = sfs[0];
1474 assertEquals("type2", sf.getType());
1475 assertEquals("desc2", sf.getDescription());
1476 assertEquals(2f, sf.getScore());
1477 assertEquals(1, sf.getBegin());
1478 assertEquals(1, sf.getEnd());
1481 assertEquals("type3", sf.getType());
1482 assertEquals("desc3", sf.getDescription());
1483 assertEquals(3f, sf.getScore());
1484 assertEquals(1, sf.getBegin());
1485 assertEquals(3, sf.getEnd());
1488 assertEquals("type4", sf.getType());
1489 assertEquals(2, sf.getBegin());
1490 assertEquals(5, sf.getEnd());
1493 assertEquals("type5", sf.getType());
1494 assertEquals(1, sf.getBegin());
1495 assertEquals(6, sf.getEnd());
1498 assertEquals("type8", sf.getType());
1499 assertEquals(6, sf.getBegin());
1500 assertEquals(6, sf.getEnd());
1503 assertEquals("type9", sf.getType());
1504 assertEquals(6, sf.getBegin());
1505 assertEquals(6, sf.getEnd());
1509 * Tests for transferring features between mapped sequences
1511 @Test(groups = { "Functional" })
1512 public void testTransferFeatures_withOmit()
1514 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1515 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1517 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1518 new int[] { 1, 6 }, 1, 1);
1520 // [5, 11] maps to [2, 5]
1521 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1523 // [4, 12] maps to [1, 6]
1524 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1526 // [12, 12] maps to [6, 6]
1527 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1530 // desc4 and desc8 are the 'omit these' varargs
1531 AlignmentUtils.transferFeatures(dna, cds, map, null, "type4", "type8");
1532 SequenceFeature[] sfs = cds.getSequenceFeatures();
1533 assertEquals(1, sfs.length);
1535 SequenceFeature sf = sfs[0];
1536 assertEquals("type5", sf.getType());
1537 assertEquals(1, sf.getBegin());
1538 assertEquals(6, sf.getEnd());
1542 * Tests for transferring features between mapped sequences
1544 @Test(groups = { "Functional" })
1545 public void testTransferFeatures_withSelect()
1547 SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
1548 SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
1550 MapList map = new MapList(new int[] { 4, 6, 10, 12 },
1551 new int[] { 1, 6 }, 1, 1);
1553 // [5, 11] maps to [2, 5]
1554 dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
1556 // [4, 12] maps to [1, 6]
1557 dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
1559 // [12, 12] maps to [6, 6]
1560 dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
1563 // "type5" is the 'select this type' argument
1564 AlignmentUtils.transferFeatures(dna, cds, map, "type5");
1565 SequenceFeature[] sfs = cds.getSequenceFeatures();
1566 assertEquals(1, sfs.length);
1568 SequenceFeature sf = sfs[0];
1569 assertEquals("type5", sf.getType());
1570 assertEquals(1, sf.getBegin());
1571 assertEquals(6, sf.getEnd());
1575 * Test the method that extracts the cds-only part of a dna alignment, for the
1576 * case where the cds should be aligned to match its nucleotide sequence.
1578 @Test(groups = { "Functional" })
1579 public void testMakeCdsAlignment_alternativeTranscripts()
1581 SequenceI dna1 = new Sequence("dna1", "aaaGGGCC-----CTTTaaaGGG");
1582 // alternative transcript of same dna skips CCC codon
1583 SequenceI dna2 = new Sequence("dna2", "aaaGGGCC-----cttTaaaGGG");
1584 // dna3 has no mapping (protein product) so should be ignored here
1585 SequenceI dna3 = new Sequence("dna3", "aaaGGGCCCCCGGGcttTaaaGGG");
1586 SequenceI pep1 = new Sequence("pep1", "GPFG");
1587 SequenceI pep2 = new Sequence("pep2", "GPG");
1588 dna1.createDatasetSequence();
1589 dna2.createDatasetSequence();
1590 dna3.createDatasetSequence();
1591 pep1.createDatasetSequence();
1592 pep2.createDatasetSequence();
1593 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds1", 4, 8, 0f,
1595 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds2", 9, 12, 0f,
1597 dna1.addSequenceFeature(new SequenceFeature("CDS", "cds3", 16, 18, 0f,
1599 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 4, 8, 0f,
1601 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 12, 12, 0f,
1603 dna2.addSequenceFeature(new SequenceFeature("CDS", "cds", 16, 18, 0f,
1606 List<AlignedCodonFrame> mappings = new ArrayList<AlignedCodonFrame>();
1607 MapList map = new MapList(new int[] { 4, 12, 16, 18 },
1608 new int[] { 1, 4 }, 3, 1);
1609 AlignedCodonFrame acf = new AlignedCodonFrame();
1610 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1612 map = new MapList(new int[] { 4, 8, 12, 12, 16, 18 },
1615 acf = new AlignedCodonFrame();
1616 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
1619 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
1620 dna.setDataset(null);
1621 AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
1622 dna1, dna2, dna3 }, mappings, dna);
1623 List<SequenceI> cdsSeqs = cds.getSequences();
1624 assertEquals(2, cdsSeqs.size());
1625 assertEquals("GGGCCCTTTGGG", cdsSeqs.get(0).getSequenceAsString());
1626 assertEquals("GGGCC---TGGG", cdsSeqs.get(1).getSequenceAsString());
1629 * verify shared, extended alignment dataset
1631 assertSame(dna.getDataset(), cds.getDataset());
1632 assertTrue(dna.getDataset().getSequences()
1633 .contains(cdsSeqs.get(0).getDatasetSequence()));
1634 assertTrue(dna.getDataset().getSequences()
1635 .contains(cdsSeqs.get(1).getDatasetSequence()));
1638 * Verify updated mappings
1640 assertEquals(2, mappings.size());
1643 * Mapping from pep1 to GGGTTT in first new CDS sequence
1645 List<AlignedCodonFrame> pep1Mapping = MappingUtils
1646 .findMappingsForSequence(pep1, mappings);
1647 assertEquals(1, pep1Mapping.size());
1649 * maps GPFG to 1-3,4-6,7-9,10-12
1651 SearchResults sr = MappingUtils.buildSearchResults(pep1, 1, mappings);
1652 assertEquals(1, sr.getResults().size());
1653 Match m = sr.getResults().get(0);
1654 assertEquals(cds.getSequenceAt(0).getDatasetSequence(),
1656 assertEquals(1, m.getStart());
1657 assertEquals(3, m.getEnd());
1658 sr = MappingUtils.buildSearchResults(pep1, 2, mappings);
1659 m = sr.getResults().get(0);
1660 assertEquals(4, m.getStart());
1661 assertEquals(6, m.getEnd());
1662 sr = MappingUtils.buildSearchResults(pep1, 3, mappings);
1663 m = sr.getResults().get(0);
1664 assertEquals(7, m.getStart());
1665 assertEquals(9, m.getEnd());
1666 sr = MappingUtils.buildSearchResults(pep1, 4, mappings);
1667 m = sr.getResults().get(0);
1668 assertEquals(10, m.getStart());
1669 assertEquals(12, m.getEnd());
1672 * GPG in pep2 map to 1-3,4-6,7-9 in second CDS sequence
1674 List<AlignedCodonFrame> pep2Mapping = MappingUtils
1675 .findMappingsForSequence(pep2, mappings);
1676 assertEquals(1, pep2Mapping.size());
1677 sr = MappingUtils.buildSearchResults(pep2, 1, mappings);
1678 assertEquals(1, sr.getResults().size());
1679 m = sr.getResults().get(0);
1680 assertEquals(cds.getSequenceAt(1).getDatasetSequence(),
1682 assertEquals(1, m.getStart());
1683 assertEquals(3, m.getEnd());
1684 sr = MappingUtils.buildSearchResults(pep2, 2, mappings);
1685 m = sr.getResults().get(0);
1686 assertEquals(4, m.getStart());
1687 assertEquals(6, m.getEnd());
1688 sr = MappingUtils.buildSearchResults(pep2, 3, mappings);
1689 m = sr.getResults().get(0);
1690 assertEquals(7, m.getStart());
1691 assertEquals(9, m.getEnd());
1695 * Tests for gapped column in sequences
1697 @Test(groups = { "Functional" })
1698 public void testIsGappedColumn()
1700 SequenceI seq1 = new Sequence("Seq1", "a--c.tc-a-g");
1701 SequenceI seq2 = new Sequence("Seq2", "aa---t--a-g");
1702 SequenceI seq3 = new Sequence("Seq3", "ag-c t-g-");
1703 List<SequenceI> seqs = Arrays
1704 .asList(new SequenceI[] { seq1, seq2, seq3 });
1705 // the column number is base 1
1706 assertFalse(AlignmentUtils.isGappedColumn(seqs, 1));
1707 assertFalse(AlignmentUtils.isGappedColumn(seqs, 2));
1708 assertTrue(AlignmentUtils.isGappedColumn(seqs, 3));
1709 assertFalse(AlignmentUtils.isGappedColumn(seqs, 4));
1710 assertTrue(AlignmentUtils.isGappedColumn(seqs, 5));
1711 assertFalse(AlignmentUtils.isGappedColumn(seqs, 6));
1712 assertFalse(AlignmentUtils.isGappedColumn(seqs, 7));
1713 assertFalse(AlignmentUtils.isGappedColumn(seqs, 8));
1714 assertFalse(AlignmentUtils.isGappedColumn(seqs, 9));
1715 assertTrue(AlignmentUtils.isGappedColumn(seqs, 10));
1716 assertFalse(AlignmentUtils.isGappedColumn(seqs, 11));
1718 assertTrue(AlignmentUtils.isGappedColumn(seqs, 0));
1719 assertTrue(AlignmentUtils.isGappedColumn(seqs, 100));
1720 assertTrue(AlignmentUtils.isGappedColumn(seqs, -100));
1721 assertTrue(AlignmentUtils.isGappedColumn(null, 0));
1724 @Test(groups = { "Functional" })
1725 public void testFindCdsColumns()
1727 // TODO target method belongs in a general-purpose alignment
1728 // analysis method to find columns for feature
1731 * NB this method assumes CDS ranges are contiguous (no introns)
1733 SequenceI gene = new Sequence("gene", "aaacccgggtttaaacccgggttt");
1734 SequenceI seq1 = new Sequence("Seq1", "--ac-cgGG-GGaaACC--GGtt-");
1735 SequenceI seq2 = new Sequence("Seq2", "AA--CCGG--g-AAA--cG-GTTt");
1736 seq1.createDatasetSequence();
1737 seq2.createDatasetSequence();
1738 seq1.addSequenceFeature(new SequenceFeature("CDS", "cds", 5, 6, 0f,
1740 seq1.addSequenceFeature(new SequenceFeature("CDS", "cds", 7, 8, 0f,
1742 seq1.addSequenceFeature(new SequenceFeature("CDS", "cds", 11, 13, 0f,
1744 seq1.addSequenceFeature(new SequenceFeature("CDS", "cds", 14, 15, 0f,
1746 seq2.addSequenceFeature(new SequenceFeature("CDS", "cds", 1, 2, 0f,
1748 seq2.addSequenceFeature(new SequenceFeature("CDS", "cds", 3, 6, 0f,
1750 seq2.addSequenceFeature(new SequenceFeature("CDS", "cds", 8, 10, 0f,
1752 seq2.addSequenceFeature(new SequenceFeature("CDS", "cds", 12, 12, 0f,
1754 seq2.addSequenceFeature(new SequenceFeature("CDS", "cds", 13, 15, 0f,
1757 List<int[]> cdsColumns = AlignmentUtils.findCdsColumns(new SequenceI[] {
1759 assertEquals(4, cdsColumns.size());
1760 assertEquals("[1, 2]", Arrays.toString(cdsColumns.get(0)));
1761 assertEquals("[5, 9]", Arrays.toString(cdsColumns.get(1)));
1762 assertEquals("[11, 17]", Arrays.toString(cdsColumns.get(2)));
1763 assertEquals("[19, 23]", Arrays.toString(cdsColumns.get(3)));
1767 * Test the method that realigns protein to match mapped codon alignment.
1769 @Test(groups = { "Functional" })
1770 public void testAlignProteinAsDna_incompleteStartCodon()
1772 // seq1: incomplete start codon (not mapped), then [3, 11]
1773 SequenceI dna1 = new Sequence("Seq1", "ccAAA-TTT-GGG-");
1774 // seq2 codons are [4, 5], [8, 11]
1775 SequenceI dna2 = new Sequence("Seq2", "ccaAA-ttT-GGG-");
1776 // seq3 incomplete start codon at 'tt'
1777 SequenceI dna3 = new Sequence("Seq3", "ccaaa-ttt-GGG-");
1778 AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
1779 dna.setDataset(null);
1781 // prot1 has 'X' for incomplete start codon (not mapped)
1782 SequenceI prot1 = new Sequence("Seq1", "XKFG"); // X for incomplete start
1783 SequenceI prot2 = new Sequence("Seq2", "NG");
1784 SequenceI prot3 = new Sequence("Seq3", "XG"); // X for incomplete start
1785 AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
1787 protein.setDataset(null);
1789 // map dna1 [3, 11] to prot1 [2, 4] KFG
1790 MapList map = new MapList(new int[] { 3, 11 }, new int[] { 2, 4 }, 3, 1);
1791 AlignedCodonFrame acf = new AlignedCodonFrame();
1792 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
1794 // map dna2 [4, 5] [8, 11] to prot2 [1, 2] NG
1795 map = new MapList(new int[] { 4, 5, 8, 11 }, new int[] { 1, 2 }, 3, 1);
1796 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
1798 // map dna3 [9, 11] to prot3 [2, 2] G
1799 map = new MapList(new int[] { 9, 11 }, new int[] { 2, 2 }, 3, 1);
1800 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
1802 ArrayList<AlignedCodonFrame> acfs = new ArrayList<AlignedCodonFrame>();
1804 protein.setCodonFrames(acfs);
1807 * verify X is included in the aligned proteins, and placed just
1808 * before the first mapped residue
1809 * CCT is between CCC and TTT
1811 AlignmentUtils.alignProteinAsDna(protein, dna);
1812 assertEquals("XK-FG", prot1.getSequenceAsString());
1813 assertEquals("--N-G", prot2.getSequenceAsString());
1814 assertEquals("---XG", prot3.getSequenceAsString());
1818 * Tests for the method that maps the subset of a dna sequence that has CDS
1819 * (or subtype) feature - case where the start codon is incomplete.
1821 @Test(groups = "Functional")
1822 public void testGetCdsRanges_fivePrimeIncomplete()
1824 SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
1825 dnaSeq.createDatasetSequence();
1826 SequenceI ds = dnaSeq.getDatasetSequence();
1828 // CDS for dna 5-6 (incomplete codon), 7-9
1829 SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
1830 sf.setPhase("2"); // skip 2 bases to start of next codon
1831 ds.addSequenceFeature(sf);
1832 // CDS for dna 13-15
1833 sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
1834 ds.addSequenceFeature(sf);
1836 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1839 * check the mapping starts with the first complete codon
1841 assertEquals(6, MappingUtils.getLength(ranges));
1842 assertEquals(2, ranges.size());
1843 assertEquals(7, ranges.get(0)[0]);
1844 assertEquals(9, ranges.get(0)[1]);
1845 assertEquals(13, ranges.get(1)[0]);
1846 assertEquals(15, ranges.get(1)[1]);
1850 * Tests for the method that maps the subset of a dna sequence that has CDS
1851 * (or subtype) feature.
1853 @Test(groups = "Functional")
1854 public void testGetCdsRanges()
1856 SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
1857 dnaSeq.createDatasetSequence();
1858 SequenceI ds = dnaSeq.getDatasetSequence();
1861 SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
1862 ds.addSequenceFeature(sf);
1863 // exon feature should be ignored here
1864 sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
1865 ds.addSequenceFeature(sf);
1866 // CDS for dna 10-12
1867 sf = new SequenceFeature("CDS_predicted", "", 10, 12, 0f, null);
1868 ds.addSequenceFeature(sf);
1870 List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
1871 assertEquals(6, MappingUtils.getLength(ranges));
1872 assertEquals(2, ranges.size());
1873 assertEquals(4, ranges.get(0)[0]);
1874 assertEquals(6, ranges.get(0)[1]);
1875 assertEquals(10, ranges.get(1)[0]);
1876 assertEquals(12, ranges.get(1)[1]);
1880 * Test the method that computes a map of codon variants for each protein
1881 * position from "sequence_variant" features on dna
1883 @Test(groups = "Functional")
1884 public void testBuildDnaVariantsMap()
1886 SequenceI dna = new Sequence("dna", "atgAAATTTGGGCCCtag");
1887 MapList map = new MapList(new int[] { 1, 18 }, new int[] { 1, 5 }, 3, 1);
1890 * first with no variants on dna
1892 LinkedHashMap<Integer, String[][]> variantsMap = AlignmentUtils
1893 .buildDnaVariantsMap(dna, map);
1894 assertTrue(variantsMap.isEmpty());
1896 // single allele codon 1, on base 1
1897 SequenceFeature sf = new SequenceFeature("sequence_variant", "", 1, 1,
1899 sf.setValue("alleles", "T");
1900 dna.addSequenceFeature(sf);
1902 // two alleles codon 2, on bases 2 and 3
1903 sf = new SequenceFeature("sequence_variant", "", 5, 5, 0f, null);
1904 sf.setValue("alleles", "T");
1905 dna.addSequenceFeature(sf);
1906 sf = new SequenceFeature("sequence_variant", "", 6, 6, 0f, null);
1907 sf.setValue("alleles", "G");
1908 dna.addSequenceFeature(sf);
1910 // two alleles codon 3, both on base 2
1911 sf = new SequenceFeature("sequence_variant", "", 8, 8, 0f, null);
1912 sf.setValue("alleles", "C, G");
1913 dna.addSequenceFeature(sf);
1915 // no alleles on codon 4
1916 // alleles on codon 5 on all 3 bases
1917 sf = new SequenceFeature("sequence_variant", "", 13, 13, 0f, null);
1918 sf.setValue("alleles", "C, G"); // (C duplicates given base value)
1919 dna.addSequenceFeature(sf);
1920 sf = new SequenceFeature("sequence_variant", "", 14, 14, 0f, null);
1921 sf.setValue("alleles", "g, a"); // should force to upper-case
1922 dna.addSequenceFeature(sf);
1923 sf = new SequenceFeature("sequence_variant", "", 15, 15, 0f, null);
1924 sf.setValue("alleles", "A, T");
1925 dna.addSequenceFeature(sf);
1927 variantsMap = AlignmentUtils.buildDnaVariantsMap(dna, map);
1928 assertEquals(4, variantsMap.size());
1929 assertTrue(Arrays.deepEquals(new String[][] { { "A", "T" }, { "T" },
1930 { "G" } }, variantsMap.get(1)));
1931 assertTrue(Arrays.deepEquals(new String[][] { { "A" }, { "A", "T" },
1932 { "A", "G" } }, variantsMap.get(2)));
1933 assertTrue(Arrays.deepEquals(new String[][] { { "T" },
1934 { "T", "C", "G" }, { "T" } }, variantsMap.get(3)));
1935 // duplicated bases are not removed here, handled in computePeptideVariants
1936 assertTrue(Arrays.deepEquals(new String[][] { { "C", "C", "G" },
1937 { "C", "G", "A" }, { "C", "A", "T" } }, variantsMap.get(5)));
1941 * Tests for the method that computes all peptide variants given codon
1944 @Test(groups = "Functional")
1945 public void testComputePeptideVariants()
1947 String[][] codonVariants = new String[][] { { "A" }, { "G" }, { "T" } };
1950 * AGT codes for S - this is not included in the variants returned
1952 List<String> variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
1953 assertEquals("[]", variants.toString());
1955 // S is reported if it differs from the current value (A):
1956 variants = AlignmentUtils.computePeptideVariants(codonVariants, "A");
1957 assertEquals("[S]", variants.toString());
1960 * synonymous variant is not reported
1962 codonVariants = new String[][] { { "A" }, { "G" }, { "C", "T" } };
1963 // AGC and AGT both code for S
1964 variants = AlignmentUtils.computePeptideVariants(codonVariants, "s");
1965 assertEquals("[]", variants.toString());
1968 * equivalent variants are only reported once
1970 codonVariants = new String[][] { { "C" }, { "T" },
1971 { "A", "C", "G", "T" } };
1972 // CTA CTC CTG CTT all code for L
1973 variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
1974 assertEquals("[L]", variants.toString());
1977 * vary codons 1 and 2; variant products are sorted and non-redundant
1979 codonVariants = new String[][] { { "a", "C" }, { "g", "T" }, { "A" } };
1980 // aga ata cga cta code for R, I, R, L
1981 variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
1982 assertEquals("[I, L, R]", variants.toString());
1985 * vary codons 2 and 3
1987 codonVariants = new String[][] { { "a" }, { "g", "T" }, { "A", "c" } };
1988 // aga agc ata atc code for R, S, I, I
1989 variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
1990 assertEquals("[I, R]", variants.toString());
1993 * vary codons 1 and 3
1995 codonVariants = new String[][] { { "a", "t" }, { "a" }, { "t", "g" } };
1996 // aat aag tat tag code for N, K, Y, STOP - STOP sorted to end
1997 variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
1998 assertEquals("[K, N, Y, STOP]", variants.toString());
2001 * vary codons 1, 2 and 3
2003 codonVariants = new String[][] { { "a", "t" }, { "G", "C" },
2005 // agt agg act acg tgt tgg tct tcg code for S, R, T, T, C, W, S, S
2006 variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
2007 assertEquals("[C, R, T, W]", variants.toString());