2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import static org.junit.Assert.assertEquals;
24 import static org.junit.Assert.assertFalse;
25 import static org.junit.Assert.assertSame;
26 import static org.junit.Assert.assertTrue;
27 import jalview.datamodel.AlignedCodonFrame;
28 import jalview.datamodel.Alignment;
29 import jalview.datamodel.AlignmentI;
30 import jalview.datamodel.DBRefEntry;
31 import jalview.datamodel.Mapping;
32 import jalview.datamodel.Sequence;
33 import jalview.datamodel.SequenceI;
34 import jalview.io.AppletFormatAdapter;
35 import jalview.io.FormatAdapter;
36 import jalview.util.MapList;
38 import java.io.IOException;
39 import java.util.ArrayList;
40 import java.util.Arrays;
41 import java.util.Collections;
42 import java.util.List;
45 import org.junit.Test;
47 public class AlignmentUtilsTests
// Fixture data shared by the tests in this class.
// NOTE(review): some string-literal lines of these constants are not visible in
// this view, so the complete values of the constants cannot be confirmed here.
//
// TEST_DATA: annotated alignment text for three D.melanogaster sequences. The
// "#=GS" / "#=GR ... SS" markup looks like Stockholm format - TODO confirm.
50 private static final String TEST_DATA =
52 "#=GS D.melanogaster.1 AC AY119185.1/838-902\n" +
53 "#=GS D.melanogaster.2 AC AC092237.1/57223-57161\n" +
54 "#=GS D.melanogaster.3 AC AY060611.1/560-627\n" +
55 "D.melanogaster.1 G.AGCC.CU...AUGAUCGA\n" +
56 "#=GR D.melanogaster.1 SS ................((((\n" +
57 "D.melanogaster.2 C.AUUCAACU.UAUGAGGAU\n" +
58 "#=GR D.melanogaster.2 SS ................((((\n" +
59 "D.melanogaster.3 G.UGGCGCU..UAUGACGCA\n" +
60 "#=GR D.melanogaster.3 SS (.(((...(....(((((((\n" +
// AA_SEQS_1: presumably amino-acid sequence text; its value is not visible here.
63 private static final String AA_SEQS_1 =
// CDNA_SEQS_1: gapped nucleotide strings, each terminated by "\n".
69 private static final String CDNA_SEQS_1 =
71 "AC-GG--CUC-CAA-CT\n" +
73 "-CG-TTA--ACG---AAGT\n";
// CDNA_SEQS_2: value not visible here.
75 private static final String CDNA_SEQS_2 =
// Sequence named "short" built from a repeating ASD motif; testExpandFlanks
// derives its sub-sequences from this instance.
82 public static Sequence ts=new Sequence("short","ASDASDASDASDASDASDASDASDASDASDASDASDASD");
// Exercises AlignmentUtils.expandContext for flank sizes -1 to 24 and checks
// that every sequence in the expanded alignment, once its gaps are removed,
// matches its own dataset sequence (case-insensitively).
// NOTE(review): the statements that add s1 to 'al' and that declare 'exp' are
// not visible in this view.
85 public void testExpandFlanks()
87 AlignmentI al = new Alignment(new Sequence[] {});
// Sub-sequences of 'ts' taken at start offsets 4, 7, 10 and 13, each spanning i..i+7.
88 for (int i=4;i<14;i+=3)
90 SequenceI s1=ts.deriveSequence().getSubSequence(i, i+7);
// Print the starting alignment in Clustal format for manual inspection.
93 System.out.println(new AppletFormatAdapter().formatSequences("Clustal", al, true));
94 for (int flnk=-1;flnk<25; flnk++)
97 System.out.println("\nFlank size: "+flnk);
// The expanded alignment is assigned to 'exp' inside the print call and then verified below.
98 System.out.println(new AppletFormatAdapter().formatSequences("Clustal", exp=AlignmentUtils.expandContext(al, flnk), true));
100 for (SequenceI sq:exp.getSequences())
// Remove every run of '-' gap characters before comparing with the dataset sequence.
102 String ung = sq.getSequenceAsString().replaceAll("-+", "");
103 assertTrue("Flanking sequence not the same as original dataset sequence.\n"+ung+"\n"+sq.getDatasetSequence().getSequenceAsString(),ung.equalsIgnoreCase(sq.getDatasetSequence().getSequenceAsString()));
110 * Test method that returns a map of lists of sequences by sequence name.
112 * @throws IOException
// Loads three FASTA records, two of which share the name Seq1Name, and checks
// that AlignmentUtils.getSequencesByName groups them under two keys.
115 public void testGetSequencesByName() throws IOException
117 final String data = ">Seq1Name\nKQYL\n" + ">Seq2Name\nRFPW\n"
118 + ">Seq1Name\nABCD\n";
119 AlignmentI al = loadAlignment(data, "FASTA");
120 Map<String, List<SequenceI>> map = AlignmentUtils
121 .getSequencesByName(al);
// Two distinct names are expected as keys.
122 assertEquals(2, map.keySet().size());
// Both Seq1Name records are expected, in the order they appear in the input.
123 assertEquals(2, map.get("Seq1Name").size());
124 assertEquals("KQYL", map.get("Seq1Name").get(0).getSequenceAsString());
125 assertEquals("ABCD", map.get("Seq1Name").get(1).getSequenceAsString());
126 assertEquals(1, map.get("Seq2Name").size());
127 assertEquals("RFPW", map.get("Seq2Name").get(0).getSequenceAsString());
130 * Helper method to load an alignment and ensure dataset sequences are set up.
135 * @throws IOException
// Parses 'data' as pasted text (AppletFormatAdapter.PASTE) in the given format
// using FormatAdapter.readFile.
// NOTE(review): the rest of this method (any dataset setup and the return
// statement) is not visible in this view.
137 protected AlignmentI loadAlignment(final String data, String format) throws IOException
139 Alignment a = new FormatAdapter().readFile(data,
140 AppletFormatAdapter.PASTE, format);
146 * Test mapping of protein to cDNA, for the case where we have no sequence
147 * cross-references, so mappings are made first-served 1-1 where sequences
150 * @throws IOException
// Maps three proteins onto four cDNA sequences that carry no database
// cross-references, then checks which cDNA each protein was paired with and the
// shape of the first mapping (3:1 ratio, dna 1-9 to protein 1-3).
153 public void testMapProteinToCdna_noXrefs() throws IOException
155 List<SequenceI> protseqs = new ArrayList<SequenceI>();
156 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
157 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
158 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
159 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
// presumably creates the alignment's dataset so getDatasetSequence() is usable below - confirm
160 protein.setDataset(null);
162 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
163 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
164 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAA")); // = EIQ
165 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
166 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
167 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
168 cdna.setDataset(null);
170 assertTrue(AlignmentUtils.mapProteinToCdna(protein, cdna));
172 // 3 mappings made, each from 1 to 1 sequence
173 assertEquals(3, protein.getCodonFrames().size());
174 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
175 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
176 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
178 // V12345 mapped to A22222
179 AlignedCodonFrame acf = protein.getCodonFrame(
180 protein.getSequenceAt(0)).get(0);
181 assertEquals(1, acf.getdnaSeqs().length);
182 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
183 acf.getdnaSeqs()[0]);
184 Mapping[] protMappings = acf.getProtMappings();
185 assertEquals(1, protMappings.length);
186 MapList mapList = protMappings[0].getMap();
// Three 'from' (dna) positions per one 'to' (protein) position.
187 assertEquals(3, mapList.getFromRatio());
188 assertEquals(1, mapList.getToRatio());
// A single 'from' range covering dna positions 1-9.
189 assertTrue(Arrays.equals(new int[]
190 { 1, 9 }, mapList.getFromRanges().get(0)));
191 assertEquals(1, mapList.getFromRanges().size());
// A single 'to' range covering protein positions 1-3.
192 assertTrue(Arrays.equals(new int[]
193 { 1, 3 }, mapList.getToRanges().get(0)));
194 assertEquals(1, mapList.getToRanges().size());
196 // V12346 mapped to A33333
197 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
198 assertEquals(1, acf.getdnaSeqs().length);
199 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
200 acf.getdnaSeqs()[0]);
202 // V12347 mapped to A11111
203 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
204 assertEquals(1, acf.getdnaSeqs().length);
205 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
206 acf.getdnaSeqs()[0]);
208 // no mapping involving the 'extra' A44444
209 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
213 * Test for the alignSequenceAs method that takes two sequences and a mapping.
// Runs checkAlignSequenceAs on the protein alignment "-A-L-" with differently
// gapped dna inputs and each combination of the two gap-preservation flags.
// The two boolean arguments are preserveMappedGaps then preserveUnmappedGaps.
// NOTE(review): the MapList constructor arguments and several of the expected
// result strings are not visible in this view.
216 public void testAlignSequenceAs_withMapping_noIntrons()
218 MapList map = new MapList(new int[]
223 * No existing gaps in dna:
225 checkAlignSequenceAs("GGGAAA", "-A-L-", false, false, map,
229 * Now introduce gaps in dna but ignore them when realigning.
231 checkAlignSequenceAs("-G-G-G-A-A-A-", "-A-L-", false, false, map,
235 * Now include gaps in dna when realigning. First retaining 'mapped' gaps
236 * only, i.e. those within the exon region.
238 checkAlignSequenceAs("-G-G--G-A--A-A-", "-A-L-", true, false, map,
239 "---G-G--G---A--A-A");
242 * Include all gaps in dna when realigning (within and without the exon
243 * region). The leading gap, and the gaps between codons, are subsumed by
244 * the protein alignment gap.
246 checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", true, true, map,
250 * Include only unmapped gaps in dna when realigning (outside the exon
251 * region). The leading gap, and the gaps between codons, are subsumed by
252 * the protein alignment gap.
254 checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", false, true, map,
259 * Test for the alignSequenceAs method that takes two sequences and a mapping.
// Runs checkAlignSequenceAs with a mapping whose dna ranges are 4-6 and 10-12
// (the exons), so the remaining dna positions are unmapped. The protein
// alignment is "--A-L-" throughout; only the dna gaps and the two
// gap-preservation flags (preserveMappedGaps, preserveUnmappedGaps) vary.
// NOTE(review): the protein-side range and ratio arguments of the MapList are
// not visible in this view.
262 public void testAlignSequenceAs_withMapping_withIntrons()
265 * Exons at codon 2 (AAA) and 4 (TTT)
267 MapList map = new MapList(new int[]
268 { 4, 6, 10, 12 }, new int[]
272 * Simple case: no gaps in dna
274 checkAlignSequenceAs("GGGAAACCCTTTGGG", "--A-L-", false, false, map,
275 "GGG---AAACCCTTTGGG");
278 * Add gaps to dna - but ignore when realigning.
280 checkAlignSequenceAs("-G-G-G--A--A---AC-CC-T-TT-GG-G-", "--A-L-",
281 false, false, map, "GGG---AAACCCTTTGGG");
284 * Add gaps to dna - include within exons only when realigning.
286 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
287 true, false, map, "GGG---A--A---ACCCT-TTGGG");
290 * Include gaps outside exons only when realigning.
292 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
293 false, true, map, "-G-G-GAAAC-CCTTT-GG-G-");
296 * Include gaps following first intron if we are 'preserving mapped gaps'
// NOTE(review): this call and the final call below pass identical arguments and
// expect the identical result, so one of them is redundant - confirm intent.
298 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
299 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
302 * Include all gaps in dna when realigning.
304 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
305 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
309 * Test for the case where not all of the protein sequence is mapped to cDNA.
// Checks the case where one protein residue has no dna mapping: the map covers
// protein positions 1 and 3 only, while the protein alignment "-A-L-P-" has
// three residues. The expected dna string equals the input, i.e. unchanged.
312 public void testAlignSequenceAs_withMapping_withUnmappedProtein()
316 * Exons at codon 2 (AAA) and 4 (TTT) mapped to A and P
// dna ranges 4-6 and 10-12 map to protein ranges 1-1 and 3-3, ratio 3:1.
318 final MapList map = new MapList(new int[]
319 { 4, 6, 10, 12 }, new int[]
320 { 1, 1, 3, 3 }, 3, 1);
324 * Expect alignment does nothing (aborts realignment). Change this test
325 * first if different behaviour wanted.
327 checkAlignSequenceAs("GGGAAACCCTTTGGG", "-A-L-P-", false,
328 false, map, "GGGAAACCCTTTGGG");
332 * Helper method that performs and verifies the method under test.
336 * @param preserveMappedGaps
337 * @param preserveUnmappedGaps
// Shared assertion helper: builds a dna and a protein sequence (both named
// "Seq1") from the given strings, links their dataset sequences with 'map' in
// an AlignedCodonFrame, calls AlignmentUtils.alignSequenceAs on the dna, and
// asserts that the resulting dna string equals 'expected'.
341 protected void checkAlignSequenceAs(final String dnaSeq,
342 final String proteinSeq, final boolean preserveMappedGaps,
343 final boolean preserveUnmappedGaps, MapList map,
344 final String expected)
346 SequenceI dna = new Sequence("Seq1", dnaSeq);
347 dna.createDatasetSequence();
348 SequenceI protein = new Sequence("Seq1", proteinSeq);
349 protein.createDatasetSequence();
350 AlignedCodonFrame acf = new AlignedCodonFrame();
// The mapping is registered between the dataset sequences, not the aligned ones.
351 acf.addMap(dna.getDatasetSequence(), protein.getDatasetSequence(), map);
// "---" and '-' are presumably the gap string and gap character to use - confirm
// against AlignmentUtils.alignSequenceAs.
353 AlignmentUtils.alignSequenceAs(dna, protein, acf, "---", '-',
354 preserveMappedGaps, preserveUnmappedGaps);
// 'dna' is inspected after the call, so alignSequenceAs is expected to modify it in place.
355 assertEquals(expected, dna.getSequenceAsString());
359 * Test for the alignSequenceAs method where we preserve gaps in introns only.
// Calls checkAlignSequenceAs with preserveMappedGaps=false and
// preserveUnmappedGaps=true. In the expected string the gaps within the leading
// GGGAAA stretch are kept and the gaps in the CCCTTT stretch are removed.
// NOTE(review): the MapList constructor arguments are not visible in this view.
362 public void testAlignSequenceAs_keepIntronGapsOnly()
366 * Intron GGGAAA followed by exon CCCTTT
368 MapList map = new MapList(new int[]
372 checkAlignSequenceAs("GG-G-AA-A-C-CC-T-TT", "AL",
373 false, true, map, "GG-G-AA-ACCCTTT");
377 * Test for the method that generates an aligned translated sequence from one
// Calls AlignmentUtils.getAlignedTranslation with a gapped protein and a mapped
// dna sequence, then checks the returned sequence's aligned string and that it
// shares the dna's dataset sequence.
// NOTE(review): the MapList constructor arguments are not visible in this view.
381 public void testGetAlignedTranslation_dnaLikeProtein()
383 // dna alignment will be replaced
384 SequenceI dna = new Sequence("Seq1", "T-G-CC-A--T-TAC-CAG-");
385 dna.createDatasetSequence();
386 // protein alignment will be 'applied' to dna
387 SequenceI protein = new Sequence("Seq1", "-CH-Y--Q-");
388 protein.createDatasetSequence();
389 MapList map = new MapList(new int[]
392 AlignedCodonFrame acf = new AlignedCodonFrame();
393 acf.addMap(dna.getDatasetSequence(), protein.getDatasetSequence(), map);
395 final SequenceI aligned = AlignmentUtils
396 .getAlignedTranslation(protein, '-', acf);
// Expected: one codon per protein residue and "---" per protein gap column.
397 assertEquals("---TGCCAT---TAC------CAG---", aligned.getSequenceAsString());
// Same dataset object expected (identity, not just equality).
398 assertSame(aligned.getDatasetSequence(), dna.getDatasetSequence());
402 * Test the method that realigns protein to match mapped codon alignment.
// Builds three dna sequences with the same bases but a gap in different places,
// maps each to an identical ungapped protein "CHYQ" within one
// AlignedCodonFrame, calls AlignmentUtils.alignProteinAsDna and checks the
// gapped protein strings that result.
// NOTE(review): the MapList constructor arguments are not visible in this view.
405 public void testAlignProteinAsDna()
407 // seq1 codons are [1,2,3] [4,5,6] [7,8,9] [10,11,12]
408 SequenceI dna1 = new Sequence("Seq1", "TGCCATTACCAG-");
409 // seq2 codons are [1,3,4] [5,6,7] [8,9,10] [11,12,13]
410 SequenceI dna2 = new Sequence("Seq2", "T-GCCATTACCAG");
411 // seq3 codons are [1,2,3] [4,5,7] [8,9,10] [11,12,13]
412 SequenceI dna3 = new Sequence("Seq3", "TGCCA-TTACCAG");
413 AlignmentI dna = new Alignment(new SequenceI[]
414 { dna1, dna2, dna3 });
415 dna.setDataset(null);
417 // protein alignment will be realigned like dna
418 SequenceI prot1 = new Sequence("Seq1", "CHYQ");
419 SequenceI prot2 = new Sequence("Seq2", "CHYQ");
420 SequenceI prot3 = new Sequence("Seq3", "CHYQ");
421 AlignmentI protein = new Alignment(new SequenceI[]
422 { prot1, prot2, prot3 });
423 protein.setDataset(null);
425 MapList map = new MapList(new int[]
// The same 'map' instance is used for all three dna/protein pairs.
428 AlignedCodonFrame acf = new AlignedCodonFrame();
429 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
430 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
431 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
// The codon frame is attached to the protein alignment only.
432 protein.setCodonFrames(Collections.singleton(acf));
435 * Translated codon order is [1,2,3] [1,3,4] [4,5,6] [4,5,7] [5,6,7] [7,8,9]
436 * [8,9,10] [10,11,12] [11,12,13]
438 AlignmentUtils.alignProteinAsDna(protein, dna);
// Nine columns expected, one per distinct codon position listed above.
439 assertEquals("C-H--Y-Q-", prot1.getSequenceAsString());
440 assertEquals("-C--H-Y-Q", prot2.getSequenceAsString());
441 assertEquals("C--H--Y-Q", prot3.getSequenceAsString());
445 * Test the method that tests whether a CDNA sequence translates to a protein
// Checks AlignmentUtils.translatesAs(cdna, startOffset, protein) for cdna that
// is the bare coding sequence, that has a leading "atg" (offset 3), and that
// has a trailing taa/tag/tga triplet, plus one negative case.
// NOTE(review): some lines of this method are not visible in this view,
// including the offset argument of the final assertFalse call.
449 public void testTranslatesAs()
451 assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0,
452 "FPKG".toCharArray()));
454 assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(),
455 3, "FPKG".toCharArray()));
457 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
458 0, "FPKG".toCharArray()));
460 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtag".toCharArray(),
461 0, "FPKG".toCharArray()));
463 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtga".toCharArray(),
464 0, "FPKG".toCharArray()));
// NOTE(review): the three inputs below are 17 characters long, leaving 14 after
// offset 3 ("...aaaggtaa"), which is not a whole number of codons. The inputs
// above use "aaaggg" followed by a three-letter stop codon, so these look like
// they are missing a 'g' (e.g. "atgtttcccaaagggtaa") - confirm before relying
// on them as start+stop codon cases.
465 // with start and stop codon1
466 assertTrue(AlignmentUtils.translatesAs(
467 "atgtttcccaaaggtaa".toCharArray(), 3, "FPKG".toCharArray()));
468 // with start and stop codon2
469 assertTrue(AlignmentUtils.translatesAs(
470 "atgtttcccaaaggtag".toCharArray(), 3, "FPKG".toCharArray()));
471 // with start and stop codon3
472 assertTrue(AlignmentUtils.translatesAs(
473 "atgtttcccaaaggtga".toCharArray(), 3, "FPKG".toCharArray()));
// Negative case: same cdna as the first assertion, but the protein has M where K is expected.
476 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
478 "FPMG".toCharArray()));
482 * Test mapping of protein to cDNA, for cases where the cDNA has start and/or
483 * stop codons in addition to the protein coding sequence.
485 * @throws IOException
// Maps three proteins onto cDNA sequences that carry extra leading "ATG" and/or
// trailing TAA/TAG triplets, and checks that each mapping's dna range covers
// only the nine coding positions (1-9 or 4-12) with a 3:1 ratio.
// NOTE(review): some lines of this method (e.g. any throws clause) are not
// visible in this view.
488 public void testMapProteinToCdna_withStartAndStopCodons()
491 List<SequenceI> protseqs = new ArrayList<SequenceI>();
492 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
493 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
494 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
495 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
496 protein.setDataset(null);
498 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
// A11111: "ATG" followed by the nine bases used for SAR in the no-xrefs test.
500 dnaseqs.add(new Sequence("EMBL|A11111", "ATGTCAGCACGC"));
// A22222: the nine bases used for EIQ in the no-xrefs test followed by "TAA".
502 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAATAA"));
503 // = start +EIQ + stop
504 dnaseqs.add(new Sequence("EMBL|A33333", "ATGGAAATCCAGTAG"));
505 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG"));
506 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
507 cdna.setDataset(null);
509 assertTrue(AlignmentUtils.mapProteinToCdna(protein, cdna));
511 // 3 mappings made, each from 1 to 1 sequence
512 assertEquals(3, protein.getCodonFrames().size());
513 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
514 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
515 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
517 // V12345 mapped from A22222
518 AlignedCodonFrame acf = protein.getCodonFrame(
519 protein.getSequenceAt(0)).get(0);
520 assertEquals(1, acf.getdnaSeqs().length);
521 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
522 acf.getdnaSeqs()[0]);
523 Mapping[] protMappings = acf.getProtMappings();
524 assertEquals(1, protMappings.length);
525 MapList mapList = protMappings[0].getMap();
526 assertEquals(3, mapList.getFromRatio());
527 assertEquals(1, mapList.getToRatio());
// dna range 1-9: the trailing "TAA" (positions 10-12) is excluded from the mapping.
528 assertTrue(Arrays.equals(new int[]
529 { 1, 9 }, mapList.getFromRanges().get(0)));
530 assertEquals(1, mapList.getFromRanges().size());
531 assertTrue(Arrays.equals(new int[]
532 { 1, 3 }, mapList.getToRanges().get(0)));
533 assertEquals(1, mapList.getToRanges().size());
535 // V12346 mapped from A33333 starting position 4
536 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
537 assertEquals(1, acf.getdnaSeqs().length);
538 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
539 acf.getdnaSeqs()[0]);
540 protMappings = acf.getProtMappings();
541 assertEquals(1, protMappings.length);
542 mapList = protMappings[0].getMap();
543 assertEquals(3, mapList.getFromRatio());
544 assertEquals(1, mapList.getToRatio());
// dna range 4-12: the leading "ATG" (1-3) and trailing "TAG" (13-15) are excluded.
545 assertTrue(Arrays.equals(new int[]
546 { 4, 12 }, mapList.getFromRanges().get(0)));
547 assertEquals(1, mapList.getFromRanges().size());
548 assertTrue(Arrays.equals(new int[]
549 { 1, 3 }, mapList.getToRanges().get(0)));
550 assertEquals(1, mapList.getToRanges().size());
552 // V12347 mapped to A11111 starting position 4
553 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
554 assertEquals(1, acf.getdnaSeqs().length);
555 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
556 acf.getdnaSeqs()[0]);
557 protMappings = acf.getProtMappings();
558 assertEquals(1, protMappings.length);
559 mapList = protMappings[0].getMap();
560 assertEquals(3, mapList.getFromRatio());
561 assertEquals(1, mapList.getToRatio());
// dna range 4-12: the leading "ATG" (1-3) is excluded.
562 assertTrue(Arrays.equals(new int[]
563 { 4, 12 }, mapList.getFromRanges().get(0)));
564 assertEquals(1, mapList.getFromRanges().size());
565 assertTrue(Arrays.equals(new int[]
566 { 1, 3 }, mapList.getToRanges().get(0)));
567 assertEquals(1, mapList.getToRanges().size());
569 // no mapping involving the 'extra' A44444
570 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
574 * Test mapping of protein to cDNA, for the case where we have some sequence
575 * cross-references. Verify that 1-to-many mappings are made where
576 * cross-references exist and sequences are mappable.
578 * @throws IOException
// Maps three proteins onto five cDNA sequences where some sequences carry
// database cross-references (DBRefEntry) to each other, and checks that V12345
// ends up with two dna sequences in its codon frame while the other two
// proteins get one each and A55555 gets none.
581 public void testMapProteinToCdna_withXrefs() throws IOException
583 List<SequenceI> protseqs = new ArrayList<SequenceI>();
584 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
585 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
586 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
587 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
588 protein.setDataset(null);
590 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
591 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
592 dnaseqs.add(new Sequence("EMBL|A22222", "ATGGAGATACAA")); // = start + EIQ
593 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
594 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
595 dnaseqs.add(new Sequence("EMBL|A55555", "GAGATTCAG")); // = EIQ
596 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[5]));
597 cdna.setDataset(null);
// The cross-references are added after the alignments (and their datasets) were created.
599 // Xref A22222 to V12345 (should get mapped)
600 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
601 // Xref V12345 to A44444 (should get mapped)
602 protseqs.get(0).addDBRef(new DBRefEntry("EMBL", "1", "A44444"));
603 // Xref A33333 to V12347 (sequence mismatch - should not get mapped)
604 dnaseqs.get(2).addDBRef(new DBRefEntry("UNIPROT", "1", "V12347"));
605 // as V12345 is mapped to A22222 and A44444, this leaves V12346 unmapped.
606 // it should get paired up with the unmapped A33333
607 // A11111 should be mapped to V12347
608 // A55555 is spare and has no xref so is not mapped
610 assertTrue(AlignmentUtils.mapProteinToCdna(protein, cdna));
// Three codon frames are expected (one per protein); the frame for V12345 is
// asserted further down to hold two dna sequences, giving four mappings in all.
612 // 4 protein mappings made for 3 proteins, 2 to V12345, 1 each to V12346/7
613 assertEquals(3, protein.getCodonFrames().size());
614 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
615 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
616 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
618 // one mapping for each of the first 4 cDNA sequences
619 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
620 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
621 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(2)).size());
622 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(3)).size());
624 // V12345 mapped to A22222 and A44444
625 AlignedCodonFrame acf = protein.getCodonFrame(
626 protein.getSequenceAt(0)).get(0);
627 assertEquals(2, acf.getdnaSeqs().length);
628 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
629 acf.getdnaSeqs()[0]);
630 assertEquals(cdna.getSequenceAt(3).getDatasetSequence(),
631 acf.getdnaSeqs()[1]);
633 // V12346 mapped to A33333
634 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
635 assertEquals(1, acf.getdnaSeqs().length);
636 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
637 acf.getdnaSeqs()[0]);
639 // V12347 mapped to A11111
640 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
641 assertEquals(1, acf.getdnaSeqs().length);
642 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
643 acf.getdnaSeqs()[0]);
645 // no mapping involving the 'extra' A55555
646 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(4)).isEmpty());
650 * Test mapping of protein to cDNA, for the case where we have some sequence
651 * cross-references. Verify that once we have made an xref mapping we don't
652 * also map un-xrefd sequences.
654 * @throws IOException
// Two identical proteins and two cDNA sequences that both translate to EIQ;
// only A22222 carries a cross-reference (to V12345). Checks that V12345 is
// paired with A22222 and that V12346 is paired with the remaining A11111.
// NOTE(review): two short continuation lines (the end of the cdna array
// expression and of the first getCodonFrame call) are not visible in this view.
657 public void testMapProteinToCdna_prioritiseXrefs() throws IOException
659 List<SequenceI> protseqs = new ArrayList<SequenceI>();
660 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
661 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
662 AlignmentI protein = new Alignment(
663 protseqs.toArray(new SequenceI[protseqs.size()]));
664 protein.setDataset(null);
666 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
667 dnaseqs.add(new Sequence("EMBL|A11111", "GAAATCCAG")); // = EIQ
668 dnaseqs.add(new Sequence("EMBL|A22222", "GAAATTCAG")); // = EIQ
669 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[dnaseqs
671 cdna.setDataset(null);
673 // Xref A22222 to V12345 (should get mapped)
674 // A11111 should then be mapped to the unmapped V12346
675 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
677 assertTrue(AlignmentUtils.mapProteinToCdna(protein, cdna));
679 // 2 protein mappings made
680 assertEquals(2, protein.getCodonFrames().size());
681 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
682 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
684 // one mapping for each of the cDNA sequences
685 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
686 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
688 // V12345 mapped to A22222
689 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
// Exactly one dna sequence: the xref'd protein must not also pick up the un-xref'd A11111.
691 assertEquals(1, acf.getdnaSeqs().length);
692 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
693 acf.getdnaSeqs()[0]);
695 // V12346 mapped to A11111
696 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
697 assertEquals(1, acf.getdnaSeqs().length);
698 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
699 acf.getdnaSeqs()[0]);