2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import static org.junit.Assert.assertEquals;
24 import static org.junit.Assert.assertFalse;
25 import static org.junit.Assert.assertSame;
26 import static org.junit.Assert.assertTrue;
28 import java.io.IOException;
29 import java.util.ArrayList;
30 import java.util.Arrays;
31 import java.util.Collections;
32 import java.util.HashSet;
33 import java.util.Iterator;
34 import java.util.LinkedHashSet;
35 import java.util.List;
39 import org.junit.Test;
41 import jalview.datamodel.AlignedCodonFrame;
42 import jalview.datamodel.Alignment;
43 import jalview.datamodel.AlignmentAnnotation;
44 import jalview.datamodel.AlignmentI;
45 import jalview.datamodel.Annotation;
46 import jalview.datamodel.DBRefEntry;
47 import jalview.datamodel.Mapping;
48 import jalview.datamodel.SearchResults;
49 import jalview.datamodel.SearchResults.Match;
50 import jalview.datamodel.Sequence;
51 import jalview.datamodel.SequenceI;
52 import jalview.io.AppletFormatAdapter;
53 import jalview.io.FormatAdapter;
54 import jalview.util.MapList;
55 import jalview.util.MappingUtils;
57 public class AlignmentUtilsTests
60 private static final String TEST_DATA =
62 "#=GS D.melanogaster.1 AC AY119185.1/838-902\n" +
63 "#=GS D.melanogaster.2 AC AC092237.1/57223-57161\n" +
64 "#=GS D.melanogaster.3 AC AY060611.1/560-627\n" +
65 "D.melanogaster.1 G.AGCC.CU...AUGAUCGA\n" +
66 "#=GR D.melanogaster.1 SS ................((((\n" +
67 "D.melanogaster.2 C.AUUCAACU.UAUGAGGAU\n" +
68 "#=GR D.melanogaster.2 SS ................((((\n" +
69 "D.melanogaster.3 G.UGGCGCU..UAUGACGCA\n" +
70 "#=GR D.melanogaster.3 SS (.(((...(....(((((((\n" +
73 private static final String AA_SEQS_1 =
79 private static final String CDNA_SEQS_1 =
81 "AC-GG--CUC-CAA-CT\n" +
83 "-CG-TTA--ACG---AAGT\n";
85 private static final String CDNA_SEQS_2 =
92 public static Sequence ts=new Sequence("short","ASDASDASDASDASDASDASDASDASDASDASDASDASD");
95 public void testExpandFlanks()
97 AlignmentI al = new Alignment(new Sequence[] {});
98 for (int i=4;i<14;i+=3)
100 SequenceI s1=ts.deriveSequence().getSubSequence(i, i+7);
103 System.out.println(new AppletFormatAdapter().formatSequences("Clustal", al, true));
104 for (int flnk=-1;flnk<25; flnk++)
107 System.out.println("\nFlank size: "+flnk);
108 System.out.println(new AppletFormatAdapter().formatSequences("Clustal", exp=AlignmentUtils.expandContext(al, flnk), true));
110 for (SequenceI sq:exp.getSequences())
112 String ung = sq.getSequenceAsString().replaceAll("-+", "");
113 assertTrue("Flanking sequence not the same as original dataset sequence.\n"+ung+"\n"+sq.getDatasetSequence().getSequenceAsString(),ung.equalsIgnoreCase(sq.getDatasetSequence().getSequenceAsString()));
120 * Test method that returns a map of lists of sequences by sequence name.
122 * @throws IOException
125 public void testGetSequencesByName() throws IOException
127 final String data = ">Seq1Name\nKQYL\n" + ">Seq2Name\nRFPW\n"
128 + ">Seq1Name\nABCD\n";
129 AlignmentI al = loadAlignment(data, "FASTA");
130 Map<String, List<SequenceI>> map = AlignmentUtils
131 .getSequencesByName(al);
132 assertEquals(2, map.keySet().size());
133 assertEquals(2, map.get("Seq1Name").size());
134 assertEquals("KQYL", map.get("Seq1Name").get(0).getSequenceAsString());
135 assertEquals("ABCD", map.get("Seq1Name").get(1).getSequenceAsString());
136 assertEquals(1, map.get("Seq2Name").size());
137 assertEquals("RFPW", map.get("Seq2Name").get(0).getSequenceAsString());
140 * Helper method to load an alignment and ensure dataset sequences are set up.
145 * @throws IOException
147 protected AlignmentI loadAlignment(final String data, String format) throws IOException
149 Alignment a = new FormatAdapter().readFile(data,
150 AppletFormatAdapter.PASTE, format);
156 * Test mapping of protein to cDNA, for the case where we have no sequence
157 * cross-references, so mappings are made first-served 1-1 where sequences
160 * @throws IOException
163 public void testMapProteinToCdna_noXrefs() throws IOException
165 List<SequenceI> protseqs = new ArrayList<SequenceI>();
166 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
167 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
168 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
169 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
170 protein.setDataset(null);
172 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
173 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
174 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAA")); // = EIQ
175 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
176 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
177 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
178 cdna.setDataset(null);
180 assertTrue(AlignmentUtils.mapProteinToCdna(protein, cdna));
182 // 3 mappings made, each from 1 to 1 sequence
183 assertEquals(3, protein.getCodonFrames().size());
184 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
185 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
186 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
188 // V12345 mapped to A22222
189 AlignedCodonFrame acf = protein.getCodonFrame(
190 protein.getSequenceAt(0)).get(0);
191 assertEquals(1, acf.getdnaSeqs().length);
192 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
193 acf.getdnaSeqs()[0]);
194 Mapping[] protMappings = acf.getProtMappings();
195 assertEquals(1, protMappings.length);
196 MapList mapList = protMappings[0].getMap();
197 assertEquals(3, mapList.getFromRatio());
198 assertEquals(1, mapList.getToRatio());
199 assertTrue(Arrays.equals(new int[]
200 { 1, 9 }, mapList.getFromRanges().get(0)));
201 assertEquals(1, mapList.getFromRanges().size());
202 assertTrue(Arrays.equals(new int[]
203 { 1, 3 }, mapList.getToRanges().get(0)));
204 assertEquals(1, mapList.getToRanges().size());
206 // V12346 mapped to A33333
207 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
208 assertEquals(1, acf.getdnaSeqs().length);
209 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
210 acf.getdnaSeqs()[0]);
212 // V12347 mapped to A11111
213 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
214 assertEquals(1, acf.getdnaSeqs().length);
215 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
216 acf.getdnaSeqs()[0]);
218 // no mapping involving the 'extra' A44444
219 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
223 * Test for the alignSequenceAs method that takes two sequences and a mapping.
226 public void testAlignSequenceAs_withMapping_noIntrons()
228 MapList map = new MapList(new int[]
233 * No existing gaps in dna:
235 checkAlignSequenceAs("GGGAAA", "-A-L-", false, false, map,
239 * Now introduce gaps in dna but ignore them when realigning.
241 checkAlignSequenceAs("-G-G-G-A-A-A-", "-A-L-", false, false, map,
245 * Now include gaps in dna when realigning. First retaining 'mapped' gaps
246 * only, i.e. those within the exon region.
248 checkAlignSequenceAs("-G-G--G-A--A-A-", "-A-L-", true, false, map,
249 "---G-G--G---A--A-A");
252 * Include all gaps in dna when realigning (within and without the exon
253 * region). The leading gap, and the gaps between codons, are subsumed by
254 * the protein alignment gap.
256 checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", true, true, map,
260 * Include only unmapped gaps in dna when realigning (outside the exon
261 * region). The leading gap, and the gaps between codons, are subsumed by
262 * the protein alignment gap.
264 checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", false, true, map,
269 * Test for the alignSequenceAs method that takes two sequences and a mapping.
272 public void testAlignSequenceAs_withMapping_withIntrons()
275 * Exons at codon 2 (AAA) and 4 (TTT)
277 MapList map = new MapList(new int[]
278 { 4, 6, 10, 12 }, new int[]
282 * Simple case: no gaps in dna
284 checkAlignSequenceAs("GGGAAACCCTTTGGG", "--A-L-", false, false, map,
285 "GGG---AAACCCTTTGGG");
288 * Add gaps to dna - but ignore when realigning.
290 checkAlignSequenceAs("-G-G-G--A--A---AC-CC-T-TT-GG-G-", "--A-L-",
291 false, false, map, "GGG---AAACCCTTTGGG");
294 * Add gaps to dna - include within exons only when realigning.
296 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
297 true, false, map, "GGG---A--A---ACCCT-TTGGG");
300 * Include gaps outside exons only when realigning.
302 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
303 false, true, map, "-G-G-GAAAC-CCTTT-GG-G-");
306 * Include gaps following first intron if we are 'preserving mapped gaps'
308 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
309 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
312 * Include all gaps in dna when realigning.
314 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
315 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
319 * Test for the case where not all of the protein sequence is mapped to cDNA.
322 public void testAlignSequenceAs_withMapping_withUnmappedProtein()
326 * Exons at codon 2 (AAA) and 4 (TTT) mapped to A and P
328 final MapList map = new MapList(new int[]
329 { 4, 6, 10, 12 }, new int[]
330 { 1, 1, 3, 3 }, 3, 1);
334 * Expect alignment does nothing (aborts realignment). Change this test
335 * first if different behaviour wanted.
337 checkAlignSequenceAs("GGGAAACCCTTTGGG", "-A-L-P-", false,
338 false, map, "GGGAAACCCTTTGGG");
342 * Helper method that performs and verifies the method under test.
346 * @param preserveMappedGaps
347 * @param preserveUnmappedGaps
351 protected void checkAlignSequenceAs(final String dnaSeq,
352 final String proteinSeq, final boolean preserveMappedGaps,
353 final boolean preserveUnmappedGaps, MapList map,
354 final String expected)
356 SequenceI dna = new Sequence("Seq1", dnaSeq);
357 dna.createDatasetSequence();
358 SequenceI protein = new Sequence("Seq1", proteinSeq);
359 protein.createDatasetSequence();
360 AlignedCodonFrame acf = new AlignedCodonFrame();
361 acf.addMap(dna.getDatasetSequence(), protein.getDatasetSequence(), map);
363 AlignmentUtils.alignSequenceAs(dna, protein, acf, "---", '-',
364 preserveMappedGaps, preserveUnmappedGaps);
365 assertEquals(expected, dna.getSequenceAsString());
369 * Test for the alignSequenceAs method where we preserve gaps in introns only.
372 public void testAlignSequenceAs_keepIntronGapsOnly()
376 * Intron GGGAAA followed by exon CCCTTT
378 MapList map = new MapList(new int[]
382 checkAlignSequenceAs("GG-G-AA-A-C-CC-T-TT", "AL",
383 false, true, map, "GG-G-AA-ACCCTTT");
387 * Test for the method that generates an aligned translated sequence from one
391 public void testGetAlignedTranslation_dnaLikeProtein()
393 // dna alignment will be replaced
394 SequenceI dna = new Sequence("Seq1", "T-G-CC-A--T-TAC-CAG-");
395 dna.createDatasetSequence();
396 // protein alignment will be 'applied' to dna
397 SequenceI protein = new Sequence("Seq1", "-CH-Y--Q-");
398 protein.createDatasetSequence();
399 MapList map = new MapList(new int[]
402 AlignedCodonFrame acf = new AlignedCodonFrame();
403 acf.addMap(dna.getDatasetSequence(), protein.getDatasetSequence(), map);
405 final SequenceI aligned = AlignmentUtils
406 .getAlignedTranslation(protein, '-', acf);
407 assertEquals("---TGCCAT---TAC------CAG---", aligned.getSequenceAsString());
408 assertSame(aligned.getDatasetSequence(), dna.getDatasetSequence());
412 * Test the method that realigns protein to match mapped codon alignment.
415 public void testAlignProteinAsDna()
417 // seq1 codons are [1,2,3] [4,5,6] [7,8,9] [10,11,12]
418 SequenceI dna1 = new Sequence("Seq1", "TGCCATTACCAG-");
419 // seq2 codons are [1,3,4] [5,6,7] [8,9,10] [11,12,13]
420 SequenceI dna2 = new Sequence("Seq2", "T-GCCATTACCAG");
421 // seq3 codons are [1,2,3] [4,5,7] [8,9,10] [11,12,13]
422 SequenceI dna3 = new Sequence("Seq3", "TGCCA-TTACCAG");
423 AlignmentI dna = new Alignment(new SequenceI[]
424 { dna1, dna2, dna3 });
425 dna.setDataset(null);
427 // protein alignment will be realigned like dna
428 SequenceI prot1 = new Sequence("Seq1", "CHYQ");
429 SequenceI prot2 = new Sequence("Seq2", "CHYQ");
430 SequenceI prot3 = new Sequence("Seq3", "CHYQ");
431 AlignmentI protein = new Alignment(new SequenceI[]
432 { prot1, prot2, prot3 });
433 protein.setDataset(null);
435 MapList map = new MapList(new int[]
438 AlignedCodonFrame acf = new AlignedCodonFrame();
439 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
440 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
441 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
442 protein.setCodonFrames(Collections.singleton(acf));
445 * Translated codon order is [1,2,3] [1,3,4] [4,5,6] [4,5,7] [5,6,7] [7,8,9]
446 * [8,9,10] [10,11,12] [11,12,13]
448 AlignmentUtils.alignProteinAsDna(protein, dna);
449 assertEquals("C-H--Y-Q-", prot1.getSequenceAsString());
450 assertEquals("-C--H-Y-Q", prot2.getSequenceAsString());
451 assertEquals("C--H--Y-Q", prot3.getSequenceAsString());
455 * Test the method that tests whether a CDNA sequence translates to a protein
459 public void testTranslatesAs()
461 assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0,
462 "FPKG".toCharArray()));
464 assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(),
465 3, "FPKG".toCharArray()));
467 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
468 0, "FPKG".toCharArray()));
470 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtag".toCharArray(),
471 0, "FPKG".toCharArray()));
473 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtga".toCharArray(),
474 0, "FPKG".toCharArray()));
475 // with start and stop codon1
476 assertTrue(AlignmentUtils.translatesAs(
477 "atgtttcccaaaggtaa".toCharArray(), 3, "FPKG".toCharArray()));
478 // with start and stop codon2
479 assertTrue(AlignmentUtils.translatesAs(
480 "atgtttcccaaaggtag".toCharArray(), 3, "FPKG".toCharArray()));
481 // with start and stop codon3
482 assertTrue(AlignmentUtils.translatesAs(
483 "atgtttcccaaaggtga".toCharArray(), 3, "FPKG".toCharArray()));
486 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
488 "FPMG".toCharArray()));
492 * Test mapping of protein to cDNA, for cases where the cDNA has start and/or
493 * stop codons in addition to the protein coding sequence.
495 * @throws IOException
498 public void testMapProteinToCdna_withStartAndStopCodons()
501 List<SequenceI> protseqs = new ArrayList<SequenceI>();
502 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
503 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
504 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
505 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
506 protein.setDataset(null);
508 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
510 dnaseqs.add(new Sequence("EMBL|A11111", "ATGTCAGCACGC"));
512 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAATAA"));
513 // = start +EIQ + stop
514 dnaseqs.add(new Sequence("EMBL|A33333", "ATGGAAATCCAGTAG"));
515 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG"));
516 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
517 cdna.setDataset(null);
519 assertTrue(AlignmentUtils.mapProteinToCdna(protein, cdna));
521 // 3 mappings made, each from 1 to 1 sequence
522 assertEquals(3, protein.getCodonFrames().size());
523 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
524 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
525 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
527 // V12345 mapped from A22222
528 AlignedCodonFrame acf = protein.getCodonFrame(
529 protein.getSequenceAt(0)).get(0);
530 assertEquals(1, acf.getdnaSeqs().length);
531 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
532 acf.getdnaSeqs()[0]);
533 Mapping[] protMappings = acf.getProtMappings();
534 assertEquals(1, protMappings.length);
535 MapList mapList = protMappings[0].getMap();
536 assertEquals(3, mapList.getFromRatio());
537 assertEquals(1, mapList.getToRatio());
538 assertTrue(Arrays.equals(new int[]
539 { 1, 9 }, mapList.getFromRanges().get(0)));
540 assertEquals(1, mapList.getFromRanges().size());
541 assertTrue(Arrays.equals(new int[]
542 { 1, 3 }, mapList.getToRanges().get(0)));
543 assertEquals(1, mapList.getToRanges().size());
545 // V12346 mapped from A33333 starting position 4
546 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
547 assertEquals(1, acf.getdnaSeqs().length);
548 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
549 acf.getdnaSeqs()[0]);
550 protMappings = acf.getProtMappings();
551 assertEquals(1, protMappings.length);
552 mapList = protMappings[0].getMap();
553 assertEquals(3, mapList.getFromRatio());
554 assertEquals(1, mapList.getToRatio());
555 assertTrue(Arrays.equals(new int[]
556 { 4, 12 }, mapList.getFromRanges().get(0)));
557 assertEquals(1, mapList.getFromRanges().size());
558 assertTrue(Arrays.equals(new int[]
559 { 1, 3 }, mapList.getToRanges().get(0)));
560 assertEquals(1, mapList.getToRanges().size());
562 // V12347 mapped to A11111 starting position 4
563 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
564 assertEquals(1, acf.getdnaSeqs().length);
565 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
566 acf.getdnaSeqs()[0]);
567 protMappings = acf.getProtMappings();
568 assertEquals(1, protMappings.length);
569 mapList = protMappings[0].getMap();
570 assertEquals(3, mapList.getFromRatio());
571 assertEquals(1, mapList.getToRatio());
572 assertTrue(Arrays.equals(new int[]
573 { 4, 12 }, mapList.getFromRanges().get(0)));
574 assertEquals(1, mapList.getFromRanges().size());
575 assertTrue(Arrays.equals(new int[]
576 { 1, 3 }, mapList.getToRanges().get(0)));
577 assertEquals(1, mapList.getToRanges().size());
579 // no mapping involving the 'extra' A44444
580 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
584 * Test mapping of protein to cDNA, for the case where we have some sequence
585 * cross-references. Verify that 1-to-many mappings are made where
586 * cross-references exist and sequences are mappable.
588 * @throws IOException
591 public void testMapProteinToCdna_withXrefs() throws IOException
593 List<SequenceI> protseqs = new ArrayList<SequenceI>();
594 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
595 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
596 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
597 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
598 protein.setDataset(null);
600 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
601 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
602 dnaseqs.add(new Sequence("EMBL|A22222", "ATGGAGATACAA")); // = start + EIQ
603 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
604 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
605 dnaseqs.add(new Sequence("EMBL|A55555", "GAGATTCAG")); // = EIQ
606 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[5]));
607 cdna.setDataset(null);
609 // Xref A22222 to V12345 (should get mapped)
610 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
611 // Xref V12345 to A44444 (should get mapped)
612 protseqs.get(0).addDBRef(new DBRefEntry("EMBL", "1", "A44444"));
613 // Xref A33333 to V12347 (sequence mismatch - should not get mapped)
614 dnaseqs.get(2).addDBRef(new DBRefEntry("UNIPROT", "1", "V12347"));
615 // as V12345 is mapped to A22222 and A44444, this leaves V12346 unmapped.
616 // it should get paired up with the unmapped A33333
617 // A11111 should be mapped to V12347
618 // A55555 is spare and has no xref so is not mapped
620 assertTrue(AlignmentUtils.mapProteinToCdna(protein, cdna));
622 // 4 protein mappings made for 3 proteins, 2 to V12345, 1 each to V12346/7
623 assertEquals(3, protein.getCodonFrames().size());
624 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
625 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
626 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
628 // one mapping for each of the first 4 cDNA sequences
629 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
630 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
631 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(2)).size());
632 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(3)).size());
634 // V12345 mapped to A22222 and A44444
635 AlignedCodonFrame acf = protein.getCodonFrame(
636 protein.getSequenceAt(0)).get(0);
637 assertEquals(2, acf.getdnaSeqs().length);
638 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
639 acf.getdnaSeqs()[0]);
640 assertEquals(cdna.getSequenceAt(3).getDatasetSequence(),
641 acf.getdnaSeqs()[1]);
643 // V12346 mapped to A33333
644 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
645 assertEquals(1, acf.getdnaSeqs().length);
646 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
647 acf.getdnaSeqs()[0]);
649 // V12347 mapped to A11111
650 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
651 assertEquals(1, acf.getdnaSeqs().length);
652 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
653 acf.getdnaSeqs()[0]);
655 // no mapping involving the 'extra' A55555
656 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(4)).isEmpty());
660 * Test mapping of protein to cDNA, for the case where we have some sequence
661 * cross-references. Verify that once we have made an xref mapping we don't
662 * also map un-xrefd sequeces.
664 * @throws IOException
667 public void testMapProteinToCdna_prioritiseXrefs() throws IOException
669 List<SequenceI> protseqs = new ArrayList<SequenceI>();
670 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
671 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
672 AlignmentI protein = new Alignment(
673 protseqs.toArray(new SequenceI[protseqs.size()]));
674 protein.setDataset(null);
676 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
677 dnaseqs.add(new Sequence("EMBL|A11111", "GAAATCCAG")); // = EIQ
678 dnaseqs.add(new Sequence("EMBL|A22222", "GAAATTCAG")); // = EIQ
679 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[dnaseqs
681 cdna.setDataset(null);
683 // Xref A22222 to V12345 (should get mapped)
684 // A11111 should then be mapped to the unmapped V12346
685 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
687 assertTrue(AlignmentUtils.mapProteinToCdna(protein, cdna));
689 // 2 protein mappings made
690 assertEquals(2, protein.getCodonFrames().size());
691 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
692 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
694 // one mapping for each of the cDNA sequences
695 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
696 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
698 // V12345 mapped to A22222
699 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
701 assertEquals(1, acf.getdnaSeqs().length);
702 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
703 acf.getdnaSeqs()[0]);
705 // V12346 mapped to A11111
706 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
707 assertEquals(1, acf.getdnaSeqs().length);
708 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
709 acf.getdnaSeqs()[0]);
713 * Test the method that shows or hides sequence annotations by type(s) and
717 public void testShowOrHideSequenceAnnotations()
719 SequenceI seq1 = new Sequence("Seq1", "AAA");
720 SequenceI seq2 = new Sequence("Seq2", "BBB");
721 SequenceI seq3 = new Sequence("Seq3", "CCC");
722 Annotation[] anns = new Annotation[]
723 { new Annotation(2f) };
724 AlignmentAnnotation ann1 = new AlignmentAnnotation("Structure", "ann1",
726 ann1.setSequenceRef(seq1);
727 AlignmentAnnotation ann2 = new AlignmentAnnotation("Structure", "ann2",
729 ann2.setSequenceRef(seq2);
730 AlignmentAnnotation ann3 = new AlignmentAnnotation("Structure", "ann3",
732 AlignmentAnnotation ann4 = new AlignmentAnnotation("Temp", "ann4", anns);
733 ann4.setSequenceRef(seq1);
734 AlignmentAnnotation ann5 = new AlignmentAnnotation("Temp", "ann5", anns);
735 ann5.setSequenceRef(seq2);
736 AlignmentAnnotation ann6 = new AlignmentAnnotation("Temp", "ann6", anns);
737 AlignmentI al = new Alignment(new SequenceI[] {seq1, seq2, seq3});
738 al.addAnnotation(ann1); // Structure for Seq1
739 al.addAnnotation(ann2); // Structure for Seq2
740 al.addAnnotation(ann3); // Structure for no sequence
741 al.addAnnotation(ann4); // Temp for seq1
742 al.addAnnotation(ann5); // Temp for seq2
743 al.addAnnotation(ann6); // Temp for no sequence
744 List<String> types = new ArrayList<String>();
745 List<SequenceI> scope = new ArrayList<SequenceI>();
748 * Set all sequence related Structure to hidden (ann1, ann2)
750 types.add("Structure");
751 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
753 assertFalse(ann1.visible);
754 assertFalse(ann2.visible);
755 assertTrue(ann3.visible); // not sequence-related, not affected
756 assertTrue(ann4.visible); // not Structure, not affected
757 assertTrue(ann5.visible); // "
758 assertTrue(ann6.visible); // not sequence-related, not affected
761 * Set Temp in {seq1, seq3} to hidden
767 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, false,
769 assertFalse(ann1.visible); // unchanged
770 assertFalse(ann2.visible); // unchanged
771 assertTrue(ann3.visible); // not sequence-related, not affected
772 assertFalse(ann4.visible); // Temp for seq1 hidden
773 assertTrue(ann5.visible); // not in scope, not affected
774 assertTrue(ann6.visible); // not sequence-related, not affected
777 * Set Temp in all sequences to hidden
783 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
785 assertFalse(ann1.visible); // unchanged
786 assertFalse(ann2.visible); // unchanged
787 assertTrue(ann3.visible); // not sequence-related, not affected
788 assertFalse(ann4.visible); // Temp for seq1 hidden
789 assertFalse(ann5.visible); // Temp for seq2 hidden
790 assertTrue(ann6.visible); // not sequence-related, not affected
793 * Set all types in {seq1, seq3} to visible
799 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, true,
801 assertTrue(ann1.visible); // Structure for seq1 set visible
802 assertFalse(ann2.visible); // not in scope, unchanged
803 assertTrue(ann3.visible); // not sequence-related, not affected
804 assertTrue(ann4.visible); // Temp for seq1 set visible
805 assertFalse(ann5.visible); // not in scope, unchanged
806 assertTrue(ann6.visible); // not sequence-related, not affected
809 * Set all types in all scope to hidden
811 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, true,
813 assertFalse(ann1.visible);
814 assertFalse(ann2.visible);
815 assertTrue(ann3.visible); // not sequence-related, not affected
816 assertFalse(ann4.visible);
817 assertFalse(ann5.visible);
818 assertTrue(ann6.visible); // not sequence-related, not affected
822 * Tests for the method that checks if one sequence cross-references another
825 public void testHasCrossRef()
827 assertFalse(AlignmentUtils.hasCrossRef(null, null));
828 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
829 assertFalse(AlignmentUtils.hasCrossRef(seq1, null));
830 assertFalse(AlignmentUtils.hasCrossRef(null, seq1));
831 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
832 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
835 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20193"));
836 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
838 // case-insensitive; version number is ignored
839 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20192"));
840 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
843 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
844 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
845 // test is one-way only
846 assertFalse(AlignmentUtils.hasCrossRef(seq2, seq1));
850 * Tests for the method that checks if either sequence cross-references the
854 public void testHaveCrossRef()
856 assertFalse(AlignmentUtils.hasCrossRef(null, null));
857 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
858 assertFalse(AlignmentUtils.haveCrossRef(seq1, null));
859 assertFalse(AlignmentUtils.haveCrossRef(null, seq1));
860 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
861 assertFalse(AlignmentUtils.haveCrossRef(seq1, seq2));
863 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
864 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
865 // next is true for haveCrossRef, false for hasCrossRef
866 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
868 // now the other way round
870 seq2.addDBRef(new DBRefEntry("EMBL", "1", "A12345"));
871 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
872 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
875 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
876 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
877 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
881 * Test the method that extracts the exon-only part of a dna alignment.
884 public void testMakeExonAlignment()
886 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
887 SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC");
888 SequenceI pep1 = new Sequence("pep1", "GF");
889 SequenceI pep2 = new Sequence("pep2", "GFP");
890 dna1.createDatasetSequence();
891 dna2.createDatasetSequence();
892 pep1.createDatasetSequence();
893 pep2.createDatasetSequence();
895 Set<AlignedCodonFrame> mappings = new HashSet<AlignedCodonFrame>();
896 MapList map = new MapList(new int[]
897 { 4, 6, 10, 12 }, new int[]
899 AlignedCodonFrame acf = new AlignedCodonFrame();
900 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
902 map = new MapList(new int[]
903 { 1, 3, 7, 9, 13, 15 }, new int[]
905 acf = new AlignedCodonFrame();
906 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
909 AlignmentI exons = AlignmentUtils.makeExonAlignment(new SequenceI[]
910 { dna1, dna2 }, mappings);
911 assertEquals(2, exons.getSequences().size());
912 assertEquals("GGGTTT", exons.getSequenceAt(0).getSequenceAsString());
913 assertEquals("GGGTTTCCC", exons.getSequenceAt(1).getSequenceAsString());
916 * Verify updated mappings
918 assertEquals(2, mappings.size());
921 * Mapping from pep1 to GGGTTT in first new exon sequence
923 List<AlignedCodonFrame> pep1Mapping = MappingUtils
924 .findMappingsForSequence(pep1, mappings);
925 assertEquals(1, pep1Mapping.size());
927 SearchResults sr = MappingUtils.buildSearchResults(pep1, 1, mappings);
928 assertEquals(1, sr.getResults().size());
929 Match m = sr.getResults().get(0);
930 assertEquals(exons.getSequenceAt(0).getDatasetSequence(),
932 assertEquals(1, m.getStart());
933 assertEquals(3, m.getEnd());
935 sr = MappingUtils.buildSearchResults(pep1, 2, mappings);
936 m = sr.getResults().get(0);
937 assertEquals(exons.getSequenceAt(0).getDatasetSequence(),
939 assertEquals(4, m.getStart());
940 assertEquals(6, m.getEnd());
943 * Mapping from pep2 to GGGTTTCCC in second new exon sequence
945 List<AlignedCodonFrame> pep2Mapping = MappingUtils
946 .findMappingsForSequence(pep2, mappings);
947 assertEquals(1, pep2Mapping.size());
949 sr = MappingUtils.buildSearchResults(pep2, 1, mappings);
950 assertEquals(1, sr.getResults().size());
951 m = sr.getResults().get(0);
952 assertEquals(exons.getSequenceAt(1).getDatasetSequence(),
954 assertEquals(1, m.getStart());
955 assertEquals(3, m.getEnd());
957 sr = MappingUtils.buildSearchResults(pep2, 2, mappings);
958 m = sr.getResults().get(0);
959 assertEquals(exons.getSequenceAt(1).getDatasetSequence(),
961 assertEquals(4, m.getStart());
962 assertEquals(6, m.getEnd());
964 sr = MappingUtils.buildSearchResults(pep2, 3, mappings);
965 m = sr.getResults().get(0);
966 assertEquals(exons.getSequenceAt(1).getDatasetSequence(),
968 assertEquals(7, m.getStart());
969 assertEquals(9, m.getEnd());
973 * Test the method that makes an exon-only sequence from a DNA sequence and
974 * its product mapping. Test includes the expected case that the DNA sequence
975 * already has a protein product (Uniprot translation) which in turn has an
976 * x-ref to the EMBLCDS record.
979 public void testMakeExonSequences()
981 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
982 SequenceI pep1 = new Sequence("pep1", "GF");
983 dna1.createDatasetSequence();
984 pep1.createDatasetSequence();
985 pep1.getDatasetSequence().addDBRef(
986 new DBRefEntry("EMBLCDS", "2", "A12345"));
989 * Make the mapping from dna to protein. The protein sequence has a DBRef to
992 Set<AlignedCodonFrame> mappings = new HashSet<AlignedCodonFrame>();
993 MapList map = new MapList(new int[]
994 { 4, 6, 10, 12 }, new int[]
996 AlignedCodonFrame acf = new AlignedCodonFrame();
997 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1000 AlignedCodonFrame newMapping = new AlignedCodonFrame();
1001 List<SequenceI> exons = AlignmentUtils.makeExonSequences(dna1, acf,
1003 assertEquals(1, exons.size());
1004 SequenceI exon = exons.get(0);
1006 assertEquals("GGGTTT", exon.getSequenceAsString());
1007 assertEquals("dna1|A12345", exon.getName());
1008 assertEquals(1, exon.getDBRef().length);
1009 DBRefEntry cdsRef = exon.getDBRef()[0];
1010 assertEquals("EMBLCDS", cdsRef.getSource());
1011 assertEquals("2", cdsRef.getVersion());
1012 assertEquals("A12345", cdsRef.getAccessionId());
1016 * Test the method that makes an exon-only alignment from a DNA sequence and
1017 * its product mappings, for the case where there are multiple exon mappings
1018 * to different protein products.
1021 public void testMakeExonAlignment_multipleProteins()
1023 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
1024 SequenceI pep1 = new Sequence("pep1", "GF"); // GGGTTT
1025 SequenceI pep2 = new Sequence("pep2", "KP"); // aaaccc
1026 SequenceI pep3 = new Sequence("pep3", "KF"); // aaaTTT
1027 dna1.createDatasetSequence();
1028 pep1.createDatasetSequence();
1029 pep2.createDatasetSequence();
1030 pep3.createDatasetSequence();
1031 pep1.getDatasetSequence().addDBRef(
1032 new DBRefEntry("EMBLCDS", "2", "A12345"));
1033 pep2.getDatasetSequence().addDBRef(
1034 new DBRefEntry("EMBLCDS", "3", "A12346"));
1035 pep3.getDatasetSequence().addDBRef(
1036 new DBRefEntry("EMBLCDS", "4", "A12347"));
1039 * Make the mappings from dna to protein. Using LinkedHashset is a
1040 * convenience so results are in the input order. There is no assertion that
1041 * the generated exon sequences are in any particular order.
1043 Set<AlignedCodonFrame> mappings = new LinkedHashSet<AlignedCodonFrame>();
1044 // map ...GGG...TTT to GF
1045 MapList map = new MapList(new int[]
1046 { 4, 6, 10, 12 }, new int[]
1048 AlignedCodonFrame acf = new AlignedCodonFrame();
1049 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1052 // map aaa...ccc to KP
1053 map = new MapList(new int[]
1054 { 1, 3, 7, 9 }, new int[]
1056 acf = new AlignedCodonFrame();
1057 acf.addMap(dna1.getDatasetSequence(), pep2.getDatasetSequence(), map);
1060 // map aaa......TTT to KF
1061 map = new MapList(new int[]
1062 { 1, 3, 10, 12 }, new int[]
1064 acf = new AlignedCodonFrame();
1065 acf.addMap(dna1.getDatasetSequence(), pep3.getDatasetSequence(), map);
1069 * Create the Exon alignment; also replaces the dna-to-protein mappings with
1070 * exon-to-protein and exon-to-dna mappings
1072 AlignmentI exal = AlignmentUtils.makeExonAlignment(new SequenceI[]
1073 { dna1 }, mappings);
1076 * Verify we have 3 exon sequences, mapped to pep1/2/3 respectively
1078 List<SequenceI> exons = exal.getSequences();
1079 assertEquals(3, exons.size());
1081 SequenceI exon = exons.get(0);
1082 assertEquals("GGGTTT", exon.getSequenceAsString());
1083 assertEquals("dna1|A12345", exon.getName());
1084 assertEquals(1, exon.getDBRef().length);
1085 DBRefEntry cdsRef = exon.getDBRef()[0];
1086 assertEquals("EMBLCDS", cdsRef.getSource());
1087 assertEquals("2", cdsRef.getVersion());
1088 assertEquals("A12345", cdsRef.getAccessionId());
1090 exon = exons.get(1);
1091 assertEquals("aaaccc", exon.getSequenceAsString());
1092 assertEquals("dna1|A12346", exon.getName());
1093 assertEquals(1, exon.getDBRef().length);
1094 cdsRef = exon.getDBRef()[0];
1095 assertEquals("EMBLCDS", cdsRef.getSource());
1096 assertEquals("3", cdsRef.getVersion());
1097 assertEquals("A12346", cdsRef.getAccessionId());
1099 exon = exons.get(2);
1100 assertEquals("aaaTTT", exon.getSequenceAsString());
1101 assertEquals("dna1|A12347", exon.getName());
1102 assertEquals(1, exon.getDBRef().length);
1103 cdsRef = exon.getDBRef()[0];
1104 assertEquals("EMBLCDS", cdsRef.getSource());
1105 assertEquals("4", cdsRef.getVersion());
1106 assertEquals("A12347", cdsRef.getAccessionId());
1109 * Verify there are mappings from each exon sequence to its protein product
1110 * and also to its dna source
1112 Iterator<AlignedCodonFrame> newMappingsIterator = mappings.iterator();
1114 // mappings for dna1 - exon1 - pep1
1115 AlignedCodonFrame exonMapping = newMappingsIterator.next();
1116 List<Mapping> dnaMappings = exonMapping.getMappingsForSequence(dna1);
1117 assertEquals(1, dnaMappings.size());
1118 assertSame(exons.get(0).getDatasetSequence(), dnaMappings.get(0)
1120 assertEquals("G(1) in CDS should map to G(4) in DNA", 4, dnaMappings
1121 .get(0).getMap().getToPosition(1));
1122 List<Mapping> peptideMappings = exonMapping
1123 .getMappingsForSequence(pep1);
1124 assertEquals(1, peptideMappings.size());
1125 assertSame(pep1.getDatasetSequence(), peptideMappings.get(0).getTo());
1127 // mappings for dna1 - exon2 - pep2
1128 exonMapping = newMappingsIterator.next();
1129 dnaMappings = exonMapping.getMappingsForSequence(dna1);
1130 assertEquals(1, dnaMappings.size());
1131 assertSame(exons.get(1).getDatasetSequence(), dnaMappings.get(0)
1133 assertEquals("c(4) in CDS should map to c(7) in DNA", 7, dnaMappings
1134 .get(0).getMap().getToPosition(4));
1135 peptideMappings = exonMapping.getMappingsForSequence(pep2);
1136 assertEquals(1, peptideMappings.size());
1137 assertSame(pep2.getDatasetSequence(), peptideMappings.get(0).getTo());
1139 // mappings for dna1 - exon3 - pep3
1140 exonMapping = newMappingsIterator.next();
1141 dnaMappings = exonMapping.getMappingsForSequence(dna1);
1142 assertEquals(1, dnaMappings.size());
1143 assertSame(exons.get(2).getDatasetSequence(), dnaMappings.get(0)
1145 assertEquals("T(4) in CDS should map to T(10) in DNA", 10, dnaMappings
1146 .get(0).getMap().getToPosition(4));
1147 peptideMappings = exonMapping.getMappingsForSequence(pep3);
1148 assertEquals(1, peptideMappings.size());
1149 assertSame(pep3.getDatasetSequence(), peptideMappings.get(0).getTo());