2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import static org.junit.Assert.assertEquals;
24 import static org.junit.Assert.assertFalse;
25 import static org.junit.Assert.assertSame;
26 import static org.junit.Assert.assertTrue;
28 import java.io.IOException;
29 import java.util.ArrayList;
30 import java.util.Arrays;
31 import java.util.Collections;
32 import java.util.HashSet;
33 import java.util.LinkedHashSet;
34 import java.util.List;
38 import org.junit.Test;
40 import jalview.datamodel.AlignedCodonFrame;
41 import jalview.datamodel.Alignment;
42 import jalview.datamodel.AlignmentAnnotation;
43 import jalview.datamodel.AlignmentI;
44 import jalview.datamodel.Annotation;
45 import jalview.datamodel.DBRefEntry;
46 import jalview.datamodel.Mapping;
47 import jalview.datamodel.SearchResults;
48 import jalview.datamodel.SearchResults.Match;
49 import jalview.datamodel.Sequence;
50 import jalview.datamodel.SequenceI;
51 import jalview.io.AppletFormatAdapter;
52 import jalview.io.FormatAdapter;
53 import jalview.util.MapList;
54 import jalview.util.MappingUtils;
56 public class AlignmentUtilsTests
// Alignment fixture text: #=GS accession lines, then for each of three
// D.melanogaster sequences an aligned row followed by its #=GR ... SS row.
// NOTE(review): the #=GS / #=GR markup suggests Stockholm format - confirm.
59 private static final String TEST_DATA =
61 "#=GS D.melanogaster.1 AC AY119185.1/838-902\n" +
62 "#=GS D.melanogaster.2 AC AC092237.1/57223-57161\n" +
63 "#=GS D.melanogaster.3 AC AY060611.1/560-627\n" +
64 "D.melanogaster.1 G.AGCC.CU...AUGAUCGA\n" +
65 "#=GR D.melanogaster.1 SS ................((((\n" +
66 "D.melanogaster.2 C.AUUCAACU.UAUGAGGAU\n" +
67 "#=GR D.melanogaster.2 SS ................((((\n" +
68 "D.melanogaster.3 G.UGGCGCU..UAUGACGCA\n" +
69 "#=GR D.melanogaster.3 SS (.(((...(....(((((((\n" +
72 private static final String AA_SEQS_1 =
// Gapped nucleotide rows used as cDNA fixture data.
78 private static final String CDNA_SEQS_1 =
80 "AC-GG--CUC-CAA-CT\n" +
82 "-CG-TTA--ACG---AAGT\n";
84 private static final String CDNA_SEQS_2 =
// Shared source sequence; testExpandFlanks derives its subsequences from it.
91 public static Sequence ts=new Sequence("short","ASDASDASDASDASDASDASDASDASDASDASDASDASD");
// Checks AlignmentUtils.expandContext for flank sizes -1 to 24: each sequence
// of the expanded alignment, once its gaps are stripped, must equal its
// dataset sequence (compared ignoring case).
94 public void testExpandFlanks()
96 AlignmentI al = new Alignment(new Sequence[] {});
// subsequences of ts are taken for i = 4, 7, 10, 13
97 for (int i=4;i<14;i+=3)
99 SequenceI s1=ts.deriveSequence().getSubSequence(i, i+7);
102 System.out.println(new AppletFormatAdapter().formatSequences("Clustal", al, true));
103 for (int flnk=-1;flnk<25; flnk++)
106 System.out.println("\nFlank size: "+flnk);
// the expanded alignment is both printed (Clustal format) and kept in exp
107 System.out.println(new AppletFormatAdapter().formatSequences("Clustal", exp=AlignmentUtils.expandContext(al, flnk), true));
109 for (SequenceI sq:exp.getSequences())
// remove every run of '-' before comparing with the dataset sequence
111 String ung = sq.getSequenceAsString().replaceAll("-+", "");
112 assertTrue("Flanking sequence not the same as original dataset sequence.\n"+ung+"\n"+sq.getDatasetSequence().getSequenceAsString(),ung.equalsIgnoreCase(sq.getDatasetSequence().getSequenceAsString()));
119 * Test method that returns a map of lists of sequences by sequence name.
121 * @throws IOException
124 public void testGetSequencesByName() throws IOException
126 final String data = ">Seq1Name\nKQYL\n" + ">Seq2Name\nRFPW\n"
127 + ">Seq1Name\nABCD\n";
128 AlignmentI al = loadAlignment(data, "FASTA");
129 Map<String, List<SequenceI>> map = AlignmentUtils
130 .getSequencesByName(al);
131 assertEquals(2, map.keySet().size());
132 assertEquals(2, map.get("Seq1Name").size());
133 assertEquals("KQYL", map.get("Seq1Name").get(0).getSequenceAsString());
134 assertEquals("ABCD", map.get("Seq1Name").get(1).getSequenceAsString());
135 assertEquals(1, map.get("Seq2Name").size());
136 assertEquals("RFPW", map.get("Seq2Name").get(0).getSequenceAsString());
139 * Helper method to load an alignment and ensure dataset sequences are set up.
144 * @throws IOException
// Reads the supplied text as pasted input (AppletFormatAdapter.PASTE) in the
// given format, using FormatAdapter.readFile.
146 protected AlignmentI loadAlignment(final String data, String format) throws IOException
148 Alignment a = new FormatAdapter().readFile(data,
149 AppletFormatAdapter.PASTE, format);
155 * Test mapping of protein to cDNA, for the case where we have no sequence
156 * cross-references, so mappings are made first-served 1-1 where sequences
159 * @throws IOException
162 public void testMapProteinToCdna_noXrefs() throws IOException
164 List<SequenceI> protseqs = new ArrayList<SequenceI>();
165 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
166 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
167 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
168 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
169 protein.setDataset(null);
171 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
172 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
173 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAA")); // = EIQ
174 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
175 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
176 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
177 cdna.setDataset(null);
179 assertTrue(AlignmentUtils.mapProteinToCdna(protein, cdna));
181 // 3 mappings made, each from 1 to 1 sequence
182 assertEquals(3, protein.getCodonFrames().size());
183 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
184 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
185 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
187 // V12345 mapped to A22222
188 AlignedCodonFrame acf = protein.getCodonFrame(
189 protein.getSequenceAt(0)).get(0);
190 assertEquals(1, acf.getdnaSeqs().length);
191 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
192 acf.getdnaSeqs()[0]);
193 Mapping[] protMappings = acf.getProtMappings();
194 assertEquals(1, protMappings.length);
195 MapList mapList = protMappings[0].getMap();
196 assertEquals(3, mapList.getFromRatio());
197 assertEquals(1, mapList.getToRatio());
198 assertTrue(Arrays.equals(new int[]
199 { 1, 9 }, mapList.getFromRanges().get(0)));
200 assertEquals(1, mapList.getFromRanges().size());
201 assertTrue(Arrays.equals(new int[]
202 { 1, 3 }, mapList.getToRanges().get(0)));
203 assertEquals(1, mapList.getToRanges().size());
205 // V12346 mapped to A33333
206 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
207 assertEquals(1, acf.getdnaSeqs().length);
208 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
209 acf.getdnaSeqs()[0]);
211 // V12347 mapped to A11111
212 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
213 assertEquals(1, acf.getdnaSeqs().length);
214 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
215 acf.getdnaSeqs()[0]);
217 // no mapping involving the 'extra' A44444
218 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
222 * Test for the alignSequenceAs method that takes two sequences and a mapping.
// Each check passes (dna, protein, preserveMappedGaps, preserveUnmappedGaps,
// map, expected dna) to checkAlignSequenceAs; the protein template is "-A-L-"
// in every case.
225 public void testAlignSequenceAs_withMapping_noIntrons()
227 MapList map = new MapList(new int[]
232 * No existing gaps in dna:
234 checkAlignSequenceAs("GGGAAA", "-A-L-", false, false, map,
238 * Now introduce gaps in dna but ignore them when realigning.
240 checkAlignSequenceAs("-G-G-G-A-A-A-", "-A-L-", false, false, map,
244 * Now include gaps in dna when realigning. First retaining 'mapped' gaps
245 * only, i.e. those within the exon region.
// preserveMappedGaps = true, preserveUnmappedGaps = false
247 checkAlignSequenceAs("-G-G--G-A--A-A-", "-A-L-", true, false, map,
248 "---G-G--G---A--A-A");
251 * Include all gaps in dna when realigning (within and without the exon
252 * region). The leading gap, and the gaps between codons, are subsumed by
253 * the protein alignment gap.
// both preserve flags true
255 checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", true, true, map,
259 * Include only unmapped gaps in dna when realigning (outside the exon
260 * region). The leading gap, and the gaps between codons, are subsumed by
261 * the protein alignment gap.
// preserveMappedGaps = false, preserveUnmappedGaps = true
263 checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", false, true, map,
268 * Test for the alignSequenceAs method that takes two sequences and a mapping.
// Each check passes (dna, protein, preserveMappedGaps, preserveUnmappedGaps,
// map, expected dna) to checkAlignSequenceAs; the protein template is
// "--A-L-" in every case and the mapped dna ranges begin 4-6 and 10-12.
271 public void testAlignSequenceAs_withMapping_withIntrons()
274 * Exons at codon 2 (AAA) and 4 (TTT)
276 MapList map = new MapList(new int[]
277 { 4, 6, 10, 12 }, new int[]
281 * Simple case: no gaps in dna
283 checkAlignSequenceAs("GGGAAACCCTTTGGG", "--A-L-", false, false, map,
284 "GGG---AAACCCTTTGGG");
287 * Add gaps to dna - but ignore when realigning.
289 checkAlignSequenceAs("-G-G-G--A--A---AC-CC-T-TT-GG-G-", "--A-L-",
290 false, false, map, "GGG---AAACCCTTTGGG");
293 * Add gaps to dna - include within exons only when realigning.
295 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
296 true, false, map, "GGG---A--A---ACCCT-TTGGG");
299 * Include gaps outside exons only when realigning.
301 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
302 false, true, map, "-G-G-GAAAC-CCTTT-GG-G-");
305 * Include gaps following first intron if we are 'preserving mapped gaps'
// with both flags true the expected output equals the input dna string
307 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
308 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
311 * Include all gaps in dna when realigning.
// NOTE(review): this call has exactly the same arguments as the preceding
// one, so it adds no coverage - confirm whether a different case was intended.
313 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
314 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
318 * Test for the case where not all of the protein sequence is mapped to cDNA.
321 public void testAlignSequenceAs_withMapping_withUnmappedProtein()
325 * Exons at codon 2 (AAA) and 4 (TTT) mapped to A and P
327 final MapList map = new MapList(new int[]
328 { 4, 6, 10, 12 }, new int[]
329 { 1, 1, 3, 3 }, 3, 1);
333 * Expect alignment does nothing (aborts realignment). Change this test
334 * first if different behaviour wanted.
336 checkAlignSequenceAs("GGGAAACCCTTTGGG", "-A-L-P-", false,
337 false, map, "GGGAAACCCTTTGGG");
341 * Helper method that performs and verifies the method under test.
345 * @param preserveMappedGaps
346 * @param preserveUnmappedGaps
350 protected void checkAlignSequenceAs(final String dnaSeq,
351 final String proteinSeq, final boolean preserveMappedGaps,
352 final boolean preserveUnmappedGaps, MapList map,
353 final String expected)
355 SequenceI dna = new Sequence("Seq1", dnaSeq);
356 dna.createDatasetSequence();
357 SequenceI protein = new Sequence("Seq1", proteinSeq);
358 protein.createDatasetSequence();
359 AlignedCodonFrame acf = new AlignedCodonFrame();
360 acf.addMap(dna.getDatasetSequence(), protein.getDatasetSequence(), map);
362 AlignmentUtils.alignSequenceAs(dna, protein, acf, "---", '-',
363 preserveMappedGaps, preserveUnmappedGaps);
364 assertEquals(expected, dna.getSequenceAsString());
368 * Test for the alignSequenceAs method where we preserve gaps in introns only.
// Calls checkAlignSequenceAs with preserveMappedGaps = false and
// preserveUnmappedGaps = true: the expected result keeps the gaps in the
// leading GG-G-AA-A part and drops those inside CCCTTT.
371 public void testAlignSequenceAs_keepIntronGapsOnly()
375 * Intron GGGAAA followed by exon CCCTTT
377 MapList map = new MapList(new int[]
381 checkAlignSequenceAs("GG-G-AA-A-C-CC-T-TT", "AL",
382 false, true, map, "GG-G-AA-ACCCTTT");
386 * Test for the method that generates an aligned translated sequence from one
// Maps a gapped dna sequence to a gapped protein sequence through their
// dataset sequences, then checks that getAlignedTranslation returns the dna
// laid out like the protein and sharing the dna's dataset sequence.
390 public void testGetAlignedTranslation_dnaLikeProtein()
392 // dna alignment will be replaced
393 SequenceI dna = new Sequence("Seq1", "T-G-CC-A--T-TAC-CAG-");
394 dna.createDatasetSequence();
395 // protein alignment will be 'applied' to dna
396 SequenceI protein = new Sequence("Seq1", "-CH-Y--Q-");
397 protein.createDatasetSequence();
398 MapList map = new MapList(new int[]
401 AlignedCodonFrame acf = new AlignedCodonFrame();
402 acf.addMap(dna.getDatasetSequence(), protein.getDatasetSequence(), map);
404 final SequenceI aligned = AlignmentUtils
405 .getAlignedTranslation(protein, '-', acf);
// each protein gap appears as three gap characters, each residue as its codon
406 assertEquals("---TGCCAT---TAC------CAG---", aligned.getSequenceAsString());
// same dataset sequence object as the original dna, not a copy
407 assertSame(aligned.getDatasetSequence(), dna.getDatasetSequence());
411 * Test the method that realigns protein to match mapped codon alignment.
// Builds three dna sequences that differ only in where a single gap falls,
// maps each to an identical CHYQ protein in one AlignedCodonFrame, and checks
// the gapped protein strings produced by AlignmentUtils.alignProteinAsDna.
414 public void testAlignProteinAsDna()
416 // seq1 codons are [1,2,3] [4,5,6] [7,8,9] [10,11,12]
417 SequenceI dna1 = new Sequence("Seq1", "TGCCATTACCAG-");
418 // seq2 codons are [1,3,4] [5,6,7] [8,9,10] [11,12,13]
419 SequenceI dna2 = new Sequence("Seq2", "T-GCCATTACCAG");
420 // seq3 codons are [1,2,3] [4,5,7] [8,9,10] [11,12,13]
421 SequenceI dna3 = new Sequence("Seq3", "TGCCA-TTACCAG");
422 AlignmentI dna = new Alignment(new SequenceI[]
423 { dna1, dna2, dna3 });
424 dna.setDataset(null);
426 // protein alignment will be realigned like dna
427 SequenceI prot1 = new Sequence("Seq1", "CHYQ");
428 SequenceI prot2 = new Sequence("Seq2", "CHYQ");
429 SequenceI prot3 = new Sequence("Seq3", "CHYQ");
430 AlignmentI protein = new Alignment(new SequenceI[]
431 { prot1, prot2, prot3 });
432 protein.setDataset(null);
434 MapList map = new MapList(new int[]
// the same map is used for all three dna/protein pairs
437 AlignedCodonFrame acf = new AlignedCodonFrame();
438 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
439 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
440 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
441 protein.setCodonFrames(Collections.singleton(acf));
444 * Translated codon order is [1,2,3] [1,3,4] [4,5,6] [4,5,7] [5,6,7] [7,8,9]
445 * [8,9,10] [10,11,12] [11,12,13]
447 AlignmentUtils.alignProteinAsDna(protein, dna);
// nine distinct codon positions above, hence nine columns in each result
448 assertEquals("C-H--Y-Q-", prot1.getSequenceAsString());
449 assertEquals("-C--H-Y-Q", prot2.getSequenceAsString());
450 assertEquals("C--H--Y-Q", prot3.getSequenceAsString());
454 * Test the method that tests whether a CDNA sequence translates to a protein
// Exercises AlignmentUtils.translatesAs(cdna chars, start offset, protein
// chars) against the protein FPKG, with and without extra leading atg and
// trailing taa/tag/tga bases, and finally against a non-matching protein.
458 public void testTranslatesAs()
// cdna is exactly the four codons of the protein
460 assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0,
461 "FPKG".toCharArray()));
// leading atg is skipped by starting at offset 3
463 assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(),
464 3, "FPKG".toCharArray()));
// trailing taa after the coding bases
466 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
467 0, "FPKG".toCharArray()));
// trailing tag
469 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtag".toCharArray(),
470 0, "FPKG".toCharArray()));
// trailing tga
472 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtga".toCharArray(),
473 0, "FPKG".toCharArray()));
// NOTE(review): the next three literals are 17 bases long ("...aaaggtaa" etc.),
// so after the atg they read ttt ccc aaa ggt plus two leftover bases rather
// than ending in a complete taa/tag/tga as in the 15-base cases above
// ("...aaagggtaa"). A 'g' looks to have been dropped - confirm and restore.
474 // with start and stop codon1
475 assertTrue(AlignmentUtils.translatesAs(
476 "atgtttcccaaaggtaa".toCharArray(), 3, "FPKG".toCharArray()));
477 // with start and stop codon2
478 assertTrue(AlignmentUtils.translatesAs(
479 "atgtttcccaaaggtag".toCharArray(), 3, "FPKG".toCharArray()));
480 // with start and stop codon3
481 assertTrue(AlignmentUtils.translatesAs(
482 "atgtttcccaaaggtga".toCharArray(), 3, "FPKG".toCharArray()));
// FPMG differs from FPKG in its third residue, so no match is expected
485 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
487 "FPMG".toCharArray()));
491 * Test mapping of protein to cDNA, for cases where the cDNA has start and/or
492 * stop codons in addition to the protein coding sequence.
494 * @throws IOException
// Maps three proteins onto four cDNA sequences, some of which carry an extra
// leading ATG and/or a trailing stop codon, and checks that each mapped dna
// range covers only the nine protein-coding bases.
497 public void testMapProteinToCdna_withStartAndStopCodons()
500 List<SequenceI> protseqs = new ArrayList<SequenceI>();
501 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
502 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
503 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
504 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
505 protein.setDataset(null);
507 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
// ATG + TCAGCACGC: start codon, then the codons for SAR
509 dnaseqs.add(new Sequence("EMBL|A11111", "ATGTCAGCACGC"));
// GAGATACAA + TAA: the codons for EIQ, then a stop codon
511 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAATAA"));
512 // = start +EIQ + stop
513 dnaseqs.add(new Sequence("EMBL|A33333", "ATGGAAATCCAGTAG"));
// plain EIQ codons with neither start nor stop; expected to stay unmapped
514 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG"));
515 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
516 cdna.setDataset(null);
518 assertTrue(AlignmentUtils.mapProteinToCdna(protein, cdna));
520 // 3 mappings made, each from 1 to 1 sequence
521 assertEquals(3, protein.getCodonFrames().size());
522 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
523 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
524 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
526 // V12345 mapped from A22222
527 AlignedCodonFrame acf = protein.getCodonFrame(
528 protein.getSequenceAt(0)).get(0);
529 assertEquals(1, acf.getdnaSeqs().length);
530 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
531 acf.getdnaSeqs()[0]);
532 Mapping[] protMappings = acf.getProtMappings();
533 assertEquals(1, protMappings.length);
534 MapList mapList = protMappings[0].getMap();
535 assertEquals(3, mapList.getFromRatio());
536 assertEquals(1, mapList.getToRatio());
// dna range stops at 9: the trailing stop codon (10-12) is not mapped
537 assertTrue(Arrays.equals(new int[]
538 { 1, 9 }, mapList.getFromRanges().get(0)));
539 assertEquals(1, mapList.getFromRanges().size());
540 assertTrue(Arrays.equals(new int[]
541 { 1, 3 }, mapList.getToRanges().get(0)));
542 assertEquals(1, mapList.getToRanges().size());
544 // V12346 mapped from A33333 starting position 4
545 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
546 assertEquals(1, acf.getdnaSeqs().length);
547 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
548 acf.getdnaSeqs()[0]);
549 protMappings = acf.getProtMappings();
550 assertEquals(1, protMappings.length);
551 mapList = protMappings[0].getMap();
552 assertEquals(3, mapList.getFromRatio());
553 assertEquals(1, mapList.getToRatio());
// dna range 4-12 excludes both the leading ATG (1-3) and the stop (13-15)
554 assertTrue(Arrays.equals(new int[]
555 { 4, 12 }, mapList.getFromRanges().get(0)));
556 assertEquals(1, mapList.getFromRanges().size());
557 assertTrue(Arrays.equals(new int[]
558 { 1, 3 }, mapList.getToRanges().get(0)));
559 assertEquals(1, mapList.getToRanges().size());
561 // V12347 mapped to A11111 starting position 4
562 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
563 assertEquals(1, acf.getdnaSeqs().length);
564 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
565 acf.getdnaSeqs()[0]);
566 protMappings = acf.getProtMappings();
567 assertEquals(1, protMappings.length);
568 mapList = protMappings[0].getMap();
569 assertEquals(3, mapList.getFromRatio());
570 assertEquals(1, mapList.getToRatio());
571 assertTrue(Arrays.equals(new int[]
572 { 4, 12 }, mapList.getFromRanges().get(0)));
573 assertEquals(1, mapList.getFromRanges().size());
574 assertTrue(Arrays.equals(new int[]
575 { 1, 3 }, mapList.getToRanges().get(0)));
576 assertEquals(1, mapList.getToRanges().size());
578 // no mapping involving the 'extra' A44444
579 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
583 * Test mapping of protein to cDNA, for the case where we have some sequence
584 * cross-references. Verify that 1-to-many mappings are made where
585 * cross-references exist and sequences are mappable.
587 * @throws IOException
590 public void testMapProteinToCdna_withXrefs() throws IOException
592 List<SequenceI> protseqs = new ArrayList<SequenceI>();
593 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
594 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
595 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
596 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
597 protein.setDataset(null);
599 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
600 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
601 dnaseqs.add(new Sequence("EMBL|A22222", "ATGGAGATACAA")); // = start + EIQ
602 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
603 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
604 dnaseqs.add(new Sequence("EMBL|A55555", "GAGATTCAG")); // = EIQ
605 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[5]));
606 cdna.setDataset(null);
608 // Xref A22222 to V12345 (should get mapped)
609 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
610 // Xref V12345 to A44444 (should get mapped)
611 protseqs.get(0).addDBRef(new DBRefEntry("EMBL", "1", "A44444"));
612 // Xref A33333 to V12347 (sequence mismatch - should not get mapped)
613 dnaseqs.get(2).addDBRef(new DBRefEntry("UNIPROT", "1", "V12347"));
614 // as V12345 is mapped to A22222 and A44444, this leaves V12346 unmapped.
615 // it should get paired up with the unmapped A33333
616 // A11111 should be mapped to V12347
617 // A55555 is spare and has no xref so is not mapped
619 assertTrue(AlignmentUtils.mapProteinToCdna(protein, cdna));
621 // 4 protein mappings made for 3 proteins, 2 to V12345, 1 each to V12346/7
622 assertEquals(3, protein.getCodonFrames().size());
623 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
624 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
625 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
627 // one mapping for each of the first 4 cDNA sequences
628 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
629 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
630 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(2)).size());
631 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(3)).size());
633 // V12345 mapped to A22222 and A44444
634 AlignedCodonFrame acf = protein.getCodonFrame(
635 protein.getSequenceAt(0)).get(0);
636 assertEquals(2, acf.getdnaSeqs().length);
637 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
638 acf.getdnaSeqs()[0]);
639 assertEquals(cdna.getSequenceAt(3).getDatasetSequence(),
640 acf.getdnaSeqs()[1]);
642 // V12346 mapped to A33333
643 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
644 assertEquals(1, acf.getdnaSeqs().length);
645 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
646 acf.getdnaSeqs()[0]);
648 // V12347 mapped to A11111
649 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
650 assertEquals(1, acf.getdnaSeqs().length);
651 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
652 acf.getdnaSeqs()[0]);
654 // no mapping involving the 'extra' A55555
655 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(4)).isEmpty());
659 * Test mapping of protein to cDNA, for the case where we have some sequence
660 * cross-references. Verify that once we have made an xref mapping we don't
661 * also map un-xrefd sequences.
663 * @throws IOException
// Two EIQ proteins and two cDNA sequences that both translate to EIQ; only
// A22222 carries a cross-reference (to V12345). Checks that the xref pair is
// mapped first and the remaining protein and cDNA are then paired together.
666 public void testMapProteinToCdna_prioritiseXrefs() throws IOException
668 List<SequenceI> protseqs = new ArrayList<SequenceI>();
669 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
670 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
671 AlignmentI protein = new Alignment(
672 protseqs.toArray(new SequenceI[protseqs.size()]));
673 protein.setDataset(null);
675 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
676 dnaseqs.add(new Sequence("EMBL|A11111", "GAAATCCAG")); // = EIQ
677 dnaseqs.add(new Sequence("EMBL|A22222", "GAAATTCAG")); // = EIQ
678 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[dnaseqs
680 cdna.setDataset(null);
682 // Xref A22222 to V12345 (should get mapped)
683 // A11111 should then be mapped to the unmapped V12346
684 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
686 assertTrue(AlignmentUtils.mapProteinToCdna(protein, cdna));
688 // 2 protein mappings made
689 assertEquals(2, protein.getCodonFrames().size());
690 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
691 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
693 // one mapping for each of the cDNA sequences
694 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
695 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
// the first protein pairs with the second cDNA, i.e. not in list order
697 // V12345 mapped to A22222
698 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
700 assertEquals(1, acf.getdnaSeqs().length);
701 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
702 acf.getdnaSeqs()[0]);
704 // V12346 mapped to A11111
705 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
706 assertEquals(1, acf.getdnaSeqs().length);
707 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
708 acf.getdnaSeqs()[0]);
712 * Test the method that shows or hides sequence annotations by type(s) and
// Adds six annotations ("Structure" and "Temp", for seq1, for seq2 and with
// no sequence reference) to a three-sequence alignment, then checks the
// 'visible' flag of each after successive showOrHideSequenceAnnotations calls.
// ann3 and ann6 are never given a sequence reference and are expected to stay
// visible throughout.
716 public void testShowOrHideSequenceAnnotations()
718 SequenceI seq1 = new Sequence("Seq1", "AAA");
719 SequenceI seq2 = new Sequence("Seq2", "BBB");
720 SequenceI seq3 = new Sequence("Seq3", "CCC");
// one shared single-element annotation array is used for the Temp annotations
721 Annotation[] anns = new Annotation[]
722 { new Annotation(2f) };
723 AlignmentAnnotation ann1 = new AlignmentAnnotation("Structure", "ann1",
725 ann1.setSequenceRef(seq1);
726 AlignmentAnnotation ann2 = new AlignmentAnnotation("Structure", "ann2",
728 ann2.setSequenceRef(seq2);
729 AlignmentAnnotation ann3 = new AlignmentAnnotation("Structure", "ann3",
731 AlignmentAnnotation ann4 = new AlignmentAnnotation("Temp", "ann4", anns);
732 ann4.setSequenceRef(seq1);
733 AlignmentAnnotation ann5 = new AlignmentAnnotation("Temp", "ann5", anns);
734 ann5.setSequenceRef(seq2);
735 AlignmentAnnotation ann6 = new AlignmentAnnotation("Temp", "ann6", anns);
736 AlignmentI al = new Alignment(new SequenceI[] {seq1, seq2, seq3});
737 al.addAnnotation(ann1); // Structure for Seq1
738 al.addAnnotation(ann2); // Structure for Seq2
739 al.addAnnotation(ann3); // Structure for no sequence
740 al.addAnnotation(ann4); // Temp for seq1
741 al.addAnnotation(ann5); // Temp for seq2
742 al.addAnnotation(ann6); // Temp for no sequence
743 List<String> types = new ArrayList<String>();
744 List<SequenceI> scope = new ArrayList<SequenceI>();
747 * Set all sequence related Structure to hidden (ann1, ann2)
749 types.add("Structure");
// a null third argument is used here where later calls pass the scope list
750 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
752 assertFalse(ann1.visible);
753 assertFalse(ann2.visible);
754 assertTrue(ann3.visible); // not sequence-related, not affected
755 assertTrue(ann4.visible); // not Structure, not affected
756 assertTrue(ann5.visible); // "
757 assertTrue(ann6.visible); // not sequence-related, not affected
760 * Set Temp in {seq1, seq3} to hidden
766 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, false,
768 assertFalse(ann1.visible); // unchanged
769 assertFalse(ann2.visible); // unchanged
770 assertTrue(ann3.visible); // not sequence-related, not affected
771 assertFalse(ann4.visible); // Temp for seq1 hidden
772 assertTrue(ann5.visible); // not in scope, not affected
773 assertTrue(ann6.visible); // not sequence-related, not affected
776 * Set Temp in all sequences to hidden
782 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
784 assertFalse(ann1.visible); // unchanged
785 assertFalse(ann2.visible); // unchanged
786 assertTrue(ann3.visible); // not sequence-related, not affected
787 assertFalse(ann4.visible); // Temp for seq1 hidden
788 assertFalse(ann5.visible); // Temp for seq2 hidden
789 assertTrue(ann6.visible); // not sequence-related, not affected
792 * Set all types in {seq1, seq3} to visible
// from here on the fourth argument is true rather than false
798 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, true,
800 assertTrue(ann1.visible); // Structure for seq1 set visible
801 assertFalse(ann2.visible); // not in scope, unchanged
802 assertTrue(ann3.visible); // not sequence-related, not affected
803 assertTrue(ann4.visible); // Temp for seq1 set visible
804 assertFalse(ann5.visible); // not in scope, unchanged
805 assertTrue(ann6.visible); // not sequence-related, not affected
808 * Set all types in all scope to hidden
810 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, true,
812 assertFalse(ann1.visible);
813 assertFalse(ann2.visible);
814 assertTrue(ann3.visible); // not sequence-related, not affected
815 assertFalse(ann4.visible);
816 assertFalse(ann5.visible);
817 assertTrue(ann6.visible); // not sequence-related, not affected
821 * Tests for the method that checks if one sequence cross-references another
824 public void testHasCrossRef()
826 assertFalse(AlignmentUtils.hasCrossRef(null, null));
827 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
828 assertFalse(AlignmentUtils.hasCrossRef(seq1, null));
829 assertFalse(AlignmentUtils.hasCrossRef(null, seq1));
830 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
831 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
834 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20193"));
835 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
837 // case-insensitive; version number is ignored
838 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20192"));
839 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
842 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
843 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
844 // test is one-way only
845 assertFalse(AlignmentUtils.hasCrossRef(seq2, seq1));
849 * Tests for the method that checks if either sequence cross-references the
853 public void testHaveCrossRef()
855 assertFalse(AlignmentUtils.hasCrossRef(null, null));
856 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
857 assertFalse(AlignmentUtils.haveCrossRef(seq1, null));
858 assertFalse(AlignmentUtils.haveCrossRef(null, seq1));
859 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
860 assertFalse(AlignmentUtils.haveCrossRef(seq1, seq2));
862 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
863 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
864 // next is true for haveCrossRef, false for hasCrossRef
865 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
867 // now the other way round
869 seq2.addDBRef(new DBRefEntry("EMBL", "1", "A12345"));
870 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
871 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
874 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
875 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
876 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
880 * Test the method that extracts the exon-only part of a dna alignment.
// Builds two dna sequences whose upper-case triplets are mapped to peptides,
// calls AlignmentUtils.makeExonAlignment, and checks both the exon-only
// sequences returned and that each peptide position now resolves (through
// MappingUtils.buildSearchResults) to the right range of a new exon sequence.
883 public void testMakeExonAlignment()
885 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
886 SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC");
887 SequenceI pep1 = new Sequence("pep1", "GF");
888 SequenceI pep2 = new Sequence("pep2", "GFP");
889 dna1.createDatasetSequence();
890 dna2.createDatasetSequence();
891 pep1.createDatasetSequence();
892 pep2.createDatasetSequence();
894 Set<AlignedCodonFrame> mappings = new HashSet<AlignedCodonFrame>();
// dna1 positions 4-6 (GGG) and 10-12 (TTT)
895 MapList map = new MapList(new int[]
896 { 4, 6, 10, 12 }, new int[]
898 AlignedCodonFrame acf = new AlignedCodonFrame();
899 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
// dna2 positions 1-3 (GGG), 7-9 (TTT) and 13-15 (CCC)
901 map = new MapList(new int[]
902 { 1, 3, 7, 9, 13, 15 }, new int[]
904 acf = new AlignedCodonFrame();
905 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
908 AlignmentI exons = AlignmentUtils.makeExonAlignment(new SequenceI[]
909 { dna1, dna2 }, mappings);
// only the mapped triplets remain, joined in order
910 assertEquals(2, exons.getSequences().size());
911 assertEquals("GGGTTT", exons.getSequenceAt(0).getSequenceAsString());
912 assertEquals("GGGTTTCCC", exons.getSequenceAt(1).getSequenceAsString());
915 * Verify updated mappings
917 assertEquals(2, mappings.size());
920 * Mapping from pep1 to GGGTTT in first new exon sequence
922 List<AlignedCodonFrame> pep1Mapping = MappingUtils
923 .findMappingsForSequence(pep1, mappings);
924 assertEquals(1, pep1Mapping.size());
// pep1 residue 1 is expected at exon positions 1-3
926 SearchResults sr = MappingUtils.buildSearchResults(pep1, 1, mappings);
927 assertEquals(1, sr.getResults().size());
928 Match m = sr.getResults().get(0);
929 assertEquals(exons.getSequenceAt(0).getDatasetSequence(),
931 assertEquals(1, m.getStart());
932 assertEquals(3, m.getEnd());
// pep1 residue 2 is expected at exon positions 4-6
934 sr = MappingUtils.buildSearchResults(pep1, 2, mappings);
935 m = sr.getResults().get(0);
936 assertEquals(exons.getSequenceAt(0).getDatasetSequence(),
938 assertEquals(4, m.getStart());
939 assertEquals(6, m.getEnd());
942 * Mapping from pep2 to GGGTTTCCC in second new exon sequence
944 List<AlignedCodonFrame> pep2Mapping = MappingUtils
945 .findMappingsForSequence(pep2, mappings);
946 assertEquals(1, pep2Mapping.size());
// pep2 residues 1, 2, 3 are expected at exon positions 1-3, 4-6, 7-9
948 sr = MappingUtils.buildSearchResults(pep2, 1, mappings);
949 assertEquals(1, sr.getResults().size());
950 m = sr.getResults().get(0);
951 assertEquals(exons.getSequenceAt(1).getDatasetSequence(),
953 assertEquals(1, m.getStart());
954 assertEquals(3, m.getEnd());
956 sr = MappingUtils.buildSearchResults(pep2, 2, mappings);
957 m = sr.getResults().get(0);
958 assertEquals(exons.getSequenceAt(1).getDatasetSequence(),
960 assertEquals(4, m.getStart());
961 assertEquals(6, m.getEnd());
963 sr = MappingUtils.buildSearchResults(pep2, 3, mappings);
964 m = sr.getResults().get(0);
965 assertEquals(exons.getSequenceAt(1).getDatasetSequence(),
967 assertEquals(7, m.getStart());
968 assertEquals(9, m.getEnd());
972 * Test the method that makes an exon-only sequence from a DNA sequence and
973 * its product mapping. Test includes the expected case that the DNA sequence
974 * already has a protein product (Uniprot translation) which in turn has an
975 * x-ref to the EMBLCDS record.
978 public void testMakeExonSequences()
// Fixture: one dna sequence and one peptide, each with a dataset sequence.
// The peptide's dataset sequence carries a DBRefEntry built from
// ("EMBLCDS", "2", "A12345"), which the assertions at the end look for.
980 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
981 SequenceI pep1 = new Sequence("pep1", "GF");
982 dna1.createDatasetSequence();
983 pep1.createDatasetSequence();
984 pep1.getDatasetSequence().addDBRef(
985 new DBRefEntry("EMBLCDS", "2", "A12345"));
988 * Make the mapping from dna to protein. The protein sequence has a DBRef to
991 Set<AlignedCodonFrame> mappings = new HashSet<AlignedCodonFrame>();
992 MapList map = new MapList(new int[]
993 { 4, 6, 10, 12 }, new int[]
995 AlignedCodonFrame acf = new AlignedCodonFrame();
996 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
// Derive the exon sequence(s) for dna1 from the dna-to-protein mapping;
// exactly one exon sequence is expected.
999 AlignedCodonFrame newMapping = new AlignedCodonFrame();
1000 List<SequenceI> exons = AlignmentUtils.makeExonSequences(dna1, acf,
1002 assertEquals(1, exons.size());
1003 SequenceI exon = exons.get(0);
// The exon is expected to hold only the mapped ranges 4-6 and 10-12 (GGG and
// TTT), to be named "dna1|A12345", and to carry a single db reference whose
// source, version and accession equal those added to the peptide above.
1005 assertEquals("GGGTTT", exon.getSequenceAsString());
1006 assertEquals("dna1|A12345", exon.getName());
1007 assertEquals(1, exon.getDBRef().length);
1008 DBRefEntry cdsRef = exon.getDBRef()[0];
1009 assertEquals("EMBLCDS", cdsRef.getSource());
1010 assertEquals("2", cdsRef.getVersion());
1011 assertEquals("A12345", cdsRef.getAccessionId());
1015 * Test the method that makes an exon-only alignment from a DNA sequence and
1016 * its product mappings, for the case where there are multiple exon mappings
1017 * to different protein products.
1020 public void testMakeExonAlignment_multipleProteins()
// Fixture: a single dna sequence and three peptides, all with dataset
// sequences. Each peptide's dataset sequence is tagged with its own EMBLCDS
// db reference (versions "2", "3", "4"; accessions A12345, A12346, A12347).
1022 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
1023 SequenceI pep1 = new Sequence("pep1", "GF"); // GGGTTT
1024 SequenceI pep2 = new Sequence("pep2", "KP"); // aaaccc
1025 SequenceI pep3 = new Sequence("pep3", "KF"); // aaaTTT
1026 dna1.createDatasetSequence();
1027 pep1.createDatasetSequence();
1028 pep2.createDatasetSequence();
1029 pep3.createDatasetSequence();
1030 pep1.getDatasetSequence().addDBRef(
1031 new DBRefEntry("EMBLCDS", "2", "A12345"));
1032 pep2.getDatasetSequence().addDBRef(
1033 new DBRefEntry("EMBLCDS", "3", "A12346"));
1034 pep3.getDatasetSequence().addDBRef(
1035 new DBRefEntry("EMBLCDS", "4", "A12347"));
1038 * Make the mappings from dna to protein. Using LinkedHashset is a
1039 * convenience so results are in the input order. There is no assertion that
1040 * the generated exon sequences are in any particular order.
1042 Set<AlignedCodonFrame> mappings = new LinkedHashSet<AlignedCodonFrame>();
1043 // map ...GGG...TTT to GF
1044 MapList map = new MapList(new int[]
1045 { 4, 6, 10, 12 }, new int[]
1047 AlignedCodonFrame acf = new AlignedCodonFrame();
1048 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
1051 // map aaa...ccc to KP
1052 map = new MapList(new int[]
1053 { 1, 3, 7, 9 }, new int[]
1055 acf = new AlignedCodonFrame();
1056 acf.addMap(dna1.getDatasetSequence(), pep2.getDatasetSequence(), map);
1059 // map aaa......TTT to KF
1060 map = new MapList(new int[]
1061 { 1, 3, 10, 12 }, new int[]
1063 acf = new AlignedCodonFrame();
1064 acf.addMap(dna1.getDatasetSequence(), pep3.getDatasetSequence(), map);
// Only dna1 is supplied as input; three exon sequences are expected, one for
// each of the three mappings made from dna1 above.
1067 AlignmentI exal = AlignmentUtils.makeExonAlignment(new SequenceI[]
1068 { dna1 }, mappings);
1071 * Verify we have 3 exon sequences, mapped to pep1/2/3 respectively
1073 List<SequenceI> exons = exal.getSequences();
1074 assertEquals(3, exons.size());
// First exon: dna1 ranges 4-6 and 10-12, named with accession A12345 and
// carrying a single db reference matching the one added to pep1.
1076 SequenceI exon = exons.get(0);
1077 assertEquals("GGGTTT", exon.getSequenceAsString());
1078 assertEquals("dna1|A12345", exon.getName());
1079 assertEquals(1, exon.getDBRef().length);
1080 DBRefEntry cdsRef = exon.getDBRef()[0];
1081 assertEquals("EMBLCDS", cdsRef.getSource());
1082 assertEquals("2", cdsRef.getVersion());
1083 assertEquals("A12345", cdsRef.getAccessionId());
// Second exon: dna1 ranges 1-3 and 7-9 (the lower case of the source is
// kept), named with accession A12346 and matching pep2's db reference.
1085 exon = exons.get(1);
1086 assertEquals("aaaccc", exon.getSequenceAsString());
1087 assertEquals("dna1|A12346", exon.getName());
1088 assertEquals(1, exon.getDBRef().length);
1089 cdsRef = exon.getDBRef()[0];
1090 assertEquals("EMBLCDS", cdsRef.getSource());
1091 assertEquals("3", cdsRef.getVersion());
1092 assertEquals("A12346", cdsRef.getAccessionId());
// Third exon: dna1 ranges 1-3 and 10-12 (mixed case kept as in the source),
// named with accession A12347 and matching pep3's db reference.
1094 exon = exons.get(2);
1095 assertEquals("aaaTTT", exon.getSequenceAsString());
1096 assertEquals("dna1|A12347", exon.getName());
1097 assertEquals(1, exon.getDBRef().length);
1098 cdsRef = exon.getDBRef()[0];
1099 assertEquals("EMBLCDS", cdsRef.getSource());
1100 assertEquals("4", cdsRef.getVersion());
1101 assertEquals("A12347", cdsRef.getAccessionId());