2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import static org.junit.Assert.assertEquals;
24 import static org.junit.Assert.assertFalse;
25 import static org.junit.Assert.assertSame;
26 import static org.junit.Assert.assertTrue;
28 import java.io.IOException;
29 import java.util.ArrayList;
30 import java.util.Arrays;
31 import java.util.Collections;
32 import java.util.HashSet;
33 import java.util.List;
37 import org.junit.Test;
39 import jalview.datamodel.AlignedCodonFrame;
40 import jalview.datamodel.Alignment;
41 import jalview.datamodel.AlignmentAnnotation;
42 import jalview.datamodel.AlignmentI;
43 import jalview.datamodel.Annotation;
44 import jalview.datamodel.DBRefEntry;
45 import jalview.datamodel.Mapping;
46 import jalview.datamodel.SearchResults;
47 import jalview.datamodel.SearchResults.Match;
48 import jalview.datamodel.Sequence;
49 import jalview.datamodel.SequenceI;
50 import jalview.io.AppletFormatAdapter;
51 import jalview.io.FormatAdapter;
52 import jalview.util.MapList;
53 import jalview.util.MappingUtils;
55 public class AlignmentUtilsTests
58 private static final String TEST_DATA =
60 "#=GS D.melanogaster.1 AC AY119185.1/838-902\n" +
61 "#=GS D.melanogaster.2 AC AC092237.1/57223-57161\n" +
62 "#=GS D.melanogaster.3 AC AY060611.1/560-627\n" +
63 "D.melanogaster.1 G.AGCC.CU...AUGAUCGA\n" +
64 "#=GR D.melanogaster.1 SS ................((((\n" +
65 "D.melanogaster.2 C.AUUCAACU.UAUGAGGAU\n" +
66 "#=GR D.melanogaster.2 SS ................((((\n" +
67 "D.melanogaster.3 G.UGGCGCU..UAUGACGCA\n" +
68 "#=GR D.melanogaster.3 SS (.(((...(....(((((((\n" +
71 private static final String AA_SEQS_1 =
77 private static final String CDNA_SEQS_1 =
79 "AC-GG--CUC-CAA-CT\n" +
81 "-CG-TTA--ACG---AAGT\n";
83 private static final String CDNA_SEQS_2 =
90 public static Sequence ts=new Sequence("short","ASDASDASDASDASDASDASDASDASDASDASDASDASD");
93 public void testExpandFlanks()
95 AlignmentI al = new Alignment(new Sequence[] {});
96 for (int i=4;i<14;i+=3)
98 SequenceI s1=ts.deriveSequence().getSubSequence(i, i+7);
101 System.out.println(new AppletFormatAdapter().formatSequences("Clustal", al, true));
102 for (int flnk=-1;flnk<25; flnk++)
105 System.out.println("\nFlank size: "+flnk);
106 System.out.println(new AppletFormatAdapter().formatSequences("Clustal", exp=AlignmentUtils.expandContext(al, flnk), true));
108 for (SequenceI sq:exp.getSequences())
110 String ung = sq.getSequenceAsString().replaceAll("-+", "");
111 assertTrue("Flanking sequence not the same as original dataset sequence.\n"+ung+"\n"+sq.getDatasetSequence().getSequenceAsString(),ung.equalsIgnoreCase(sq.getDatasetSequence().getSequenceAsString()));
118 * Test method that returns a map of lists of sequences by sequence name.
120 * @throws IOException
123 public void testGetSequencesByName() throws IOException
125 final String data = ">Seq1Name\nKQYL\n" + ">Seq2Name\nRFPW\n"
126 + ">Seq1Name\nABCD\n";
127 AlignmentI al = loadAlignment(data, "FASTA");
128 Map<String, List<SequenceI>> map = AlignmentUtils
129 .getSequencesByName(al);
130 assertEquals(2, map.keySet().size());
131 assertEquals(2, map.get("Seq1Name").size());
132 assertEquals("KQYL", map.get("Seq1Name").get(0).getSequenceAsString());
133 assertEquals("ABCD", map.get("Seq1Name").get(1).getSequenceAsString());
134 assertEquals(1, map.get("Seq2Name").size());
135 assertEquals("RFPW", map.get("Seq2Name").get(0).getSequenceAsString());
138 * Helper method to load an alignment and ensure dataset sequences are set up.
143 * @throws IOException
145 protected AlignmentI loadAlignment(final String data, String format) throws IOException
147 Alignment a = new FormatAdapter().readFile(data,
148 AppletFormatAdapter.PASTE, format);
154 * Test mapping of protein to cDNA, for the case where we have no sequence
155 * cross-references, so mappings are made first-served 1-1 where sequences
158 * @throws IOException
161 public void testMapProteinToCdna_noXrefs() throws IOException
163 List<SequenceI> protseqs = new ArrayList<SequenceI>();
164 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
165 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
166 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
167 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
168 protein.setDataset(null);
170 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
171 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
172 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAA")); // = EIQ
173 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
174 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
175 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
176 cdna.setDataset(null);
178 assertTrue(AlignmentUtils.mapProteinToCdna(protein, cdna));
180 // 3 mappings made, each from 1 to 1 sequence
181 assertEquals(3, protein.getCodonFrames().size());
182 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
183 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
184 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
186 // V12345 mapped to A22222
187 AlignedCodonFrame acf = protein.getCodonFrame(
188 protein.getSequenceAt(0)).get(0);
189 assertEquals(1, acf.getdnaSeqs().length);
190 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
191 acf.getdnaSeqs()[0]);
192 Mapping[] protMappings = acf.getProtMappings();
193 assertEquals(1, protMappings.length);
194 MapList mapList = protMappings[0].getMap();
195 assertEquals(3, mapList.getFromRatio());
196 assertEquals(1, mapList.getToRatio());
197 assertTrue(Arrays.equals(new int[]
198 { 1, 9 }, mapList.getFromRanges().get(0)));
199 assertEquals(1, mapList.getFromRanges().size());
200 assertTrue(Arrays.equals(new int[]
201 { 1, 3 }, mapList.getToRanges().get(0)));
202 assertEquals(1, mapList.getToRanges().size());
204 // V12346 mapped to A33333
205 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
206 assertEquals(1, acf.getdnaSeqs().length);
207 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
208 acf.getdnaSeqs()[0]);
210 // V12347 mapped to A11111
211 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
212 assertEquals(1, acf.getdnaSeqs().length);
213 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
214 acf.getdnaSeqs()[0]);
216 // no mapping involving the 'extra' A44444
217 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
221 * Test for the alignSequenceAs method that takes two sequences and a mapping.
224 public void testAlignSequenceAs_withMapping_noIntrons()
226 MapList map = new MapList(new int[]
231 * No existing gaps in dna:
233 checkAlignSequenceAs("GGGAAA", "-A-L-", false, false, map,
237 * Now introduce gaps in dna but ignore them when realigning.
239 checkAlignSequenceAs("-G-G-G-A-A-A-", "-A-L-", false, false, map,
243 * Now include gaps in dna when realigning. First retaining 'mapped' gaps
244 * only, i.e. those within the exon region.
246 checkAlignSequenceAs("-G-G--G-A--A-A-", "-A-L-", true, false, map,
247 "---G-G--G---A--A-A");
250 * Include all gaps in dna when realigning (within and without the exon
251 * region). The leading gap, and the gaps between codons, are subsumed by
252 * the protein alignment gap.
254 checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", true, true, map,
258 * Include only unmapped gaps in dna when realigning (outside the exon
259 * region). The leading gap, and the gaps between codons, are subsumed by
260 * the protein alignment gap.
262 checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", false, true, map,
267 * Test for the alignSequenceAs method that takes two sequences and a mapping.
270 public void testAlignSequenceAs_withMapping_withIntrons()
273 * Exons at codon 2 (AAA) and 4 (TTT)
275 MapList map = new MapList(new int[]
276 { 4, 6, 10, 12 }, new int[]
280 * Simple case: no gaps in dna
282 checkAlignSequenceAs("GGGAAACCCTTTGGG", "--A-L-", false, false, map,
283 "GGG---AAACCCTTTGGG");
286 * Add gaps to dna - but ignore when realigning.
288 checkAlignSequenceAs("-G-G-G--A--A---AC-CC-T-TT-GG-G-", "--A-L-",
289 false, false, map, "GGG---AAACCCTTTGGG");
292 * Add gaps to dna - include within exons only when realigning.
294 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
295 true, false, map, "GGG---A--A---ACCCT-TTGGG");
298 * Include gaps outside exons only when realigning.
300 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
301 false, true, map, "-G-G-GAAAC-CCTTT-GG-G-");
304 * Include gaps following first intron if we are 'preserving mapped gaps'
306 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
307 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
310 * Include all gaps in dna when realigning.
312 checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
313 true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
317 * Test for the case where not all of the protein sequence is mapped to cDNA.
320 public void testAlignSequenceAs_withMapping_withUnmappedProtein()
324 * Exons at codon 2 (AAA) and 4 (TTT) mapped to A and P
326 final MapList map = new MapList(new int[]
327 { 4, 6, 10, 12 }, new int[]
328 { 1, 1, 3, 3 }, 3, 1);
332 * Expect alignment does nothing (aborts realignment). Change this test
333 * first if different behaviour wanted.
335 checkAlignSequenceAs("GGGAAACCCTTTGGG", "-A-L-P-", false,
336 false, map, "GGGAAACCCTTTGGG");
340 * Helper method that performs and verifies the method under test.
344 * @param preserveMappedGaps
345 * @param preserveUnmappedGaps
349 protected void checkAlignSequenceAs(final String dnaSeq,
350 final String proteinSeq, final boolean preserveMappedGaps,
351 final boolean preserveUnmappedGaps, MapList map,
352 final String expected)
354 SequenceI dna = new Sequence("Seq1", dnaSeq);
355 dna.createDatasetSequence();
356 SequenceI protein = new Sequence("Seq1", proteinSeq);
357 protein.createDatasetSequence();
358 AlignedCodonFrame acf = new AlignedCodonFrame();
359 acf.addMap(dna.getDatasetSequence(), protein.getDatasetSequence(), map);
361 AlignmentUtils.alignSequenceAs(dna, protein, acf, "---", '-',
362 preserveMappedGaps, preserveUnmappedGaps);
363 assertEquals(expected, dna.getSequenceAsString());
367 * Test for the alignSequenceAs method where we preserve gaps in introns only.
370 public void testAlignSequenceAs_keepIntronGapsOnly()
374 * Intron GGGAAA followed by exon CCCTTT
376 MapList map = new MapList(new int[]
380 checkAlignSequenceAs("GG-G-AA-A-C-CC-T-TT", "AL",
381 false, true, map, "GG-G-AA-ACCCTTT");
385 * Test for the method that generates an aligned translated sequence from one
389 public void testGetAlignedTranslation_dnaLikeProtein()
391 // dna alignment will be replaced
392 SequenceI dna = new Sequence("Seq1", "T-G-CC-A--T-TAC-CAG-");
393 dna.createDatasetSequence();
394 // protein alignment will be 'applied' to dna
395 SequenceI protein = new Sequence("Seq1", "-CH-Y--Q-");
396 protein.createDatasetSequence();
397 MapList map = new MapList(new int[]
400 AlignedCodonFrame acf = new AlignedCodonFrame();
401 acf.addMap(dna.getDatasetSequence(), protein.getDatasetSequence(), map);
403 final SequenceI aligned = AlignmentUtils
404 .getAlignedTranslation(protein, '-', acf);
405 assertEquals("---TGCCAT---TAC------CAG---", aligned.getSequenceAsString());
406 assertSame(aligned.getDatasetSequence(), dna.getDatasetSequence());
410 * Test the method that realigns protein to match mapped codon alignment.
413 public void testAlignProteinAsDna()
415 // seq1 codons are [1,2,3] [4,5,6] [7,8,9] [10,11,12]
416 SequenceI dna1 = new Sequence("Seq1", "TGCCATTACCAG-");
417 // seq2 codons are [1,3,4] [5,6,7] [8,9,10] [11,12,13]
418 SequenceI dna2 = new Sequence("Seq2", "T-GCCATTACCAG");
419 // seq3 codons are [1,2,3] [4,5,7] [8,9,10] [11,12,13]
420 SequenceI dna3 = new Sequence("Seq3", "TGCCA-TTACCAG");
421 AlignmentI dna = new Alignment(new SequenceI[]
422 { dna1, dna2, dna3 });
423 dna.setDataset(null);
425 // protein alignment will be realigned like dna
426 SequenceI prot1 = new Sequence("Seq1", "CHYQ");
427 SequenceI prot2 = new Sequence("Seq2", "CHYQ");
428 SequenceI prot3 = new Sequence("Seq3", "CHYQ");
429 AlignmentI protein = new Alignment(new SequenceI[]
430 { prot1, prot2, prot3 });
431 protein.setDataset(null);
433 MapList map = new MapList(new int[]
436 AlignedCodonFrame acf = new AlignedCodonFrame();
437 acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
438 acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
439 acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
440 protein.setCodonFrames(Collections.singleton(acf));
443 * Translated codon order is [1,2,3] [1,3,4] [4,5,6] [4,5,7] [5,6,7] [7,8,9]
444 * [8,9,10] [10,11,12] [11,12,13]
446 AlignmentUtils.alignProteinAsDna(protein, dna);
447 assertEquals("C-H--Y-Q-", prot1.getSequenceAsString());
448 assertEquals("-C--H-Y-Q", prot2.getSequenceAsString());
449 assertEquals("C--H--Y-Q", prot3.getSequenceAsString());
453 * Test the method that tests whether a CDNA sequence translates to a protein
457 public void testTranslatesAs()
459 assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0,
460 "FPKG".toCharArray()));
462 assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(),
463 3, "FPKG".toCharArray()));
465 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
466 0, "FPKG".toCharArray()));
468 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtag".toCharArray(),
469 0, "FPKG".toCharArray()));
471 assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtga".toCharArray(),
472 0, "FPKG".toCharArray()));
473 // with start and stop codon1
474 assertTrue(AlignmentUtils.translatesAs(
475 "atgtttcccaaaggtaa".toCharArray(), 3, "FPKG".toCharArray()));
476 // with start and stop codon2
477 assertTrue(AlignmentUtils.translatesAs(
478 "atgtttcccaaaggtag".toCharArray(), 3, "FPKG".toCharArray()));
479 // with start and stop codon3
480 assertTrue(AlignmentUtils.translatesAs(
481 "atgtttcccaaaggtga".toCharArray(), 3, "FPKG".toCharArray()));
484 assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
486 "FPMG".toCharArray()));
490 * Test mapping of protein to cDNA, for cases where the cDNA has start and/or
491 * stop codons in addition to the protein coding sequence.
493 * @throws IOException
496 public void testMapProteinToCdna_withStartAndStopCodons()
499 List<SequenceI> protseqs = new ArrayList<SequenceI>();
500 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
501 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
502 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
503 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
504 protein.setDataset(null);
506 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
508 dnaseqs.add(new Sequence("EMBL|A11111", "ATGTCAGCACGC"));
510 dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAATAA"));
511 // = start +EIQ + stop
512 dnaseqs.add(new Sequence("EMBL|A33333", "ATGGAAATCCAGTAG"));
513 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG"));
514 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
515 cdna.setDataset(null);
517 assertTrue(AlignmentUtils.mapProteinToCdna(protein, cdna));
519 // 3 mappings made, each from 1 to 1 sequence
520 assertEquals(3, protein.getCodonFrames().size());
521 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
522 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
523 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
525 // V12345 mapped from A22222
526 AlignedCodonFrame acf = protein.getCodonFrame(
527 protein.getSequenceAt(0)).get(0);
528 assertEquals(1, acf.getdnaSeqs().length);
529 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
530 acf.getdnaSeqs()[0]);
531 Mapping[] protMappings = acf.getProtMappings();
532 assertEquals(1, protMappings.length);
533 MapList mapList = protMappings[0].getMap();
534 assertEquals(3, mapList.getFromRatio());
535 assertEquals(1, mapList.getToRatio());
536 assertTrue(Arrays.equals(new int[]
537 { 1, 9 }, mapList.getFromRanges().get(0)));
538 assertEquals(1, mapList.getFromRanges().size());
539 assertTrue(Arrays.equals(new int[]
540 { 1, 3 }, mapList.getToRanges().get(0)));
541 assertEquals(1, mapList.getToRanges().size());
543 // V12346 mapped from A33333 starting position 4
544 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
545 assertEquals(1, acf.getdnaSeqs().length);
546 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
547 acf.getdnaSeqs()[0]);
548 protMappings = acf.getProtMappings();
549 assertEquals(1, protMappings.length);
550 mapList = protMappings[0].getMap();
551 assertEquals(3, mapList.getFromRatio());
552 assertEquals(1, mapList.getToRatio());
553 assertTrue(Arrays.equals(new int[]
554 { 4, 12 }, mapList.getFromRanges().get(0)));
555 assertEquals(1, mapList.getFromRanges().size());
556 assertTrue(Arrays.equals(new int[]
557 { 1, 3 }, mapList.getToRanges().get(0)));
558 assertEquals(1, mapList.getToRanges().size());
560 // V12347 mapped to A11111 starting position 4
561 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
562 assertEquals(1, acf.getdnaSeqs().length);
563 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
564 acf.getdnaSeqs()[0]);
565 protMappings = acf.getProtMappings();
566 assertEquals(1, protMappings.length);
567 mapList = protMappings[0].getMap();
568 assertEquals(3, mapList.getFromRatio());
569 assertEquals(1, mapList.getToRatio());
570 assertTrue(Arrays.equals(new int[]
571 { 4, 12 }, mapList.getFromRanges().get(0)));
572 assertEquals(1, mapList.getFromRanges().size());
573 assertTrue(Arrays.equals(new int[]
574 { 1, 3 }, mapList.getToRanges().get(0)));
575 assertEquals(1, mapList.getToRanges().size());
577 // no mapping involving the 'extra' A44444
578 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
582 * Test mapping of protein to cDNA, for the case where we have some sequence
583 * cross-references. Verify that 1-to-many mappings are made where
584 * cross-references exist and sequences are mappable.
586 * @throws IOException
589 public void testMapProteinToCdna_withXrefs() throws IOException
591 List<SequenceI> protseqs = new ArrayList<SequenceI>();
592 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
593 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
594 protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
595 AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
596 protein.setDataset(null);
598 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
599 dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
600 dnaseqs.add(new Sequence("EMBL|A22222", "ATGGAGATACAA")); // = start + EIQ
601 dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
602 dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
603 dnaseqs.add(new Sequence("EMBL|A55555", "GAGATTCAG")); // = EIQ
604 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[5]));
605 cdna.setDataset(null);
607 // Xref A22222 to V12345 (should get mapped)
608 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
609 // Xref V12345 to A44444 (should get mapped)
610 protseqs.get(0).addDBRef(new DBRefEntry("EMBL", "1", "A44444"));
611 // Xref A33333 to V12347 (sequence mismatch - should not get mapped)
612 dnaseqs.get(2).addDBRef(new DBRefEntry("UNIPROT", "1", "V12347"));
613 // as V12345 is mapped to A22222 and A44444, this leaves V12346 unmapped.
614 // it should get paired up with the unmapped A33333
615 // A11111 should be mapped to V12347
616 // A55555 is spare and has no xref so is not mapped
618 assertTrue(AlignmentUtils.mapProteinToCdna(protein, cdna));
620 // 4 protein mappings made for 3 proteins, 2 to V12345, 1 each to V12346/7
621 assertEquals(3, protein.getCodonFrames().size());
622 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
623 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
624 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
626 // one mapping for each of the first 4 cDNA sequences
627 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
628 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
629 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(2)).size());
630 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(3)).size());
632 // V12345 mapped to A22222 and A44444
633 AlignedCodonFrame acf = protein.getCodonFrame(
634 protein.getSequenceAt(0)).get(0);
635 assertEquals(2, acf.getdnaSeqs().length);
636 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
637 acf.getdnaSeqs()[0]);
638 assertEquals(cdna.getSequenceAt(3).getDatasetSequence(),
639 acf.getdnaSeqs()[1]);
641 // V12346 mapped to A33333
642 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
643 assertEquals(1, acf.getdnaSeqs().length);
644 assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
645 acf.getdnaSeqs()[0]);
647 // V12347 mapped to A11111
648 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
649 assertEquals(1, acf.getdnaSeqs().length);
650 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
651 acf.getdnaSeqs()[0]);
653 // no mapping involving the 'extra' A55555
654 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(4)).isEmpty());
658 * Test mapping of protein to cDNA, for the case where we have some sequence
659 * cross-references. Verify that once we have made an xref mapping we don't
660 * also map un-xrefd sequences.
662 * @throws IOException
665 public void testMapProteinToCdna_prioritiseXrefs() throws IOException
667 List<SequenceI> protseqs = new ArrayList<SequenceI>();
668 protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
669 protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
670 AlignmentI protein = new Alignment(
671 protseqs.toArray(new SequenceI[protseqs.size()]));
672 protein.setDataset(null);
674 List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
675 dnaseqs.add(new Sequence("EMBL|A11111", "GAAATCCAG")); // = EIQ
676 dnaseqs.add(new Sequence("EMBL|A22222", "GAAATTCAG")); // = EIQ
677 AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[dnaseqs
679 cdna.setDataset(null);
681 // Xref A22222 to V12345 (should get mapped)
682 // A11111 should then be mapped to the unmapped V12346
683 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
685 assertTrue(AlignmentUtils.mapProteinToCdna(protein, cdna));
687 // 2 protein mappings made
688 assertEquals(2, protein.getCodonFrames().size());
689 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
690 assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
692 // one mapping for each of the cDNA sequences
693 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
694 assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
696 // V12345 mapped to A22222
697 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
699 assertEquals(1, acf.getdnaSeqs().length);
700 assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
701 acf.getdnaSeqs()[0]);
703 // V12346 mapped to A11111
704 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
705 assertEquals(1, acf.getdnaSeqs().length);
706 assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
707 acf.getdnaSeqs()[0]);
711 * Test the method that shows or hides sequence annotations by type(s) and
715 public void testShowOrHideSequenceAnnotations()
717 SequenceI seq1 = new Sequence("Seq1", "AAA");
718 SequenceI seq2 = new Sequence("Seq2", "BBB");
719 SequenceI seq3 = new Sequence("Seq3", "CCC");
720 Annotation[] anns = new Annotation[]
721 { new Annotation(2f) };
722 AlignmentAnnotation ann1 = new AlignmentAnnotation("Structure", "ann1",
724 ann1.setSequenceRef(seq1);
725 AlignmentAnnotation ann2 = new AlignmentAnnotation("Structure", "ann2",
727 ann2.setSequenceRef(seq2);
728 AlignmentAnnotation ann3 = new AlignmentAnnotation("Structure", "ann3",
730 AlignmentAnnotation ann4 = new AlignmentAnnotation("Temp", "ann4", anns);
731 ann4.setSequenceRef(seq1);
732 AlignmentAnnotation ann5 = new AlignmentAnnotation("Temp", "ann5", anns);
733 ann5.setSequenceRef(seq2);
734 AlignmentAnnotation ann6 = new AlignmentAnnotation("Temp", "ann6", anns);
735 AlignmentI al = new Alignment(new SequenceI[] {seq1, seq2, seq3});
736 al.addAnnotation(ann1); // Structure for Seq1
737 al.addAnnotation(ann2); // Structure for Seq2
738 al.addAnnotation(ann3); // Structure for no sequence
739 al.addAnnotation(ann4); // Temp for seq1
740 al.addAnnotation(ann5); // Temp for seq2
741 al.addAnnotation(ann6); // Temp for no sequence
742 List<String> types = new ArrayList<String>();
743 List<SequenceI> scope = new ArrayList<SequenceI>();
746 * Set all sequence related Structure to hidden (ann1, ann2)
748 types.add("Structure");
749 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
751 assertFalse(ann1.visible);
752 assertFalse(ann2.visible);
753 assertTrue(ann3.visible); // not sequence-related, not affected
754 assertTrue(ann4.visible); // not Structure, not affected
755 assertTrue(ann5.visible); // "
756 assertTrue(ann6.visible); // not sequence-related, not affected
759 * Set Temp in {seq1, seq3} to hidden
765 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, false,
767 assertFalse(ann1.visible); // unchanged
768 assertFalse(ann2.visible); // unchanged
769 assertTrue(ann3.visible); // not sequence-related, not affected
770 assertFalse(ann4.visible); // Temp for seq1 hidden
771 assertTrue(ann5.visible); // not in scope, not affected
772 assertTrue(ann6.visible); // not sequence-related, not affected
775 * Set Temp in all sequences to hidden
781 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
783 assertFalse(ann1.visible); // unchanged
784 assertFalse(ann2.visible); // unchanged
785 assertTrue(ann3.visible); // not sequence-related, not affected
786 assertFalse(ann4.visible); // Temp for seq1 hidden
787 assertFalse(ann5.visible); // Temp for seq2 hidden
788 assertTrue(ann6.visible); // not sequence-related, not affected
791 * Set all types in {seq1, seq3} to visible
797 AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, true,
799 assertTrue(ann1.visible); // Structure for seq1 set visible
800 assertFalse(ann2.visible); // not in scope, unchanged
801 assertTrue(ann3.visible); // not sequence-related, not affected
802 assertTrue(ann4.visible); // Temp for seq1 set visible
803 assertFalse(ann5.visible); // not in scope, unchanged
804 assertTrue(ann6.visible); // not sequence-related, not affected
807 * Set all types in all scope to hidden
809 AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, true,
811 assertFalse(ann1.visible);
812 assertFalse(ann2.visible);
813 assertTrue(ann3.visible); // not sequence-related, not affected
814 assertFalse(ann4.visible);
815 assertFalse(ann5.visible);
816 assertTrue(ann6.visible); // not sequence-related, not affected
820 * Tests for the method that checks if one sequence cross-references another
823 public void testHasCrossRef()
825 assertFalse(AlignmentUtils.hasCrossRef(null, null));
826 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
827 assertFalse(AlignmentUtils.hasCrossRef(seq1, null));
828 assertFalse(AlignmentUtils.hasCrossRef(null, seq1));
829 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
830 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
833 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20193"));
834 assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
836 // case-insensitive; version number is ignored
837 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20192"));
838 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
841 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
842 assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
843 // test is one-way only
844 assertFalse(AlignmentUtils.hasCrossRef(seq2, seq1));
848 * Tests for the method that checks if either sequence cross-references the
852 public void testHaveCrossRef()
854 assertFalse(AlignmentUtils.hasCrossRef(null, null));
855 SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
856 assertFalse(AlignmentUtils.haveCrossRef(seq1, null));
857 assertFalse(AlignmentUtils.haveCrossRef(null, seq1));
858 SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
859 assertFalse(AlignmentUtils.haveCrossRef(seq1, seq2));
861 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
862 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
863 // next is true for haveCrossRef, false for hasCrossRef
864 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
866 // now the other way round
868 seq2.addDBRef(new DBRefEntry("EMBL", "1", "A12345"));
869 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
870 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
873 seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
874 assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
875 assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
879 * Test the method that extracts the exon-only part of a dna alignment.
882 public void testMakeExonAlignment()
884 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
885 SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC");
886 SequenceI pep1 = new Sequence("pep1", "GF");
887 SequenceI pep2 = new Sequence("pep2", "GFP");
888 dna1.createDatasetSequence();
889 dna2.createDatasetSequence();
890 pep1.createDatasetSequence();
891 pep2.createDatasetSequence();
893 Set<AlignedCodonFrame> mappings = new HashSet<AlignedCodonFrame>();
894 MapList map = new MapList(new int[]
895 { 4, 6, 10, 12 }, new int[]
897 AlignedCodonFrame acf = new AlignedCodonFrame();
898 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
900 map = new MapList(new int[]
901 { 1, 3, 7, 9, 13, 15 }, new int[]
903 acf = new AlignedCodonFrame();
904 acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
907 AlignmentI exons = AlignmentUtils.makeExonAlignment(new SequenceI[]
908 { dna1, dna2 }, mappings);
909 assertEquals(2, exons.getSequences().size());
910 assertEquals("GGGTTT", exons.getSequenceAt(0).getSequenceAsString());
911 assertEquals("GGGTTTCCC", exons.getSequenceAt(1).getSequenceAsString());
914 * Verify updated mappings
916 assertEquals(2, mappings.size());
919 * Mapping from pep1 to GGGTTT in first new exon sequence
921 List<AlignedCodonFrame> pep1Mapping = MappingUtils
922 .findMappingsForSequence(pep1, mappings);
923 assertEquals(1, pep1Mapping.size());
925 SearchResults sr = MappingUtils.buildSearchResults(pep1, 1, mappings);
926 assertEquals(1, sr.getResults().size());
927 Match m = sr.getResults().get(0);
928 assertEquals(exons.getSequenceAt(0).getDatasetSequence(),
930 assertEquals(1, m.getStart());
931 assertEquals(3, m.getEnd());
933 sr = MappingUtils.buildSearchResults(pep1, 2, mappings);
934 m = sr.getResults().get(0);
935 assertEquals(exons.getSequenceAt(0).getDatasetSequence(),
937 assertEquals(4, m.getStart());
938 assertEquals(6, m.getEnd());
941 * Mapping from pep2 to GGGTTTCCC in second new exon sequence
943 List<AlignedCodonFrame> pep2Mapping = MappingUtils
944 .findMappingsForSequence(pep2, mappings);
945 assertEquals(1, pep2Mapping.size());
947 sr = MappingUtils.buildSearchResults(pep2, 1, mappings);
948 assertEquals(1, sr.getResults().size());
949 m = sr.getResults().get(0);
950 assertEquals(exons.getSequenceAt(1).getDatasetSequence(),
952 assertEquals(1, m.getStart());
953 assertEquals(3, m.getEnd());
955 sr = MappingUtils.buildSearchResults(pep2, 2, mappings);
956 m = sr.getResults().get(0);
957 assertEquals(exons.getSequenceAt(1).getDatasetSequence(),
959 assertEquals(4, m.getStart());
960 assertEquals(6, m.getEnd());
962 sr = MappingUtils.buildSearchResults(pep2, 3, mappings);
963 m = sr.getResults().get(0);
964 assertEquals(exons.getSequenceAt(1).getDatasetSequence(),
966 assertEquals(7, m.getStart());
967 assertEquals(9, m.getEnd());
971 * Test the method that makes an exon-only sequence from a DNA sequence and
972 * its product mapping. Test includes the expected case that the DNA sequence
973 * already has a protein product (Uniprot translation) which in turn has an
974 * x-ref to the EMBLCDS record.
977 public void testMakeExonSequence()
979 SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
980 SequenceI pep1 = new Sequence("pep1", "GF");
981 dna1.createDatasetSequence();
982 pep1.createDatasetSequence();
983 pep1.getDatasetSequence().addDBRef(
984 new DBRefEntry("EMBLCDS", "2", "A12345"));
987 * Make the mapping from dna to protein. The protein sequence has a DBRef to
990 Set<AlignedCodonFrame> mappings = new HashSet<AlignedCodonFrame>();
991 MapList map = new MapList(new int[]
992 { 4, 6, 10, 12 }, new int[]
994 AlignedCodonFrame acf = new AlignedCodonFrame();
995 acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
998 AlignedCodonFrame newMapping = new AlignedCodonFrame();
999 SequenceI exon = AlignmentUtils.makeExonSequence(dna1, acf, newMapping);
1001 assertEquals("GGGTTT", exon.getSequenceAsString());
1002 assertEquals("dna1|A12345", exon.getName());
1003 assertEquals(1, exon.getDBRef().length);
1004 DBRefEntry cdsRef = exon.getDBRef()[0];
1005 assertEquals("EMBLCDS", cdsRef.getSource());
1006 assertEquals("2", cdsRef.getVersion());
1007 assertEquals("A12345", cdsRef.getAccessionId());