2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import static org.testng.AssertJUnit.assertEquals;
24 import static org.testng.AssertJUnit.assertFalse;
25 import static org.testng.AssertJUnit.assertNotNull;
26 import static org.testng.AssertJUnit.assertNotSame;
27 import static org.testng.AssertJUnit.assertNull;
28 import static org.testng.AssertJUnit.assertSame;
29 import static org.testng.AssertJUnit.assertTrue;
31 import jalview.datamodel.AlignedCodonFrame;
32 import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
33 import jalview.datamodel.Alignment;
34 import jalview.datamodel.AlignmentI;
35 import jalview.datamodel.DBRefEntry;
36 import jalview.datamodel.Mapping;
37 import jalview.datamodel.Sequence;
38 import jalview.datamodel.SequenceFeature;
39 import jalview.datamodel.SequenceI;
40 import jalview.gui.JvOptionPane;
41 import jalview.util.DBRefUtils;
42 import jalview.util.MapList;
43 import jalview.ws.SequenceFetcher;
44 import jalview.ws.SequenceFetcherFactory;
46 import java.util.ArrayList;
47 import java.util.List;
49 import org.testng.annotations.AfterClass;
50 import org.testng.annotations.BeforeClass;
51 import org.testng.annotations.Test;
53 public class CrossRefTest
56 @BeforeClass(alwaysRun = true)
57 public void setUpJvOptionPane()
59 JvOptionPane.setInteractiveMode(false);
60 JvOptionPane.setMockResponse(JvOptionPane.CANCEL_OPTION);
63 @Test(groups = { "Functional" })
64 public void testFindXDbRefs()
66 DBRefEntry ref1 = new DBRefEntry("UNIPROT", "1", "A123");
67 DBRefEntry ref2 = new DBRefEntry("UNIPROTKB/TREMBL", "1", "A123");
68 DBRefEntry ref3 = new DBRefEntry("pdb", "1", "A123");
69 DBRefEntry ref4 = new DBRefEntry("EMBLCDSPROTEIN", "1", "A123");
70 DBRefEntry ref5 = new DBRefEntry("embl", "1", "A123");
71 DBRefEntry ref6 = new DBRefEntry("emblCDS", "1", "A123");
72 DBRefEntry ref7 = new DBRefEntry("GeneDB", "1", "A123");
73 DBRefEntry ref8 = new DBRefEntry("PFAM", "1", "A123");
74 // ENSEMBL is a source of either dna or protein sequence data
75 DBRefEntry ref9 = new DBRefEntry("ENSEMBL", "1", "A123");
76 DBRefEntry[] refs = new DBRefEntry[] { ref1, ref2, ref3, ref4, ref5,
77 ref6, ref7, ref8, ref9 };
82 DBRefEntry[] found = DBRefUtils.selectDbRefs(true, refs);
83 assertEquals(4, found.length);
84 assertSame(ref5, found[0]);
85 assertSame(ref6, found[1]);
86 assertSame(ref7, found[2]);
87 assertSame(ref9, found[3]);
90 * Just the protein refs:
92 found = DBRefUtils.selectDbRefs(false, refs);
93 assertEquals(4, found.length);
94 assertSame(ref1, found[0]);
95 assertSame(ref2, found[1]);
96 assertSame(ref4, found[2]);
97 assertSame(ref9, found[3]);
101 * Test the method that finds a sequence's "product" xref source databases,
102 * which may be direct (dbrefs on the sequence), or indirect (dbrefs on
103 * sequences which share a dbref with the sequence)
105 @Test(groups = { "Functional" }, enabled = true)
106 public void testFindXrefSourcesForSequence_proteinToDna()
108 SequenceI seq = new Sequence("Seq1", "MGKYQARLSS");
// NOTE(review): this initial list appears to be replaced by the assignment below before it is read
109 List<String> sources = new ArrayList<String>();
110 AlignmentI al = new Alignment(new SequenceI[] {});
113 * first with no dbrefs to search
115 sources = new CrossRef(new SequenceI[] { seq }, al)
116 .findXrefSourcesForSequences(false);
117 assertTrue(sources.isEmpty());
120 * add some dbrefs to sequence
122 // protein db is not a candidate for findXrefSources
123 seq.addDBRef(new DBRefEntry("UNIPROT", "0", "A1234"));
124 // dna coding databases are
125 seq.addDBRef(new DBRefEntry("EMBL", "0", "E2345"));
126 // a second EMBL xref should not result in a duplicate
127 seq.addDBRef(new DBRefEntry("EMBL", "0", "E2346"));
128 seq.addDBRef(new DBRefEntry("EMBLCDS", "0", "E2347"));
129 seq.addDBRef(new DBRefEntry("GENEDB", "0", "E2348"));
130 seq.addDBRef(new DBRefEntry("ENSEMBL", "0", "E2349"));
131 seq.addDBRef(new DBRefEntry("ENSEMBLGENOMES", "0", "E2350"));
132 sources = new CrossRef(new SequenceI[] { seq }, al)
133 .findXrefSourcesForSequences(false);
134 // method is patched to remove EMBL from the sources to match
// the expected list also omits UNIPROT and ENSEMBLGENOMES (for the latter see the TODO further down)
135 assertEquals(3, sources.size());
136 assertEquals("[EMBLCDS, GENEDB, ENSEMBL]", sources.toString());
139 * add a sequence to the alignment which has a dbref to UNIPROT|A1234
140 * and others to dna coding databases
144 seq.addDBRef(new DBRefEntry("UNIPROT", "0", "A1234"));
145 seq.addDBRef(new DBRefEntry("EMBLCDS", "0", "E2347"));
146 SequenceI seq2 = new Sequence("Seq2", "MGKYQARLSS");
147 seq2.addDBRef(new DBRefEntry("UNIPROT", "0", "A1234"));
148 seq2.addDBRef(new DBRefEntry("EMBL", "0", "E2345"));
149 seq2.addDBRef(new DBRefEntry("GENEDB", "0", "E2348"));
150 // TODO include ENSEMBLGENOMES in DBRefSource.DNACODINGDBS ?
151 al.addSequence(seq2);
// both sequences are selected this time, and seq2 is also part of the alignment being searched
152 sources = new CrossRef(new SequenceI[] { seq, seq2 }, al)
153 .findXrefSourcesForSequences(false);
154 // method removed EMBL from sources to match
155 assertEquals(2, sources.size());
156 assertEquals("[EMBLCDS, GENEDB]", sources.toString());
160 * Test for finding 'product' sequences for the case where only an indirect
161 * xref is found - not on the nucleotide sequence but on a peptide sequence in
162 * the alignment which which it shares a nucleotide dbref
164 @Test(groups = { "Functional" }, enabled = true)
165 public void testFindXrefSequences_indirectDbrefToProtein()
169 * - nucleotide dbref EMBL|AF039662
170 * - peptide dbrefs EMBL|AF039662, UNIPROT|Q9ZTS2
172 SequenceI emblSeq = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
173 emblSeq.addDBRef(new DBRefEntry("EMBL", "0", "AF039662"));
174 SequenceI uniprotSeq = new Sequence("Q9ZTS2", "MASVSATMISTS");
175 uniprotSeq.addDBRef(new DBRefEntry("EMBL", "0", "AF039662"));
176 uniprotSeq.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
179 * Find UNIPROT xrefs for nucleotide
180 * - it has no UNIPROT dbref of its own
181 * - but peptide with matching nucleotide dbref does, so is returned
183 AlignmentI al = new Alignment(new SequenceI[] { emblSeq, uniprotSeq });
184 Alignment xrefs = new CrossRef(new SequenceI[] { emblSeq }, al)
185 .findXrefSequences("UNIPROT", true);
186 assertEquals(1, xrefs.getHeight());
187 assertSame(uniprotSeq, xrefs.getSequenceAt(0));
191 * Test for finding 'product' sequences for the case where only an indirect
192 * xref is found - not on the peptide sequence but on a nucleotide sequence in
193 * the alignment which which it shares a protein dbref
195 @Test(groups = { "Functional" }, enabled = true)
196 public void testFindXrefSequences_indirectDbrefToNucleotide()
200 * - peptide dbref UNIPROT|Q9ZTS2
201 * - nucleotide dbref EMBL|AF039662, UNIPROT|Q9ZTS2
203 SequenceI uniprotSeq = new Sequence("Q9ZTS2", "MASVSATMISTS");
204 uniprotSeq.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
205 SequenceI emblSeq = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
206 emblSeq.addDBRef(new DBRefEntry("EMBL", "0", "AF039662"));
207 emblSeq.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
210 * find EMBL xrefs for peptide sequence - it has no direct
211 * dbrefs, but the 'corresponding' nucleotide sequence does, so is returned
214 * Find EMBL xrefs for peptide
215 * - it has no EMBL dbref of its own
216 * - but nucleotide with matching peptide dbref does, so is returned
218 AlignmentI al = new Alignment(new SequenceI[] { emblSeq, uniprotSeq });
219 Alignment xrefs = new CrossRef(new SequenceI[] { uniprotSeq }, al)
220 .findXrefSequences("EMBL", false);
221 assertEquals(1, xrefs.getHeight());
222 assertSame(emblSeq, xrefs.getSequenceAt(0));
226 * Test for finding 'product' sequences for the case where the selected
227 * sequence has no dbref to the desired source, and there are no indirect
228 * references via another sequence in the alignment
230 @Test(groups = { "Functional" })
231 public void testFindXrefSequences_noDbrefs()
234 * two nucleotide sequences, one with UNIPROT dbref
236 SequenceI dna1 = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
237 dna1.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
238 SequenceI dna2 = new Sequence("AJ307031", "AAACCCTTT");
241 * find UNIPROT xrefs for peptide sequence - it has no direct
242 * dbrefs, and the other sequence (which has a UNIPROT dbref) is not
243 * equatable to it, so no results found
// NOTE(review): the sequence queried below is dna2, a nucleotide sequence with no dbrefs - not a peptide as the comment above says
245 AlignmentI al = new Alignment(new SequenceI[] { dna1, dna2 });
246 Alignment xrefs = new CrossRef(new SequenceI[] { dna2 }, al)
247 .findXrefSequences("UNIPROT", true);
252 * Tests for the method that searches an alignment (with one sequence
253 * excluded) for protein/nucleotide sequences with a given cross-reference
255 @Test(groups = { "Functional" }, enabled = true)
256 public void testSearchDataset()
259 * nucleotide sequence with UNIPROT AND EMBL dbref
260 * peptide sequence with UNIPROT dbref
262 SequenceI dna1 = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
// the UNIPROT dbref on dna1 carries a map from dna positions 1-21 to positions 1-7 of a separate "pep2" sequence (3:1 ratio)
263 Mapping map = new Mapping(new Sequence("pep2", "MLAVSRG"), new MapList(
264 new int[] { 1, 21 }, new int[] { 1, 7 }, 3, 1));
265 DBRefEntry dbref = new DBRefEntry("UNIPROT", "0", "Q9ZTS2", map);
266 dna1.addDBRef(dbref);
267 dna1.addDBRef(new DBRefEntry("EMBL", "0", "AF039662"));
268 SequenceI pep1 = new Sequence("Q9ZTS2", "MLAVSRGQ");
// NOTE(review): this assignment looks unused - pep1 is given a separate new DBRefEntry on the next line and dbref is reassigned before the first search
269 dbref = new DBRefEntry("UNIPROT", "0", "Q9ZTS2");
270 pep1.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
271 AlignmentI al = new Alignment(new SequenceI[] { dna1, pep1 });
// the same result list is passed to every searchDataset call below
273 List<SequenceI> result = new ArrayList<SequenceI>();
276 * first search for a dbref nowhere on the alignment:
278 dbref = new DBRefEntry("UNIPROT", "0", "P30419");
279 CrossRef testee = new CrossRef(al.getSequencesArray(), al);
280 AlignedCodonFrame acf = new AlignedCodonFrame();
281 boolean found = testee.searchDataset(true, dna1, dbref, result, acf,
284 assertTrue(result.isEmpty());
285 assertTrue(acf.isEmpty());
288 * search for a protein sequence with dbref UNIPROT:Q9ZTS2
290 acf = new AlignedCodonFrame();
291 dbref = new DBRefEntry("UNIPROT", "0", "Q9ZTS2");
292 found = testee.searchDataset(!dna1.isProtein(), dna1, dbref, result,
293 acf, false); // search dataset with a protein xref from a dna
294 // sequence to locate the protein product
296 assertEquals(1, result.size());
297 assertSame(pep1, result.get(0));
298 assertTrue(acf.isEmpty());
301 * search for a nucleotide sequence with dbref UNIPROT:Q9ZTS2
304 acf = new AlignedCodonFrame();
305 dbref = new DBRefEntry("UNIPROT", "0", "Q9ZTS2");
306 found = testee.searchDataset(!pep1.isProtein(), pep1, dbref, result,
307 acf, false); // search dataset with a protein's direct dbref to
308 // locate dna sequences with matching xref
310 assertEquals(1, result.size());
311 assertSame(dna1, result.get(0));
312 // should now have a mapping from dna to pep1
313 List<SequenceToSequenceMapping> mappings = acf.getMappings();
314 assertEquals(1, mappings.size());
315 SequenceToSequenceMapping mapping = mappings.get(0);
316 assertSame(dna1, mapping.getFromSeq());
317 assertSame(pep1, mapping.getMapping().getTo());
// the recorded map is expected to have the same ratios and ranges as the one attached to dna1's UNIPROT dbref above
318 MapList mapList = mapping.getMapping().getMap();
319 assertEquals(1, mapList.getToRatio());
320 assertEquals(3, mapList.getFromRatio());
321 assertEquals(1, mapList.getFromRanges().size());
322 assertEquals(1, mapList.getFromRanges().get(0)[0]);
323 assertEquals(21, mapList.getFromRanges().get(0)[1]);
324 assertEquals(1, mapList.getToRanges().size());
325 assertEquals(1, mapList.getToRanges().get(0)[0]);
326 assertEquals(7, mapList.getToRanges().get(0)[1]);
330 * Test for finding 'product' sequences for the case where the selected
331 * sequence has a dbref with a mapping to a sequence. This represents the case
334 * <li>a fetched sequence is already decorated with its cross-reference (e.g.
335 * EMBL + translation), or</li>
336 * <li>Get Cross-References has been done once resulting in instantiated
337 * cross-reference mappings</li>
340 @Test(groups = { "Functional" })
341 public void testFindXrefSequences_fromDbRefMap()
344 * scenario: nucleotide sequence AF039662
345 * with dbref + mapping to Q9ZTS2 and P30419
346 * which themselves each have a dbref and feature
348 SequenceI dna1 = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
349 SequenceI pep1 = new Sequence("Q9ZTS2", "MALFQRSV");
350 SequenceI pep2 = new Sequence("P30419", "MTRRSQIF");
351 dna1.createDatasetSequence();
352 pep1.createDatasetSequence();
353 pep2.createDatasetSequence();
// dbrefs are added to the peptides' dataset sequences, whereas the features are added via the peptide sequences themselves
355 pep1.getDatasetSequence().addDBRef(
356 new DBRefEntry("Pfam", "0", "PF00111"));
357 pep1.addSequenceFeature(new SequenceFeature("type", "desc", 12, 14, 1f,
359 pep2.getDatasetSequence().addDBRef(new DBRefEntry("PDB", "0", "3JTK"));
360 pep2.addSequenceFeature(new SequenceFeature("type2", "desc2", 13, 15,
363 MapList mapList = new MapList(new int[] { 1, 24 }, new int[] { 1, 3 },
365 Mapping map = new Mapping(pep1, mapList);
366 DBRefEntry dbRef1 = new DBRefEntry("UNIPROT", "0", "Q9ZTS2", map);
367 dna1.getDatasetSequence().addDBRef(dbRef1);
368 mapList = new MapList(new int[] { 1, 24 }, new int[] { 1, 3 }, 3, 1);
369 map = new Mapping(pep2, mapList);
370 DBRefEntry dbRef2 = new DBRefEntry("UNIPROT", "0", "P30419", map);
371 dna1.getDatasetSequence().addDBRef(dbRef2);
374 * find UNIPROT xrefs for nucleotide sequence - it should pick up
377 AlignmentI al = new Alignment(new SequenceI[] { dna1 });
378 Alignment xrefs = new CrossRef(new SequenceI[] { dna1 }, al)
379 .findXrefSequences("UNIPROT", true);
380 assertEquals(2, xrefs.getHeight());
383 * cross-refs alignment holds copies of the mapped sequences
384 * including copies of their dbrefs and features
// NOTE(review): checkCopySequence is declared as (copy, original) but is called here with the original peptide first and the returned sequence second
386 checkCopySequence(pep1, xrefs.getSequenceAt(0));
387 checkCopySequence(pep2, xrefs.getSequenceAt(1));
391 * Helper method that verifies that 'copy' has the same name, start, end,
392 * sequence and dataset sequence object as 'original' (but is not the same
398 private void checkCopySequence(SequenceI copy, SequenceI original)
400 assertNotSame(copy, original);
401 assertSame(copy.getDatasetSequence(), original.getDatasetSequence());
402 assertEquals(copy.getName(), original.getName());
403 assertEquals(copy.getStart(), original.getStart());
404 assertEquals(copy.getEnd(), original.getEnd());
405 assertEquals(copy.getSequenceAsString(), original.getSequenceAsString());
409 * Test for finding 'product' sequences for the case where the selected
410 * sequence has a dbref with no mapping, triggering a fetch from database
412 @Test(groups = { "Functional" })
413 public void testFindXrefSequences_withFetch()
415 SequenceI dna1 = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
// three UNIPROT dbrefs without mappings, all with version string "ENA:0"
416 dna1.addDBRef(new DBRefEntry("UNIPROT", "ENA:0", "Q9ZTS2"));
417 dna1.addDBRef(new DBRefEntry("UNIPROT", "ENA:0", "P30419"));
418 dna1.addDBRef(new DBRefEntry("UNIPROT", "ENA:0", "P00314"));
419 final SequenceI pep1 = new Sequence("Q9ZTS2", "MYQLIRSSW");
420 pep1.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
422 final SequenceI pep2 = new Sequence("P00314", "MRKLLAASG");
423 pep2.addDBRef(new DBRefEntry("UNIPROT", "0", "P00314"));
426 * argument false suppresses adding DAS sources
427 * todo: define an interface type SequenceFetcherI and mock that
429 SequenceFetcher mockFetcher = new SequenceFetcher(false)
432 public boolean isFetchable(String source)
// canned result: always pep1 and pep2, whatever refs are requested
438 public SequenceI[] getSequences(List<DBRefEntry> refs, boolean dna)
440 return new SequenceI[] { pep1, pep2 };
// NOTE(review): the mock stays installed in the factory after this test; tearDown() resets it to null - confirm when that runs
443 SequenceFetcherFactory.setSequenceFetcher(mockFetcher);
446 * find UNIPROT xrefs for nucleotide sequence
448 AlignmentI al = new Alignment(new SequenceI[] { dna1 });
449 Alignment xrefs = new CrossRef(new SequenceI[] { dna1 }, al)
450 .findXrefSequences("UNIPROT", true);
// only the two sequences supplied by the mock are expected, although dna1 carries three UNIPROT dbrefs
451 assertEquals(2, xrefs.getHeight());
452 assertSame(pep1, xrefs.getSequenceAt(0));
453 assertSame(pep2, xrefs.getSequenceAt(1));
457 public void tearDown()
// reset the factory's fetcher to null so a mock fetcher installed by a test in this class is not left in place
459 SequenceFetcherFactory.setSequenceFetcher(null);
463 * Test for finding 'product' sequences for the case where both gene and
464 * transcript sequences have dbrefs to Uniprot.
466 @Test(groups = { "Functional" })
467 public void testFindXrefSequences_forGeneAndTranscripts()
// the 'gene' has dbrefs to both proteins and, in the visible code, no sequence features
472 SequenceI gene = new Sequence("ENSG00000157764", "CGCCTCCCTTCCCC");
473 gene.addDBRef(new DBRefEntry("UNIPROT", "0", "P15056"));
474 gene.addDBRef(new DBRefEntry("UNIPROT", "0", "H7C5K3"));
477 * 'transcript' with CDS feature (supports mapping to protein)
479 SequenceI braf001 = new Sequence("ENST00000288602", "taagATGGCGGCGCTGa");
480 braf001.addDBRef(new DBRefEntry("UNIPROT", "0", "P15056"));
481 braf001.addSequenceFeature(new SequenceFeature("CDS", "", 5, 16, 0f,
485 * 'spliced transcript' with CDS ranges
487 SequenceI braf002 = new Sequence("ENST00000497784", "gCAGGCtaTCTGTTCaa");
// NOTE(review): the version string here is "ENSEMBL|0", unlike the "0" used for the other dbrefs in this test - confirm it is intentional
488 braf002.addDBRef(new DBRefEntry("UNIPROT", "ENSEMBL|0", "H7C5K3"));
489 braf002.addSequenceFeature(new SequenceFeature("CDS", "", 2, 6, 0f,
491 braf002.addSequenceFeature(new SequenceFeature("CDS", "", 9, 15, 0f,
495 * TODO code is fragile - use of SequenceIdMatcher depends on fetched
496 * sequences having a name starting Source|Accession
497 * which happens to be true for Uniprot,PDB,EMBL but not Pfam,Rfam,Ensembl
499 final SequenceI pep1 = new Sequence("UNIPROT|P15056", "MAAL");
500 pep1.addDBRef(new DBRefEntry("UNIPROT", "0", "P15056"));
501 final SequenceI pep2 = new Sequence("UNIPROT|H7C5K3", "QALF");
502 pep2.addDBRef(new DBRefEntry("UNIPROT", "0", "H7C5K3"));
504 * argument false suppresses adding DAS sources
505 * todo: define an interface type SequenceFetcherI and mock that
507 SequenceFetcher mockFetcher = new SequenceFetcher(false)
510 public boolean isFetchable(String source)
// canned result: always pep1 and pep2, whatever refs are requested
516 public SequenceI[] getSequences(List<DBRefEntry> refs, boolean dna)
518 return new SequenceI[] { pep1, pep2 };
521 SequenceFetcherFactory.setSequenceFetcher(mockFetcher);
524 * find UNIPROT xrefs for gene and transcripts
526 * - the two proteins are retrieved but not duplicated
527 * - mappings are built from transcript (CDS) to proteins
528 * - no mappings from gene to proteins
530 SequenceI[] seqs = new SequenceI[] { gene, braf001, braf002 };
531 AlignmentI al = new Alignment(seqs);
532 Alignment xrefs = new CrossRef(seqs, al).findXrefSequences("UNIPROT",
// only the retrieved sequences are checked in the visible assertions; the mappings described above are not asserted here
534 assertEquals(2, xrefs.getHeight());
535 assertSame(pep1, xrefs.getSequenceAt(0));
536 assertSame(pep2, xrefs.getSequenceAt(1));
541 * Test that emulates this (real but simplified) case:
543 * UNIPROT|P0CE19 EMBL|J03321, EMBL|X06707, EMBL|M19487
544 * UNIPROT|P0CE20 EMBL|J03321, EMBL|X06707, EMBL|X07547
545 * Find cross-references for EMBL. These are mocked here as
546 * EMBL|J03321 with mappings to P0CE18, P0CE19, P0CE20
547 * EMBL|X06707 with mappings to P0CE17, P0CE19, P0CE20
548 * EMBL|M19487 with mappings to P0CE19, Q46432
549 * EMBL|X07547 with mappings to P0CE20, B0BCM4
550 * EMBL sequences are first 'fetched' (mocked here) for P0CE19.
551 * The 3 EMBL sequences are added to the alignment dataset.
552 * Their dbrefs to Uniprot products P0CE19 and P0CE20 should be matched in the
553 * alignment dataset and updated to reference the original Uniprot sequences.
554 * For the second Uniprot sequence, the J03321 and X06707 xrefs should be
555 * resolved from the dataset, and only the X07547 dbref fetched.
556 * So the end state to verify is:
557 * - 4 cross-ref sequences returned: J03321, X06707, M19487, X07547
558 * - P0CE19/20 dbrefs to EMBL sequences now have mappings
559 * - J03321 dbrefs to P0CE19/20 mapped to original Uniprot sequences
560 * - X06707 dbrefs to P0CE19/20 mapped to original Uniprot sequences
563 @Test(groups = { "Functional" })
564 public void testFindXrefSequences_uniprotEmblManyToMany()
567 * Uniprot sequences, both with xrefs to EMBL|J03321
570 SequenceI p0ce19 = new Sequence("UNIPROT|P0CE19", "KPFG");
571 p0ce19.addDBRef(new DBRefEntry("EMBL", "0", "J03321"));
572 p0ce19.addDBRef(new DBRefEntry("EMBL", "0", "X06707"));
573 p0ce19.addDBRef(new DBRefEntry("EMBL", "0", "M19487"));
574 SequenceI p0ce20 = new Sequence("UNIPROT|P0CE20", "PFGK");
575 p0ce20.addDBRef(new DBRefEntry("EMBL", "0", "J03321"));
576 p0ce20.addDBRef(new DBRefEntry("EMBL", "0", "X06707"));
577 p0ce20.addDBRef(new DBRefEntry("EMBL", "0", "X07547"));
580 * EMBL sequences to be 'fetched', complete with dbrefs and mappings
581 * to their protein products (CDS location and translations are provided
582 * in EMBL XML); these should be matched to, and replaced with,
583 * the corresponding uniprot sequences after fetching
587 * J03321 with mappings to P0CE19 and P0CE20
589 final SequenceI j03321 = new Sequence("EMBL|J03321", "AAACCCTTTGGGAAAA");
590 DBRefEntry dbref1 = new DBRefEntry("UNIPROT", "0", "P0CE19");
591 MapList mapList = new MapList(new int[] { 1, 12 }, new int[] { 1, 4 },
// the mapped-to sequences are fresh Sequence objects, not the p0ce19/p0ce20 instances created above
593 Mapping map = new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"),
595 // add a dbref to the mapped to sequence - should get copied to p0ce19
596 map.getTo().addDBRef(new DBRefEntry("PIR", "0", "S01875"));
598 j03321.addDBRef(dbref1);
599 DBRefEntry dbref2 = new DBRefEntry("UNIPROT", "0", "P0CE20");
// mapList is reassigned here; copies of this second map (4-15 to 2-5) are used for dbref2, dbref5 and dbref6 below
600 mapList = new MapList(new int[] { 4, 15 }, new int[] { 2, 5 }, 3, 1);
601 dbref2.setMap(new Mapping(new Sequence("UNIPROT|P0CE20", "PFGK"),
602 new MapList(mapList)));
603 j03321.addDBRef(dbref2);
606 * X06707 with mappings to P0CE19 and P0CE20
608 final SequenceI x06707 = new Sequence("EMBL|X06707", "atgAAACCCTTTGGG");
609 DBRefEntry dbref3 = new DBRefEntry("UNIPROT", "0", "P0CE19");
610 MapList map2 = new MapList(new int[] { 4, 15 }, new int[] { 1, 4 }, 3,
612 dbref3.setMap(new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"), map2));
613 x06707.addDBRef(dbref3);
614 DBRefEntry dbref4 = new DBRefEntry("UNIPROT", "0", "P0CE20");
615 MapList map3 = new MapList(new int[] { 4, 15 }, new int[] { 1, 4 }, 3,
617 dbref4.setMap(new Mapping(new Sequence("UNIPROT|P0CE20", "PFGK"), map3));
618 x06707.addDBRef(dbref4);
621 * M19487 with mapping to P0CE19 and Q46432
623 final SequenceI m19487 = new Sequence("EMBL|M19487", "AAACCCTTTGGG");
624 DBRefEntry dbref5 = new DBRefEntry("UNIPROT", "0", "P0CE19");
625 dbref5.setMap(new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"),
626 new MapList(mapList)));
627 m19487.addDBRef(dbref5);
628 DBRefEntry dbref6 = new DBRefEntry("UNIPROT", "0", "Q46432");
629 dbref6.setMap(new Mapping(new Sequence("UNIPROT|Q46432", "KPFG"),
630 new MapList(mapList)));
631 m19487.addDBRef(dbref6);
634 * X07547 with mapping to P0CE20 and B0BCM4
636 final SequenceI x07547 = new Sequence("EMBL|X07547", "cccAAACCCTTTGGG");
637 DBRefEntry dbref7 = new DBRefEntry("UNIPROT", "0", "P0CE20");
638 dbref7.setMap(new Mapping(new Sequence("UNIPROT|P0CE20", "PFGK"),
640 x07547.addDBRef(dbref7);
641 DBRefEntry dbref8 = new DBRefEntry("UNIPROT", "0", "B0BCM4");
642 dbref8.setMap(new Mapping(new Sequence("UNIPROT|B0BCM4", "KPFG"),
644 x07547.addDBRef(dbref8);
647 * mock sequence fetcher to 'return' the EMBL sequences
648 * TODO: Mockito would allow .thenReturn().thenReturn() here,
649 * and also capture and verification of the parameters
650 * passed in calls to getSequences() - important to verify that
651 * duplicate sequence fetches are not requested
653 SequenceFetcher mockFetcher = new SequenceFetcher(false)
658 public boolean isFetchable(String source)
// the mock itself asserts how many sequences each fetch asks for: three on the first call, one on the second
664 public SequenceI[] getSequences(List<DBRefEntry> refs, boolean dna)
669 assertEquals("Expected 3 embl seqs in first fetch", 3,
671 return new SequenceI[] { j03321, x06707, m19487 };
675 assertEquals("Expected 1 embl seq in second fetch", 1,
677 return new SequenceI[] { x07547 };
681 SequenceFetcherFactory.setSequenceFetcher(mockFetcher);
684 * find EMBL xrefs for Uniprot seqs and verify that
685 * - the EMBL xref'd sequences are retrieved without duplicates
686 * - mappings are added to the Uniprot dbrefs
687 * - mappings in the EMBL-to-Uniprot dbrefs are updated to the
688 * alignment sequences
689 * - dbrefs on the EMBL sequences are added to the original dbrefs
691 SequenceI[] seqs = new SequenceI[] { p0ce19, p0ce20 };
692 AlignmentI al = new Alignment(seqs);
693 Alignment xrefs = new CrossRef(seqs, al).findXrefSequences("EMBL",
697 * verify retrieved sequences
699 assertNotNull(xrefs);
700 assertEquals(4, xrefs.getHeight());
701 assertSame(j03321, xrefs.getSequenceAt(0));
702 assertSame(x06707, xrefs.getSequenceAt(1));
703 assertSame(m19487, xrefs.getSequenceAt(2));
704 assertSame(x07547, xrefs.getSequenceAt(3));
707 * verify mappings added to Uniprot-to-EMBL dbrefs
// only the first two dbrefs (J03321, X06707) of each Uniprot sequence are checked in the visible assertions
709 Mapping mapping = p0ce19.getDBRefs()[0].getMap();
710 assertSame(j03321, mapping.getTo());
711 mapping = p0ce19.getDBRefs()[1].getMap();
712 assertSame(x06707, mapping.getTo());
713 mapping = p0ce20.getDBRefs()[0].getMap();
714 assertSame(j03321, mapping.getTo());
715 mapping = p0ce20.getDBRefs()[1].getMap();
716 assertSame(x06707, mapping.getTo());
719 * verify dbrefs on EMBL are mapped to alignment seqs
721 assertSame(p0ce19, j03321.getDBRefs()[0].getMap().getTo());
722 assertSame(p0ce20, j03321.getDBRefs()[1].getMap().getTo());
723 assertSame(p0ce19, x06707.getDBRefs()[0].getMap().getTo());
724 assertSame(p0ce20, x06707.getDBRefs()[1].getMap().getTo());
727 * verify new dbref on EMBL dbref mapping is copied to the
728 * original Uniprot sequence
// four = the three EMBL dbrefs added to p0ce19 at the top plus the PIR dbref put on the fetched mapping's target
730 assertEquals(4, p0ce19.getDBRefs().length);
731 assertEquals("PIR", p0ce19.getDBRefs()[3].getSource());
732 assertEquals("S01875", p0ce19.getDBRefs()[3].getAccessionId());
735 @Test(groups = "Functional")
736 public void testSameSequence()
738 assertTrue(CrossRef.sameSequence(null, null));
739 SequenceI seq1 = new Sequence("seq1", "ABCDEF");
740 assertFalse(CrossRef.sameSequence(seq1, null));
741 assertFalse(CrossRef.sameSequence(null, seq1));
742 assertTrue(CrossRef.sameSequence(seq1, new Sequence("seq2", "ABCDEF")));
743 assertTrue(CrossRef.sameSequence(seq1, new Sequence("seq2", "abcdef")));
745 .sameSequence(seq1, new Sequence("seq2", "ABCDE-F")));
746 assertFalse(CrossRef.sameSequence(seq1, new Sequence("seq2", "BCDEF")));