+/*
+ * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
+ * Copyright (C) $$Year-Rel$$ The Jalview Authors
+ *
+ * This file is part of Jalview.
+ *
+ * Jalview is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, either version 3
+ * of the License, or (at your option) any later version.
+ *
+ * Jalview is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
+ * The Jalview Authors are detailed in the 'AUTHORS' file.
+ */
package jalview.analysis;
import static org.testng.AssertJUnit.assertEquals;
+import static org.testng.AssertJUnit.assertFalse;
+import static org.testng.AssertJUnit.assertNotNull;
+import static org.testng.AssertJUnit.assertNotSame;
+import static org.testng.AssertJUnit.assertNull;
import static org.testng.AssertJUnit.assertSame;
+import static org.testng.AssertJUnit.assertTrue;
+import static org.testng.internal.junit.ArrayAsserts.assertArrayEquals;
+import jalview.datamodel.Alignment;
+import jalview.datamodel.AlignmentI;
import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.Mapping;
+import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+import jalview.util.DBRefUtils;
+import jalview.util.MapList;
+import jalview.ws.SequenceFetcher;
+import jalview.ws.SequenceFetcherFactory;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.testng.annotations.AfterClass;
import org.testng.annotations.Test;
public class CrossRefTest
DBRefEntry ref6 = new DBRefEntry("emblCDS", "1", "A123");
DBRefEntry ref7 = new DBRefEntry("GeneDB", "1", "A123");
DBRefEntry ref8 = new DBRefEntry("PFAM", "1", "A123");
+ // ENSEMBL is a source of either dna or protein sequence data
+ DBRefEntry ref9 = new DBRefEntry("ENSEMBL", "1", "A123");
DBRefEntry[] refs = new DBRefEntry[] { ref1, ref2, ref3, ref4, ref5,
- ref6, ref7, ref8 };
+ ref6, ref7, ref8, ref9 };
/*
* Just the DNA refs:
*/
- DBRefEntry[] found = CrossRef.findXDbRefs(false, refs);
- assertEquals(3, found.length);
+ DBRefEntry[] found = DBRefUtils.selectDbRefs(true, refs);
+ assertEquals(4, found.length);
assertSame(ref5, found[0]);
assertSame(ref6, found[1]);
assertSame(ref7, found[2]);
+ assertSame(ref9, found[3]);
/*
* Just the protein refs:
*/
- found = CrossRef.findXDbRefs(true, refs);
- assertEquals(4, found.length);
+ found = DBRefUtils.selectDbRefs(false, refs);
+ assertEquals(5, found.length);
assertSame(ref1, found[0]);
assertSame(ref2, found[1]);
assertSame(ref3, found[2]);
assertSame(ref4, found[3]);
+ assertSame(ref9, found[4]);
+ }
+
+  /**
+   * Tests the method that finds a sequence's "product" xref source databases,
+   * which may be direct (dbrefs on the sequence), or indirect (dbrefs on
+   * sequences which share a dbref with the sequence).
+   * <p>
+   * NB this test is currently disabled (enabled = false).
+   */
+  @Test(groups = { "Functional" }, enabled = false)
+  public void testFindXrefSourcesForSequence_proteinToDna()
+  {
+    SequenceI seq = new Sequence("Seq1", "MGKYQARLSS");
+    AlignmentI al = new Alignment(new SequenceI[] {});
+
+    /*
+     * first with no dbrefs to search
+     */
+    List<String> sources = new CrossRef(new SequenceI[] { seq }, al)
+            .findXrefSourcesForSequences();
+    assertTrue(sources.isEmpty());
+
+    /*
+     * add some dbrefs to sequence
+     */
+    // protein db is not a candidate for findXrefSources
+    seq.addDBRef(new DBRefEntry("UNIPROT", "0", "A1234"));
+    // dna coding databases are
+    seq.addDBRef(new DBRefEntry("EMBL", "0", "E2345"));
+    // a second EMBL xref should not result in a duplicate
+    seq.addDBRef(new DBRefEntry("EMBL", "0", "E2346"));
+    seq.addDBRef(new DBRefEntry("EMBLCDS", "0", "E2347"));
+    seq.addDBRef(new DBRefEntry("GENEDB", "0", "E2348"));
+    seq.addDBRef(new DBRefEntry("ENSEMBL", "0", "E2349"));
+    // ENSEMBLGENOMES is not among the sources expected below
+    // TODO include ENSEMBLGENOMES in DBRefSource.DNACODINGDBS ?
+    seq.addDBRef(new DBRefEntry("ENSEMBLGENOMES", "0", "E2350"));
+    sources = new CrossRef(new SequenceI[] { seq }, al)
+            .findXrefSourcesForSequences();
+    assertEquals(4, sources.size());
+    assertEquals("[EMBL, EMBLCDS, GENEDB, ENSEMBL]",
+            sources.toString());
+
+    /*
+     * add a sequence to the alignment which has a dbref to UNIPROT|A1234
+     * and others to dna coding databases
+     */
+    sources.clear();
+    seq.setDBRefs(null);
+    seq.addDBRef(new DBRefEntry("UNIPROT", "0", "A1234"));
+    seq.addDBRef(new DBRefEntry("EMBLCDS", "0", "E2347"));
+    SequenceI seq2 = new Sequence("Seq2", "MGKYQARLSS");
+    seq2.addDBRef(new DBRefEntry("UNIPROT", "0", "A1234"));
+    seq2.addDBRef(new DBRefEntry("EMBL", "0", "E2345"));
+    seq2.addDBRef(new DBRefEntry("GENEDB", "0", "E2348"));
+    al.addSequence(seq2);
+    sources = new CrossRef(new SequenceI[] { seq }, al)
+            .findXrefSourcesForSequences();
+    // direct source (EMBLCDS) first, then those found indirectly via seq2
+    assertEquals(3, sources.size());
+    assertEquals("[EMBLCDS, EMBL, GENEDB]", sources.toString());
+  }
+
+ /**
+ * Test for finding 'product' sequences for the case where only an indirect
+ * xref is found - not on the nucleotide sequence but on a peptide sequence in
+ * the alignment which which it shares a nucleotide dbref
+ */
+ @Test(groups = { "Functional" }, enabled = false)
+ public void testFindXrefSequences_indirectDbrefToProtein()
+ {
+ /*
+ * Alignment setup:
+ * - nucleotide dbref EMBL|AF039662
+ * - peptide dbrefs EMBL|AF039662, UNIPROT|Q9ZTS2
+ */
+ SequenceI emblSeq = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
+ emblSeq.addDBRef(new DBRefEntry("EMBL", "0", "AF039662"));
+ SequenceI uniprotSeq = new Sequence("Q9ZTS2", "MASVSATMISTS");
+ uniprotSeq.addDBRef(new DBRefEntry("EMBL", "0", "AF039662"));
+ uniprotSeq.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
+
+ /*
+ * Find UNIPROT xrefs for nucleotide
+ * - it has no UNIPROT dbref of its own
+ * - but peptide with matching nucleotide dbref does, so is returned
+ */
+ AlignmentI al = new Alignment(new SequenceI[] { emblSeq, uniprotSeq });
+ Alignment xrefs = new CrossRef(new SequenceI[] { emblSeq }, al)
+ .findXrefSequences("UNIPROT");
+ assertEquals(1, xrefs.getHeight());
+ assertSame(uniprotSeq, xrefs.getSequenceAt(0));
+ }
+
+ /**
+ * Test for finding 'product' sequences for the case where only an indirect
+ * xref is found - not on the peptide sequence but on a nucleotide sequence in
+ * the alignment which which it shares a protein dbref
+ */
+ @Test(groups = { "Functional" }, enabled = false)
+ public void testFindXrefSequences_indirectDbrefToNucleotide()
+ {
+ /*
+ * Alignment setup:
+ * - peptide dbref UNIPROT|Q9ZTS2
+ * - nucleotide dbref EMBL|AF039662, UNIPROT|Q9ZTS2
+ */
+ SequenceI uniprotSeq = new Sequence("Q9ZTS2", "MASVSATMISTS");
+ uniprotSeq.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
+ SequenceI emblSeq = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
+ emblSeq.addDBRef(new DBRefEntry("EMBL", "0", "AF039662"));
+ emblSeq.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
+
+ /*
+ * find EMBL xrefs for peptide sequence - it has no direct
+ * dbrefs, but the 'corresponding' nucleotide sequence does, so is returned
+ */
+ /*
+ * Find EMBL xrefs for peptide
+ * - it has no EMBL dbref of its own
+ * - but nucleotide with matching peptide dbref does, so is returned
+ */
+ AlignmentI al = new Alignment(new SequenceI[] { emblSeq, uniprotSeq });
+ Alignment xrefs = new CrossRef(new SequenceI[] { uniprotSeq },
+ al).findXrefSequences("EMBL");
+ assertEquals(1, xrefs.getHeight());
+ assertSame(emblSeq, xrefs.getSequenceAt(0));
+ }
+
+ /**
+ * Test for finding 'product' sequences for the case where the selected
+ * sequence has no dbref to the desired source, and there are no indirect
+ * references via another sequence in the alignment
+ */
+ @Test(groups = { "Functional" })
+ public void testFindXrefSequences_noDbrefs()
+ {
+ /*
+ * two nucleotide sequences, one with UNIPROT dbref
+ */
+ SequenceI dna1 = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
+ dna1.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
+ SequenceI dna2 = new Sequence("AJ307031", "AAACCCTTT");
+
+ /*
+ * find UNIPROT xrefs for peptide sequence - it has no direct
+ * dbrefs, and the other sequence (which has a UNIPROT dbref) is not
+ * equatable to it, so no results found
+ */
+ AlignmentI al = new Alignment(new SequenceI[] { dna1, dna2 });
+ Alignment xrefs = new CrossRef(new SequenceI[] { dna2 }, al)
+ .findXrefSequences("UNIPROT");
+ assertNull(xrefs);
+ }
+
+  /**
+   * Exercises the method that searches an alignment (with one sequence
+   * excluded) for protein/nucleotide sequences holding a given
+   * cross-reference. Currently disabled (enabled = false).
+   */
+  @Test(groups = { "Functional" }, enabled = false)
+  public void testSearchDataset()
+  {
+    // nucleotide sequence with both UNIPROT and EMBL dbrefs
+    SequenceI nucleotide = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
+    nucleotide.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
+    nucleotide.addDBRef(new DBRefEntry("EMBL", "0", "AF039662"));
+
+    // peptide sequence with the same UNIPROT dbref
+    SequenceI peptide = new Sequence("Q9ZTS2", "MLAVSRGQ");
+    peptide.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
+
+    AlignmentI alignment = new Alignment(new SequenceI[] { nucleotide,
+        peptide });
+    List<SequenceI> matches = new ArrayList<SequenceI>();
+    CrossRef testee = new CrossRef(alignment.getSequencesArray(), alignment);
+
+    /*
+     * case 1: a dbref that no sequence in the alignment holds
+     */
+    DBRefEntry absent = new DBRefEntry("UNIPROT", "0", "P30419");
+    boolean found = testee.searchDataset(nucleotide, absent, matches, null,
+            true);
+    assertFalse(found);
+    assertTrue(matches.isEmpty());
+
+    // TODO we are setting direct=true here but it is set to
+    // false in Jalview code...
+
+    /*
+     * case 2: from the nucleotide, find the protein holding UNIPROT:Q9ZTS2
+     */
+    DBRefEntry shared = new DBRefEntry("UNIPROT", "0", "Q9ZTS2");
+    found = testee.searchDataset(nucleotide, shared, matches, null, true);
+    assertTrue(found);
+    assertEquals(1, matches.size());
+    assertSame(peptide, matches.get(0));
+
+    /*
+     * case 3: from the peptide, find the nucleotide holding UNIPROT:Q9ZTS2
+     */
+    matches.clear();
+    shared = new DBRefEntry("UNIPROT", "0", "Q9ZTS2");
+    found = testee.searchDataset(peptide, shared, matches, null, false);
+    assertTrue(found);
+    assertEquals(1, matches.size());
+    assertSame(nucleotide, matches.get(0));
+  }
+
+ /**
+ * Test for finding 'product' sequences for the case where the selected
+ * sequence has a dbref with a mapping to a sequence
+ */
+ @Test(groups = { "Functional" })
+ public void testFindXrefSequences_fromDbRefMap()
+ {
+ /*
+ * two peptide sequences each with a DBRef and SequenceFeature
+ */
+ SequenceI pep1 = new Sequence("Q9ZTS2", "MALFQRSV");
+ pep1.addDBRef(new DBRefEntry("Pfam", "0", "PF00111"));
+ pep1.addSequenceFeature(new SequenceFeature("type", "desc", 12, 14, 1f,
+ "group"));
+ SequenceI pep2 = new Sequence("P30419", "MTRRSQIF");
+ pep2.addDBRef(new DBRefEntry("PDB", "0", "3JTK"));
+ pep2.addSequenceFeature(new SequenceFeature("type2", "desc2", 13, 15,
+ 12f, "group2"));
+
+ /*
+ * nucleotide sequence (to go in the alignment)
+ */
+ SequenceI dna1 = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
+
+ /*
+ * add DBRefEntry's to dna1 with mappings from dna to both peptides
+ */
+ MapList mapList = new MapList(new int[] { 1, 24 }, new int[] { 1, 3 },
+ 3, 1);
+ Mapping map = new Mapping(pep1, mapList);
+ DBRefEntry dbRef1 = new DBRefEntry("UNIPROT", "0", "Q9ZTS2", map);
+ dna1.addDBRef(dbRef1);
+ mapList = new MapList(new int[] { 1, 24 }, new int[] { 1, 3 }, 3, 1);
+ map = new Mapping(pep2, mapList);
+ DBRefEntry dbRef2 = new DBRefEntry("UNIPROT", "0", "P30419", map);
+ dna1.addDBRef(dbRef2);
+
+ /*
+ * find UNIPROT xrefs for nucleotide sequence - it should pick up
+ * mapped sequences
+ */
+ AlignmentI al = new Alignment(new SequenceI[] { dna1 });
+ Alignment xrefs = new CrossRef(new SequenceI[] { dna1 }, al)
+ .findXrefSequences("UNIPROT");
+ assertEquals(2, xrefs.getHeight());
+
+ /*
+ * cross-refs alignment holds copies of the mapped sequences
+ * including copies of their dbrefs and features
+ */
+ checkCopySequence(pep1, xrefs.getSequenceAt(0));
+ checkCopySequence(pep2, xrefs.getSequenceAt(1));
+ }
+
+  /**
+   * Helper method to assert that seq2 looks like a copy of seq1: a different
+   * object with the same name, start, end and sequence string, and with
+   * dbrefs and features that are equal to, but not the same objects as,
+   * those of seq1.
+   * <p>
+   * Null-safe with respect to dbrefs and features: if both sequences report
+   * none (null or empty array) the corresponding 'is a copy' check is
+   * skipped rather than failing with a NullPointerException.
+   * 
+   * @param seq1
+   *          the original (expected) sequence
+   * @param seq2
+   *          the candidate copy (actual) sequence
+   */
+  private void checkCopySequence(SequenceI seq1, SequenceI seq2)
+  {
+    assertNotSame(seq1, seq2);
+    assertEquals(seq1.getName(), seq2.getName());
+    assertEquals(seq1.getStart(), seq2.getStart());
+    assertEquals(seq1.getEnd(), seq2.getEnd());
+    assertEquals(seq1.getSequenceAsString(), seq2.getSequenceAsString());
+
+    /*
+     * compare dbrefs
+     */
+    DBRefEntry[] expectedRefs = seq1.getDBRefs();
+    DBRefEntry[] actualRefs = seq2.getDBRefs();
+    assertArrayEquals(expectedRefs, actualRefs);
+    // check one to verify a copy, not the same object
+    // (arrays are known to match here, so guarding on one is sufficient)
+    if (expectedRefs != null && expectedRefs.length > 0)
+    {
+      assertNotSame(expectedRefs[0], actualRefs[0]);
+    }
+
+    /*
+     * compare features
+     */
+    SequenceFeature[] expectedFeatures = seq1.getSequenceFeatures();
+    SequenceFeature[] actualFeatures = seq2.getSequenceFeatures();
+    assertArrayEquals(expectedFeatures, actualFeatures);
+    if (expectedFeatures != null && expectedFeatures.length > 0)
+    {
+      assertNotSame(expectedFeatures[0], actualFeatures[0]);
+    }
+  }
+
+ /**
+ * Test for finding 'product' sequences for the case where the selected
+ * sequence has a dbref with no mapping, triggering a fetch from database
+ */
+ @Test(groups = { "Functional" })
+ public void testFindXrefSequences_withFetch()
+ {
+ SequenceI dna1 = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
+ dna1.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
+ dna1.addDBRef(new DBRefEntry("UNIPROT", "0", "P30419"));
+ dna1.addDBRef(new DBRefEntry("UNIPROT", "0", "P00314"));
+ final SequenceI pep1 = new Sequence("Q9ZTS2", "MYQLIRSSW");
+ final SequenceI pep2 = new Sequence("P00314", "MRKLLAASG");
+
+ /*
+ * argument false suppresses adding DAS sources
+ * todo: define an interface type SequenceFetcherI and mock that
+ */
+ SequenceFetcher mockFetcher = new SequenceFetcher(false)
+ {
+ @Override
+ public boolean isFetchable(String source)
+ {
+ return true;
+ }
+
+ @Override
+ public SequenceI[] getSequences(List<DBRefEntry> refs, boolean dna)
+ {
+ return new SequenceI[] { pep1, pep2 };
+ }
+ };
+ SequenceFetcherFactory.setSequenceFetcher(mockFetcher);
+
+ /*
+ * find UNIPROT xrefs for nucleotide sequence
+ */
+ AlignmentI al = new Alignment(new SequenceI[] { dna1 });
+ Alignment xrefs = new CrossRef(new SequenceI[] { dna1 }, al)
+ .findXrefSequences("UNIPROT");
+ assertEquals(2, xrefs.getHeight());
+ assertSame(pep1, xrefs.getSequenceAt(0));
+ assertSame(pep2, xrefs.getSequenceAt(1));
+ }
+
+  /**
+   * Resets the sequence fetcher held by SequenceFetcherFactory, so that a
+   * mock fetcher installed by tests in this class is not left in place for
+   * other test classes. alwaysRun ensures this clean-up also happens when
+   * tests are run filtered by group or an earlier method has failed.
+   */
+  @AfterClass(alwaysRun = true)
+  public void tearDown()
+  {
+    SequenceFetcherFactory.setSequenceFetcher(null);
  }
+ /**
+ * Test for finding 'product' sequences for the case where both gene and
+ * transcript sequences have dbrefs to Uniprot.
+ */
+ @Test(groups = { "Functional" })
+ public void testFindXrefSequences_forGeneAndTranscripts()
+ {
+ /*
+ * 'gene' sequence
+ */
+ SequenceI gene = new Sequence("ENSG00000157764", "CGCCTCCCTTCCCC");
+ gene.addDBRef(new DBRefEntry("UNIPROT", "0", "P15056"));
+ gene.addDBRef(new DBRefEntry("UNIPROT", "0", "H7C5K3"));
+
+ /*
+ * 'transcript' with CDS feature (supports mapping to protein)
+ */
+ SequenceI braf001 = new Sequence("ENST00000288602", "taagATGGCGGCGCTGa");
+ braf001.addDBRef(new DBRefEntry("UNIPROT", "0", "P15056"));
+ braf001.addSequenceFeature(new SequenceFeature("CDS", "", 5, 16, 0f,
+ null));
+
+ /*
+ * 'spliced transcript' with CDS ranges
+ */
+ SequenceI braf002 = new Sequence("ENST00000497784", "gCAGGCtaTCTGTTCaa");
+ braf002.addDBRef(new DBRefEntry("UNIPROT", "0", "H7C5K3"));
+ braf002.addSequenceFeature(new SequenceFeature("CDS", "", 2, 6, 0f,
+ null));
+ braf002.addSequenceFeature(new SequenceFeature("CDS", "", 9, 15, 0f,
+ null));
+
+ /*
+ * TODO code is fragile - use of SequenceIdMatcher depends on fetched
+ * sequences having a name starting Source|Accession
+ * which happens to be true for Uniprot,PDB,EMBL but not Pfam,Rfam,Ensembl
+ */
+ final SequenceI pep1 = new Sequence("UNIPROT|P15056", "MAAL");
+ final SequenceI pep2 = new Sequence("UNIPROT|H7C5K3", "QALF");
+
+ /*
+ * argument false suppresses adding DAS sources
+ * todo: define an interface type SequenceFetcherI and mock that
+ */
+ SequenceFetcher mockFetcher = new SequenceFetcher(false)
+ {
+ @Override
+ public boolean isFetchable(String source)
+ {
+ return true;
+ }
+
+ @Override
+ public SequenceI[] getSequences(List<DBRefEntry> refs, boolean dna)
+ {
+ return new SequenceI[] { pep1, pep2 };
+ }
+ };
+ SequenceFetcherFactory.setSequenceFetcher(mockFetcher);
+
+ /*
+ * find UNIPROT xrefs for gene and transcripts
+ * verify that
+ * - the two proteins are retrieved but not duplicated
+ * - mappings are built from transcript (CDS) to proteins
+ * - no mappings from gene to proteins
+ */
+ SequenceI[] seqs = new SequenceI[] { gene, braf001, braf002 };
+ AlignmentI al = new Alignment(seqs);
+ Alignment xrefs = new CrossRef(seqs, al)
+ .findXrefSequences("UNIPROT");
+ assertEquals(2, xrefs.getHeight());
+ assertSame(pep1, xrefs.getSequenceAt(0));
+ assertSame(pep2, xrefs.getSequenceAt(1));
+ }
+
+  /**
+   * <pre>
+   * Test that emulates this (real but simplified) case:
+   * Alignment:          DBrefs
+   *     UNIPROT|P0CE19  EMBL|J03321, EMBL|X06707, EMBL|M19487
+   *     UNIPROT|P0CE20  EMBL|J03321, EMBL|X06707, EMBL|X07547
+   * Find cross-references for EMBL. These are mocked here as
+   *     EMBL|J03321     with mappings to P0CE18, P0CE19, P0CE20
+   *     EMBL|X06707     with mappings to P0CE17, P0CE19, P0CE20
+   *     EMBL|M19487     with mappings to P0CE19, Q46432
+   *     EMBL|X07547     with mappings to P0CE20, B0BCM4
+   * EMBL sequences are first 'fetched' (mocked here) for P0CE19.
+   * The 3 EMBL sequences are added to the alignment dataset.
+   * Their dbrefs to Uniprot products P0CE19 and P0CE20 should be matched in the
+   * alignment dataset and updated to reference the original Uniprot sequences.
+   * For the second Uniprot sequence, the J03321 and X06707 xrefs should be 
+   * resolved from the dataset, and only the X07547 dbref fetched.
+   * So the end state to verify is:
+   * - 4 cross-ref sequences returned: J03321, X06707, M19487, X07547
+   * - P0CE19/20 dbrefs to EMBL sequences now have mappings
+   * - J03321 dbrefs to P0CE19/20 mapped to original Uniprot sequences
+   * - X06707 dbrefs to P0CE19/20 mapped to original Uniprot sequences
+   * </pre>
+   */
+  @Test(groups = { "Functional" })
+  public void testFindXrefSequences_uniprotEmblManyToMany()
+  {
+    /*
+     * Uniprot sequences, both with xrefs to EMBL|J03321 
+     * and EMBL|X06707 (plus one further EMBL xref each)
+     */
+    SequenceI p0ce19 = new Sequence("UNIPROT|P0CE19", "KPFG");
+    p0ce19.addDBRef(new DBRefEntry("EMBL", "0", "J03321"));
+    p0ce19.addDBRef(new DBRefEntry("EMBL", "0", "X06707"));
+    p0ce19.addDBRef(new DBRefEntry("EMBL", "0", "M19487"));
+    SequenceI p0ce20 = new Sequence("UNIPROT|P0CE20", "PFGK");
+    p0ce20.addDBRef(new DBRefEntry("EMBL", "0", "J03321"));
+    p0ce20.addDBRef(new DBRefEntry("EMBL", "0", "X06707"));
+    p0ce20.addDBRef(new DBRefEntry("EMBL", "0", "X07547"));
+
+    /*
+     * EMBL sequences to be 'fetched', complete with dbrefs and mappings
+     * to their protein products (CDS location  and translations  are provided
+     * in EMBL XML); these should be matched to, and replaced with,
+     * the corresponding uniprot sequences after fetching
+     */
+
+    /*
+     * J03321 with mappings to P0CE19 and P0CE20
+     */
+    final SequenceI j03321 = new Sequence("EMBL|J03321", "AAACCCTTTGGGAAAA");
+    DBRefEntry dbref1 = new DBRefEntry("UNIPROT", "0", "P0CE19");
+    MapList mapList = new MapList(new int[] { 1, 12 }, new int[] { 1, 4 },
+            3, 1);
+    Mapping map = new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"), mapList);
+    // add a dbref to the mapped to sequence - should get copied to p0ce19
+    map.getTo().addDBRef(new DBRefEntry("PIR", "0", "S01875"));
+    dbref1.setMap(map);
+    j03321.addDBRef(dbref1);
+    DBRefEntry dbref2 = new DBRefEntry("UNIPROT", "0", "P0CE20");
+    mapList = new MapList(new int[] { 4, 15 }, new int[] { 2, 5 }, 3, 1);
+    dbref2.setMap(new Mapping(new Sequence("UNIPROT|P0CE20", "PFGK"),
+            new MapList(mapList)));
+    j03321.addDBRef(dbref2);
+
+    /*
+     * X06707 with mappings to P0CE19 and P0CE20
+     */
+    final SequenceI x06707 = new Sequence("EMBL|X06707", "atgAAACCCTTTGGG");
+    DBRefEntry dbref3 = new DBRefEntry("UNIPROT", "0", "P0CE19");
+    MapList map2 = new MapList(new int[] { 4, 15 }, new int[] { 1, 4 }, 3,
+            1);
+    dbref3.setMap(new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"), map2));
+    x06707.addDBRef(dbref3);
+    DBRefEntry dbref4 = new DBRefEntry("UNIPROT", "0", "P0CE20");
+    MapList map3 = new MapList(new int[] { 4, 15 }, new int[] { 1, 4 }, 3,
+            1);
+    dbref4.setMap(new Mapping(new Sequence("UNIPROT|P0CE20", "PFGK"), map3));
+    x06707.addDBRef(dbref4);
+
+    /*
+     * M19487 with mapping to P0CE19 and Q46432
+     */
+    final SequenceI m19487 = new Sequence("EMBL|M19487", "AAACCCTTTGGG");
+    DBRefEntry dbref5 = new DBRefEntry("UNIPROT", "0", "P0CE19");
+    dbref5.setMap(new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"),
+            new MapList(mapList)));
+    m19487.addDBRef(dbref5);
+    DBRefEntry dbref6 = new DBRefEntry("UNIPROT", "0", "Q46432");
+    dbref6.setMap(new Mapping(new Sequence("UNIPROT|Q46432", "KPFG"),
+            new MapList(mapList)));
+    m19487.addDBRef(dbref6);
+
+    /*
+     * X07547 with mapping to P0CE20 and B0BCM4
+     */
+    final SequenceI x07547 = new Sequence("EMBL|X07547", "cccAAACCCTTTGGG");
+    DBRefEntry dbref7 = new DBRefEntry("UNIPROT", "0", "P0CE20");
+    // mapped-to sequence must be the P0CE20 product named by the dbref
+    // (was a copy-paste of the P0CE19 fixture)
+    dbref7.setMap(new Mapping(new Sequence("UNIPROT|P0CE20", "PFGK"),
+            new MapList(map2)));
+    x07547.addDBRef(dbref7);
+    DBRefEntry dbref8 = new DBRefEntry("UNIPROT", "0", "B0BCM4");
+    dbref8.setMap(new Mapping(new Sequence("UNIPROT|B0BCM4", "KPFG"),
+            new MapList(map2)));
+    x07547.addDBRef(dbref8);
+
+    /*
+     * mock sequence fetcher to 'return' the EMBL sequences
+     * TODO: Mockito would allow .thenReturn().thenReturn() here, 
+     * and also capture and verification of the parameters
+     * passed in calls to getSequences() - important to verify that
+     * duplicate sequence fetches are not requested
+     */
+    SequenceFetcher mockFetcher = new SequenceFetcher(false)
+    {
+      // counts calls to getSequences, to vary the canned response
+      int call = 0;
+
+      @Override
+      public boolean isFetchable(String source)
+      {
+        return true;
+      }
+
+      @Override
+      public SequenceI[] getSequences(List<DBRefEntry> refs, boolean dna)
+      {
+        call++;
+        if (call == 1)
+        {
+          assertEquals("Expected 3 embl seqs in first fetch", 3, refs.size());
+          return new SequenceI[] { j03321, x06707, m19487 };
+        }
+        else
+        {
+          assertEquals("Expected 1 embl seq in second fetch", 1, refs.size());
+          return new SequenceI[] { x07547 };
+        }
+      }
+    };
+    SequenceFetcherFactory.setSequenceFetcher(mockFetcher);
+
+    /*
+     * find EMBL xrefs for Uniprot seqs and verify that
+     * - the EMBL xref'd sequences are retrieved without duplicates
+     * - mappings are added to the Uniprot dbrefs
+     * - mappings in the EMBL-to-Uniprot dbrefs are updated to the 
+     *   alignment sequences
+     * - dbrefs on the EMBL sequences are added to the original dbrefs
+     */
+    SequenceI[] seqs = new SequenceI[] { p0ce19, p0ce20 };
+    AlignmentI al = new Alignment(seqs);
+    Alignment xrefs = new CrossRef(seqs, al)
+            .findXrefSequences("EMBL");
+
+    /*
+     * verify retrieved sequences
+     */
+    assertNotNull(xrefs);
+    assertEquals(4, xrefs.getHeight());
+    assertSame(j03321, xrefs.getSequenceAt(0));
+    assertSame(x06707, xrefs.getSequenceAt(1));
+    assertSame(m19487, xrefs.getSequenceAt(2));
+    assertSame(x07547, xrefs.getSequenceAt(3));
+
+    /*
+     * verify mappings added to Uniprot-to-EMBL dbrefs
+     */
+    Mapping mapping = p0ce19.getDBRefs()[0].getMap();
+    assertSame(j03321, mapping.getTo());
+    mapping = p0ce19.getDBRefs()[1].getMap();
+    assertSame(x06707, mapping.getTo());
+    mapping = p0ce20.getDBRefs()[0].getMap();
+    assertSame(j03321, mapping.getTo());
+    mapping = p0ce20.getDBRefs()[1].getMap();
+    assertSame(x06707, mapping.getTo());
+
+    /*
+     * verify dbrefs on EMBL are mapped to alignment seqs
+     */
+    assertSame(p0ce19, j03321.getDBRefs()[0].getMap().getTo());
+    assertSame(p0ce20, j03321.getDBRefs()[1].getMap().getTo());
+    assertSame(p0ce19, x06707.getDBRefs()[0].getMap().getTo());
+    assertSame(p0ce20, x06707.getDBRefs()[1].getMap().getTo());
+
+    /*
+     * verify new dbref on EMBL dbref mapping is copied to the
+     * original Uniprot sequence
+     */
+    assertEquals(4, p0ce19.getDBRefs().length);
+    assertEquals("PIR", p0ce19.getDBRefs()[3].getSource());
+    assertEquals("S01875", p0ce19.getDBRefs()[3].getAccessionId());
+  }
+
+  /**
+   * Checks CrossRef.sameSequence for null arguments, and for sequences whose
+   * residues are identical, differ only in case, contain a gap, or are
+   * shorter than the reference.
+   */
+  @Test(groups = "Functional")
+  public void testSameSequence()
+  {
+    // two nulls count as the same
+    assertTrue(CrossRef.sameSequence(null, null));
+
+    // a sequence is never the same as null, whichever way round
+    SequenceI reference = new Sequence("seq1", "ABCDEF");
+    assertFalse(CrossRef.sameSequence(reference, null));
+    assertFalse(CrossRef.sameSequence(null, reference));
+
+    // name is irrelevant; residues match, also when case differs
+    SequenceI sameResidues = new Sequence("seq2", "ABCDEF");
+    assertTrue(CrossRef.sameSequence(reference, sameResidues));
+    SequenceI lowerCase = new Sequence("seq2", "abcdef");
+    assertTrue(CrossRef.sameSequence(reference, lowerCase));
+
+    // a gapped or truncated sequence does not match
+    SequenceI gapped = new Sequence("seq2", "ABCDE-F");
+    assertFalse(CrossRef.sameSequence(reference, gapped));
+    SequenceI truncated = new Sequence("seq2", "BCDEF");
+    assertFalse(CrossRef.sameSequence(reference, truncated));
+  }
}