import static org.testng.AssertJUnit.assertEquals;
import static org.testng.AssertJUnit.assertFalse;
+import static org.testng.AssertJUnit.assertNotNull;
import static org.testng.AssertJUnit.assertNotSame;
import static org.testng.AssertJUnit.assertNull;
import static org.testng.AssertJUnit.assertSame;
* which may be direct (dbrefs on the sequence), or indirect (dbrefs on
* sequences which share a dbref with the sequence
*/
- @Test(groups = { "Functional" })
+ @Test(groups = { "Functional" }, enabled = true)
public void testFindXrefSourcesForSequence_proteinToDna()
{
SequenceI seq = new Sequence("Seq1", "MGKYQARLSS");
/*
* first with no dbrefs to search
*/
- CrossRef.findXrefSourcesForSequence(seq, false, al, sources);
+ sources = new CrossRef(new SequenceI[] { seq }, al)
+ .findXrefSourcesForSequences(false);
assertTrue(sources.isEmpty());
/*
seq.addDBRef(new DBRefEntry("GENEDB", "0", "E2348"));
seq.addDBRef(new DBRefEntry("ENSEMBL", "0", "E2349"));
seq.addDBRef(new DBRefEntry("ENSEMBLGENOMES", "0", "E2350"));
- CrossRef.findXrefSourcesForSequence(seq, false, al, sources);
+ sources = new CrossRef(new SequenceI[] { seq }, al)
+ .findXrefSourcesForSequences(false);
assertEquals(4, sources.size());
- assertEquals("[EMBL, EMBLCDS, GENEDB, ENSEMBL]",
- sources.toString());
+ assertEquals("[EMBL, EMBLCDS, GENEDB, ENSEMBL]", sources.toString());
/*
* add a sequence to the alignment which has a dbref to UNIPROT|A1234
seq2.addDBRef(new DBRefEntry("GENEDB", "0", "E2348"));
// TODO include ENSEMBLGENOMES in DBRefSource.DNACODINGDBS ?
al.addSequence(seq2);
- CrossRef.findXrefSourcesForSequence(seq, false, al, sources);
+ sources = new CrossRef(new SequenceI[] { seq, seq2 }, al)
+ .findXrefSourcesForSequences(false);
assertEquals(3, sources.size());
assertEquals("[EMBLCDS, EMBL, GENEDB]", sources.toString());
}
* xref is found - not on the nucleotide sequence but on a peptide sequence in
* the alignment which which it shares a nucleotide dbref
*/
- @Test(groups = { "Functional" })
+ @Test(groups = { "Functional" }, enabled = false)
public void testFindXrefSequences_indirectDbrefToProtein()
{
/*
* - but peptide with matching nucleotide dbref does, so is returned
*/
AlignmentI al = new Alignment(new SequenceI[] { emblSeq, uniprotSeq });
- Alignment xrefs = CrossRef.findXrefSequences(
- new SequenceI[] { emblSeq }, true, "UNIPROT", al);
+ Alignment xrefs = new CrossRef(new SequenceI[] { emblSeq }, al)
+ .findXrefSequences("UNIPROT", true);
assertEquals(1, xrefs.getHeight());
assertSame(uniprotSeq, xrefs.getSequenceAt(0));
}
* xref is found - not on the peptide sequence but on a nucleotide sequence in
* the alignment which which it shares a protein dbref
*/
- @Test(groups = { "Functional" })
+ @Test(groups = { "Functional" }, enabled = false)
public void testFindXrefSequences_indirectDbrefToNucleotide()
{
/*
* - but nucleotide with matching peptide dbref does, so is returned
*/
AlignmentI al = new Alignment(new SequenceI[] { emblSeq, uniprotSeq });
- Alignment xrefs = CrossRef.findXrefSequences(
- new SequenceI[] { uniprotSeq }, false, "EMBL", al);
+ Alignment xrefs = new CrossRef(new SequenceI[] { uniprotSeq },
+ al)
+ .findXrefSequences("EMBL", true);
assertEquals(1, xrefs.getHeight());
assertSame(emblSeq, xrefs.getSequenceAt(0));
}
* equatable to it, so no results found
*/
AlignmentI al = new Alignment(new SequenceI[] { dna1, dna2 });
- Alignment xrefs = CrossRef.findXrefSequences(new SequenceI[] { dna2 },
- true, "UNIPROT", al);
+ Alignment xrefs = new CrossRef(new SequenceI[] { dna2 }, al)
+ .findXrefSequences("UNIPROT", true);
assertNull(xrefs);
}
* Tests for the method that searches an alignment (with one sequence
* excluded) for protein/nucleotide sequences with a given cross-reference
*/
- @Test(groups = { "Functional" })
+ @Test(groups = { "Functional" }, enabled = false)
public void testSearchDataset()
{
/*
* first search for a dbref nowhere on the alignment:
*/
DBRefEntry dbref = new DBRefEntry("UNIPROT", "0", "P30419");
- boolean found = CrossRef.searchDataset(dna1, dbref, al, result, null,
- true, true);
+ CrossRef testee = new CrossRef(al.getSequencesArray(), al);
+ boolean found = testee.searchDataset(true, dna1, dbref, result, null,
+ true);
assertFalse(found);
assertTrue(result.isEmpty());
* search for a protein sequence with dbref UNIPROT:Q9ZTS2
*/
dbref = new DBRefEntry("UNIPROT", "0", "Q9ZTS2");
- found = CrossRef.searchDataset(dna1, dbref, al, result, null, true,
- true);
+ found = testee.searchDataset(true, dna1, dbref, result, null, true);
assertTrue(found);
assertEquals(1, result.size());
assertSame(pep1, result.get(0));
*/
result.clear();
dbref = new DBRefEntry("UNIPROT", "0", "Q9ZTS2");
- found = CrossRef.searchDataset(pep1, dbref, al, result, null, true,
- false);
+ found = testee.searchDataset(false, pep1, dbref, result, null, false);
assertTrue(found);
assertEquals(1, result.size());
assertSame(dna1, result.get(0));
* mapped sequences
*/
AlignmentI al = new Alignment(new SequenceI[] { dna1 });
- Alignment xrefs = CrossRef.findXrefSequences(new SequenceI[] { dna1 },
- true, "UNIPROT", al);
+ Alignment xrefs = new CrossRef(new SequenceI[] { dna1 }, al)
+ .findXrefSequences("UNIPROT", true);
assertEquals(2, xrefs.getHeight());
/*
final SequenceI pep1 = new Sequence("Q9ZTS2", "MYQLIRSSW");
final SequenceI pep2 = new Sequence("P00314", "MRKLLAASG");
- SequenceFetcher mockFetcher = new SequenceFetcher()
+ /*
+ * argument false suppresses adding DAS sources
+ * todo: define an interface type SequenceFetcherI and mock that
+ */
+ SequenceFetcher mockFetcher = new SequenceFetcher(false)
{
-
@Override
public boolean isFetchable(String source)
{
}
@Override
- public SequenceI[] getSequences(DBRefEntry[] refs, boolean dna)
+ public SequenceI[] getSequences(List<DBRefEntry> refs, boolean dna)
{
return new SequenceI[] { pep1, pep2 };
}
* find UNIPROT xrefs for nucleotide sequence
*/
AlignmentI al = new Alignment(new SequenceI[] { dna1 });
- Alignment xrefs = CrossRef.findXrefSequences(new SequenceI[] { dna1 },
- true, "UNIPROT", al);
+ Alignment xrefs = new CrossRef(new SequenceI[] { dna1 }, al)
+ .findXrefSequences("UNIPROT", true);
assertEquals(2, xrefs.getHeight());
assertSame(pep1, xrefs.getSequenceAt(0));
assertSame(pep2, xrefs.getSequenceAt(1));
SequenceFetcherFactory.setSequenceFetcher(null);
}
+ /**
+  * Test for finding 'product' sequences for the case where both gene and
+  * transcript sequences have dbrefs to Uniprot.
+  * <p>
+  * A mock SequenceFetcher is installed for the duration of the test and is
+  * always cleared again afterwards, so that it cannot leak into other tests.
+  */
+ @Test(groups = { "Functional" })
+ public void testFindXrefSequences_forGeneAndTranscripts()
+ {
+   /*
+    * 'gene' sequence
+    */
+   SequenceI gene = new Sequence("ENSG00000157764", "CGCCTCCCTTCCCC");
+   gene.addDBRef(new DBRefEntry("UNIPROT", "0", "P15056"));
+   gene.addDBRef(new DBRefEntry("UNIPROT", "0", "H7C5K3"));
+
+   /*
+    * 'transcript' with CDS feature (supports mapping to protein)
+    */
+   SequenceI braf001 = new Sequence("ENST00000288602", "taagATGGCGGCGCTGa");
+   braf001.addDBRef(new DBRefEntry("UNIPROT", "0", "P15056"));
+   braf001.addSequenceFeature(new SequenceFeature("CDS", "", 5, 16, 0f,
+           null));
+
+   /*
+    * 'spliced transcript' with CDS ranges
+    */
+   SequenceI braf002 = new Sequence("ENST00000497784", "gCAGGCtaTCTGTTCaa");
+   braf002.addDBRef(new DBRefEntry("UNIPROT", "0", "H7C5K3"));
+   braf002.addSequenceFeature(new SequenceFeature("CDS", "", 2, 6, 0f,
+           null));
+   braf002.addSequenceFeature(new SequenceFeature("CDS", "", 9, 15, 0f,
+           null));
+
+   /*
+    * TODO code is fragile - use of SequenceIdMatcher depends on fetched
+    * sequences having a name starting Source|Accession
+    * which happens to be true for Uniprot,PDB,EMBL but not Pfam,Rfam,Ensembl
+    */
+   final SequenceI pep1 = new Sequence("UNIPROT|P15056", "MAAL");
+   final SequenceI pep2 = new Sequence("UNIPROT|H7C5K3", "QALF");
+
+   /*
+    * argument false suppresses adding DAS sources
+    * todo: define an interface type SequenceFetcherI and mock that
+    */
+   SequenceFetcher mockFetcher = new SequenceFetcher(false)
+   {
+     @Override
+     public boolean isFetchable(String source)
+     {
+       return true;
+     }
+
+     @Override
+     public SequenceI[] getSequences(List<DBRefEntry> refs, boolean dna)
+     {
+       return new SequenceI[] { pep1, pep2 };
+     }
+   };
+   SequenceFetcherFactory.setSequenceFetcher(mockFetcher);
+
+   try
+   {
+     /*
+      * find UNIPROT xrefs for gene and transcripts
+      * verify that
+      * - the two proteins are retrieved but not duplicated
+      * TODO also verify (not yet asserted here) that
+      * - mappings are built from transcript (CDS) to proteins
+      * - no mappings from gene to proteins
+      */
+     SequenceI[] seqs = new SequenceI[] { gene, braf001, braf002 };
+     AlignmentI al = new Alignment(seqs);
+     Alignment xrefs = new CrossRef(seqs, al).findXrefSequences("UNIPROT",
+             true);
+     // fail with a clear assertion rather than a NullPointerException
+     assertNotNull(xrefs);
+     assertEquals(2, xrefs.getHeight());
+     assertSame(pep1, xrefs.getSequenceAt(0));
+     assertSame(pep2, xrefs.getSequenceAt(1));
+   } finally
+   {
+     /*
+      * clear the mock fetcher even if an assertion fails (same reset as
+      * the earlier mock-fetcher test in this class performs)
+      */
+     SequenceFetcherFactory.setSequenceFetcher(null);
+   }
+ }
+
+ /**
+  * <pre>
+  * Test that emulates this (real but simplified) case:
+  * Alignment:          DBrefs
+  *     UNIPROT|P0CE19  EMBL|J03321, EMBL|X06707, EMBL|M19487
+  *     UNIPROT|P0CE20  EMBL|J03321, EMBL|X06707, EMBL|X07547
+  * Find cross-references for EMBL. These are mocked here as
+  *     EMBL|J03321     with mappings to P0CE18, P0CE19, P0CE20
+  *     EMBL|X06707     with mappings to P0CE17, P0CE19, P0CE20
+  *     EMBL|M19487     with mappings to P0CE19, Q46432
+  *     EMBL|X07547     with mappings to P0CE20, B0BCM4
+  * EMBL sequences are first 'fetched' (mocked here) for P0CE19.
+  * The 3 EMBL sequences are added to the alignment dataset.
+  * Their dbrefs to Uniprot products P0CE19 and P0CE20 should be matched in the
+  * alignment dataset and updated to reference the original Uniprot sequences.
+  * For the second Uniprot sequence, the J03321 and X06707 xrefs should be
+  * resolved from the dataset, and only the X07547 dbref fetched.
+  * So the end state to verify is:
+  * - 4 cross-ref sequences returned: J03321, X06707, M19487, X07547
+  * - P0CE19/20 dbrefs to EMBL sequences now have mappings
+  * - J03321 dbrefs to P0CE19/20 mapped to original Uniprot sequences
+  * - X06707 dbrefs to P0CE19/20 mapped to original Uniprot sequences
+  * </pre>
+  *
+  * The mock SequenceFetcher installed by this test is always cleared again
+  * afterwards, so that it cannot leak into other tests.
+  */
+ @Test(groups = { "Functional" })
+ public void testFindXrefSequences_uniprotEmblManyToMany()
+ {
+   /*
+    * Uniprot sequences, both with xrefs to EMBL|J03321 and EMBL|X06707;
+    * P0CE19 additionally references EMBL|M19487,
+    * P0CE20 additionally references EMBL|X07547
+    */
+   SequenceI p0ce19 = new Sequence("UNIPROT|P0CE19", "KPFG");
+   p0ce19.addDBRef(new DBRefEntry("EMBL", "0", "J03321"));
+   p0ce19.addDBRef(new DBRefEntry("EMBL", "0", "X06707"));
+   p0ce19.addDBRef(new DBRefEntry("EMBL", "0", "M19487"));
+   SequenceI p0ce20 = new Sequence("UNIPROT|P0CE20", "PFGK");
+   p0ce20.addDBRef(new DBRefEntry("EMBL", "0", "J03321"));
+   p0ce20.addDBRef(new DBRefEntry("EMBL", "0", "X06707"));
+   p0ce20.addDBRef(new DBRefEntry("EMBL", "0", "X07547"));
+
+   /*
+    * EMBL sequences to be 'fetched', complete with dbrefs and mappings
+    * to their protein products (CDS location and translations are provided
+    * in EMBL XML); these should be matched to, and replaced with,
+    * the corresponding uniprot sequences after fetching
+    */
+
+   /*
+    * J03321 with mappings to P0CE19 and P0CE20
+    */
+   final SequenceI j03321 = new Sequence("EMBL|J03321", "AAACCCTTTGGGAAAA");
+   DBRefEntry dbref1 = new DBRefEntry("UNIPROT", "0", "P0CE19");
+   MapList mapList = new MapList(new int[] { 1, 12 }, new int[] { 1, 4 },
+           3, 1);
+   Mapping map = new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"), mapList);
+   // add a dbref to the mapped to sequence - should get copied to p0ce19
+   map.getTo().addDBRef(new DBRefEntry("PIR", "0", "S01875"));
+   dbref1.setMap(map);
+   j03321.addDBRef(dbref1);
+   DBRefEntry dbref2 = new DBRefEntry("UNIPROT", "0", "P0CE20");
+   // note: mapList is re-pointed here and reused (copied) for M19487 below
+   mapList = new MapList(new int[] { 4, 15 }, new int[] { 2, 5 }, 3, 1);
+   dbref2.setMap(new Mapping(new Sequence("UNIPROT|P0CE20", "PFGK"),
+           new MapList(mapList)));
+   j03321.addDBRef(dbref2);
+
+   /*
+    * X06707 with mappings to P0CE19 and P0CE20
+    */
+   final SequenceI x06707 = new Sequence("EMBL|X06707", "atgAAACCCTTTGGG");
+   DBRefEntry dbref3 = new DBRefEntry("UNIPROT", "0", "P0CE19");
+   MapList map2 = new MapList(new int[] { 4, 15 }, new int[] { 1, 4 }, 3,
+           1);
+   dbref3.setMap(new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"), map2));
+   x06707.addDBRef(dbref3);
+   DBRefEntry dbref4 = new DBRefEntry("UNIPROT", "0", "P0CE20");
+   MapList map3 = new MapList(new int[] { 4, 15 }, new int[] { 1, 4 }, 3,
+           1);
+   dbref4.setMap(new Mapping(new Sequence("UNIPROT|P0CE20", "PFGK"), map3));
+   x06707.addDBRef(dbref4);
+
+   /*
+    * M19487 with mapping to P0CE19 and Q46432
+    */
+   final SequenceI m19487 = new Sequence("EMBL|M19487", "AAACCCTTTGGG");
+   DBRefEntry dbref5 = new DBRefEntry("UNIPROT", "0", "P0CE19");
+   dbref5.setMap(new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"),
+           new MapList(mapList)));
+   m19487.addDBRef(dbref5);
+   DBRefEntry dbref6 = new DBRefEntry("UNIPROT", "0", "Q46432");
+   dbref6.setMap(new Mapping(new Sequence("UNIPROT|Q46432", "KPFG"),
+           new MapList(mapList)));
+   m19487.addDBRef(dbref6);
+
+   /*
+    * X07547 with mapping to P0CE20 and B0BCM4
+    */
+   final SequenceI x07547 = new Sequence("EMBL|X07547", "cccAAACCCTTTGGG");
+   DBRefEntry dbref7 = new DBRefEntry("UNIPROT", "0", "P0CE20");
+   // mapped-to sequence agrees with the dbref accession (P0CE20), as for
+   // the other P0CE20 mappings above
+   dbref7.setMap(new Mapping(new Sequence("UNIPROT|P0CE20", "PFGK"),
+           new MapList(map2)));
+   x07547.addDBRef(dbref7);
+   DBRefEntry dbref8 = new DBRefEntry("UNIPROT", "0", "B0BCM4");
+   dbref8.setMap(new Mapping(new Sequence("UNIPROT|B0BCM4", "KPFG"),
+           new MapList(map2)));
+   x07547.addDBRef(dbref8);
+
+   /*
+    * mock sequence fetcher to 'return' the EMBL sequences
+    * TODO: Mockito would allow .thenReturn().thenReturn() here,
+    * and also capture and verification of the parameters
+    * passed in calls to getSequences() - important to verify that
+    * duplicate sequence fetches are not requested
+    */
+   SequenceFetcher mockFetcher = new SequenceFetcher(false)
+   {
+     // counts invocations so the first and later fetches can differ
+     int call = 0;
+
+     @Override
+     public boolean isFetchable(String source)
+     {
+       return true;
+     }
+
+     @Override
+     public SequenceI[] getSequences(List<DBRefEntry> refs, boolean dna)
+     {
+       call++;
+       if (call == 1)
+       {
+         assertEquals("Expected 3 embl seqs in first fetch", 3, refs.size());
+         return new SequenceI[] { j03321, x06707, m19487 };
+       }
+       else
+       {
+         assertEquals("Expected 1 embl seq in second fetch", 1, refs.size());
+         return new SequenceI[] { x07547 };
+       }
+     }
+   };
+   SequenceFetcherFactory.setSequenceFetcher(mockFetcher);
+
+   try
+   {
+     /*
+      * find EMBL xrefs for Uniprot seqs and verify that
+      * - the EMBL xref'd sequences are retrieved without duplicates
+      * - mappings are added to the Uniprot dbrefs
+      * - mappings in the EMBL-to-Uniprot dbrefs are updated to the
+      *   alignment sequences
+      * - dbrefs on the EMBL sequences are added to the original dbrefs
+      */
+     SequenceI[] seqs = new SequenceI[] { p0ce19, p0ce20 };
+     AlignmentI al = new Alignment(seqs);
+     Alignment xrefs = new CrossRef(seqs, al).findXrefSequences("EMBL",
+             false);
+
+     /*
+      * verify retrieved sequences
+      */
+     assertNotNull(xrefs);
+     assertEquals(4, xrefs.getHeight());
+     assertSame(j03321, xrefs.getSequenceAt(0));
+     assertSame(x06707, xrefs.getSequenceAt(1));
+     assertSame(m19487, xrefs.getSequenceAt(2));
+     assertSame(x07547, xrefs.getSequenceAt(3));
+
+     /*
+      * verify mappings added to Uniprot-to-EMBL dbrefs
+      */
+     Mapping mapping = p0ce19.getDBRefs()[0].getMap();
+     assertSame(j03321, mapping.getTo());
+     mapping = p0ce19.getDBRefs()[1].getMap();
+     assertSame(x06707, mapping.getTo());
+     mapping = p0ce20.getDBRefs()[0].getMap();
+     assertSame(j03321, mapping.getTo());
+     mapping = p0ce20.getDBRefs()[1].getMap();
+     assertSame(x06707, mapping.getTo());
+
+     /*
+      * verify dbrefs on EMBL are mapped to alignment seqs
+      */
+     assertSame(p0ce19, j03321.getDBRefs()[0].getMap().getTo());
+     assertSame(p0ce20, j03321.getDBRefs()[1].getMap().getTo());
+     assertSame(p0ce19, x06707.getDBRefs()[0].getMap().getTo());
+     assertSame(p0ce20, x06707.getDBRefs()[1].getMap().getTo());
+
+     /*
+      * verify new dbref on EMBL dbref mapping is copied to the
+      * original Uniprot sequence
+      */
+     assertEquals(4, p0ce19.getDBRefs().length);
+     assertEquals("PIR", p0ce19.getDBRefs()[3].getSource());
+     assertEquals("S01875", p0ce19.getDBRefs()[3].getAccessionId());
+   } finally
+   {
+     /*
+      * clear the mock fetcher even if an assertion fails (same reset as
+      * the earlier mock-fetcher test in this class performs)
+      */
+     SequenceFetcherFactory.setSequenceFetcher(null);
+   }
+ }
+
+ /**
+  * Exercises CrossRef.sameSequence with null arguments, with sequences whose
+  * residues match exactly or differ only in case (expected true), and with
+  * sequences that contain a gap or are shorter (expected false).
+  */
+ @Test(groups = { "Functional" })
+ public void testSameSequence()
+ {
+   /*
+    * null handling: two nulls count as the same, one null does not
+    */
+   assertTrue(CrossRef.sameSequence(null, null));
+   final SequenceI reference = new Sequence("seq1", "ABCDEF");
+   assertFalse(CrossRef.sameSequence(reference, null));
+   assertFalse(CrossRef.sameSequence(null, reference));
+
+   /*
+    * same residues under a different name, in upper or lower case
+    */
+   final SequenceI upperCaseCopy = new Sequence("seq2", "ABCDEF");
+   assertTrue(CrossRef.sameSequence(reference, upperCaseCopy));
+   final SequenceI lowerCaseCopy = new Sequence("seq2", "abcdef");
+   assertTrue(CrossRef.sameSequence(reference, lowerCaseCopy));
+
+   /*
+    * gapped or truncated residues are not the same sequence
+    */
+   final SequenceI gapped = new Sequence("seq2", "ABCDE-F");
+   assertFalse(CrossRef.sameSequence(reference, gapped));
+   final SequenceI truncated = new Sequence("seq2", "BCDEF");
+   assertFalse(CrossRef.sameSequence(reference, truncated));
+ }
}