+ found = DBRefUtils.selectDbRefs(false, refs);
+ assertEquals(4, found.size());
+ assertSame(ref1, found.get(0));
+ assertSame(ref2, found.get(1));
+ assertSame(ref4, found.get(2));
+ assertSame(ref9, found.get(3));
+ }
+
+ /**
+ * Test the method that finds a sequence's "product" xref source databases,
+ * which may be direct (dbrefs on the sequence), or indirect (dbrefs on
+ * sequences which share a dbref with the sequence
+ */
+ @Test(groups = { "Functional" }, enabled = true)
+ public void testFindXrefSourcesForSequence_proteinToDna()
+ {
+ SequenceI seq = new Sequence("Seq1", "MGKYQARLSS");
+ List<String> sources = new ArrayList<>();
+ AlignmentI al = new Alignment(new SequenceI[] {});
+
+ /*
+ * first with no dbrefs to search
+ */
+ sources = new CrossRef(new SequenceI[] { seq }, al)
+ .findXrefSourcesForSequences(false);
+ assertTrue(sources.isEmpty());
+
+ /*
+ * add some dbrefs to sequence
+ */
+ // protein db is not a candidate for findXrefSources
+ seq.addDBRef(new DBRefEntry("UNIPROT", "0", "A1234"));
+ // dna coding databatases are
+ seq.addDBRef(new DBRefEntry("EMBL", "0", "E2345"));
+ // a second EMBL xref should not result in a duplicate
+ seq.addDBRef(new DBRefEntry("EMBL", "0", "E2346"));
+ seq.addDBRef(new DBRefEntry("EMBLCDS", "0", "E2347"));
+ seq.addDBRef(new DBRefEntry("GENEDB", "0", "E2348"));
+ seq.addDBRef(new DBRefEntry("ENSEMBL", "0", "E2349"));
+ seq.addDBRef(new DBRefEntry("ENSEMBLGENOMES", "0", "E2350"));
+ sources = new CrossRef(new SequenceI[] { seq }, al)
+ .findXrefSourcesForSequences(false);
+ // method is patched to remove EMBL from the sources to match
+ assertEquals(4, sources.size());
+ assertEquals("[EMBLCDS, GENEDB, ENSEMBL, ENSEMBLGENOMES]",
+ sources.toString());
+
+ /*
+ * add a sequence to the alignment which has a dbref to UNIPROT|A1234
+ * and others to dna coding databases
+ */
+ sources.clear();
+ seq.setDBRefs(null);
+ seq.addDBRef(new DBRefEntry("UNIPROT", "0", "A1234"));
+ seq.addDBRef(new DBRefEntry("EMBLCDS", "0", "E2347"));
+ SequenceI seq2 = new Sequence("Seq2", "MGKYQARLSS");
+ seq2.addDBRef(new DBRefEntry("UNIPROT", "0", "A1234"));
+ seq2.addDBRef(new DBRefEntry("EMBL", "0", "E2345"));
+ seq2.addDBRef(new DBRefEntry("GENEDB", "0", "E2348"));
+ // TODO include ENSEMBLGENOMES in DBRefSource.DNACODINGDBS ?
+ al.addSequence(seq2);
+ sources = new CrossRef(new SequenceI[] { seq, seq2 }, al)
+ .findXrefSourcesForSequences(false);
+ // method removed EMBL from sources to match
+ assertEquals(2, sources.size());
+ assertEquals("[EMBLCDS, GENEDB]", sources.toString());
+ }
+
+ /**
+ * Test for finding 'product' sequences for the case where only an indirect
+ * xref is found - not on the nucleotide sequence but on a peptide sequence in
+ * the alignment which which it shares a nucleotide dbref
+ */
+ @Test(groups = { "Functional" }, enabled = true)
+ public void testFindXrefSequences_indirectDbrefToProtein()
+ {
+ /*
+ * Alignment setup:
+ * - nucleotide dbref EMBL|AF039662
+ * - peptide dbrefs EMBL|AF039662, UNIPROT|Q9ZTS2
+ */
+ SequenceI emblSeq = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
+ emblSeq.addDBRef(new DBRefEntry("EMBL", "0", "AF039662"));
+ SequenceI uniprotSeq = new Sequence("Q9ZTS2", "MASVSATMISTS");
+ uniprotSeq.addDBRef(new DBRefEntry("EMBL", "0", "AF039662"));
+ uniprotSeq.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
+
+ /*
+ * Find UNIPROT xrefs for nucleotide
+ * - it has no UNIPROT dbref of its own
+ * - but peptide with matching nucleotide dbref does, so is returned
+ */
+ AlignmentI al = new Alignment(new SequenceI[] { emblSeq, uniprotSeq });
+ Alignment xrefs = new CrossRef(new SequenceI[] { emblSeq }, al)
+ .findXrefSequences("UNIPROT", true);
+ assertEquals(1, xrefs.getHeight());
+ assertSame(uniprotSeq, xrefs.getSequenceAt(0));
+ }
+
+ /**
+ * Test for finding 'product' sequences for the case where only an indirect
+ * xref is found - not on the peptide sequence but on a nucleotide sequence in
+ * the alignment which which it shares a protein dbref
+ */
+ @Test(groups = { "Functional" }, enabled = true)
+ public void testFindXrefSequences_indirectDbrefToNucleotide()
+ {
+ /*
+ * Alignment setup:
+ * - peptide dbref UNIPROT|Q9ZTS2
+ * - nucleotide dbref EMBL|AF039662, UNIPROT|Q9ZTS2
+ */
+ SequenceI uniprotSeq = new Sequence("Q9ZTS2", "MASVSATMISTS");
+ uniprotSeq.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
+ SequenceI emblSeq = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
+ emblSeq.addDBRef(new DBRefEntry("EMBL", "0", "AF039662"));
+ emblSeq.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
+
+ /*
+ * find EMBL xrefs for peptide sequence - it has no direct
+ * dbrefs, but the 'corresponding' nucleotide sequence does, so is returned
+ */
+ /*
+ * Find EMBL xrefs for peptide
+ * - it has no EMBL dbref of its own
+ * - but nucleotide with matching peptide dbref does, so is returned
+ */
+ AlignmentI al = new Alignment(new SequenceI[] { emblSeq, uniprotSeq });
+ Alignment xrefs = new CrossRef(new SequenceI[] { uniprotSeq }, al)
+ .findXrefSequences("EMBL", false);
+ assertEquals(1, xrefs.getHeight());
+ assertSame(emblSeq, xrefs.getSequenceAt(0));
+ }
+
+ /**
+ * Test for finding 'product' sequences for the case where the selected
+ * sequence has no dbref to the desired source, and there are no indirect
+ * references via another sequence in the alignment
+ */
+ @Test(groups = { "Functional" })
+ public void testFindXrefSequences_noDbrefs()
+ {
+ /*
+ * two nucleotide sequences, one with UNIPROT dbref
+ */
+ SequenceI dna1 = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
+ dna1.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
+ SequenceI dna2 = new Sequence("AJ307031", "AAACCCTTT");
+
+ /*
+ * find UNIPROT xrefs for peptide sequence - it has no direct
+ * dbrefs, and the other sequence (which has a UNIPROT dbref) is not
+ * equatable to it, so no results found
+ */
+ AlignmentI al = new Alignment(new SequenceI[] { dna1, dna2 });
+ Alignment xrefs = new CrossRef(new SequenceI[] { dna2 }, al)
+ .findXrefSequences("UNIPROT", true);
+ assertNull(xrefs);
+ }
+
+ /**
+ * Tests for the method that searches an alignment (with one sequence
+ * excluded) for protein/nucleotide sequences with a given cross-reference
+ */
+ @Test(groups = { "Functional" }, enabled = true)
+ public void testSearchDataset()
+ {
+ /*
+ * nucleotide sequence with UNIPROT AND EMBL dbref
+ * peptide sequence with UNIPROT dbref
+ */
+ SequenceI dna1 = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
+ Mapping map = new Mapping(new Sequence("pep2", "MLAVSRG"), new MapList(
+ new int[] { 1, 21 }, new int[] { 1, 7 }, 3, 1));
+ DBRefEntry dbref = new DBRefEntry("UNIPROT", "0", "Q9ZTS2", map);
+ dna1.addDBRef(dbref);
+ dna1.addDBRef(new DBRefEntry("EMBL", "0", "AF039662"));
+ SequenceI pep1 = new Sequence("Q9ZTS2", "MLAVSRGQ");
+ dbref = new DBRefEntry("UNIPROT", "0", "Q9ZTS2");
+ pep1.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
+ AlignmentI al = new Alignment(new SequenceI[] { dna1, pep1 });
+
+ List<SequenceI> result = new ArrayList<>();
+
+ /*
+ * first search for a dbref nowhere on the alignment:
+ */
+ dbref = new DBRefEntry("UNIPROT", "0", "P30419");
+ CrossRef testee = new CrossRef(al.getSequencesArray(), al);
+ AlignedCodonFrame acf = new AlignedCodonFrame();
+ boolean found = testee.searchDataset(true, dna1, dbref, result, acf,
+ true, DBRefUtils.SEARCH_MODE_FULL);
+ assertFalse(found);
+ assertTrue(result.isEmpty());
+ assertTrue(acf.isEmpty());
+
+ /*
+ * search for a protein sequence with dbref UNIPROT:Q9ZTS2
+ */
+ acf = new AlignedCodonFrame();
+ dbref = new DBRefEntry("UNIPROT", "0", "Q9ZTS2");
+ found = testee.searchDataset(!dna1.isProtein(), dna1, dbref, result,
+ acf, false, DBRefUtils.SEARCH_MODE_FULL); // search dataset with a protein xref from a dna
+ // sequence to locate the protein product
+ assertTrue(found);
+ assertEquals(1, result.size());
+ assertSame(pep1, result.get(0));
+ assertTrue(acf.isEmpty());
+
+ /*
+ * search for a nucleotide sequence with dbref UNIPROT:Q9ZTS2
+ */
+ result.clear();
+ acf = new AlignedCodonFrame();
+ dbref = new DBRefEntry("UNIPROT", "0", "Q9ZTS2");
+ found = testee.searchDataset(!pep1.isProtein(), pep1, dbref, result,
+ acf, false, DBRefUtils.SEARCH_MODE_FULL); // search dataset with a protein's direct dbref to
+ // locate dna sequences with matching xref
+ assertTrue(found);
+ assertEquals(1, result.size());
+ assertSame(dna1, result.get(0));
+ // should now have a mapping from dna to pep1
+ List<SequenceToSequenceMapping> mappings = acf.getMappings();
+ assertEquals(1, mappings.size());
+ SequenceToSequenceMapping mapping = mappings.get(0);
+ assertSame(dna1, mapping.getFromSeq());
+ assertSame(pep1, mapping.getMapping().getTo());
+ MapList mapList = mapping.getMapping().getMap();
+ assertEquals(1, mapList.getToRatio());
+ assertEquals(3, mapList.getFromRatio());
+ assertEquals(1, mapList.getFromRanges().size());
+ assertEquals(1, mapList.getFromRanges().get(0)[0]);
+ assertEquals(21, mapList.getFromRanges().get(0)[1]);
+ assertEquals(1, mapList.getToRanges().size());
+ assertEquals(1, mapList.getToRanges().get(0)[0]);
+ assertEquals(7, mapList.getToRanges().get(0)[1]);
+ }
+
+ /**
+ * Test for finding 'product' sequences for the case where the selected
+ * sequence has a dbref with a mapping to a sequence. This represents the case
+ * where either
+ * <ul>
+ * <li>a fetched sequence is already decorated with its cross-reference (e.g.
+ * EMBL + translation), or</li>
+ * <li>Get Cross-References has been done once resulting in instantiated
+ * cross-reference mappings</li>
+ * </ul>
+ */
+ @Test(groups = { "Functional" })
+ public void testFindXrefSequences_fromDbRefMap()
+ {
+ /*
+ * scenario: nucleotide sequence AF039662
+ * with dbref + mapping to Q9ZTS2 and P30419
+ * which themselves each have a dbref and feature
+ */
+ SequenceI dna1 = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
+ SequenceI pep1 = new Sequence("Q9ZTS2", "MALFQRSV");
+ SequenceI pep2 = new Sequence("P30419", "MTRRSQIF");
+ dna1.createDatasetSequence();
+ pep1.createDatasetSequence();
+ pep2.createDatasetSequence();
+
+ pep1.getDatasetSequence().addDBRef(
+ new DBRefEntry("Pfam", "0", "PF00111"));
+ pep1.addSequenceFeature(new SequenceFeature("type", "desc", 12, 14, 1f,
+ "group"));
+ pep2.getDatasetSequence().addDBRef(new DBRefEntry("PDB", "0", "3JTK"));
+ pep2.addSequenceFeature(new SequenceFeature("type2", "desc2", 13, 15,
+ 12f, "group2"));
+
+ MapList mapList = new MapList(new int[] { 1, 24 }, new int[] { 1, 3 },
+ 3, 1);
+ Mapping map = new Mapping(pep1, mapList);
+ DBRefEntry dbRef1 = new DBRefEntry("UNIPROT", "0", "Q9ZTS2", map);
+ dna1.getDatasetSequence().addDBRef(dbRef1);
+ mapList = new MapList(new int[] { 1, 24 }, new int[] { 1, 3 }, 3, 1);
+ map = new Mapping(pep2, mapList);
+ DBRefEntry dbRef2 = new DBRefEntry("UNIPROT", "0", "P30419", map);
+ dna1.getDatasetSequence().addDBRef(dbRef2);
+
+ /*
+ * find UNIPROT xrefs for nucleotide sequence - it should pick up
+ * mapped sequences
+ */
+ AlignmentI al = new Alignment(new SequenceI[] { dna1 });
+ Alignment xrefs = new CrossRef(new SequenceI[] { dna1 }, al)
+ .findXrefSequences("UNIPROT", true);
+ assertEquals(2, xrefs.getHeight());
+
+ /*
+ * cross-refs alignment holds copies of the mapped sequences
+ * including copies of their dbrefs and features
+ */
+ checkCopySequence(pep1, xrefs.getSequenceAt(0));
+ checkCopySequence(pep2, xrefs.getSequenceAt(1));
+ }
+
+ /**
+ * Helper method that verifies that 'copy' has the same name, start, end,
+ * sequence and dataset sequence object as 'original' (but is not the same
+ * object)
+ *
+ * @param copy
+ * @param original
+ */
+ private void checkCopySequence(SequenceI copy, SequenceI original)
+ {
+ assertNotSame(copy, original);
+ assertSame(copy.getDatasetSequence(), original.getDatasetSequence());
+ assertEquals(copy.getName(), original.getName());
+ assertEquals(copy.getStart(), original.getStart());
+ assertEquals(copy.getEnd(), original.getEnd());
+ assertEquals(copy.getSequenceAsString(), original.getSequenceAsString());
+ }
+
+ /**
+ * Test for finding 'product' sequences for the case where the selected
+ * sequence has a dbref with no mapping, triggering a fetch from database
+ */
+ @Test(groups = { "Functional_Failing" })
+ public void testFindXrefSequences_withFetch()
+ {
+ SequenceI dna1 = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
+ dna1.addDBRef(new DBRefEntry("UNIPROT", "ENA:0", "Q9ZTS2"));
+ dna1.addDBRef(new DBRefEntry("UNIPROT", "ENA:0", "P30419"));
+ dna1.addDBRef(new DBRefEntry("UNIPROT", "ENA:0", "P00314"));
+ final SequenceI pep1 = new Sequence("Q9ZTS2", "MYQLIRSSW");
+ pep1.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
+
+ final SequenceI pep2 = new Sequence("P00314", "MRKLLAASG");
+ pep2.addDBRef(new DBRefEntry("UNIPROT", "0", "P00314"));
+
+ /*
+ * argument false suppresses adding DAS sources
+ * todo: define an interface type SequenceFetcherI and mock that
+ */
+ SequenceFetcher mockFetcher = new SequenceFetcher()
+ {
+ @Override
+ public boolean isFetchable(String source)
+ {
+ return true;
+ }
+
+ @Override
+ public SequenceI[] getSequences(List<DBRefEntry> refs, boolean dna)
+ {
+ return new SequenceI[] { pep1, pep2 };
+ }
+ };
+ SequenceFetcherFactory.setSequenceFetcher(mockFetcher);
+
+ /*
+ * find UNIPROT xrefs for nucleotide sequence
+ */
+ AlignmentI al = new Alignment(new SequenceI[] { dna1 });
+ Alignment xrefs = new CrossRef(new SequenceI[] { dna1 }, al)
+ .findXrefSequences("UNIPROT", true);
+ assertEquals(2, xrefs.getHeight());
+ assertSame(pep1, xrefs.getSequenceAt(0));
+ assertSame(pep2, xrefs.getSequenceAt(1));
+ }
+
+ @AfterClass(alwaysRun = true)
+ public void tearDown()
+ {
+ SequenceFetcherFactory.setSequenceFetcher(null);
+ }
+
+ /**
+ * Test for finding 'product' sequences for the case where both gene and
+ * transcript sequences have dbrefs to Uniprot.
+ */
+ @Test(groups = { "Functional_Failing" })
+ public void testFindXrefSequences_forGeneAndTranscripts()
+ {
+ /*
+ * 'gene' sequence
+ */
+ SequenceI gene = new Sequence("ENSG00000157764", "CGCCTCCCTTCCCC");
+ gene.addDBRef(new DBRefEntry("UNIPROT", "0", "P15056"));
+ gene.addDBRef(new DBRefEntry("UNIPROT", "0", "H7C5K3"));
+
+ /*
+ * 'transcript' with CDS feature (supports mapping to protein)
+ */
+ SequenceI braf001 = new Sequence("ENST00000288602", "taagATGGCGGCGCTGa");
+ braf001.addDBRef(new DBRefEntry("UNIPROT", "0", "P15056"));
+ braf001.addSequenceFeature(new SequenceFeature("CDS", "", 5, 16, 0f,
+ null));
+
+ /*
+ * 'spliced transcript' with CDS ranges
+ */
+ SequenceI braf002 = new Sequence("ENST00000497784", "gCAGGCtaTCTGTTCaa");
+ braf002.addDBRef(new DBRefEntry("UNIPROT", "ENSEMBL|0", "H7C5K3"));
+ braf002.addSequenceFeature(new SequenceFeature("CDS", "", 2, 6, 0f,
+ null));
+ braf002.addSequenceFeature(new SequenceFeature("CDS", "", 9, 15, 0f,
+ null));
+
+ /*
+ * TODO code is fragile - use of SequenceIdMatcher depends on fetched
+ * sequences having a name starting Source|Accession
+ * which happens to be true for Uniprot,PDB,EMBL but not Pfam,Rfam,Ensembl
+ */
+ final SequenceI pep1 = new Sequence("UNIPROT|P15056", "MAAL");
+ pep1.addDBRef(new DBRefEntry("UNIPROT", "0", "P15056"));
+ final SequenceI pep2 = new Sequence("UNIPROT|H7C5K3", "QALF");
+ pep2.addDBRef(new DBRefEntry("UNIPROT", "0", "H7C5K3"));
+ /*
+ * argument false suppresses adding DAS sources
+ * todo: define an interface type SequenceFetcherI and mock that
+ */
+ SequenceFetcher mockFetcher = new SequenceFetcher()
+ {
+ @Override
+ public boolean isFetchable(String source)
+ {
+ return true;
+ }
+
+ @Override
+ public SequenceI[] getSequences(List<DBRefEntry> refs, boolean dna)
+ {
+ return new SequenceI[] { pep1, pep2 };
+ }
+ };
+ SequenceFetcherFactory.setSequenceFetcher(mockFetcher);
+
+ /*
+ * find UNIPROT xrefs for gene and transcripts
+ * verify that
+ * - the two proteins are retrieved but not duplicated
+ * - mappings are built from transcript (CDS) to proteins
+ * - no mappings from gene to proteins
+ */
+ SequenceI[] seqs = new SequenceI[] { gene, braf001, braf002 };
+ AlignmentI al = new Alignment(seqs);
+ Alignment xrefs = new CrossRef(seqs, al).findXrefSequences("UNIPROT",
+ true);
+ assertEquals(2, xrefs.getHeight());
+ assertSame(pep1, xrefs.getSequenceAt(0));
+ assertSame(pep2, xrefs.getSequenceAt(1));