X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=test%2Fjalview%2Fws%2Fdbsources%2FEmblSourceTest.java;fp=test%2Fjalview%2Fws%2Fdbsources%2FEmblSourceTest.java;h=d450495fc4319dabeb25a110baaaedf6a5ab6cff;hb=4d64932654de3f6ffe07db11d18f2d21f558c6e6;hp=0000000000000000000000000000000000000000;hpb=382f5ff058a4c2fed1e9874bb712b40f16ec1a4e;p=jalview.git diff --git a/test/jalview/ws/dbsources/EmblSourceTest.java b/test/jalview/ws/dbsources/EmblSourceTest.java new file mode 100644 index 0000000..d450495 --- /dev/null +++ b/test/jalview/ws/dbsources/EmblSourceTest.java @@ -0,0 +1,463 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see <http://www.gnu.org/licenses/>. + * The Jalview Authors are detailed in the 'AUTHORS' file. 
+ */ +package jalview.ws.dbsources; + +import static org.testng.AssertJUnit.assertEquals; +import static org.testng.AssertJUnit.assertNotNull; +import static org.testng.AssertJUnit.assertNull; +import static org.testng.AssertJUnit.assertSame; +import static org.testng.AssertJUnit.assertTrue; + +import jalview.datamodel.DBRefEntry; +import jalview.datamodel.DBRefSource; +import jalview.datamodel.SequenceI; +import jalview.util.MapList; +import jalview.xml.binding.embl.EntryType; +import jalview.xml.binding.embl.EntryType.Feature; +import jalview.xml.binding.embl.EntryType.Feature.Qualifier; +import jalview.xml.binding.embl.XrefType; + +import java.io.ByteArrayInputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.testng.annotations.Test; + +/** + * Tests for {@link EmblSource}: parsing EMBL XML into entries, building the + * dna sequence with its peptide products and database cross-references, + * parsing CDS location ranges, and (via EmblXmlSource) adjusting exon ranges + * to a protein length. + * + * NOTE(review): in this copy the XML markup inside TESTDATA and the type + * arguments of the List declarations appear to be missing - confirm against + * the upstream file before relying on it. + */ +public class EmblSourceTest +{ + + // adapted from http://www.ebi.ac.uk/ena/data/view/X07547&display=xml + // dna and translations truncated for convenience + static final String TESTDATA = "" + + "" + + "" + + "X07574" + + "C. 
trachomatis plasmid" + + "plasmidunidentified reading frame" + + "" + + "" + /* + * first CDS (range and translation changed to keep test data manageable) + */ + + "" + // test the case of >1 cross-ref to the same database (JAL-2029) + + "" + + "" + + "ORF 8 (AA 1-330)" + + "CAA30420.1" + + "MLCF" + + "" + /* + * second CDS (range and translation changed to keep test data manageable) + */ + + "" + + "" + + "CAA30421.1" + + "MSSS" + + "" + /* + * third CDS is made up - has no xref - code should synthesize + * one to an assumed EMBLCDSPROTEIN accession + */ + + "" + + "CAA12345.6" + + "MSS" + + "" + /* + * sequence (modified for test purposes) + * emulates EMBL XML 1.2 which splits sequence data every 60 characters + * see EmblSequence.setSequence + */ + + "GGTATGTCCTCTAGTACAAAC\n" + + "ACCCCCAATATTGTGATATAATTAAAAACATAGCAT" + + ""; + + /** + * Checks that getCdsRanges flattens a join(...) location into + * [start, end, ...] pairs, with each complement(a..b) range reported as + * b, a. + */ + @Test(groups = "Functional") + public void testGetCdsRanges() + { + EmblSource testee = new EmblSource(); + + /* + * Make a (CDS) Feature with 5 locations + */ + Feature cds = new Feature(); + cds.setLocation("join(10..20,complement(30..40),50..60,70..80,complement(110..120))"); + + int[] exons = testee.getCdsRanges("EMBL", cds); + assertEquals("[10, 20, 40, 30, 50, 60, 70, 80, 120, 110]", + Arrays.toString(exons)); + } + + /** + * Builds the dna sequence for the single TESTDATA entry and verifies its + * sequence string, the peptide products added to the supplied list, and + * the dbrefs (and their mappings) on the dna and on each peptide. + */ + @Test(groups = "Functional") + public void testGetSequence() + { + // not the whole sequence but enough for this test... 
+ List peptides = new ArrayList<>(); + List entries = EmblSourceTest.getEmblEntries(); + assertEquals(1, entries.size()); + EntryType entry = entries.get(0); + EmblSource testee = new EmblSource(); + String sourceDb = "EMBL"; + SequenceI dna = testee.getSequence(sourceDb, entry, peptides); + + /* + * newline has been removed from sequence + */ + String seq = dna.getSequenceAsString(); + assertEquals( + "GGTATGTCCTCTAGTACAAACACCCCCAATATTGTGATATAATTAAAAACATAGCAT", + seq); + + /* + * peptides should now have six entries: + * EMBL product and two Uniprot accessions for the first CDS / translation + * EMBL product and one Uniprot accession for the second CDS / " + * EMBL product only for the third + */ + assertEquals(6, peptides.size()); + assertEquals("CAA30420.1", peptides.get(0).getName()); + assertEquals("MLCF", peptides.get(0).getSequenceAsString()); + assertEquals("UNIPROT|B0BCM4", peptides.get(1).getName()); + assertEquals("MLCF", peptides.get(1).getSequenceAsString()); + assertEquals("UNIPROT|P0CE20", peptides.get(2).getName()); + assertEquals("MLCF", peptides.get(2).getSequenceAsString()); + assertEquals("CAA30421.1", peptides.get(3).getName()); + assertEquals("MSSS", peptides.get(3).getSequenceAsString()); + assertEquals("UNIPROT|B0BCM3", peptides.get(4).getName()); + assertEquals("MSSS", peptides.get(4).getSequenceAsString()); + assertEquals("CAA12345.6", peptides.get(5).getName()); + assertEquals("MSS", peptides.get(5).getSequenceAsString()); + + /* + * verify dna sequence has dbrefs + * - to 'self' (synthesized dbref) + * - to EuropePMC + * - to MD5 (with null version as "0") + * - with CDS mappings to the peptide 'products' + */ + MapList mapToSelf = new MapList(new int[] { 1, 57 }, + new int[] + { 1, 57 }, 1, 1); + MapList cds1Map = new MapList(new int[] { 57, 46 }, new int[] { 1, 4 }, + 3, 1); + MapList cds2Map = new MapList(new int[] { 4, 15 }, new int[] { 1, 4 }, + 3, 1); + MapList cds3Map = new MapList(new int[] { 4, 6, 10, 15 }, new int[] { + 1, 3 
}, 3, 1); + + DBRefEntry[] dbrefs = dna.getDBRefs(); + assertEquals(7, dbrefs.length); + + DBRefEntry dbRefEntry = dbrefs[0]; + assertEquals("EMBL", dbRefEntry.getSource()); + assertEquals("X07547", dbRefEntry.getAccessionId()); + assertEquals("1", dbRefEntry.getVersion()); + assertNotNull(dbRefEntry.getMap()); + assertNull(dbRefEntry.getMap().getTo()); + assertEquals(mapToSelf, dbRefEntry.getMap().getMap()); + + dbRefEntry = dbrefs[1]; + // DBRefEntry constructor puts dbSource in upper case + assertEquals("EUROPEPMC", dbRefEntry.getSource()); + assertEquals("PMC107176", dbRefEntry.getAccessionId()); + assertEquals("9573186", dbRefEntry.getVersion()); + assertNull(dbRefEntry.getMap()); + + dbRefEntry = dbrefs[2]; + assertEquals("MD5", dbRefEntry.getSource()); + assertEquals("ac73317", dbRefEntry.getAccessionId()); + assertEquals("0", dbRefEntry.getVersion()); + assertNull(dbRefEntry.getMap()); + + dbRefEntry = dbrefs[3]; + assertEquals("UNIPROT", dbRefEntry.getSource()); + assertEquals("B0BCM4", dbRefEntry.getAccessionId()); + assertSame(peptides.get(1), dbRefEntry.getMap().getTo()); + assertEquals(cds1Map, dbRefEntry.getMap().getMap()); + + dbRefEntry = dbrefs[4]; + assertEquals("UNIPROT", dbRefEntry.getSource()); + assertEquals("P0CE20", dbRefEntry.getAccessionId()); + assertSame(peptides.get(2), dbRefEntry.getMap().getTo()); + assertEquals(cds1Map, dbRefEntry.getMap().getMap()); + + dbRefEntry = dbrefs[5]; + assertEquals("UNIPROT", dbRefEntry.getSource()); + assertEquals("B0BCM3", dbRefEntry.getAccessionId()); + assertSame(peptides.get(4), dbRefEntry.getMap().getTo()); + assertEquals(cds2Map, dbRefEntry.getMap().getMap()); + + dbRefEntry = dbrefs[6]; + assertEquals("EMBLCDSPROTEIN", dbRefEntry.getSource()); + assertEquals("CAA12345.6", dbRefEntry.getAccessionId()); + assertSame(peptides.get(5), dbRefEntry.getMap().getTo()); + assertEquals(cds3Map, dbRefEntry.getMap().getMap()); + + /* + * verify peptides have dbrefs + * - to EMBL sequence (with inverse 1:3 cds 
mapping) + * - to EMBLCDS (with 1:3 mapping) + * - direct (no mapping) to other protein accessions + */ + MapList proteinToCdsMap1 = new MapList(new int[] { 1, 4 }, new int[] { + 1, 12 }, 1, 3); + MapList proteinToCdsMap2 = new MapList(new int[] { 1, 3 }, new int[] { + 1, 9 }, 1, 3); + + // dbrefs for first CDS EMBL product CAA30420.1 + dbrefs = peptides.get(0).getDBRefs(); + assertEquals(5, dbrefs.length); + assertEquals(DBRefSource.EMBL, dbrefs[0].getSource()); + assertEquals("CAA30420.1", dbrefs[0].getAccessionId()); + // TODO: verify getPrimaryDBRefs() for peptide products + assertEquals(cds1Map.getInverse(), dbrefs[0].getMap().getMap()); + assertEquals(DBRefSource.EMBLCDS, dbrefs[1].getSource()); + assertEquals("CAA30420.1", dbrefs[1].getAccessionId()); + assertEquals(proteinToCdsMap1, dbrefs[1].getMap().getMap()); + assertEquals(DBRefSource.EMBLCDSProduct, dbrefs[2].getSource()); + assertEquals("CAA30420.1", dbrefs[2].getAccessionId()); + assertNull(dbrefs[2].getMap()); + assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "2.1", "B0BCM4"), + dbrefs[3]); + assertNull(dbrefs[3].getMap()); + assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "0", "P0CE20"), + dbrefs[4]); + assertNull(dbrefs[4].getMap()); + + // dbrefs for first CDS first Uniprot xref + dbrefs = peptides.get(1).getDBRefs(); + assertEquals(2, dbrefs.length); + assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "2.1", "B0BCM4"), + dbrefs[0]); + assertNull(dbrefs[0].getMap()); + assertEquals(DBRefSource.EMBL, dbrefs[1].getSource()); + assertEquals("X07547", dbrefs[1].getAccessionId()); + assertEquals(cds1Map.getInverse(), dbrefs[1].getMap().getMap()); + + // dbrefs for first CDS second Uniprot xref + dbrefs = peptides.get(2).getDBRefs(); + assertEquals(2, dbrefs.length); + assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "0", "P0CE20"), + dbrefs[0]); + assertNull(dbrefs[0].getMap()); + assertEquals(DBRefSource.EMBL, dbrefs[1].getSource()); + assertEquals("X07547", dbrefs[1].getAccessionId()); + 
assertEquals(cds1Map.getInverse(), dbrefs[1].getMap().getMap()); + + // dbrefs for second CDS EMBL product CAA30421.1 + dbrefs = peptides.get(3).getDBRefs(); + assertEquals(4, dbrefs.length); + assertEquals(DBRefSource.EMBL, dbrefs[0].getSource()); + assertEquals("CAA30421.1", dbrefs[0].getAccessionId()); + assertEquals(cds2Map.getInverse(), dbrefs[0].getMap().getMap()); + assertEquals(DBRefSource.EMBLCDS, dbrefs[1].getSource()); + assertEquals("CAA30421.1", dbrefs[1].getAccessionId()); + assertEquals(proteinToCdsMap1, dbrefs[1].getMap().getMap()); + assertEquals(DBRefSource.EMBLCDSProduct, dbrefs[2].getSource()); + assertEquals("CAA30421.1", dbrefs[2].getAccessionId()); + assertNull(dbrefs[2].getMap()); + assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "0", "B0BCM3"), + dbrefs[3]); + assertNull(dbrefs[3].getMap()); + + // dbrefs for second CDS (only) Uniprot xref + dbrefs = peptides.get(4).getDBRefs(); + assertEquals(2, dbrefs.length); + assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "0", "B0BCM3"), + dbrefs[0]); + assertNull(dbrefs[0].getMap()); + assertEquals(DBRefSource.EMBL, dbrefs[1].getSource()); + assertEquals("X07547", dbrefs[1].getAccessionId()); + assertEquals(cds2Map.getInverse(), dbrefs[1].getMap().getMap()); + + // dbrefs for third CDS inferred EMBL product CAA12345.6 + dbrefs = peptides.get(5).getDBRefs(); + assertEquals(3, dbrefs.length); + assertEquals(DBRefSource.EMBL, dbrefs[0].getSource()); + assertEquals("CAA12345.6", dbrefs[0].getAccessionId()); + assertEquals(cds3Map.getInverse(), dbrefs[0].getMap().getMap()); + assertEquals(DBRefSource.EMBLCDS, dbrefs[1].getSource()); + assertEquals("CAA12345.6", dbrefs[1].getAccessionId()); + assertEquals(proteinToCdsMap2, dbrefs[1].getMap().getMap()); + assertEquals(DBRefSource.EMBLCDSProduct, dbrefs[2].getSource()); + assertEquals("CAA12345.6", dbrefs[2].getAccessionId()); + assertNull(dbrefs[2].getMap()); + } + + /** + * Checks EmblXmlSource.adjustForProteinLength: the same array is returned + * when the exons match the protein length (with or without a stop codon) + * or are too short for it; otherwise the expected truncated ranges are + * returned. + */ + @Test(groups = "Functional") + public void testAdjustForProteinLength() + { + int[] exons 
= new int[] { 11, 15, 21, 25, 31, 38 }; // 18 bp + + // exact length match: + assertSame(exons, EmblXmlSource.adjustForProteinLength(6, exons)); + + // match if we assume exons include stop codon not in protein: + assertSame(exons, EmblXmlSource.adjustForProteinLength(5, exons)); + + // truncate last exon by 6bp + int[] truncated = EmblXmlSource.adjustForProteinLength(4, exons); + assertEquals("[11, 15, 21, 25, 31, 32]", Arrays.toString(truncated)); + + // remove last exon and truncate preceding by 1bp + truncated = EmblXmlSource.adjustForProteinLength(3, exons); + assertEquals("[11, 15, 21, 24]", Arrays.toString(truncated)); + + // exact removal of exon case: + exons = new int[] { 11, 15, 21, 27, 33, 38 }; // 18 bp + truncated = EmblXmlSource.adjustForProteinLength(4, exons); + assertEquals("[11, 15, 21, 27]", Arrays.toString(truncated)); + + // what if exons are too short for protein? + truncated = EmblXmlSource.adjustForProteinLength(7, exons); + assertSame(exons, truncated); + } + + /** + * Verifies the single entry parsed from TESTDATA: its attributes, + * keywords, xrefs, the three CDS features (location, xrefs, qualifiers) + * and the raw sequence string. + */ + @Test(groups = { "Functional" }) + public void testGetEmblEntries() + { + List entries = EmblSourceTest.getEmblEntries(); + assertEquals(1, entries.size()); + EntryType entry = entries.get(0); + + assertEquals("X07547", entry.getAccession()); + assertEquals("C. 
trachomatis plasmid", entry.getDescription()); + assertEquals("STD", entry.getDataClass()); + assertEquals("PRO", entry.getTaxonomicDivision()); + assertEquals("1999-02-10", entry.getLastUpdated().toString()); + assertEquals(58, entry.getLastUpdatedRelease().intValue()); + assertEquals("1988-11-10", entry.getFirstPublic().toString()); + assertEquals(18, entry.getFirstPublicRelease().intValue()); + assertEquals("genomic DNA", entry.getMoleculeType()); + assertEquals(1, entry.getVersion().intValue()); + assertEquals(8, entry.getEntryVersion().intValue()); + assertEquals("linear", entry.getTopology()); + assertEquals(7499, entry.getSequenceLength().intValue()); + assertEquals(2, entry.getKeyword().size()); + assertEquals("plasmid", entry.getKeyword().get(0)); + assertEquals("unidentified reading frame", entry.getKeyword().get(1)); + + /* + * dbrefs + */ + assertEquals(2, entry.getXref().size()); + XrefType dbref = entry.getXref().get(0); + assertEquals("EuropePMC", dbref.getDb()); + assertEquals("PMC107176", dbref.getId()); + assertEquals("9573186", dbref.getSecondaryId()); + dbref = entry.getXref().get(1); + assertEquals("MD5", dbref.getDb()); + assertEquals("ac73317", dbref.getId()); + assertNull(dbref.getSecondaryId()); + + /* + * three sequence features for CDS + */ + assertEquals(3, entry.getFeature().size()); + /* + * first CDS + */ + Feature ef = entry.getFeature().get(0); + assertEquals("CDS", ef.getName()); + assertEquals("complement(46..57)", ef.getLocation()); + assertEquals(2, ef.getXref().size()); + dbref = ef.getXref().get(0); + assertEquals("UniProtKB/Swiss-Prot", dbref.getDb()); + assertEquals("B0BCM4", dbref.getId()); + assertEquals("2.1", dbref.getSecondaryId()); + dbref = ef.getXref().get(1); + assertEquals("UniProtKB/Swiss-Prot", dbref.getDb()); + assertEquals("P0CE20", dbref.getId()); + assertNull(dbref.getSecondaryId()); + // CDS feature qualifiers + assertEquals(3, ef.getQualifier().size()); + Qualifier q = ef.getQualifier().get(0); + 
assertEquals("note", q.getName()); + assertEquals("ORF 8 (AA 1-330)", q.getValue()); + q = ef.getQualifier().get(1); + assertEquals("protein_id", q.getName()); + assertEquals("CAA30420.1", q.getValue()); + q = ef.getQualifier().get(2); + assertEquals("translation", q.getName()); + assertEquals("MLCF", q.getValue()); + + /* + * second CDS + */ + ef = entry.getFeature().get(1); + assertEquals("CDS", ef.getName()); + assertEquals("4..15", ef.getLocation()); + assertEquals(1, ef.getXref().size()); + dbref = ef.getXref().get(0); + assertEquals("UniProtKB/Swiss-Prot", dbref.getDb()); + assertEquals("B0BCM3", dbref.getId()); + assertNull(dbref.getSecondaryId()); + assertEquals(2, ef.getQualifier().size()); + q = ef.getQualifier().get(0); + assertEquals("protein_id", q.getName()); + assertEquals("CAA30421.1", q.getValue()); + q = ef.getQualifier().get(1); + assertEquals("translation", q.getName()); + assertEquals("MSSS", q.getValue()); + + /* + * third CDS + */ + ef = entry.getFeature().get(2); + assertEquals("CDS", ef.getName()); + assertEquals("join(4..6,10..15)", ef.getLocation()); + assertNotNull(ef.getXref()); + assertTrue(ef.getXref().isEmpty()); + assertEquals(2, ef.getQualifier().size()); + q = ef.getQualifier().get(0); + assertEquals("protein_id", q.getName()); + assertEquals("CAA12345.6", q.getValue()); + q = ef.getQualifier().get(1); + assertEquals("translation", q.getName()); + assertEquals("MSS", q.getValue()); + + /* + * Sequence - raw data before removal of newlines + */ + String seq = entry.getSequence(); + assertEquals( + "GGTATGTCCTCTAGTACAAAC\n" + + "ACCCCCAATATTGTGATATAATTAAAAACATAGCAT", + seq); + + /* + * xrefs parsed without a secondaryId report null here; getSequence() + * converts the resulting empty DBRefEntry.version to "0" + * (asserted in testGetSequence) + */ + assertNull(entry.getXref().get(1).getSecondaryId()); + assertNull(entry.getFeature().get(0).getXref().get(1).getSecondaryId()); + } + + /** + * Parses TESTDATA with a new EmblSource and returns the entries found. + */ + static List getEmblEntries() + { + return new EmblSource() + .getEmblEntries(new ByteArrayInputStream(TESTDATA.getBytes())); + } +}