From 64784ec1394f014222f2f5d4d8b865b16199419c Mon Sep 17 00:00:00 2001 From: gmungoc Date: Mon, 17 Sep 2018 11:47:51 +0100 Subject: [PATCH] JAL-3116 unit tests adapted to EMBL over JAXB --- src/jalview/ws/dbsources/EmblXmlSource.java | 32 +- test/jalview/datamodel/xdb/embl/EmblEntryTest.java | 264 ----------- test/jalview/datamodel/xdb/embl/EmblFileTest.java | 183 -------- .../jalview/datamodel/xdb/embl/EmblTestHelper.java | 81 ---- test/jalview/ws/dbsources/EmblSourceTest.java | 463 ++++++++++++++++++++ 5 files changed, 483 insertions(+), 540 deletions(-) delete mode 100644 test/jalview/datamodel/xdb/embl/EmblEntryTest.java delete mode 100644 test/jalview/datamodel/xdb/embl/EmblFileTest.java delete mode 100644 test/jalview/datamodel/xdb/embl/EmblTestHelper.java create mode 100644 test/jalview/ws/dbsources/EmblSourceTest.java diff --git a/src/jalview/ws/dbsources/EmblXmlSource.java b/src/jalview/ws/dbsources/EmblXmlSource.java index 8f55080..e114ea9 100644 --- a/src/jalview/ws/dbsources/EmblXmlSource.java +++ b/src/jalview/ws/dbsources/EmblXmlSource.java @@ -85,7 +85,7 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy * @return * @throws Exception */ - public AlignmentI getEmblSequenceRecords(String emprefx, String query) + protected AlignmentI getEmblSequenceRecords(String emprefx, String query) throws Exception { startQuery(); @@ -118,7 +118,7 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy * @return * @throws Exception */ - public AlignmentI getEmblSequenceRecords(String emprefx, String query, + protected AlignmentI getEmblSequenceRecords(String emprefx, String query, File reply) throws Exception { List entries = null; @@ -244,14 +244,18 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy /* * add db references */ - List dbRefs = entry.getXref(); - if (dbRefs != null) + List xrefs = entry.getXref(); + if (xrefs != null) { - for (XrefType dbref : dbRefs) + for (XrefType xref : xrefs) { - String acc = 
dbref.getId(); - String source = DBRefUtils.getCanonicalName(dbref.getDb()); - String version = dbref.getSecondaryId(); + String acc = xref.getId(); + String source = DBRefUtils.getCanonicalName(xref.getDb()); + String version = xref.getSecondaryId(); + if (version == null || "".equals(version)) + { + version = "0"; + } dna.addDBRef(new DBRefEntry(source, version, acc)); } } @@ -524,10 +528,14 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy * ensure UniProtKB/Swiss-Prot converted to UNIPROT */ String source = DBRefUtils.getCanonicalName(xref.getDb()); - DBRefEntry dbref = new DBRefEntry(source, xref.getSecondaryId(), - xref.getId()); - DBRefEntry proteinDbRef = new DBRefEntry(dbref.getSource(), - dbref.getVersion(), dbref.getAccessionId()); + String version = xref.getSecondaryId(); + if (version == null || "".equals(version)) + { + version = "0"; + } + DBRefEntry dbref = new DBRefEntry(source, version, xref.getId()); + DBRefEntry proteinDbRef = new DBRefEntry(source, version, + dbref.getAccessionId()); if (source.equals(DBRefSource.UNIPROT)) { String proteinSeqName = DBRefSource.UNIPROT + "|" diff --git a/test/jalview/datamodel/xdb/embl/EmblEntryTest.java b/test/jalview/datamodel/xdb/embl/EmblEntryTest.java deleted file mode 100644 index 4672574..0000000 --- a/test/jalview/datamodel/xdb/embl/EmblEntryTest.java +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) - * Copyright (C) $$Year-Rel$$ The Jalview Authors - * - * This file is part of Jalview. - * - * Jalview is free software: you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, either version 3 - * of the License, or (at your option) any later version. - * - * Jalview is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty - * of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. 
See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Jalview. If not, see . - * The Jalview Authors are detailed in the 'AUTHORS' file. - */ -package jalview.datamodel.xdb.embl; - -import static org.testng.AssertJUnit.assertEquals; -import static org.testng.AssertJUnit.assertNull; -import static org.testng.AssertJUnit.assertSame; - -import jalview.analysis.SequenceIdMatcher; -import jalview.datamodel.DBRefEntry; -import jalview.datamodel.DBRefSource; -import jalview.datamodel.SequenceI; -import jalview.gui.JvOptionPane; -import jalview.util.MapList; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -public class EmblEntryTest -{ - - @BeforeClass(alwaysRun = true) - public void setUpJvOptionPane() - { - JvOptionPane.setInteractiveMode(false); - JvOptionPane.setMockResponse(JvOptionPane.CANCEL_OPTION); - } - - @Test(groups = "Functional") - public void testGetCdsRanges() - { - EmblEntry testee = new EmblEntry(); - - /* - * Make a (CDS) Feature with 5 locations - */ - EmblFeature cds = new EmblFeature(); - cds.setLocation("join(10..20,complement(30..40),50..60,70..80,complement(110..120))"); - - int[] exons = testee.getCdsRanges(cds); - assertEquals("[10, 20, 40, 30, 50, 60, 70, 80, 120, 110]", - Arrays.toString(exons)); - } - - @Test(groups = "Functional") - public void testParseCodingFeature() - { - // not the whole sequence but enough for this test... 
- List peptides = new ArrayList(); - SequenceIdMatcher matcher = new SequenceIdMatcher(peptides); - EmblFile ef = EmblTestHelper.getEmblFile(); - assertEquals(1, ef.getEntries().size()); - EmblEntry testee = ef.getEntries().get(0); - String sourceDb = "EMBL"; - SequenceI dna = testee.makeSequence(sourceDb); - - /* - * parse three CDS features, with two/one/no Uniprot cross-refs - */ - for (EmblFeature feature : ef.getEntries().get(0).getFeatures()) - { - if ("CDS".equals(feature.getName())) - { - testee.parseCodingFeature(feature, sourceDb, dna, peptides, matcher); - } - } - - /* - * peptides should now have five entries: - * EMBL product and two Uniprot accessions for the first CDS / translation - * EMBL product and one Uniprot accession for the second CDS / " - * EMBL product only for the third - */ - assertEquals(6, peptides.size()); - assertEquals("CAA30420.1", peptides.get(0).getName()); - assertEquals("MLCF", peptides.get(0).getSequenceAsString()); - assertEquals("UNIPROT|B0BCM4", peptides.get(1).getName()); - assertEquals("MLCF", peptides.get(1).getSequenceAsString()); - assertEquals("UNIPROT|P0CE20", peptides.get(2).getName()); - assertEquals("MLCF", peptides.get(2).getSequenceAsString()); - assertEquals("CAA30421.1", peptides.get(3).getName()); - assertEquals("MSSS", peptides.get(3).getSequenceAsString()); - assertEquals("UNIPROT|B0BCM3", peptides.get(4).getName()); - assertEquals("MSSS", peptides.get(4).getSequenceAsString()); - assertEquals("CAA12345.6", peptides.get(5).getName()); - assertEquals("MSS", peptides.get(5).getSequenceAsString()); - - /* - * verify dna sequence has dbrefs with CDS mappings to the peptide 'products' - */ - MapList cds1Map = new MapList(new int[] { 57, 46 }, new int[] { 1, 4 }, - 3, 1); - MapList cds2Map = new MapList(new int[] { 4, 15 }, new int[] { 1, 4 }, - 3, 1); - MapList cds3Map = new MapList(new int[] { 4, 6, 10, 15 }, new int[] { - 1, 3 }, 3, 1); - DBRefEntry[] dbrefs = dna.getDBRefs(); - assertEquals(4, dbrefs.length); 
- DBRefEntry dbRefEntry = dbrefs[0]; - assertEquals("UNIPROT", dbRefEntry.getSource()); - assertEquals("B0BCM4", dbRefEntry.getAccessionId()); - assertSame(peptides.get(1), dbRefEntry.getMap().getTo()); - assertEquals(cds1Map, dbRefEntry.getMap().getMap()); - - dbRefEntry = dbrefs[1]; - assertEquals("UNIPROT", dbRefEntry.getSource()); - assertEquals("P0CE20", dbRefEntry.getAccessionId()); - assertSame(peptides.get(2), dbRefEntry.getMap().getTo()); - assertEquals(cds1Map, dbRefEntry.getMap().getMap()); - - dbRefEntry = dbrefs[2]; - assertEquals("UNIPROT", dbRefEntry.getSource()); - assertEquals("B0BCM3", dbRefEntry.getAccessionId()); - assertSame(peptides.get(4), dbRefEntry.getMap().getTo()); - assertEquals(cds2Map, dbRefEntry.getMap().getMap()); - - dbRefEntry = dbrefs[3]; - assertEquals("EMBLCDSPROTEIN", dbRefEntry.getSource()); - assertEquals("CAA12345.6", dbRefEntry.getAccessionId()); - assertSame(peptides.get(5), dbRefEntry.getMap().getTo()); - assertEquals(cds3Map, dbRefEntry.getMap().getMap()); - - /* - * verify peptides have dbrefs - * - to EMBL sequence (with inverse 1:3 cds mapping) - * - to EMBLCDS (with 1:3 mapping) - * - direct (no mapping) to other protein accessions - */ - MapList proteinToCdsMap1 = new MapList(new int[] { 1, 4 }, new int[] { - 1, 12 }, 1, 3); - MapList proteinToCdsMap2 = new MapList(new int[] { 1, 3 }, new int[] { - 1, 9 }, 1, 3); - - // dbrefs for first CDS EMBL product CAA30420.1 - dbrefs = peptides.get(0).getDBRefs(); - assertEquals(5, dbrefs.length); - assertEquals(DBRefSource.EMBL, dbrefs[0].getSource()); - assertEquals("CAA30420.1", dbrefs[0].getAccessionId()); - // TODO: verify getPrimaryDBRefs() for peptide products - assertEquals(cds1Map.getInverse(), dbrefs[0].getMap().getMap()); - assertEquals(DBRefSource.EMBLCDS, dbrefs[1].getSource()); - assertEquals("CAA30420.1", dbrefs[1].getAccessionId()); - assertEquals(proteinToCdsMap1, dbrefs[1].getMap().getMap()); - assertEquals(DBRefSource.EMBLCDSProduct, dbrefs[2].getSource()); 
- assertEquals("CAA30420.1", dbrefs[2].getAccessionId()); - assertNull(dbrefs[2].getMap()); - assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "2.1", "B0BCM4"), - dbrefs[3]); - assertNull(dbrefs[3].getMap()); - assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "0", "P0CE20"), - dbrefs[4]); - assertNull(dbrefs[4].getMap()); - - // dbrefs for first CDS first Uniprot xref - dbrefs = peptides.get(1).getDBRefs(); - assertEquals(2, dbrefs.length); - assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "2.1", "B0BCM4"), - dbrefs[0]); - assertNull(dbrefs[0].getMap()); - assertEquals(DBRefSource.EMBL, dbrefs[1].getSource()); - assertEquals("X07547", dbrefs[1].getAccessionId()); - assertEquals(cds1Map.getInverse(), dbrefs[1].getMap().getMap()); - - // dbrefs for first CDS second Uniprot xref - dbrefs = peptides.get(2).getDBRefs(); - assertEquals(2, dbrefs.length); - assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "0", "P0CE20"), - dbrefs[0]); - assertNull(dbrefs[0].getMap()); - assertEquals(DBRefSource.EMBL, dbrefs[1].getSource()); - assertEquals("X07547", dbrefs[1].getAccessionId()); - assertEquals(cds1Map.getInverse(), dbrefs[1].getMap().getMap()); - - // dbrefs for second CDS EMBL product CAA30421.1 - dbrefs = peptides.get(3).getDBRefs(); - assertEquals(4, dbrefs.length); - assertEquals(DBRefSource.EMBL, dbrefs[0].getSource()); - assertEquals("CAA30421.1", dbrefs[0].getAccessionId()); - assertEquals(cds2Map.getInverse(), dbrefs[0].getMap().getMap()); - assertEquals(DBRefSource.EMBLCDS, dbrefs[1].getSource()); - assertEquals("CAA30421.1", dbrefs[1].getAccessionId()); - assertEquals(proteinToCdsMap1, dbrefs[1].getMap().getMap()); - assertEquals(DBRefSource.EMBLCDSProduct, dbrefs[2].getSource()); - assertEquals("CAA30421.1", dbrefs[2].getAccessionId()); - assertNull(dbrefs[2].getMap()); - assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "0", "B0BCM3"), - dbrefs[3]); - assertNull(dbrefs[3].getMap()); - - // dbrefs for second CDS second Uniprot xref - dbrefs = 
peptides.get(4).getDBRefs(); - assertEquals(2, dbrefs.length); - assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "0", "B0BCM3"), - dbrefs[0]); - assertNull(dbrefs[0].getMap()); - assertEquals(DBRefSource.EMBL, dbrefs[1].getSource()); - assertEquals("X07547", dbrefs[1].getAccessionId()); - assertEquals(cds2Map.getInverse(), dbrefs[1].getMap().getMap()); - - // dbrefs for third CDS inferred EMBL product CAA12345.6 - dbrefs = peptides.get(5).getDBRefs(); - assertEquals(3, dbrefs.length); - assertEquals(DBRefSource.EMBL, dbrefs[0].getSource()); - assertEquals("CAA12345.6", dbrefs[0].getAccessionId()); - assertEquals(cds3Map.getInverse(), dbrefs[0].getMap().getMap()); - assertEquals(DBRefSource.EMBLCDS, dbrefs[1].getSource()); - assertEquals("CAA12345.6", dbrefs[1].getAccessionId()); - assertEquals(proteinToCdsMap2, dbrefs[1].getMap().getMap()); - assertEquals(DBRefSource.EMBLCDSProduct, dbrefs[2].getSource()); - assertEquals("CAA12345.6", dbrefs[2].getAccessionId()); - assertNull(dbrefs[2].getMap()); - } - - @Test(groups = "Functional") - public void testAdjustForProteinLength() - { - int[] exons = new int[] { 11, 15, 21, 25, 31, 38 }; // 18 bp - - // exact length match: - assertSame(exons, EmblEntry.adjustForProteinLength(6, exons)); - - // match if we assume exons include stop codon not in protein: - assertSame(exons, EmblEntry.adjustForProteinLength(5, exons)); - - // truncate last exon by 6bp - int[] truncated = EmblEntry.adjustForProteinLength(4, exons); - assertEquals("[11, 15, 21, 25, 31, 32]", Arrays.toString(truncated)); - - // remove last exon and truncate preceding by 1bp - truncated = EmblEntry.adjustForProteinLength(3, exons); - assertEquals("[11, 15, 21, 24]", Arrays.toString(truncated)); - - // exact removal of exon case: - exons = new int[] { 11, 15, 21, 27, 33, 38 }; // 18 bp - truncated = EmblEntry.adjustForProteinLength(4, exons); - assertEquals("[11, 15, 21, 27]", Arrays.toString(truncated)); - - // what if exons are too short for protein? 
- truncated = EmblEntry.adjustForProteinLength(7, exons); - assertSame(exons, truncated); - } -} diff --git a/test/jalview/datamodel/xdb/embl/EmblFileTest.java b/test/jalview/datamodel/xdb/embl/EmblFileTest.java deleted file mode 100644 index 7510de1..0000000 --- a/test/jalview/datamodel/xdb/embl/EmblFileTest.java +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) - * Copyright (C) $$Year-Rel$$ The Jalview Authors - * - * This file is part of Jalview. - * - * Jalview is free software: you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, either version 3 - * of the License, or (at your option) any later version. - * - * Jalview is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty - * of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Jalview. If not, see . - * The Jalview Authors are detailed in the 'AUTHORS' file. 
- */ -package jalview.datamodel.xdb.embl; - -import static org.testng.AssertJUnit.assertEquals; -import static org.testng.AssertJUnit.assertNull; - -import jalview.datamodel.DBRefEntry; -import jalview.gui.JvOptionPane; - -import java.util.Vector; - -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -public class EmblFileTest -{ - - @BeforeClass(alwaysRun = true) - public void setUpJvOptionPane() - { - JvOptionPane.setInteractiveMode(false); - JvOptionPane.setMockResponse(JvOptionPane.CANCEL_OPTION); - } - - @Test(groups = { "Functional" }) - public void testGetEmblFile() - { - Vector entries = EmblTestHelper.getEmblFile().getEntries(); - assertEquals(1, entries.size()); - EmblEntry entry = entries.get(0); - - assertEquals("X07547", entry.getAccession()); - assertEquals("C. trachomatis plasmid", entry.getDescription()); - assertEquals("STD", entry.getDataClass()); - assertEquals("PRO", entry.getTaxonomicDivision()); - assertEquals("1999-02-10", entry.getLastUpdatedDate()); - assertEquals("58", entry.getLastUpdatedRelease()); - assertEquals("1988-11-10", entry.getFirstPublicDate()); - assertEquals("18", entry.getFirstPublicRelease()); - assertEquals("genomic DNA", entry.getMoleculeType()); - assertEquals("1", entry.getSequenceVersion()); - assertEquals("8", entry.getEntryVersion()); - assertEquals("linear", entry.getTopology()); - assertEquals("7499", entry.getSequenceLength()); - - /* - * FIXME these assertions fail - values are null - why?? Adding or removing - * attributes in the test XML modifies behaviour. eg. inserting an attribute - * _before_ lastUpdated results in a null value in this field. 
- */ - assertEquals("1988-11-10", entry.getFirstPublicDate()); - assertEquals("18", entry.getFirstPublicRelease()); - - assertEquals(2, entry.getKeywords().size()); - assertEquals("plasmid", entry.getKeywords().get(0)); - assertEquals("unidentified reading frame", entry.getKeywords().get(1)); - - /* - * dbrefs - */ - assertEquals(2, entry.getDbRefs().size()); - DBRefEntry dbref = entry.getDbRefs().get(0); - assertEquals("EuropePMC", dbref.getSource()); - assertEquals("PMC107176", dbref.getAccessionId()); - assertEquals("9573186", dbref.getVersion()); - dbref = entry.getDbRefs().get(1); - assertEquals("MD5", dbref.getSource()); - assertEquals("ac73317", dbref.getAccessionId()); - // blank version has been converted to "0" - assertEquals("0", dbref.getVersion()); - - /* - * three sequence features for CDS - */ - assertEquals(3, entry.getFeatures().size()); - /* - * first CDS - */ - EmblFeature ef = entry.getFeatures().get(0); - assertEquals("CDS", ef.getName()); - assertEquals("complement(46..57)", ef.getLocation()); - assertEquals(2, ef.getDbRefs().size()); - dbref = ef.getDbRefs().get(0); - assertEquals("UniProtKB/Swiss-Prot", dbref.getSource()); - assertEquals("B0BCM4", dbref.getAccessionId()); - assertEquals("2.1", dbref.getVersion()); - dbref = ef.getDbRefs().get(1); - assertEquals("UniProtKB/Swiss-Prot", dbref.getSource()); - assertEquals("P0CE20", dbref.getAccessionId()); - // blank version gets converted to "0": - assertEquals("0", dbref.getVersion()); - // CDS feature qualifiers - assertEquals(3, ef.getQualifiers().size()); - Qualifier q = ef.getQualifiers().get(0); - assertEquals("note", q.getName()); - assertEquals(2, q.getValues().length); - assertEquals("ORF 8 (AA 1-330)", q.getValues()[0]); - assertEquals("pickle", q.getValues()[1]); - assertNull(q.getEvidence()); - q = ef.getQualifiers().get(1); - assertEquals("protein_id", q.getName()); - assertEquals(1, q.getValues().length); - assertEquals("CAA30420.1", q.getValues()[0]); - q = 
ef.getQualifiers().get(2); - assertEquals("translation", q.getName()); - assertEquals(1, q.getValues().length); - assertEquals("MLCF", q.getValues()[0]); - assertEquals(1, q.getEvidence().length); - assertEquals("Keith", q.getEvidence()[0]); - - /* - * second CDS - */ - ef = entry.getFeatures().get(1); - assertEquals("CDS", ef.getName()); - assertEquals("4..15", ef.getLocation()); - assertEquals(1, ef.getDbRefs().size()); - dbref = ef.getDbRefs().get(0); - assertEquals("UniProtKB/Swiss-Prot", dbref.getSource()); - assertEquals("B0BCM3", dbref.getAccessionId()); - assertEquals("0", dbref.getVersion()); - assertEquals(2, ef.getQualifiers().size()); - q = ef.getQualifiers().get(0); - assertEquals("protein_id", q.getName()); - assertEquals(1, q.getValues().length); - assertEquals("CAA30421.1", q.getValues()[0]); - q = ef.getQualifiers().get(1); - assertEquals("translation", q.getName()); - assertEquals(1, q.getValues().length); - assertEquals("MSSS", q.getValues()[0]); - - /* - * third CDS - */ - ef = entry.getFeatures().get(2); - assertEquals("CDS", ef.getName()); - assertEquals("join(4..6,10..15)", ef.getLocation()); - assertNull(ef.getDbRefs()); - assertEquals(2, ef.getQualifiers().size()); - q = ef.getQualifiers().get(0); - assertEquals("protein_id", q.getName()); - assertEquals(1, q.getValues().length); - assertEquals("CAA12345.6", q.getValues()[0]); - q = ef.getQualifiers().get(1); - assertEquals("translation", q.getName()); - assertEquals(1, q.getValues().length); - assertEquals("MSS", q.getValues()[0]); - - /* - * Sequence - verify newline not converted to space (JAL-2029) - */ - EmblSequence seq = entry.getSequence(); - assertEquals( - "GGTATGTCCTCTAGTACAAACACCCCCAATATTGTGATATAATTAAAAACATAGCAT", - seq.getSequence()); - - /* - * getSequence() converts empty DBRefEntry.version to "0" - */ - assertEquals("0", entry.getDbRefs().get(1).getVersion()); - assertEquals("0", entry.getFeatures().get(0).getDbRefs().get(1) - .getVersion()); - } -} diff --git 
a/test/jalview/datamodel/xdb/embl/EmblTestHelper.java b/test/jalview/datamodel/xdb/embl/EmblTestHelper.java deleted file mode 100644 index 0c7624f..0000000 --- a/test/jalview/datamodel/xdb/embl/EmblTestHelper.java +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) - * Copyright (C) $$Year-Rel$$ The Jalview Authors - * - * This file is part of Jalview. - * - * Jalview is free software: you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, either version 3 - * of the License, or (at your option) any later version. - * - * Jalview is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty - * of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Jalview. If not, see . - * The Jalview Authors are detailed in the 'AUTHORS' file. - */ -package jalview.datamodel.xdb.embl; - -import java.io.StringReader; - -public class EmblTestHelper -{ - // adapted from http://www.ebi.ac.uk/ena/data/view/X07547&display=xml - // dna and translations truncated for convenience - private static final String TESTDATA = "" - + "" - + "" - + "X07574" - + "C. 
trachomatis plasmid" - + "plasmidunidentified reading frame" - + "" - + "" - /* - * first CDS (range and translation changed to keep test data manageable) - */ - + "" - // test the case of >1 cross-ref to the same database (JAL-2029) - + "" - + "" - + "ORF 8 (AA 1-330)pickle" - + "CAA30420.1" - + "MLCFKeith" - + "" - /* - * second CDS (range and translation changed to keep test data manageable) - */ - + "" - + "" - + "CAA30421.1" - + "MSSS" - + "" - /* - * third CDS is made up - has no xref - code should synthesize - * one to an assumed EMBLCDSPROTEIN accession - */ - + "" - + "CAA12345.6" - + "MSS" - + "" - /* - * sequence (modified for test purposes) - * emulates EMBL XML 1.2 which splits sequence data every 60 characters - * see EmblSequence.setSequence - */ - + "GGTATGTCCTCTAGTACAAAC\n" - + "ACCCCCAATATTGTGATATAATTAAAAACATAGCAT" - + ""; - - static EmblFile getEmblFile() - { - return EmblFile.getEmblFile(new StringReader(TESTDATA)); - } -} diff --git a/test/jalview/ws/dbsources/EmblSourceTest.java b/test/jalview/ws/dbsources/EmblSourceTest.java new file mode 100644 index 0000000..158264f --- /dev/null +++ b/test/jalview/ws/dbsources/EmblSourceTest.java @@ -0,0 +1,463 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see . 
+ * The Jalview Authors are detailed in the 'AUTHORS' file. + */ +package jalview.ws.dbsources; + +import static org.testng.AssertJUnit.assertEquals; +import static org.testng.AssertJUnit.assertNotNull; +import static org.testng.AssertJUnit.assertNull; +import static org.testng.AssertJUnit.assertSame; +import static org.testng.AssertJUnit.assertTrue; + +import jalview.datamodel.DBRefEntry; +import jalview.datamodel.DBRefSource; +import jalview.datamodel.SequenceI; +import jalview.util.MapList; +import jalview.xml.binding.embl.EntryType; +import jalview.xml.binding.embl.EntryType.Feature; +import jalview.xml.binding.embl.EntryType.Feature.Qualifier; +import jalview.xml.binding.embl.XrefType; + +import java.io.ByteArrayInputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.testng.annotations.Test; + +public class EmblSourceTest +{ + + // adapted from http://www.ebi.ac.uk/ena/data/view/X07547&display=xml + // dna and translations truncated for convenience + static final String TESTDATA = "" + + "" + + "" + + "X07574" + + "C. 
trachomatis plasmid" + + "plasmidunidentified reading frame" + + "" + + "" + /* + * first CDS (range and translation changed to keep test data manageable) + */ + + "" + // test the case of >1 cross-ref to the same database (JAL-2029) + + "" + + "" + + "ORF 8 (AA 1-330)" + + "CAA30420.1" + + "MLCF" + + "" + /* + * second CDS (range and translation changed to keep test data manageable) + */ + + "" + + "" + + "CAA30421.1" + + "MSSS" + + "" + /* + * third CDS is made up - has no xref - code should synthesize + * one to an assumed EMBLCDSPROTEIN accession + */ + + "" + + "CAA12345.6" + + "MSS" + + "" + /* + * sequence (modified for test purposes) + * emulates EMBL XML 1.2 which splits sequence data every 60 characters + * see EmblSequence.setSequence + */ + + "GGTATGTCCTCTAGTACAAAC\n" + + "ACCCCCAATATTGTGATATAATTAAAAACATAGCAT" + + ""; + + @Test(groups = "Functional") + public void testGetCdsRanges() + { + EmblSource testee = new EmblSource(); + + /* + * Make a (CDS) Feature with 5 locations + */ + Feature cds = new Feature(); + cds.setLocation("join(10..20,complement(30..40),50..60,70..80,complement(110..120))"); + + int[] exons = testee.getCdsRanges("EMBL", cds); + assertEquals("[10, 20, 40, 30, 50, 60, 70, 80, 120, 110]", + Arrays.toString(exons)); + } + + @Test(groups = "Functional") + public void testGetSequence() + { + // not the whole sequence but enough for this test... 
+ List peptides = new ArrayList<>(); + List entries = EmblSourceTest.getEmblEntries(); + assertEquals(1, entries.size()); + EntryType entry = entries.get(0); + EmblSource testee = new EmblSource(); + String sourceDb = "EMBL"; + SequenceI dna = testee.getSequence(sourceDb, entry, peptides); + + /* + * newline has been removed from sequence + */ + String seq = dna.getSequenceAsString(); + assertEquals( + "GGTATGTCCTCTAGTACAAACACCCCCAATATTGTGATATAATTAAAAACATAGCAT", + seq); + + /* + * peptides should now have six entries: + * EMBL product and two Uniprot accessions for the first CDS / translation + * EMBL product and one Uniprot accession for the second CDS / " + * EMBL product only for the third + */ + assertEquals(6, peptides.size()); + assertEquals("CAA30420.1", peptides.get(0).getName()); + assertEquals("MLCF", peptides.get(0).getSequenceAsString()); + assertEquals("UNIPROT|B0BCM4", peptides.get(1).getName()); + assertEquals("MLCF", peptides.get(1).getSequenceAsString()); + assertEquals("UNIPROT|P0CE20", peptides.get(2).getName()); + assertEquals("MLCF", peptides.get(2).getSequenceAsString()); + assertEquals("CAA30421.1", peptides.get(3).getName()); + assertEquals("MSSS", peptides.get(3).getSequenceAsString()); + assertEquals("UNIPROT|B0BCM3", peptides.get(4).getName()); + assertEquals("MSSS", peptides.get(4).getSequenceAsString()); + assertEquals("CAA12345.6", peptides.get(5).getName()); + assertEquals("MSS", peptides.get(5).getSequenceAsString()); + + /* + * verify dna sequence has dbrefs + * - to 'self' (synthesized dbref) + * - to EuropePMC + * - to MD5 (with null version as "0") + * - with CDS mappings to the peptide 'products' + */ + MapList mapToSelf = new MapList(new int[] { 1, 57 }, + new int[] + { 1, 57 }, 1, 1); + MapList cds1Map = new MapList(new int[] { 57, 46 }, new int[] { 1, 4 }, + 3, 1); + MapList cds2Map = new MapList(new int[] { 4, 15 }, new int[] { 1, 4 }, + 3, 1); + MapList cds3Map = new MapList(new int[] { 4, 6, 10, 15 }, new int[] { + 1, 3
}, 3, 1); + + DBRefEntry[] dbrefs = dna.getDBRefs(); + assertEquals(7, dbrefs.length); + + DBRefEntry dbRefEntry = dbrefs[0]; + assertEquals("EMBL", dbRefEntry.getSource()); + assertEquals("X07547", dbRefEntry.getAccessionId()); + assertEquals("1", dbRefEntry.getVersion()); + assertNotNull(dbRefEntry.getMap()); + assertNull(dbRefEntry.getMap().getTo()); + assertEquals(mapToSelf, dbRefEntry.getMap().getMap()); + + dbRefEntry = dbrefs[1]; + // DBRefEntry constructor puts dbSource in upper case + assertEquals("EUROPEPMC", dbRefEntry.getSource()); + assertEquals("PMC107176", dbRefEntry.getAccessionId()); + assertEquals("9573186", dbRefEntry.getVersion()); + assertNull(dbRefEntry.getMap()); + + dbRefEntry = dbrefs[2]; + assertEquals("MD5", dbRefEntry.getSource()); + assertEquals("ac73317", dbRefEntry.getAccessionId()); + assertEquals("0", dbRefEntry.getVersion()); + assertNull(dbRefEntry.getMap()); + + dbRefEntry = dbrefs[3]; + assertEquals("UNIPROT", dbRefEntry.getSource()); + assertEquals("B0BCM4", dbRefEntry.getAccessionId()); + assertSame(peptides.get(1), dbRefEntry.getMap().getTo()); + assertEquals(cds1Map, dbRefEntry.getMap().getMap()); + + dbRefEntry = dbrefs[4]; + assertEquals("UNIPROT", dbRefEntry.getSource()); + assertEquals("P0CE20", dbRefEntry.getAccessionId()); + assertSame(peptides.get(2), dbRefEntry.getMap().getTo()); + assertEquals(cds1Map, dbRefEntry.getMap().getMap()); + + dbRefEntry = dbrefs[5]; + assertEquals("UNIPROT", dbRefEntry.getSource()); + assertEquals("B0BCM3", dbRefEntry.getAccessionId()); + assertSame(peptides.get(4), dbRefEntry.getMap().getTo()); + assertEquals(cds2Map, dbRefEntry.getMap().getMap()); + + dbRefEntry = dbrefs[6]; + assertEquals("EMBLCDSPROTEIN", dbRefEntry.getSource()); + assertEquals("CAA12345.6", dbRefEntry.getAccessionId()); + assertSame(peptides.get(5), dbRefEntry.getMap().getTo()); + assertEquals(cds3Map, dbRefEntry.getMap().getMap()); + + /* + * verify peptides have dbrefs + * - to EMBL sequence (with inverse 1:3 cds 
mapping) + * - to EMBLCDS (with 1:3 mapping) + * - direct (no mapping) to other protein accessions + */ + MapList proteinToCdsMap1 = new MapList(new int[] { 1, 4 }, new int[] { + 1, 12 }, 1, 3); + MapList proteinToCdsMap2 = new MapList(new int[] { 1, 3 }, new int[] { + 1, 9 }, 1, 3); + + // dbrefs for first CDS EMBL product CAA30420.1 + dbrefs = peptides.get(0).getDBRefs(); + assertEquals(5, dbrefs.length); + assertEquals(DBRefSource.EMBL, dbrefs[0].getSource()); + assertEquals("CAA30420.1", dbrefs[0].getAccessionId()); + // TODO: verify getPrimaryDBRefs() for peptide products + assertEquals(cds1Map.getInverse(), dbrefs[0].getMap().getMap()); + assertEquals(DBRefSource.EMBLCDS, dbrefs[1].getSource()); + assertEquals("CAA30420.1", dbrefs[1].getAccessionId()); + assertEquals(proteinToCdsMap1, dbrefs[1].getMap().getMap()); + assertEquals(DBRefSource.EMBLCDSProduct, dbrefs[2].getSource()); + assertEquals("CAA30420.1", dbrefs[2].getAccessionId()); + assertNull(dbrefs[2].getMap()); + assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "2.1", "B0BCM4"), + dbrefs[3]); + assertNull(dbrefs[3].getMap()); + assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "0", "P0CE20"), + dbrefs[4]); + assertNull(dbrefs[4].getMap()); + + // dbrefs for first CDS first Uniprot xref + dbrefs = peptides.get(1).getDBRefs(); + assertEquals(2, dbrefs.length); + assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "2.1", "B0BCM4"), + dbrefs[0]); + assertNull(dbrefs[0].getMap()); + assertEquals(DBRefSource.EMBL, dbrefs[1].getSource()); + assertEquals("X07547", dbrefs[1].getAccessionId()); + assertEquals(cds1Map.getInverse(), dbrefs[1].getMap().getMap()); + + // dbrefs for first CDS second Uniprot xref + dbrefs = peptides.get(2).getDBRefs(); + assertEquals(2, dbrefs.length); + assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "0", "P0CE20"), + dbrefs[0]); + assertNull(dbrefs[0].getMap()); + assertEquals(DBRefSource.EMBL, dbrefs[1].getSource()); + assertEquals("X07547", dbrefs[1].getAccessionId()); + 
assertEquals(cds1Map.getInverse(), dbrefs[1].getMap().getMap());
+
+    // dbrefs for second CDS EMBL product CAA30421.1
+    dbrefs = peptides.get(3).getDBRefs();
+    assertEquals(4, dbrefs.length);
+    assertEquals(DBRefSource.EMBL, dbrefs[0].getSource());
+    assertEquals("CAA30421.1", dbrefs[0].getAccessionId());
+    assertEquals(cds2Map.getInverse(), dbrefs[0].getMap().getMap());
+    assertEquals(DBRefSource.EMBLCDS, dbrefs[1].getSource());
+    assertEquals("CAA30421.1", dbrefs[1].getAccessionId());
+    assertEquals(proteinToCdsMap1, dbrefs[1].getMap().getMap());
+    assertEquals(DBRefSource.EMBLCDSProduct, dbrefs[2].getSource());
+    assertEquals("CAA30421.1", dbrefs[2].getAccessionId());
+    assertNull(dbrefs[2].getMap());
+    assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "0", "B0BCM3"),
+            dbrefs[3]);
+    assertNull(dbrefs[3].getMap());
+
+    // dbrefs for second CDS Uniprot xref (its only one)
+    dbrefs = peptides.get(4).getDBRefs();
+    assertEquals(2, dbrefs.length);
+    assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "0", "B0BCM3"),
+            dbrefs[0]);
+    assertNull(dbrefs[0].getMap());
+    assertEquals(DBRefSource.EMBL, dbrefs[1].getSource());
+    assertEquals("X07547", dbrefs[1].getAccessionId());
+    assertEquals(cds2Map.getInverse(), dbrefs[1].getMap().getMap());
+
+    // dbrefs for third CDS inferred EMBL product CAA12345.6
+    dbrefs = peptides.get(5).getDBRefs();
+    assertEquals(3, dbrefs.length);
+    assertEquals(DBRefSource.EMBL, dbrefs[0].getSource());
+    assertEquals("CAA12345.6", dbrefs[0].getAccessionId());
+    assertEquals(cds3Map.getInverse(), dbrefs[0].getMap().getMap());
+    assertEquals(DBRefSource.EMBLCDS, dbrefs[1].getSource());
+    assertEquals("CAA12345.6", dbrefs[1].getAccessionId());
+    assertEquals(proteinToCdsMap2, dbrefs[1].getMap().getMap());
+    assertEquals(DBRefSource.EMBLCDSProduct, dbrefs[2].getSource());
+    assertEquals("CAA12345.6", dbrefs[2].getAccessionId());
+    assertNull(dbrefs[2].getMap());
+  }
+
+  /**
+   * Checks how EmblXmlSource.adjustForProteinLength responds to protein
+   * lengths that match, fall short of, or exceed what the supplied exon
+   * ranges can code for. The same array instance is expected back when no
+   * truncation is needed, and a shortened range array otherwise.
+   */
+  @Test(groups = { "Functional" })
+  public void testAdjustForProteinLength()
+  {
+    int[] exons = new int[] { 11, 15, 21, 25, 31, 38 }; // 18 bp
+
+    // exact length match:
+    assertSame(exons, EmblXmlSource.adjustForProteinLength(6, exons));
+
+    // match if we assume exons include stop codon not in protein:
+    assertSame(exons, EmblXmlSource.adjustForProteinLength(5, exons));
+
+    // truncate last exon by 6bp
+    int[] truncated = EmblXmlSource.adjustForProteinLength(4, exons);
+    assertEquals("[11, 15, 21, 25, 31, 32]", Arrays.toString(truncated));
+
+    // remove last exon and truncate preceding by 1bp
+    truncated = EmblXmlSource.adjustForProteinLength(3, exons);
+    assertEquals("[11, 15, 21, 24]", Arrays.toString(truncated));
+
+    // exact removal of exon case:
+    exons = new int[] { 11, 15, 21, 27, 33, 38 }; // 18 bp
+    truncated = EmblXmlSource.adjustForProteinLength(4, exons);
+    assertEquals("[11, 15, 21, 27]", Arrays.toString(truncated));
+
+    // what if exons are too short for protein?
+    truncated = EmblXmlSource.adjustForProteinLength(7, exons);
+    assertSame(exons, truncated);
+  }
+
+  /**
+   * Checks the single entry returned by getEmblEntries() for TESTDATA: the
+   * entry attributes and keywords, its two xrefs, the three CDS features
+   * (location, xrefs and qualifiers) and the raw sequence string.
+   */
+  @Test(groups = { "Functional" })
+  public void testGetEmblFile()
+  {
+    List<EntryType> entries = EmblSourceTest.getEmblEntries();
+    assertEquals(1, entries.size());
+    EntryType entry = entries.get(0);
+
+    assertEquals("X07547", entry.getAccession());
+    assertEquals("C. trachomatis plasmid", entry.getDescription());
+    assertEquals("STD", entry.getDataClass());
+    assertEquals("PRO", entry.getTaxonomicDivision());
+    assertEquals("1999-02-10", entry.getLastUpdated().toString());
+    assertEquals(58, entry.getLastUpdatedRelease().intValue());
+    assertEquals("1988-11-10", entry.getFirstPublic().toString());
+    assertEquals(18, entry.getFirstPublicRelease().intValue());
+    assertEquals("genomic DNA", entry.getMoleculeType());
+    assertEquals(1, entry.getVersion().intValue());
+    assertEquals(8, entry.getEntryVersion().intValue());
+    assertEquals("linear", entry.getTopology());
+    assertEquals(7499, entry.getSequenceLength().intValue());
+    assertEquals(2, entry.getKeyword().size());
+    assertEquals("plasmid", entry.getKeyword().get(0));
+    assertEquals("unidentified reading frame", entry.getKeyword().get(1));
+
+    /*
+     * dbrefs
+     */
+    assertEquals(2, entry.getXref().size());
+    XrefType dbref = entry.getXref().get(0);
+    assertEquals("EuropePMC", dbref.getDb());
+    assertEquals("PMC107176", dbref.getId());
+    assertEquals("9573186", dbref.getSecondaryId());
+    dbref = entry.getXref().get(1);
+    assertEquals("MD5", dbref.getDb());
+    assertEquals("ac73317", dbref.getId());
+    assertNull(dbref.getSecondaryId());
+
+    /*
+     * three sequence features for CDS
+     */
+    assertEquals(3, entry.getFeature().size());
+    /*
+     * first CDS
+     */
+    Feature ef = entry.getFeature().get(0);
+    assertEquals("CDS", ef.getName());
+    assertEquals("complement(46..57)", ef.getLocation());
+    assertEquals(2, ef.getXref().size());
+    dbref = ef.getXref().get(0);
+    assertEquals("UniProtKB/Swiss-Prot", dbref.getDb());
+    assertEquals("B0BCM4", dbref.getId());
+    assertEquals("2.1", dbref.getSecondaryId());
+    dbref = ef.getXref().get(1);
+    assertEquals("UniProtKB/Swiss-Prot", dbref.getDb());
+    assertEquals("P0CE20", dbref.getId());
+    assertNull(dbref.getSecondaryId());
+    // CDS feature qualifiers
+    assertEquals(3, ef.getQualifier().size());
+    Qualifier q = ef.getQualifier().get(0);
+    assertEquals("note", q.getName());
+    assertEquals("ORF 8 (AA 1-330)", q.getValue());
+    q = ef.getQualifier().get(1);
+    assertEquals("protein_id", q.getName());
+    assertEquals("CAA30420.1", q.getValue());
+    q = ef.getQualifier().get(2);
+    assertEquals("translation", q.getName());
+    assertEquals("MLCF", q.getValue());
+
+    /*
+     * second CDS
+     */
+    ef = entry.getFeature().get(1);
+    assertEquals("CDS", ef.getName());
+    assertEquals("4..15", ef.getLocation());
+    assertEquals(1, ef.getXref().size());
+    dbref = ef.getXref().get(0);
+    assertEquals("UniProtKB/Swiss-Prot", dbref.getDb());
+    assertEquals("B0BCM3", dbref.getId());
+    assertNull(dbref.getSecondaryId());
+    assertEquals(2, ef.getQualifier().size());
+    q = ef.getQualifier().get(0);
+    assertEquals("protein_id", q.getName());
+    assertEquals("CAA30421.1", q.getValue());
+    q = ef.getQualifier().get(1);
+    assertEquals("translation", q.getName());
+    assertEquals("MSSS", q.getValue());
+
+    /*
+     * third CDS
+     */
+    ef = entry.getFeature().get(2);
+    assertEquals("CDS", ef.getName());
+    assertEquals("join(4..6,10..15)", ef.getLocation());
+    assertNotNull(ef.getXref());
+    assertTrue(ef.getXref().isEmpty());
+    assertEquals(2, ef.getQualifier().size());
+    q = ef.getQualifier().get(0);
+    assertEquals("protein_id", q.getName());
+    assertEquals("CAA12345.6", q.getValue());
+    q = ef.getQualifier().get(1);
+    assertEquals("translation", q.getName());
+    assertEquals("MSS", q.getValue());
+
+    /*
+     * Sequence - raw data before removal of newlines
+     */
+    String seq = entry.getSequence();
+    assertEquals(
+            "GGTATGTCCTCTAGTACAAAC\n"
+                    + "ACCCCCAATATTGTGATATAATTAAAAACATAGCAT",
+            seq);
+
+    /*
+     * an xref with no secondaryId is left as null in the parsed entry (not
+     * "0"); the "0" default version is only substituted when EmblXmlSource
+     * builds a DBRefEntry from the xref
+     */
+    assertNull(entry.getXref().get(1).getSecondaryId());
+    assertNull(entry.getFeature().get(0).getXref().get(1).getSecondaryId());
+  }
+
+  /**
+   * Returns the entries that EmblSource.getEmblEntries reads from the bytes
+   * of the TESTDATA string
+   * 
+   * @return the entries read from TESTDATA
+   */
+  static List<EntryType> getEmblEntries()
+  {
+    // explicit charset so the bytes fed to the parser do not depend on the
+    // platform default encoding
+    return new EmblSource().getEmblEntries(new ByteArrayInputStream(
+            TESTDATA.getBytes(java.nio.charset.StandardCharsets.UTF_8)));
+  }
+}
-- 
1.7.10.2