/*
 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
 * Copyright (C) $$Year-Rel$$ The Jalview Authors
 * 
 * This file is part of Jalview.
 * 
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License 
 * as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *  
 * Jalview is distributed in the hope that it will be useful, but 
 * WITHOUT ANY WARRANTY; without even the implied warranty 
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR 
 * PURPOSE.  See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
 * The Jalview Authors are detailed in the 'AUTHORS' file.
 */
package jalview.ws.dbsources;

import static org.testng.AssertJUnit.assertEquals;
import static org.testng.AssertJUnit.assertNotNull;
import static org.testng.AssertJUnit.assertNull;
import static org.testng.AssertJUnit.assertSame;
import static org.testng.AssertJUnit.assertTrue;

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;

import jalview.datamodel.AlignmentI;
import jalview.datamodel.DBRefEntry;
import jalview.datamodel.DBRefSource;
import jalview.datamodel.SequenceI;
import jalview.util.MapList;
import jalview.xml.binding.embl.EntryType;
import jalview.xml.binding.embl.EntryType.Feature;
import jalview.xml.binding.embl.EntryType.Feature.Qualifier;
import jalview.xml.binding.embl.XrefType;

/**
 * Functional tests for {@link EmblXmlSource}: parsing of an EMBL XML record
 * into {@link EntryType} objects, construction of the dna sequence with its
 * peptide products and cross-references, and the static helper that trims CDS
 * ranges to match a protein length.
 * <p>
 * All tests run against the in-memory {@link #TESTDATA} record; no network
 * access is made.
 */
public class EmblXmlSourceTest
{

  // adapted from http://www.ebi.ac.uk/ena/data/view/X07547&display=xml
  // dna and translations truncated for convenience
  static final String TESTDATA = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
          + "<ROOT>"
          + "<entry accession=\"X07547\" version=\"1\" entryVersion=\"8\""
          + " dataClass=\"STD\" taxonomicDivision=\"PRO\""
          + " moleculeType=\"genomic DNA\" sequenceLength=\"7499\" topology=\"linear\""
          + " firstPublic=\"1988-11-10\" firstPublicRelease=\"18\""
          + " lastUpdated=\"1999-02-10\" lastUpdatedRelease=\"58\">"
          + "<secondaryAccession>X07574</secondaryAccession>"
          + "<description>C. trachomatis plasmid</description>"
          + "<keyword>plasmid</keyword><keyword>unidentified reading frame</keyword>"
          + "<xref db=\"EuropePMC\" id=\"PMC107176\" secondaryId=\"9573186\" />"
          + "<xref db=\"MD5\" id=\"ac73317\" />"
          /*
           * first CDS (range and translation changed to keep test data manageable)
           */
          + "<feature name=\"CDS\" location=\"complement(46..57)\">"
          // test the case of >1 cross-ref to the same database (JAL-2029)
          + "<xref db=\"UniProtKB/Swiss-Prot\" id=\"B0BCM4\" secondaryId=\"2.1\" />"
          + "<xref db=\"UniProtKB/Swiss-Prot\" id=\"P0CE20\" />"
          + "<qualifier name=\"note\"><value>ORF 8 (AA 1-330)</value></qualifier>"
          + "<qualifier name=\"protein_id\"><value>CAA30420.1</value></qualifier>"
          + "<qualifier name=\"translation\"><value>MLCF</value></qualifier>"
          + "</feature>"
          /*
           * second CDS (range and translation changed to keep test data manageable)
           */
          + "<feature name=\"CDS\" location=\"4..15\">"
          + "<xref db=\"UniProtKB/Swiss-Prot\" id=\"B0BCM3\" />"
          + "<qualifier name=\"protein_id\"><value>CAA30421.1</value></qualifier>"
          + "<qualifier name=\"translation\"><value>MSSS</value></qualifier>"
          + "</feature>"
          /*
           * third CDS is made up - has no xref - code should synthesize 
           * one to an assumed EMBLCDSPROTEIN accession
           */
          + "<feature name=\"CDS\" location=\"join(4..6,10..15)\">"
          + "<qualifier name=\"protein_id\"><value>CAA12345.6</value></qualifier>"
          + "<qualifier name=\"translation\"><value>MSS</value></qualifier>"
          + "</feature>"
          /*
           * sequence (modified for test purposes)
           * emulates EMBL XML 1.2 which splits sequence data every 60 characters
           * see EmblSequence.setSequence
           */
          + "<sequence>GGTATGTCCTCTAGTACAAAC\n"
          + "ACCCCCAATATTGTGATATAATTAAAAACATAGCAT</sequence></entry></ROOT>";

  /** Minimal concrete subclass of the abstract class under test. */
  private EmblXmlSource testee;

  /**
   * Creates the object under test. The abstract methods are stubbed to return
   * null since none of them is exercised by these tests.
   */
  @BeforeClass(alwaysRun = true)
  public void setUp()
  {
    testee = new EmblXmlSource()
    {

      @Override
      public String getDbSource()
      {
        return null;
      }

      @Override
      public String getDbName()
      {
        return null;
      }

      @Override
      public String getTestQuery()
      {
        return null;
      }

      @Override
      public AlignmentI getSequenceRecords(String queries)
              throws Exception
      {
        return null;
      }
    };
  }

  /**
   * Verifies that a CDS location with a mix of forward and complement ranges
   * is returned as a flat array of [start, end] pairs, with complement ranges
   * reported in descending order.
   */
  @Test(groups = "Functional")
  public void testGetCdsRanges()
  {
    /*
     * Make a (CDS) Feature with 5 locations
     */
    Feature cds = new Feature();
    cds.setLocation(
            "join(10..20,complement(30..40),50..60,70..80,complement(110..120))");

    int[] exons = testee.getCdsRanges("EMBL", cds);
    assertEquals("[10, 20, 40, 30, 50, 60, 70, 80, 120, 110]",
            Arrays.toString(exons));
  }

  /**
   * Verifies the dna sequence built from the parsed entry, the peptide
   * products collected while building it, and the database references (with
   * their mappings) attached to the dna and to each peptide.
   */
  @Test(groups = "Functional")
  public void testGetSequence()
  {
    // not the whole sequence but enough for this test...
    List<SequenceI> peptides = new ArrayList<>();
    List<EntryType> entries = getEmblEntries();
    assertEquals(1, entries.size());
    EntryType entry = entries.get(0);
    String sourceDb = "EMBL";
    SequenceI dna = testee.getSequence(sourceDb, entry, peptides);

    /*
     * newline has been removed from sequence
     */
    String seq = dna.getSequenceAsString();
    assertEquals(
            "GGTATGTCCTCTAGTACAAACACCCCCAATATTGTGATATAATTAAAAACATAGCAT",
            seq);

    /*
     * peptides should now have six entries:
     * EMBL product and two Uniprot accessions for the first CDS / translation
     * EMBL product and one Uniprot accession for the second CDS / "
     * EMBL product only for the third
     */
    assertEquals(6, peptides.size());
    assertEquals("CAA30420.1", peptides.get(0).getName());
    assertEquals("MLCF", peptides.get(0).getSequenceAsString());
    assertEquals("UNIPROT|B0BCM4", peptides.get(1).getName());
    assertEquals("MLCF", peptides.get(1).getSequenceAsString());
    assertEquals("UNIPROT|P0CE20", peptides.get(2).getName());
    assertEquals("MLCF", peptides.get(2).getSequenceAsString());
    assertEquals("CAA30421.1", peptides.get(3).getName());
    assertEquals("MSSS", peptides.get(3).getSequenceAsString());
    assertEquals("UNIPROT|B0BCM3", peptides.get(4).getName());
    assertEquals("MSSS", peptides.get(4).getSequenceAsString());
    assertEquals("CAA12345.6", peptides.get(5).getName());
    assertEquals("MSS", peptides.get(5).getSequenceAsString());

    /*
     * verify dna sequence has dbrefs
     * - to 'self' (synthesized dbref)
     * - to EuropePMC
     * - to MD5 (with null version as "0") 
     * - with CDS mappings to the peptide 'products'
     */
    MapList mapToSelf = new MapList(new int[] { 1, 57 },
            new int[]
            { 1, 57 }, 1, 1);
    MapList cds1Map = new MapList(new int[] { 57, 46 },
            new int[]
            { 1, 4 }, 3, 1);
    MapList cds2Map = new MapList(new int[] { 4, 15 },
            new int[]
            { 1, 4 }, 3, 1);
    MapList cds3Map = new MapList(new int[] { 4, 6, 10, 15 },
            new int[]
            { 1, 3 }, 3, 1);

    DBRefEntry[] dbrefs = dna.getDBRefs();
    assertEquals(7, dbrefs.length);

    DBRefEntry dbRefEntry = dbrefs[0];
    assertEquals("EMBL", dbRefEntry.getSource());
    assertEquals("X07547", dbRefEntry.getAccessionId());
    assertEquals("1", dbRefEntry.getVersion());
    assertNotNull(dbRefEntry.getMap());
    assertNull(dbRefEntry.getMap().getTo());
    assertEquals(mapToSelf, dbRefEntry.getMap().getMap());

    dbRefEntry = dbrefs[1];
    // DBRefEntry constructor puts dbSource in upper case
    assertEquals("EUROPEPMC", dbRefEntry.getSource());
    assertEquals("PMC107176", dbRefEntry.getAccessionId());
    assertEquals("9573186", dbRefEntry.getVersion());
    assertNull(dbRefEntry.getMap());

    dbRefEntry = dbrefs[2];
    assertEquals("MD5", dbRefEntry.getSource());
    assertEquals("ac73317", dbRefEntry.getAccessionId());
    assertEquals("0", dbRefEntry.getVersion());
    assertNull(dbRefEntry.getMap());

    dbRefEntry = dbrefs[3];
    assertEquals("UNIPROT", dbRefEntry.getSource());
    assertEquals("B0BCM4", dbRefEntry.getAccessionId());
    assertSame(peptides.get(1), dbRefEntry.getMap().getTo());
    assertEquals(cds1Map, dbRefEntry.getMap().getMap());

    dbRefEntry = dbrefs[4];
    assertEquals("UNIPROT", dbRefEntry.getSource());
    assertEquals("P0CE20", dbRefEntry.getAccessionId());
    assertSame(peptides.get(2), dbRefEntry.getMap().getTo());
    assertEquals(cds1Map, dbRefEntry.getMap().getMap());

    dbRefEntry = dbrefs[5];
    assertEquals("UNIPROT", dbRefEntry.getSource());
    assertEquals("B0BCM3", dbRefEntry.getAccessionId());
    assertSame(peptides.get(4), dbRefEntry.getMap().getTo());
    assertEquals(cds2Map, dbRefEntry.getMap().getMap());

    dbRefEntry = dbrefs[6];
    assertEquals("EMBLCDSPROTEIN", dbRefEntry.getSource());
    assertEquals("CAA12345.6", dbRefEntry.getAccessionId());
    assertSame(peptides.get(5), dbRefEntry.getMap().getTo());
    assertEquals(cds3Map, dbRefEntry.getMap().getMap());

    /*
     * verify peptides have dbrefs
     * - to EMBL sequence (with inverse 1:3 cds mapping)
     * - to EMBLCDS (with 1:3 mapping)
     * - direct (no mapping) to other protein accessions
     */
    MapList proteinToCdsMap1 = new MapList(new int[] { 1, 4 },
            new int[]
            { 1, 12 }, 1, 3);
    MapList proteinToCdsMap2 = new MapList(new int[] { 1, 3 },
            new int[]
            { 1, 9 }, 1, 3);

    // dbrefs for first CDS EMBL product CAA30420.1
    dbrefs = peptides.get(0).getDBRefs();
    assertEquals(5, dbrefs.length);
    assertEquals(DBRefSource.EMBL, dbrefs[0].getSource());
    assertEquals("CAA30420.1", dbrefs[0].getAccessionId());
    // TODO: verify getPrimaryDBRefs() for peptide products
    assertEquals(cds1Map.getInverse(), dbrefs[0].getMap().getMap());
    assertEquals(DBRefSource.EMBLCDS, dbrefs[1].getSource());
    assertEquals("CAA30420.1", dbrefs[1].getAccessionId());
    assertEquals(proteinToCdsMap1, dbrefs[1].getMap().getMap());
    assertEquals(DBRefSource.EMBLCDSProduct, dbrefs[2].getSource());
    assertEquals("CAA30420.1", dbrefs[2].getAccessionId());
    assertNull(dbrefs[2].getMap());
    assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "2.1", "B0BCM4"),
            dbrefs[3]);
    assertNull(dbrefs[3].getMap());
    assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "0", "P0CE20"),
            dbrefs[4]);
    assertNull(dbrefs[4].getMap());

    // dbrefs for first CDS first Uniprot xref
    dbrefs = peptides.get(1).getDBRefs();
    assertEquals(2, dbrefs.length);
    assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "2.1", "B0BCM4"),
            dbrefs[0]);
    assertNull(dbrefs[0].getMap());
    assertEquals(DBRefSource.EMBL, dbrefs[1].getSource());
    assertEquals("X07547", dbrefs[1].getAccessionId());
    assertEquals(cds1Map.getInverse(), dbrefs[1].getMap().getMap());

    // dbrefs for first CDS second Uniprot xref
    dbrefs = peptides.get(2).getDBRefs();
    assertEquals(2, dbrefs.length);
    assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "0", "P0CE20"),
            dbrefs[0]);
    assertNull(dbrefs[0].getMap());
    assertEquals(DBRefSource.EMBL, dbrefs[1].getSource());
    assertEquals("X07547", dbrefs[1].getAccessionId());
    assertEquals(cds1Map.getInverse(), dbrefs[1].getMap().getMap());

    // dbrefs for second CDS EMBL product CAA30421.1
    dbrefs = peptides.get(3).getDBRefs();
    assertEquals(4, dbrefs.length);
    assertEquals(DBRefSource.EMBL, dbrefs[0].getSource());
    assertEquals("CAA30421.1", dbrefs[0].getAccessionId());
    assertEquals(cds2Map.getInverse(), dbrefs[0].getMap().getMap());
    assertEquals(DBRefSource.EMBLCDS, dbrefs[1].getSource());
    assertEquals("CAA30421.1", dbrefs[1].getAccessionId());
    assertEquals(proteinToCdsMap1, dbrefs[1].getMap().getMap());
    assertEquals(DBRefSource.EMBLCDSProduct, dbrefs[2].getSource());
    assertEquals("CAA30421.1", dbrefs[2].getAccessionId());
    assertNull(dbrefs[2].getMap());
    assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "0", "B0BCM3"),
            dbrefs[3]);
    assertNull(dbrefs[3].getMap());

    // dbrefs for second CDS Uniprot xref (it has only the one)
    dbrefs = peptides.get(4).getDBRefs();
    assertEquals(2, dbrefs.length);
    assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "0", "B0BCM3"),
            dbrefs[0]);
    assertNull(dbrefs[0].getMap());
    assertEquals(DBRefSource.EMBL, dbrefs[1].getSource());
    assertEquals("X07547", dbrefs[1].getAccessionId());
    assertEquals(cds2Map.getInverse(), dbrefs[1].getMap().getMap());

    // dbrefs for third CDS inferred EMBL product CAA12345.6
    dbrefs = peptides.get(5).getDBRefs();
    assertEquals(3, dbrefs.length);
    assertEquals(DBRefSource.EMBL, dbrefs[0].getSource());
    assertEquals("CAA12345.6", dbrefs[0].getAccessionId());
    assertEquals(cds3Map.getInverse(), dbrefs[0].getMap().getMap());
    assertEquals(DBRefSource.EMBLCDS, dbrefs[1].getSource());
    assertEquals("CAA12345.6", dbrefs[1].getAccessionId());
    assertEquals(proteinToCdsMap2, dbrefs[1].getMap().getMap());
    assertEquals(DBRefSource.EMBLCDSProduct, dbrefs[2].getSource());
    assertEquals("CAA12345.6", dbrefs[2].getAccessionId());
    assertNull(dbrefs[2].getMap());
  }

  /**
   * Verifies that exon ranges are trimmed from the 3' end to cover exactly
   * three bases per residue, and that the original array instance is returned
   * when no trimming applies (exact match, or exons shorter than the protein).
   */
  @Test(groups = "Functional")
  public void testAdjustForProteinLength()
  {
    int[] exons = new int[] { 11, 15, 21, 25, 31, 38 }; // 18 bp

    // exact length match:
    assertSame(exons, EmblXmlSource.adjustForProteinLength(6, exons));

    // truncate last exon by 3bp (e.g. stop codon)
    int[] truncated = EmblXmlSource.adjustForProteinLength(5, exons);
    assertEquals("[11, 15, 21, 25, 31, 35]", Arrays.toString(truncated));

    // truncate last exon by 6bp
    truncated = EmblXmlSource.adjustForProteinLength(4, exons);
    assertEquals("[11, 15, 21, 25, 31, 32]", Arrays.toString(truncated));

    // remove last exon and truncate preceding by 1bp
    truncated = EmblXmlSource.adjustForProteinLength(3, exons);
    assertEquals("[11, 15, 21, 24]", Arrays.toString(truncated));

    // exact removal of exon case:
    exons = new int[] { 11, 15, 21, 27, 33, 38 }; // 18 bp
    truncated = EmblXmlSource.adjustForProteinLength(4, exons);
    assertEquals("[11, 15, 21, 27]", Arrays.toString(truncated));

    // what if exons are too short for protein?
    truncated = EmblXmlSource.adjustForProteinLength(7, exons);
    assertSame(exons, truncated);
  }

  /**
   * Verifies that the test record is unmarshalled into a single
   * {@link EntryType} whose attributes, keywords, cross-references, CDS
   * features (with qualifiers) and raw sequence match the XML.
   */
  @Test(groups = { "Functional" })
  public void testGetEmblEntries()
  {
    List<EntryType> entries = getEmblEntries();
    assertEquals(1, entries.size());
    EntryType entry = entries.get(0);

    assertEquals("X07547", entry.getAccession());
    assertEquals("C. trachomatis plasmid", entry.getDescription());
    assertEquals("STD", entry.getDataClass());
    assertEquals("PRO", entry.getTaxonomicDivision());
    assertEquals("1999-02-10", entry.getLastUpdated().toString());
    assertEquals(58, entry.getLastUpdatedRelease().intValue());
    assertEquals("1988-11-10", entry.getFirstPublic().toString());
    assertEquals(18, entry.getFirstPublicRelease().intValue());
    assertEquals("genomic DNA", entry.getMoleculeType());
    assertEquals(1, entry.getVersion().intValue());
    assertEquals(8, entry.getEntryVersion().intValue());
    assertEquals("linear", entry.getTopology());
    assertEquals(7499, entry.getSequenceLength().intValue());
    assertEquals(2, entry.getKeyword().size());
    assertEquals("plasmid", entry.getKeyword().get(0));
    assertEquals("unidentified reading frame", entry.getKeyword().get(1));

    /*
     * dbrefs
     */
    assertEquals(2, entry.getXref().size());
    XrefType dbref = entry.getXref().get(0);
    assertEquals("EuropePMC", dbref.getDb());
    assertEquals("PMC107176", dbref.getId());
    assertEquals("9573186", dbref.getSecondaryId());
    dbref = entry.getXref().get(1);
    assertEquals("MD5", dbref.getDb());
    assertEquals("ac73317", dbref.getId());
    assertNull(dbref.getSecondaryId());

    /*
     * three sequence features for CDS
     */
    assertEquals(3, entry.getFeature().size());

    /*
     * first CDS
     */
    Feature ef = entry.getFeature().get(0);
    assertEquals("CDS", ef.getName());
    assertEquals("complement(46..57)", ef.getLocation());
    assertEquals(2, ef.getXref().size());
    dbref = ef.getXref().get(0);
    assertEquals("UniProtKB/Swiss-Prot", dbref.getDb());
    assertEquals("B0BCM4", dbref.getId());
    assertEquals("2.1", dbref.getSecondaryId());
    dbref = ef.getXref().get(1);
    assertEquals("UniProtKB/Swiss-Prot", dbref.getDb());
    assertEquals("P0CE20", dbref.getId());
    assertNull(dbref.getSecondaryId());
    // CDS feature qualifiers
    assertEquals(3, ef.getQualifier().size());
    Qualifier q = ef.getQualifier().get(0);
    assertEquals("note", q.getName());
    assertEquals("ORF 8 (AA 1-330)", q.getValue());
    q = ef.getQualifier().get(1);
    assertEquals("protein_id", q.getName());
    assertEquals("CAA30420.1", q.getValue());
    q = ef.getQualifier().get(2);
    assertEquals("translation", q.getName());
    assertEquals("MLCF", q.getValue());

    /*
     * second CDS
     */
    ef = entry.getFeature().get(1);
    assertEquals("CDS", ef.getName());
    assertEquals("4..15", ef.getLocation());
    assertEquals(1, ef.getXref().size());
    dbref = ef.getXref().get(0);
    assertEquals("UniProtKB/Swiss-Prot", dbref.getDb());
    assertEquals("B0BCM3", dbref.getId());
    assertNull(dbref.getSecondaryId());
    assertEquals(2, ef.getQualifier().size());
    q = ef.getQualifier().get(0);
    assertEquals("protein_id", q.getName());
    assertEquals("CAA30421.1", q.getValue());
    q = ef.getQualifier().get(1);
    assertEquals("translation", q.getName());
    assertEquals("MSSS", q.getValue());

    /*
     * third CDS
     */
    ef = entry.getFeature().get(2);
    assertEquals("CDS", ef.getName());
    assertEquals("join(4..6,10..15)", ef.getLocation());
    assertNotNull(ef.getXref());
    assertTrue(ef.getXref().isEmpty());
    assertEquals(2, ef.getQualifier().size());
    q = ef.getQualifier().get(0);
    assertEquals("protein_id", q.getName());
    assertEquals("CAA12345.6", q.getValue());
    q = ef.getQualifier().get(1);
    assertEquals("translation", q.getName());
    assertEquals("MSS", q.getValue());

    /*
     * Sequence - raw data before removal of newlines
     */
    String seq = entry.getSequence();
    assertEquals("GGTATGTCCTCTAGTACAAAC\n"
            + "ACCCCCAATATTGTGATATAATTAAAAACATAGCAT", seq);

    /*
     * a missing secondaryId is left as null in the parsed entry; it is
     * getSequence() that converts an empty DBRefEntry.version to "0"
     */
    assertNull(entry.getXref().get(1).getSecondaryId());
    assertNull(entry.getFeature().get(0).getXref().get(1).getSecondaryId());
  }

  /**
   * Parses {@link #TESTDATA} with the object under test.
   * 
   * @return the entries read from the test record
   */
  List<EntryType> getEmblEntries()
  {
    // encode explicitly as UTF-8 to agree with the XML declaration, rather
    // than relying on the platform default charset
    return testee.getEmblEntries(new ByteArrayInputStream(
            TESTDATA.getBytes(StandardCharsets.UTF_8)));
  }
}