2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.ws.dbsources;
23 import static org.testng.AssertJUnit.assertEquals;
24 import static org.testng.AssertJUnit.assertNotNull;
25 import static org.testng.AssertJUnit.assertNull;
26 import static org.testng.AssertJUnit.assertSame;
27 import static org.testng.AssertJUnit.assertTrue;
29 import jalview.datamodel.DBRefEntry;
30 import jalview.datamodel.DBRefSource;
31 import jalview.datamodel.SequenceI;
32 import jalview.util.MapList;
33 import jalview.xml.binding.embl.EntryType;
34 import jalview.xml.binding.embl.EntryType.Feature;
35 import jalview.xml.binding.embl.EntryType.Feature.Qualifier;
36 import jalview.xml.binding.embl.XrefType;
38 import java.io.ByteArrayInputStream;
39 import java.util.ArrayList;
40 import java.util.Arrays;
41 import java.util.List;
43 import org.testng.annotations.Test;
45 public class EmblSourceTest
48 // adapted from http://www.ebi.ac.uk/ena/data/view/X07547&display=xml
49 // dna and translations truncated for convenience
50 static final String TESTDATA = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
52 + "<entry accession=\"X07547\" version=\"1\" entryVersion=\"8\""
53 + " dataClass=\"STD\" taxonomicDivision=\"PRO\""
54 + " moleculeType=\"genomic DNA\" sequenceLength=\"7499\" topology=\"linear\""
55 + " firstPublic=\"1988-11-10\" firstPublicRelease=\"18\""
56 + " lastUpdated=\"1999-02-10\" lastUpdatedRelease=\"58\">"
57 + "<secondaryAccession>X07574</secondaryAccession>"
58 + "<description>C. trachomatis plasmid</description>"
59 + "<keyword>plasmid</keyword><keyword>unidentified reading frame</keyword>"
60 + "<xref db=\"EuropePMC\" id=\"PMC107176\" secondaryId=\"9573186\" />"
61 + "<xref db=\"MD5\" id=\"ac73317\" />"
63 * first CDS (range and translation changed to keep test data manageable)
65 + "<feature name=\"CDS\" location=\"complement(46..57)\">"
66 // test the case of >1 cross-ref to the same database (JAL-2029)
67 + "<xref db=\"UniProtKB/Swiss-Prot\" id=\"B0BCM4\" secondaryId=\"2.1\" />"
68 + "<xref db=\"UniProtKB/Swiss-Prot\" id=\"P0CE20\" />"
69 + "<qualifier name=\"note\"><value>ORF 8 (AA 1-330)</value></qualifier>"
70 + "<qualifier name=\"protein_id\"><value>CAA30420.1</value></qualifier>"
71 + "<qualifier name=\"translation\"><value>MLCF</value></qualifier>"
74 * second CDS (range and translation changed to keep test data manageable)
76 + "<feature name=\"CDS\" location=\"4..15\">"
77 + "<xref db=\"UniProtKB/Swiss-Prot\" id=\"B0BCM3\" />"
78 + "<qualifier name=\"protein_id\"><value>CAA30421.1</value></qualifier>"
79 + "<qualifier name=\"translation\"><value>MSSS</value></qualifier>"
82 * third CDS is made up - has no xref - code should synthesize
83 * one to an assumed EMBLCDSPROTEIN accession
85 + "<feature name=\"CDS\" location=\"join(4..6,10..15)\">"
86 + "<qualifier name=\"protein_id\"><value>CAA12345.6</value></qualifier>"
87 + "<qualifier name=\"translation\"><value>MSS</value></qualifier>"
90 * sequence (modified for test purposes)
91 * emulates EMBL XML 1.2 which splits sequence data every 60 characters
92 * see EmblSequence.setSequence
94 + "<sequence>GGTATGTCCTCTAGTACAAAC\n"
95 + "ACCCCCAATATTGTGATATAATTAAAAACATAGCAT"
96 + "</sequence></entry></ROOT>";
98 @Test(groups = "Functional")
99 public void testGetCdsRanges()
101 EmblSource testee = new EmblSource();
104 * Make a (CDS) Feature with 5 locations
106 Feature cds = new Feature();
107 cds.setLocation("join(10..20,complement(30..40),50..60,70..80,complement(110..120))");
109 int[] exons = testee.getCdsRanges("EMBL", cds);
110 assertEquals("[10, 20, 40, 30, 50, 60, 70, 80, 120, 110]",
111 Arrays.toString(exons));
114 @Test(groups = "Functional")
115 public void testGetSequence()
117 // not the whole sequence but enough for this test...
118 List<SequenceI> peptides = new ArrayList<>();
119 List<EntryType> entries = EmblSourceTest.getEmblEntries();
120 assertEquals(1, entries.size());
121 EntryType entry = entries.get(0);
122 EmblSource testee = new EmblSource();
123 String sourceDb = "EMBL";
124 SequenceI dna = testee.getSequence(sourceDb, entry, peptides);
127 * newline has been removed from sequence
129 String seq = dna.getSequenceAsString();
131 "GGTATGTCCTCTAGTACAAACACCCCCAATATTGTGATATAATTAAAAACATAGCAT",
135 * peptides should now have six entries:
136 * EMBL product and two Uniprot accessions for the first CDS / translation
137 * EMBL product and one Uniprot accession for the second CDS / "
138 * EMBL product only for the third
140 assertEquals(6, peptides.size());
141 assertEquals("CAA30420.1", peptides.get(0).getName());
142 assertEquals("MLCF", peptides.get(0).getSequenceAsString());
143 assertEquals("UNIPROT|B0BCM4", peptides.get(1).getName());
144 assertEquals("MLCF", peptides.get(1).getSequenceAsString());
145 assertEquals("UNIPROT|P0CE20", peptides.get(2).getName());
146 assertEquals("MLCF", peptides.get(2).getSequenceAsString());
147 assertEquals("CAA30421.1", peptides.get(3).getName());
148 assertEquals("MSSS", peptides.get(3).getSequenceAsString());
149 assertEquals("UNIPROT|B0BCM3", peptides.get(4).getName());
150 assertEquals("MSSS", peptides.get(4).getSequenceAsString());
151 assertEquals("CAA12345.6", peptides.get(5).getName());
152 assertEquals("MSS", peptides.get(5).getSequenceAsString());
155 * verify dna sequence has dbrefs
156 * - to 'self' (synthesized dbref)
158 * - to MD5 (with null version as "0")
159 * - with CDS mappings to the peptide 'products'
161 MapList mapToSelf = new MapList(new int[] { 1, 57 },
164 MapList cds1Map = new MapList(new int[] { 57, 46 }, new int[] { 1, 4 },
166 MapList cds2Map = new MapList(new int[] { 4, 15 }, new int[] { 1, 4 },
168 MapList cds3Map = new MapList(new int[] { 4, 6, 10, 15 }, new int[] {
171 DBRefEntry[] dbrefs = dna.getDBRefs();
172 assertEquals(7, dbrefs.length);
174 DBRefEntry dbRefEntry = dbrefs[0];
175 assertEquals("EMBL", dbRefEntry.getSource());
176 assertEquals("X07547", dbRefEntry.getAccessionId());
177 assertEquals("1", dbRefEntry.getVersion());
178 assertNotNull(dbRefEntry.getMap());
179 assertNull(dbRefEntry.getMap().getTo());
180 assertEquals(mapToSelf, dbRefEntry.getMap().getMap());
182 dbRefEntry = dbrefs[1];
183 // DBRefEntry constructor puts dbSource in upper case
184 assertEquals("EUROPEPMC", dbRefEntry.getSource());
185 assertEquals("PMC107176", dbRefEntry.getAccessionId());
186 assertEquals("9573186", dbRefEntry.getVersion());
187 assertNull(dbRefEntry.getMap());
189 dbRefEntry = dbrefs[2];
190 assertEquals("MD5", dbRefEntry.getSource());
191 assertEquals("ac73317", dbRefEntry.getAccessionId());
192 assertEquals("0", dbRefEntry.getVersion());
193 assertNull(dbRefEntry.getMap());
195 dbRefEntry = dbrefs[3];
196 assertEquals("UNIPROT", dbRefEntry.getSource());
197 assertEquals("B0BCM4", dbRefEntry.getAccessionId());
198 assertSame(peptides.get(1), dbRefEntry.getMap().getTo());
199 assertEquals(cds1Map, dbRefEntry.getMap().getMap());
201 dbRefEntry = dbrefs[4];
202 assertEquals("UNIPROT", dbRefEntry.getSource());
203 assertEquals("P0CE20", dbRefEntry.getAccessionId());
204 assertSame(peptides.get(2), dbRefEntry.getMap().getTo());
205 assertEquals(cds1Map, dbRefEntry.getMap().getMap());
207 dbRefEntry = dbrefs[5];
208 assertEquals("UNIPROT", dbRefEntry.getSource());
209 assertEquals("B0BCM3", dbRefEntry.getAccessionId());
210 assertSame(peptides.get(4), dbRefEntry.getMap().getTo());
211 assertEquals(cds2Map, dbRefEntry.getMap().getMap());
213 dbRefEntry = dbrefs[6];
214 assertEquals("EMBLCDSPROTEIN", dbRefEntry.getSource());
215 assertEquals("CAA12345.6", dbRefEntry.getAccessionId());
216 assertSame(peptides.get(5), dbRefEntry.getMap().getTo());
217 assertEquals(cds3Map, dbRefEntry.getMap().getMap());
220 * verify peptides have dbrefs
221 * - to EMBL sequence (with inverse 1:3 cds mapping)
222 * - to EMBLCDS (with 1:3 mapping)
223 * - direct (no mapping) to other protein accessions
225 MapList proteinToCdsMap1 = new MapList(new int[] { 1, 4 }, new int[] {
227 MapList proteinToCdsMap2 = new MapList(new int[] { 1, 3 }, new int[] {
230 // dbrefs for first CDS EMBL product CAA30420.1
231 dbrefs = peptides.get(0).getDBRefs();
232 assertEquals(5, dbrefs.length);
233 assertEquals(DBRefSource.EMBL, dbrefs[0].getSource());
234 assertEquals("CAA30420.1", dbrefs[0].getAccessionId());
235 // TODO: verify getPrimaryDBRefs() for peptide products
236 assertEquals(cds1Map.getInverse(), dbrefs[0].getMap().getMap());
237 assertEquals(DBRefSource.EMBLCDS, dbrefs[1].getSource());
238 assertEquals("CAA30420.1", dbrefs[1].getAccessionId());
239 assertEquals(proteinToCdsMap1, dbrefs[1].getMap().getMap());
240 assertEquals(DBRefSource.EMBLCDSProduct, dbrefs[2].getSource());
241 assertEquals("CAA30420.1", dbrefs[2].getAccessionId());
242 assertNull(dbrefs[2].getMap());
243 assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "2.1", "B0BCM4"),
245 assertNull(dbrefs[3].getMap());
246 assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "0", "P0CE20"),
248 assertNull(dbrefs[4].getMap());
250 // dbrefs for first CDS first Uniprot xref
251 dbrefs = peptides.get(1).getDBRefs();
252 assertEquals(2, dbrefs.length);
253 assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "2.1", "B0BCM4"),
255 assertNull(dbrefs[0].getMap());
256 assertEquals(DBRefSource.EMBL, dbrefs[1].getSource());
257 assertEquals("X07547", dbrefs[1].getAccessionId());
258 assertEquals(cds1Map.getInverse(), dbrefs[1].getMap().getMap());
260 // dbrefs for first CDS second Uniprot xref
261 dbrefs = peptides.get(2).getDBRefs();
262 assertEquals(2, dbrefs.length);
263 assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "0", "P0CE20"),
265 assertNull(dbrefs[0].getMap());
266 assertEquals(DBRefSource.EMBL, dbrefs[1].getSource());
267 assertEquals("X07547", dbrefs[1].getAccessionId());
268 assertEquals(cds1Map.getInverse(), dbrefs[1].getMap().getMap());
270 // dbrefs for second CDS EMBL product CAA30421.1
271 dbrefs = peptides.get(3).getDBRefs();
272 assertEquals(4, dbrefs.length);
273 assertEquals(DBRefSource.EMBL, dbrefs[0].getSource());
274 assertEquals("CAA30421.1", dbrefs[0].getAccessionId());
275 assertEquals(cds2Map.getInverse(), dbrefs[0].getMap().getMap());
276 assertEquals(DBRefSource.EMBLCDS, dbrefs[1].getSource());
277 assertEquals("CAA30421.1", dbrefs[1].getAccessionId());
278 assertEquals(proteinToCdsMap1, dbrefs[1].getMap().getMap());
279 assertEquals(DBRefSource.EMBLCDSProduct, dbrefs[2].getSource());
280 assertEquals("CAA30421.1", dbrefs[2].getAccessionId());
281 assertNull(dbrefs[2].getMap());
282 assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "0", "B0BCM3"),
284 assertNull(dbrefs[3].getMap());
286 // dbrefs for second CDS Uniprot xref
287 dbrefs = peptides.get(4).getDBRefs();
288 assertEquals(2, dbrefs.length);
289 assertEquals(new DBRefEntry(DBRefSource.UNIPROT, "0", "B0BCM3"),
291 assertNull(dbrefs[0].getMap());
292 assertEquals(DBRefSource.EMBL, dbrefs[1].getSource());
293 assertEquals("X07547", dbrefs[1].getAccessionId());
294 assertEquals(cds2Map.getInverse(), dbrefs[1].getMap().getMap());
296 // dbrefs for third CDS inferred EMBL product CAA12345.6
297 dbrefs = peptides.get(5).getDBRefs();
298 assertEquals(3, dbrefs.length);
299 assertEquals(DBRefSource.EMBL, dbrefs[0].getSource());
300 assertEquals("CAA12345.6", dbrefs[0].getAccessionId());
301 assertEquals(cds3Map.getInverse(), dbrefs[0].getMap().getMap());
302 assertEquals(DBRefSource.EMBLCDS, dbrefs[1].getSource());
303 assertEquals("CAA12345.6", dbrefs[1].getAccessionId());
304 assertEquals(proteinToCdsMap2, dbrefs[1].getMap().getMap());
305 assertEquals(DBRefSource.EMBLCDSProduct, dbrefs[2].getSource());
306 assertEquals("CAA12345.6", dbrefs[2].getAccessionId());
307 assertNull(dbrefs[2].getMap());
310 @Test(groups = "Functional")
311 public void testAdjustForProteinLength()
313 int[] exons = new int[] { 11, 15, 21, 25, 31, 38 }; // 18 bp
315 // exact length match:
316 assertSame(exons, EmblXmlSource.adjustForProteinLength(6, exons));
318 // match if we assume exons include stop codon not in protein:
319 assertSame(exons, EmblXmlSource.adjustForProteinLength(5, exons));
321 // truncate last exon by 6bp
322 int[] truncated = EmblXmlSource.adjustForProteinLength(4, exons);
323 assertEquals("[11, 15, 21, 25, 31, 32]", Arrays.toString(truncated));
325 // remove last exon and truncate preceding by 1bp
326 truncated = EmblXmlSource.adjustForProteinLength(3, exons);
327 assertEquals("[11, 15, 21, 24]", Arrays.toString(truncated));
329 // exact removal of exon case:
330 exons = new int[] { 11, 15, 21, 27, 33, 38 }; // 18 bp
331 truncated = EmblXmlSource.adjustForProteinLength(4, exons);
332 assertEquals("[11, 15, 21, 27]", Arrays.toString(truncated));
334 // what if exons are too short for protein?
335 truncated = EmblXmlSource.adjustForProteinLength(7, exons);
336 assertSame(exons, truncated);
339 @Test(groups = { "Functional" })
340 public void testGetEmblEntries()
342 List<EntryType> entries = EmblSourceTest.getEmblEntries();
343 assertEquals(1, entries.size());
344 EntryType entry = entries.get(0);
346 assertEquals("X07547", entry.getAccession());
347 assertEquals("C. trachomatis plasmid", entry.getDescription());
348 assertEquals("STD", entry.getDataClass());
349 assertEquals("PRO", entry.getTaxonomicDivision());
350 assertEquals("1999-02-10", entry.getLastUpdated().toString());
351 assertEquals(58, entry.getLastUpdatedRelease().intValue());
352 assertEquals("1988-11-10", entry.getFirstPublic().toString());
353 assertEquals(18, entry.getFirstPublicRelease().intValue());
354 assertEquals("genomic DNA", entry.getMoleculeType());
355 assertEquals(1, entry.getVersion().intValue());
356 assertEquals(8, entry.getEntryVersion().intValue());
357 assertEquals("linear", entry.getTopology());
358 assertEquals(7499, entry.getSequenceLength().intValue());
359 assertEquals(2, entry.getKeyword().size());
360 assertEquals("plasmid", entry.getKeyword().get(0));
361 assertEquals("unidentified reading frame", entry.getKeyword().get(1));
366 assertEquals(2, entry.getXref().size());
367 XrefType dbref = entry.getXref().get(0);
368 assertEquals("EuropePMC", dbref.getDb());
369 assertEquals("PMC107176", dbref.getId());
370 assertEquals("9573186", dbref.getSecondaryId());
371 dbref = entry.getXref().get(1);
372 assertEquals("MD5", dbref.getDb());
373 assertEquals("ac73317", dbref.getId());
374 assertNull(dbref.getSecondaryId());
377 * three sequence features for CDS
379 assertEquals(3, entry.getFeature().size());
383 Feature ef = entry.getFeature().get(0);
384 assertEquals("CDS", ef.getName());
385 assertEquals("complement(46..57)", ef.getLocation());
386 assertEquals(2, ef.getXref().size());
387 dbref = ef.getXref().get(0);
388 assertEquals("UniProtKB/Swiss-Prot", dbref.getDb());
389 assertEquals("B0BCM4", dbref.getId());
390 assertEquals("2.1", dbref.getSecondaryId());
391 dbref = ef.getXref().get(1);
392 assertEquals("UniProtKB/Swiss-Prot", dbref.getDb());
393 assertEquals("P0CE20", dbref.getId());
394 assertNull(dbref.getSecondaryId());
395 // CDS feature qualifiers
396 assertEquals(3, ef.getQualifier().size());
397 Qualifier q = ef.getQualifier().get(0);
398 assertEquals("note", q.getName());
399 assertEquals("ORF 8 (AA 1-330)", q.getValue());
400 q = ef.getQualifier().get(1);
401 assertEquals("protein_id", q.getName());
402 assertEquals("CAA30420.1", q.getValue());
403 q = ef.getQualifier().get(2);
404 assertEquals("translation", q.getName());
405 assertEquals("MLCF", q.getValue());
410 ef = entry.getFeature().get(1);
411 assertEquals("CDS", ef.getName());
412 assertEquals("4..15", ef.getLocation());
413 assertEquals(1, ef.getXref().size());
414 dbref = ef.getXref().get(0);
415 assertEquals("UniProtKB/Swiss-Prot", dbref.getDb());
416 assertEquals("B0BCM3", dbref.getId());
417 assertNull(dbref.getSecondaryId());
418 assertEquals(2, ef.getQualifier().size());
419 q = ef.getQualifier().get(0);
420 assertEquals("protein_id", q.getName());
421 assertEquals("CAA30421.1", q.getValue());
422 q = ef.getQualifier().get(1);
423 assertEquals("translation", q.getName());
424 assertEquals("MSSS", q.getValue());
429 ef = entry.getFeature().get(2);
430 assertEquals("CDS", ef.getName());
431 assertEquals("join(4..6,10..15)", ef.getLocation());
432 assertNotNull(ef.getXref());
433 assertTrue(ef.getXref().isEmpty());
434 assertEquals(2, ef.getQualifier().size());
435 q = ef.getQualifier().get(0);
436 assertEquals("protein_id", q.getName());
437 assertEquals("CAA12345.6", q.getValue());
438 q = ef.getQualifier().get(1);
439 assertEquals("translation", q.getName());
440 assertEquals("MSS", q.getValue());
443 * Sequence - raw data before removal of newlines
445 String seq = entry.getSequence();
447 "GGTATGTCCTCTAGTACAAAC\n"
448 + "ACCCCCAATATTGTGATATAATTAAAAACATAGCAT",
452 * getSequence() converts empty DBRefEntry.version to "0"
454 assertNull(entry.getXref().get(1).getSecondaryId());
455 assertNull(entry.getFeature().get(0).getXref().get(1).getSecondaryId());
458 static List<EntryType> getEmblEntries()
460 return new EmblSource()
461 .getEmblEntries(new ByteArrayInputStream(TESTDATA.getBytes()));