From d734217a33aa6bfa39a4ac207592b4696ffdc48d Mon Sep 17 00:00:00 2001 From: gmungoc Date: Thu, 26 May 2016 10:27:02 +0100 Subject: [PATCH] JAL-2113 additions to embl_mapping.xml / data models / tests --- resources/embl_mapping.xml | 29 ++- src/jalview/datamodel/FeatureProperties.java | 3 +- src/jalview/datamodel/xdb/embl/EmblEntry.java | 202 ++++++++------------ test/jalview/datamodel/xdb/embl/EmblEntryTest.java | 78 +++++++- test/jalview/datamodel/xdb/embl/EmblFileTest.java | 92 ++++++--- .../jalview/datamodel/xdb/embl/EmblTestHelper.java | 53 +++-- 6 files changed, 274 insertions(+), 183 deletions(-) diff --git a/resources/embl_mapping.xml b/resources/embl_mapping.xml index da1bba4..01b921a 100644 --- a/resources/embl_mapping.xml +++ b/resources/embl_mapping.xml @@ -39,12 +39,9 @@ - - - @@ -52,6 +49,12 @@ + + + + + + @@ -61,13 +64,19 @@ - - + + + + + + + + - - + + - + diff --git a/src/jalview/datamodel/FeatureProperties.java b/src/jalview/datamodel/FeatureProperties.java index d25eb96..2306bec 100644 --- a/src/jalview/datamodel/FeatureProperties.java +++ b/src/jalview/datamodel/FeatureProperties.java @@ -28,8 +28,7 @@ package jalview.datamodel; */ public class FeatureProperties { - - private static final String EMBL_CODING_FEATURE = "CDS"; + public static final String EMBL_CODING_FEATURE = "CDS"; public static final String EXONPOS = "exon number"; diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java index 50a262f..cfe87d9 100644 --- a/src/jalview/datamodel/xdb/embl/EmblEntry.java +++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java @@ -62,21 +62,25 @@ public class EmblEntry String sequenceVersion; + String dataClass; + String moleculeType; String topology; String sequenceLength; - String taxDivision; + String taxonomicDivision; + + String description; - String desc; + String firstPublicDate; - String rCreated; + String firstPublicRelease; - String rLastUpdated; + String lastUpdatedDate; - String lastUpdated; + String lastUpdatedRelease; Vector keywords; @@ -121,23 +125,6 @@ public class EmblEntry } /** - * @return the desc - */ - public String getDesc() - { - return desc; - } - - /** - * @param desc - * the desc to set - */ - public void setDesc(String desc) - { - this.desc = desc; - } - - /** * @return the features */ public Vector getFeatures() @@ -172,57 +159,6 @@ public class EmblEntry } /** - * @return the lastUpdated - */ - public String getLastUpdated() - { - return lastUpdated; - } - - /** - * @param lastUpdated - * the lastUpdated to set - */ - public void setLastUpdated(String lastUpdated) - { - this.lastUpdated = lastUpdated; - } - - /** - * @return the releaseCreated - */ - public String getRCreated() - { - return rCreated; - } - - /** - * @param releaseCreated - * the releaseCreated to set - */ - public void setRCreated(String releaseCreated) - { - this.rCreated = releaseCreated; - } - - /** - * @return the releaseLastUpdated - */ - public String getRLastUpdated() - { - return rLastUpdated; - } - - /** - * @param releaseLastUpdated - * the releaseLastUpdated to set - */ - public void setRLastUpdated(String releaseLastUpdated) - { - this.rLastUpdated = releaseLastUpdated; - } - - /** * @return the sequence */ public EmblSequence getSequence() @@ -240,40 +176,6 @@ public class EmblEntry } /** - * @return the taxDivision - */ - public String getTaxDivision() - { - return taxDivision; - } - - /** - * @param taxDivision - * the taxDivision to set - */ - public void setTaxDivision(String taxDivision) - { - this.taxDivision = taxDivision; - } - - /** - * @return the entry version - */ - public String getEntryVersion() - { - return entryVersion; - } - - /** - * @param version - * the version to set - */ - public void setEntryVersion(String version) - { - this.entryVersion = version; - } - - /** * Recover annotated sequences from EMBL file * * @param sourceDb @@ -285,7 +187,7 @@ public class EmblEntry { SequenceI dna = new Sequence(sourceDb + "|" + accession, sequence.getSequence()); - dna.setDescription(desc); + dna.setDescription(description); DBRefEntry retrievedref = new DBRefEntry(sourceDb, getSequenceVersion(), accession); dna.addDBRef(retrievedref); @@ -780,6 +682,26 @@ public class EmblEntry this.sequenceVersion = sequenceVersion; } + public String getSequenceLength() + { + return sequenceLength; + } + + public void setSequenceLength(String sequenceLength) + { + this.sequenceLength = sequenceLength; + } + + public String getEntryVersion() + { + return entryVersion; + } + + public void setEntryVersion(String entryVersion) + { + this.entryVersion = entryVersion; + } + public String getMoleculeType() { return moleculeType; @@ -800,33 +722,73 @@ public class EmblEntry this.topology = topology; } - public String getSequenceLength() + public String getTaxonomicDivision() { - return sequenceLength; + return taxonomicDivision; } - public void setSequenceLength(String sequenceLength) + public void setTaxonomicDivision(String taxonomicDivision) { - this.sequenceLength = sequenceLength; + this.taxonomicDivision = taxonomicDivision; + } + + public String getDescription() + { + return description; + } + + public void setDescription(String description) + { + this.description = description; + } + + public String getFirstPublicDate() + { + return firstPublicDate; + } + + public void setFirstPublicDate(String firstPublicDate) + { + this.firstPublicDate = firstPublicDate; + } + + public String getFirstPublicRelease() + { + return firstPublicRelease; + } + + public void setFirstPublicRelease(String firstPublicRelease) + { + this.firstPublicRelease = firstPublicRelease; + } + + public String getLastUpdatedDate() + { + return lastUpdatedDate; + } + + public void setLastUpdatedDate(String lastUpdatedDate) + { + this.lastUpdatedDate = lastUpdatedDate; } - public String getrCreated() + public String getLastUpdatedRelease() { - return rCreated; + return lastUpdatedRelease; } - public void setrCreated(String rCreated) + public void setLastUpdatedRelease(String lastUpdatedRelease) { - this.rCreated = rCreated; + this.lastUpdatedRelease = lastUpdatedRelease; } - public String getrLastUpdated() + public String getDataClass() { - return rLastUpdated; + return dataClass; } - public void setrLastUpdated(String rLastUpdated) + public void setDataClass(String dataClass) { - this.rLastUpdated = rLastUpdated; + this.dataClass = dataClass; } } diff --git a/test/jalview/datamodel/xdb/embl/EmblEntryTest.java b/test/jalview/datamodel/xdb/embl/EmblEntryTest.java index c36b7d3..e8760bd 100644 --- a/test/jalview/datamodel/xdb/embl/EmblEntryTest.java +++ b/test/jalview/datamodel/xdb/embl/EmblEntryTest.java @@ -1,7 +1,9 @@ package jalview.datamodel.xdb.embl; import static org.testng.AssertJUnit.assertEquals; +import static org.testng.AssertJUnit.assertSame; +import jalview.datamodel.DBRefEntry; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceI; @@ -36,17 +38,79 @@ public class EmblEntryTest SequenceI dna = new Sequence("J03321", "GGATCCGTAAGTTAGACGAAATT"); List peptides = new ArrayList(); EmblFile ef = EmblTestHelper.getEmblFile(); - EmblFeature feature = null; - for (EmblFeature feat : ef.getEntries().get(0).getFeatures()) + + /* + * parse two CDS features, one with two Uniprot cross-refs, + * the other with one + */ + EmblEntry testee = new EmblEntry(); + for (EmblFeature feature : ef.getEntries().get(0).getFeatures()) { - if ("CDS".equals(feat.getName())) + if ("CDS".equals(feature.getName())) { - feature = feat; - break; + testee.parseCodingFeature(feature, "EMBL", dna, peptides); } } - EmblEntry testee = new EmblEntry(); - testee.parseCodingFeature(feature, "EMBL", dna, peptides); + /* + * peptides should now have five entries: + * EMBL product and two Uniprot accessions for the first CDS / translation + * EMBL product and one Uniprot accession for the second CDS / translation + */ + assertEquals(5, peptides.size()); + assertEquals("CAA30420.1", peptides.get(0).getName()); + assertEquals("MLCF", peptides.get(0).getSequenceAsString()); + assertEquals("UNIPROT|B0BCM4", peptides.get(1).getName()); + assertEquals("MLCF", peptides.get(1).getSequenceAsString()); + assertEquals("UNIPROT|P0CE20", peptides.get(2).getName()); + assertEquals("MLCF", peptides.get(2).getSequenceAsString()); + assertEquals("CAA30421.1", peptides.get(3).getName()); + assertEquals("MSSS", peptides.get(3).getSequenceAsString()); + assertEquals("UNIPROT|B0BCM3", peptides.get(4).getName()); + assertEquals("MSSS", peptides.get(4).getSequenceAsString()); + + /* + * verify dna sequence has dbrefs with mappings to the peptide 'products' + */ + DBRefEntry[] dbrefs = dna.getDBRefs(); + assertEquals(3, dbrefs.length); + DBRefEntry dbRefEntry = dbrefs[0]; + assertEquals("UNIPROT", dbRefEntry.getSource()); + assertEquals("B0BCM4", dbRefEntry.getAccessionId()); + assertSame(peptides.get(1), dbRefEntry.getMap().getTo()); + List fromRanges = dbRefEntry.getMap().getMap().getFromRanges(); + assertEquals(1, fromRanges.size()); + assertEquals(57, fromRanges.get(0)[0]); + assertEquals(46, fromRanges.get(0)[1]); + List toRanges = dbRefEntry.getMap().getMap().getToRanges(); + assertEquals(1, toRanges.size()); + assertEquals(1, toRanges.get(0)[0]); + assertEquals(4, toRanges.get(0)[1]); + + dbRefEntry = dbrefs[1]; + assertEquals("UNIPROT", dbRefEntry.getSource()); + assertEquals("P0CE20", dbRefEntry.getAccessionId()); + assertSame(peptides.get(2), dbRefEntry.getMap().getTo()); + fromRanges = dbRefEntry.getMap().getMap().getFromRanges(); + assertEquals(1, fromRanges.size()); + assertEquals(57, fromRanges.get(0)[0]); + assertEquals(46, fromRanges.get(0)[1]); + toRanges = dbRefEntry.getMap().getMap().getToRanges(); + assertEquals(1, toRanges.size()); + assertEquals(1, toRanges.get(0)[0]); + assertEquals(4, toRanges.get(0)[1]); + + dbRefEntry = dbrefs[2]; + assertEquals("UNIPROT", dbRefEntry.getSource()); + assertEquals("B0BCM3", dbRefEntry.getAccessionId()); + assertSame(peptides.get(4), dbRefEntry.getMap().getTo()); + fromRanges = dbRefEntry.getMap().getMap().getFromRanges(); + assertEquals(1, fromRanges.size()); + assertEquals(4, fromRanges.get(0)[0]); + assertEquals(15, fromRanges.get(0)[1]); + toRanges = dbRefEntry.getMap().getMap().getToRanges(); + assertEquals(1, toRanges.size()); + assertEquals(1, toRanges.get(0)[0]); + assertEquals(4, toRanges.get(0)[1]); } } diff --git a/test/jalview/datamodel/xdb/embl/EmblFileTest.java b/test/jalview/datamodel/xdb/embl/EmblFileTest.java index a62cb87..6955833 100644 --- a/test/jalview/datamodel/xdb/embl/EmblFileTest.java +++ b/test/jalview/datamodel/xdb/embl/EmblFileTest.java @@ -39,28 +39,31 @@ public class EmblFileTest assertEquals(1, entries.size()); EmblEntry entry = entries.get(0); - assertEquals("X53828", entry.getAccession()); - assertEquals( - "Chicken LDH-A mRNA for lactate dehydrogenase A chain (EC 1.1.1.27)", - entry.getDesc()); - assertEquals("2005-04-18", entry.getLastUpdated()); - assertEquals("mRNA", entry.getMoleculeType()); + assertEquals("X07547", entry.getAccession()); + assertEquals("C. trachomatis plasmid", entry.getDescription()); + assertEquals("STD", entry.getDataClass()); + assertEquals("PRO", entry.getTaxonomicDivision()); + assertEquals("1999-02-10", entry.getLastUpdatedDate()); + assertEquals("58", entry.getLastUpdatedRelease()); + assertEquals("1988-11-10", entry.getFirstPublicDate()); + assertEquals("18", entry.getFirstPublicRelease()); + assertEquals("genomic DNA", entry.getMoleculeType()); assertEquals("1", entry.getSequenceVersion()); - assertEquals("3", entry.getEntryVersion()); + assertEquals("8", entry.getEntryVersion()); assertEquals("linear", entry.getTopology()); - assertEquals("1575", entry.getSequenceLength()); + assertEquals("7499", entry.getSequenceLength()); /* * FIXME these assertions fail - values are null - why?? Adding or removing * attributes in the test XML modifies behaviour. eg. inserting an attribute * _before_ lastUpdated results in a null value in this field. */ - // assertEquals("25", entry.getRCreated()); - // assertEquals("83", entry.getRLastUpdated()); + assertEquals("1988-11-10", entry.getFirstPublicDate()); + assertEquals("18", entry.getFirstPublicRelease()); assertEquals(2, entry.getKeywords().size()); - assertEquals("L-lactate dehydrogenase", entry.getKeywords().get(0)); - assertEquals("chutney", entry.getKeywords().get(1)); + assertEquals("plasmid", entry.getKeywords().get(0)); + assertEquals("unidentified reading frame", entry.getKeywords().get(1)); /* * dbrefs @@ -68,52 +71,81 @@ public class EmblFileTest assertEquals(2, entry.getDbRefs().size()); DBRefEntry dbref = entry.getDbRefs().get(0); assertEquals("EuropePMC", dbref.getSource()); - assertEquals("PMC1460223", dbref.getAccessionId()); - assertEquals("9649548", dbref.getVersion()); + assertEquals("PMC107176", dbref.getAccessionId()); + assertEquals("9573186", dbref.getVersion()); dbref = entry.getDbRefs().get(1); assertEquals("MD5", dbref.getSource()); - assertEquals("d3b68", dbref.getAccessionId()); + assertEquals("ac73317", dbref.getAccessionId()); // blank version has been converted to "0" assertEquals("0", dbref.getVersion()); /* - * sequence feature for CDS + * two sequence features for CDS + */ + assertEquals(2, entry.getFeatures().size()); + /* + * first CDS */ - assertEquals(1, entry.getFeatures().size()); EmblFeature ef = entry.getFeatures().get(0); assertEquals("CDS", ef.getName()); - assertEquals("60..1058", ef.getLocation()); + assertEquals("complement(46..57)", ef.getLocation()); assertEquals(2, ef.getDbRefs().size()); dbref = ef.getDbRefs().get(0); - assertEquals("GOA", dbref.getSource()); - assertEquals("P00340", dbref.getAccessionId()); + assertEquals("UniProtKB/Swiss-Prot", dbref.getSource()); + assertEquals("B0BCM4", dbref.getAccessionId()); assertEquals("2.1", dbref.getVersion()); dbref = ef.getDbRefs().get(1); - assertEquals("InterPro", dbref.getSource()); - assertEquals("IPR001236", dbref.getAccessionId()); - // blank version converted to "0": + assertEquals("UniProtKB/Swiss-Prot", dbref.getSource()); + assertEquals("P0CE20", dbref.getAccessionId()); + // blank version gets converted to "0": assertEquals("0", dbref.getVersion()); - assertEquals(2, ef.getQualifiers().size()); - - // feature qualifiers + // CDS feature qualifiers + assertEquals(3, ef.getQualifiers().size()); Qualifier q = ef.getQualifiers().get(0); assertEquals("note", q.getName()); assertEquals(2, q.getValues().length); - assertEquals("L-lactate dehydrogenase A-chain", q.getValues()[0]); + assertEquals("ORF 8 (AA 1-330)", q.getValues()[0]); assertEquals("pickle", q.getValues()[1]); assertNull(q.getEvidence()); q = ef.getQualifiers().get(1); + assertEquals("protein_id", q.getName()); + assertEquals(1, q.getValues().length); + assertEquals("CAA30420.1", q.getValues()[0]); + q = ef.getQualifiers().get(2); assertEquals("translation", q.getName()); assertEquals(1, q.getValues().length); - assertEquals("MSLKDHLIHN", q.getValues()[0]); + assertEquals("MLCF", q.getValues()[0]); assertEquals(1, q.getEvidence().length); assertEquals("Keith", q.getEvidence()[0]); /* - * Sequence + * second CDS + */ + ef = entry.getFeatures().get(1); + assertEquals("CDS", ef.getName()); + assertEquals("4..15", ef.getLocation()); + assertEquals(1, ef.getDbRefs().size()); + dbref = ef.getDbRefs().get(0); + assertEquals("UniProtKB/Swiss-Prot", dbref.getSource()); + assertEquals("B0BCM3", dbref.getAccessionId()); + assertEquals("0", dbref.getVersion()); + assertEquals(2, ef.getQualifiers().size()); + q = ef.getQualifiers().get(0); + assertEquals("protein_id", q.getName()); + assertEquals(1, q.getValues().length); + assertEquals("CAA30421.1", q.getValues()[0]); + q = ef.getQualifiers().get(1); + assertEquals("translation", q.getName()); + assertEquals(1, q.getValues().length); + assertEquals("MSSS", q.getValues()[0]); + + /* + * Sequence - verify newline not converted to space (JAL-2029) */ EmblSequence seq = entry.getSequence(); - assertEquals("GTGACG", seq.getSequence()); + assertEquals( + "GGTATGTCCTCTAGTACAAACACCCCCAATATTGTGATATAATTAAAAACATAGCAT", + seq.getSequence()); /* * getSequence() converts empty DBRefEntry.version to "0" diff --git a/test/jalview/datamodel/xdb/embl/EmblTestHelper.java b/test/jalview/datamodel/xdb/embl/EmblTestHelper.java index 71ca508..9957c72 100644 --- a/test/jalview/datamodel/xdb/embl/EmblTestHelper.java +++ b/test/jalview/datamodel/xdb/embl/EmblTestHelper.java @@ -4,22 +4,47 @@ import java.io.StringReader; public class EmblTestHelper { - // adapted from http://www.ebi.ac.uk/ena/data/view/x53828&display=xml + // adapted from http://www.ebi.ac.uk/ena/data/view/X07547&display=xml + // dna and translations truncated for convenience private static final String TESTDATA = "" + "" - + "" - + "Chicken LDH-A mRNA for lactate dehydrogenase A chain (EC 1.1.1.27)" - + "L-lactate dehydrogenasechutney" - + "" - + "" - + "" - + "" - + "L-lactate dehydrogenase A-chainpickle" - + "MSLKDHLIHNKeith" - // emulate EMBL XML 1.2 which splits sequence data every 60 characters - // see EmblSequence.setSequence - + "" + "GTG\nACG"; + + "" + + "X07574" + + "C. trachomatis plasmid" + + "plasmidunidentified reading frame" + + "" + + "" + /* + * first CDS (range and translation changed to keep test data manageable) + */ + + "" + // test the case of >1 cross-ref to the same database (JAL-2029) + + "" + + "" + + "ORF 8 (AA 1-330)pickle" + + "CAA30420.1" + + "MLCFKeith" + + "" + /* + * second CDS (range and translation changed to keep test data manageable) + */ + + "" + + "" + + "CAA30421.1" + + "MSSS" + + "" + /* + * sequence (modified for test purposes) + * emulates EMBL XML 1.2 which splits sequence data every 60 characters + * see EmblSequence.setSequence + */ + + "GGTATGTCCTCTAGTACAAAC\n" + + "ACCCCCAATATTGTGATATAATTAAAAACATAGCAT" + + ""; static EmblFile getEmblFile() { -- 1.7.10.2