<field name="accession" type="string">
<bind-xml name="accession" node="attribute"/>
</field>
- <field name="lastUpdated" type="string">
- <bind-xml name="lastUpdated" node="attribute"/>
- </field>
<!--
- in EMBL XML 1.2 sequence/@version becomes entry/version
- entry/@version becomes entry/@entryVersion
+ in EMBL XML 1.2 sequence/@version became entry/@version
+ entry/@version became entry/@entryVersion
-->
<field name="sequenceVersion" type="string">
<bind-xml name="version" node="attribute"/>
<field name="entryVersion" type="string">
<bind-xml name="entryVersion" node="attribute"/>
</field>
+ <field name="dataClass" type="string">
+ <bind-xml name="dataClass" node="attribute"/>
+ </field>
+ <field name="taxonomicDivision" type="string">
+ <bind-xml name="taxonomicDivision" node="attribute"/>
+ </field>
<field name="moleculeType" type="string">
<bind-xml name="moleculeType" node="attribute"/>
</field>
<field name="topology" type="string">
<bind-xml name="topology" node="attribute" location="type"/>
</field>
- <field name="rCreated" type="string">
- <bind-xml name="releaseCreated" node="attribute"/>
+ <field name="firstPublicDate" type="string">
+ <bind-xml name="firstPublic" node="attribute"/>
+ </field>
+ <field name="firstPublicRelease" type="string">
+ <bind-xml name="firstPublicRelease" node="attribute"/>
+ </field>
+ <field name="lastUpdatedDate" type="string">
+ <bind-xml name="lastUpdated" node="attribute"/>
</field>
- <field name="rLastUpdated" type="string">
- <bind-xml name="releaseLastUpdated" node="attribute"/>
+ <field name="lastUpdatedRelease" type="string">
+ <bind-xml name="lastUpdatedRelease" node="attribute"/>
</field>
- <field name="desc" type="string">
+ <field name="description" type="string">
<bind-xml name="description" node="element"/>
</field>
<field name="keywords" type="string" collection="vector">
*/
public class FeatureProperties
{
-
- private static final String EMBL_CODING_FEATURE = "CDS";
+ public static final String EMBL_CODING_FEATURE = "CDS";
public static final String EXONPOS = "exon number";
String sequenceVersion; // bound to the XML attribute named 'version' by the field mapping
+ String dataClass; // XML attribute 'dataClass', e.g. "STD" in the test entry
+
String moleculeType; // XML attribute 'moleculeType', e.g. "genomic DNA"
String topology; // e.g. "linear"
String sequenceLength; // held as text (e.g. "7499"), not as a number
- String taxDivision;
+ String taxonomicDivision; // XML attribute 'taxonomicDivision', e.g. "PRO"
+
+ String description; // text of the <description> child element
- String desc;
+ String firstPublicDate; // bound to XML attribute 'firstPublic' (name differs from the field), e.g. "1988-11-10"
- String rCreated;
+ String firstPublicRelease; // XML attribute 'firstPublicRelease', e.g. "18"
- String rLastUpdated;
+ String lastUpdatedDate; // bound to XML attribute 'lastUpdated' (name differs from the field), e.g. "1999-02-10"
- String lastUpdated;
+ String lastUpdatedRelease; // XML attribute 'lastUpdatedRelease', e.g. "58"
Vector<String> keywords;
}
/**
- * @return the desc
- */
- public String getDesc()
- {
- return desc;
- }
-
- /**
- * @param desc
- * the desc to set
- */
- public void setDesc(String desc)
- {
- this.desc = desc;
- }
-
- /**
* @return the features
*/
public Vector<EmblFeature> getFeatures()
}
/**
- * @return the lastUpdated
- */
- public String getLastUpdated()
- {
- return lastUpdated;
- }
-
- /**
- * @param lastUpdated
- * the lastUpdated to set
- */
- public void setLastUpdated(String lastUpdated)
- {
- this.lastUpdated = lastUpdated;
- }
-
- /**
- * @return the releaseCreated
- */
- public String getRCreated()
- {
- return rCreated;
- }
-
- /**
- * @param releaseCreated
- * the releaseCreated to set
- */
- public void setRCreated(String releaseCreated)
- {
- this.rCreated = releaseCreated;
- }
-
- /**
- * @return the releaseLastUpdated
- */
- public String getRLastUpdated()
- {
- return rLastUpdated;
- }
-
- /**
- * @param releaseLastUpdated
- * the releaseLastUpdated to set
- */
- public void setRLastUpdated(String releaseLastUpdated)
- {
- this.rLastUpdated = releaseLastUpdated;
- }
-
- /**
* @return the sequence
*/
public EmblSequence getSequence()
}
/**
- * @return the taxDivision
- */
- public String getTaxDivision()
- {
- return taxDivision;
- }
-
- /**
- * @param taxDivision
- * the taxDivision to set
- */
- public void setTaxDivision(String taxDivision)
- {
- this.taxDivision = taxDivision;
- }
-
- /**
- * @return the entry version
- */
- public String getEntryVersion()
- {
- return entryVersion;
- }
-
- /**
- * @param version
- * the version to set
- */
- public void setEntryVersion(String version)
- {
- this.entryVersion = version;
- }
-
- /**
* Recover annotated sequences from EMBL file
*
* @param sourceDb
{
SequenceI dna = new Sequence(sourceDb + "|" + accession,
sequence.getSequence());
- dna.setDescription(desc);
+ dna.setDescription(description);
DBRefEntry retrievedref = new DBRefEntry(sourceDb,
getSequenceVersion(), accession);
dna.addDBRef(retrievedref);
this.sequenceVersion = sequenceVersion;
}
+ public String getSequenceLength() // returns the stored length as text (e.g. "7499"); no numeric parsing is done
+ {
+ return sequenceLength;
+ }
+
+ public void setSequenceLength(String sequenceLength) // stores the value exactly as given; no validation
+ {
+ this.sequenceLength = sequenceLength;
+ }
+
+ public String getEntryVersion() // returns the entry version (XML attribute 'entryVersion'), distinct from the sequence version
+ {
+ return entryVersion;
+ }
+
+ public void setEntryVersion(String entryVersion) // stores the value exactly as given; no validation
+ {
+ this.entryVersion = entryVersion;
+ }
+
public String getMoleculeType()
{
return moleculeType;
this.topology = topology;
}
- public String getSequenceLength()
+ public String getTaxonomicDivision() // returns the taxonomic division code, e.g. "PRO" for the test entry
{
- return sequenceLength;
+ return taxonomicDivision;
}
- public void setSequenceLength(String sequenceLength)
+ public void setTaxonomicDivision(String taxonomicDivision) // stores the value exactly as given; no validation
{
- this.sequenceLength = sequenceLength;
+ this.taxonomicDivision = taxonomicDivision;
+ }
+
+ public String getDescription() // returns the entry's description text (the <description> element in the XML)
+ {
+ return description;
+ }
+
+ public void setDescription(String description) // stores the value exactly as given; no validation
+ {
+ this.description = description;
+ }
+
+ public String getFirstPublicDate() // returns the first-public date as text (e.g. "1988-11-10"); XML attribute is 'firstPublic'
+ {
+ return firstPublicDate;
+ }
+
+ public void setFirstPublicDate(String firstPublicDate) // stores the text as given; the date format is not checked
+ {
+ this.firstPublicDate = firstPublicDate;
+ }
+
+ public String getFirstPublicRelease() // returns the release identifier as text, e.g. "18" for the test entry
+ {
+ return firstPublicRelease;
+ }
+
+ public void setFirstPublicRelease(String firstPublicRelease) // stores the value exactly as given; no validation
+ {
+ this.firstPublicRelease = firstPublicRelease;
+ }
+
+ public String getLastUpdatedDate() // returns the last-updated date as text (e.g. "1999-02-10"); XML attribute is 'lastUpdated'
+ {
+ return lastUpdatedDate;
+ }
+
+ public void setLastUpdatedDate(String lastUpdatedDate) // stores the text as given; the date format is not checked
+ {
+ this.lastUpdatedDate = lastUpdatedDate;
}
- public String getrCreated()
+ public String getLastUpdatedRelease() // returns the release identifier as text, e.g. "58" for the test entry
{
- return rCreated;
+ return lastUpdatedRelease;
}
- public void setrCreated(String rCreated)
+ public void setLastUpdatedRelease(String lastUpdatedRelease) // stores the value exactly as given; no validation
{
- this.rCreated = rCreated;
+ this.lastUpdatedRelease = lastUpdatedRelease;
}
- public String getrLastUpdated()
+ public String getDataClass() // returns the data class code, e.g. "STD" for the test entry
{
- return rLastUpdated;
+ return dataClass;
}
- public void setrLastUpdated(String rLastUpdated)
+ public void setDataClass(String dataClass) // stores the value exactly as given; no validation
{
- this.rLastUpdated = rLastUpdated;
+ this.dataClass = dataClass;
}
}
package jalview.datamodel.xdb.embl;
import static org.testng.AssertJUnit.assertEquals;
+import static org.testng.AssertJUnit.assertSame;
+import jalview.datamodel.DBRefEntry;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceI;
SequenceI dna = new Sequence("J03321", "GGATCCGTAAGTTAGACGAAATT");
List<SequenceI> peptides = new ArrayList<SequenceI>();
EmblFile ef = EmblTestHelper.getEmblFile();
- EmblFeature feature = null;
- for (EmblFeature feat : ef.getEntries().get(0).getFeatures())
+
+ /*
+ * parse two CDS features, one with two Uniprot cross-refs,
+ * the other with one
+ */
+ EmblEntry testee = new EmblEntry();
+ for (EmblFeature feature : ef.getEntries().get(0).getFeatures())
{
- if ("CDS".equals(feat.getName()))
+ if ("CDS".equals(feature.getName()))
{
- feature = feat;
- break;
+ testee.parseCodingFeature(feature, "EMBL", dna, peptides);
}
}
- EmblEntry testee = new EmblEntry();
- testee.parseCodingFeature(feature, "EMBL", dna, peptides);
+ /*
+ * peptides should now have five entries:
+ * EMBL product and two Uniprot accessions for the first CDS / translation
+ * EMBL product and one Uniprot accession for the second CDS / translation
+ */
+ assertEquals(5, peptides.size());
+ assertEquals("CAA30420.1", peptides.get(0).getName());
+ assertEquals("MLCF", peptides.get(0).getSequenceAsString());
+ assertEquals("UNIPROT|B0BCM4", peptides.get(1).getName());
+ assertEquals("MLCF", peptides.get(1).getSequenceAsString());
+ assertEquals("UNIPROT|P0CE20", peptides.get(2).getName());
+ assertEquals("MLCF", peptides.get(2).getSequenceAsString());
+ assertEquals("CAA30421.1", peptides.get(3).getName());
+ assertEquals("MSSS", peptides.get(3).getSequenceAsString());
+ assertEquals("UNIPROT|B0BCM3", peptides.get(4).getName());
+ assertEquals("MSSS", peptides.get(4).getSequenceAsString());
+
+ /*
+ * verify dna sequence has dbrefs with mappings to the peptide 'products'
+ */
+ DBRefEntry[] dbrefs = dna.getDBRefs();
+ assertEquals(3, dbrefs.length);
+ DBRefEntry dbRefEntry = dbrefs[0];
+ assertEquals("UNIPROT", dbRefEntry.getSource());
+ assertEquals("B0BCM4", dbRefEntry.getAccessionId());
+ assertSame(peptides.get(1), dbRefEntry.getMap().getTo());
+ List<int[]> fromRanges = dbRefEntry.getMap().getMap().getFromRanges();
+ assertEquals(1, fromRanges.size());
+ assertEquals(57, fromRanges.get(0)[0]);
+ assertEquals(46, fromRanges.get(0)[1]);
+ List<int[]> toRanges = dbRefEntry.getMap().getMap().getToRanges();
+ assertEquals(1, toRanges.size());
+ assertEquals(1, toRanges.get(0)[0]);
+ assertEquals(4, toRanges.get(0)[1]);
+
+ dbRefEntry = dbrefs[1];
+ assertEquals("UNIPROT", dbRefEntry.getSource());
+ assertEquals("P0CE20", dbRefEntry.getAccessionId());
+ assertSame(peptides.get(2), dbRefEntry.getMap().getTo());
+ fromRanges = dbRefEntry.getMap().getMap().getFromRanges();
+ assertEquals(1, fromRanges.size());
+ assertEquals(57, fromRanges.get(0)[0]);
+ assertEquals(46, fromRanges.get(0)[1]);
+ toRanges = dbRefEntry.getMap().getMap().getToRanges();
+ assertEquals(1, toRanges.size());
+ assertEquals(1, toRanges.get(0)[0]);
+ assertEquals(4, toRanges.get(0)[1]);
+
+ dbRefEntry = dbrefs[2];
+ assertEquals("UNIPROT", dbRefEntry.getSource());
+ assertEquals("B0BCM3", dbRefEntry.getAccessionId());
+ assertSame(peptides.get(4), dbRefEntry.getMap().getTo());
+ fromRanges = dbRefEntry.getMap().getMap().getFromRanges();
+ assertEquals(1, fromRanges.size());
+ assertEquals(4, fromRanges.get(0)[0]);
+ assertEquals(15, fromRanges.get(0)[1]);
+ toRanges = dbRefEntry.getMap().getMap().getToRanges();
+ assertEquals(1, toRanges.size());
+ assertEquals(1, toRanges.get(0)[0]);
+ assertEquals(4, toRanges.get(0)[1]);
}
}
assertEquals(1, entries.size());
EmblEntry entry = entries.get(0);
- assertEquals("X53828", entry.getAccession());
- assertEquals(
- "Chicken LDH-A mRNA for lactate dehydrogenase A chain (EC 1.1.1.27)",
- entry.getDesc());
- assertEquals("2005-04-18", entry.getLastUpdated());
- assertEquals("mRNA", entry.getMoleculeType());
+ assertEquals("X07547", entry.getAccession());
+ assertEquals("C. trachomatis plasmid", entry.getDescription());
+ assertEquals("STD", entry.getDataClass());
+ assertEquals("PRO", entry.getTaxonomicDivision());
+ assertEquals("1999-02-10", entry.getLastUpdatedDate());
+ assertEquals("58", entry.getLastUpdatedRelease());
+ assertEquals("1988-11-10", entry.getFirstPublicDate());
+ assertEquals("18", entry.getFirstPublicRelease());
+ assertEquals("genomic DNA", entry.getMoleculeType());
assertEquals("1", entry.getSequenceVersion());
- assertEquals("3", entry.getEntryVersion());
+ assertEquals("8", entry.getEntryVersion());
assertEquals("linear", entry.getTopology());
- assertEquals("1575", entry.getSequenceLength());
+ assertEquals("7499", entry.getSequenceLength());
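/*
* NOTE(review): the release assertions that used to sit here failed with null
* values - adding or removing attributes in the test XML modified behaviour,
* eg. inserting an attribute _before_ lastUpdated resulted in a null value in
* that field. The cause is not explained here. The checks below are now
* enabled and repeat the firstPublic assertions made earlier in this test.
*/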
- // assertEquals("25", entry.getRCreated());
- // assertEquals("83", entry.getRLastUpdated());
+ assertEquals("1988-11-10", entry.getFirstPublicDate());
+ assertEquals("18", entry.getFirstPublicRelease());
assertEquals(2, entry.getKeywords().size());
- assertEquals("L-lactate dehydrogenase", entry.getKeywords().get(0));
- assertEquals("chutney", entry.getKeywords().get(1));
+ assertEquals("plasmid", entry.getKeywords().get(0));
+ assertEquals("unidentified reading frame", entry.getKeywords().get(1));
/*
* dbrefs
assertEquals(2, entry.getDbRefs().size());
DBRefEntry dbref = entry.getDbRefs().get(0);
assertEquals("EuropePMC", dbref.getSource());
- assertEquals("PMC1460223", dbref.getAccessionId());
- assertEquals("9649548", dbref.getVersion());
+ assertEquals("PMC107176", dbref.getAccessionId());
+ assertEquals("9573186", dbref.getVersion());
dbref = entry.getDbRefs().get(1);
assertEquals("MD5", dbref.getSource());
- assertEquals("d3b68", dbref.getAccessionId());
+ assertEquals("ac73317", dbref.getAccessionId());
// blank version has been converted to "0"
assertEquals("0", dbref.getVersion());
/*
- * sequence feature for CDS
+ * two sequence features for CDS
+ */
+ assertEquals(2, entry.getFeatures().size());
+ /*
+ * first CDS
*/
- assertEquals(1, entry.getFeatures().size());
EmblFeature ef = entry.getFeatures().get(0);
assertEquals("CDS", ef.getName());
- assertEquals("60..1058", ef.getLocation());
+ assertEquals("complement(46..57)", ef.getLocation());
assertEquals(2, ef.getDbRefs().size());
dbref = ef.getDbRefs().get(0);
- assertEquals("GOA", dbref.getSource());
- assertEquals("P00340", dbref.getAccessionId());
+ assertEquals("UniProtKB/Swiss-Prot", dbref.getSource());
+ assertEquals("B0BCM4", dbref.getAccessionId());
assertEquals("2.1", dbref.getVersion());
dbref = ef.getDbRefs().get(1);
- assertEquals("InterPro", dbref.getSource());
- assertEquals("IPR001236", dbref.getAccessionId());
- // blank version converted to "0":
+ assertEquals("UniProtKB/Swiss-Prot", dbref.getSource());
+ assertEquals("P0CE20", dbref.getAccessionId());
+ // blank version gets converted to "0":
assertEquals("0", dbref.getVersion());
- assertEquals(2, ef.getQualifiers().size());
-
- // feature qualifiers
+ // CDS feature qualifiers
+ assertEquals(3, ef.getQualifiers().size());
Qualifier q = ef.getQualifiers().get(0);
assertEquals("note", q.getName());
assertEquals(2, q.getValues().length);
- assertEquals("L-lactate dehydrogenase A-chain", q.getValues()[0]);
+ assertEquals("ORF 8 (AA 1-330)", q.getValues()[0]);
assertEquals("pickle", q.getValues()[1]);
assertNull(q.getEvidence());
q = ef.getQualifiers().get(1);
+ assertEquals("protein_id", q.getName());
+ assertEquals(1, q.getValues().length);
+ assertEquals("CAA30420.1", q.getValues()[0]);
+ q = ef.getQualifiers().get(2);
assertEquals("translation", q.getName());
assertEquals(1, q.getValues().length);
- assertEquals("MSLKDHLIHN", q.getValues()[0]);
+ assertEquals("MLCF", q.getValues()[0]);
assertEquals(1, q.getEvidence().length);
assertEquals("Keith", q.getEvidence()[0]);
/*
- * Sequence
+ * second CDS
+ */
+ ef = entry.getFeatures().get(1);
+ assertEquals("CDS", ef.getName());
+ assertEquals("4..15", ef.getLocation());
+ assertEquals(1, ef.getDbRefs().size());
+ dbref = ef.getDbRefs().get(0);
+ assertEquals("UniProtKB/Swiss-Prot", dbref.getSource());
+ assertEquals("B0BCM3", dbref.getAccessionId());
+ assertEquals("0", dbref.getVersion());
+ assertEquals(2, ef.getQualifiers().size());
+ q = ef.getQualifiers().get(0);
+ assertEquals("protein_id", q.getName());
+ assertEquals(1, q.getValues().length);
+ assertEquals("CAA30421.1", q.getValues()[0]);
+ q = ef.getQualifiers().get(1);
+ assertEquals("translation", q.getName());
+ assertEquals(1, q.getValues().length);
+ assertEquals("MSSS", q.getValues()[0]);
+
+ /*
+ * Sequence - verify newline not converted to space (JAL-2029)
*/
EmblSequence seq = entry.getSequence();
- assertEquals("GTGACG", seq.getSequence());
+ assertEquals(
+ "GGTATGTCCTCTAGTACAAACACCCCCAATATTGTGATATAATTAAAAACATAGCAT",
+ seq.getSequence());
/*
* getSequence() converts empty DBRefEntry.version to "0"
public class EmblTestHelper
{
- // adapted from http://www.ebi.ac.uk/ena/data/view/x53828&display=xml
+ // adapted from http://www.ebi.ac.uk/ena/data/view/X07547&display=xml
+ // dna and translations truncated for convenience
private static final String TESTDATA = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
+ "<ROOT>"
- + "<entry accession=\"X53828\" entryVersion=\"3\" lastUpdated=\"2005-04-18\" releaseCreated=\"25\" releaseLastUpdated=\"83\""
- + " version=\"1\" moleculeType=\"mRNA\" topology=\"linear\" sequenceLength=\"1575\">"
- + "<description>Chicken LDH-A mRNA for lactate dehydrogenase A chain (EC 1.1.1.27)</description>"
- + "<keyword>L-lactate dehydrogenase</keyword><keyword>chutney</keyword>"
- + "<xref db=\"EuropePMC\" id=\"PMC1460223\" secondaryId=\"9649548\" />"
- + "<xref db=\"MD5\" id=\"d3b68\" />"
- + "<feature name=\"CDS\" location=\"60..1058\">"
- + "<xref db=\"GOA\" id=\"P00340\" secondaryId=\"2.1\" /><xref db=\"InterPro\" id=\"IPR001236\" />"
- + "<qualifier name=\"note\"><value>L-lactate dehydrogenase A-chain</value><value>pickle</value></qualifier>"
- + "<qualifier name=\"translation\"><value>MSLKDHLIHN</value><evidence>Keith</evidence></qualifier>"
- // emulate EMBL XML 1.2 which splits sequence data every 60 characters
- // see EmblSequence.setSequence
- + "</feature>" + "<sequence>GTG\nACG</sequence></entry></ROOT>";
+ + "<entry accession=\"X07547\" version=\"1\" entryVersion=\"8\""
+ + " dataClass=\"STD\" taxonomicDivision=\"PRO\""
+ + " moleculeType=\"genomic DNA\" sequenceLength=\"7499\" topology=\"linear\""
+ + " firstPublic=\"1988-11-10\" firstPublicRelease=\"18\""
+ + " lastUpdated=\"1999-02-10\" lastUpdatedRelease=\"58\">"
+ + "<secondaryAccession>X07574</secondaryAccession>"
+ + "<description>C. trachomatis plasmid</description>"
+ + "<keyword>plasmid</keyword><keyword>unidentified reading frame</keyword>"
+ + "<xref db=\"EuropePMC\" id=\"PMC107176\" secondaryId=\"9573186\" />"
+ + "<xref db=\"MD5\" id=\"ac73317\" />"
+ /*
+ * first CDS (range and translation changed to keep test data manageable)
+ */
+ + "<feature name=\"CDS\" location=\"complement(46..57)\">"
+ // test the case of >1 cross-ref to the same database (JAL-2029)
+ + "<xref db=\"UniProtKB/Swiss-Prot\" id=\"B0BCM4\" secondaryId=\"2.1\" />"
+ + "<xref db=\"UniProtKB/Swiss-Prot\" id=\"P0CE20\" />"
+ + "<qualifier name=\"note\"><value>ORF 8 (AA 1-330)</value><value>pickle</value></qualifier>"
+ + "<qualifier name=\"protein_id\"><value>CAA30420.1</value></qualifier>"
+ + "<qualifier name=\"translation\"><value>MLCF</value><evidence>Keith</evidence></qualifier>"
+ + "</feature>"
+ /*
+ * second CDS (range and translation changed to keep test data manageable)
+ */
+ + "<feature name=\"CDS\" location=\"4..15\">"
+ + "<xref db=\"UniProtKB/Swiss-Prot\" id=\"B0BCM3\" />"
+ + "<qualifier name=\"protein_id\"><value>CAA30421.1</value></qualifier>"
+ + "<qualifier name=\"translation\"><value>MSSS</value></qualifier>"
+ + "</feature>"
+ /*
+ * sequence (modified for test purposes)
+ * emulates EMBL XML 1.2 which splits sequence data every 60 characters
+ * see EmblSequence.setSequence
+ */
+ + "<sequence>GGTATGTCCTCTAGTACAAAC\n"
+ + "ACCCCCAATATTGTGATATAATTAAAAACATAGCAT"
+ + "</sequence></entry></ROOT>";
static EmblFile getEmblFile()
{