From: gmungoc Date: Thu, 16 Jun 2016 08:49:04 +0000 (+0100) Subject: JAL-2112 optimisation of EMBL CDS parsing X-Git-Tag: Release_2_10_0~175 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=77a7aa4d0c1be9ac6d0e6195a3b0ac3cd73f2875;hp=1fea862e99e1fdd5eb85d5feaec2d28de58828b4;p=jalview.git JAL-2112 optimisation of EMBL CDS parsing --- diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java index 9a07c36..f8c0bbe 100644 --- a/src/jalview/datamodel/xdb/embl/EmblEntry.java +++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java @@ -197,7 +197,7 @@ public class EmblEntry // dbref retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1, 1)); - // TODO: transform EMBL Database refs to canonical form + if (dbRefs != null) { for (DBRefEntry dbref : dbRefs) @@ -206,6 +206,7 @@ public class EmblEntry } } + SequenceIdMatcher matcher = new SequenceIdMatcher(peptides); try { for (EmblFeature feature : features) @@ -219,7 +220,7 @@ public class EmblEntry } if (FeatureProperties.isCodingFeature(sourceDb, feature.getName())) { - parseCodingFeature(feature, sourceDb, dna, peptides); + parseCodingFeature(feature, sourceDb, dna, peptides, matcher); } } } catch (Exception e) @@ -249,7 +250,7 @@ public class EmblEntry * list of protein product sequences for Embl entry */ void parseCodingFeature(EmblFeature feature, String sourceDb, - SequenceI dna, List peptides) + SequenceI dna, List peptides, SequenceIdMatcher matcher) { boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS); @@ -259,7 +260,6 @@ public class EmblEntry String prname = ""; String prid = null; Map vals = new Hashtable(); - SequenceIdMatcher matcher = new SequenceIdMatcher(peptides); /* * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS @@ -283,13 +283,13 @@ public class EmblEntry } else if (qname.equals("protein_id")) { - prid = q.getValues()[0]; + prid = q.getValues()[0].trim(); } else if (qname.equals("codon_start")) { try { - codonStart = Integer.parseInt(q.getValues()[0]); + codonStart = Integer.parseInt(q.getValues()[0].trim()); } catch (NumberFormatException e) { System.err.println("Invalid codon_start in XML for " @@ -299,7 +299,7 @@ public class EmblEntry else if (qname.equals("product")) { // sometimes name is returned e.g. for V00488 - prname = q.getValues()[0]; + prname = q.getValues()[0].trim(); } else { diff --git a/test/jalview/datamodel/xdb/embl/EmblEntryTest.java b/test/jalview/datamodel/xdb/embl/EmblEntryTest.java index e8760bd..3de5e3f 100644 --- a/test/jalview/datamodel/xdb/embl/EmblEntryTest.java +++ b/test/jalview/datamodel/xdb/embl/EmblEntryTest.java @@ -3,6 +3,7 @@ package jalview.datamodel.xdb.embl; import static org.testng.AssertJUnit.assertEquals; import static org.testng.AssertJUnit.assertSame; +import jalview.analysis.SequenceIdMatcher; import jalview.datamodel.DBRefEntry; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceI; @@ -37,6 +38,7 @@ public class EmblEntryTest // not the whole sequence but enough for this test... SequenceI dna = new Sequence("J03321", "GGATCCGTAAGTTAGACGAAATT"); List peptides = new ArrayList(); + SequenceIdMatcher matcher = new SequenceIdMatcher(peptides); EmblFile ef = EmblTestHelper.getEmblFile(); /* @@ -48,7 +50,7 @@ public class EmblEntryTest { if ("CDS".equals(feature.getName())) { - testee.parseCodingFeature(feature, "EMBL", dna, peptides); + testee.parseCodingFeature(feature, "EMBL", dna, peptides, matcher); } }