From 77a7aa4d0c1be9ac6d0e6195a3b0ac3cd73f2875 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Thu, 16 Jun 2016 09:49:04 +0100 Subject: [PATCH 1/1] JAL-2112 optimisation of EMBL CDS parsing --- src/jalview/datamodel/xdb/embl/EmblEntry.java | 14 +++++++------- test/jalview/datamodel/xdb/embl/EmblEntryTest.java | 4 +++- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java index 9a07c36..f8c0bbe 100644 --- a/src/jalview/datamodel/xdb/embl/EmblEntry.java +++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java @@ -197,7 +197,7 @@ public class EmblEntry // dbref retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1, 1)); - // TODO: transform EMBL Database refs to canonical form + if (dbRefs != null) { for (DBRefEntry dbref : dbRefs) @@ -206,6 +206,7 @@ public class EmblEntry } } + SequenceIdMatcher matcher = new SequenceIdMatcher(peptides); try { for (EmblFeature feature : features) @@ -219,7 +220,7 @@ public class EmblEntry } if (FeatureProperties.isCodingFeature(sourceDb, feature.getName())) { - parseCodingFeature(feature, sourceDb, dna, peptides); + parseCodingFeature(feature, sourceDb, dna, peptides, matcher); } } } catch (Exception e) @@ -249,7 +250,7 @@ public class EmblEntry * list of protein product sequences for Embl entry */ void parseCodingFeature(EmblFeature feature, String sourceDb, - SequenceI dna, List peptides) + SequenceI dna, List peptides, SequenceIdMatcher matcher) { boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS); @@ -259,7 +260,6 @@ public class EmblEntry String prname = ""; String prid = null; Map vals = new Hashtable(); - SequenceIdMatcher matcher = new SequenceIdMatcher(peptides); /* * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS @@ -283,13 +283,13 @@ public class EmblEntry } else if (qname.equals("protein_id")) { - prid = q.getValues()[0]; + prid = q.getValues()[0].trim(); } else if (qname.equals("codon_start")) { try { - codonStart = Integer.parseInt(q.getValues()[0]); + codonStart = Integer.parseInt(q.getValues()[0].trim()); } catch (NumberFormatException e) { System.err.println("Invalid codon_start in XML for " @@ -299,7 +299,7 @@ public class EmblEntry else if (qname.equals("product")) { // sometimes name is returned e.g. for V00488 - prname = q.getValues()[0]; + prname = q.getValues()[0].trim(); } else { diff --git a/test/jalview/datamodel/xdb/embl/EmblEntryTest.java b/test/jalview/datamodel/xdb/embl/EmblEntryTest.java index e8760bd..3de5e3f 100644 --- a/test/jalview/datamodel/xdb/embl/EmblEntryTest.java +++ b/test/jalview/datamodel/xdb/embl/EmblEntryTest.java @@ -3,6 +3,7 @@ package jalview.datamodel.xdb.embl; import static org.testng.AssertJUnit.assertEquals; import static org.testng.AssertJUnit.assertSame; +import jalview.analysis.SequenceIdMatcher; import jalview.datamodel.DBRefEntry; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceI; @@ -37,6 +38,7 @@ public class EmblEntryTest // not the whole sequence but enough for this test... SequenceI dna = new Sequence("J03321", "GGATCCGTAAGTTAGACGAAATT"); List peptides = new ArrayList(); + SequenceIdMatcher matcher = new SequenceIdMatcher(peptides); EmblFile ef = EmblTestHelper.getEmblFile(); /* @@ -48,7 +50,7 @@ public class EmblEntryTest { if ("CDS".equals(feature.getName())) { - testee.parseCodingFeature(feature, "EMBL", dna, peptides); + testee.parseCodingFeature(feature, "EMBL", dna, peptides, matcher); } } -- 1.7.10.2