From cb79cecb722e9e3b17eb3013ed24f61c5fc6c800 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Tue, 18 Aug 2020 14:59:12 +0100 Subject: [PATCH] JAL-3692 parse multiline CDS location correctly --- src/jalview/io/EmblFlatFile.java | 17 +++++++++++------ test/jalview/io/EmblFlatFileTest.java | 10 +++++++++- test/jalview/io/J03321.embl.txt | 4 +++- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/src/jalview/io/EmblFlatFile.java b/src/jalview/io/EmblFlatFile.java index 900aef8..bfae4ed 100644 --- a/src/jalview/io/EmblFlatFile.java +++ b/src/jalview/io/EmblFlatFile.java @@ -118,6 +118,7 @@ public class EmblFlatFile extends AlignFile // FileParse * * @throws IOException */ + @Override public void parse() throws IOException { String line = nextLine(); @@ -322,11 +323,14 @@ public class EmblFlatFile extends AlignFile // FileParse return nextLine(); } + /* + * parse location - which may be over more than one line e.g. EAW51554 + */ CdsData data = new CdsData(); - data.cdsLocation = tokens[2]; - // TODO location can be over >1 line e.g. EAW51554 + StringBuilder sb = new StringBuilder().append(tokens[2]); + line = parseFeatureQualifier(sb, "CDS"); + data.cdsLocation = sb.toString(); - line = nextLine(); while (line != null) { if (!line.startsWith("FT ")) // 4 spaces @@ -359,7 +363,7 @@ public class EmblFlatFile extends AlignFile // FileParse String qualifier = line.substring(slashPos + 1, eqPos); String value = line.substring(eqPos + 1); value = removeQuotes(value); - StringBuilder sb = new StringBuilder().append(value); + sb = new StringBuilder().append(value); line = parseFeatureQualifier(sb, qualifier); String featureValue = sb.toString(); @@ -427,7 +431,7 @@ public class EmblFlatFile extends AlignFile // FileParse */ static String removeQuotes(String value) { - if (value == null) + if (value == null) { return null; } @@ -493,7 +497,8 @@ public class EmblFlatFile extends AlignFile // FileParse * heuristic rule: most multi-line value (e.g. /product) are text, * so add a space for word boundary at a new line; not for translation */ - if (!"translation".equals(qualifierName)) + if (!"translation".equals(qualifierName) + && !"CDS".equals(qualifierName)) { sb.append(" "); } diff --git a/test/jalview/io/EmblFlatFileTest.java b/test/jalview/io/EmblFlatFileTest.java index 2898a06..5d8ef21 100644 --- a/test/jalview/io/EmblFlatFileTest.java +++ b/test/jalview/io/EmblFlatFileTest.java @@ -3,9 +3,9 @@ package jalview.io; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertTrue; import static org.testng.AssertJUnit.assertNotNull; +import static org.testng.AssertJUnit.assertNull; import static org.testng.AssertJUnit.assertSame; import static org.testng.AssertJUnit.fail; -import static org.testng.AssertJUnit.assertNull; import java.io.File; import java.io.IOException; @@ -14,8 +14,10 @@ import java.util.Arrays; import java.util.List; import java.util.Set; +import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; +import jalview.bin.Cache; import jalview.datamodel.DBRefEntry; import jalview.datamodel.Mapping; import jalview.datamodel.SequenceFeature; @@ -25,6 +27,12 @@ import jalview.util.MapList; public class EmblFlatFileTest { + @BeforeClass(alwaysRun = true) + public void setUp() + { + Cache.initLogger(); + } + /** * A fairly tough test, using J03321 (circular DNA), which has 8 CDS features, * one of them reverse strand diff --git a/test/jalview/io/J03321.embl.txt b/test/jalview/io/J03321.embl.txt index 92065b9..b0bc512 100644 --- a/test/jalview/io/J03321.embl.txt +++ b/test/jalview/io/J03321.embl.txt @@ -38,6 +38,7 @@ XX CC Draft entry and computer-readable sequence kindly submitted by CC G.Ratti, 28-MAR-1990. XX +XX ! first CDS location below split across two lines for test purposes ! FH Key Location/Qualifiers FH FT source 1..7502 @@ -48,7 +49,8 @@ FT /serotype="D" FT /mol_type="genomic DNA" FT /isolation_source="trachoma" FT /db_xref="taxon:813" -FT CDS join(7022..7502,1..437) +FT CDS join(7022..7502, +FT 1..437) FT /codon_start=1 FT /transl_table=11 FT /product="hypothetical protein" -- 1.7.10.2