From 049cfa5fc078e6daac6158f25cea7d9a60b48969 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Wed, 5 Aug 2020 16:53:17 +0100 Subject: [PATCH] JAL-3692 parse multiline feature qualifiers and escaped quotes --- src/jalview/io/EmblFlatFile.java | 122 ++++++++++++++++------ src/jalview/ws/dbsources/EmblFlatfileSource.java | 10 +- src/jalview/ws/ebi/EBIFetchClient.java | 5 +- test/jalview/io/EmblFlatFileTest.java | 17 ++- 4 files changed, 115 insertions(+), 39 deletions(-) diff --git a/src/jalview/io/EmblFlatFile.java b/src/jalview/io/EmblFlatFile.java index 13f224b..900aef8 100644 --- a/src/jalview/io/EmblFlatFile.java +++ b/src/jalview/io/EmblFlatFile.java @@ -47,6 +47,8 @@ public class EmblFlatFile extends AlignFile // FileParse { private static final String QUOTE = "\""; + private static final String DOUBLED_QUOTE = QUOTE + QUOTE; + /** * A data bean class to hold values parsed from one CDS Feature (FT) */ @@ -103,7 +105,7 @@ public class EmblFlatFile extends AlignFile // FileParse super(false, fp); // don't parse immediately this.sourceDb = sourceId; dbrefs = new ArrayList<>(); - + /* * using TreeMap gives CDS sequences in alphabetical, so readable, order */ @@ -322,6 +324,7 @@ public class EmblFlatFile extends AlignFile // FileParse CdsData data = new CdsData(); data.cdsLocation = tokens[2]; + // TODO location can be over >1 line e.g. EAW51554 line = nextLine(); while (line != null) @@ -334,48 +337,50 @@ public class EmblFlatFile extends AlignFile // FileParse /* * extract qualifier, e.g. FT /protein_id="CAA37824.1" + * - the value may extend over more than one line + * - if the value has enclosing quotes, these are removed + * - escaped double quotes ("") are reduced to a single character */ int slashPos = line.indexOf('/'); if (slashPos == -1) { Cache.log.error("Unexpected EMBL line ignored: " + line); + line = nextLine(); continue; } int eqPos = line.indexOf('=', slashPos + 1); if (eqPos == -1) { // can happen, e.g. /ribosomal_slippage -// Cache.log.error("Unexpected EMBL line ignored: " + line); + // Cache.log.error("Unexpected EMBL line ignored: " + line); line = nextLine(); continue; } String qualifier = line.substring(slashPos + 1, eqPos); String value = line.substring(eqPos + 1); - if (value.startsWith(QUOTE) && value.endsWith(QUOTE)) - { - value = value.substring(1, value.length() - 1); - } + value = removeQuotes(value); + StringBuilder sb = new StringBuilder().append(value); + line = parseFeatureQualifier(sb, qualifier); + String featureValue = sb.toString(); if ("protein_id".equals(qualifier)) { - data.proteinId = value; - line = nextLine(); + data.proteinId = featureValue; } else if ("codon_start".equals(qualifier)) { try { - data.codonStart = Integer.parseInt(value.trim()); + data.codonStart = Integer.parseInt(featureValue.trim()); } catch (NumberFormatException e) { Cache.log.error("Invalid codon_start in XML for " + this.accession + ": " + e.getMessage()); } - line = nextLine(); } else if ("db_xref".equals(qualifier)) { - String[] parts = value.split(":"); + String[] parts = featureValue.split(":"); if (parts.length == 2) { String db = parts[0].trim(); @@ -383,23 +388,19 @@ public class EmblFlatFile extends AlignFile // FileParse DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim()); data.xrefs.add(dbref); } - line = nextLine(); } else if ("product".equals(qualifier)) { - // sometimes name is returned e.g. for V00488 - data.proteinName = value; - line = nextLine(); + data.proteinName = featureValue; } else if ("translation".equals(qualifier)) { - line = parseTranslation(value, data); + data.translation = featureValue; } - else if (!"".equals(value)) + else if (!"".equals(featureValue)) { // throw anything else into the additional properties hash - data.cdsProps.put(qualifier, value); - line = nextLine(); + data.cdsProps.put(qualifier, featureValue); } } @@ -417,20 +418,58 @@ public class EmblFlatFile extends AlignFile // FileParse } /** - * Reads and returns the CDS translation from one or more lines of the file, - * and returns the next line after that + * Removes leading or trailing double quotes (") unless doubled, and changes + * any 'escaped' (doubled) double quotes to single characters. As per the + * Feature Table specification for Qualifiers, Free Text. * * @param value - * the first line of the translation (likely quoted) - * @param data * @return - * @throws IOException */ - String parseTranslation(String value, CdsData data) throws IOException + static String removeQuotes(String value) { - StringBuilder sb = new StringBuilder(this.length / 3 + 1); - sb.append(value.replace(QUOTE, "")); + if (value == null) + { + return null; + } + if (value.startsWith(QUOTE) && !value.startsWith(DOUBLED_QUOTE)) + { + value = value.substring(1); + } + if (value.endsWith(QUOTE) && !value.endsWith(DOUBLED_QUOTE)) + { + value = value.substring(0, value.length() - 1); + } + value = value.replace(DOUBLED_QUOTE, QUOTE); + return value; + } + /** + * Reads the value of a feature (FT) qualifier from one or more lines of the + * file, and returns the next line after that. Values are appended to the + * string buffer, which should be already primed with the value read from the + * first line for the qualifier (with any leading double quote removed). + * Enclosing double quotes are removed, and escaped (repeated) double quotes + * reduced to one only. For example for + * + *
+   * FT      /note="gene_id=hCG28070.3 
+   * FT      ""foobar"" isoform=CRA_b"
+   * the returned value is
+   * gene_id=hCG28070.3 "foobar" isoform=CRA_b
+   * 
+ * + * Note the side-effect of this method, to advance data reading to the next + * line after the feature qualifier. + * + * @param sb + * a string buffer primed with the first line of the value + * @param qualifierName + * @return + * @throws IOException + */ + String parseFeatureQualifier(StringBuilder sb, String qualifierName) + throws IOException + { String line; while ((line = nextLine()) != null) { @@ -441,17 +480,30 @@ public class EmblFlatFile extends AlignFile // FileParse String[] tokens = line.split(WHITESPACE); if (tokens.length < 2) { - Cache.log.error("Ignoring bad EMBL line: " + line); + Cache.log.error("Ignoring bad EMBL line for " + this.accession + + ": " + line); break; } if (tokens[1].startsWith("/")) { break; // next feature qualifier } - sb.append(tokens[1].replace(QUOTE, "")); - } - data.translation = sb.toString(); + /* + * heuristic rule: most multi-line value (e.g. /product) are text, + * so add a space for word boundary at a new line; not for translation + */ + if (!"translation".equals(qualifierName)) + { + sb.append(" "); + } + + /* + * remove trailing " and unescape doubled "" + */ + String data = removeQuotes(tokens[1]); + sb.append(data); + } return line; } @@ -461,6 +513,12 @@ public class EmblFlatFile extends AlignFile // FileParse */ void buildSequence() { + if (this.accession == null || this.sequenceString == null) + { + Cache.log.error("Failed to parse data from EMBL"); + return; + } + String name = this.accession; if (this.sourceDb != null) { @@ -611,7 +669,7 @@ public class EmblFlatFile extends AlignFile // FileParse map.setMappedFromId(data.proteinId); dnaToEmblProteinRef.setMap(map); dna.addDBRef(dnaToEmblProteinRef); - } + } /* * comment brought forward from EmblXmlSource, lines 447-451: diff --git a/src/jalview/ws/dbsources/EmblFlatfileSource.java b/src/jalview/ws/dbsources/EmblFlatfileSource.java index 2353f22..6536958 100644 --- a/src/jalview/ws/dbsources/EmblFlatfileSource.java +++ b/src/jalview/ws/dbsources/EmblFlatfileSource.java @@ -73,12 +73,12 @@ public abstract class EmblFlatfileSource extends EbiFileRetrievedProxy try { reply = dbFetch.fetchDataAsFile( - dbName.toLowerCase() + ":" + query.trim(), null, "txt"); + dbName.toLowerCase() + ":" + query.trim(), null, "gz"); } catch (Exception e) { stopQuery(); throw new Exception( - String.format("EBI EMBL XML retrieval failed for %s:%s", + String.format("EBI EMBL retrieval failed for %s:%s", dbName.toLowerCase(), query.trim()), e); } @@ -112,4 +112,10 @@ public abstract class EmblFlatfileSource extends EbiFileRetrievedProxy stopQuery(); return al; } + + @Override + public boolean isDnaCoding() + { + return true; + } } diff --git a/src/jalview/ws/ebi/EBIFetchClient.java b/src/jalview/ws/ebi/EBIFetchClient.java index 9a77087..8ab5fbb 100644 --- a/src/jalview/ws/ebi/EBIFetchClient.java +++ b/src/jalview/ws/ebi/EBIFetchClient.java @@ -295,9 +295,8 @@ public class EBIFetchClient if (database.equalsIgnoreCase(DBRefSource.EMBL) || database.equalsIgnoreCase(DBRefSource.EMBLCDS)) { -// url = "https://www.ebi.ac.uk/ena/data/view/" + ids.toLowerCase() -// + (format != null ? "&" + format : ""); - url = "https://www.ebi.ac.uk/ena/browser/api/embl/" + ids.toLowerCase(); + url = "https://www.ebi.ac.uk/ena/browser/api/embl/" + + ids.toLowerCase() + "?download=true&gzip=true"; } else { diff --git a/test/jalview/io/EmblFlatFileTest.java b/test/jalview/io/EmblFlatFileTest.java index 35b378b..2898a06 100644 --- a/test/jalview/io/EmblFlatFileTest.java +++ b/test/jalview/io/EmblFlatFileTest.java @@ -242,11 +242,14 @@ public class EmblFlatFileTest public void testParse_noUniprotXref() throws IOException { // MN908947 cut down to 40BP, one CDS, length 5 peptide for test purposes + // plus an additional (invented) test case: + // - multi-line /product qualifier including escaped quotes String data = "ID MN908947; SV 3; linear; genomic RNA; STD; VRL; 20 BP.\n" + "DE Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1,\n" + "FT CDS 3..17\n" + "FT /protein_id=\"QHD43415.1\"\n" - + "FT /product=\"orf1ab polyprotein\"\n" + + "FT /product=\"orf1ab polyprotein\n" + + "FT \"\"foobar\"\" \"\n" + "FT /translation=\"MRKLD\n" + "SQ Sequence 7496 BP; 2450 A; 1290 C; 1434 G; 2322 T; 0 other;\n" + " ggatGcgtaa gttagacgaa attttgtctt tgcgcacaga 40\n"; @@ -284,7 +287,8 @@ public class EmblFlatFileTest mapping = dbref.getMap(); SequenceI mapTo = mapping.getTo(); assertEquals(mapTo.getName(), "QHD43415.1"); - assertEquals(mapTo.getDescription(), "orf1ab polyprotein"); + // the /product qualifier transfers to protein product description + assertEquals(mapTo.getDescription(), "orf1ab polyprotein \"foobar\""); assertEquals(mapTo.getSequenceAsString(), "MRKLD"); map = mapping.getMap(); assertEquals(map.getFromLowest(), 3); @@ -323,4 +327,13 @@ public class EmblFlatFileTest truncated = EmblFlatFile.adjustForProteinLength(7, exons); assertSame(exons, truncated); } + + @Test(groups = "Functional") + public void testRemoveQuotes() + { + assertNull(EmblFlatFile.removeQuotes(null)); + assertEquals(EmblFlatFile.removeQuotes("No quotes here"), "No quotes here"); + assertEquals(EmblFlatFile.removeQuotes("\"Enclosing quotes\""), "Enclosing quotes"); + assertEquals(EmblFlatFile.removeQuotes("\"Escaped \"\"quotes\"\" example\""), "Escaped \"quotes\" example"); + } } -- 1.7.10.2