X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2FEmblFlatFile.java;fp=src%2Fjalview%2Fio%2FEmblFlatFile.java;h=900aef82205f867947617c424a3c2c256e56e3a9;hb=049cfa5fc078e6daac6158f25cea7d9a60b48969;hp=13f224bcd1ab7441198365f3155fd5072d01594e;hpb=84478e43fc8694f4c0c4c16515faf813744194da;p=jalview.git diff --git a/src/jalview/io/EmblFlatFile.java b/src/jalview/io/EmblFlatFile.java index 13f224b..900aef8 100644 --- a/src/jalview/io/EmblFlatFile.java +++ b/src/jalview/io/EmblFlatFile.java @@ -47,6 +47,8 @@ public class EmblFlatFile extends AlignFile // FileParse { private static final String QUOTE = "\""; + private static final String DOUBLED_QUOTE = QUOTE + QUOTE; + /** * A data bean class to hold values parsed from one CDS Feature (FT) */ @@ -103,7 +105,7 @@ public class EmblFlatFile extends AlignFile // FileParse super(false, fp); // don't parse immediately this.sourceDb = sourceId; dbrefs = new ArrayList<>(); - + /* * using TreeMap gives CDS sequences in alphabetical, so readable, order */ @@ -322,6 +324,7 @@ public class EmblFlatFile extends AlignFile // FileParse CdsData data = new CdsData(); data.cdsLocation = tokens[2]; + // TODO location can be over >1 line e.g. EAW51554 line = nextLine(); while (line != null) @@ -334,48 +337,50 @@ public class EmblFlatFile extends AlignFile // FileParse /* * extract qualifier, e.g. FT /protein_id="CAA37824.1" + * - the value may extend over more than one line + * - if the value has enclosing quotes, these are removed + * - escaped double quotes ("") are reduced to a single character */ int slashPos = line.indexOf('/'); if (slashPos == -1) { Cache.log.error("Unexpected EMBL line ignored: " + line); + line = nextLine(); continue; } int eqPos = line.indexOf('=', slashPos + 1); if (eqPos == -1) { // can happen, e.g. /ribosomal_slippage -// Cache.log.error("Unexpected EMBL line ignored: " + line); + // Cache.log.error("Unexpected EMBL line ignored: " + line); line = nextLine(); continue; } String qualifier = line.substring(slashPos + 1, eqPos); String value = line.substring(eqPos + 1); - if (value.startsWith(QUOTE) && value.endsWith(QUOTE)) - { - value = value.substring(1, value.length() - 1); - } + value = removeQuotes(value); + StringBuilder sb = new StringBuilder().append(value); + line = parseFeatureQualifier(sb, qualifier); + String featureValue = sb.toString(); if ("protein_id".equals(qualifier)) { - data.proteinId = value; - line = nextLine(); + data.proteinId = featureValue; } else if ("codon_start".equals(qualifier)) { try { - data.codonStart = Integer.parseInt(value.trim()); + data.codonStart = Integer.parseInt(featureValue.trim()); } catch (NumberFormatException e) { Cache.log.error("Invalid codon_start in XML for " + this.accession + ": " + e.getMessage()); } - line = nextLine(); } else if ("db_xref".equals(qualifier)) { - String[] parts = value.split(":"); + String[] parts = featureValue.split(":"); if (parts.length == 2) { String db = parts[0].trim(); @@ -383,23 +388,19 @@ public class EmblFlatFile extends AlignFile // FileParse DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim()); data.xrefs.add(dbref); } - line = nextLine(); } else if ("product".equals(qualifier)) { - // sometimes name is returned e.g. for V00488 - data.proteinName = value; - line = nextLine(); + data.proteinName = featureValue; } else if ("translation".equals(qualifier)) { - line = parseTranslation(value, data); + data.translation = featureValue; } - else if (!"".equals(value)) + else if (!"".equals(featureValue)) { // throw anything else into the additional properties hash - data.cdsProps.put(qualifier, value); - line = nextLine(); + data.cdsProps.put(qualifier, featureValue); } } @@ -417,20 +418,58 @@ public class EmblFlatFile extends AlignFile // FileParse } /** - * Reads and returns the CDS translation from one or more lines of the file, - * and returns the next line after that + * Removes leading or trailing double quotes (") unless doubled, and changes + * any 'escaped' (doubled) double quotes to single characters. As per the + * Feature Table specification for Qualifiers, Free Text. * * @param value - * the first line of the translation (likely quoted) - * @param data * @return - * @throws IOException */ - String parseTranslation(String value, CdsData data) throws IOException + static String removeQuotes(String value) { - StringBuilder sb = new StringBuilder(this.length / 3 + 1); - sb.append(value.replace(QUOTE, "")); + if (value == null) + { + return null; + } + if (value.startsWith(QUOTE) && !value.startsWith(DOUBLED_QUOTE)) + { + value = value.substring(1); + } + if (value.endsWith(QUOTE) && !value.endsWith(DOUBLED_QUOTE)) + { + value = value.substring(0, value.length() - 1); + } + value = value.replace(DOUBLED_QUOTE, QUOTE); + return value; + } + /** + * Reads the value of a feature (FT) qualifier from one or more lines of the + * file, and returns the next line after that. Values are appended to the + * string buffer, which should be already primed with the value read from the + * first line for the qualifier (with any leading double quote removed). + * Enclosing double quotes are removed, and escaped (repeated) double quotes + * reduced to one only. For example for + * + *
+   * FT      /note="gene_id=hCG28070.3 
+   * FT      ""foobar"" isoform=CRA_b"
+   * the returned value is
+   * gene_id=hCG28070.3 "foobar" isoform=CRA_b
+   * 
+ * + * Note the side-effect of this method, to advance data reading to the next + * line after the feature qualifier. + * + * @param sb + * a string buffer primed with the first line of the value + * @param qualifierName + * @return + * @throws IOException + */ + String parseFeatureQualifier(StringBuilder sb, String qualifierName) + throws IOException + { String line; while ((line = nextLine()) != null) { @@ -441,17 +480,30 @@ public class EmblFlatFile extends AlignFile // FileParse String[] tokens = line.split(WHITESPACE); if (tokens.length < 2) { - Cache.log.error("Ignoring bad EMBL line: " + line); + Cache.log.error("Ignoring bad EMBL line for " + this.accession + + ": " + line); break; } if (tokens[1].startsWith("/")) { break; // next feature qualifier } - sb.append(tokens[1].replace(QUOTE, "")); - } - data.translation = sb.toString(); + /* + * heuristic rule: most multi-line value (e.g. /product) are text, + * so add a space for word boundary at a new line; not for translation + */ + if (!"translation".equals(qualifierName)) + { + sb.append(" "); + } + + /* + * remove trailing " and unescape doubled "" + */ + String data = removeQuotes(tokens[1]); + sb.append(data); + } return line; } @@ -461,6 +513,12 @@ public class EmblFlatFile extends AlignFile // FileParse */ void buildSequence() { + if (this.accession == null || this.sequenceString == null) + { + Cache.log.error("Failed to parse data from EMBL"); + return; + } + String name = this.accession; if (this.sourceDb != null) { @@ -611,7 +669,7 @@ public class EmblFlatFile extends AlignFile // FileParse map.setMappedFromId(data.proteinId); dnaToEmblProteinRef.setMap(map); dna.addDBRef(dnaToEmblProteinRef); - } + } /* * comment brought forward from EmblXmlSource, lines 447-451: