JAL-3692 parse multiline feature qualifiers and escaped quotes

author gmungoc <g.m.carstairs@dundee.ac.uk>

Wed, 5 Aug 2020 15:53:17 +0000 (16:53 +0100)

committer Jim Procter <jprocter@issues.jalview.org>

Wed, 5 Aug 2020 16:29:02 +0000 (17:29 +0100)
author gmungoc <g.m.carstairs@dundee.ac.uk>
Wed, 5 Aug 2020 15:53:17 +0000 (16:53 +0100)
committer Jim Procter <jprocter@issues.jalview.org>
Wed, 5 Aug 2020 16:29:02 +0000 (17:29 +0100)
diff --git a/src/jalview/io/EmblFlatFile.java b/src/jalview/io/EmblFlatFile.java

index 13f224b..900aef8 100644 (file)
--- a/src/jalview/io/EmblFlatFile.java
+++ b/src/jalview/io/EmblFlatFile.java
@@ -47,6 +47,8 @@ public class EmblFlatFile extends AlignFile // FileParse
  {
    private static final String QUOTE = "\"";
  
+  private static final String DOUBLED_QUOTE = QUOTE + QUOTE;
+
    /**
     * A data bean class to hold values parsed from one CDS Feature (FT)
     */
@@ -103,7 +105,7 @@ public class EmblFlatFile extends AlignFile // FileParse
      super(false, fp); // don't parse immediately
      this.sourceDb = sourceId;
      dbrefs = new ArrayList<>();
-    
+
      /*
       * using TreeMap gives CDS sequences in alphabetical, so readable, order
       */
@@ -322,6 +324,7 @@ public class EmblFlatFile extends AlignFile // FileParse
  
      CdsData data = new CdsData();
      data.cdsLocation = tokens[2];
+    // TODO location can span more than one line, e.g. EAW51554
  
      line = nextLine();
      while (line != null)
@@ -334,48 +337,50 @@ public class EmblFlatFile extends AlignFile // FileParse
  
        /*
         * extract qualifier, e.g. FT    /protein_id="CAA37824.1"
+       * - the value may extend over more than one line
+       * - if the value has enclosing quotes, these are removed
+       * - escaped double quotes ("") are reduced to a single character
         */
        int slashPos = line.indexOf('/');
        if (slashPos == -1)
        {
          Cache.log.error("Unexpected EMBL line ignored: " + line);
+        line = nextLine();
          continue;
        }
        int eqPos = line.indexOf('=', slashPos + 1);
        if (eqPos == -1)
        {
          // can happen, e.g. /ribosomal_slippage
-//        Cache.log.error("Unexpected EMBL line ignored: " + line);
+        // Cache.log.error("Unexpected EMBL line ignored: " + line);
          line = nextLine();
          continue;
        }
        String qualifier = line.substring(slashPos + 1, eqPos);
        String value = line.substring(eqPos + 1);
-      if (value.startsWith(QUOTE) && value.endsWith(QUOTE))
-      {
-        value = value.substring(1, value.length() - 1);
-      }
+      value = removeQuotes(value);
+      StringBuilder sb = new StringBuilder().append(value);
+      line = parseFeatureQualifier(sb, qualifier);
+      String featureValue = sb.toString();
  
        if ("protein_id".equals(qualifier))
        {
-        data.proteinId = value;
-        line = nextLine();
+        data.proteinId = featureValue;
        }
        else if ("codon_start".equals(qualifier))
        {
          try
          {
-          data.codonStart = Integer.parseInt(value.trim());
+          data.codonStart = Integer.parseInt(featureValue.trim());
          } catch (NumberFormatException e)
          {
            Cache.log.error("Invalid codon_start in XML for " + this.accession
                    + ": " + e.getMessage());
          }
-        line = nextLine();
        }
        else if ("db_xref".equals(qualifier))
        {
-        String[] parts = value.split(":");
+        String[] parts = featureValue.split(":");
          if (parts.length == 2)
          {
            String db = parts[0].trim();
@@ -383,23 +388,19 @@ public class EmblFlatFile extends AlignFile // FileParse
            DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
            data.xrefs.add(dbref);
          }
-        line = nextLine();
        }
        else if ("product".equals(qualifier))
        {
-        // sometimes name is returned e.g. for V00488
-        data.proteinName = value;
-        line = nextLine();
+        data.proteinName = featureValue;
        }
        else if ("translation".equals(qualifier))
        {
-        line = parseTranslation(value, data);
+        data.translation = featureValue;
        }
-      else if (!"".equals(value))
+      else if (!"".equals(featureValue))
        {
          // throw anything else into the additional properties hash
-        data.cdsProps.put(qualifier, value);
-        line = nextLine();
+        data.cdsProps.put(qualifier, featureValue);
        }
      }
  
@@ -417,20 +418,58 @@ public class EmblFlatFile extends AlignFile // FileParse
    }
  
    /**
-   * Reads and returns the CDS translation from one or more lines of the file,
-   * and returns the next line after that
+   * Removes leading or trailing double quotes (") unless doubled, and changes
+   * any 'escaped' (doubled) double quotes to single characters, as per the
+   * Feature Table specification for Qualifiers, Free Text.
     * 
     * @param value
-   *          the first line of the translation (likely quoted)
-   * @param data
     * @return
-   * @throws IOException
     */
-  String parseTranslation(String value, CdsData data) throws IOException
+  static String removeQuotes(String value)
    {
-    StringBuilder sb = new StringBuilder(this.length / 3 + 1);
-    sb.append(value.replace(QUOTE, ""));
+    if (value == null) 
+    {
+      return null;
+    }
+    if (value.startsWith(QUOTE) && !value.startsWith(DOUBLED_QUOTE))
+    {
+      value = value.substring(1);
+    }
+    if (value.endsWith(QUOTE) && !value.endsWith(DOUBLED_QUOTE))
+    {
+      value = value.substring(0, value.length() - 1);
+    }
+    value = value.replace(DOUBLED_QUOTE, QUOTE);
+    return value;
+  }
  
+  /**
+   * Reads the value of a feature (FT) qualifier from one or more lines of the
+   * file, and returns the next line after that. Values are appended to the
+   * string buffer, which should be already primed with the value read from the
+   * first line for the qualifier (with any leading double quote removed).
+   * Enclosing double quotes are removed, and escaped (repeated) double quotes
+   * reduced to one only. For example for
+   * 
+   * <pre>
+   * FT      /note="gene_id=hCG28070.3 
+   * FT      ""foobar"" isoform=CRA_b"
+   * the returned value is
+   * gene_id=hCG28070.3 "foobar" isoform=CRA_b
+   * </pre>
+   * 
+   * Note the side-effect of this method, to advance data reading to the next
+   * line after the feature qualifier.
+   * 
+   * @param sb
+   *          a string buffer primed with the first line of the value
+   * @param qualifierName
+   * @return
+   * @throws IOException
+   */
+  String parseFeatureQualifier(StringBuilder sb, String qualifierName)
+          throws IOException
+  {
      String line;
      while ((line = nextLine()) != null)
      {
@@ -441,17 +480,30 @@ public class EmblFlatFile extends AlignFile // FileParse
        String[] tokens = line.split(WHITESPACE);
        if (tokens.length < 2)
        {
-        Cache.log.error("Ignoring bad EMBL line: " + line);
+        Cache.log.error("Ignoring bad EMBL line for " + this.accession
+                + ": " + line);
          break;
        }
        if (tokens[1].startsWith("/"))
        {
          break; // next feature qualifier
        }
-      sb.append(tokens[1].replace(QUOTE, ""));
-    }
  
-    data.translation = sb.toString();
+      /*
+       * heuristic rule: most multi-line values (e.g. /product) are text,
+       * so add a space for word boundary at a new line; not for translation
+       */
+      if (!"translation".equals(qualifierName))
+      {
+        sb.append(" ");
+      }
+
+      /*
+       * remove trailing " and unescape doubled ""
+       */
+      String data = removeQuotes(tokens[1]);
+      sb.append(data);
+    }
  
      return line;
    }
@@ -461,6 +513,12 @@ public class EmblFlatFile extends AlignFile // FileParse
     */
    void buildSequence()
    {
+    if (this.accession == null || this.sequenceString == null)
+    {
+      Cache.log.error("Failed to parse data from EMBL");
+      return;
+    }
+
      String name = this.accession;
      if (this.sourceDb != null)
      {
@@ -611,7 +669,7 @@ public class EmblFlatFile extends AlignFile // FileParse
        map.setMappedFromId(data.proteinId);
        dnaToEmblProteinRef.setMap(map);
        dna.addDBRef(dnaToEmblProteinRef);
-     }
+    }
  
      /*
       * comment brought forward from EmblXmlSource, lines 447-451:
diff --git a/src/jalview/ws/dbsources/EmblFlatfileSource.java b/src/jalview/ws/dbsources/EmblFlatfileSource.java

index 2353f22..6536958 100644 (file)
--- a/src/jalview/ws/dbsources/EmblFlatfileSource.java
+++ b/src/jalview/ws/dbsources/EmblFlatfileSource.java
@@ -73,12 +73,12 @@ public abstract class EmblFlatfileSource extends EbiFileRetrievedProxy
      try
      {
        reply = dbFetch.fetchDataAsFile(
-              dbName.toLowerCase() + ":" + query.trim(), null, "txt");
+              dbName.toLowerCase() + ":" + query.trim(), null, "gz");
      } catch (Exception e)
      {
        stopQuery();
        throw new Exception(
-              String.format("EBI EMBL XML retrieval failed for %s:%s",
+              String.format("EBI EMBL retrieval failed for %s:%s",
                        dbName.toLowerCase(), query.trim()),
                e);
      }
@@ -112,4 +112,10 @@ public abstract class EmblFlatfileSource extends EbiFileRetrievedProxy
      stopQuery();
      return al;
    }
+
+  @Override
+  public boolean isDnaCoding()
+  {
+    return true;
+  }
  }
diff --git a/src/jalview/ws/ebi/EBIFetchClient.java b/src/jalview/ws/ebi/EBIFetchClient.java

index 9a77087..8ab5fbb 100644 (file)
--- a/src/jalview/ws/ebi/EBIFetchClient.java
+++ b/src/jalview/ws/ebi/EBIFetchClient.java
@@ -295,9 +295,8 @@ public class EBIFetchClient
      if (database.equalsIgnoreCase(DBRefSource.EMBL)
              || database.equalsIgnoreCase(DBRefSource.EMBLCDS))
      {
-//      url = "https://www.ebi.ac.uk/ena/data/view/" + ids.toLowerCase()
-//              + (format != null ? "&" + format : "");
-      url = "https://www.ebi.ac.uk/ena/browser/api/embl/" + ids.toLowerCase();
+      url = "https://www.ebi.ac.uk/ena/browser/api/embl/"
+              + ids.toLowerCase() + "?download=true&gzip=true";
      }
      else
      {
diff --git a/test/jalview/io/EmblFlatFileTest.java b/test/jalview/io/EmblFlatFileTest.java

index 35b378b..2898a06 100644 (file)
--- a/test/jalview/io/EmblFlatFileTest.java
+++ b/test/jalview/io/EmblFlatFileTest.java
@@ -242,11 +242,14 @@ public class EmblFlatFileTest
    public void testParse_noUniprotXref() throws IOException
    {
      // MN908947 cut down to 40BP, one CDS, length 5 peptide for test purposes
+    // plus an additional (invented) test case:
+    // - multi-line /product qualifier including escaped quotes
      String data = "ID   MN908947; SV 3; linear; genomic RNA; STD; VRL; 20 BP.\n"
              + "DE   Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1,\n"
              + "FT   CDS             3..17\n"
              + "FT                   /protein_id=\"QHD43415.1\"\n"
-            + "FT                   /product=\"orf1ab polyprotein\"\n"
+            + "FT                   /product=\"orf1ab polyprotein\n"
+            + "FT                   \"\"foobar\"\" \"\n"
              + "FT                   /translation=\"MRKLD\n"
              + "SQ   Sequence 7496 BP; 2450 A; 1290 C; 1434 G; 2322 T; 0 other;\n"
              + "     ggatGcgtaa gttagacgaa attttgtctt tgcgcacaga        40\n";
@@ -284,7 +287,8 @@ public class EmblFlatFileTest
      mapping = dbref.getMap();
      SequenceI mapTo = mapping.getTo();
      assertEquals(mapTo.getName(), "QHD43415.1");
-    assertEquals(mapTo.getDescription(), "orf1ab polyprotein");
+    // the /product qualifier transfers to protein product description
+    assertEquals(mapTo.getDescription(), "orf1ab polyprotein \"foobar\"");
      assertEquals(mapTo.getSequenceAsString(), "MRKLD");
      map = mapping.getMap();
      assertEquals(map.getFromLowest(), 3);
@@ -323,4 +327,13 @@ public class EmblFlatFileTest
      truncated = EmblFlatFile.adjustForProteinLength(7, exons);
      assertSame(exons, truncated);
    }
+
+  @Test(groups = "Functional")
+  public void testRemoveQuotes()
+  {
+    assertNull(EmblFlatFile.removeQuotes(null));
+    assertEquals(EmblFlatFile.removeQuotes("No quotes here"), "No quotes here");
+    assertEquals(EmblFlatFile.removeQuotes("\"Enclosing quotes\""), "Enclosing quotes");
+    assertEquals(EmblFlatFile.removeQuotes("\"Escaped \"\"quotes\"\" example\""), "Escaped \"quotes\" example");
+  }
  }
author	gmungoc <g.m.carstairs@dundee.ac.uk>
	Wed, 5 Aug 2020 15:53:17 +0000 (16:53 +0100)
committer	Jim Procter <jprocter@issues.jalview.org>
	Wed, 5 Aug 2020 16:29:02 +0000 (17:29 +0100)
src/jalview/io/EmblFlatFile.java		patch \| blob \| history
src/jalview/ws/dbsources/EmblFlatfileSource.java		patch \| blob \| history
src/jalview/ws/ebi/EBIFetchClient.java		patch \| blob \| history
test/jalview/io/EmblFlatFileTest.java		patch \| blob \| history