JAL-3692 unit test (J03321), fixes, dbrefs; todo: protein mappings

[jalview.git] / src / jalview / io / EmblFlatFile.java
diff --git a/src/jalview/io/EmblFlatFile.java b/src/jalview/io/EmblFlatFile.java

index 759fa28..9214f7e 100644 (file)
--- a/src/jalview/io/EmblFlatFile.java
+++ b/src/jalview/io/EmblFlatFile.java
@@ -15,6 +15,7 @@ import jalview.datamodel.FeatureProperties;
  import jalview.datamodel.Sequence;
  import jalview.datamodel.SequenceFeature;
  import jalview.datamodel.SequenceI;
+import jalview.util.DBRefUtils;
  import jalview.util.DnaUtils;
  import jalview.util.MappingUtils;
  
@@ -39,10 +40,28 @@ import jalview.util.MappingUtils;
   */
  public class EmblFlatFile extends AlignFile // FileParse
  {
+  /**
+   * A data bean class to hold values parsed from one CDS Feature (FT)
+   */
+  class CdsData
+  {
+    String translation; // from CDS /translation, with quote characters removed
+
+    String cdsLocation; // raw location: third token of the 'FT CDS' line
+
+    int codonStart = 1; // from CDS /codon_start; stays 1 if absent or invalid
+
+    String proteinName; // from CDS /product; TODO: use for protein description
+
+    String proteinId; // from CDS /protein_id
+
+    Map<String, String> cdsProps  = new Hashtable<>(); // other non-empty CDS qualifiers, keyed by qualifier name
+  }
+
    private static final String WHITESPACE = "\\s+";
  
    private String sourceDb;
-  
+
    /*
     * values parsed from the EMBL flatfile record
     */
@@ -56,20 +75,11 @@ public class EmblFlatFile extends AlignFile // FileParse
  
    private String sequenceString; // from SQ lines
  
-  private String translation; // from CDS feature /translation
-
-  private String cdsLocation; // CDS /location raw value
-
-  private int codonStart = 1; // from CDS /codon_start
-
-  private String proteinName; // from CDS /product
-
-  private String proteinId; // from CDS /protein_id
-
-  private Map<String, String> cdsProps; // CDS other qualifiers e.g. 'note'
-
+  private List<CdsData> cds;
+  
    /**
     * Constructor
+   * 
     * @param fp
     * @param sourceId
     * @throws IOException
@@ -79,7 +89,7 @@ public class EmblFlatFile extends AlignFile // FileParse
      super(false, fp); // don't parse immediately
      this.sourceDb = sourceId;
      dbrefs = new ArrayList<>();
-    cdsProps = new Hashtable<>();
+    cds = new ArrayList<>();
    }
  
    /**
@@ -95,19 +105,19 @@ public class EmblFlatFile extends AlignFile // FileParse
      {
        if (line.startsWith("ID"))
        {
-        line = processID(line);
+        line = parseID(line);
        }
        else if (line.startsWith("DR"))
        {
-        line = processDR(line);
+        line = parseDR(line);
        }
        else if (line.startsWith("SQ"))
        {
-        line = processSQ();
+        line = parseSQ();
        }
        else if (line.startsWith("FT"))
        {
-        line = processFT(line);
+        line = parseFT(line);
        }
        else
        {
@@ -124,7 +134,7 @@ public class EmblFlatFile extends AlignFile // FileParse
     * @param line
     * @throws IOException
     */
-  String processID(String line) throws IOException
+  String parseID(String line) throws IOException
    {
      String[] tokens = line.substring(2).split(";");
  
@@ -176,18 +186,32 @@ public class EmblFlatFile extends AlignFile // FileParse
     * @param line
     * @throws IOException
     */
-  String processDR(String line) throws IOException
+  String parseDR(String line) throws IOException
    {
      String[] tokens = line.substring(2).split(";");
      if (tokens.length > 1)
      {
+      /*
+       * ensure UniProtKB/Swiss-Prot converted to UNIPROT
+       */
        String db = tokens[0].trim();
+      db = DBRefUtils.getCanonicalName(db);
        String acc = tokens[1].trim();
        if (acc.endsWith("."))
        {
          acc = acc.substring(0, acc.length() - 1);
        }
-      this.dbrefs.add(new DBRefEntry(db, "0", acc));
+      String version = "0";
+      if (tokens.length > 2)
+      {
+        String secondaryId = tokens[2].trim();
+        if (!secondaryId.isEmpty())
+        {
+          // TODO: confirm before enabling; the secondary id is not a version number
+          // version = secondaryId;
+        }
+      }
+      this.dbrefs.add(new DBRefEntry(db, version, acc));
      }
  
      return nextLine();
@@ -201,7 +225,7 @@ public class EmblFlatFile extends AlignFile // FileParse
     * 
     * @throws IOException
     */
-  String processSQ() throws IOException
+  String parseSQ() throws IOException
    {
      StringBuilder sb = new StringBuilder(this.length);
      String line = nextLine();
@@ -234,7 +258,7 @@ public class EmblFlatFile extends AlignFile // FileParse
     * @return
     * @throws IOException
     */
-  String processFT(String line) throws IOException
+  String parseFT(String line) throws IOException
    {
      String[] tokens = line.split(WHITESPACE);
      if (tokens.length < 3 || !"CDS".equals(tokens[1]))
@@ -242,9 +266,11 @@ public class EmblFlatFile extends AlignFile // FileParse
        return nextLine();
      }
  
-    this.cdsLocation = tokens[2];
+    CdsData data = new CdsData();
+    data.cdsLocation = tokens[2];
  
-    while ((line = nextLine()) != null)
+    line = nextLine();
+    while (line  != null)
      {
        if (!line.startsWith("FT    ")) // 4 spaces
        {
@@ -276,48 +302,67 @@ public class EmblFlatFile extends AlignFile // FileParse
  
        if ("protein_id".equals(qualifier))
        {
-        proteinId = value;
+        data.proteinId = value;
+        line = nextLine();
        }
        else if ("codon_start".equals(qualifier))
        {
          try
          {
-          codonStart = Integer.parseInt(value.trim());
+          data.codonStart = Integer.parseInt(value.trim());
          } catch (NumberFormatException e)
          {
            Cache.log.error("Invalid codon_start in XML for " + this.accession
                    + ": " + e.getMessage());
          }
+        line = nextLine();
+      }
+      else if ("db_xref".equals(qualifier))
+      {
+        String[] parts = value.split(":");
+        if (parts.length == 2)
+        {
+          String db = parts[0].trim();
+          db = DBRefUtils.getCanonicalName(db);
+          DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
+          this.dbrefs.add(dbref);
+        }
+        line = nextLine();
        }
        else if ("product".equals(qualifier))
        {
          // sometimes name is returned e.g. for V00488
-        proteinName = value;
+        data.proteinName = value;
+        line = nextLine();
        }
        else if ("translation".equals(qualifier))
        {
-        line = readTranslation(value);
+        line = readTranslation(value, data);
        }
        else if (!"".equals(value))
        {
          // throw anything else into the additional properties hash
-        cdsProps.put(qualifier, value);
+        data.cdsProps.put(qualifier, value);
+        line = nextLine();
        }
      }
+
+    this.cds.add(data);
      
      return line;
    }
  
    /**
-   * Reads and saves the CDS translation from one or more lines of the file, and
-   * returns the next line after that
+   * Reads the CDS translation from one or more lines of the file, saves it in
+   * the supplied CdsData, and returns the next line after that
     * 
     * @param value
     *          the first line of the translation (likely quoted)
+   * @param data the CDS data bean in which the translation is saved
     * @return
     * @throws IOException
     */
-  String readTranslation(String value) throws IOException
+  String readTranslation(String value, CdsData data) throws IOException
    {
      StringBuilder sb = new StringBuilder(this.length / 3 + 1);
      sb.append(value.replace("\"", ""));
@@ -342,7 +387,9 @@ public class EmblFlatFile extends AlignFile // FileParse
        sb.append(tokens[1].replace("\"", ""));
      }
  
-    return sb.toString();
+    data.translation = sb.toString();
+    
+    return line;
    }
  
    /**
@@ -351,20 +398,21 @@ public class EmblFlatFile extends AlignFile // FileParse
     * <li>add a CDS feature to the sequence for each CDS start-end range</li>
     * <li>create a protein product sequence for the translation</li>
     * <li>create a cross-reference to protein with mapping from dna</li>
-   * <li>add any CDS dbrefs to the sequence and to the protein product</li> 
+   * <li>add any CDS dbrefs to the sequence and to the protein product</li>
     * </ul>
-   * @param SequenceI dna
+   * 
+   * @param dna the sequence to which the CDS features are added
+   * @param data the values parsed from one CDS feature
     */
-  void processCDS(SequenceI dna)
+  void processCDS(SequenceI dna, CdsData data)
    {
      /*
       * parse location into a list of [start, end, start, end] positions
       */
-    int[] exons = getCdsRanges(this.accession, this.cdsLocation);
+    int[] exons = getCdsRanges(this.accession, data.cdsLocation);
      int exonNumber = 0;
-    
-    for (int xint = 0; exons != null
-            && xint < exons.length - 1; xint += 2)
+
+    for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
      {
        int exonStart = exons[xint];
        int exonEnd = exons[xint + 1];
@@ -372,29 +420,37 @@ public class EmblFlatFile extends AlignFile // FileParse
        int end = Math.max(exonStart, exonEnd);
        exonNumber++;
        String desc = String.format("Exon %d for protein EMBLCDS:%s",
-              exonNumber, proteinId);
+              exonNumber, data.proteinId);
  
-      SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end, this.sourceDb);
-      if (!cdsProps.isEmpty())
+      SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
+              this.sourceDb);
+      for (Entry<String, String> val : data.cdsProps.entrySet())
        {
-        for (Entry<String, String> val : cdsProps.entrySet())
-        {
-          sf.setValue(val.getKey(), val.getValue());
-        }
+        sf.setValue(val.getKey(), val.getValue());
        }
  
-      sf.setEnaLocation(this.cdsLocation);
+      sf.setEnaLocation(data.cdsLocation);
        boolean forwardStrand = exonStart <= exonEnd;
        sf.setStrand(forwardStrand ? "+" : "-");
-      sf.setPhase(String.valueOf(codonStart - 1));
+      sf.setPhase(String.valueOf(data.codonStart - 1));
        sf.setValue(FeatureProperties.EXONPOS, exonNumber);
-      sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
+      sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
  
        dna.addSequenceFeature(sf);
      }
    }
  
    /**
+   * Constructs a sequence for the protein product (if there is one), and dbrefs
+   * with mappings from dna to protein and the reverse
+   */
+  void processTranslation()
+  {
+    // TODO: not yet implemented; this method is currently a no-op stub
+
+  }
+
+  /**
     * Constructs and saves the sequence from parsed components
     */
    void assembleSequence()
@@ -409,10 +465,16 @@ public class EmblFlatFile extends AlignFile // FileParse
      {
        seq.addDBRef(dbref);
      }
-    
-    processCDS(seq);
+
+    for (CdsData data : cds)
+    {
+      processCDS(seq, data);
+    };
+
+    processTranslation();
+
      seq.deriveSequence();
-    
+
      addSequence(seq);
    }