JAL-3692 parse DE for description, and other refactoring...

author gmungoc <g.m.carstairs@dundee.ac.uk>

Fri, 24 Jul 2020 08:38:45 +0000 (09:38 +0100)

committer Jim Procter <jprocter@issues.jalview.org>

Mon, 3 Aug 2020 11:01:26 +0000 (12:01 +0100)
author gmungoc <g.m.carstairs@dundee.ac.uk>
Fri, 24 Jul 2020 08:38:45 +0000 (09:38 +0100)
committer Jim Procter <jprocter@issues.jalview.org>
Mon, 3 Aug 2020 11:01:26 +0000 (12:01 +0100)
diff --git a/src/jalview/io/EmblFlatFile.java b/src/jalview/io/EmblFlatFile.java

index 9214f7e..5be4364 100644 (file)
--- a/src/jalview/io/EmblFlatFile.java
+++ b/src/jalview/io/EmblFlatFile.java
@@ -3,6 +3,7 @@ package jalview.io;
  import java.io.IOException;
  import java.text.ParseException;
  import java.util.ArrayList;
+import java.util.HashMap;
  import java.util.Hashtable;
  import java.util.List;
  import java.util.Map;
@@ -10,8 +11,8 @@ import java.util.Map.Entry;
  
  import jalview.bin.Cache;
  import jalview.datamodel.DBRefEntry;
-import jalview.datamodel.DBRefSource;
  import jalview.datamodel.FeatureProperties;
+import jalview.datamodel.Mapping;
  import jalview.datamodel.Sequence;
  import jalview.datamodel.SequenceFeature;
  import jalview.datamodel.SequenceI;
@@ -51,11 +52,11 @@ public class EmblFlatFile extends AlignFile // FileParse
  
      int codonStart = 1; // from CDS /codon_start
  
-    String proteinName; // from CDS /product; TODO: use for protein description
+    String proteinName; // from CDS /product; used for protein description
  
      String proteinId; // from CDS /protein_id
  
-    Map<String, String> cdsProps  = new Hashtable<>(); // CDS other qualifiers
+    Map<String, String> cdsProps = new Hashtable<>(); // CDS other qualifiers
    }
  
    private static final String WHITESPACE = "\\s+";
@@ -69,6 +70,8 @@ public class EmblFlatFile extends AlignFile // FileParse
  
    private String version; // from ID (second token)
  
+  private String description; // from (first) DE line
+
    private int length = 128; // from ID (7th token), with usable default
  
    private List<DBRefEntry> dbrefs; // from DR and also CDS /db_xref qualifiers
@@ -76,7 +79,7 @@ public class EmblFlatFile extends AlignFile // FileParse
    private String sequenceString; // from SQ lines
  
    private List<CdsData> cds;
-  
+
    /**
     * Constructor
     * 
@@ -107,6 +110,10 @@ public class EmblFlatFile extends AlignFile // FileParse
        {
          line = parseID(line);
        }
+      else if (line.startsWith("DE"))
+      {
+        line = parseDE(line);
+      }
        else if (line.startsWith("DR"))
        {
          line = parseDR(line);
@@ -180,6 +187,38 @@ public class EmblFlatFile extends AlignFile // FileParse
    }
  
   /**
+   * Reads sequence description from the first DE line found. Any trailing
+   * period is discarded. If there are multiple DE lines, only the first (short
+   * description) is read, the rest are ignored.
+   * 
+   * @param line
+   *          the DE line to parse; its first two characters (the line code)
+   *          are dropped and the remainder is trimmed
+   * @return the next line read that does not start with "DE", or null if
+   *         nextLine() returned null first
+   * @throws IOException
+   *           propagated from nextLine()
+   */
+  String parseDE(String line) throws IOException
+  {
+    String desc = line.substring(2).trim();
+    if (desc.endsWith("."))
+    {
+      desc = desc.substring(0, desc.length() - 1);
+    }
+    this.description = desc;
+
+    /*
+     * pass over any additional DE lines; the first non-DE line read is
+     * returned so that the caller can continue parsing from it
+     */
+    while ((line = nextLine()) != null)
+    {
+      if (!line.startsWith("DE"))
+      {
+        break;
+      }
+    }
+
+    return line;
+  }
+
+  /**
     * Processes one DR line and saves as a DBRefEntry cross-reference. Returns
     * the line following the line processed.
     * 
@@ -208,7 +247,7 @@ public class EmblFlatFile extends AlignFile // FileParse
          if (!secondaryId.isEmpty())
          {
            // todo: is this right? secondary id is not a version number
-      //    version = secondaryId;
+          // version = secondaryId;
          }
        }
        this.dbrefs.add(new DBRefEntry(db, version, acc));
@@ -270,7 +309,7 @@ public class EmblFlatFile extends AlignFile // FileParse
      data.cdsLocation = tokens[2];
  
      line = nextLine();
-    while (line  != null)
+    while (line != null)
      {
        if (!line.startsWith("FT    ")) // 4 spaces
        {
@@ -348,7 +387,7 @@ public class EmblFlatFile extends AlignFile // FileParse
      }
  
      this.cds.add(data);
-    
+
      return line;
    }
  
@@ -358,7 +397,7 @@ public class EmblFlatFile extends AlignFile // FileParse
     * 
     * @param value
     *          the first line of the translation (likely quoted)
-   * @param data 
+   * @param data
     * @return
     * @throws IOException
     */
@@ -388,11 +427,63 @@ public class EmblFlatFile extends AlignFile // FileParse
      }
  
      data.translation = sb.toString();
-    
+
      return line;
    }
  
   /**
+   * Constructs and saves the sequence from parsed components: the name is the
+   * accession (prefixed with "sourceDb|" if sourceDb is not null), and the
+   * sequence is given the parsed description, a dbref to itself, the parsed
+   * dbrefs and the CDS data, before being passed to addSequence
+   */
+  void assembleSequence()
+  {
+    String name = this.accession;
+    if (this.sourceDb != null)
+    {
+      name = this.sourceDb + "|" + name;
+    }
+    SequenceI seq = new Sequence(name, this.sequenceString);
+    seq.setDescription(this.description);
+
+    /*
+     * add a DBRef to itself, with a map whose two ranges are both
+     * [1, sequence length]
+     * NOTE(review): sourceDb is null-checked above but passed unchecked
+     * here - confirm DBRefEntry accepts a null source
+     */
+    DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession);
+    int[] startEnd = new int[] { 1, seq.getLength() };
+    selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1));
+    seq.addDBRef(selfRef);
+
+    for (DBRefEntry dbref : this.dbrefs)
+    {
+      seq.addDBRef(dbref);
+    }
+
+    processAllCDS(seq);
+
+    seq.deriveSequence();
+
+    addSequence(seq);
+  }
+
+  /**
+   * Processes the CDS features, including generation of cross-references and
+   * mappings to the protein products (translation)
+   * 
+   * @param seq
+   *          the sequence passed to processOneCDS for each parsed CDS
+   */
+  protected void processAllCDS(SequenceI seq)
+  {
+    /*
+     * record protein products found (keyed by /protein_id) to avoid
+     * duplication i.e. >1 CDS with the same /protein_id [though not sure I
+     * can find an example of this]
+     */
+    Map<String, SequenceI> proteins = new HashMap<>();
+    for (CdsData data : cds)
+    {
+      processOneCDS(seq, data, proteins);
+    }
+  }
+
+  /**
     * Processes the parsed CDS feature data to
     * <ul>
     * <li>add a CDS feature to the sequence for each CDS start-end range</li>
@@ -403,8 +494,11 @@ public class EmblFlatFile extends AlignFile // FileParse
     * 
    * @param dna
    *          the sequence to which the CDS features are added
+   * @param proteins
+   *          map of protein products so far derived from CDS data
     */
-  void processCDS(SequenceI dna, CdsData data)
+  void processOneCDS(SequenceI dna, CdsData data,
+          Map<String, SequenceI> proteins)
    {
      /*
       * parse location into a list of [start, end, start, end] positions
@@ -437,54 +531,41 @@ public class EmblFlatFile extends AlignFile // FileParse
        sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
  
        dna.addSequenceFeature(sf);
-    }
-  }
-
-  /**
-   * Constructs a sequence for the protein product (if there is one), and dbrefs
-   * with mappings from dna to protein and the reverse
-   */
-  void processTranslation()
-  {
-    // TODO Auto-generated method stub
  
+      linkProteinProduct(dna, data, proteins);
+    }
    }
  
    /**
-   * Constructs and saves the sequence from parsed components
+   * Constructs a sequence for the protein product for the CDS data (if there is
+   * one) and records it in the proteins map. NOTE(review): the dbrefs with
+   * mappings from CDS to protein and the reverse are not created in this body
+   * 
+   * @param dna
+   * @param data
+   * @param proteins
     */
-  void assembleSequence()
+  void linkProteinProduct(SequenceI dna, CdsData data, Map<String, SequenceI> proteins)
    {
-    String name = this.accession;
-    if (this.sourceDb != null)
+    /*
+     * check we have some data to work with
+     */
+    if (data.proteinId == null || data.translation == null)
      {
-      name = this.sourceDb + "|" + name;
+      return;
      }
-    SequenceI seq = new Sequence(name, this.sequenceString);
-    for (DBRefEntry dbref : this.dbrefs)
+    
+    /*
+     * Construct the protein sequence (if not already seen)
+     */
+    SequenceI protein = proteins.get(data.proteinId);
+    if (protein == null)
      {
-      seq.addDBRef(dbref);
+      protein = new Sequence(data.proteinId, data.translation, 1,
+              data.translation.length());
+      protein.setDescription(data.proteinName != null ? data.proteinName
+              : "Protein Product from " + sourceDb);
+      proteins.put(data.proteinId, protein);
      }
-
-    for (CdsData data : cds)
-    {
-      processCDS(seq, data);
-    };
-
-    processTranslation();
-
-    seq.deriveSequence();
-
-    addSequence(seq);
-  }
-
-  /**
-   * Output (print) is not implemented for EMBL flat file format
-   */
-  @Override
-  public String print(SequenceI[] seqs, boolean jvsuffix)
-  {
-    return null;
    }
  
    /**
@@ -514,4 +595,13 @@ public class EmblFlatFile extends AlignFile // FileParse
        return new int[] {};
      }
    }
+
+  /**
+   * Output (print) is not implemented for EMBL flat file format
+   * 
+   * @param seqs
+   *          not used
+   * @param jvsuffix
+   *          not used
+   * @return null always
+   */
+  @Override
+  public String print(SequenceI[] seqs, boolean jvsuffix)
+  {
+    return null;
+  }
  }
diff --git a/test/jalview/io/EmblFlatFileTest.java b/test/jalview/io/EmblFlatFileTest.java

index 6d9874e..b1023d1 100644 (file)
--- a/test/jalview/io/EmblFlatFileTest.java
+++ b/test/jalview/io/EmblFlatFileTest.java
@@ -2,7 +2,6 @@ package jalview.io;
  
  import static org.testng.Assert.assertEquals;
  import static org.testng.Assert.assertTrue;
-import static org.testng.Assert.assertNull;
  
  import java.io.File;
  import java.io.IOException;
@@ -13,6 +12,7 @@ import java.util.Set;
  import org.testng.annotations.Test;
  
  import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.Mapping;
  import jalview.datamodel.SequenceFeature;
  import jalview.datamodel.SequenceI;
  import jalview.datamodel.features.SequenceFeatures;
@@ -39,6 +39,7 @@ public class EmblFlatFileTest
      SequenceI seq = seqs.get(0);
      assertEquals(seq.getName(), "EmblTest|J03321");
      assertEquals(seq.getLength(), 7502);
+    assertEquals(seq.getDescription(), "Chlamydia trachomatis plasmid pCHL1, complete sequence");
  
      /*
       * should be 9 CDS features (one is a 'join' of two exons)
@@ -48,7 +49,7 @@ public class EmblFlatFileTest
      assertTrue(featureTypes.contains("CDS"));
      
      /*
-     * inspect some features (sort them for convenience of test assertions)
+     * inspect some features (sorted just for convenience of test assertions)
       */
      List<SequenceFeature> features = seq.getFeatures()
              .getAllFeatures("CDS");
@@ -65,7 +66,7 @@ public class EmblFlatFileTest
      assertEquals(sf.getPhase(), "0");
      assertEquals(sf.getStrand(), 1);
      assertEquals(sf.getValue("note"), "pGP7-D");
-    // second exon of circular DNA!
+    // this is the second exon of circular CDS!
      assertEquals(sf.getValue("exon number"), 2);
      assertEquals(sf.getValue("product"), "hypothetical protein");
      assertEquals(sf.getValue("transl_table"), "11");
@@ -97,7 +98,7 @@ public class EmblFlatFileTest
      assertEquals(sf.getValue("product"), "hypothetical protein");
      
      /*
-     * CDS at 7022-7502 is the first exon of the circular DNA CDS
+     * CDS at 7022-7502 is the first exon of the circular CDS
       */
      sf = features.get(8);
      assertEquals(sf.getBegin(), 7022);
@@ -113,18 +114,25 @@ public class EmblFlatFileTest
      assertEquals(sf.getValue("product"), "hypothetical protein");
  
      /*
-     * there are 4 'direct' (DR) dbrefs, and numerous CDS /db_xref entries,
-     * some of them (e.g. INTERPRO) duplicates; sample a few here
+     * Jalview adds a dbref to 'self', there are 4 'direct' (DR) dbrefs,
+     * and numerous CDS /db_xref entries (some e.g. INTERPRO are duplicates);
+     * sample a few here
       * Note DBRefEntry constructor capitalises source
       */
      List<DBRefEntry> dbrefs = seq.getDBRefs();
-    assertEquals(dbrefs.size(), 31);
+    assertEquals(dbrefs.size(), 32);
+    // xref to 'self':
+    DBRefEntry selfRef = new DBRefEntry("EMBLTEST", "1", "J03321");
+    int[] range = new int[] {1, seq.getLength()};
+    selfRef.setMap(new Mapping(null, range, range, 1, 1));
+    assertTrue(dbrefs.contains(selfRef));
+    
      // 1st DR line; note trailing period is removed
      assertTrue(dbrefs.contains(new DBRefEntry("MD5", "0",
              "d4c4942a634e3df4995fd5ac75c26a61")));
      // the 4th DR line:
      assertTrue(
-            dbrefs.contains(new DBRefEntry("EuropePMC", "0", "PMC87941")));
+            dbrefs.contains(new DBRefEntry("EUROPEPMC", "0", "PMC87941")));
      // from the first CDS feature; note canonicalisation to "UNIPROT"
      assertTrue(dbrefs.contains(new DBRefEntry("GOA", "0", "P0CE19")));
      assertTrue(dbrefs.contains(new DBRefEntry("UNIPROT", "0", "P0CE19")));
author	gmungoc <g.m.carstairs@dundee.ac.uk>
	Fri, 24 Jul 2020 08:38:45 +0000 (09:38 +0100)
committer	Jim Procter <jprocter@issues.jalview.org>
	Mon, 3 Aug 2020 11:01:26 +0000 (12:01 +0100)
src/jalview/io/EmblFlatFile.java		patch \| blob \| history
test/jalview/io/EmblFlatFileTest.java		patch \| blob \| history