JAL-3692 unit test (J03321), fixes, dbrefs; todo: protein mappings
author gmungoc <g.m.carstairs@dundee.ac.uk>
Wed, 22 Jul 2020 09:55:43 +0000 (10:55 +0100)
committer Jim Procter <jprocter@issues.jalview.org>
Mon, 3 Aug 2020 11:01:25 +0000 (12:01 +0100)
src/jalview/io/EmblFlatFile.java
test/jalview/io/EmblFlatFileTest.java [new file with mode: 0644]
test/jalview/io/J03321.embl.txt [new file with mode: 0644]

index 759fa28..9214f7e 100644 (file)
@@ -15,6 +15,7 @@ import jalview.datamodel.FeatureProperties;
 import jalview.datamodel.Sequence;
 import jalview.datamodel.SequenceFeature;
 import jalview.datamodel.SequenceI;
+import jalview.util.DBRefUtils;
 import jalview.util.DnaUtils;
 import jalview.util.MappingUtils;
 
@@ -39,10 +40,28 @@ import jalview.util.MappingUtils;
  */
 public class EmblFlatFile extends AlignFile // FileParse
 {
+  /**
+   * Holds the values parsed from one CDS feature (FT lines); one instance per CDS
+   */
+  class CdsData
+  {
+    String translation; // from CDS /translation, quote characters removed
+
+    String cdsLocation; // raw location text from the CDS feature line
+
+    int codonStart = 1; // from CDS /codon_start; stays 1 if absent or invalid
+
+    String proteinName; // from CDS /product; TODO: use for protein description
+
+    String proteinId; // from CDS /protein_id; used in feature descriptions
+
+    Map<String, String> cdsProps  = new Hashtable<>(); // other CDS qualifiers
+  }
+
   private static final String WHITESPACE = "\\s+";
 
   private String sourceDb;
-  
+
   /*
    * values parsed from the EMBL flatfile record
    */
@@ -56,20 +75,11 @@ public class EmblFlatFile extends AlignFile // FileParse
 
   private String sequenceString; // from SQ lines
 
-  private String translation; // from CDS feature /translation
-
-  private String cdsLocation; // CDS /location raw value
-
-  private int codonStart = 1; // from CDS /codon_start
-
-  private String proteinName; // from CDS /product
-
-  private String proteinId; // from CDS /protein_id
-
-  private Map<String, String> cdsProps; // CDS other qualifiers e.g. 'note'
-
+  private List<CdsData> cds;
+  
   /**
    * Constructor
+   * 
    * @param fp
    * @param sourceId
    * @throws IOException
@@ -79,7 +89,7 @@ public class EmblFlatFile extends AlignFile // FileParse
     super(false, fp); // don't parse immediately
     this.sourceDb = sourceId;
     dbrefs = new ArrayList<>();
-    cdsProps = new Hashtable<>();
+    cds = new ArrayList<>();
   }
 
   /**
@@ -95,19 +105,19 @@ public class EmblFlatFile extends AlignFile // FileParse
     {
       if (line.startsWith("ID"))
       {
-        line = processID(line);
+        line = parseID(line);
       }
       else if (line.startsWith("DR"))
       {
-        line = processDR(line);
+        line = parseDR(line);
       }
       else if (line.startsWith("SQ"))
       {
-        line = processSQ();
+        line = parseSQ();
       }
       else if (line.startsWith("FT"))
       {
-        line = processFT(line);
+        line = parseFT(line);
       }
       else
       {
@@ -124,7 +134,7 @@ public class EmblFlatFile extends AlignFile // FileParse
    * @param line
    * @throws IOException
    */
-  String processID(String line) throws IOException
+  String parseID(String line) throws IOException
   {
     String[] tokens = line.substring(2).split(";");
 
@@ -176,18 +186,32 @@ public class EmblFlatFile extends AlignFile // FileParse
    * @param line
    * @throws IOException
    */
-  String processDR(String line) throws IOException
+  String parseDR(String line) throws IOException
   {
     String[] tokens = line.substring(2).split(";");
     if (tokens.length > 1)
     {
+      /*
+       * ensure UniProtKB/Swiss-Prot converted to UNIPROT
+       */
       String db = tokens[0].trim();
+      db = DBRefUtils.getCanonicalName(db);
       String acc = tokens[1].trim();
       if (acc.endsWith("."))
       {
         acc = acc.substring(0, acc.length() - 1);
       }
-      this.dbrefs.add(new DBRefEntry(db, "0", acc));
+      String version = "0";
+      if (tokens.length > 2)
+      {
+        String secondaryId = tokens[2].trim();
+        if (!secondaryId.isEmpty())
+        {
+          // TODO confirm: the secondary id is not a version number, so it is
+          // not assigned to version (version = secondaryId is disabled)
+        }
+      }
+      this.dbrefs.add(new DBRefEntry(db, version, acc));
     }
 
     return nextLine();
@@ -201,7 +225,7 @@ public class EmblFlatFile extends AlignFile // FileParse
    * 
    * @throws IOException
    */
-  String processSQ() throws IOException
+  String parseSQ() throws IOException
   {
     StringBuilder sb = new StringBuilder(this.length);
     String line = nextLine();
@@ -234,7 +258,7 @@ public class EmblFlatFile extends AlignFile // FileParse
    * @return
    * @throws IOException
    */
-  String processFT(String line) throws IOException
+  String parseFT(String line) throws IOException
   {
     String[] tokens = line.split(WHITESPACE);
     if (tokens.length < 3 || !"CDS".equals(tokens[1]))
@@ -242,9 +266,11 @@ public class EmblFlatFile extends AlignFile // FileParse
       return nextLine();
     }
 
-    this.cdsLocation = tokens[2];
+    CdsData data = new CdsData();
+    data.cdsLocation = tokens[2];
 
-    while ((line = nextLine()) != null)
+    line = nextLine();
+    while (line  != null)
     {
       if (!line.startsWith("FT    ")) // 4 spaces
       {
@@ -276,48 +302,67 @@ public class EmblFlatFile extends AlignFile // FileParse
 
       if ("protein_id".equals(qualifier))
       {
-        proteinId = value;
+        data.proteinId = value;
+        line = nextLine();
       }
       else if ("codon_start".equals(qualifier))
       {
         try
         {
-          codonStart = Integer.parseInt(value.trim());
+          data.codonStart = Integer.parseInt(value.trim());
         } catch (NumberFormatException e)
         {
           Cache.log.error("Invalid codon_start in XML for " + this.accession
                   + ": " + e.getMessage());
         }
+        line = nextLine();
+      }
+      else if ("db_xref".equals(qualifier))
+      {
+        String[] parts = value.split(":");
+        if (parts.length == 2)
+        {
+          String db = parts[0].trim();
+          db = DBRefUtils.getCanonicalName(db);
+          DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
+          this.dbrefs.add(dbref);
+        }
+        line = nextLine();
       }
       else if ("product".equals(qualifier))
       {
         // sometimes name is returned e.g. for V00488
-        proteinName = value;
+        data.proteinName = value;
+        line = nextLine();
       }
       else if ("translation".equals(qualifier))
       {
-        line = readTranslation(value);
+        line = readTranslation(value, data);
       }
       else if (!"".equals(value))
       {
         // throw anything else into the additional properties hash
-        cdsProps.put(qualifier, value);
+        data.cdsProps.put(qualifier, value);
+        line = nextLine();
       }
     }
+
+    this.cds.add(data);
     
     return line;
   }
 
   /**
-   * Reads and saves the CDS translation from one or more lines of the file, and
-   * returns the next line after that
+   * Reads the CDS translation from one or more lines of the file, saves it in
+   * data.translation, and returns the next line after that
    * 
    * @param value
    *          the first line of the translation (likely quoted)
+   * @param data the CDS data bean that receives the translation
    * @return
    * @throws IOException
    */
-  String readTranslation(String value) throws IOException
+  String readTranslation(String value, CdsData data) throws IOException
   {
     StringBuilder sb = new StringBuilder(this.length / 3 + 1);
     sb.append(value.replace("\"", ""));
@@ -342,7 +387,9 @@ public class EmblFlatFile extends AlignFile // FileParse
       sb.append(tokens[1].replace("\"", ""));
     }
 
-    return sb.toString();
+    data.translation = sb.toString();
+    
+    return line;
   }
 
   /**
@@ -351,20 +398,21 @@ public class EmblFlatFile extends AlignFile // FileParse
    * <li>add a CDS feature to the sequence for each CDS start-end range</li>
    * <li>create a protein product sequence for the translation</li>
    * <li>create a cross-reference to protein with mapping from dna</li>
-   * <li>add any CDS dbrefs to the sequence and to the protein product</li> 
+   * <li>add any CDS dbrefs to the sequence and to the protein product</li>
    * </ul>
-   * @param SequenceI dna
+   * 
+   * @param dna the sequence to which the CDS features are added
+   * @param data the values parsed for one CDS feature
    */
-  void processCDS(SequenceI dna)
+  void processCDS(SequenceI dna, CdsData data)
   {
     /*
      * parse location into a list of [start, end, start, end] positions
      */
-    int[] exons = getCdsRanges(this.accession, this.cdsLocation);
+    int[] exons = getCdsRanges(this.accession, data.cdsLocation);
     int exonNumber = 0;
-    
-    for (int xint = 0; exons != null
-            && xint < exons.length - 1; xint += 2)
+
+    for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
     {
       int exonStart = exons[xint];
       int exonEnd = exons[xint + 1];
@@ -372,29 +420,37 @@ public class EmblFlatFile extends AlignFile // FileParse
       int end = Math.max(exonStart, exonEnd);
       exonNumber++;
       String desc = String.format("Exon %d for protein EMBLCDS:%s",
-              exonNumber, proteinId);
+              exonNumber, data.proteinId);
 
-      SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end, this.sourceDb);
-      if (!cdsProps.isEmpty())
+      SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
+              this.sourceDb);
+      for (Entry<String, String> val : data.cdsProps.entrySet())
       {
-        for (Entry<String, String> val : cdsProps.entrySet())
-        {
-          sf.setValue(val.getKey(), val.getValue());
-        }
+        sf.setValue(val.getKey(), val.getValue());
       }
 
-      sf.setEnaLocation(this.cdsLocation);
+      sf.setEnaLocation(data.cdsLocation);
       boolean forwardStrand = exonStart <= exonEnd;
       sf.setStrand(forwardStrand ? "+" : "-");
-      sf.setPhase(String.valueOf(codonStart - 1));
+      sf.setPhase(String.valueOf(data.codonStart - 1));
       sf.setValue(FeatureProperties.EXONPOS, exonNumber);
-      sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
+      sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
 
       dna.addSequenceFeature(sf);
     }
   }
 
   /**
+   * Placeholder for constructing a protein product sequence and dbrefs with
+   * mappings from dna to protein and the reverse; not yet implemented
+   */
+  void processTranslation()
+  {
+    // TODO: implement; currently a no-op called from assembleSequence()
+
+  }
+
+  /**
    * Constructs and saves the sequence from parsed components
    */
   void assembleSequence()
@@ -409,10 +465,16 @@ public class EmblFlatFile extends AlignFile // FileParse
     {
       seq.addDBRef(dbref);
     }
-    
-    processCDS(seq);
+
+    for (CdsData data : cds)
+    {
+      processCDS(seq, data);
+    };
+
+    processTranslation();
+
     seq.deriveSequence();
-    
+
     addSequence(seq);
   }
 
diff --git a/test/jalview/io/EmblFlatFileTest.java b/test/jalview/io/EmblFlatFileTest.java
new file mode 100644 (file)
index 0000000..6d9874e
--- /dev/null
@@ -0,0 +1,136 @@
+package jalview.io;
+
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertTrue;
+import static org.testng.Assert.assertNull;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.util.List;
+import java.util.Set;
+
+import org.testng.annotations.Test;
+
+import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+import jalview.datamodel.features.SequenceFeatures;
+
+public class EmblFlatFileTest
+{
+  /**
+   * Parses the EMBL flat file for J03321 (circular DNA with 8 CDS features, one
+   * on the reverse strand) and checks the sequence, CDS features and dbrefs
+   * 
+   * @throws MalformedURLException
+   * @throws IOException
+   */
+  @Test(groups = "Functional")
+  public void testParse() throws MalformedURLException, IOException
+  {
+    File dataFile = new File("test/jalview/io/J03321.embl.txt");
+    FileParse fp = new FileParse(dataFile, DataSourceType.FILE);
+    EmblFlatFile parser = new EmblFlatFile(fp, "EmblTest");
+    parser.parse();
+    List<SequenceI> seqs = parser.getSeqs();
+
+    assertEquals(seqs.size(), 1);
+    SequenceI seq = seqs.get(0);
+    assertEquals(seq.getName(), "EmblTest|J03321");
+    assertEquals(seq.getLength(), 7502);
+
+    /*
+     * all features are CDS: 9 in total (one CDS is a 'join' of two exons)
+     */
+    Set<String> featureTypes = seq.getFeatures().getFeatureTypes();
+    assertEquals(featureTypes.size(), 1);
+    assertTrue(featureTypes.contains("CDS"));
+    
+    /*
+     * inspect some features (sort them for convenience of test assertions)
+     */
+    List<SequenceFeature> features = seq.getFeatures()
+            .getAllFeatures("CDS");
+    SequenceFeatures.sortFeatures(features,  true);
+    assertEquals(features.size(), 9);
+    
+    SequenceFeature sf = features.get(0);
+    assertEquals(sf.getBegin(), 1);
+    assertEquals(sf.getEnd(), 437);
+    assertEquals(sf.getDescription(),
+            "Exon 2 for protein EMBLCDS:AAA91567.1");
+    assertEquals(sf.getFeatureGroup(), "EmblTest");
+    assertEquals(sf.getEnaLocation(), "join(7022..7502,1..437)");
+    assertEquals(sf.getPhase(), "0");
+    assertEquals(sf.getStrand(), 1);
+    assertEquals(sf.getValue("note"), "pGP7-D");
+    // sorts first, but is the second exon of the join across the origin
+    assertEquals(sf.getValue("exon number"), 2);
+    assertEquals(sf.getValue("product"), "hypothetical protein");
+    assertEquals(sf.getValue("transl_table"), "11");
+    
+    sf = features.get(1);
+    assertEquals(sf.getBegin(), 488);
+    assertEquals(sf.getEnd(), 1480);
+    assertEquals(sf.getDescription(),
+            "Exon 1 for protein EMBLCDS:AAA91568.1");
+    assertEquals(sf.getFeatureGroup(), "EmblTest");
+    assertEquals(sf.getEnaLocation(), "complement(488..1480)");
+    assertEquals(sf.getPhase(), "0");
+    assertEquals(sf.getStrand(), -1); // reverse strand!
+    assertEquals(sf.getValue("note"), "pGP8-D");
+    assertEquals(sf.getValue("exon number"), 1);
+    assertEquals(sf.getValue("product"), "hypothetical protein");
+    
+    sf = features.get(7);
+    assertEquals(sf.getBegin(), 6045);
+    assertEquals(sf.getEnd(), 6788);
+    assertEquals(sf.getDescription(),
+            "Exon 1 for protein EMBLCDS:AAA91574.1");
+    assertEquals(sf.getFeatureGroup(), "EmblTest");
+    assertEquals(sf.getEnaLocation(), "6045..6788");
+    assertEquals(sf.getPhase(), "0");
+    assertEquals(sf.getStrand(), 1);
+    assertEquals(sf.getValue("note"), "pGP6-D (gtg start codon)");
+    assertEquals(sf.getValue("exon number"), 1);
+    assertEquals(sf.getValue("product"), "hypothetical protein");
+    
+    /*
+     * CDS at 7022-7502 is the first exon of the circular DNA CDS
+     */
+    sf = features.get(8);
+    assertEquals(sf.getBegin(), 7022);
+    assertEquals(sf.getEnd(), 7502);
+    assertEquals(sf.getDescription(),
+            "Exon 1 for protein EMBLCDS:AAA91567.1");
+    assertEquals(sf.getFeatureGroup(), "EmblTest");
+    assertEquals(sf.getEnaLocation(), "join(7022..7502,1..437)");
+    assertEquals(sf.getPhase(), "0");
+    assertEquals(sf.getStrand(), 1);
+    assertEquals(sf.getValue("note"), "pGP7-D");
+    assertEquals(sf.getValue("exon number"), 1);
+    assertEquals(sf.getValue("product"), "hypothetical protein");
+
+    /*
+     * there are 4 'direct' (DR) dbrefs, and numerous CDS /db_xref entries,
+     * some of them (e.g. INTERPRO) duplicates; sample a few here
+     * Note DBRefEntry constructor capitalises source
+     */
+    List<DBRefEntry> dbrefs = seq.getDBRefs();
+    assertEquals(dbrefs.size(), 31);
+    // 1st DR line; note trailing period is removed
+    assertTrue(dbrefs.contains(new DBRefEntry("MD5", "0",
+            "d4c4942a634e3df4995fd5ac75c26a61")));
+    // the 4th DR line:
+    assertTrue(
+            dbrefs.contains(new DBRefEntry("EuropePMC", "0", "PMC87941")));
+    // from the first CDS feature; note canonicalisation to "UNIPROT"
+    assertTrue(dbrefs.contains(new DBRefEntry("GOA", "0", "P0CE19")));
+    assertTrue(dbrefs.contains(new DBRefEntry("UNIPROT", "0", "P0CE19")));
+    // from the last CDS feature
+    assertTrue(dbrefs.contains(new DBRefEntry("INTERPRO", "0", "IPR005350")));
+
+    // TODO: test mappings to, and sequences for, UNIPROT proteins
+  }
+}
diff --git a/test/jalview/io/J03321.embl.txt b/test/jalview/io/J03321.embl.txt
new file mode 100644 (file)
index 0000000..92065b9
--- /dev/null
@@ -0,0 +1,304 @@
+ID   J03321; SV 1; circular; genomic DNA; STD; PRO; 7502 BP.
+XX
+AC   J03321;
+XX
+DT   27-JUL-1990 (Rel. 24, Created)
+DT   10-APR-2020 (Rel. 144, Last updated, Version 9)
+XX
+DE   Chlamydia trachomatis plasmid pCHL1, complete sequence.
+XX
+KW   .
+XX
+OS   Chlamydia trachomatis
+OC   Bacteria; Chlamydiae; Chlamydiales; Chlamydiaceae;
+OC   Chlamydia/Chlamydophila group; Chlamydia.
+OG   Plasmid pCHL1
+XX
+RN   [1]
+RP   1-7502
+RX   DOI; 10.1016/0147-619X(90)90034-A.
+RX   PUBMED; 2194229.
+RA   Comanducci M., Ricci S., Cevenini R., Ratti G.;
+RT   "Diversity of the Chlamydia trachomatis common plasmid in biovars with
+RT   different pathogenicity";
+RL   Plasmid 23(2):149-154(1990).
+XX
+RN   [2]
+RP   1-7502
+RA   Comanducci M., Ricci S., Cevenini R., Ratti G.;
+RT   ;
+RL   Submitted (23-JUN-2010) to the INSDC.
+RL   Sclavo Research Centre, Siena, Italy
+XX
+DR   MD5; d4c4942a634e3df4995fd5ac75c26a61.
+DR   BioSample; SAMN14225621.
+DR   EuropePMC; PMC4450983; 26031715.
+DR   EuropePMC; PMC87941; 11283058.
+XX
+CC   Draft entry and computer-readable sequence kindly submitted by
+CC   G.Ratti, 28-MAR-1990.
+XX
+FH   Key             Location/Qualifiers
+FH
+FT   source          1..7502
+FT                   /organism="Chlamydia trachomatis"
+FT                   /plasmid="pCHL1"
+FT                   /isolate="G0/86"
+FT                   /serotype="D"
+FT                   /mol_type="genomic DNA"
+FT                   /isolation_source="trachoma"
+FT                   /db_xref="taxon:813"
+FT   CDS             join(7022..7502,1..437)
+FT                   /codon_start=1
+FT                   /transl_table=11
+FT                   /product="hypothetical protein"
+FT                   /note="pGP7-D"
+FT                   /db_xref="GOA:P0CE19"
+FT                   /db_xref="InterPro:IPR002104"
+FT                   /db_xref="InterPro:IPR011010"
+FT                   /db_xref="InterPro:IPR013762"
+FT                   /db_xref="UniProtKB/Swiss-Prot:P0CE19"
+FT                   /protein_id="AAA91567.1"
+FT                   /translation="MGSMAFHKSRLFLTFGDASEIWLSTLSYLTRKNYASGINFLVSLE
+FT                   ILDLSETLIKAISLDHSESLFKIKSLDVFNGKVVSEASKQARAACYISFTKFLYRLTKG
+FT                   YIKPAIPLKDFGNTTFFKIRDKIKTESISKQEWTVFFEALRIVNYRDYLIGKLIVQGIR
+FT                   KLDEILSLRTDDLFFASNQISFRIKKRQNKETKILITFPISLMEELQKYTCGRNGRVFV
+FT                   SKIGIPVTTSQVAHNFRLAEFHSAMKIKITPRVLRASALIHLKQIGLKDEEIMRISCLS
+FT                   SRQSVCSYCSGEEVIPLVQTPTIL"
+FT   CDS             complement(488..1480)
+FT                   /codon_start=1
+FT                   /transl_table=11
+FT                   /product="hypothetical protein"
+FT                   /note="pGP8-D"
+FT                   /db_xref="GOA:P0CE20"
+FT                   /db_xref="InterPro:IPR002104"
+FT                   /db_xref="InterPro:IPR011010"
+FT                   /db_xref="InterPro:IPR013762"
+FT                   /db_xref="UniProtKB/Swiss-Prot:P0CE20"
+FT                   /protein_id="AAA91568.1"
+FT                   /translation="MGKGILSLQQEMSLEYSEKSYQEVLKIRQESYWKRMKSFSLFEVI
+FT                   MHWTASLNKHTCRSYRGSFLSLEKIGLLSLDMNLQEFSLLNHNLILDAIKKVSSAKTSW
+FT                   TEGTKQVRAASYISLTRFLNRMTQGIVAIAQPSKQENSRTFFKTREIVKTDAMNSLQTA
+FT                   SFLKELKKINARDWLIAQTMLQGGKRSSEVLSLEISQICFQQATISFSQLKNRQTEKRI
+FT                   IITYPQKFMHFLQEYIGQRRGFVFVTRSGKMVGLRQIARTFSQAGLQAAIPFKITPHVL
+FT                   RATAVTEYKRLGCSDSDIMKVTGHATAKMIFAYDKSSREDNASKKMALI"
+FT   CDS             1579..2934
+FT                   /codon_start=1
+FT                   /transl_table=11
+FT                   /product="hypothetical protein"
+FT                   /note="pGP1-D"
+FT                   /db_xref="GOA:P0CE16"
+FT                   /db_xref="InterPro:IPR003593"
+FT                   /db_xref="InterPro:IPR007693"
+FT                   /db_xref="InterPro:IPR007694"
+FT                   /db_xref="InterPro:IPR027417"
+FT                   /db_xref="InterPro:IPR036185"
+FT                   /db_xref="UniProtKB/Swiss-Prot:P0CE16"
+FT                   /protein_id="AAA91569.1"
+FT                   /translation="MKTRSEIENRMQDIEYALLGKALIFEDSTEYILRQLANYEFKCSH
+FT                   HKNIFIVFKHLKDNGLPITVDSAWEELLRRRIKDMDKSYLGLMLHDALSNDKLRSVSHT
+FT                   VFLDDLSVCSAEENLSNFIFRSFNEYNENPLRRSPFLLLERIKGRLDSAIAKTFSIRSA
+FT                   RGRSIYDIFSQSEIGVLARIKKRRVAFSENQNSFFDGFPTGYKDIDDKGVILAKGNFVI
+FT                   IAARPSIGKTALAIDMAINLAVTQQRRVGFLSLEMSAGQIVERIIANLTGISGEKLQRG
+FT                   DLSKEELFRVEEAGETVRESHFYICSDSQYKLNLIANQIRLLRKEDRVDVIFIDYLQLI
+FT                   NSSVGENRQNEIADISRTLRGLASELNIPIVCLSQLSRKVEDRANKVPMLSDLRDSGQI
+FT                   EQDADVILFINRKESSSNCEITVGKNRHGSVFSSVLHFDPKISKFSAIKKVW"
+FT   CDS             2928..3992
+FT                   /codon_start=1
+FT                   /transl_table=11
+FT                   /product="hypothetical protein"
+FT                   /note="pGP2-D"
+FT                   /db_xref="InterPro:IPR040719"
+FT                   /db_xref="UniProtKB/Swiss-Prot:P0CE17"
+FT                   /protein_id="AAA91570.1"
+FT                   /translation="MVNYSNCHFIKSPIHLENQKFGRRPGQSIKISPKLAQNGMVEVIG
+FT                   LDFLSSHYHALAAIQRLLTATNYKGNTKGVVLSRESNSFQFEGWIPRIRFTKTEFLEAY
+FT                   GVKRYKTSRNKYEFSGKEAETALEALYHLGHQPFLIVATRTRWTNGTQIVDRYQTLSPI
+FT                   IRIYEGWEGLTDEENIDIDLTPFNSPPTRKHKGFVVEPCPILVDQIESYFVIKPANVYQ
+FT                   EIKMRFPNASKYAYTFIDWVITAAAKKRRKLTKDNSWPENLLLNVNVKSLAYILRMNRY
+FT                   ICTRNWKKIELAIDKCIEIAIQLGWLSRRKRIEFLDSSKLSKKEILYLNKERFEEITKK
+FT                   SKEQMEQLEQESIN"
+FT   CDS             4054..4848
+FT                   /codon_start=1
+FT                   /transl_table=11
+FT                   /product="hypothetical protein"
+FT                   /note="pGP3-D"
+FT                   /db_xref="InterPro:IPR008444"
+FT                   /db_xref="InterPro:IPR033758"
+FT                   /db_xref="InterPro:IPR038264"
+FT                   /db_xref="PDB:6GJT"
+FT                   /db_xref="UniProtKB/Swiss-Prot:P0CE18"
+FT                   /protein_id="AAA91571.1"
+FT                   /translation="MGNSGFYLYNTENCVFADNIKVGQMTEPLKDQQIILGTTSTPVAA
+FT                   KMTASDGISLTVSNNSSTNASITIGLDAEKAYQLILEKLGDQILDGIADTIVDSTVQDI
+FT                   LDKIKTDPSLGLLKAFNNFPITNKIQCNGLFTPSNIETLLGGTEIGKFTVTPKSSGSMF
+FT                   LVSADIIASRMEGGVVLALVREGDSKPCAISYGYSSGIPNLCSLRTSITNTGLTPTTYS
+FT                   LRVGGLESGVVWVNALSNGNDILGITNTSNVSFLEVIPQTNA"
+FT   CDS             4918..5226
+FT                   /codon_start=1
+FT                   /transl_table=11
+FT                   /product="hypothetical protein"
+FT                   /note="pGP4-D"
+FT                   /db_xref="UniProtKB/Swiss-Prot:P0CE23"
+FT                   /protein_id="AAA91572.1"
+FT                   /translation="MQNKRKVRDDFIKIVKDVKKDFPELDLKIRVNKEKVTFLNSPLEL
+FT                   YHKSVSLILGLLQQIENSLGLFPDSPVLEKLEDNSLKLKKALIMLILSRKDMFSKAE"
+FT   CDS             5317..6048
+FT                   /codon_start=1
+FT                   /transl_table=11
+FT                   /product="hypothetical protein"
+FT                   /note="pGP5-D (gtg start codon)"
+FT                   /db_xref="GOA:P10559"
+FT                   /db_xref="InterPro:IPR025669"
+FT                   /db_xref="InterPro:IPR027417"
+FT                   /db_xref="UniProtKB/Swiss-Prot:P10559"
+FT                   /protein_id="AAA91573.1"
+FT                   /translation="MGCNLAQFLGKKVLLADLDPQSNLSSGLGASVRSDQKGLHDIVYT
+FT                   SNDLKSIICETKKDSVDLIPASFSSEQFRELDIHRGPSNNLKLFLNEYCAPFYDICIID
+FT                   TPPSLGGLTKEAFVAGDKLIACLTPEPFSILGLQKIREFLSSVGKPEEEHILGIALSFW
+FT                   DDRNSTNQMYIDIIESIYKNKLFSTKIRRDISLSRSLLKEDSVANVYPNSRAAEDILKL
+FT                   THEIANILHIEYERDYSQRTT"
+FT   CDS             6045..6788
+FT                   /codon_start=1
+FT                   /transl_table=11
+FT                   /product="hypothetical protein"
+FT                   /note="pGP6-D (gtg start codon)"
+FT                   /db_xref="InterPro:IPR005350"
+FT                   /db_xref="UniProtKB/Swiss-Prot:P10560"
+FT                   /protein_id="AAA91574.1"
+FT                   /translation="MNKLKKEADVFFKKNQTAASLDFKKTLPSIELFSATLNSEESQSL
+FT                   DRLFLSESQNYSDEEFYQEDILAVKLLTGQIKSIQKQHVLLLGEKIYNARKILSKDHFS
+FT                   STTFSSWIELVFRTKSSAYNALAYYELFINLPNQTLQKEFQSIPYKSAYILAARKGDLK
+FT                   TKVDVIGKVCGMSNSSAIRVLDQFLPSSRNKDVRETIDKSDSEKNRQLSDFLIEILRIM
+FT                   CSGVSLSSYNENLLQQLFELFKQKS"
+FT   repeat_region   6857..6945
+FT                   /note="four tandem 22bp repeats"
+XX
+SQ   Sequence 7502 BP; 2460 A; 1285 C; 1433 G; 2324 T; 0 other;
+     ggatccgtaa gttagacgaa attttgtctt tgcgcacaga cgatctattt tttgcatcca        60
+     atcagatttc ctttcgcatt aaaaaaagac agaataaaga aaccaaaatt ctaatcacat       120
+     ttcctatcag cttaatggaa gagttgcaaa aatacacttg tgggagaaat gggagagtat       180
+     ttgtttctaa aatagggatt cctgtaacaa caagtcaggt tgcgcataat tttaggcttg       240
+     cagagttcca tagtgctatg aaaataaaaa ttactcccag agtacttcgt gcaagcgctt       300
+     tgattcattt aaagcaaata ggattaaaag atgaggaaat catgcgtatt tcctgtcttt       360
+     catcgagaca aagtgtgtgt tcttattgtt ctggggaaga ggtaattcct ctagtacaaa       420
+     cacccacaat attgtgatat aattaaaatt atattcatat tctgttgcca gaaaaaacac       480
+     ctttaggcta tattagagcc atcttctttg aagcgttgtc ttctcgagaa gatttatcgt       540
+     acgcaaatat catctttgcg gttgcgtgtc ctgtgacctt cattatgtcg gagtctgagc       600
+     accctaggcg tttgtactcc gtcacagcgg ttgctcgaag cacgtgcggg gttattttaa       660
+     aagggattgc agcttgtagt cctgcttgag agaacgtgcg ggcgatttgc cttaacccca       720
+     ccatttttcc ggagcgagtt acgaagacaa aacctcttcg ttgaccgatg tactcttgta       780
+     gaaagtgcat aaacttctga ggataagtta taataatcct cttttctgtc tgacggttct       840
+     taagctggga gaaagaaatg gtagcttgtt ggaaacaaat ctgactaatc tccaagctta       900
+     agacttcaga ggagcgttta cctccttgga gcattgtctg ggcgatcaac caatcccggg       960
+     cattgatttt ttttagctct tttaggaagg atgctgtttg caaactgttc atcgcatccg      1020
+     tttttactat ttccctggtt ttaaaaaatg ttcgactatt ttcttgttta gaaggttgcg      1080
+     ctatagcgac tattccttga gtcatcctgt ttaggaatct tgttaaggaa atatagcttg      1140
+     ctgctcgaac ttgtttagta ccttcggtcc aagaagtctt ggcagaggaa acttttttaa      1200
+     tcgcatctag gattagatta tgatttaaaa gggaaaactc ttgcagattc atatccaagg      1260
+     acaatagacc aatcttttct aaagacaaaa aagatcctcg atatgatcta caagtatgtt      1320
+     tgttgagtga tgcggtccaa tgcataataa cttcgaataa ggagaagctt ttcatgcgtt      1380
+     tccaatagga ttcttggcga atttttaaaa cttcctgata agacttttca ctatattcta      1440
+     acgacatttc ttgctgcaaa gataaaatcc ctttacccat gaaatccctc gtgatataac      1500
+     ctatccgtaa aatgtcctga ttagtgaaat aatcaggttg ttaacaggat agcacgctcg      1560
+     gtattttttt atataaacat gaaaactcgt tccgaaatag aaaatcgcat gcaagatatc      1620
+     gagtatgcgt tgttaggtaa agctctgata tttgaagact ctactgagta tattctgagg      1680
+     cagcttgcta attatgagtt taagtgttct catcataaaa acatattcat agtatttaaa      1740
+     cacttaaaag acaatggatt acctataact gtagactcgg cttgggaaga gcttttgcgg      1800
+     cgtcgtatca aagatatgga caaatcgtat ctcgggttaa tgttgcatga tgctttatca      1860
+     aatgacaagc ttagatccgt ttctcatacg gttttcctcg atgatttgag cgtgtgtagc      1920
+     gctgaagaaa atttgagtaa tttcattttc cgctcgttta atgagtacaa tgaaaatcca      1980
+     ttgcgtagat ctccgtttct attgcttgag cgtataaagg gaaggcttga tagtgctata      2040
+     gcaaagactt tttctattcg cagcgctaga ggccggtcta tttatgatat attctcacag      2100
+     tcagaaattg gagtgctggc tcgtataaaa aaaagacgag tagcgttctc tgagaatcaa      2160
+     aattctttct ttgatggctt cccaacagga tacaaggata ttgatgataa aggagttatc      2220
+     ttagctaaag gtaatttcgt gattatagca gctagaccat ctatagggaa aacagcttta      2280
+     gctatagaca tggcgataaa tcttgcggtt actcaacagc gtagagttgg tttcctatct      2340
+     ctagaaatga gcgcaggtca aattgttgag cggattattg ctaatttaac aggaatatct      2400
+     ggtgaaaaat tacaaagagg ggatctctct aaagaagaat tattccgagt agaagaagct      2460
+     ggagaaacgg ttagagaatc acatttttat atctgcagtg atagtcagta taagcttaac      2520
+     ttaatcgcga atcagatccg gttgctgaga aaagaagatc gagtagacgt aatatttatc      2580
+     gattacttgc agttgatcaa ctcatcggtt ggagaaaatc gtcaaaatga aatagcagat      2640
+     atatctagaa ccttaagagg tttagcctca gagctaaaca ttcctatagt ttgtttatcc      2700
+     caactatcta gaaaagttga ggatagagca aataaagttc ccatgctttc agatttgcga      2760
+     gacagcggtc aaatagagca agacgcagat gtgattttgt ttatcaatag gaaggaatcg      2820
+     tcttctaatt gtgagataac tgttgggaaa aatagacatg gatcggtttt ctcttcggta      2880
+     ttacatttcg atccaaaaat tagtaaattc tccgctatta aaaaagtatg gtaaattata      2940
+     gtaactgcca cttcatcaaa agtcctatcc accttgaaaa tcagaagttt ggaagaagac      3000
+     ctggtcaatc tattaagata tctcccaaat tggctcaaaa tgggatggta gaagttatag      3060
+     gtcttgattt tctttcatct cattaccatg cattagcagc tatccaaaga ttactgaccg      3120
+     caacgaatta caaggggaac acaaaagggg ttgttttatc cagagaatca aatagttttc      3180
+     aatttgaagg atggatacca agaatccgtt ttacaaaaac tgaattctta gaggcttatg      3240
+     gagttaagcg gtataaaaca tccagaaata agtatgagtt tagtggaaaa gaagctgaaa      3300
+     ctgctttaga agccttatac catttaggac atcaaccgtt tttaatagtg gcaactagaa      3360
+     ctcgatggac taatggaaca caaatagtag accgttacca aactctttct ccgatcatta      3420
+     ggatttacga aggatgggaa ggtttaactg acgaagaaaa tatagatata gacttaacac      3480
+     cttttaattc accacctaca cggaaacata aagggttcgt tgtagagcca tgtcctatct      3540
+     tggtagatca aatagaatcc tactttgtaa tcaagcctgc aaatgtatac caagaaataa      3600
+     aaatgcgttt cccaaatgca tcaaagtatg cttacacatt tatcgactgg gtgattacag      3660
+     cagctgcgaa aaagagacga aaattaacta aggataattc ttggccagaa aacttgttat      3720
+     taaacgttaa cgttaaaagt cttgcatata ttttaaggat gaatcggtac atctgtacaa      3780
+     ggaactggaa aaaaatcgag ttagctatcg ataaatgtat agaaatcgcc attcagcttg      3840
+     gctggttatc tagaagaaaa cgcattgaat ttctggattc ttctaaactc tctaaaaaag      3900
+     aaattctata tctaaataaa gagcgctttg aagaaataac taagaaatct aaagaacaaa      3960
+     tggaacaatt agaacaagaa tctattaatt aatagcaagc ttgaaactaa aaacctaatt      4020
+     tatttaaagc tcaaaataaa aaagagtttt aaaatgggaa attctggttt ttatttgtat      4080
+     aacactgaaa actgcgtctt tgctgataat atcaaagttg ggcaaatgac agagccgctc      4140
+     aaggaccagc aaataatcct tgggacaaca tcaacacctg tcgcagccaa aatgacagct      4200
+     tctgatggaa tatctttaac agtctccaat aattcatcaa ccaatgcttc tattacaatt      4260
+     ggtttggatg cggaaaaagc ttaccagctt attctagaaa agttgggaga tcaaattctt      4320
+     gatggaattg ctgatactat tgttgatagt acagtccaag atattttaga caaaatcaaa      4380
+     acagaccctt ctctaggttt gttgaaagct tttaacaact ttccaatcac taataaaatt      4440
+     caatgcaacg ggttattcac tcccagtaac attgaaactt tattaggagg aactgaaata      4500
+     ggaaaattca cagtcacacc caaaagctct gggagcatgt tcttagtctc agcagatatt      4560
+     attgcatcaa gaatggaagg cggcgttgtt ctagctttgg tacgagaagg tgattctaag      4620
+     ccctgcgcga ttagttatgg atactcatca ggcattccta atttatgtag tctaagaacc      4680
+     agtattacta atacaggatt gactccgaca acgtattcat tacgtgtagg cggtttagaa      4740
+     agcggtgtgg tatgggttaa tgccctttct aatggcaatg atattttagg aataacaaat      4800
+     acttctaatg tatctttttt agaggtaata cctcaaacaa acgcttaaac aatttttatt      4860
+     ggatttttct tataggtttt atatttagag aaaacagttc gaattacggg gtttgttatg      4920
+     caaaataaaa gaaaagtgag ggacgatttt attaaaattg ttaaagatgt gaaaaaagat      4980
+     ttccccgaat tagacctaaa aatacgagta aacaaggaaa aagtaacttt cttaaattct      5040
+     cccttagaac tctaccataa aagtgtctca ctaattctag gactgcttca acaaatagaa      5100
+     aactctttag gattattccc agactctcct gttcttgaaa aattagagga taacagttta      5160
+     aagctaaaaa aggctttgat tatgcttatc ttgtctagaa aagacatgtt ttccaaggct      5220
+     gaatagacaa cttactctaa cgttggagtt gatttgcaca ccttagtttt ttgctctttt      5280
+     aagggaggaa ctggaaaaac aacactttct ctaaacgtgg gatgcaactt ggcccaattt      5340
+     ttagggaaaa aagtgttact tgctgaccta gacccgcaat ccaatttatc ttctggattg      5400
+     ggggctagtg tcagaagtga ccaaaaaggc ttgcacgaca tagtatacac atcaaacgat      5460
+     ttaaaatcaa tcatttgcga aacaaaaaaa gatagtgtgg acctaattcc tgcatcattt      5520
+     tcatccgaac agtttagaga attggatatt catagaggac ctagtaacaa cttaaagtta      5580
+     tttctgaatg agtactgcgc tcctttttat gacatctgca taatagacac tccacctagc      5640
+     ctaggagggt taacgaaaga agcttttgtt gcaggagaca aattaattgc ttgtttaact      5700
+     ccagaacctt tttctattct agggttacaa aagatacgtg aattcttaag ttcggtcgga      5760
+     aaacctgaag aagaacacat tcttggaata gctttgtctt tttgggatga tcgtaactcg      5820
+     actaaccaaa tgtatataga cattatcgag tctatttaca aaaacaagct tttttcaaca      5880
+     aaaattcgtc gagatatttc tctcagccgt tctcttctta aagaagattc tgtagctaat      5940
+     gtctatccaa attctagggc cgcagaagat attctgaagt taacgcatga aatagcaaat      6000
+     attttgcata tcgaatatga acgagattac tctcagagga caacgtgaac aaactaaaaa      6060
+     aagaagcgga tgtctttttt aaaaaaaatc aaactgccgc ttctctagat tttaagaaga      6120
+     cgcttccctc cattgaacta ttctcagcaa ctttgaattc tgaggaaagt cagagtttgg      6180
+     atcgattatt tttatcagag tcccaaaact attcggatga agaattttat caagaagaca      6240
+     tcctagcggt aaaactgctt actggtcaga taaaatccat acagaagcaa cacgtacttc      6300
+     ttttaggaga aaaaatctat aatgctagaa aaatcctgag taaggatcac ttctcctcaa      6360
+     caactttttc atcttggata gagttagttt ttagaactaa gtcttctgct tacaatgctc      6420
+     ttgcatatta cgagcttttt ataaacctcc ccaaccaaac tctacaaaaa gagtttcaat      6480
+     cgatccccta taaatccgca tatattttgg ccgctagaaa aggcgattta aaaaccaagg      6540
+     tcgatgtgat agggaaagta tgtggaatgt cgaactcatc ggcgataagg gtgttggatc      6600
+     aatttcttcc ttcatctaga aacaaagacg ttagagaaac gatagataag tctgattcag      6660
+     agaagaatcg ccaattatct gatttcttaa tagagatact tcgcatcatg tgttccggag      6720
+     tttctttgtc ctcctataac gaaaatcttc tacaacagct ttttgaactt tttaagcaaa      6780
+     agagctgatc ctccgtcagc tcatatatat atatctatta tatatatata tttagggatt      6840
+     tgatttcacg agagagattt gcaactcttg gtggtagact ttgcaactct tggtggtaga      6900
+     ctttgcaact cttggtggta gactttgcaa ctcttggtgg tagacttggt cataatggac      6960
+     ttttgttaaa aaatttatta aaatcttaga gctccgattt tgaatagctt tggttaagaa      7020
+     aatgggctcg atggctttcc ataaaagtag attgttttta acttttgggg acgcgtcgga      7080
+     aatttggtta tctactttat cttatctaac tagaaaaaat tatgcgtctg ggattaactt      7140
+     tcttgtttct ttagagattc tggatttatc ggaaaccttg ataaaggcta tttctcttga      7200
+     ccacagcgaa tctttgttta aaatcaagtc tctagatgtt tttaatggaa aagttgtttc      7260
+     agaggcatct aaacaggcta gagcggcatg ctacatatct ttcacaaagt ttttgtatag      7320
+     attgaccaag ggatatatta aacccgctat tccattgaaa gattttggaa acactacatt      7380
+     ttttaaaatc cgagacaaaa tcaaaacaga atcgatttct aagcaggaat ggacagtttt      7440
+     ttttgaagcg ctccggatag tgaattatag agactattta atcggtaaat tgattgtaca      7500
+     ag                                                                     7502
+//