From: gmungoc Date: Wed, 22 Jul 2020 09:55:43 +0000 (+0100) Subject: JAL-3692 unit test (J03321), fixes, dbrefs; todo: protein mappings X-Git-Tag: Release_2_11_2_0~34^2~34^2~6 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=98b152cadada4da85e79a572459f753ae8d92f4e;p=jalview.git JAL-3692 unit test (J03321), fixes, dbrefs; todo: protein mappings --- diff --git a/src/jalview/io/EmblFlatFile.java b/src/jalview/io/EmblFlatFile.java index 759fa28..9214f7e 100644 --- a/src/jalview/io/EmblFlatFile.java +++ b/src/jalview/io/EmblFlatFile.java @@ -15,6 +15,7 @@ import jalview.datamodel.FeatureProperties; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; +import jalview.util.DBRefUtils; import jalview.util.DnaUtils; import jalview.util.MappingUtils; @@ -39,10 +40,28 @@ import jalview.util.MappingUtils; */ public class EmblFlatFile extends AlignFile // FileParse { + /** + * A data bean class to hold values parsed from one CDS Feature (FT) + */ + class CdsData + { + String translation; // from CDS feature /translation + + String cdsLocation; // CDS /location raw value + + int codonStart = 1; // from CDS /codon_start + + String proteinName; // from CDS /product; TODO: use for protein description + + String proteinId; // from CDS /protein_id + + Map cdsProps = new Hashtable<>(); // CDS other qualifiers + } + private static final String WHITESPACE = "\\s+"; private String sourceDb; - + /* * values parsed from the EMBL flatfile record */ @@ -56,20 +75,11 @@ public class EmblFlatFile extends AlignFile // FileParse private String sequenceString; // from SQ lines - private String translation; // from CDS feature /translation - - private String cdsLocation; // CDS /location raw value - - private int codonStart = 1; // from CDS /codon_start - - private String proteinName; // from CDS /product - - private String proteinId; // from CDS /protein_id - - private Map cdsProps; // CDS other qualifiers e.g. 'note' - + private List cds; + /** * Constructor + * * @param fp * @param sourceId * @throws IOException @@ -79,7 +89,7 @@ public class EmblFlatFile extends AlignFile // FileParse super(false, fp); // don't parse immediately this.sourceDb = sourceId; dbrefs = new ArrayList<>(); - cdsProps = new Hashtable<>(); + cds = new ArrayList<>(); } /** @@ -95,19 +105,19 @@ public class EmblFlatFile extends AlignFile // FileParse { if (line.startsWith("ID")) { - line = processID(line); + line = parseID(line); } else if (line.startsWith("DR")) { - line = processDR(line); + line = parseDR(line); } else if (line.startsWith("SQ")) { - line = processSQ(); + line = parseSQ(); } else if (line.startsWith("FT")) { - line = processFT(line); + line = parseFT(line); } else { @@ -124,7 +134,7 @@ public class EmblFlatFile extends AlignFile // FileParse * @param line * @throws IOException */ - String processID(String line) throws IOException + String parseID(String line) throws IOException { String[] tokens = line.substring(2).split(";"); @@ -176,18 +186,32 @@ public class EmblFlatFile extends AlignFile // FileParse * @param line * @throws IOException */ - String processDR(String line) throws IOException + String parseDR(String line) throws IOException { String[] tokens = line.substring(2).split(";"); if (tokens.length > 1) { + /* + * ensure UniProtKB/Swiss-Prot converted to UNIPROT + */ String db = tokens[0].trim(); + db = DBRefUtils.getCanonicalName(db); String acc = tokens[1].trim(); if (acc.endsWith(".")) { acc = acc.substring(0, acc.length() - 1); } - this.dbrefs.add(new DBRefEntry(db, "0", acc)); + String version = "0"; + if (tokens.length > 2) + { + String secondaryId = tokens[2].trim(); + if (!secondaryId.isEmpty()) + { + // todo: is this right? secondary id is not a version number + // version = secondaryId; + } + } + this.dbrefs.add(new DBRefEntry(db, version, acc)); } return nextLine(); @@ -201,7 +225,7 @@ public class EmblFlatFile extends AlignFile // FileParse * * @throws IOException */ - String processSQ() throws IOException + String parseSQ() throws IOException { StringBuilder sb = new StringBuilder(this.length); String line = nextLine(); @@ -234,7 +258,7 @@ public class EmblFlatFile extends AlignFile // FileParse * @return * @throws IOException */ - String processFT(String line) throws IOException + String parseFT(String line) throws IOException { String[] tokens = line.split(WHITESPACE); if (tokens.length < 3 || !"CDS".equals(tokens[1])) @@ -242,9 +266,11 @@ public class EmblFlatFile extends AlignFile // FileParse return nextLine(); } - this.cdsLocation = tokens[2]; + CdsData data = new CdsData(); + data.cdsLocation = tokens[2]; - while ((line = nextLine()) != null) + line = nextLine(); + while (line != null) { if (!line.startsWith("FT ")) // 4 spaces { @@ -276,48 +302,67 @@ public class EmblFlatFile extends AlignFile // FileParse if ("protein_id".equals(qualifier)) { - proteinId = value; + data.proteinId = value; + line = nextLine(); } else if ("codon_start".equals(qualifier)) { try { - codonStart = Integer.parseInt(value.trim()); + data.codonStart = Integer.parseInt(value.trim()); } catch (NumberFormatException e) { Cache.log.error("Invalid codon_start in XML for " + this.accession + ": " + e.getMessage()); } + line = nextLine(); + } + else if ("db_xref".equals(qualifier)) + { + String[] parts = value.split(":"); + if (parts.length == 2) + { + String db = parts[0].trim(); + db = DBRefUtils.getCanonicalName(db); + DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim()); + this.dbrefs.add(dbref); + } + line = nextLine(); } else if ("product".equals(qualifier)) { // sometimes name is returned e.g. for V00488 - proteinName = value; + data.proteinName = value; + line = nextLine(); } else if ("translation".equals(qualifier)) { - line = readTranslation(value); + line = readTranslation(value, data); } else if (!"".equals(value)) { // throw anything else into the additional properties hash - cdsProps.put(qualifier, value); + data.cdsProps.put(qualifier, value); + line = nextLine(); } } + + this.cds.add(data); return line; } /** - * Reads and saves the CDS translation from one or more lines of the file, and - * returns the next line after that + * Reads and returns the CDS translation from one or more lines of the file, + * and returns the next line after that * * @param value * the first line of the translation (likely quoted) + * @param data * @return * @throws IOException */ - String readTranslation(String value) throws IOException + String readTranslation(String value, CdsData data) throws IOException { StringBuilder sb = new StringBuilder(this.length / 3 + 1); sb.append(value.replace("\"", "")); @@ -342,7 +387,9 @@ public class EmblFlatFile extends AlignFile // FileParse sb.append(tokens[1].replace("\"", "")); } - return sb.toString(); + data.translation = sb.toString(); + + return line; } /** @@ -351,20 +398,21 @@ public class EmblFlatFile extends AlignFile // FileParse *
  • add a CDS feature to the sequence for each CDS start-end range
  • *
  • create a protein product sequence for the translation
  • *
  • create a cross-reference to protein with mapping from dna
  • - *
  • add any CDS dbrefs to the sequence and to the protein product
  • + *
  • add any CDS dbrefs to the sequence and to the protein product
  • * - * @param SequenceI dna + * + * @param SequenceI + * dna */ - void processCDS(SequenceI dna) + void processCDS(SequenceI dna, CdsData data) { /* * parse location into a list of [start, end, start, end] positions */ - int[] exons = getCdsRanges(this.accession, this.cdsLocation); + int[] exons = getCdsRanges(this.accession, data.cdsLocation); int exonNumber = 0; - - for (int xint = 0; exons != null - && xint < exons.length - 1; xint += 2) + + for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2) { int exonStart = exons[xint]; int exonEnd = exons[xint + 1]; @@ -372,29 +420,37 @@ public class EmblFlatFile extends AlignFile // FileParse int end = Math.max(exonStart, exonEnd); exonNumber++; String desc = String.format("Exon %d for protein EMBLCDS:%s", - exonNumber, proteinId); + exonNumber, data.proteinId); - SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end, this.sourceDb); - if (!cdsProps.isEmpty()) + SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end, + this.sourceDb); + for (Entry val : data.cdsProps.entrySet()) { - for (Entry val : cdsProps.entrySet()) - { - sf.setValue(val.getKey(), val.getValue()); - } + sf.setValue(val.getKey(), val.getValue()); } - sf.setEnaLocation(this.cdsLocation); + sf.setEnaLocation(data.cdsLocation); boolean forwardStrand = exonStart <= exonEnd; sf.setStrand(forwardStrand ? "+" : "-"); - sf.setPhase(String.valueOf(codonStart - 1)); + sf.setPhase(String.valueOf(data.codonStart - 1)); sf.setValue(FeatureProperties.EXONPOS, exonNumber); - sf.setValue(FeatureProperties.EXONPRODUCT, proteinName); + sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName); dna.addSequenceFeature(sf); } } /** + * Constructs a sequence for the protein product (if there is one), and dbrefs + * with mappings from dna to protein and the reverse + */ + void processTranslation() + { + // TODO Auto-generated method stub + + } + + /** * Constructs and saves the sequence from parsed components */ void assembleSequence() @@ -409,10 +465,16 @@ public class EmblFlatFile extends AlignFile // FileParse { seq.addDBRef(dbref); } - - processCDS(seq); + + for (CdsData data : cds) + { + processCDS(seq, data); + }; + + processTranslation(); + seq.deriveSequence(); - + addSequence(seq); } diff --git a/test/jalview/io/EmblFlatFileTest.java b/test/jalview/io/EmblFlatFileTest.java new file mode 100644 index 0000000..6d9874e --- /dev/null +++ b/test/jalview/io/EmblFlatFileTest.java @@ -0,0 +1,136 @@ +package jalview.io; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.assertNull; + +import java.io.File; +import java.io.IOException; +import java.net.MalformedURLException; +import java.util.List; +import java.util.Set; + +import org.testng.annotations.Test; + +import jalview.datamodel.DBRefEntry; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; +import jalview.datamodel.features.SequenceFeatures; + +public class EmblFlatFileTest +{ + /** + * A fairly tough test, using J03321 (circular DNA), which has 8 CDS features, + * one of them reverse strand + * + * @throws MalformedURLException + * @throws IOException + */ + @Test(groups = "Functional") + public void testParse() throws MalformedURLException, IOException + { + File dataFile = new File("test/jalview/io/J03321.embl.txt"); + FileParse fp = new FileParse(dataFile, DataSourceType.FILE); + EmblFlatFile parser = new EmblFlatFile(fp, "EmblTest"); + parser.parse(); + List seqs = parser.getSeqs(); + + assertEquals(seqs.size(), 1); + SequenceI seq = seqs.get(0); + assertEquals(seq.getName(), "EmblTest|J03321"); + assertEquals(seq.getLength(), 7502); + + /* + * should be 9 CDS features (one is a 'join' of two exons) + */ + Set featureTypes = seq.getFeatures().getFeatureTypes(); + assertEquals(featureTypes.size(), 1); + assertTrue(featureTypes.contains("CDS")); + + /* + * inspect some features (sort them for convenience of test assertions) + */ + List features = seq.getFeatures() + .getAllFeatures("CDS"); + SequenceFeatures.sortFeatures(features, true); + assertEquals(features.size(), 9); + + SequenceFeature sf = features.get(0); + assertEquals(sf.getBegin(), 1); + assertEquals(sf.getEnd(), 437); + assertEquals(sf.getDescription(), + "Exon 2 for protein EMBLCDS:AAA91567.1"); + assertEquals(sf.getFeatureGroup(), "EmblTest"); + assertEquals(sf.getEnaLocation(), "join(7022..7502,1..437)"); + assertEquals(sf.getPhase(), "0"); + assertEquals(sf.getStrand(), 1); + assertEquals(sf.getValue("note"), "pGP7-D"); + // second exon of circular DNA! + assertEquals(sf.getValue("exon number"), 2); + assertEquals(sf.getValue("product"), "hypothetical protein"); + assertEquals(sf.getValue("transl_table"), "11"); + + sf = features.get(1); + assertEquals(sf.getBegin(), 488); + assertEquals(sf.getEnd(), 1480); + assertEquals(sf.getDescription(), + "Exon 1 for protein EMBLCDS:AAA91568.1"); + assertEquals(sf.getFeatureGroup(), "EmblTest"); + assertEquals(sf.getEnaLocation(), "complement(488..1480)"); + assertEquals(sf.getPhase(), "0"); + assertEquals(sf.getStrand(), -1); // reverse strand! + assertEquals(sf.getValue("note"), "pGP8-D"); + assertEquals(sf.getValue("exon number"), 1); + assertEquals(sf.getValue("product"), "hypothetical protein"); + + sf = features.get(7); + assertEquals(sf.getBegin(), 6045); + assertEquals(sf.getEnd(), 6788); + assertEquals(sf.getDescription(), + "Exon 1 for protein EMBLCDS:AAA91574.1"); + assertEquals(sf.getFeatureGroup(), "EmblTest"); + assertEquals(sf.getEnaLocation(), "6045..6788"); + assertEquals(sf.getPhase(), "0"); + assertEquals(sf.getStrand(), 1); + assertEquals(sf.getValue("note"), "pGP6-D (gtg start codon)"); + assertEquals(sf.getValue("exon number"), 1); + assertEquals(sf.getValue("product"), "hypothetical protein"); + + /* + * CDS at 7022-7502 is the first exon of the circular DNA CDS + */ + sf = features.get(8); + assertEquals(sf.getBegin(), 7022); + assertEquals(sf.getEnd(), 7502); + assertEquals(sf.getDescription(), + "Exon 1 for protein EMBLCDS:AAA91567.1"); + assertEquals(sf.getFeatureGroup(), "EmblTest"); + assertEquals(sf.getEnaLocation(), "join(7022..7502,1..437)"); + assertEquals(sf.getPhase(), "0"); + assertEquals(sf.getStrand(), 1); + assertEquals(sf.getValue("note"), "pGP7-D"); + assertEquals(sf.getValue("exon number"), 1); + assertEquals(sf.getValue("product"), "hypothetical protein"); + + /* + * there are 4 'direct' (DR) dbrefs, and numerous CDS /db_xref entries, + * some of them (e.g. INTERPRO) duplicates; sample a few here + * Note DBRefEntry constructor capitalises source + */ + List dbrefs = seq.getDBRefs(); + assertEquals(dbrefs.size(), 31); + // 1st DR line; note trailing period is removed + assertTrue(dbrefs.contains(new DBRefEntry("MD5", "0", + "d4c4942a634e3df4995fd5ac75c26a61"))); + // the 4th DR line: + assertTrue( + dbrefs.contains(new DBRefEntry("EuropePMC", "0", "PMC87941"))); + // from the first CDS feature; note canonicalisation to "UNIPROT" + assertTrue(dbrefs.contains(new DBRefEntry("GOA", "0", "P0CE19"))); + assertTrue(dbrefs.contains(new DBRefEntry("UNIPROT", "0", "P0CE19"))); + // from the last CDS feature + assertTrue(dbrefs.contains(new DBRefEntry("INTERPRO", "0", "IPR005350"))); + + // todo: mappings to, and sequences for, UNIPROT proteins + } +} diff --git a/test/jalview/io/J03321.embl.txt b/test/jalview/io/J03321.embl.txt new file mode 100644 index 0000000..92065b9 --- /dev/null +++ b/test/jalview/io/J03321.embl.txt @@ -0,0 +1,304 @@ +ID J03321; SV 1; circular; genomic DNA; STD; PRO; 7502 BP. +XX +AC J03321; +XX +DT 27-JUL-1990 (Rel. 24, Created) +DT 10-APR-2020 (Rel. 144, Last updated, Version 9) +XX +DE Chlamydia trachomatis plasmid pCHL1, complete sequence. +XX +KW . +XX +OS Chlamydia trachomatis +OC Bacteria; Chlamydiae; Chlamydiales; Chlamydiaceae; +OC Chlamydia/Chlamydophila group; Chlamydia. +OG Plasmid pCHL1 +XX +RN [1] +RP 1-7502 +RX DOI; 10.1016/0147-619X(90)90034-A. +RX PUBMED; 2194229. +RA Comanducci M., Ricci S., Cevenini R., Ratti G.; +RT "Diversity of the Chlamydia trachomatis common plasmid in biovars with +RT different pathogenicity"; +RL Plasmid 23(2):149-154(1990). +XX +RN [2] +RP 1-7502 +RA Comanducci M., Ricci S., Cevenini R., Ratti G.; +RT ; +RL Submitted (23-JUN-2010) to the INSDC. +RL Sclavo Research Centre, Siena, Italy +XX +DR MD5; d4c4942a634e3df4995fd5ac75c26a61. +DR BioSample; SAMN14225621. +DR EuropePMC; PMC4450983; 26031715. +DR EuropePMC; PMC87941; 11283058. +XX +CC Draft entry and computer-readable sequence kindly submitted by +CC G.Ratti, 28-MAR-1990. +XX +FH Key Location/Qualifiers +FH +FT source 1..7502 +FT /organism="Chlamydia trachomatis" +FT /plasmid="pCHL1" +FT /isolate="G0/86" +FT /serotype="D" +FT /mol_type="genomic DNA" +FT /isolation_source="trachoma" +FT /db_xref="taxon:813" +FT CDS join(7022..7502,1..437) +FT /codon_start=1 +FT /transl_table=11 +FT /product="hypothetical protein" +FT /note="pGP7-D" +FT /db_xref="GOA:P0CE19" +FT /db_xref="InterPro:IPR002104" +FT /db_xref="InterPro:IPR011010" +FT /db_xref="InterPro:IPR013762" +FT /db_xref="UniProtKB/Swiss-Prot:P0CE19" +FT /protein_id="AAA91567.1" +FT /translation="MGSMAFHKSRLFLTFGDASEIWLSTLSYLTRKNYASGINFLVSLE +FT ILDLSETLIKAISLDHSESLFKIKSLDVFNGKVVSEASKQARAACYISFTKFLYRLTKG +FT YIKPAIPLKDFGNTTFFKIRDKIKTESISKQEWTVFFEALRIVNYRDYLIGKLIVQGIR +FT KLDEILSLRTDDLFFASNQISFRIKKRQNKETKILITFPISLMEELQKYTCGRNGRVFV +FT SKIGIPVTTSQVAHNFRLAEFHSAMKIKITPRVLRASALIHLKQIGLKDEEIMRISCLS +FT SRQSVCSYCSGEEVIPLVQTPTIL" +FT CDS complement(488..1480) +FT /codon_start=1 +FT /transl_table=11 +FT /product="hypothetical protein" +FT /note="pGP8-D" +FT /db_xref="GOA:P0CE20" +FT /db_xref="InterPro:IPR002104" +FT /db_xref="InterPro:IPR011010" +FT /db_xref="InterPro:IPR013762" +FT /db_xref="UniProtKB/Swiss-Prot:P0CE20" +FT /protein_id="AAA91568.1" +FT /translation="MGKGILSLQQEMSLEYSEKSYQEVLKIRQESYWKRMKSFSLFEVI +FT MHWTASLNKHTCRSYRGSFLSLEKIGLLSLDMNLQEFSLLNHNLILDAIKKVSSAKTSW +FT TEGTKQVRAASYISLTRFLNRMTQGIVAIAQPSKQENSRTFFKTREIVKTDAMNSLQTA +FT SFLKELKKINARDWLIAQTMLQGGKRSSEVLSLEISQICFQQATISFSQLKNRQTEKRI +FT IITYPQKFMHFLQEYIGQRRGFVFVTRSGKMVGLRQIARTFSQAGLQAAIPFKITPHVL +FT RATAVTEYKRLGCSDSDIMKVTGHATAKMIFAYDKSSREDNASKKMALI" +FT CDS 1579..2934 +FT /codon_start=1 +FT /transl_table=11 +FT /product="hypothetical protein" +FT /note="pGP1-D" +FT /db_xref="GOA:P0CE16" +FT /db_xref="InterPro:IPR003593" +FT /db_xref="InterPro:IPR007693" +FT /db_xref="InterPro:IPR007694" +FT /db_xref="InterPro:IPR027417" +FT /db_xref="InterPro:IPR036185" +FT /db_xref="UniProtKB/Swiss-Prot:P0CE16" +FT /protein_id="AAA91569.1" +FT /translation="MKTRSEIENRMQDIEYALLGKALIFEDSTEYILRQLANYEFKCSH +FT HKNIFIVFKHLKDNGLPITVDSAWEELLRRRIKDMDKSYLGLMLHDALSNDKLRSVSHT +FT VFLDDLSVCSAEENLSNFIFRSFNEYNENPLRRSPFLLLERIKGRLDSAIAKTFSIRSA +FT RGRSIYDIFSQSEIGVLARIKKRRVAFSENQNSFFDGFPTGYKDIDDKGVILAKGNFVI +FT IAARPSIGKTALAIDMAINLAVTQQRRVGFLSLEMSAGQIVERIIANLTGISGEKLQRG +FT DLSKEELFRVEEAGETVRESHFYICSDSQYKLNLIANQIRLLRKEDRVDVIFIDYLQLI +FT NSSVGENRQNEIADISRTLRGLASELNIPIVCLSQLSRKVEDRANKVPMLSDLRDSGQI +FT EQDADVILFINRKESSSNCEITVGKNRHGSVFSSVLHFDPKISKFSAIKKVW" +FT CDS 2928..3992 +FT /codon_start=1 +FT /transl_table=11 +FT /product="hypothetical protein" +FT /note="pGP2-D" +FT /db_xref="InterPro:IPR040719" +FT /db_xref="UniProtKB/Swiss-Prot:P0CE17" +FT /protein_id="AAA91570.1" +FT /translation="MVNYSNCHFIKSPIHLENQKFGRRPGQSIKISPKLAQNGMVEVIG +FT LDFLSSHYHALAAIQRLLTATNYKGNTKGVVLSRESNSFQFEGWIPRIRFTKTEFLEAY +FT GVKRYKTSRNKYEFSGKEAETALEALYHLGHQPFLIVATRTRWTNGTQIVDRYQTLSPI +FT IRIYEGWEGLTDEENIDIDLTPFNSPPTRKHKGFVVEPCPILVDQIESYFVIKPANVYQ +FT EIKMRFPNASKYAYTFIDWVITAAAKKRRKLTKDNSWPENLLLNVNVKSLAYILRMNRY +FT ICTRNWKKIELAIDKCIEIAIQLGWLSRRKRIEFLDSSKLSKKEILYLNKERFEEITKK +FT SKEQMEQLEQESIN" +FT CDS 4054..4848 +FT /codon_start=1 +FT /transl_table=11 +FT /product="hypothetical protein" +FT /note="pGP3-D" +FT /db_xref="InterPro:IPR008444" +FT /db_xref="InterPro:IPR033758" +FT /db_xref="InterPro:IPR038264" +FT /db_xref="PDB:6GJT" +FT /db_xref="UniProtKB/Swiss-Prot:P0CE18" +FT /protein_id="AAA91571.1" +FT /translation="MGNSGFYLYNTENCVFADNIKVGQMTEPLKDQQIILGTTSTPVAA +FT KMTASDGISLTVSNNSSTNASITIGLDAEKAYQLILEKLGDQILDGIADTIVDSTVQDI +FT LDKIKTDPSLGLLKAFNNFPITNKIQCNGLFTPSNIETLLGGTEIGKFTVTPKSSGSMF +FT LVSADIIASRMEGGVVLALVREGDSKPCAISYGYSSGIPNLCSLRTSITNTGLTPTTYS +FT LRVGGLESGVVWVNALSNGNDILGITNTSNVSFLEVIPQTNA" +FT CDS 4918..5226 +FT /codon_start=1 +FT /transl_table=11 +FT /product="hypothetical protein" +FT /note="pGP4-D" +FT /db_xref="UniProtKB/Swiss-Prot:P0CE23" +FT /protein_id="AAA91572.1" +FT /translation="MQNKRKVRDDFIKIVKDVKKDFPELDLKIRVNKEKVTFLNSPLEL +FT YHKSVSLILGLLQQIENSLGLFPDSPVLEKLEDNSLKLKKALIMLILSRKDMFSKAE" +FT CDS 5317..6048 +FT /codon_start=1 +FT /transl_table=11 +FT /product="hypothetical protein" +FT /note="pGP5-D (gtg start codon)" +FT /db_xref="GOA:P10559" +FT /db_xref="InterPro:IPR025669" +FT /db_xref="InterPro:IPR027417" +FT /db_xref="UniProtKB/Swiss-Prot:P10559" +FT /protein_id="AAA91573.1" +FT /translation="MGCNLAQFLGKKVLLADLDPQSNLSSGLGASVRSDQKGLHDIVYT +FT SNDLKSIICETKKDSVDLIPASFSSEQFRELDIHRGPSNNLKLFLNEYCAPFYDICIID +FT TPPSLGGLTKEAFVAGDKLIACLTPEPFSILGLQKIREFLSSVGKPEEEHILGIALSFW +FT DDRNSTNQMYIDIIESIYKNKLFSTKIRRDISLSRSLLKEDSVANVYPNSRAAEDILKL +FT THEIANILHIEYERDYSQRTT" +FT CDS 6045..6788 +FT /codon_start=1 +FT /transl_table=11 +FT /product="hypothetical protein" +FT /note="pGP6-D (gtg start codon)" +FT /db_xref="InterPro:IPR005350" +FT /db_xref="UniProtKB/Swiss-Prot:P10560" +FT /protein_id="AAA91574.1" +FT /translation="MNKLKKEADVFFKKNQTAASLDFKKTLPSIELFSATLNSEESQSL +FT DRLFLSESQNYSDEEFYQEDILAVKLLTGQIKSIQKQHVLLLGEKIYNARKILSKDHFS +FT STTFSSWIELVFRTKSSAYNALAYYELFINLPNQTLQKEFQSIPYKSAYILAARKGDLK +FT TKVDVIGKVCGMSNSSAIRVLDQFLPSSRNKDVRETIDKSDSEKNRQLSDFLIEILRIM +FT CSGVSLSSYNENLLQQLFELFKQKS" +FT repeat_region 6857..6945 +FT /note="four tandem 22bp repeats" +XX +SQ Sequence 7502 BP; 2460 A; 1285 C; 1433 G; 2324 T; 0 other; + ggatccgtaa gttagacgaa attttgtctt tgcgcacaga cgatctattt tttgcatcca 60 + atcagatttc ctttcgcatt aaaaaaagac agaataaaga aaccaaaatt ctaatcacat 120 + ttcctatcag cttaatggaa gagttgcaaa aatacacttg tgggagaaat gggagagtat 180 + ttgtttctaa aatagggatt cctgtaacaa caagtcaggt tgcgcataat tttaggcttg 240 + cagagttcca tagtgctatg aaaataaaaa ttactcccag agtacttcgt gcaagcgctt 300 + tgattcattt aaagcaaata ggattaaaag atgaggaaat catgcgtatt tcctgtcttt 360 + catcgagaca aagtgtgtgt tcttattgtt ctggggaaga ggtaattcct ctagtacaaa 420 + cacccacaat attgtgatat aattaaaatt atattcatat tctgttgcca gaaaaaacac 480 + ctttaggcta tattagagcc atcttctttg aagcgttgtc ttctcgagaa gatttatcgt 540 + acgcaaatat catctttgcg gttgcgtgtc ctgtgacctt cattatgtcg gagtctgagc 600 + accctaggcg tttgtactcc gtcacagcgg ttgctcgaag cacgtgcggg gttattttaa 660 + aagggattgc agcttgtagt cctgcttgag agaacgtgcg ggcgatttgc cttaacccca 720 + ccatttttcc ggagcgagtt acgaagacaa aacctcttcg ttgaccgatg tactcttgta 780 + gaaagtgcat aaacttctga ggataagtta taataatcct cttttctgtc tgacggttct 840 + taagctggga gaaagaaatg gtagcttgtt ggaaacaaat ctgactaatc tccaagctta 900 + agacttcaga ggagcgttta cctccttgga gcattgtctg ggcgatcaac caatcccggg 960 + cattgatttt ttttagctct tttaggaagg atgctgtttg caaactgttc atcgcatccg 1020 + tttttactat ttccctggtt ttaaaaaatg ttcgactatt ttcttgttta gaaggttgcg 1080 + ctatagcgac tattccttga gtcatcctgt ttaggaatct tgttaaggaa atatagcttg 1140 + ctgctcgaac ttgtttagta ccttcggtcc aagaagtctt ggcagaggaa acttttttaa 1200 + tcgcatctag gattagatta tgatttaaaa gggaaaactc ttgcagattc atatccaagg 1260 + acaatagacc aatcttttct aaagacaaaa aagatcctcg atatgatcta caagtatgtt 1320 + tgttgagtga tgcggtccaa tgcataataa cttcgaataa ggagaagctt ttcatgcgtt 1380 + tccaatagga ttcttggcga atttttaaaa cttcctgata agacttttca ctatattcta 1440 + acgacatttc ttgctgcaaa gataaaatcc ctttacccat gaaatccctc gtgatataac 1500 + ctatccgtaa aatgtcctga ttagtgaaat aatcaggttg ttaacaggat agcacgctcg 1560 + gtattttttt atataaacat gaaaactcgt tccgaaatag aaaatcgcat gcaagatatc 1620 + gagtatgcgt tgttaggtaa agctctgata tttgaagact ctactgagta tattctgagg 1680 + cagcttgcta attatgagtt taagtgttct catcataaaa acatattcat agtatttaaa 1740 + cacttaaaag acaatggatt acctataact gtagactcgg cttgggaaga gcttttgcgg 1800 + cgtcgtatca aagatatgga caaatcgtat ctcgggttaa tgttgcatga tgctttatca 1860 + aatgacaagc ttagatccgt ttctcatacg gttttcctcg atgatttgag cgtgtgtagc 1920 + gctgaagaaa atttgagtaa tttcattttc cgctcgttta atgagtacaa tgaaaatcca 1980 + ttgcgtagat ctccgtttct attgcttgag cgtataaagg gaaggcttga tagtgctata 2040 + gcaaagactt tttctattcg cagcgctaga ggccggtcta tttatgatat attctcacag 2100 + tcagaaattg gagtgctggc tcgtataaaa aaaagacgag tagcgttctc tgagaatcaa 2160 + aattctttct ttgatggctt cccaacagga tacaaggata ttgatgataa aggagttatc 2220 + ttagctaaag gtaatttcgt gattatagca gctagaccat ctatagggaa aacagcttta 2280 + gctatagaca tggcgataaa tcttgcggtt actcaacagc gtagagttgg tttcctatct 2340 + ctagaaatga gcgcaggtca aattgttgag cggattattg ctaatttaac aggaatatct 2400 + ggtgaaaaat tacaaagagg ggatctctct aaagaagaat tattccgagt agaagaagct 2460 + ggagaaacgg ttagagaatc acatttttat atctgcagtg atagtcagta taagcttaac 2520 + ttaatcgcga atcagatccg gttgctgaga aaagaagatc gagtagacgt aatatttatc 2580 + gattacttgc agttgatcaa ctcatcggtt ggagaaaatc gtcaaaatga aatagcagat 2640 + atatctagaa ccttaagagg tttagcctca gagctaaaca ttcctatagt ttgtttatcc 2700 + caactatcta gaaaagttga ggatagagca aataaagttc ccatgctttc agatttgcga 2760 + gacagcggtc aaatagagca agacgcagat gtgattttgt ttatcaatag gaaggaatcg 2820 + tcttctaatt gtgagataac tgttgggaaa aatagacatg gatcggtttt ctcttcggta 2880 + ttacatttcg atccaaaaat tagtaaattc tccgctatta aaaaagtatg gtaaattata 2940 + gtaactgcca cttcatcaaa agtcctatcc accttgaaaa tcagaagttt ggaagaagac 3000 + ctggtcaatc tattaagata tctcccaaat tggctcaaaa tgggatggta gaagttatag 3060 + gtcttgattt tctttcatct cattaccatg cattagcagc tatccaaaga ttactgaccg 3120 + caacgaatta caaggggaac acaaaagggg ttgttttatc cagagaatca aatagttttc 3180 + aatttgaagg atggatacca agaatccgtt ttacaaaaac tgaattctta gaggcttatg 3240 + gagttaagcg gtataaaaca tccagaaata agtatgagtt tagtggaaaa gaagctgaaa 3300 + ctgctttaga agccttatac catttaggac atcaaccgtt tttaatagtg gcaactagaa 3360 + ctcgatggac taatggaaca caaatagtag accgttacca aactctttct ccgatcatta 3420 + ggatttacga aggatgggaa ggtttaactg acgaagaaaa tatagatata gacttaacac 3480 + cttttaattc accacctaca cggaaacata aagggttcgt tgtagagcca tgtcctatct 3540 + tggtagatca aatagaatcc tactttgtaa tcaagcctgc aaatgtatac caagaaataa 3600 + aaatgcgttt cccaaatgca tcaaagtatg cttacacatt tatcgactgg gtgattacag 3660 + cagctgcgaa aaagagacga aaattaacta aggataattc ttggccagaa aacttgttat 3720 + taaacgttaa cgttaaaagt cttgcatata ttttaaggat gaatcggtac atctgtacaa 3780 + ggaactggaa aaaaatcgag ttagctatcg ataaatgtat agaaatcgcc attcagcttg 3840 + gctggttatc tagaagaaaa cgcattgaat ttctggattc ttctaaactc tctaaaaaag 3900 + aaattctata tctaaataaa gagcgctttg aagaaataac taagaaatct aaagaacaaa 3960 + tggaacaatt agaacaagaa tctattaatt aatagcaagc ttgaaactaa aaacctaatt 4020 + tatttaaagc tcaaaataaa aaagagtttt aaaatgggaa attctggttt ttatttgtat 4080 + aacactgaaa actgcgtctt tgctgataat atcaaagttg ggcaaatgac agagccgctc 4140 + aaggaccagc aaataatcct tgggacaaca tcaacacctg tcgcagccaa aatgacagct 4200 + tctgatggaa tatctttaac agtctccaat aattcatcaa ccaatgcttc tattacaatt 4260 + ggtttggatg cggaaaaagc ttaccagctt attctagaaa agttgggaga tcaaattctt 4320 + gatggaattg ctgatactat tgttgatagt acagtccaag atattttaga caaaatcaaa 4380 + acagaccctt ctctaggttt gttgaaagct tttaacaact ttccaatcac taataaaatt 4440 + caatgcaacg ggttattcac tcccagtaac attgaaactt tattaggagg aactgaaata 4500 + ggaaaattca cagtcacacc caaaagctct gggagcatgt tcttagtctc agcagatatt 4560 + attgcatcaa gaatggaagg cggcgttgtt ctagctttgg tacgagaagg tgattctaag 4620 + ccctgcgcga ttagttatgg atactcatca ggcattccta atttatgtag tctaagaacc 4680 + agtattacta atacaggatt gactccgaca acgtattcat tacgtgtagg cggtttagaa 4740 + agcggtgtgg tatgggttaa tgccctttct aatggcaatg atattttagg aataacaaat 4800 + acttctaatg tatctttttt agaggtaata cctcaaacaa acgcttaaac aatttttatt 4860 + ggatttttct tataggtttt atatttagag aaaacagttc gaattacggg gtttgttatg 4920 + caaaataaaa gaaaagtgag ggacgatttt attaaaattg ttaaagatgt gaaaaaagat 4980 + ttccccgaat tagacctaaa aatacgagta aacaaggaaa aagtaacttt cttaaattct 5040 + cccttagaac tctaccataa aagtgtctca ctaattctag gactgcttca acaaatagaa 5100 + aactctttag gattattccc agactctcct gttcttgaaa aattagagga taacagttta 5160 + aagctaaaaa aggctttgat tatgcttatc ttgtctagaa aagacatgtt ttccaaggct 5220 + gaatagacaa cttactctaa cgttggagtt gatttgcaca ccttagtttt ttgctctttt 5280 + aagggaggaa ctggaaaaac aacactttct ctaaacgtgg gatgcaactt ggcccaattt 5340 + ttagggaaaa aagtgttact tgctgaccta gacccgcaat ccaatttatc ttctggattg 5400 + ggggctagtg tcagaagtga ccaaaaaggc ttgcacgaca tagtatacac atcaaacgat 5460 + ttaaaatcaa tcatttgcga aacaaaaaaa gatagtgtgg acctaattcc tgcatcattt 5520 + tcatccgaac agtttagaga attggatatt catagaggac ctagtaacaa cttaaagtta 5580 + tttctgaatg agtactgcgc tcctttttat gacatctgca taatagacac tccacctagc 5640 + ctaggagggt taacgaaaga agcttttgtt gcaggagaca aattaattgc ttgtttaact 5700 + ccagaacctt tttctattct agggttacaa aagatacgtg aattcttaag ttcggtcgga 5760 + aaacctgaag aagaacacat tcttggaata gctttgtctt tttgggatga tcgtaactcg 5820 + actaaccaaa tgtatataga cattatcgag tctatttaca aaaacaagct tttttcaaca 5880 + aaaattcgtc gagatatttc tctcagccgt tctcttctta aagaagattc tgtagctaat 5940 + gtctatccaa attctagggc cgcagaagat attctgaagt taacgcatga aatagcaaat 6000 + attttgcata tcgaatatga acgagattac tctcagagga caacgtgaac aaactaaaaa 6060 + aagaagcgga tgtctttttt aaaaaaaatc aaactgccgc ttctctagat tttaagaaga 6120 + cgcttccctc cattgaacta ttctcagcaa ctttgaattc tgaggaaagt cagagtttgg 6180 + atcgattatt tttatcagag tcccaaaact attcggatga agaattttat caagaagaca 6240 + tcctagcggt aaaactgctt actggtcaga taaaatccat acagaagcaa cacgtacttc 6300 + ttttaggaga aaaaatctat aatgctagaa aaatcctgag taaggatcac ttctcctcaa 6360 + caactttttc atcttggata gagttagttt ttagaactaa gtcttctgct tacaatgctc 6420 + ttgcatatta cgagcttttt ataaacctcc ccaaccaaac tctacaaaaa gagtttcaat 6480 + cgatccccta taaatccgca tatattttgg ccgctagaaa aggcgattta aaaaccaagg 6540 + tcgatgtgat agggaaagta tgtggaatgt cgaactcatc ggcgataagg gtgttggatc 6600 + aatttcttcc ttcatctaga aacaaagacg ttagagaaac gatagataag tctgattcag 6660 + agaagaatcg ccaattatct gatttcttaa tagagatact tcgcatcatg tgttccggag 6720 + tttctttgtc ctcctataac gaaaatcttc tacaacagct ttttgaactt tttaagcaaa 6780 + agagctgatc ctccgtcagc tcatatatat atatctatta tatatatata tttagggatt 6840 + tgatttcacg agagagattt gcaactcttg gtggtagact ttgcaactct tggtggtaga 6900 + ctttgcaact cttggtggta gactttgcaa ctcttggtgg tagacttggt cataatggac 6960 + ttttgttaaa aaatttatta aaatcttaga gctccgattt tgaatagctt tggttaagaa 7020 + aatgggctcg atggctttcc ataaaagtag attgttttta acttttgggg acgcgtcgga 7080 + aatttggtta tctactttat cttatctaac tagaaaaaat tatgcgtctg ggattaactt 7140 + tcttgtttct ttagagattc tggatttatc ggaaaccttg ataaaggcta tttctcttga 7200 + ccacagcgaa tctttgttta aaatcaagtc tctagatgtt tttaatggaa aagttgtttc 7260 + agaggcatct aaacaggcta gagcggcatg ctacatatct ttcacaaagt ttttgtatag 7320 + attgaccaag ggatatatta aacccgctat tccattgaaa gattttggaa acactacatt 7380 + ttttaaaatc cgagacaaaa tcaaaacaga atcgatttct aagcaggaat ggacagtttt 7440 + ttttgaagcg ctccggatag tgaattatag agactattta atcggtaaat tgattgtaca 7500 + ag 7502 +//