import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
+import jalview.util.DBRefUtils;
import jalview.util.DnaUtils;
import jalview.util.MappingUtils;
*/
public class EmblFlatFile extends AlignFile // FileParse
{
+ /**
+  * A data bean class to hold values parsed from one CDS Feature (FT).
+  * <p>
+  * Declared as a static nested class: it reads nothing from the enclosing
+  * parser, so instances need not carry a hidden reference to it.
+  */
+ static class CdsData
+ {
+   /** from the CDS feature /translation qualifier */
+   String translation;
+
+   /** raw value of the CDS feature location */
+   String cdsLocation;
+
+   /** from CDS /codon_start; stays 1 unless a value is parsed */
+   int codonStart = 1;
+
+   /** from CDS /product; TODO: use for protein description */
+   String proteinName;
+
+   /** from CDS /protein_id */
+   String proteinId;
+
+   /** other CDS qualifiers, keyed by qualifier name; never reassigned */
+   final Map<String, String> cdsProps = new Hashtable<>();
+ }
+
private static final String WHITESPACE = "\\s+";
private String sourceDb;
-
+
/*
* values parsed from the EMBL flatfile record
*/
private String sequenceString; // from SQ lines
- private String translation; // from CDS feature /translation
-
- private String cdsLocation; // CDS /location raw value
-
- private int codonStart = 1; // from CDS /codon_start
-
- private String proteinName; // from CDS /product
-
- private String proteinId; // from CDS /protein_id
-
- private Map<String, String> cdsProps; // CDS other qualifiers e.g. 'note'
-
+ private List<CdsData> cds;
+
/**
* Constructor
+ *
* @param fp
* @param sourceId
* @throws IOException
super(false, fp); // don't parse immediately
this.sourceDb = sourceId;
dbrefs = new ArrayList<>();
- cdsProps = new Hashtable<>();
+ cds = new ArrayList<>();
}
/**
{
if (line.startsWith("ID"))
{
- line = processID(line);
+ line = parseID(line);
}
else if (line.startsWith("DR"))
{
- line = processDR(line);
+ line = parseDR(line);
}
else if (line.startsWith("SQ"))
{
- line = processSQ();
+ line = parseSQ();
}
else if (line.startsWith("FT"))
{
- line = processFT(line);
+ line = parseFT(line);
}
else
{
* @param line
* @throws IOException
*/
- String processID(String line) throws IOException
+ String parseID(String line) throws IOException
{
String[] tokens = line.substring(2).split(";");
* @param line
* @throws IOException
*/
- String processDR(String line) throws IOException
+ String parseDR(String line) throws IOException
{
String[] tokens = line.substring(2).split(";");
if (tokens.length > 1)
{
+ /*
+ * ensure UniProtKB/Swiss-Prot converted to UNIPROT
+ */
String db = tokens[0].trim();
+ db = DBRefUtils.getCanonicalName(db);
String acc = tokens[1].trim();
if (acc.endsWith("."))
{
acc = acc.substring(0, acc.length() - 1);
}
- this.dbrefs.add(new DBRefEntry(db, "0", acc));
+ String version = "0";
+ if (tokens.length > 2)
+ {
+ String secondaryId = tokens[2].trim();
+ if (!secondaryId.isEmpty())
+ {
+ // todo: is this right? secondary id is not a version number
+ // version = secondaryId;
+ }
+ }
+ this.dbrefs.add(new DBRefEntry(db, version, acc));
}
return nextLine();
*
* @throws IOException
*/
- String processSQ() throws IOException
+ String parseSQ() throws IOException
{
StringBuilder sb = new StringBuilder(this.length);
String line = nextLine();
* @return
* @throws IOException
*/
- String processFT(String line) throws IOException
+ String parseFT(String line) throws IOException
{
String[] tokens = line.split(WHITESPACE);
if (tokens.length < 3 || !"CDS".equals(tokens[1]))
return nextLine();
}
- this.cdsLocation = tokens[2];
+ CdsData data = new CdsData();
+ data.cdsLocation = tokens[2];
- while ((line = nextLine()) != null)
+ line = nextLine();
+ while (line != null)
{
if (!line.startsWith("FT ")) // 4 spaces
{
if ("protein_id".equals(qualifier))
{
- proteinId = value;
+ data.proteinId = value;
+ line = nextLine();
}
else if ("codon_start".equals(qualifier))
{
try
{
- codonStart = Integer.parseInt(value.trim());
+ data.codonStart = Integer.parseInt(value.trim());
} catch (NumberFormatException e)
{
Cache.log.error("Invalid codon_start in XML for " + this.accession
+ ": " + e.getMessage());
}
+ line = nextLine();
+ }
+ else if ("db_xref".equals(qualifier))
+ {
+ String[] parts = value.split(":");
+ if (parts.length == 2)
+ {
+ String db = parts[0].trim();
+ db = DBRefUtils.getCanonicalName(db);
+ DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
+ this.dbrefs.add(dbref);
+ }
+ line = nextLine();
}
else if ("product".equals(qualifier))
{
// sometimes name is returned e.g. for V00488
- proteinName = value;
+ data.proteinName = value;
+ line = nextLine();
}
else if ("translation".equals(qualifier))
{
- line = readTranslation(value);
+ line = readTranslation(value, data);
}
else if (!"".equals(value))
{
// throw anything else into the additional properties hash
- cdsProps.put(qualifier, value);
+ data.cdsProps.put(qualifier, value);
+ line = nextLine();
}
}
+
+ this.cds.add(data);
return line;
}
/**
- * Reads and saves the CDS translation from one or more lines of the file, and
- * returns the next line after that
+ * Reads the CDS translation from one or more lines of the file and saves it
+ * in the supplied CDS data bean, then returns the next line after that
*
* @param value
* the first line of the translation (likely quoted)
+ * @param data
+ * the CDS data bean in which the translation is saved
* @return
* @throws IOException
*/
- String readTranslation(String value) throws IOException
+ String readTranslation(String value, CdsData data) throws IOException
{
StringBuilder sb = new StringBuilder(this.length / 3 + 1);
sb.append(value.replace("\"", ""));
sb.append(tokens[1].replace("\"", ""));
}
- return sb.toString();
+ data.translation = sb.toString();
+
+ return line;
}
/**
* <li>add a CDS feature to the sequence for each CDS start-end range</li>
* <li>create a protein product sequence for the translation</li>
* <li>create a cross-reference to protein with mapping from dna</li>
- * <li>add any CDS dbrefs to the sequence and to the protein product</li>
+ * <li>add any CDS dbrefs to the sequence and to the protein product</li>
* </ul>
- * @param SequenceI dna
+ *
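+ * @param dna
+ * the nucleotide sequence to which the CDS features are added
+ * @param data
+ * the values parsed from the CDS feature being processed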
*/
- void processCDS(SequenceI dna)
+ void processCDS(SequenceI dna, CdsData data)
{
/*
* parse location into a list of [start, end, start, end] positions
*/
- int[] exons = getCdsRanges(this.accession, this.cdsLocation);
+ int[] exons = getCdsRanges(this.accession, data.cdsLocation);
int exonNumber = 0;
-
- for (int xint = 0; exons != null
- && xint < exons.length - 1; xint += 2)
+
+ for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
{
int exonStart = exons[xint];
int exonEnd = exons[xint + 1];
int end = Math.max(exonStart, exonEnd);
exonNumber++;
String desc = String.format("Exon %d for protein EMBLCDS:%s",
- exonNumber, proteinId);
+ exonNumber, data.proteinId);
- SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end, this.sourceDb);
- if (!cdsProps.isEmpty())
+ SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
+ this.sourceDb);
+ for (Entry<String, String> val : data.cdsProps.entrySet())
{
- for (Entry<String, String> val : cdsProps.entrySet())
- {
- sf.setValue(val.getKey(), val.getValue());
- }
+ sf.setValue(val.getKey(), val.getValue());
}
- sf.setEnaLocation(this.cdsLocation);
+ sf.setEnaLocation(data.cdsLocation);
boolean forwardStrand = exonStart <= exonEnd;
sf.setStrand(forwardStrand ? "+" : "-");
- sf.setPhase(String.valueOf(codonStart - 1));
+ sf.setPhase(String.valueOf(data.codonStart - 1));
sf.setValue(FeatureProperties.EXONPOS, exonNumber);
- sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
+ sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
dna.addSequenceFeature(sf);
}
}
/**
+ * Constructs a sequence for the protein product (if there is one), and dbrefs
+ * with mappings from dna to protein and the reverse.
+ * <p>
+ * NOTE: not yet implemented - the method body is currently empty, so calling
+ * it has no effect.
+ */
+ void processTranslation()
+ {
+ // TODO implement; placeholder only, intentionally does nothing for now
+
+ }
+
+ /**
* Constructs and saves the sequence from parsed components
*/
void assembleSequence()
{
seq.addDBRef(dbref);
}
-
- processCDS(seq);
+
+ for (CdsData data : cds)
+ {
+ processCDS(seq, data);
+ };
+
+ processTranslation();
+
seq.deriveSequence();
-
+
addSequence(seq);
}
--- /dev/null
+package jalview.io;
+
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertTrue;
+import static org.testng.Assert.assertNull;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.util.List;
+import java.util.Set;
+
+import org.testng.annotations.Test;
+
+import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+import jalview.datamodel.features.SequenceFeatures;
+
+public class EmblFlatFileTest
+{
+ /**
+ * A fairly tough test, using J03321 (circular DNA), which has 8 CDS features,
+ * one of them reverse strand
+ *
+ * @throws MalformedURLException
+ * @throws IOException
+ */
+ @Test(groups = "Functional")
+ public void testParse() throws MalformedURLException, IOException
+ {
+ File dataFile = new File("test/jalview/io/J03321.embl.txt");
+ FileParse fp = new FileParse(dataFile, DataSourceType.FILE);
+ EmblFlatFile parser = new EmblFlatFile(fp, "EmblTest");
+ parser.parse();
+ List<SequenceI> seqs = parser.getSeqs();
+
+ assertEquals(seqs.size(), 1);
+ SequenceI seq = seqs.get(0);
+ assertEquals(seq.getName(), "EmblTest|J03321");
+ assertEquals(seq.getLength(), 7502);
+
+ /*
+ * should be 9 CDS features (one is a 'join' of two exons)
+ */
+ Set<String> featureTypes = seq.getFeatures().getFeatureTypes();
+ assertEquals(featureTypes.size(), 1);
+ assertTrue(featureTypes.contains("CDS"));
+
+ /*
+ * inspect some features (sort them for convenience of test assertions)
+ */
+ List<SequenceFeature> features = seq.getFeatures()
+ .getAllFeatures("CDS");
+ SequenceFeatures.sortFeatures(features, true);
+ assertEquals(features.size(), 9);
+
+ SequenceFeature sf = features.get(0);
+ assertEquals(sf.getBegin(), 1);
+ assertEquals(sf.getEnd(), 437);
+ assertEquals(sf.getDescription(),
+ "Exon 2 for protein EMBLCDS:AAA91567.1");
+ assertEquals(sf.getFeatureGroup(), "EmblTest");
+ assertEquals(sf.getEnaLocation(), "join(7022..7502,1..437)");
+ assertEquals(sf.getPhase(), "0");
+ assertEquals(sf.getStrand(), 1);
+ assertEquals(sf.getValue("note"), "pGP7-D");
+ // second exon of circular DNA!
+ assertEquals(sf.getValue("exon number"), 2);
+ assertEquals(sf.getValue("product"), "hypothetical protein");
+ assertEquals(sf.getValue("transl_table"), "11");
+
+ sf = features.get(1);
+ assertEquals(sf.getBegin(), 488);
+ assertEquals(sf.getEnd(), 1480);
+ assertEquals(sf.getDescription(),
+ "Exon 1 for protein EMBLCDS:AAA91568.1");
+ assertEquals(sf.getFeatureGroup(), "EmblTest");
+ assertEquals(sf.getEnaLocation(), "complement(488..1480)");
+ assertEquals(sf.getPhase(), "0");
+ assertEquals(sf.getStrand(), -1); // reverse strand!
+ assertEquals(sf.getValue("note"), "pGP8-D");
+ assertEquals(sf.getValue("exon number"), 1);
+ assertEquals(sf.getValue("product"), "hypothetical protein");
+
+ sf = features.get(7);
+ assertEquals(sf.getBegin(), 6045);
+ assertEquals(sf.getEnd(), 6788);
+ assertEquals(sf.getDescription(),
+ "Exon 1 for protein EMBLCDS:AAA91574.1");
+ assertEquals(sf.getFeatureGroup(), "EmblTest");
+ assertEquals(sf.getEnaLocation(), "6045..6788");
+ assertEquals(sf.getPhase(), "0");
+ assertEquals(sf.getStrand(), 1);
+ assertEquals(sf.getValue("note"), "pGP6-D (gtg start codon)");
+ assertEquals(sf.getValue("exon number"), 1);
+ assertEquals(sf.getValue("product"), "hypothetical protein");
+
+ /*
+ * CDS at 7022-7502 is the first exon of the circular DNA CDS
+ */
+ sf = features.get(8);
+ assertEquals(sf.getBegin(), 7022);
+ assertEquals(sf.getEnd(), 7502);
+ assertEquals(sf.getDescription(),
+ "Exon 1 for protein EMBLCDS:AAA91567.1");
+ assertEquals(sf.getFeatureGroup(), "EmblTest");
+ assertEquals(sf.getEnaLocation(), "join(7022..7502,1..437)");
+ assertEquals(sf.getPhase(), "0");
+ assertEquals(sf.getStrand(), 1);
+ assertEquals(sf.getValue("note"), "pGP7-D");
+ assertEquals(sf.getValue("exon number"), 1);
+ assertEquals(sf.getValue("product"), "hypothetical protein");
+
+ /*
+ * there are 4 'direct' (DR) dbrefs, and numerous CDS /db_xref entries,
+ * some of them (e.g. INTERPRO) duplicates; sample a few here
+ * Note DBRefEntry constructor capitalises source
+ */
+ List<DBRefEntry> dbrefs = seq.getDBRefs();
+ assertEquals(dbrefs.size(), 31);
+ // 1st DR line; note trailing period is removed
+ assertTrue(dbrefs.contains(new DBRefEntry("MD5", "0",
+ "d4c4942a634e3df4995fd5ac75c26a61")));
+ // the 4th DR line:
+ assertTrue(
+ dbrefs.contains(new DBRefEntry("EuropePMC", "0", "PMC87941")));
+ // from the first CDS feature; note canonicalisation to "UNIPROT"
+ assertTrue(dbrefs.contains(new DBRefEntry("GOA", "0", "P0CE19")));
+ assertTrue(dbrefs.contains(new DBRefEntry("UNIPROT", "0", "P0CE19")));
+ // from the last CDS feature
+ assertTrue(dbrefs.contains(new DBRefEntry("INTERPRO", "0", "IPR005350")));
+
+ // todo: mappings to, and sequences for, UNIPROT proteins
+ }
+}
--- /dev/null
+ID J03321; SV 1; circular; genomic DNA; STD; PRO; 7502 BP.
+XX
+AC J03321;
+XX
+DT 27-JUL-1990 (Rel. 24, Created)
+DT 10-APR-2020 (Rel. 144, Last updated, Version 9)
+XX
+DE Chlamydia trachomatis plasmid pCHL1, complete sequence.
+XX
+KW .
+XX
+OS Chlamydia trachomatis
+OC Bacteria; Chlamydiae; Chlamydiales; Chlamydiaceae;
+OC Chlamydia/Chlamydophila group; Chlamydia.
+OG Plasmid pCHL1
+XX
+RN [1]
+RP 1-7502
+RX DOI; 10.1016/0147-619X(90)90034-A.
+RX PUBMED; 2194229.
+RA Comanducci M., Ricci S., Cevenini R., Ratti G.;
+RT "Diversity of the Chlamydia trachomatis common plasmid in biovars with
+RT different pathogenicity";
+RL Plasmid 23(2):149-154(1990).
+XX
+RN [2]
+RP 1-7502
+RA Comanducci M., Ricci S., Cevenini R., Ratti G.;
+RT ;
+RL Submitted (23-JUN-2010) to the INSDC.
+RL Sclavo Research Centre, Siena, Italy
+XX
+DR MD5; d4c4942a634e3df4995fd5ac75c26a61.
+DR BioSample; SAMN14225621.
+DR EuropePMC; PMC4450983; 26031715.
+DR EuropePMC; PMC87941; 11283058.
+XX
+CC Draft entry and computer-readable sequence kindly submitted by
+CC G.Ratti, 28-MAR-1990.
+XX
+FH Key Location/Qualifiers
+FH
+FT source 1..7502
+FT /organism="Chlamydia trachomatis"
+FT /plasmid="pCHL1"
+FT /isolate="G0/86"
+FT /serotype="D"
+FT /mol_type="genomic DNA"
+FT /isolation_source="trachoma"
+FT /db_xref="taxon:813"
+FT CDS join(7022..7502,1..437)
+FT /codon_start=1
+FT /transl_table=11
+FT /product="hypothetical protein"
+FT /note="pGP7-D"
+FT /db_xref="GOA:P0CE19"
+FT /db_xref="InterPro:IPR002104"
+FT /db_xref="InterPro:IPR011010"
+FT /db_xref="InterPro:IPR013762"
+FT /db_xref="UniProtKB/Swiss-Prot:P0CE19"
+FT /protein_id="AAA91567.1"
+FT /translation="MGSMAFHKSRLFLTFGDASEIWLSTLSYLTRKNYASGINFLVSLE
+FT ILDLSETLIKAISLDHSESLFKIKSLDVFNGKVVSEASKQARAACYISFTKFLYRLTKG
+FT YIKPAIPLKDFGNTTFFKIRDKIKTESISKQEWTVFFEALRIVNYRDYLIGKLIVQGIR
+FT KLDEILSLRTDDLFFASNQISFRIKKRQNKETKILITFPISLMEELQKYTCGRNGRVFV
+FT SKIGIPVTTSQVAHNFRLAEFHSAMKIKITPRVLRASALIHLKQIGLKDEEIMRISCLS
+FT SRQSVCSYCSGEEVIPLVQTPTIL"
+FT CDS complement(488..1480)
+FT /codon_start=1
+FT /transl_table=11
+FT /product="hypothetical protein"
+FT /note="pGP8-D"
+FT /db_xref="GOA:P0CE20"
+FT /db_xref="InterPro:IPR002104"
+FT /db_xref="InterPro:IPR011010"
+FT /db_xref="InterPro:IPR013762"
+FT /db_xref="UniProtKB/Swiss-Prot:P0CE20"
+FT /protein_id="AAA91568.1"
+FT /translation="MGKGILSLQQEMSLEYSEKSYQEVLKIRQESYWKRMKSFSLFEVI
+FT MHWTASLNKHTCRSYRGSFLSLEKIGLLSLDMNLQEFSLLNHNLILDAIKKVSSAKTSW
+FT TEGTKQVRAASYISLTRFLNRMTQGIVAIAQPSKQENSRTFFKTREIVKTDAMNSLQTA
+FT SFLKELKKINARDWLIAQTMLQGGKRSSEVLSLEISQICFQQATISFSQLKNRQTEKRI
+FT IITYPQKFMHFLQEYIGQRRGFVFVTRSGKMVGLRQIARTFSQAGLQAAIPFKITPHVL
+FT RATAVTEYKRLGCSDSDIMKVTGHATAKMIFAYDKSSREDNASKKMALI"
+FT CDS 1579..2934
+FT /codon_start=1
+FT /transl_table=11
+FT /product="hypothetical protein"
+FT /note="pGP1-D"
+FT /db_xref="GOA:P0CE16"
+FT /db_xref="InterPro:IPR003593"
+FT /db_xref="InterPro:IPR007693"
+FT /db_xref="InterPro:IPR007694"
+FT /db_xref="InterPro:IPR027417"
+FT /db_xref="InterPro:IPR036185"
+FT /db_xref="UniProtKB/Swiss-Prot:P0CE16"
+FT /protein_id="AAA91569.1"
+FT /translation="MKTRSEIENRMQDIEYALLGKALIFEDSTEYILRQLANYEFKCSH
+FT HKNIFIVFKHLKDNGLPITVDSAWEELLRRRIKDMDKSYLGLMLHDALSNDKLRSVSHT
+FT VFLDDLSVCSAEENLSNFIFRSFNEYNENPLRRSPFLLLERIKGRLDSAIAKTFSIRSA
+FT RGRSIYDIFSQSEIGVLARIKKRRVAFSENQNSFFDGFPTGYKDIDDKGVILAKGNFVI
+FT IAARPSIGKTALAIDMAINLAVTQQRRVGFLSLEMSAGQIVERIIANLTGISGEKLQRG
+FT DLSKEELFRVEEAGETVRESHFYICSDSQYKLNLIANQIRLLRKEDRVDVIFIDYLQLI
+FT NSSVGENRQNEIADISRTLRGLASELNIPIVCLSQLSRKVEDRANKVPMLSDLRDSGQI
+FT EQDADVILFINRKESSSNCEITVGKNRHGSVFSSVLHFDPKISKFSAIKKVW"
+FT CDS 2928..3992
+FT /codon_start=1
+FT /transl_table=11
+FT /product="hypothetical protein"
+FT /note="pGP2-D"
+FT /db_xref="InterPro:IPR040719"
+FT /db_xref="UniProtKB/Swiss-Prot:P0CE17"
+FT /protein_id="AAA91570.1"
+FT /translation="MVNYSNCHFIKSPIHLENQKFGRRPGQSIKISPKLAQNGMVEVIG
+FT LDFLSSHYHALAAIQRLLTATNYKGNTKGVVLSRESNSFQFEGWIPRIRFTKTEFLEAY
+FT GVKRYKTSRNKYEFSGKEAETALEALYHLGHQPFLIVATRTRWTNGTQIVDRYQTLSPI
+FT IRIYEGWEGLTDEENIDIDLTPFNSPPTRKHKGFVVEPCPILVDQIESYFVIKPANVYQ
+FT EIKMRFPNASKYAYTFIDWVITAAAKKRRKLTKDNSWPENLLLNVNVKSLAYILRMNRY
+FT ICTRNWKKIELAIDKCIEIAIQLGWLSRRKRIEFLDSSKLSKKEILYLNKERFEEITKK
+FT SKEQMEQLEQESIN"
+FT CDS 4054..4848
+FT /codon_start=1
+FT /transl_table=11
+FT /product="hypothetical protein"
+FT /note="pGP3-D"
+FT /db_xref="InterPro:IPR008444"
+FT /db_xref="InterPro:IPR033758"
+FT /db_xref="InterPro:IPR038264"
+FT /db_xref="PDB:6GJT"
+FT /db_xref="UniProtKB/Swiss-Prot:P0CE18"
+FT /protein_id="AAA91571.1"
+FT /translation="MGNSGFYLYNTENCVFADNIKVGQMTEPLKDQQIILGTTSTPVAA
+FT KMTASDGISLTVSNNSSTNASITIGLDAEKAYQLILEKLGDQILDGIADTIVDSTVQDI
+FT LDKIKTDPSLGLLKAFNNFPITNKIQCNGLFTPSNIETLLGGTEIGKFTVTPKSSGSMF
+FT LVSADIIASRMEGGVVLALVREGDSKPCAISYGYSSGIPNLCSLRTSITNTGLTPTTYS
+FT LRVGGLESGVVWVNALSNGNDILGITNTSNVSFLEVIPQTNA"
+FT CDS 4918..5226
+FT /codon_start=1
+FT /transl_table=11
+FT /product="hypothetical protein"
+FT /note="pGP4-D"
+FT /db_xref="UniProtKB/Swiss-Prot:P0CE23"
+FT /protein_id="AAA91572.1"
+FT /translation="MQNKRKVRDDFIKIVKDVKKDFPELDLKIRVNKEKVTFLNSPLEL
+FT YHKSVSLILGLLQQIENSLGLFPDSPVLEKLEDNSLKLKKALIMLILSRKDMFSKAE"
+FT CDS 5317..6048
+FT /codon_start=1
+FT /transl_table=11
+FT /product="hypothetical protein"
+FT /note="pGP5-D (gtg start codon)"
+FT /db_xref="GOA:P10559"
+FT /db_xref="InterPro:IPR025669"
+FT /db_xref="InterPro:IPR027417"
+FT /db_xref="UniProtKB/Swiss-Prot:P10559"
+FT /protein_id="AAA91573.1"
+FT /translation="MGCNLAQFLGKKVLLADLDPQSNLSSGLGASVRSDQKGLHDIVYT
+FT SNDLKSIICETKKDSVDLIPASFSSEQFRELDIHRGPSNNLKLFLNEYCAPFYDICIID
+FT TPPSLGGLTKEAFVAGDKLIACLTPEPFSILGLQKIREFLSSVGKPEEEHILGIALSFW
+FT DDRNSTNQMYIDIIESIYKNKLFSTKIRRDISLSRSLLKEDSVANVYPNSRAAEDILKL
+FT THEIANILHIEYERDYSQRTT"
+FT CDS 6045..6788
+FT /codon_start=1
+FT /transl_table=11
+FT /product="hypothetical protein"
+FT /note="pGP6-D (gtg start codon)"
+FT /db_xref="InterPro:IPR005350"
+FT /db_xref="UniProtKB/Swiss-Prot:P10560"
+FT /protein_id="AAA91574.1"
+FT /translation="MNKLKKEADVFFKKNQTAASLDFKKTLPSIELFSATLNSEESQSL
+FT DRLFLSESQNYSDEEFYQEDILAVKLLTGQIKSIQKQHVLLLGEKIYNARKILSKDHFS
+FT STTFSSWIELVFRTKSSAYNALAYYELFINLPNQTLQKEFQSIPYKSAYILAARKGDLK
+FT TKVDVIGKVCGMSNSSAIRVLDQFLPSSRNKDVRETIDKSDSEKNRQLSDFLIEILRIM
+FT CSGVSLSSYNENLLQQLFELFKQKS"
+FT repeat_region 6857..6945
+FT /note="four tandem 22bp repeats"
+XX
+SQ Sequence 7502 BP; 2460 A; 1285 C; 1433 G; 2324 T; 0 other;
+ ggatccgtaa gttagacgaa attttgtctt tgcgcacaga cgatctattt tttgcatcca 60
+ atcagatttc ctttcgcatt aaaaaaagac agaataaaga aaccaaaatt ctaatcacat 120
+ ttcctatcag cttaatggaa gagttgcaaa aatacacttg tgggagaaat gggagagtat 180
+ ttgtttctaa aatagggatt cctgtaacaa caagtcaggt tgcgcataat tttaggcttg 240
+ cagagttcca tagtgctatg aaaataaaaa ttactcccag agtacttcgt gcaagcgctt 300
+ tgattcattt aaagcaaata ggattaaaag atgaggaaat catgcgtatt tcctgtcttt 360
+ catcgagaca aagtgtgtgt tcttattgtt ctggggaaga ggtaattcct ctagtacaaa 420
+ cacccacaat attgtgatat aattaaaatt atattcatat tctgttgcca gaaaaaacac 480
+ ctttaggcta tattagagcc atcttctttg aagcgttgtc ttctcgagaa gatttatcgt 540
+ acgcaaatat catctttgcg gttgcgtgtc ctgtgacctt cattatgtcg gagtctgagc 600
+ accctaggcg tttgtactcc gtcacagcgg ttgctcgaag cacgtgcggg gttattttaa 660
+ aagggattgc agcttgtagt cctgcttgag agaacgtgcg ggcgatttgc cttaacccca 720
+ ccatttttcc ggagcgagtt acgaagacaa aacctcttcg ttgaccgatg tactcttgta 780
+ gaaagtgcat aaacttctga ggataagtta taataatcct cttttctgtc tgacggttct 840
+ taagctggga gaaagaaatg gtagcttgtt ggaaacaaat ctgactaatc tccaagctta 900
+ agacttcaga ggagcgttta cctccttgga gcattgtctg ggcgatcaac caatcccggg 960
+ cattgatttt ttttagctct tttaggaagg atgctgtttg caaactgttc atcgcatccg 1020
+ tttttactat ttccctggtt ttaaaaaatg ttcgactatt ttcttgttta gaaggttgcg 1080
+ ctatagcgac tattccttga gtcatcctgt ttaggaatct tgttaaggaa atatagcttg 1140
+ ctgctcgaac ttgtttagta ccttcggtcc aagaagtctt ggcagaggaa acttttttaa 1200
+ tcgcatctag gattagatta tgatttaaaa gggaaaactc ttgcagattc atatccaagg 1260
+ acaatagacc aatcttttct aaagacaaaa aagatcctcg atatgatcta caagtatgtt 1320
+ tgttgagtga tgcggtccaa tgcataataa cttcgaataa ggagaagctt ttcatgcgtt 1380
+ tccaatagga ttcttggcga atttttaaaa cttcctgata agacttttca ctatattcta 1440
+ acgacatttc ttgctgcaaa gataaaatcc ctttacccat gaaatccctc gtgatataac 1500
+ ctatccgtaa aatgtcctga ttagtgaaat aatcaggttg ttaacaggat agcacgctcg 1560
+ gtattttttt atataaacat gaaaactcgt tccgaaatag aaaatcgcat gcaagatatc 1620
+ gagtatgcgt tgttaggtaa agctctgata tttgaagact ctactgagta tattctgagg 1680
+ cagcttgcta attatgagtt taagtgttct catcataaaa acatattcat agtatttaaa 1740
+ cacttaaaag acaatggatt acctataact gtagactcgg cttgggaaga gcttttgcgg 1800
+ cgtcgtatca aagatatgga caaatcgtat ctcgggttaa tgttgcatga tgctttatca 1860
+ aatgacaagc ttagatccgt ttctcatacg gttttcctcg atgatttgag cgtgtgtagc 1920
+ gctgaagaaa atttgagtaa tttcattttc cgctcgttta atgagtacaa tgaaaatcca 1980
+ ttgcgtagat ctccgtttct attgcttgag cgtataaagg gaaggcttga tagtgctata 2040
+ gcaaagactt tttctattcg cagcgctaga ggccggtcta tttatgatat attctcacag 2100
+ tcagaaattg gagtgctggc tcgtataaaa aaaagacgag tagcgttctc tgagaatcaa 2160
+ aattctttct ttgatggctt cccaacagga tacaaggata ttgatgataa aggagttatc 2220
+ ttagctaaag gtaatttcgt gattatagca gctagaccat ctatagggaa aacagcttta 2280
+ gctatagaca tggcgataaa tcttgcggtt actcaacagc gtagagttgg tttcctatct 2340
+ ctagaaatga gcgcaggtca aattgttgag cggattattg ctaatttaac aggaatatct 2400
+ ggtgaaaaat tacaaagagg ggatctctct aaagaagaat tattccgagt agaagaagct 2460
+ ggagaaacgg ttagagaatc acatttttat atctgcagtg atagtcagta taagcttaac 2520
+ ttaatcgcga atcagatccg gttgctgaga aaagaagatc gagtagacgt aatatttatc 2580
+ gattacttgc agttgatcaa ctcatcggtt ggagaaaatc gtcaaaatga aatagcagat 2640
+ atatctagaa ccttaagagg tttagcctca gagctaaaca ttcctatagt ttgtttatcc 2700
+ caactatcta gaaaagttga ggatagagca aataaagttc ccatgctttc agatttgcga 2760
+ gacagcggtc aaatagagca agacgcagat gtgattttgt ttatcaatag gaaggaatcg 2820
+ tcttctaatt gtgagataac tgttgggaaa aatagacatg gatcggtttt ctcttcggta 2880
+ ttacatttcg atccaaaaat tagtaaattc tccgctatta aaaaagtatg gtaaattata 2940
+ gtaactgcca cttcatcaaa agtcctatcc accttgaaaa tcagaagttt ggaagaagac 3000
+ ctggtcaatc tattaagata tctcccaaat tggctcaaaa tgggatggta gaagttatag 3060
+ gtcttgattt tctttcatct cattaccatg cattagcagc tatccaaaga ttactgaccg 3120
+ caacgaatta caaggggaac acaaaagggg ttgttttatc cagagaatca aatagttttc 3180
+ aatttgaagg atggatacca agaatccgtt ttacaaaaac tgaattctta gaggcttatg 3240
+ gagttaagcg gtataaaaca tccagaaata agtatgagtt tagtggaaaa gaagctgaaa 3300
+ ctgctttaga agccttatac catttaggac atcaaccgtt tttaatagtg gcaactagaa 3360
+ ctcgatggac taatggaaca caaatagtag accgttacca aactctttct ccgatcatta 3420
+ ggatttacga aggatgggaa ggtttaactg acgaagaaaa tatagatata gacttaacac 3480
+ cttttaattc accacctaca cggaaacata aagggttcgt tgtagagcca tgtcctatct 3540
+ tggtagatca aatagaatcc tactttgtaa tcaagcctgc aaatgtatac caagaaataa 3600
+ aaatgcgttt cccaaatgca tcaaagtatg cttacacatt tatcgactgg gtgattacag 3660
+ cagctgcgaa aaagagacga aaattaacta aggataattc ttggccagaa aacttgttat 3720
+ taaacgttaa cgttaaaagt cttgcatata ttttaaggat gaatcggtac atctgtacaa 3780
+ ggaactggaa aaaaatcgag ttagctatcg ataaatgtat agaaatcgcc attcagcttg 3840
+ gctggttatc tagaagaaaa cgcattgaat ttctggattc ttctaaactc tctaaaaaag 3900
+ aaattctata tctaaataaa gagcgctttg aagaaataac taagaaatct aaagaacaaa 3960
+ tggaacaatt agaacaagaa tctattaatt aatagcaagc ttgaaactaa aaacctaatt 4020
+ tatttaaagc tcaaaataaa aaagagtttt aaaatgggaa attctggttt ttatttgtat 4080
+ aacactgaaa actgcgtctt tgctgataat atcaaagttg ggcaaatgac agagccgctc 4140
+ aaggaccagc aaataatcct tgggacaaca tcaacacctg tcgcagccaa aatgacagct 4200
+ tctgatggaa tatctttaac agtctccaat aattcatcaa ccaatgcttc tattacaatt 4260
+ ggtttggatg cggaaaaagc ttaccagctt attctagaaa agttgggaga tcaaattctt 4320
+ gatggaattg ctgatactat tgttgatagt acagtccaag atattttaga caaaatcaaa 4380
+ acagaccctt ctctaggttt gttgaaagct tttaacaact ttccaatcac taataaaatt 4440
+ caatgcaacg ggttattcac tcccagtaac attgaaactt tattaggagg aactgaaata 4500
+ ggaaaattca cagtcacacc caaaagctct gggagcatgt tcttagtctc agcagatatt 4560
+ attgcatcaa gaatggaagg cggcgttgtt ctagctttgg tacgagaagg tgattctaag 4620
+ ccctgcgcga ttagttatgg atactcatca ggcattccta atttatgtag tctaagaacc 4680
+ agtattacta atacaggatt gactccgaca acgtattcat tacgtgtagg cggtttagaa 4740
+ agcggtgtgg tatgggttaa tgccctttct aatggcaatg atattttagg aataacaaat 4800
+ acttctaatg tatctttttt agaggtaata cctcaaacaa acgcttaaac aatttttatt 4860
+ ggatttttct tataggtttt atatttagag aaaacagttc gaattacggg gtttgttatg 4920
+ caaaataaaa gaaaagtgag ggacgatttt attaaaattg ttaaagatgt gaaaaaagat 4980
+ ttccccgaat tagacctaaa aatacgagta aacaaggaaa aagtaacttt cttaaattct 5040
+ cccttagaac tctaccataa aagtgtctca ctaattctag gactgcttca acaaatagaa 5100
+ aactctttag gattattccc agactctcct gttcttgaaa aattagagga taacagttta 5160
+ aagctaaaaa aggctttgat tatgcttatc ttgtctagaa aagacatgtt ttccaaggct 5220
+ gaatagacaa cttactctaa cgttggagtt gatttgcaca ccttagtttt ttgctctttt 5280
+ aagggaggaa ctggaaaaac aacactttct ctaaacgtgg gatgcaactt ggcccaattt 5340
+ ttagggaaaa aagtgttact tgctgaccta gacccgcaat ccaatttatc ttctggattg 5400
+ ggggctagtg tcagaagtga ccaaaaaggc ttgcacgaca tagtatacac atcaaacgat 5460
+ ttaaaatcaa tcatttgcga aacaaaaaaa gatagtgtgg acctaattcc tgcatcattt 5520
+ tcatccgaac agtttagaga attggatatt catagaggac ctagtaacaa cttaaagtta 5580
+ tttctgaatg agtactgcgc tcctttttat gacatctgca taatagacac tccacctagc 5640
+ ctaggagggt taacgaaaga agcttttgtt gcaggagaca aattaattgc ttgtttaact 5700
+ ccagaacctt tttctattct agggttacaa aagatacgtg aattcttaag ttcggtcgga 5760
+ aaacctgaag aagaacacat tcttggaata gctttgtctt tttgggatga tcgtaactcg 5820
+ actaaccaaa tgtatataga cattatcgag tctatttaca aaaacaagct tttttcaaca 5880
+ aaaattcgtc gagatatttc tctcagccgt tctcttctta aagaagattc tgtagctaat 5940
+ gtctatccaa attctagggc cgcagaagat attctgaagt taacgcatga aatagcaaat 6000
+ attttgcata tcgaatatga acgagattac tctcagagga caacgtgaac aaactaaaaa 6060
+ aagaagcgga tgtctttttt aaaaaaaatc aaactgccgc ttctctagat tttaagaaga 6120
+ cgcttccctc cattgaacta ttctcagcaa ctttgaattc tgaggaaagt cagagtttgg 6180
+ atcgattatt tttatcagag tcccaaaact attcggatga agaattttat caagaagaca 6240
+ tcctagcggt aaaactgctt actggtcaga taaaatccat acagaagcaa cacgtacttc 6300
+ ttttaggaga aaaaatctat aatgctagaa aaatcctgag taaggatcac ttctcctcaa 6360
+ caactttttc atcttggata gagttagttt ttagaactaa gtcttctgct tacaatgctc 6420
+ ttgcatatta cgagcttttt ataaacctcc ccaaccaaac tctacaaaaa gagtttcaat 6480
+ cgatccccta taaatccgca tatattttgg ccgctagaaa aggcgattta aaaaccaagg 6540
+ tcgatgtgat agggaaagta tgtggaatgt cgaactcatc ggcgataagg gtgttggatc 6600
+ aatttcttcc ttcatctaga aacaaagacg ttagagaaac gatagataag tctgattcag 6660
+ agaagaatcg ccaattatct gatttcttaa tagagatact tcgcatcatg tgttccggag 6720
+ tttctttgtc ctcctataac gaaaatcttc tacaacagct ttttgaactt tttaagcaaa 6780
+ agagctgatc ctccgtcagc tcatatatat atatctatta tatatatata tttagggatt 6840
+ tgatttcacg agagagattt gcaactcttg gtggtagact ttgcaactct tggtggtaga 6900
+ ctttgcaact cttggtggta gactttgcaa ctcttggtgg tagacttggt cataatggac 6960
+ ttttgttaaa aaatttatta aaatcttaga gctccgattt tgaatagctt tggttaagaa 7020
+ aatgggctcg atggctttcc ataaaagtag attgttttta acttttgggg acgcgtcgga 7080
+ aatttggtta tctactttat cttatctaac tagaaaaaat tatgcgtctg ggattaactt 7140
+ tcttgtttct ttagagattc tggatttatc ggaaaccttg ataaaggcta tttctcttga 7200
+ ccacagcgaa tctttgttta aaatcaagtc tctagatgtt tttaatggaa aagttgtttc 7260
+ agaggcatct aaacaggcta gagcggcatg ctacatatct ttcacaaagt ttttgtatag 7320
+ attgaccaag ggatatatta aacccgctat tccattgaaa gattttggaa acactacatt 7380
+ ttttaaaatc cgagacaaaa tcaaaacaga atcgatttct aagcaggaat ggacagtttt 7440
+ ttttgaagcg ctccggatag tgaattatag agactattta atcggtaaat tgattgtaca 7500
+ ag 7502
+//