3 import java.io.IOException;
6 * A class that provides selective parsing of the GenBank flatfile format.
8 * The initial implementation is limited to extracting fields used by Jalview
9 * after fetching an EMBL or EMBLCDS entry:
12 * accession, version, sequence, xref
13 * and (for CDS feature) location, protein_id, product, codon_start, translation
17 * @see <a href="https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html">GenBank sample record</a>
19 public class GenBankFile extends EMBLLikeFlatFile
21 private static final String DEFINITION = "DEFINITION";
24 * Constructor given a data source and the id of the source database
30 public GenBankFile(FileParse fp, String sourceId) throws IOException
36 * Parses the flatfile, and if successful, saves as an annotated sequence
37 * which may be retrieved by calling {@code getSequence()}
40 * @see <a href="https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html">GenBank sample record</a>
43 public void parse() throws IOException
45 String line = nextLine();
48 if (line.startsWith("LOCUS"))
50 line = parseLocus(line);
52 else if (line.startsWith(DEFINITION))
54 line = parseDefinition(line);
56 else if (line.startsWith("ACCESSION"))
58 this.accession = line.split(WHITESPACE)[1];
61 else if (line.startsWith("VERSION"))
63 line = parseVersion(line);
65 else if (line.startsWith("ORIGIN"))
67 line = parseSequence();
69 else if (line.startsWith("FEATURES"))
72 while (line.startsWith(" "))
74 line = parseFeature(line);
86 * Extracts and saves the primary accession (the second token) from a LOCUS
87 * line, if present. Returns the next line after the one processed.
92 String parseLocus(String line) throws IOException
94 String[] tokens = line.split(WHITESPACE);
97 * first should be "LOCUS"
99 if (tokens.length < 2 || !"LOCUS".equals(tokens[0]))
104 * second is primary accession
106 String token = tokens[1].trim();
107 if (!token.isEmpty())
109 this.accession = token;
112 // not going to guess the rest just yet, but third is length with unit (bp)
118 * Reads sequence description from DEFINITION lines. Any trailing period is
119 * discarded. Returns the next line after the definition line(s).
123 * @throws IOException
125 String parseDefinition(String line) throws IOException
127 String desc = line.substring(DEFINITION.length()).trim();
128 if (desc.endsWith("."))
130 desc = desc.substring(0, desc.length() - 1);
134 * pass over any additional DEFINITION continuation lines
136 while ((line = nextLine()) != null)
138 if (line.startsWith(" "))
140 // definition continuation line
148 this.description = desc;
154 * Parses the VERSION line e.g.
160 * and returns the next line
163 * @throws IOException
165 String parseVersion(String line) throws IOException
168 * extract version part of <accession>.<version>
169 * https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html#VersionB
171 String[] tokens = line.split(WHITESPACE);
172 if (tokens.length > 1)
174 tokens = tokens[1].split("\\.");
175 if (tokens.length > 1)
177 this.version = tokens[1];
185 protected boolean isFeatureContinuationLine(String line)
187 return line.startsWith(" "); // 6 spaces