3 import java.io.IOException;
5 import jalview.bin.Cache;
8 * A class that provides selective parsing of the GenBank flatfile format.
10 * The initial implementation is limited to extracting fields used by Jalview
11 * after fetching an EMBL or EMBLCDS entry:
14 * accession, version, sequence, xref
15 * and (for CDS feature) location, protein_id, product, codon_start, translation
19 * @see https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html
// Flat-file reader for GenBank-format records. It extends FlatFile, so the
// helpers and fields used in this class but not declared here (nextLine,
// WHITESPACE, accession, version, ...) presumably come from FlatFile, which is
// not visible in this extract - confirm.
21 public class GenBankFile extends FlatFile
// Keyword that opens a description line: parse() matches lines on it and
// parseDefinition() uses its length to skip past the keyword.
23 private static final String DEFINITION = "DEFINITION";
// NOTE(review): the constructor body is not visible in this extract, so what
// it does with its arguments cannot be stated from here.
26 * Constructor given a data source and the id of the source database
// fp is the data source and sourceId the id of the source database (per the
// description above); the constructor is declared to throw IOException.
32 public GenBankFile(FileParse fp, String sourceId) throws IOException
// Keyword dispatcher for the flat file: each line is tested against a fixed
// set of leading keywords and handed to the matching parse helper, which
// returns the next line to examine.
// NOTE(review): the enclosing loop, parts of some branches and the end of the
// method are not visible in this extract.
38 * Parses the flatfile, and if successful, saves as an annotated sequence
39 * which may be retrieved by calling {@code getSequence()}
42 * @see https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html
45 public void parse() throws IOException
// prime the dispatcher with the first line of input
47 String line = nextLine();
// description line(s): parseDefinition hands back the line that follows them
50 if (line.startsWith(DEFINITION))
52 line = parseDefinition(line);
54 else if (line.startsWith("ACCESSION"))
// the accession is taken as the second piece after splitting on WHITESPACE
// NOTE(review): a line holding only the keyword would make index [1] out of
// range - confirm such input cannot occur
56 this.accession = line.split(WHITESPACE)[1];
59 else if (line.startsWith("VERSION"))
61 line = parseVersion(line);
// parseSequence takes no argument, unlike the other helpers, and returns the
// line to continue from
63 else if (line.startsWith("ORIGIN"))
65 line = parseSequence();
67 else if (line.startsWith("FEATURES"))
// keep handing lines that start with a space to parseFeature
// NOTE(review): startsWith is called with no null check, so this relies on
// parseFeature never returning null while the loop is running - confirm
70 while (line.startsWith(" "))
72 line = parseFeature(line);
// Pulls the accession, an optional 'SV' version and an optional length out of
// a ';'-separated header line and stores them in fields of this object.
// NOTE(review): several lines of this method (including its return) are not
// visible in this extract, and it is not called from the visible part of
// parse() - confirm it is still used. The ';'-separated layout with an 'SV'
// token looks like an EMBL-style ID line rather than a GenBank LOCUS line -
// confirm.
84 * Extracts and saves the primary accession and version (SV value) from an ID
85 * line, or null if not found. Returns the next line after the one processed.
90 String parseLocus(String line) throws IOException
// drop the first two characters of the line, then split the rest on ';'
// NOTE(review): substring(2) throws if the line has fewer than two characters
92 String[] tokens = line.substring(2).split(";");
95 * first is primary accession
97 String token = tokens[0].trim();
100 this.accession = token;
104 * second token is 'SV versionNo'
106 if (tokens.length > 1)
108 token = tokens[1].trim();
109 if (token.startsWith("SV"))
// the version is the last piece of the 'SV ...' token after splitting on
// WHITESPACE
111 String[] bits = token.trim().split(WHITESPACE);
112 this.version = bits[bits.length - 1];
117 * seventh token is 'length BP'
119 if (tokens.length > 6)
121 token = tokens[6].trim();
122 String[] bits = token.trim().split(WHITESPACE);
// the length is the first piece of that token, parsed as an integer
125 this.length = Integer.valueOf(bits[0]);
// a non-numeric length is reported through the Cache logger; no rethrow is
// visible in this extract
126 } catch (NumberFormatException e)
128 Cache.log.error("bad length read in flatfile, line: " + line);
// Builds the sequence description from a DEFINITION line and stores it in
// this.description.
// NOTE(review): what is done with a continuation line, how the loop exits
// and the return statement are not visible in this extract.
136 * Reads sequence description from DEFINITION lines. Any trailing period is
137 * discarded. Returns the next line after the definition line(s).
141 * @throws IOException
143 String parseDefinition(String line) throws IOException
// the description is whatever follows the keyword, with surrounding
// whitespace removed
145 String desc = line.substring(DEFINITION.length()).trim();
// strip one trailing '.' from the first line's text
146 if (desc.endsWith("."))
148 desc = desc.substring(0, desc.length() - 1);
152 * pass over any additional DE lines
// read on until input is exhausted (nextLine() returns null); the line
// parameter is reused as the loop variable
154 while ((line = nextLine()) != null)
// a line starting with a space is treated as belonging to the definition
156 if (line.startsWith(" "))
158 // definition continuation line
166 this.description = desc;
// Extracts the version number from a VERSION line and stores it in
// this.version; the field is left untouched if the line does not have the
// expected shape.
// NOTE(review): the return statement is not visible in this extract.
172 * Parses the VERSION line e.g.
178 * and returns the next line
181 * @throws IOException
183 String parseVersion(String line) throws IOException
186 * extract version part of <accession>.<version>
187 * https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html#VersionB
// split the line on WHITESPACE; the second piece is expected to be
// <accession>.<version>
189 String[] tokens = line.split(WHITESPACE);
190 if (tokens.length > 1)
// split that piece on a literal '.'; the tokens array is reused for the result
192 tokens = tokens[1].split("\\.");
// only a value that actually contains a '.' yields a version
193 if (tokens.length > 1)
195 this.version = tokens[1];
203 protected boolean isFeatureContinuationLine(String line)
205 return line.startsWith(" "); // 6 spaces