2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 import java.io.IOException;
26 * A class that provides selective parsing of the GenBank flatfile format.
28 * The initial implementation is limited to extracting fields used by Jalview
29 * after fetching an EMBL or EMBLCDS entry:
32 * accession, version, sequence, xref
33 * and (for CDS feature) location, protein_id, product, codon_start, translation
37 * @see <a href="https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html">NCBI GenBank sample record</a>
39 public class GenBankFile extends EMBLLikeFlatFile
41 private static final String DEFINITION = "DEFINITION";
44 * Constructor given a data source and the id of the source database
// Constructor taking the data source to read (fp) and the id of the source
// database (sourceId); declared to throw IOException.
// NOTE(review): the constructor body is not part of this listing; presumably
// it forwards both arguments to the EMBLLikeFlatFile constructor - confirm.
50 public GenBankFile(FileParse fp, String sourceId) throws IOException
56 * Parses the flatfile, and if successful, saves as an annotated sequence
57 * which may be retrieved by calling {@code getSequence()}
60 * @see <a href="https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html">NCBI GenBank sample record</a>
// Reads the flatfile and dispatches on the keyword that starts the current
// line (LOCUS, DEFINITION, ACCESSION, VERSION, ORIGIN, FEATURES). The value
// returned by each helper becomes the new current line.
// NOTE(review): brace-only lines and several statements (embedded line
// numbers 66, 79, 91 and 97-102) are absent from this listing, so the
// enclosing loop and the "advance to the next line" steps cannot be seen here.
63 public void parse() throws IOException
65 String line = nextLine();
// header line of the record: parseLocus stores the primary accession
68 if (line.startsWith("LOCUS"))
70 line = parseLocus(line);
// description, possibly spread over continuation lines
72 else if (line.startsWith(DEFINITION))
74 line = parseDefinition(line);
// accession is taken as the second field of the line.
// NOTE(review): assuming WHITESPACE matches runs of whitespace, [1] throws
// ArrayIndexOutOfBoundsException when the line holds only the keyword;
// consider checking the token count first.
76 else if (line.startsWith("ACCESSION"))
78 this.accession = line.split(WHITESPACE)[1];
// <accession>.<version>: parseVersion keeps the version part
81 else if (line.startsWith("VERSION"))
83 line = parseVersion(line);
// ORIGIN introduces the sequence data, which parseSequence() reads
85 else if (line.startsWith("ORIGIN"))
87 line = parseSequence();
// feature table: every line starting with a space is handed to parseFeature,
// which returns the line to examine next.
// NOTE(review): parseDefinition treats a null from nextLine() as end of
// input; if 'line' can be null here, startsWith throws NullPointerException -
// confirm a null guard exists or add one.
89 else if (line.startsWith("FEATURES"))
92 while (line.startsWith(" "))
94 line = parseFeature(line);
106 * Extracts and saves the primary accession and version (SV value) from an ID
107 * line, or null if not found. Returns the next line after the one processed.
110 * @throws IOException
112 String parseLocus(String line) throws IOException
114 String[] tokens = line.split(WHITESPACE);
117 * first should be "LOCUS"
119 if (tokens.length < 2 || !"LOCUS".equals(tokens[0]))
124 * second is primary accession
126 String token = tokens[1].trim();
127 if (!token.isEmpty())
129 this.accession = token;
132 // not going to guess the rest just yet, but third is length with unit (bp)
138 * Reads sequence description from DEFINITION lines. Any trailing period is
139 * discarded. Returns the next line after the definition line(s).
143 * @throws IOException
// Builds the sequence description from a DEFINITION line and stores it in
// this.description.
// NOTE(review): the body of the continuation branch and the return statement
// (embedded line numbers 161-167 and 170) are absent from this listing.
145 String parseDefinition(String line) throws IOException
// description text is whatever follows the DEFINITION keyword, trimmed
147 String desc = line.substring(DEFINITION.length()).trim();
// drop one trailing period from the text of this first line.
// NOTE(review): this check runs before any continuation line is read; if
// continuation text is appended to desc, a period ending the last
// continuation line would survive - confirm against the missing lines.
148 if (desc.endsWith("."))
150 desc = desc.substring(0, desc.length() - 1);
// NOTE(review): "DE" is the EMBL line code; here it presumably refers to
// DEFINITION continuation lines.
154 * pass over any additional DE lines
// keep reading until nextLine() returns null (end of input); a line that
// starts with a space is treated as a continuation of the definition
156 while ((line = nextLine()) != null)
158 if (line.startsWith(" "))
160 // definition continuation line
// save the description on this object
168 this.description = desc;
174 * Parses the VERSION line e.g.
180 * and returns the next line
183 * @throws IOException
185 String parseVersion(String line) throws IOException
188 * extract version part of <accession>.<version>
189 * https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html#VersionB
191 String[] tokens = line.split(WHITESPACE);
192 if (tokens.length > 1)
194 tokens = tokens[1].split("\\.");
195 if (tokens.length > 1)
197 this.version = tokens[1];
205 protected boolean isFeatureContinuationLine(String line)
207 return line.startsWith(" "); // 6 spaces