/*
 * Retrieved from:
 * http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2FGenBankFile.java;fp=src%2Fjalview%2Fio%2FGenBankFile.java;h=798876453bdc183775e35ce89d4a73269bd6e112;hb=be9181aeed5d9694eec169dea3c069c3d4936599;hp=0000000000000000000000000000000000000000;hpb=2b4de132b5bed677734d481b8700cbe15b63c198;p=jalview.git
 * (new file src/jalview/io/GenBankFile.java, blob 7988764)
 */
package jalview.io;

import java.io.IOException;

import jalview.bin.Cache;

/**
 * A class that provides selective parsing of the GenBank flatfile format.
 * <p>
 * The initial implementation is limited to extracting fields used by Jalview
 * after fetching an EMBL or EMBLCDS entry:
 *
 * <pre>
 * accession, version, sequence, xref
 * and (for CDS feature) location, protein_id, product, codon_start, translation
 * </pre>
 *
 * @author gmcarstairs
 * @see <a href="https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html">NCBI
 *      GenBank sample record</a>
 */
public class GenBankFile extends FlatFile
{
  private static final String DEFINITION = "DEFINITION";

  /**
   * Constructor given a data source and the id of the source database
   *
   * @param fp
   *          the data source to read the flatfile from
   * @param sourceId
   *          the id of the source database
   * @throws IOException
   *           if thrown by the superclass constructor
   */
  public GenBankFile(FileParse fp, String sourceId) throws IOException
  {
    super(fp, sourceId);
  }

  /**
   * Parses the flatfile, and if successful, saves as an annotated sequence
   * which may be retrieved by calling {@code getSequence()}.
   * <p>
   * Lines are dispatched on their leading keyword (DEFINITION, ACCESSION,
   * VERSION, ORIGIN, FEATURES); any other line is skipped. Each handler
   * returns the next unprocessed line, so the loop never reads a line twice.
   *
   * @throws IOException
   *           if reading a line fails
   * @see <a href="https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html">NCBI
   *      GenBank sample record</a>
   */
  @Override
  public void parse() throws IOException
  {
    String line = nextLine();
    while (line != null)
    {
      if (line.startsWith(DEFINITION))
      {
        line = parseDefinition(line);
      }
      else if (line.startsWith("ACCESSION"))
      {
        /*
         * guard against an ACCESSION keyword with no value, which would
         * otherwise fail with ArrayIndexOutOfBoundsException
         */
        String[] tokens = line.split(WHITESPACE);
        if (tokens.length > 1)
        {
          this.accession = tokens[1];
        }
        line = nextLine();
      }
      else if (line.startsWith("VERSION"))
      {
        line = parseVersion(line);
      }
      else if (line.startsWith("ORIGIN"))
      {
        line = parseSequence();
      }
      else if (line.startsWith("FEATURES"))
      {
        line = nextLine();
        /*
         * feature table lines are indented; the null check stops cleanly if
         * the input ends inside (or immediately after) the feature table
         */
        while (line != null && line.startsWith(" "))
        {
          line = parseFeature(line);
        }
      }
      else
      {
        line = nextLine();
      }
    }
    buildSequence();
  }

  /**
   * Extracts and saves the primary accession, version (SV value) and sequence
   * length from a semicolon-delimited ID line. Any field that is not found is
   * left unchanged. Returns the next line after the one processed.
   * <p>
   * This method is not invoked by {@link #parse()} in this class.
   * NOTE(review): the layout parsed here (two-character line code, then
   * semicolon-separated tokens) looks like an EMBL ID line rather than a
   * GenBank LOCUS line - confirm whether this method is still needed here.
   *
   * @param line
   *          the line to parse; its first two characters are skipped
   * @return the next line of the data source
   * @throws IOException
   *           if reading the next line fails
   */
  String parseLocus(String line) throws IOException
  {
    /*
     * skip the two-character line code; a shorter line yields no tokens
     * instead of throwing StringIndexOutOfBoundsException
     */
    String content = line.length() > 2 ? line.substring(2) : "";
    String[] tokens = content.split(";");

    /*
     * first is primary accession
     */
    String token = tokens[0].trim();
    if (!token.isEmpty())
    {
      this.accession = token;
    }

    /*
     * second token is 'SV versionNo'
     */
    if (tokens.length > 1)
    {
      token = tokens[1].trim();
      if (token.startsWith("SV"))
      {
        String[] bits = token.split(WHITESPACE);
        this.version = bits[bits.length - 1];
      }
    }

    /*
     * seventh token is 'length BP'
     */
    if (tokens.length > 6)
    {
      token = tokens[6].trim();
      String[] bits = token.split(WHITESPACE);
      try
      {
        this.length = Integer.valueOf(bits[0]);
      } catch (NumberFormatException e)
      {
        // best effort: report the bad value and carry on without a length
        Cache.log.error("bad length read in flatfile, line: " + line);
      }
    }

    return nextLine();
  }

  /**
   * Reads sequence description from DEFINITION lines. Continuation lines
   * (those starting with a space) are appended to the first line, separated
   * by a single space. Any trailing period on the complete description is
   * discarded. Returns the next line after the definition line(s), or null if
   * the end of the data was reached.
   *
   * @param line
   *          the first DEFINITION line, starting with the keyword
   * @return the first line that is not part of the definition, or null at end
   *         of data
   * @throws IOException
   *           if reading a line fails
   */
  String parseDefinition(String line) throws IOException
  {
    StringBuilder desc = new StringBuilder(
            line.substring(DEFINITION.length()).trim());

    /*
     * append any DEFINITION continuation lines
     */
    while ((line = nextLine()) != null && line.startsWith(" "))
    {
      String continuation = line.trim();
      if (!continuation.isEmpty())
      {
        if (desc.length() > 0)
        {
          // lines wrap between words, so restore the separating space
          desc.append(' ');
        }
        desc.append(continuation);
      }
    }

    /*
     * strip the trailing period only once the whole description is
     * assembled, as it belongs to the last line of a wrapped definition
     */
    int end = desc.length();
    if (end > 0 && desc.charAt(end - 1) == '.')
    {
      desc.setLength(end - 1);
    }
    this.description = desc.toString();

    return line;
  }

  /**
   * Parses the VERSION line e.g.
   *
   * <pre>
   * VERSION     X81322.1
   * </pre>
   *
   * and returns the next line. The version is the part after the period in
   * the second whitespace-separated token; if there is no such token, or it
   * has no period, the version is left unchanged.
   *
   * @param line
   *          the VERSION line
   * @return the next line of the data source
   * @throws IOException
   *           if reading the next line fails
   */
  String parseVersion(String line) throws IOException
  {
    /*
     * extract version part of accession.version
     * https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html#VersionB
     */
    String[] tokens = line.split(WHITESPACE);
    if (tokens.length > 1)
    {
      tokens = tokens[1].split("\\.");
      if (tokens.length > 1)
      {
        this.version = tokens[1];
      }
    }

    return nextLine();
  }

  /**
   * Answers true if the line is indented by at least six spaces. This is
   * deeper than the indent of a line that starts a new feature, so it marks a
   * line that continues the current feature.
   *
   * @param line
   *          the line to test
   * @return true if the line starts with six spaces, false otherwise
   *         (including for a null line)
   */
  @Override
  protected boolean isFeatureContinuationLine(String line)
  {
    return line != null && line.startsWith("      "); // 6 spaces
  }
}