3 import java.io.IOException;
5 import jalview.bin.Cache;
6 import jalview.datamodel.DBRefEntry;
7 import jalview.util.DBRefUtils;
10 * A class that provides selective parsing of the EMBL flatfile format.
12 * The initial implementation is limited to extracting fields used by Jalview
13 * after fetching an EMBL or EMBLCDS entry:
16 * accession, version, sequence, xref
17 * and (for CDS feature) location, protein_id, product, codon_start, translation
20 * For a complete parser, it may be best to adopt that provided in
21 * https://github.com/enasequence/sequencetools/tree/master/src/main/java/uk/ac/ebi/embl/flatfile
22 * (but note this has a dependency on the Apache Commons library)
25 * @see <a href="ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt">usrman.txt</a>
26 * @see <a href="ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html">FT_current.html</a>
28 public class EmblFlatFile extends EMBLLikeFlatFile
31 * Constructor given a data source and the id of the source database
37 public EmblFlatFile(FileParse fp, String sourceId) throws IOException
43 * Parses the flatfile and, if successful, saves the result as an annotated
44 * sequence which may be retrieved by calling {@code getSequence()}.
49 public void parse() throws IOException
51 String line = nextLine();
54 if (line.startsWith("ID"))
58 else if (line.startsWith("DE"))
62 else if (line.startsWith("DR"))
66 else if (line.startsWith("SQ"))
68 line = parseSequence();
70 else if (line.startsWith("FT"))
72 line = parseFeature(line.substring(2));
83 * Extracts and saves the primary accession, version (SV value) and sequence
84 * length from an ID line; accession and version are null if not found. Returns the next line after the one processed.
89 String parseID(String line) throws IOException
91 String[] tokens = line.substring(2).split(";");
94 * first is primary accession
96 String token = tokens[0].trim();
99 this.accession = token;
103 * second token is 'SV versionNo'
105 if (tokens.length > 1)
107 token = tokens[1].trim();
108 if (token.startsWith("SV"))
110 String[] bits = token.trim().split(WHITESPACE);
111 this.version = bits[bits.length - 1];
116 * seventh token is 'length BP'
118 if (tokens.length > 6)
120 token = tokens[6].trim();
121 String[] bits = token.trim().split(WHITESPACE);
124 this.length = Integer.valueOf(bits[0]);
125 } catch (NumberFormatException e)
127 Cache.log.error("bad length read in flatfile, line: " + line);
135 * Reads the sequence description from the first DE line found. Any trailing
136 * period is discarded. If there are multiple DE lines, only the first (short
137 * description) is read; the rest are ignored.
141 * @throws IOException
143 String parseDE(String line) throws IOException
145 String desc = line.substring(2).trim();
146 if (desc.endsWith("."))
148 desc = desc.substring(0, desc.length() - 1);
150 this.description = desc;
153 * pass over any additional DE lines
155 while ((line = nextLine()) != null)
157 if (!line.startsWith("DE"))
167 * Processes one DR line and saves it as a DBRefEntry cross-reference. Returns
168 * the line following the line processed.
171 * @throws IOException
173 String parseDR(String line) throws IOException
175 String[] tokens = line.substring(2).split(";");
176 if (tokens.length > 1)
179 * ensure UniProtKB/Swiss-Prot converted to UNIPROT
181 String db = tokens[0].trim();
182 db = DBRefUtils.getCanonicalName(db);
183 String acc = tokens[1].trim();
184 if (acc.endsWith("."))
186 acc = acc.substring(0, acc.length() - 1);
188 String version = "0";
189 if (tokens.length > 2)
191 String secondaryId = tokens[2].trim();
192 if (!secondaryId.isEmpty())
194 // todo: is this right? secondary id is not a version number
195 // version = secondaryId;
198 this.dbrefs.add(new DBRefEntry(db, version, acc));
205 protected boolean isFeatureContinuationLine(String line)
207 return line.startsWith("FT "); // 4 spaces