package jalview.io; import java.io.IOException; import java.text.ParseException; import java.util.ArrayList; import java.util.Hashtable; import java.util.List; import java.util.Map; import java.util.Map.Entry; import jalview.bin.Cache; import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; import jalview.datamodel.FeatureProperties; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.util.DnaUtils; import jalview.util.MappingUtils; /** * A class that provides selective parsing of the EMBL flatfile format. *

* The initial implementation is limited to extracting fields used by Jalview * after fetching an EMBL or EMBLCDS entry: * *

 * accession, version, sequence, xref
 * and (for CDS feature) location, protein_id, product, codon_start, translation
 * 
* * For a complete parser, it may be best to adopt that provided in * https://github.com/enasequence/sequencetools/tree/master/src/main/java/uk/ac/ebi/embl/flatfile * (but note this has a dependency on the Apache Commons library) * * @author gmcarstairs * @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt * @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html */ public class EmblFlatFile extends AlignFile // FileParse { private static final String WHITESPACE = "\\s+"; private String sourceDb; /* * values parsed from the EMBL flatfile record */ private String accession; // from ID (first token) private String version; // from ID (second token) private int length = 128; // from ID (7th token), with usable default private List dbrefs; // from DR and also CDS /db_xref qualifiers private String sequenceString; // from SQ lines private String translation; // from CDS feature /translation private String cdsLocation; // CDS /location raw value private int codonStart = 1; // from CDS /codon_start private String proteinName; // from CDS /product private String proteinId; // from CDS /protein_id private Map cdsProps; // CDS other qualifiers e.g. 'note' /** * Constructor * @param fp * @param sourceId * @throws IOException */ public EmblFlatFile(FileParse fp, String sourceId) throws IOException { super(false, fp); // don't parse immediately this.sourceDb = sourceId; dbrefs = new ArrayList<>(); cdsProps = new Hashtable<>(); } /** * Parses the flatfile, and if successful, saves as an annotated sequence * which may be retrieved by calling {@code getSequence()} * * @throws IOException */ public void parse() throws IOException { String line = nextLine(); while (line != null) { if (line.startsWith("ID")) { line = processID(line); } else if (line.startsWith("DR")) { line = processDR(line); } else if (line.startsWith("SQ")) { line = processSQ(); } else if (line.startsWith("FT")) { line = processFT(line); } else { line = nextLine(); } } assembleSequence(); } /** * Extracts and saves the primary accession and version (SV value) from an ID * line, or null if not found. Returns the next line after the one processed. * * @param line * @throws IOException */ String processID(String line) throws IOException { String[] tokens = line.substring(2).split(";"); /* * first is primary accession */ String token = tokens[0].trim(); if (!token.isEmpty()) { this.accession = token; } /* * second token is 'SV versionNo' */ if (tokens.length > 1) { token = tokens[1].trim(); if (token.startsWith("SV")) { String[] bits = token.trim().split(WHITESPACE); this.version = bits[bits.length - 1]; } } /* * seventh token is 'length BP' */ if (tokens.length > 6) { token = tokens[6].trim(); String[] bits = token.trim().split(WHITESPACE); try { this.length = Integer.valueOf(bits[0]); } catch (NumberFormatException e) { Cache.log.error("bad length read in flatfile, line: " + line); } } return nextLine(); } /** * Processes one DR line and saves as a DBRefEntry cross-reference. Returns * the line following the line processed. * * @param line * @throws IOException */ String processDR(String line) throws IOException { String[] tokens = line.substring(2).split(";"); if (tokens.length > 1) { String db = tokens[0].trim(); String acc = tokens[1].trim(); if (acc.endsWith(".")) { acc = acc.substring(0, acc.length() - 1); } this.dbrefs.add(new DBRefEntry(db, "0", acc)); } return nextLine(); } /** * Reads and saves the sequence, read from the lines following the SQ line. * Whitespace and position counters are discarded. Returns the next line * following the sequence data (the next line that doesn't start with * whitespace). * * @throws IOException */ String processSQ() throws IOException { StringBuilder sb = new StringBuilder(this.length); String line = nextLine(); while (line != null && line.startsWith(" ")) { line = line.trim(); String[] blocks = line.split(WHITESPACE); /* * omit the last block (position counter) on each line */ for (int i = 0; i < blocks.length - 1; i++) { sb.append(blocks[i]); } line = nextLine(); } this.sequenceString = sb.toString(); return line; } /** * Processes an FT line. If it declares a feature type of interest (currently, * only CDS is processed), processes all of the associated lines (feature * qualifiers), and returns the next line after that, otherwise simply returns * the next line. * * @param line * @return * @throws IOException */ String processFT(String line) throws IOException { String[] tokens = line.split(WHITESPACE); if (tokens.length < 3 || !"CDS".equals(tokens[1])) { return nextLine(); } this.cdsLocation = tokens[2]; while ((line = nextLine()) != null) { if (!line.startsWith("FT ")) // 4 spaces { // e.g. start of next feature "FT source..." break; } /* * extract qualifier, e.g. FT /protein_id="CAA37824.1" */ int slashPos = line.indexOf('/'); if (slashPos == -1) { Cache.log.error("Unexpected EMBL line ignored: " + line); continue; } int eqPos = line.indexOf('=', slashPos + 1); if (eqPos == -1) { Cache.log.error("Unexpected EMBL line ignored: " + line); continue; } String qualifier = line.substring(slashPos + 1, eqPos); String value = line.substring(eqPos + 1); if (value.startsWith("\"") && value.endsWith("\"")) { value = value.substring(1, value.length() - 1); } if ("protein_id".equals(qualifier)) { proteinId = value; } else if ("codon_start".equals(qualifier)) { try { codonStart = Integer.parseInt(value.trim()); } catch (NumberFormatException e) { Cache.log.error("Invalid codon_start in XML for " + this.accession + ": " + e.getMessage()); } } else if ("product".equals(qualifier)) { // sometimes name is returned e.g. for V00488 proteinName = value; } else if ("translation".equals(qualifier)) { line = readTranslation(value); } else if (!"".equals(value)) { // throw anything else into the additional properties hash cdsProps.put(qualifier, value); } } return line; } /** * Reads and saves the CDS translation from one or more lines of the file, and * returns the next line after that * * @param value * the first line of the translation (likely quoted) * @return * @throws IOException */ String readTranslation(String value) throws IOException { StringBuilder sb = new StringBuilder(this.length / 3 + 1); sb.append(value.replace("\"", "")); String line; while ((line = nextLine()) != null) { if (!line.startsWith("FT ")) { break; // reached next feature or other input line } String[] tokens = line.split(WHITESPACE); if (tokens.length < 2) { Cache.log.error("Ignoring bad EMBL line: " + line); break; } if (tokens[1].startsWith("/")) { break; // next feature qualifier } sb.append(tokens[1].replace("\"", "")); } return sb.toString(); } /** * Processes the parsed CDS feature data to * * @param SequenceI dna */ void processCDS(SequenceI dna) { /* * parse location into a list of [start, end, start, end] positions */ int[] exons = getCdsRanges(this.accession, this.cdsLocation); int exonNumber = 0; for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2) { int exonStart = exons[xint]; int exonEnd = exons[xint + 1]; int begin = Math.min(exonStart, exonEnd); int end = Math.max(exonStart, exonEnd); exonNumber++; String desc = String.format("Exon %d for protein EMBLCDS:%s", exonNumber, proteinId); SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end, this.sourceDb); if (!cdsProps.isEmpty()) { for (Entry val : cdsProps.entrySet()) { sf.setValue(val.getKey(), val.getValue()); } } sf.setEnaLocation(this.cdsLocation); boolean forwardStrand = exonStart <= exonEnd; sf.setStrand(forwardStrand ? "+" : "-"); sf.setPhase(String.valueOf(codonStart - 1)); sf.setValue(FeatureProperties.EXONPOS, exonNumber); sf.setValue(FeatureProperties.EXONPRODUCT, proteinName); dna.addSequenceFeature(sf); } } /** * Constructs and saves the sequence from parsed components */ void assembleSequence() { String name = this.accession; if (this.sourceDb != null) { name = this.sourceDb + "|" + name; } SequenceI seq = new Sequence(name, this.sequenceString); for (DBRefEntry dbref : this.dbrefs) { seq.addDBRef(dbref); } processCDS(seq); seq.deriveSequence(); addSequence(seq); } /** * Output (print) is not implemented for EMBL flat file format */ @Override public String print(SequenceI[] seqs, boolean jvsuffix) { return null; } /** * Returns the CDS location as a single array of [start, end, start, end...] * positions. If on the reverse strand, these will be in descending order. * * @param accession * @param location * @return */ protected int[] getCdsRanges(String accession, String location) { if (location == null) { return new int[] {}; } try { List ranges = DnaUtils.parseLocation(location); return MappingUtils.listToArray(ranges); } catch (ParseException e) { Cache.log.warn( String.format("Not parsing inexact CDS location %s in ENA %s", location, accession)); return new int[] {}; } } }