package jalview.io; import java.io.IOException; import java.text.ParseException; import java.util.ArrayList; import java.util.HashMap; import java.util.Hashtable; import java.util.List; import java.util.Map; import java.util.Map.Entry; import jalview.bin.Cache; import jalview.datamodel.DBRefEntry; import jalview.datamodel.FeatureProperties; import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.util.DBRefUtils; import jalview.util.DnaUtils; import jalview.util.MappingUtils; /** * A class that provides selective parsing of the EMBL flatfile format. *

* The initial implementation is limited to extracting fields used by Jalview * after fetching an EMBL or EMBLCDS entry: * *

 * accession, version, sequence, xref
 * and (for CDS feature) location, protein_id, product, codon_start, translation
 *

* * For a complete parser, it may be best to adopt that provided in * https://github.com/enasequence/sequencetools/tree/master/src/main/java/uk/ac/ebi/embl/flatfile * (but note this has a dependency on the Apache Commons library) * * @author gmcarstairs * @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt * @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html */ public class EmblFlatFile extends AlignFile // FileParse { /** * A data bean class to hold values parsed from one CDS Feature (FT) */ class CdsData { String translation; // from CDS feature /translation String cdsLocation; // CDS /location raw value int codonStart = 1; // from CDS /codon_start String proteinName; // from CDS /product; used for protein description String proteinId; // from CDS /protein_id Map cdsProps = new Hashtable<>(); // CDS other qualifiers } private static final String WHITESPACE = "\\s+"; private String sourceDb; /* * values parsed from the EMBL flatfile record */ private String accession; // from ID (first token) private String version; // from ID (second token) private String description; // from (first) DE line private int length = 128; // from ID (7th token), with usable default private List dbrefs; // from DR and also CDS /db_xref qualifiers private String sequenceString; // from SQ lines private List cds; /** * Constructor * * @param fp * @param sourceId * @throws IOException */ public EmblFlatFile(FileParse fp, String sourceId) throws IOException { super(false, fp); // don't parse immediately this.sourceDb = sourceId; dbrefs = new ArrayList<>(); cds = new ArrayList<>(); } /** * Parses the flatfile, and if successful, saves as an annotated sequence * which may be retrieved by calling {@code getSequence()} * * @throws IOException */ public void parse() throws IOException { String line = nextLine(); while (line != null) { if (line.startsWith("ID")) { line = parseID(line); } else if (line.startsWith("DE")) { line = parseDE(line); } else if (line.startsWith("DR")) { line = parseDR(line); } else if (line.startsWith("SQ")) { line = parseSQ(); } else if (line.startsWith("FT")) { line = parseFT(line); } else { line = nextLine(); } } assembleSequence(); } /** * Extracts and saves the primary accession and version (SV value) from an ID * line, or null if not found. Returns the next line after the one processed. * * @param line * @throws IOException */ String parseID(String line) throws IOException { String[] tokens = line.substring(2).split(";"); /* * first is primary accession */ String token = tokens[0].trim(); if (!token.isEmpty()) { this.accession = token; } /* * second token is 'SV versionNo' */ if (tokens.length > 1) { token = tokens[1].trim(); if (token.startsWith("SV")) { String[] bits = token.trim().split(WHITESPACE); this.version = bits[bits.length - 1]; } } /* * seventh token is 'length BP' */ if (tokens.length > 6) { token = tokens[6].trim(); String[] bits = token.trim().split(WHITESPACE); try { this.length = Integer.valueOf(bits[0]); } catch (NumberFormatException e) { Cache.log.error("bad length read in flatfile, line: " + line); } } return nextLine(); } /** * Reads sequence description from the first DE line found. Any trailing * period is discarded. If there are multiple DE lines, only the first (short * description) is read, the rest are ignored. * * @param line * @return * @throws IOException */ String parseDE(String line) throws IOException { String desc = line.substring(2).trim(); if (desc.endsWith(".")) { desc = desc.substring(0, desc.length() - 1); } this.description = desc; /* * pass over any additional DE lines */ while ((line = nextLine()) != null) { if (!line.startsWith("DE")) { break; } } return line; } /** * Processes one DR line and saves as a DBRefEntry cross-reference. Returns * the line following the line processed. * * @param line * @throws IOException */ String parseDR(String line) throws IOException { String[] tokens = line.substring(2).split(";"); if (tokens.length > 1) { /* * ensure UniProtKB/Swiss-Prot converted to UNIPROT */ String db = tokens[0].trim(); db = DBRefUtils.getCanonicalName(db); String acc = tokens[1].trim(); if (acc.endsWith(".")) { acc = acc.substring(0, acc.length() - 1); } String version = "0"; if (tokens.length > 2) { String secondaryId = tokens[2].trim(); if (!secondaryId.isEmpty()) { // todo: is this right? secondary id is not a version number // version = secondaryId; } } this.dbrefs.add(new DBRefEntry(db, version, acc)); } return nextLine(); } /** * Reads and saves the sequence, read from the lines following the SQ line. * Whitespace and position counters are discarded. Returns the next line * following the sequence data (the next line that doesn't start with * whitespace). * * @throws IOException */ String parseSQ() throws IOException { StringBuilder sb = new StringBuilder(this.length); String line = nextLine(); while (line != null && line.startsWith(" ")) { line = line.trim(); String[] blocks = line.split(WHITESPACE); /* * omit the last block (position counter) on each line */ for (int i = 0; i < blocks.length - 1; i++) { sb.append(blocks[i]); } line = nextLine(); } this.sequenceString = sb.toString(); return line; } /** * Processes an FT line. If it declares a feature type of interest (currently, * only CDS is processed), processes all of the associated lines (feature * qualifiers), and returns the next line after that, otherwise simply returns * the next line. * * @param line * @return * @throws IOException */ String parseFT(String line) throws IOException { String[] tokens = line.split(WHITESPACE); if (tokens.length < 3 || !"CDS".equals(tokens[1])) { return nextLine(); } CdsData data = new CdsData(); data.cdsLocation = tokens[2]; line = nextLine(); while (line != null) { if (!line.startsWith("FT ")) // 4 spaces { // e.g. start of next feature "FT source..." break; } /* * extract qualifier, e.g. FT /protein_id="CAA37824.1" */ int slashPos = line.indexOf('/'); if (slashPos == -1) { Cache.log.error("Unexpected EMBL line ignored: " + line); continue; } int eqPos = line.indexOf('=', slashPos + 1); if (eqPos == -1) { Cache.log.error("Unexpected EMBL line ignored: " + line); continue; } String qualifier = line.substring(slashPos + 1, eqPos); String value = line.substring(eqPos + 1); if (value.startsWith("\"") && value.endsWith("\"")) { value = value.substring(1, value.length() - 1); } if ("protein_id".equals(qualifier)) { data.proteinId = value; line = nextLine(); } else if ("codon_start".equals(qualifier)) { try { data.codonStart = Integer.parseInt(value.trim()); } catch (NumberFormatException e) { Cache.log.error("Invalid codon_start in XML for " + this.accession + ": " + e.getMessage()); } line = nextLine(); } else if ("db_xref".equals(qualifier)) { String[] parts = value.split(":"); if (parts.length == 2) { String db = parts[0].trim(); db = DBRefUtils.getCanonicalName(db); DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim()); this.dbrefs.add(dbref); } line = nextLine(); } else if ("product".equals(qualifier)) { // sometimes name is returned e.g. for V00488 data.proteinName = value; line = nextLine(); } else if ("translation".equals(qualifier)) { line = readTranslation(value, data); } else if (!"".equals(value)) { // throw anything else into the additional properties hash data.cdsProps.put(qualifier, value); line = nextLine(); } } this.cds.add(data); return line; } /** * Reads and returns the CDS translation from one or more lines of the file, * and returns the next line after that * * @param value * the first line of the translation (likely quoted) * @param data * @return * @throws IOException */ String readTranslation(String value, CdsData data) throws IOException { StringBuilder sb = new StringBuilder(this.length / 3 + 1); sb.append(value.replace("\"", "")); String line; while ((line = nextLine()) != null) { if (!line.startsWith("FT ")) { break; // reached next feature or other input line } String[] tokens = line.split(WHITESPACE); if (tokens.length < 2) { Cache.log.error("Ignoring bad EMBL line: " + line); break; } if (tokens[1].startsWith("/")) { break; // next feature qualifier } sb.append(tokens[1].replace("\"", "")); } data.translation = sb.toString(); return line; } /** * Constructs and saves the sequence from parsed components */ void assembleSequence() { String name = this.accession; if (this.sourceDb != null) { name = this.sourceDb + "|" + name; } SequenceI seq = new Sequence(name, this.sequenceString); seq.setDescription(this.description); /* * add a DBRef to itself */ DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession); int[] startEnd = new int[] { 1, seq.getLength() }; selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1)); seq.addDBRef(selfRef); for (DBRefEntry dbref : this.dbrefs) { seq.addDBRef(dbref); } processAllCDS(seq); seq.deriveSequence(); addSequence(seq); } /** * Process the CDS features, including generation of cross-references and * mappings to the protein products (translation) * * @param seq */ protected void processAllCDS(SequenceI seq) { /* * record protein products found to avoid duplication i.e. >1 CDS with * the same /protein_id [though not sure I can find an example of this] */ Map proteins = new HashMap<>(); for (CdsData data : cds) { processOneCDS(seq, data, proteins); } } /** * Processes the parsed CDS feature data to *

add a CDS feature to the sequence for each CDS start-end range
create a protein product sequence for the translation
create a cross-reference to protein with mapping from dna
add any CDS dbrefs to the sequence and to the protein product

* * @param SequenceI * dna * @param proteins * map of protein products so far derived from CDS data */ void processOneCDS(SequenceI dna, CdsData data, Map proteins) { /* * parse location into a list of [start, end, start, end] positions */ int[] exons = getCdsRanges(this.accession, data.cdsLocation); int exonNumber = 0; for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2) { int exonStart = exons[xint]; int exonEnd = exons[xint + 1]; int begin = Math.min(exonStart, exonEnd); int end = Math.max(exonStart, exonEnd); exonNumber++; String desc = String.format("Exon %d for protein EMBLCDS:%s", exonNumber, data.proteinId); SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end, this.sourceDb); for (Entry val : data.cdsProps.entrySet()) { sf.setValue(val.getKey(), val.getValue()); } sf.setEnaLocation(data.cdsLocation); boolean forwardStrand = exonStart <= exonEnd; sf.setStrand(forwardStrand ? "+" : "-"); sf.setPhase(String.valueOf(data.codonStart - 1)); sf.setValue(FeatureProperties.EXONPOS, exonNumber); sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName); dna.addSequenceFeature(sf); linkProteinProduct(dna, data, proteins); } } /** * Constructs a sequence for the protein product for the CDS data (if there is * one), and dbrefs with mappings from CDS to protein and the reverse * * @param dna * @param data * @param proteins */ void linkProteinProduct(SequenceI dna, CdsData data, Map proteins) { /* * check we have some data to work with */ if (data.proteinId == null || data.translation == null) { return; } /* * Construct the protein sequence (if not already seen) */ SequenceI protein = proteins.get(data.proteinId); if (protein == null) { protein = new Sequence(data.proteinId, data.translation, 1, data.translation.length()); protein.setDescription(data.proteinName != null ? data.proteinName : "Protein Product from " + sourceDb); proteins.put(data.proteinId, protein); } } /** * Returns the CDS location as a single array of [start, end, start, end...] * positions. If on the reverse strand, these will be in descending order. * * @param accession * @param location * @return */ protected int[] getCdsRanges(String accession, String location) { if (location == null) { return new int[] {}; } try { List ranges = DnaUtils.parseLocation(location); return MappingUtils.listToArray(ranges); } catch (ParseException e) { Cache.log.warn( String.format("Not parsing inexact CDS location %s in ENA %s", location, accession)); return new int[] {}; } } /** * Output (print) is not implemented for EMBL flat file format */ @Override public String print(SequenceI[] seqs, boolean jvsuffix) { return null; } }