3 import java.io.IOException;
4 import java.text.ParseException;
5 import java.util.ArrayList;
6 import java.util.Arrays;
7 import java.util.HashMap;
8 import java.util.Hashtable;
11 import java.util.Map.Entry;
12 import java.util.TreeMap;
14 import jalview.bin.Cache;
15 import jalview.datamodel.DBRefEntry;
16 import jalview.datamodel.DBRefSource;
17 import jalview.datamodel.FeatureProperties;
18 import jalview.datamodel.Mapping;
19 import jalview.datamodel.Sequence;
20 import jalview.datamodel.SequenceFeature;
21 import jalview.datamodel.SequenceI;
22 import jalview.util.DBRefUtils;
23 import jalview.util.DnaUtils;
24 import jalview.util.MapList;
25 import jalview.util.MappingUtils;
28 * A class that provides selective parsing of the EMBL flatfile format.
30 * The initial implementation is limited to extracting fields used by Jalview
31 * after fetching an EMBL or EMBLCDS entry:
34 * accession, version, sequence, xref
35 * and (for CDS feature) location, protein_id, product, codon_start, translation
38 * For a complete parser, it may be best to adopt that provided in
39 * https://github.com/enasequence/sequencetools/tree/master/src/main/java/uk/ac/ebi/embl/flatfile
40 * (but note this has a dependency on the Apache Commons library)
43 * @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt
44 * @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html
46 public class EmblFlatFile extends AlignFile // FileParse
48 private static final String QUOTE = "\"";
50 private static final String DOUBLED_QUOTE = QUOTE + QUOTE;
53 * A data bean class to hold values parsed from one CDS Feature (FT)
57 String translation; // from CDS feature /translation
59 String cdsLocation; // CDS /location raw value
61 int codonStart = 1; // from CDS /codon_start
63 String proteinName; // from CDS /product; used for protein description
65 String proteinId; // from CDS /protein_id
67 List<DBRefEntry> xrefs = new ArrayList<>(); // from CDS /db_xref qualifiers
69 Map<String, String> cdsProps = new Hashtable<>(); // CDS other qualifiers
72 private static final String WHITESPACE = "\\s+";
74 private String sourceDb;
77 * values parsed from the EMBL flatfile record
79 private String accession; // from ID (first token)
81 private String version; // from ID (second token)
83 private String description; // from (first) DE line
85 private int length = 128; // from ID (7th token), with usable default
87 private List<DBRefEntry> dbrefs; // from DR
89 private String sequenceString; // from SQ lines
92 * parsed CDS data fields, keyed by protein_id
94 private Map<String, CdsData> cds;
101 * @throws IOException
103 public EmblFlatFile(FileParse fp, String sourceId) throws IOException
105 super(false, fp); // don't parse immediately
106 this.sourceDb = sourceId;
107 dbrefs = new ArrayList<>();
110 * using TreeMap gives CDS sequences in alphabetical, so readable, order
112 cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
116 * Parses the flatfile, and if successful, saves as an annotated sequence
117 * which may be retrieved by calling {@code getSequence()}
119 * @throws IOException
122 public void parse() throws IOException
124 String line = nextLine();
127 if (line.startsWith("ID"))
129 line = parseID(line);
131 else if (line.startsWith("DE"))
133 line = parseDE(line);
135 else if (line.startsWith("DR"))
137 line = parseDR(line);
139 else if (line.startsWith("SQ"))
143 else if (line.startsWith("FT"))
145 line = parseFT(line);
156 * Extracts and saves the primary accession and version (SV value) from an ID
157 * line, or null if not found. Returns the next line after the one processed.
160 * @throws IOException
162 String parseID(String line) throws IOException
164 String[] tokens = line.substring(2).split(";");
167 * first is primary accession
169 String token = tokens[0].trim();
170 if (!token.isEmpty())
172 this.accession = token;
176 * second token is 'SV versionNo'
178 if (tokens.length > 1)
180 token = tokens[1].trim();
181 if (token.startsWith("SV"))
183 String[] bits = token.trim().split(WHITESPACE);
184 this.version = bits[bits.length - 1];
189 * seventh token is 'length BP'
191 if (tokens.length > 6)
193 token = tokens[6].trim();
194 String[] bits = token.trim().split(WHITESPACE);
197 this.length = Integer.valueOf(bits[0]);
198 } catch (NumberFormatException e)
200 Cache.log.error("bad length read in flatfile, line: " + line);
208 * Reads sequence description from the first DE line found. Any trailing
209 * period is discarded. If there are multiple DE lines, only the first (short
210 * description) is read, the rest are ignored.
214 * @throws IOException
216 String parseDE(String line) throws IOException
218 String desc = line.substring(2).trim();
219 if (desc.endsWith("."))
221 desc = desc.substring(0, desc.length() - 1);
223 this.description = desc;
226 * pass over any additional DE lines
228 while ((line = nextLine()) != null)
230 if (!line.startsWith("DE"))
240 * Processes one DR line and saves as a DBRefEntry cross-reference. Returns
241 * the line following the line processed.
244 * @throws IOException
246 String parseDR(String line) throws IOException
248 String[] tokens = line.substring(2).split(";");
249 if (tokens.length > 1)
252 * ensure UniProtKB/Swiss-Prot converted to UNIPROT
254 String db = tokens[0].trim();
255 db = DBRefUtils.getCanonicalName(db);
256 String acc = tokens[1].trim();
257 if (acc.endsWith("."))
259 acc = acc.substring(0, acc.length() - 1);
261 String version = "0";
262 if (tokens.length > 2)
264 String secondaryId = tokens[2].trim();
265 if (!secondaryId.isEmpty())
267 // todo: is this right? secondary id is not a version number
268 // version = secondaryId;
271 this.dbrefs.add(new DBRefEntry(db, version, acc));
278 * Reads and saves the sequence, read from the lines following the SQ line.
279 * Whitespace and position counters are discarded. Returns the next line
280 * following the sequence data (the next line that doesn't start with
283 * @throws IOException
285 String parseSQ() throws IOException
287 StringBuilder sb = new StringBuilder(this.length);
288 String line = nextLine();
289 while (line != null && line.startsWith(" "))
292 String[] blocks = line.split(WHITESPACE);
295 * omit the last block (position counter) on each line
297 for (int i = 0; i < blocks.length - 1; i++)
299 sb.append(blocks[i]);
303 this.sequenceString = sb.toString();
309 * Processes an FT line. If it declares a feature type of interest (currently,
310 * only CDS is processed), processes all of the associated lines (feature
311 * qualifiers), and returns the next line after that, otherwise simply returns
316 * @throws IOException
318 String parseFT(String line) throws IOException
320 String[] tokens = line.split(WHITESPACE);
321 if (tokens.length < 3 || !"CDS".equals(tokens[1]))
327 * parse location - which may be over more than one line e.g. EAW51554
329 CdsData data = new CdsData();
330 StringBuilder sb = new StringBuilder().append(tokens[2]);
331 line = parseFeatureQualifier(sb, "CDS");
332 data.cdsLocation = sb.toString();
336 if (!line.startsWith("FT ")) // 4 spaces
338 // e.g. start of next feature "FT source..."
343 * extract qualifier, e.g. FT /protein_id="CAA37824.1"
344 * - the value may extend over more than one line
345 * - if the value has enclosing quotes, these are removed
346 * - escaped double quotes ("") are reduced to a single character
348 int slashPos = line.indexOf('/');
351 Cache.log.error("Unexpected EMBL line ignored: " + line);
355 int eqPos = line.indexOf('=', slashPos + 1);
358 // can happen, e.g. /ribosomal_slippage
359 // Cache.log.error("Unexpected EMBL line ignored: " + line);
363 String qualifier = line.substring(slashPos + 1, eqPos);
364 String value = line.substring(eqPos + 1);
365 value = removeQuotes(value);
366 sb = new StringBuilder().append(value);
367 line = parseFeatureQualifier(sb, qualifier);
368 String featureValue = sb.toString();
370 if ("protein_id".equals(qualifier))
372 data.proteinId = featureValue;
374 else if ("codon_start".equals(qualifier))
378 data.codonStart = Integer.parseInt(featureValue.trim());
379 } catch (NumberFormatException e)
381 Cache.log.error("Invalid codon_start in XML for " + this.accession
382 + ": " + e.getMessage());
385 else if ("db_xref".equals(qualifier))
387 String[] parts = featureValue.split(":");
388 if (parts.length == 2)
390 String db = parts[0].trim();
391 db = DBRefUtils.getCanonicalName(db);
392 DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
393 data.xrefs.add(dbref);
396 else if ("product".equals(qualifier))
398 data.proteinName = featureValue;
400 else if ("translation".equals(qualifier))
402 data.translation = featureValue;
404 else if (!"".equals(featureValue))
406 // throw anything else into the additional properties hash
407 data.cdsProps.put(qualifier, featureValue);
411 if (data.proteinId != null)
413 this.cds.put(data.proteinId, data);
417 Cache.log.error("Ignoring CDS feature with no protein_id for "
418 + sourceDb + ":" + accession);
425 * Removes leading or trailing double quotes (") unless doubled, and changes
426 * any 'escaped' (doubled) double quotes to single characters. As per the
427 * Feature Table specification for Qualifiers, Free Text.
432 static String removeQuotes(String value)
438 if (value.startsWith(QUOTE) && !value.startsWith(DOUBLED_QUOTE))
440 value = value.substring(1);
442 if (value.endsWith(QUOTE) && !value.endsWith(DOUBLED_QUOTE))
444 value = value.substring(0, value.length() - 1);
446 value = value.replace(DOUBLED_QUOTE, QUOTE);
451 * Reads the value of a feature (FT) qualifier from one or more lines of the
452 * file, and returns the next line after that. Values are appended to the
453 * string buffer, which should be already primed with the value read from the
454 * first line for the qualifier (with any leading double quote removed).
455 * Enclosing double quotes are removed, and escaped (repeated) double quotes
456 * reduced to one only. For example for
459 * FT /note="gene_id=hCG28070.3
460 * FT ""foobar"" isoform=CRA_b"
461 * the returned value is
462 * gene_id=hCG28070.3 "foobar" isoform=CRA_b
465 * Note the side-effect of this method, to advance data reading to the next
466 * line after the feature qualifier.
469 * a string buffer primed with the first line of the value
470 * @param qualifierName
472 * @throws IOException
474 String parseFeatureQualifier(StringBuilder sb, String qualifierName)
478 while ((line = nextLine()) != null)
480 if (!line.startsWith("FT "))
482 break; // reached next feature or other input line
484 String[] tokens = line.split(WHITESPACE);
485 if (tokens.length < 2)
487 Cache.log.error("Ignoring bad EMBL line for " + this.accession
491 if (tokens[1].startsWith("/"))
493 break; // next feature qualifier
497 * heuristic rule: most multi-line value (e.g. /product) are text,
498 * so add a space for word boundary at a new line; not for translation
500 if (!"translation".equals(qualifierName)
501 && !"CDS".equals(qualifierName))
507 * remove trailing " and unescape doubled ""
509 String data = removeQuotes(tokens[1]);
517 * Constructs and saves the sequence from parsed components
521 if (this.accession == null || this.sequenceString == null)
523 Cache.log.error("Failed to parse data from EMBL");
527 String name = this.accession;
528 if (this.sourceDb != null)
530 name = this.sourceDb + "|" + name;
532 SequenceI seq = new Sequence(name, this.sequenceString);
533 seq.setDescription(this.description);
536 * add a DBRef to itself
538 DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession);
539 int[] startEnd = new int[] { 1, seq.getLength() };
540 selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1));
541 seq.addDBRef(selfRef);
543 for (DBRefEntry dbref : this.dbrefs)
548 processCDSFeatures(seq);
550 seq.deriveSequence();
556 * Process the CDS features, including generation of cross-references and
557 * mappings to the protein products (translation)
561 protected void processCDSFeatures(SequenceI seq)
564 * record protein products found to avoid duplication i.e. >1 CDS with
565 * the same /protein_id [though not sure I can find an example of this]
567 Map<String, SequenceI> proteins = new HashMap<>();
568 for (CdsData data : cds.values())
570 processCDSFeature(seq, data, proteins);
575 * Processes data for one parsed CDS feature to
577 * <li>create a protein product sequence for the translation</li>
578 * <li>create a cross-reference to protein with mapping from dna</li>
579 * <li>add a CDS feature to the sequence for each CDS start-end range</li>
580 * <li>add any CDS dbrefs to the sequence and to the protein product</li>
586 * map of protein products so far derived from CDS data
588 void processCDSFeature(SequenceI dna, CdsData data,
589 Map<String, SequenceI> proteins)
592 * parse location into a list of [start, end, start, end] positions
594 int[] exons = getCdsRanges(this.accession, data.cdsLocation);
596 MapList maplist = buildMappingToProtein(dna, exons, data);
600 for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
602 int exonStart = exons[xint];
603 int exonEnd = exons[xint + 1];
604 int begin = Math.min(exonStart, exonEnd);
605 int end = Math.max(exonStart, exonEnd);
607 String desc = String.format("Exon %d for protein EMBLCDS:%s",
608 exonNumber, data.proteinId);
610 SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
612 for (Entry<String, String> val : data.cdsProps.entrySet())
614 sf.setValue(val.getKey(), val.getValue());
617 sf.setEnaLocation(data.cdsLocation);
618 boolean forwardStrand = exonStart <= exonEnd;
619 sf.setStrand(forwardStrand ? "+" : "-");
620 sf.setPhase(String.valueOf(data.codonStart - 1));
621 sf.setValue(FeatureProperties.EXONPOS, exonNumber);
622 sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
624 dna.addSequenceFeature(sf);
627 boolean hasUniprotDbref = false;
628 for (DBRefEntry xref : data.xrefs)
631 if (xref.getSource().equals(DBRefSource.UNIPROT))
634 * construct (or find) the sequence for (data.protein_id, data.translation)
636 SequenceI protein = buildProteinProduct(dna, xref, data, proteins);
637 Mapping map = new Mapping(protein, maplist);
638 map.setMappedFromId(data.proteinId);
642 * add DBRefs with mappings from dna to protein and the inverse
644 DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession);
645 db1.setMap(new Mapping(dna, maplist.getInverse()));
646 protein.addDBRef(db1);
648 hasUniprotDbref = true;
653 * if we have a product (translation) but no explicit Uniprot dbref
654 * (example: EMBL M19487 protein_id AAB02592.1)
655 * then construct mappings to an assumed EMBLCDSPROTEIN accession
657 if (!hasUniprotDbref)
659 SequenceI protein = proteins.get(data.proteinId);
662 protein = new Sequence(data.proteinId, data.translation);
663 protein.setDescription(data.proteinName);
664 proteins.put(data.proteinId, protein);
666 // assuming CDSPROTEIN sequence version = dna version (?!)
667 DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct,
668 this.version, data.proteinId);
669 protein.addDBRef(db1);
671 DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
672 DBRefSource.EMBLCDSProduct, this.version, data.proteinId);
673 Mapping map = new Mapping(protein, maplist);
674 map.setMappedFromId(data.proteinId);
675 dnaToEmblProteinRef.setMap(map);
676 dna.addDBRef(dnaToEmblProteinRef);
680 * comment brought forward from EmblXmlSource, lines 447-451:
681 * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL
682 * sequence with the exon map; if given a dataset reference, search
683 * dataset for parent EMBL sequence if it exists and set its map;
684 * make a new feature annotating the coding contig
689 * Computes a mapping from CDS positions in DNA sequence to protein product
690 * positions, with allowance for stop codon or incomplete start codon
697 MapList buildMappingToProtein(final SequenceI dna, final int[] exons,
700 MapList dnaToProteinMapping = null;
701 int peptideLength = data.translation.length();
703 int[] proteinRange = new int[] { 1, peptideLength };
704 if (exons != null && exons.length > 0)
707 * We were able to parse 'location'; do a final
708 * product length truncation check
710 int[] cdsRanges = adjustForProteinLength(peptideLength, exons);
711 dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
716 * workaround until we handle all 'location' formats fully
717 * e.g. X53828.1:60..1058 or <123..>289
719 Cache.log.error(String.format(
720 "Implementation Notice: EMBLCDS location '%s'not properly supported yet"
721 + " - Making up the CDNA region of (%s:%s)... may be incorrect",
722 data.cdsLocation, sourceDb, this.accession));
724 int completeCodonsLength = 1 - data.codonStart + dna.getLength();
725 int mappedDnaEnd = dna.getEnd();
726 if (peptideLength * 3 == completeCodonsLength)
728 // this might occur for CDS sequences where no features are marked
729 Cache.log.warn("Assuming no stop codon at end of cDNA fragment");
730 mappedDnaEnd = dna.getEnd();
732 else if ((peptideLength + 1) * 3 == completeCodonsLength)
734 Cache.log.warn("Assuming stop codon at end of cDNA fragment");
735 mappedDnaEnd = dna.getEnd() - 3;
738 if (mappedDnaEnd != -1)
740 int[] cdsRanges = new int[] {
741 dna.getStart() + (data.codonStart - 1), mappedDnaEnd };
742 dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
746 return dnaToProteinMapping;
750 * Constructs a sequence for the protein product for the CDS data (if there is
751 * one), and dbrefs with mappings from CDS to protein and the reverse
759 SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref,
760 CdsData data, Map<String, SequenceI> proteins)
763 * check we have some data to work with
765 if (data.proteinId == null || data.translation == null)
771 * Construct the protein sequence (if not already seen)
773 String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId();
774 SequenceI protein = proteins.get(proteinSeqName);
777 protein = new Sequence(proteinSeqName, data.translation, 1,
778 data.translation.length());
779 protein.setDescription(data.proteinName != null ? data.proteinName
780 : "Protein Product from " + sourceDb);
781 proteins.put(proteinSeqName, protein);
788 * Returns the CDS location as a single array of [start, end, start, end...]
789 * positions. If on the reverse strand, these will be in descending order.
795 protected int[] getCdsRanges(String accession, String location)
797 if (location == null)
804 List<int[]> ranges = DnaUtils.parseLocation(location);
805 return MappingUtils.listToArray(ranges);
806 } catch (ParseException e)
809 String.format("Not parsing inexact CDS location %s in ENA %s",
810 location, accession));
816 * Output (print) is not implemented for EMBL flat file format
819 public String print(SequenceI[] seqs, boolean jvsuffix)
825 * Truncates (if necessary) the exon intervals to match 3 times the length of
826 * the protein; also accepts 3 bases longer (for stop codon not included in
829 * @param proteinLength
831 * an array of [start, end, start, end...] intervals
832 * @return the same array (if unchanged) or a truncated copy
834 static int[] adjustForProteinLength(int proteinLength, int[] exon)
836 if (proteinLength <= 0 || exon == null)
840 int expectedCdsLength = proteinLength * 3;
841 int exonLength = MappingUtils.getLength(Arrays.asList(exon));
844 * if exon length matches protein, or is shorter, or longer by the
845 * length of a stop codon (3 bases), then leave it unchanged
847 if (expectedCdsLength >= exonLength
848 || expectedCdsLength == exonLength - 3)
856 origxon = new int[exon.length];
857 System.arraycopy(exon, 0, origxon, 0, exon.length);
859 for (int x = 0; x < exon.length; x += 2)
861 cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
862 if (expectedCdsLength <= cdspos)
864 // advanced beyond last codon.
866 if (expectedCdsLength != cdspos)
869 // .println("Truncating final exon interval on region by "
870 // + (cdspos - cdslength));
874 * shrink the final exon - reduce end position if forward
875 * strand, increase it if reverse
877 if (exon[x + 1] >= exon[x])
879 endxon = exon[x + 1] - cdspos + expectedCdsLength;
883 endxon = exon[x + 1] + cdspos - expectedCdsLength;
891 // and trim the exon interval set if necessary
892 int[] nxon = new int[sxpos + 2];
893 System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
894 nxon[sxpos + 1] = endxon; // update the end boundary for the new exon