3 import java.io.IOException;
4 import java.text.ParseException;
5 import java.util.ArrayList;
6 import java.util.Arrays;
7 import java.util.HashMap;
8 import java.util.Hashtable;
11 import java.util.Map.Entry;
12 import java.util.TreeMap;
14 import jalview.bin.Cache;
15 import jalview.datamodel.DBRefEntry;
16 import jalview.datamodel.DBRefSource;
17 import jalview.datamodel.FeatureProperties;
18 import jalview.datamodel.Mapping;
19 import jalview.datamodel.Sequence;
20 import jalview.datamodel.SequenceFeature;
21 import jalview.datamodel.SequenceI;
22 import jalview.util.DBRefUtils;
23 import jalview.util.DnaUtils;
24 import jalview.util.MapList;
25 import jalview.util.MappingUtils;
28 * A class that provides selective parsing of the EMBL flatfile format.
30 * The initial implementation is limited to extracting fields used by Jalview
31 * after fetching an EMBL or EMBLCDS entry:
34 * accession, version, sequence, xref
35 * and (for CDS feature) location, protein_id, product, codon_start, translation
38 * For a complete parser, it may be best to adopt that provided in
39 * https://github.com/enasequence/sequencetools/tree/master/src/main/java/uk/ac/ebi/embl/flatfile
40 * (but note this has a dependency on the Apache Commons library)
43 * @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt
44 * @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html
46 public class EmblFlatFile extends AlignFile // FileParse
48 private static final String QUOTE = "\"";
50 private static final String DOUBLED_QUOTE = QUOTE + QUOTE;
53 * when true, interpret the mol_type 'source' feature attribute and generate
54 * an RNA sequence from the DNA record
56 private boolean produceRna = true;
59 * A data bean class to hold values parsed from one CDS Feature (FT)
63 String translation; // from CDS feature /translation
65 String cdsLocation; // CDS /location raw value
67 int codonStart = 1; // from CDS /codon_start
69 String proteinName; // from CDS /product; used for protein description
71 String proteinId; // from CDS /protein_id
73 List<DBRefEntry> xrefs = new ArrayList<>(); // from CDS /db_xref qualifiers
75 Map<String, String> cdsProps = new Hashtable<>(); // CDS other qualifiers
78 private static final String WHITESPACE = "\\s+";
80 private String sourceDb;
83 * values parsed from the EMBL flatfile record
85 private String accession; // from ID (first token)
87 private String version; // from ID (second token)
89 private String description; // from (first) DE line
91 private int length = 128; // from ID (7th token), with usable default
93 private List<DBRefEntry> dbrefs; // from DR
95 private boolean sequenceStringIsRNA = false;
97 private String sequenceString; // from SQ lines
100 * parsed CDS data fields, keyed by protein_id
102 private Map<String, CdsData> cds;
109 * @throws IOException
111 public EmblFlatFile(FileParse fp, String sourceId) throws IOException
113 super(false, fp); // don't parse immediately
114 this.sourceDb = sourceId;
115 dbrefs = new ArrayList<>();
118 * using TreeMap gives CDS sequences in alphabetical, so readable, order
120 cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
124 * Parses the flatfile, and if successful, saves as an annotated sequence
125 * which may be retrieved by calling {@code getSequence()}
127 * @throws IOException
129 public void parse() throws IOException
131 String line = nextLine();
134 if (line.startsWith("ID"))
136 line = parseID(line);
138 else if (line.startsWith("DE"))
140 line = parseDE(line);
142 else if (line.startsWith("DR"))
144 line = parseDR(line);
146 else if (line.startsWith("SQ"))
150 else if (line.startsWith("FT"))
152 line = parseFT(line);
163 * Extracts and saves the primary accession and version (SV value) from an ID
164 * line, or null if not found. Returns the next line after the one processed.
167 * @throws IOException
169 String parseID(String line) throws IOException
171 String[] tokens = line.substring(2).split(";");
174 * first is primary accession
176 String token = tokens[0].trim();
177 if (!token.isEmpty())
179 this.accession = token;
183 * second token is 'SV versionNo'
185 if (tokens.length > 1)
187 token = tokens[1].trim();
188 if (token.startsWith("SV"))
190 String[] bits = token.trim().split(WHITESPACE);
191 this.version = bits[bits.length - 1];
196 * seventh token is 'length BP'
198 if (tokens.length > 6)
200 token = tokens[6].trim();
201 String[] bits = token.trim().split(WHITESPACE);
204 this.length = Integer.valueOf(bits[0]);
205 } catch (NumberFormatException e)
207 Cache.log.error("bad length read in flatfile, line: " + line);
215 * Reads sequence description from the first DE line found. Any trailing
216 * period is discarded. If there are multiple DE lines, only the first (short
217 * description) is read, the rest are ignored.
221 * @throws IOException
223 String parseDE(String line) throws IOException
225 String desc = line.substring(2).trim();
226 if (desc.endsWith("."))
228 desc = desc.substring(0, desc.length() - 1);
230 this.description = desc;
233 * pass over any additional DE lines
235 while ((line = nextLine()) != null)
237 if (!line.startsWith("DE"))
247 * Processes one DR line and saves as a DBRefEntry cross-reference. Returns
248 * the line following the line processed.
251 * @throws IOException
253 String parseDR(String line) throws IOException
255 String[] tokens = line.substring(2).split(";");
256 if (tokens.length > 1)
259 * ensure UniProtKB/Swiss-Prot converted to UNIPROT
261 String db = tokens[0].trim();
262 db = DBRefUtils.getCanonicalName(db);
263 String acc = tokens[1].trim();
264 if (acc.endsWith("."))
266 acc = acc.substring(0, acc.length() - 1);
268 String version = "0";
269 if (tokens.length > 2)
271 String secondaryId = tokens[2].trim();
272 if (!secondaryId.isEmpty())
274 // todo: is this right? secondary id is not a version number
275 // version = secondaryId;
278 this.dbrefs.add(new DBRefEntry(db, version, acc));
285 * Reads and saves the sequence, read from the lines following the SQ line.
286 * Whitespace and position counters are discarded. Returns the next line
287 * following the sequence data (the next line that doesn't start with
290 * @throws IOException
292 String parseSQ() throws IOException
294 StringBuilder sb = new StringBuilder(this.length);
295 String line = nextLine();
296 while (line != null && line.startsWith(" "))
299 String[] blocks = line.split(WHITESPACE);
302 * omit the last block (position counter) on each line
304 for (int i = 0; i < blocks.length - 1; i++)
306 sb.append(blocks[i]);
310 this.sequenceString = sb.toString();
316 * Processes an FT line. If it declares a feature type of interest (currently,
317 * only CDS is processed), processes all of the associated lines (feature
318 * qualifiers), and returns the next line after that, otherwise simply returns
323 * @throws IOException
325 String parseFT(String line) throws IOException
327 String[] tokens = line.split(WHITESPACE);
328 if (tokens.length < 3
329 || (!"CDS".equals(tokens[1]) && !"source".equals(tokens[1])))
334 if (tokens[1].equals("source"))
336 return parseSourceQualifiers(tokens);
340 * parse location - which may be over more than one line e.g. EAW51554
342 CdsData data = new CdsData();
343 data.cdsLocation = tokens[2];
344 // TODO location can be over >1 line e.g. EAW51554
349 if (!line.startsWith("FT ")) // 4 spaces
351 // e.g. start of next feature "FT source..."
356 * extract qualifier, e.g. FT /protein_id="CAA37824.1"
357 * - the value may extend over more than one line
358 * - if the value has enclosing quotes, these are removed
359 * - escaped double quotes ("") are reduced to a single character
361 int slashPos = line.indexOf('/');
364 Cache.log.error("Unexpected EMBL line ignored: " + line);
368 int eqPos = line.indexOf('=', slashPos + 1);
371 // can happen, e.g. /ribosomal_slippage
372 // Cache.log.error("Unexpected EMBL line ignored: " + line);
376 String qualifier = line.substring(slashPos + 1, eqPos);
377 String value = line.substring(eqPos + 1);
378 value = removeQuotes(value);
379 StringBuilder sb = new StringBuilder().append(value);
380 line = parseFeatureQualifier(sb, qualifier);
381 String featureValue = sb.toString();
383 if ("protein_id".equals(qualifier))
385 data.proteinId = featureValue;
387 else if ("codon_start".equals(qualifier))
391 data.codonStart = Integer.parseInt(featureValue.trim());
392 } catch (NumberFormatException e)
394 Cache.log.error("Invalid codon_start in XML for " + this.accession
395 + ": " + e.getMessage());
398 else if ("db_xref".equals(qualifier))
400 String[] parts = featureValue.split(":");
401 if (parts.length == 2)
403 String db = parts[0].trim();
404 db = DBRefUtils.getCanonicalName(db);
405 DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
406 data.xrefs.add(dbref);
409 else if ("product".equals(qualifier))
411 data.proteinName = featureValue;
413 else if ("translation".equals(qualifier))
415 data.translation = featureValue;
417 else if (!"".equals(featureValue))
419 // throw anything else into the additional properties hash
420 data.cdsProps.put(qualifier, featureValue);
424 if (data.proteinId != null)
426 this.cds.put(data.proteinId, data);
430 Cache.log.error("Ignoring CDS feature with no protein_id for "
431 + sourceDb + ":" + accession);
438 * process attributes for 'source' until the next FT feature entry only
439 * interested in 'mol_type'
443 * @throws IOException
445 private String parseSourceQualifiers(String[] tokens) throws IOException
447 if (!"source".equals(tokens[1]))
449 throw (new RuntimeException("Not given a source qualifier"));
451 // search for mol_type attribute
453 StringBuilder sb = new StringBuilder().append(tokens[2]); // extent of
456 String line = parseFeatureQualifier(sb, "source");
459 if (!line.startsWith("FT ")) // four spaces, end of this feature table
465 int p = line.indexOf("\\mol_type");
466 int qs = line.indexOf("\"", p);
467 int qe = line.indexOf("\"", qs + 1);
468 String qualifier = line.substring(qs, qe).toLowerCase();
469 if (qualifier.indexOf("rna") > -1)
471 sequenceStringIsRNA = true;
473 if (qualifier.indexOf("dna") > -1)
475 sequenceStringIsRNA = false;
477 line = parseFeatureQualifier(sb, "source");
483 * Removes leading or trailing double quotes (") unless doubled, and changes
484 * any 'escaped' (doubled) double quotes to single characters. As per the
485 * Feature Table specification for Qualifiers, Free Text.
490 static String removeQuotes(String value)
496 if (value.startsWith(QUOTE) && !value.startsWith(DOUBLED_QUOTE))
498 value = value.substring(1);
500 if (value.endsWith(QUOTE) && !value.endsWith(DOUBLED_QUOTE))
502 value = value.substring(0, value.length() - 1);
504 value = value.replace(DOUBLED_QUOTE, QUOTE);
509 * Reads the value of a feature (FT) qualifier from one or more lines of the
510 * file, and returns the next line after that. Values are appended to the
511 * string buffer, which should be already primed with the value read from the
512 * first line for the qualifier (with any leading double quote removed).
513 * Enclosing double quotes are removed, and escaped (repeated) double quotes
514 * reduced to one only. For example for
517 * FT /note="gene_id=hCG28070.3
518 * FT ""foobar"" isoform=CRA_b"
519 * the returned value is
520 * gene_id=hCG28070.3 "foobar" isoform=CRA_b
523 * Note the side-effect of this method, to advance data reading to the next
524 * line after the feature qualifier.
527 * a string buffer primed with the first line of the value
528 * @param qualifierName
530 * @throws IOException
532 String parseFeatureQualifier(StringBuilder sb, String qualifierName)
536 while ((line = nextLine()) != null)
538 if (!line.startsWith("FT "))
540 break; // reached next feature or other input line
542 String[] tokens = line.split(WHITESPACE);
543 if (tokens.length < 2)
545 Cache.log.error("Ignoring bad EMBL line for " + this.accession
549 if (tokens[1].startsWith("/"))
551 break; // next feature qualifier
555 * heuristic rule: most multi-line value (e.g. /product) are text,
556 * so add a space for word boundary at a new line; not for translation
558 if (!"translation".equals(qualifierName))
564 * remove trailing " and unescape doubled ""
566 String data = removeQuotes(tokens[1]);
574 * Constructs and saves the sequence from parsed components
578 if (this.accession == null || this.sequenceString == null)
580 Cache.log.error("Failed to parse data from EMBL");
584 String name = this.accession;
585 if (this.sourceDb != null)
587 name = this.sourceDb + "|" + name;
590 if (produceRna && sequenceStringIsRNA)
592 sequenceString = sequenceString.replace('T', 'U').replace('t', 'u');
595 SequenceI seq = new Sequence(name, this.sequenceString);
596 seq.setDescription(this.description);
599 * add a DBRef to itself
601 DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession);
602 int[] startEnd = new int[] { 1, seq.getLength() };
603 selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1));
604 seq.addDBRef(selfRef);
606 for (DBRefEntry dbref : this.dbrefs)
611 processCDSFeatures(seq);
613 seq.deriveSequence();
619 * Process the CDS features, including generation of cross-references and
620 * mappings to the protein products (translation)
624 protected void processCDSFeatures(SequenceI seq)
627 * record protein products found to avoid duplication i.e. >1 CDS with
628 * the same /protein_id [though not sure I can find an example of this]
630 Map<String, SequenceI> proteins = new HashMap<>();
631 for (CdsData data : cds.values())
633 processCDSFeature(seq, data, proteins);
638 * Processes data for one parsed CDS feature to
640 * <li>create a protein product sequence for the translation</li>
641 * <li>create a cross-reference to protein with mapping from dna</li>
642 * <li>add a CDS feature to the sequence for each CDS start-end range</li>
643 * <li>add any CDS dbrefs to the sequence and to the protein product</li>
649 * map of protein products so far derived from CDS data
651 void processCDSFeature(SequenceI dna, CdsData data,
652 Map<String, SequenceI> proteins)
655 * parse location into a list of [start, end, start, end] positions
657 int[] exons = getCdsRanges(this.accession, data.cdsLocation);
659 MapList maplist = buildMappingToProtein(dna, exons, data);
663 for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
665 int exonStart = exons[xint];
666 int exonEnd = exons[xint + 1];
667 int begin = Math.min(exonStart, exonEnd);
668 int end = Math.max(exonStart, exonEnd);
670 String desc = String.format("Exon %d for protein EMBLCDS:%s",
671 exonNumber, data.proteinId);
673 SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
675 for (Entry<String, String> val : data.cdsProps.entrySet())
677 sf.setValue(val.getKey(), val.getValue());
680 sf.setEnaLocation(data.cdsLocation);
681 boolean forwardStrand = exonStart <= exonEnd;
682 sf.setStrand(forwardStrand ? "+" : "-");
683 sf.setPhase(String.valueOf(data.codonStart - 1));
684 sf.setValue(FeatureProperties.EXONPOS, exonNumber);
685 sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
687 dna.addSequenceFeature(sf);
690 boolean hasUniprotDbref = false;
691 for (DBRefEntry xref : data.xrefs)
694 if (xref.getSource().equals(DBRefSource.UNIPROT))
697 * construct (or find) the sequence for (data.protein_id, data.translation)
699 SequenceI protein = buildProteinProduct(dna, xref, data, proteins);
700 Mapping map = new Mapping(protein, maplist);
701 map.setMappedFromId(data.proteinId);
705 * add DBRefs with mappings from dna to protein and the inverse
707 DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession);
708 db1.setMap(new Mapping(dna, maplist.getInverse()));
709 protein.addDBRef(db1);
711 hasUniprotDbref = true;
716 * if we have a product (translation) but no explicit Uniprot dbref
717 * (example: EMBL M19487 protein_id AAB02592.1)
718 * then construct mappings to an assumed EMBLCDSPROTEIN accession
720 if (!hasUniprotDbref)
722 SequenceI protein = proteins.get(data.proteinId);
725 protein = new Sequence(data.proteinId, data.translation);
726 protein.setDescription(data.proteinName);
727 proteins.put(data.proteinId, protein);
729 // assuming CDSPROTEIN sequence version = dna version (?!)
730 DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct,
731 this.version, data.proteinId);
732 protein.addDBRef(db1);
734 DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
735 DBRefSource.EMBLCDSProduct, this.version, data.proteinId);
736 Mapping map = new Mapping(protein, maplist);
737 map.setMappedFromId(data.proteinId);
738 dnaToEmblProteinRef.setMap(map);
739 dna.addDBRef(dnaToEmblProteinRef);
743 * comment brought forward from EmblXmlSource, lines 447-451:
744 * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL
745 * sequence with the exon map; if given a dataset reference, search
746 * dataset for parent EMBL sequence if it exists and set its map;
747 * make a new feature annotating the coding contig
752 * Computes a mapping from CDS positions in DNA sequence to protein product
753 * positions, with allowance for stop codon or incomplete start codon
760 MapList buildMappingToProtein(final SequenceI dna, final int[] exons,
763 MapList dnaToProteinMapping = null;
764 int peptideLength = data.translation.length();
766 int[] proteinRange = new int[] { 1, peptideLength };
767 if (exons != null && exons.length > 0)
770 * We were able to parse 'location'; do a final
771 * product length truncation check
773 int[] cdsRanges = adjustForProteinLength(peptideLength, exons);
774 dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
779 * workaround until we handle all 'location' formats fully
780 * e.g. X53828.1:60..1058 or <123..>289
782 Cache.log.error(String.format(
783 "Implementation Notice: EMBLCDS location '%s'not properly supported yet"
784 + " - Making up the CDNA region of (%s:%s)... may be incorrect",
785 data.cdsLocation, sourceDb, this.accession));
787 int completeCodonsLength = 1 - data.codonStart + dna.getLength();
788 int mappedDnaEnd = dna.getEnd();
789 if (peptideLength * 3 == completeCodonsLength)
791 // this might occur for CDS sequences where no features are marked
792 Cache.log.warn("Assuming no stop codon at end of cDNA fragment");
793 mappedDnaEnd = dna.getEnd();
795 else if ((peptideLength + 1) * 3 == completeCodonsLength)
797 Cache.log.warn("Assuming stop codon at end of cDNA fragment");
798 mappedDnaEnd = dna.getEnd() - 3;
801 if (mappedDnaEnd != -1)
803 int[] cdsRanges = new int[] {
804 dna.getStart() + (data.codonStart - 1), mappedDnaEnd };
805 dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
809 return dnaToProteinMapping;
813 * Constructs a sequence for the protein product for the CDS data (if there is
814 * one), and dbrefs with mappings from CDS to protein and the reverse
822 SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref,
823 CdsData data, Map<String, SequenceI> proteins)
826 * check we have some data to work with
828 if (data.proteinId == null || data.translation == null)
834 * Construct the protein sequence (if not already seen)
836 String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId();
837 SequenceI protein = proteins.get(proteinSeqName);
840 protein = new Sequence(proteinSeqName, data.translation, 1,
841 data.translation.length());
842 protein.setDescription(data.proteinName != null ? data.proteinName
843 : "Protein Product from " + sourceDb);
844 proteins.put(proteinSeqName, protein);
851 * Returns the CDS location as a single array of [start, end, start, end...]
852 * positions. If on the reverse strand, these will be in descending order.
858 protected int[] getCdsRanges(String accession, String location)
860 if (location == null)
867 List<int[]> ranges = DnaUtils.parseLocation(location);
868 return MappingUtils.rangeListToArray(ranges);
869 } catch (ParseException e)
872 String.format("Not parsing inexact CDS location %s in ENA %s",
873 location, accession));
879 * Output (print) is not implemented for EMBL flat file format
882 public String print(SequenceI[] seqs, boolean jvsuffix)
888 * Truncates (if necessary) the exon intervals to match 3 times the length of
889 * the protein; also accepts 3 bases longer (for stop codon not included in
892 * @param proteinLength
894 * an array of [start, end, start, end...] intervals
895 * @return the same array (if unchanged) or a truncated copy
897 static int[] adjustForProteinLength(int proteinLength, int[] exon)
899 if (proteinLength <= 0 || exon == null)
903 int expectedCdsLength = proteinLength * 3;
904 int exonLength = MappingUtils.getLength(Arrays.asList(exon));
907 * if exon length matches protein, or is shorter, or longer by the
908 * length of a stop codon (3 bases), then leave it unchanged
910 if (expectedCdsLength >= exonLength
911 || expectedCdsLength == exonLength - 3)
919 origxon = new int[exon.length];
920 System.arraycopy(exon, 0, origxon, 0, exon.length);
922 for (int x = 0; x < exon.length; x += 2)
924 cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
925 if (expectedCdsLength <= cdspos)
927 // advanced beyond last codon.
929 if (expectedCdsLength != cdspos)
932 // .println("Truncating final exon interval on region by "
933 // + (cdspos - cdslength));
937 * shrink the final exon - reduce end position if forward
938 * strand, increase it if reverse
940 if (exon[x + 1] >= exon[x])
942 endxon = exon[x + 1] - cdspos + expectedCdsLength;
946 endxon = exon[x + 1] + cdspos - expectedCdsLength;
954 // and trim the exon interval set if necessary
955 int[] nxon = new int[sxpos + 2];
956 System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
957 nxon[sxpos + 1] = endxon; // update the end boundary for the new exon