3 import java.io.IOException;
4 import java.text.ParseException;
5 import java.util.ArrayList;
6 import java.util.Arrays;
7 import java.util.HashMap;
8 import java.util.Hashtable;
11 import java.util.Map.Entry;
12 import java.util.TreeMap;
14 import jalview.bin.Cache;
15 import jalview.datamodel.DBRefEntry;
16 import jalview.datamodel.DBRefSource;
17 import jalview.datamodel.FeatureProperties;
18 import jalview.datamodel.Mapping;
19 import jalview.datamodel.Sequence;
20 import jalview.datamodel.SequenceFeature;
21 import jalview.datamodel.SequenceI;
22 import jalview.util.DBRefUtils;
23 import jalview.util.DnaUtils;
24 import jalview.util.MapList;
25 import jalview.util.MappingUtils;
28 * A class that provides selective parsing of the EMBL flatfile format.
30 * The initial implementation is limited to extracting fields used by Jalview
31 * after fetching an EMBL or EMBLCDS entry:
34 * accession, version, sequence, xref
35 * and (for CDS feature) location, protein_id, product, codon_start, translation
38 * For a complete parser, it may be best to adopt that provided in
39 * https://github.com/enasequence/sequencetools/tree/master/src/main/java/uk/ac/ebi/embl/flatfile
40 * (but note this has a dependency on the Apache Commons library)
43 * @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt
44 * @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html
46 public class EmblFlatFile extends AlignFile // FileParse
48 private static final String QUOTE = "\"";
51 * A data bean class to hold values parsed from one CDS Feature (FT)
55 String translation; // from CDS feature /translation
57 String cdsLocation; // CDS /location raw value
59 int codonStart = 1; // from CDS /codon_start
61 String proteinName; // from CDS /product; used for protein description
63 String proteinId; // from CDS /protein_id
65 List<DBRefEntry> xrefs = new ArrayList<>(); // from CDS /db_xref qualifiers
67 Map<String, String> cdsProps = new Hashtable<>(); // CDS other qualifiers
70 private static final String WHITESPACE = "\\s+";
72 private String sourceDb;
75 * values parsed from the EMBL flatfile record
77 private String accession; // from ID (first token)
79 private String version; // from ID (second token)
81 private String description; // from (first) DE line
83 private int length = 128; // from ID (7th token), with usable default
85 private List<DBRefEntry> dbrefs; // from DR
87 private String sequenceString; // from SQ lines
90 * parsed CDS data fields, keyed by protein_id
92 private Map<String, CdsData> cds;
101 public EmblFlatFile(FileParse fp, String sourceId) throws IOException
103 super(false, fp); // don't parse immediately
104 this.sourceDb = sourceId;
105 dbrefs = new ArrayList<>();
108 * using TreeMap gives CDS sequences in alphabetical, so readable, order
110 cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
114 * Parses the flatfile, and if successful, saves as an annotated sequence
115 * which may be retrieved by calling {@code getSequence()}
117 * @throws IOException
119 public void parse() throws IOException
121 String line = nextLine();
124 if (line.startsWith("ID"))
126 line = parseID(line);
128 else if (line.startsWith("DE"))
130 line = parseDE(line);
132 else if (line.startsWith("DR"))
134 line = parseDR(line);
136 else if (line.startsWith("SQ"))
140 else if (line.startsWith("FT"))
142 line = parseFT(line);
153 * Extracts and saves the primary accession and version (SV value) from an ID
154 * line, or null if not found. Returns the next line after the one processed.
157 * @throws IOException
159 String parseID(String line) throws IOException
161 String[] tokens = line.substring(2).split(";");
164 * first is primary accession
166 String token = tokens[0].trim();
167 if (!token.isEmpty())
169 this.accession = token;
173 * second token is 'SV versionNo'
175 if (tokens.length > 1)
177 token = tokens[1].trim();
178 if (token.startsWith("SV"))
180 String[] bits = token.trim().split(WHITESPACE);
181 this.version = bits[bits.length - 1];
186 * seventh token is 'length BP'
188 if (tokens.length > 6)
190 token = tokens[6].trim();
191 String[] bits = token.trim().split(WHITESPACE);
194 this.length = Integer.valueOf(bits[0]);
195 } catch (NumberFormatException e)
197 Cache.log.error("bad length read in flatfile, line: " + line);
205 * Reads sequence description from the first DE line found. Any trailing
206 * period is discarded. If there are multiple DE lines, only the first (short
207 * description) is read, the rest are ignored.
211 * @throws IOException
213 String parseDE(String line) throws IOException
215 String desc = line.substring(2).trim();
216 if (desc.endsWith("."))
218 desc = desc.substring(0, desc.length() - 1);
220 this.description = desc;
223 * pass over any additional DE lines
225 while ((line = nextLine()) != null)
227 if (!line.startsWith("DE"))
237 * Processes one DR line and saves as a DBRefEntry cross-reference. Returns
238 * the line following the line processed.
241 * @throws IOException
243 String parseDR(String line) throws IOException
245 String[] tokens = line.substring(2).split(";");
246 if (tokens.length > 1)
249 * ensure UniProtKB/Swiss-Prot converted to UNIPROT
251 String db = tokens[0].trim();
252 db = DBRefUtils.getCanonicalName(db);
253 String acc = tokens[1].trim();
254 if (acc.endsWith("."))
256 acc = acc.substring(0, acc.length() - 1);
258 String version = "0";
259 if (tokens.length > 2)
261 String secondaryId = tokens[2].trim();
262 if (!secondaryId.isEmpty())
264 // todo: is this right? secondary id is not a version number
265 // version = secondaryId;
268 this.dbrefs.add(new DBRefEntry(db, version, acc));
275 * Reads and saves the sequence, read from the lines following the SQ line.
276 * Whitespace and position counters are discarded. Returns the next line
277 * following the sequence data (the next line that doesn't start with
280 * @throws IOException
282 String parseSQ() throws IOException
284 StringBuilder sb = new StringBuilder(this.length);
285 String line = nextLine();
286 while (line != null && line.startsWith(" "))
289 String[] blocks = line.split(WHITESPACE);
292 * omit the last block (position counter) on each line
294 for (int i = 0; i < blocks.length - 1; i++)
296 sb.append(blocks[i]);
300 this.sequenceString = sb.toString();
306 * Processes an FT line. If it declares a feature type of interest (currently,
307 * only CDS is processed), processes all of the associated lines (feature
308 * qualifiers), and returns the next line after that, otherwise simply returns
313 * @throws IOException
315 String parseFT(String line) throws IOException
317 String[] tokens = line.split(WHITESPACE);
318 if (tokens.length < 3 || !"CDS".equals(tokens[1]))
323 CdsData data = new CdsData();
324 data.cdsLocation = tokens[2];
329 if (!line.startsWith("FT ")) // 4 spaces
331 // e.g. start of next feature "FT source..."
336 * extract qualifier, e.g. FT /protein_id="CAA37824.1"
338 int slashPos = line.indexOf('/');
341 Cache.log.error("Unexpected EMBL line ignored: " + line);
344 int eqPos = line.indexOf('=', slashPos + 1);
347 // can happen, e.g. /ribosomal_slippage
348 // Cache.log.error("Unexpected EMBL line ignored: " + line);
352 String qualifier = line.substring(slashPos + 1, eqPos);
353 String value = line.substring(eqPos + 1);
354 if (value.startsWith(QUOTE) && value.endsWith(QUOTE))
356 value = value.substring(1, value.length() - 1);
359 if ("protein_id".equals(qualifier))
361 data.proteinId = value;
364 else if ("codon_start".equals(qualifier))
368 data.codonStart = Integer.parseInt(value.trim());
369 } catch (NumberFormatException e)
371 Cache.log.error("Invalid codon_start in XML for " + this.accession
372 + ": " + e.getMessage());
376 else if ("db_xref".equals(qualifier))
378 String[] parts = value.split(":");
379 if (parts.length == 2)
381 String db = parts[0].trim();
382 db = DBRefUtils.getCanonicalName(db);
383 DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
384 data.xrefs.add(dbref);
388 else if ("product".equals(qualifier))
390 // sometimes name is returned e.g. for V00488
391 data.proteinName = value;
394 else if ("translation".equals(qualifier))
396 line = parseTranslation(value, data);
398 else if (!"".equals(value))
400 // throw anything else into the additional properties hash
401 data.cdsProps.put(qualifier, value);
406 if (data.proteinId != null)
408 this.cds.put(data.proteinId, data);
412 Cache.log.error("Ignoring CDS feature with no protein_id for "
413 + sourceDb + ":" + accession);
420 * Reads and returns the CDS translation from one or more lines of the file,
421 * and returns the next line after that
424 * the first line of the translation (likely quoted)
427 * @throws IOException
429 String parseTranslation(String value, CdsData data) throws IOException
431 StringBuilder sb = new StringBuilder(this.length / 3 + 1);
432 sb.append(value.replace(QUOTE, ""));
435 while ((line = nextLine()) != null)
437 if (!line.startsWith("FT "))
439 break; // reached next feature or other input line
441 String[] tokens = line.split(WHITESPACE);
442 if (tokens.length < 2)
444 Cache.log.error("Ignoring bad EMBL line: " + line);
447 if (tokens[1].startsWith("/"))
449 break; // next feature qualifier
451 sb.append(tokens[1].replace(QUOTE, ""));
454 data.translation = sb.toString();
460 * Constructs and saves the sequence from parsed components
464 String name = this.accession;
465 if (this.sourceDb != null)
467 name = this.sourceDb + "|" + name;
469 SequenceI seq = new Sequence(name, this.sequenceString);
470 seq.setDescription(this.description);
473 * add a DBRef to itself
475 DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession);
476 int[] startEnd = new int[] { 1, seq.getLength() };
477 selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1));
478 seq.addDBRef(selfRef);
480 for (DBRefEntry dbref : this.dbrefs)
485 processCDSFeatures(seq);
487 seq.deriveSequence();
493 * Process the CDS features, including generation of cross-references and
494 * mappings to the protein products (translation)
498 protected void processCDSFeatures(SequenceI seq)
501 * record protein products found to avoid duplication i.e. >1 CDS with
502 * the same /protein_id [though not sure I can find an example of this]
504 Map<String, SequenceI> proteins = new HashMap<>();
505 for (CdsData data : cds.values())
507 processCDSFeature(seq, data, proteins);
512 * Processes data for one parsed CDS feature to
514 * <li>create a protein product sequence for the translation</li>
515 * <li>create a cross-reference to protein with mapping from dna</li>
516 * <li>add a CDS feature to the sequence for each CDS start-end range</li>
517 * <li>add any CDS dbrefs to the sequence and to the protein product</li>
523 * map of protein products so far derived from CDS data
525 void processCDSFeature(SequenceI dna, CdsData data,
526 Map<String, SequenceI> proteins)
529 * parse location into a list of [start, end, start, end] positions
531 int[] exons = getCdsRanges(this.accession, data.cdsLocation);
533 MapList maplist = buildMappingToProtein(dna, exons, data);
537 for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
539 int exonStart = exons[xint];
540 int exonEnd = exons[xint + 1];
541 int begin = Math.min(exonStart, exonEnd);
542 int end = Math.max(exonStart, exonEnd);
544 String desc = String.format("Exon %d for protein EMBLCDS:%s",
545 exonNumber, data.proteinId);
547 SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
549 for (Entry<String, String> val : data.cdsProps.entrySet())
551 sf.setValue(val.getKey(), val.getValue());
554 sf.setEnaLocation(data.cdsLocation);
555 boolean forwardStrand = exonStart <= exonEnd;
556 sf.setStrand(forwardStrand ? "+" : "-");
557 sf.setPhase(String.valueOf(data.codonStart - 1));
558 sf.setValue(FeatureProperties.EXONPOS, exonNumber);
559 sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
561 dna.addSequenceFeature(sf);
564 boolean hasUniprotDbref = false;
565 for (DBRefEntry xref : data.xrefs)
568 if (xref.getSource().equals(DBRefSource.UNIPROT))
571 * construct (or find) the sequence for (data.protein_id, data.translation)
573 SequenceI protein = buildProteinProduct(dna, xref, data, proteins);
574 Mapping map = new Mapping(protein, maplist);
575 map.setMappedFromId(data.proteinId);
579 * add DBRefs with mappings from dna to protein and the inverse
581 DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession);
582 db1.setMap(new Mapping(dna, maplist.getInverse()));
583 protein.addDBRef(db1);
585 hasUniprotDbref = true;
590 * if we have a product (translation) but no explicit Uniprot dbref
591 * (example: EMBL M19487 protein_id AAB02592.1)
592 * then construct mappings to an assumed EMBLCDSPROTEIN accession
594 if (!hasUniprotDbref)
596 SequenceI protein = proteins.get(data.proteinId);
599 protein = new Sequence(data.proteinId, data.translation);
600 proteins.put(data.proteinId, protein);
602 // assuming CDSPROTEIN sequence version = dna version (?!)
603 DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct,
604 this.version, data.proteinId);
605 protein.addDBRef(db1);
607 DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
608 DBRefSource.EMBLCDSProduct, this.version, data.proteinId);
609 Mapping map = new Mapping(protein, maplist);
610 map.setMappedFromId(data.proteinId);
611 dnaToEmblProteinRef.setMap(map);
612 dna.addDBRef(dnaToEmblProteinRef);
616 * comment brought forward from EmblXmlSource, lines 447-451:
617 * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL
618 * sequence with the exon map; if given a dataset reference, search
619 * dataset for parent EMBL sequence if it exists and set its map;
620 * make a new feature annotating the coding contig
625 * Computes a mapping from CDS positions in DNA sequence to protein product
626 * positions, with allowance for stop codon or incomplete start codon
633 MapList buildMappingToProtein(final SequenceI dna, final int[] exons,
636 MapList dnaToProteinMapping = null;
637 int peptideLength = data.translation.length();
639 int[] proteinRange = new int[] { 1, peptideLength };
640 if (exons != null && exons.length > 0)
643 * We were able to parse 'location'; do a final
644 * product length truncation check
646 int[] cdsRanges = adjustForProteinLength(peptideLength, exons);
647 dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
652 * workaround until we handle all 'location' formats fully
653 * e.g. X53828.1:60..1058 or <123..>289
655 Cache.log.error(String.format(
656 "Implementation Notice: EMBLCDS location '%s'not properly supported yet"
657 + " - Making up the CDNA region of (%s:%s)... may be incorrect",
658 data.cdsLocation, sourceDb, this.accession));
660 int completeCodonsLength = 1 - data.codonStart + dna.getLength();
661 int mappedDnaEnd = dna.getEnd();
662 if (peptideLength * 3 == completeCodonsLength)
664 // this might occur for CDS sequences where no features are marked
665 Cache.log.warn("Assuming no stop codon at end of cDNA fragment");
666 mappedDnaEnd = dna.getEnd();
668 else if ((peptideLength + 1) * 3 == completeCodonsLength)
670 Cache.log.warn("Assuming stop codon at end of cDNA fragment");
671 mappedDnaEnd = dna.getEnd() - 3;
674 if (mappedDnaEnd != -1)
676 int[] cdsRanges = new int[] {
677 dna.getStart() + (data.codonStart - 1), mappedDnaEnd };
678 dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
682 return dnaToProteinMapping;
686 * Constructs a sequence for the protein product for the CDS data (if there is
687 * one), and dbrefs with mappings from CDS to protein and the reverse
695 SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref,
696 CdsData data, Map<String, SequenceI> proteins)
699 * check we have some data to work with
701 if (data.proteinId == null || data.translation == null)
707 * Construct the protein sequence (if not already seen)
709 String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId();
710 SequenceI protein = proteins.get(proteinSeqName);
713 protein = new Sequence(proteinSeqName, data.translation, 1,
714 data.translation.length());
715 protein.setDescription(data.proteinName != null ? data.proteinName
716 : "Protein Product from " + sourceDb);
717 proteins.put(proteinSeqName, protein);
724 * Returns the CDS location as a single array of [start, end, start, end...]
725 * positions. If on the reverse strand, these will be in descending order.
731 protected int[] getCdsRanges(String accession, String location)
733 if (location == null)
740 List<int[]> ranges = DnaUtils.parseLocation(location);
741 return MappingUtils.listToArray(ranges);
742 } catch (ParseException e)
745 String.format("Not parsing inexact CDS location %s in ENA %s",
746 location, accession));
752 * Output (print) is not implemented for EMBL flat file format
755 public String print(SequenceI[] seqs, boolean jvsuffix)
761 * Truncates (if necessary) the exon intervals to match 3 times the length of
762 * the protein; also accepts 3 bases longer (for stop codon not included in
765 * @param proteinLength
767 * an array of [start, end, start, end...] intervals
768 * @return the same array (if unchanged) or a truncated copy
770 static int[] adjustForProteinLength(int proteinLength, int[] exon)
772 if (proteinLength <= 0 || exon == null)
776 int expectedCdsLength = proteinLength * 3;
777 int exonLength = MappingUtils.getLength(Arrays.asList(exon));
780 * if exon length matches protein, or is shorter, or longer by the
781 * length of a stop codon (3 bases), then leave it unchanged
783 if (expectedCdsLength >= exonLength
784 || expectedCdsLength == exonLength - 3)
792 origxon = new int[exon.length];
793 System.arraycopy(exon, 0, origxon, 0, exon.length);
795 for (int x = 0; x < exon.length; x += 2)
797 cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
798 if (expectedCdsLength <= cdspos)
800 // advanced beyond last codon.
802 if (expectedCdsLength != cdspos)
805 // .println("Truncating final exon interval on region by "
806 // + (cdspos - cdslength));
810 * shrink the final exon - reduce end position if forward
811 * strand, increase it if reverse
813 if (exon[x + 1] >= exon[x])
815 endxon = exon[x + 1] - cdspos + expectedCdsLength;
819 endxon = exon[x + 1] + cdspos - expectedCdsLength;
827 // and trim the exon interval set if necessary
828 int[] nxon = new int[sxpos + 2];
829 System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
830 nxon[sxpos + 1] = endxon; // update the end boundary for the new exon