2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.ws.dbsources;
23 import java.util.Locale;
26 import java.io.FileInputStream;
27 import java.io.InputStream;
28 import java.text.ParseException;
29 import java.util.ArrayList;
30 import java.util.Arrays;
31 import java.util.Hashtable;
32 import java.util.List;
34 import java.util.Map.Entry;
36 import javax.xml.bind.JAXBContext;
37 import javax.xml.bind.JAXBElement;
38 import javax.xml.bind.JAXBException;
39 import javax.xml.stream.FactoryConfigurationError;
40 import javax.xml.stream.XMLInputFactory;
41 import javax.xml.stream.XMLStreamException;
42 import javax.xml.stream.XMLStreamReader;
45 import jalview.analysis.SequenceIdMatcher;
46 import jalview.bin.Console;
47 import jalview.datamodel.Alignment;
48 import jalview.datamodel.AlignmentI;
49 import jalview.datamodel.DBRefEntry;
50 import jalview.datamodel.DBRefSource;
51 import jalview.datamodel.FeatureProperties;
52 import jalview.datamodel.Mapping;
53 import jalview.datamodel.Sequence;
54 import jalview.datamodel.SequenceFeature;
55 import jalview.datamodel.SequenceI;
56 import jalview.util.DBRefUtils;
57 import jalview.util.DnaUtils;
58 import jalview.util.MapList;
59 import jalview.util.MappingUtils;
60 import jalview.util.MessageManager;
61 import jalview.util.Platform;
62 import jalview.ws.ebi.EBIFetchClient;
63 import jalview.xml.binding.embl.EntryType;
64 import jalview.xml.binding.embl.EntryType.Feature;
65 import jalview.xml.binding.embl.EntryType.Feature.Qualifier;
66 import jalview.xml.binding.embl.ROOT;
67 import jalview.xml.binding.embl.XrefType;
69 import com.stevesoft.pat.Regex;
72 * Provides XML binding and parsing of EMBL or EMBLCDS records retrieved from
73 * (e.g.) {@code https://www.ebi.ac.uk/ena/data/view/x53828&display=xml}.
75 * @deprecated endpoint withdrawn August 2020 (JAL-3692), use EmblFlatfileSource
78 public abstract class EmblXmlSource extends EbiFileRetrievedProxy
80 // TODO: delete class or update tyhis validator for 2.12 style Platform.regex
81 private static final Regex ACCESSION_REGEX = Platform.newRegex("^[A-Z]+[0-9]+");
84 * JAL-1856 Embl returns this text for query not found
86 private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found.";
88 public EmblXmlSource()
94 * Retrieves and parses an emblxml file, and returns an alignment containing
95 * the parsed sequences, or null if none were found
98 * "EMBL" or "EMBLCDS" - anything else will not retrieve emblxml
103 protected AlignmentI getEmblSequenceRecords(String emprefx, String query)
107 EBIFetchClient dbFetch = new EBIFetchClient();
111 reply = dbFetch.fetchDataAsFile(
112 emprefx.toLowerCase(Locale.ROOT) + ":" + query.trim(), "display=xml",
114 } catch (Exception e)
118 String.format("EBI EMBL XML retrieval failed for %s:%s",
119 emprefx.toLowerCase(Locale.ROOT), query.trim()),
122 return getEmblSequenceRecords(emprefx, query, reply);
126 * parse an emblxml file stored locally
129 * either EMBL or EMBLCDS strings are allowed - anything else will
130 * not retrieve emblxml
133 * the EMBL XML file containing the results of a query
137 protected AlignmentI getEmblSequenceRecords(String emprefx, String query,
138 File reply) throws Exception
140 List<EntryType> entries = null;
141 if (reply != null && reply.exists())
143 file = reply.getAbsolutePath();
144 if (reply.length() > EMBL_NOT_FOUND_REPLY.length())
146 InputStream is = new FileInputStream(reply);
147 entries = getEmblEntries(is);
152 * invalid accession gets a reply with no <entry> elements, text content of
153 * EmbFile reads something like (e.g.) this ungrammatical phrase
154 * Entry: <acc> display type is either not supported or entry is not found.
156 AlignmentI al = null;
157 List<SequenceI> seqs = new ArrayList<>();
158 List<SequenceI> peptides = new ArrayList<>();
161 for (EntryType entry : entries)
163 SequenceI seq = getSequence(emprefx, entry, peptides);
166 seqs.add(seq.deriveSequence());
167 // place DBReferences on dataset and refer
172 al = new Alignment(seqs.toArray(new SequenceI[seqs.size()]));
177 "No record found for '" + emprefx + ":" + query + "'");
186 * Reads the XML reply from file and unmarshals it to Java objects. Answers a
187 * (possibly empty) list of <code>EntryType</code> objects.
193 List<EntryType> getEmblEntries(InputStream is)
195 List<EntryType> entries = new ArrayList<>();
198 JAXBContext jc = JAXBContext.newInstance("jalview.xml.binding.embl");
199 XMLStreamReader streamReader = XMLInputFactory.newInstance()
200 .createXMLStreamReader(is);
201 javax.xml.bind.Unmarshaller um = jc.createUnmarshaller();
202 JAXBElement<ROOT> rootElement = um.unmarshal(streamReader,
204 ROOT root = rootElement.getValue();
207 * document root contains either "entry" or "entrySet"
213 if (root.getEntrySet() != null)
215 entries = root.getEntrySet().getEntry();
217 else if (root.getEntry() != null)
219 entries.add(root.getEntry());
221 } catch (JAXBException | XMLStreamException
222 | FactoryConfigurationError e)
230 * A helper method to parse XML data and construct a sequence, with any
231 * available database references and features
238 SequenceI getSequence(String sourceDb, EntryType entry,
239 List<SequenceI> peptides)
241 String seqString = entry.getSequence();
242 if (seqString == null)
246 seqString = seqString.replace(" ", "").replace("\n", "").replace("\t",
248 String accession = entry.getAccession();
249 SequenceI dna = new Sequence(sourceDb + "|" + accession, seqString);
251 dna.setDescription(entry.getDescription());
252 String sequenceVersion = String.valueOf(entry.getVersion().intValue());
253 DBRefEntry selfRref = new DBRefEntry(sourceDb, sequenceVersion,
255 dna.addDBRef(selfRref);
257 new Mapping(null, new int[]
258 { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1,
264 List<XrefType> xrefs = entry.getXref();
267 for (XrefType xref : xrefs)
269 String acc = xref.getId();
270 String source = DBRefUtils.getCanonicalName(xref.getDb());
271 String version = xref.getSecondaryId();
272 if (version == null || "".equals(version))
276 dna.addDBRef(new DBRefEntry(source, version, acc));
280 SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
283 List<Feature> features = entry.getFeature();
284 if (features != null)
286 for (Feature feature : features)
288 if (FeatureProperties.isCodingFeature(sourceDb,
291 parseCodingFeature(entry, feature, sourceDb, dna, peptides,
296 } catch (Exception e)
298 System.err.println("EMBL Record Features parsing error!");
300 .println("Please report the following to help@jalview.org :");
301 System.err.println("EMBL Record " + accession);
302 System.err.println("Resulted in exception: " + e.getMessage());
303 e.printStackTrace(System.err);
310 * Extracts coding region and product from a CDS feature and decorates it with
320 void parseCodingFeature(EntryType entry, Feature feature, String sourceDb,
321 SequenceI dna, List<SequenceI> peptides,
322 SequenceIdMatcher matcher)
324 final boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
325 final String accession = entry.getAccession();
326 final String sequenceVersion = entry.getVersion().toString();
328 int[] exons = getCdsRanges(entry.getAccession(), feature);
330 String translation = null;
331 String proteinName = "";
332 String proteinId = null;
333 Map<String, String> vals = new Hashtable<>();
336 * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
337 * (phase is required for CDS features in GFF3 format)
342 * parse qualifiers, saving protein translation, protein id,
343 * codon start position, product (name), and 'other values'
345 if (feature.getQualifier() != null)
347 for (Qualifier q : feature.getQualifier())
349 String qname = q.getName();
350 String value = q.getValue();
351 value = value == null ? ""
352 : value.trim().replace(" ", "").replace("\n", "")
354 if (qname.equals("translation"))
358 else if (qname.equals("protein_id"))
362 else if (qname.equals("codon_start"))
366 codonStart = Integer.parseInt(value.trim());
367 } catch (NumberFormatException e)
369 System.err.println("Invalid codon_start in XML for "
370 + entry.getAccession() + ": " + e.getMessage());
373 else if (qname.equals("product"))
375 // sometimes name is returned e.g. for V00488
380 // throw anything else into the additional properties hash
381 if (!"".equals(value))
383 vals.put(qname, value);
389 DBRefEntry proteinToEmblProteinRef = null;
390 exons = MappingUtils.removeStartPositions(codonStart - 1, exons);
392 SequenceI product = null;
393 Mapping dnaToProteinMapping = null;
394 if (translation != null && proteinName != null && proteinId != null)
396 int translationLength = translation.length();
399 * look for product in peptides list, if not found, add it
401 product = matcher.findIdMatch(proteinId);
404 product = new Sequence(proteinId, translation, 1,
406 product.setDescription(((proteinName.length() == 0)
407 ? "Protein Product from " + sourceDb
409 peptides.add(product);
410 matcher.add(product);
413 // we have everything - create the mapping and perhaps the protein
415 if (exons == null || exons.length == 0)
418 * workaround until we handle dna location for CDS sequence
419 * e.g. location="X53828.1:60..1058" correctly
422 "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
423 + sourceDb + ":" + entry.getAccession() + ")");
424 int dnaLength = dna.getLength();
425 if (translationLength * 3 == (1 - codonStart + dnaLength))
428 "Not allowing for additional stop codon at end of cDNA fragment... !");
429 // this might occur for CDS sequences where no features are marked
430 exons = new int[] { dna.getStart() + (codonStart - 1),
432 dnaToProteinMapping = new Mapping(product, exons,
434 { 1, translationLength }, 3, 1);
436 if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength))
439 "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
440 exons = new int[] { dna.getStart() + (codonStart - 1),
442 dnaToProteinMapping = new Mapping(product, exons,
444 { 1, translationLength }, 3, 1);
449 // Trim the exon mapping if necessary - the given product may only be a
450 // fragment of a larger protein. (EMBL:AY043181 is an example)
454 // TODO: Add a DbRef back to the parent EMBL sequence with the exon
456 // if given a dataset reference, search dataset for parent EMBL
457 // sequence if it exists and set its map
458 // make a new feature annotating the coding contig
462 // final product length truncation check
463 int[] cdsRanges = adjustForProteinLength(translationLength,
465 dnaToProteinMapping = new Mapping(product, cdsRanges,
467 { 1, translationLength }, 3, 1);
471 * make xref with mapping from protein to EMBL dna
473 DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
474 sequenceVersion, proteinId,
475 new Mapping(dnaToProteinMapping.getMap().getInverse()));
476 product.addDBRef(proteinToEmblRef);
479 * make xref from protein to EMBLCDS; we assume here that the
480 * CDS sequence version is same as dna sequence (?!)
482 MapList proteinToCdsMapList = new MapList(
484 { 1, translationLength },
486 { 1 + (codonStart - 1),
487 (codonStart - 1) + 3 * translationLength },
489 DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
490 DBRefSource.EMBLCDS, sequenceVersion, proteinId,
491 new Mapping(proteinToCdsMapList));
492 product.addDBRef(proteinToEmblCdsRef);
495 * make 'direct' xref from protein to EMBLCDSPROTEIN
497 proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
498 proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
499 proteinToEmblProteinRef.setMap(null);
500 product.addDBRef(proteinToEmblProteinRef);
506 * add cds features to dna sequence
508 String cds = feature.getName(); // "CDS"
509 for (int xint = 0; exons != null
510 && xint < exons.length - 1; xint += 2)
512 int exonStart = exons[xint];
513 int exonEnd = exons[xint + 1];
514 int begin = Math.min(exonStart, exonEnd);
515 int end = Math.max(exonStart, exonEnd);
516 int exonNumber = xint / 2 + 1;
517 String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s",
518 exonNumber, proteinName, proteinId);
520 SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, sourceDb,
523 sf.setEnaLocation(feature.getLocation());
524 boolean forwardStrand = exonStart <= exonEnd;
525 sf.setStrand(forwardStrand ? "+" : "-");
526 sf.setPhase(String.valueOf(codonStart - 1));
527 sf.setValue(FeatureProperties.EXONPOS, exonNumber);
528 sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
530 dna.addSequenceFeature(sf);
535 * add feature dbRefs to sequence, and mappings for Uniprot xrefs
537 boolean hasUniprotDbref = false;
538 List<XrefType> xrefs = feature.getXref();
541 boolean mappingUsed = false;
542 for (XrefType xref : xrefs)
545 * ensure UniProtKB/Swiss-Prot converted to UNIPROT
547 String source = DBRefUtils.getCanonicalName(xref.getDb());
548 String version = xref.getSecondaryId();
549 if (version == null || "".equals(version))
553 DBRefEntry dbref = new DBRefEntry(source, version, xref.getId());
554 DBRefEntry proteinDbRef = new DBRefEntry(source, version,
555 dbref.getAccessionId());
556 if (source.equals(DBRefSource.UNIPROT))
558 String proteinSeqName = DBRefSource.UNIPROT + "|"
559 + dbref.getAccessionId();
560 if (dnaToProteinMapping != null
561 && dnaToProteinMapping.getTo() != null)
566 * two or more Uniprot xrefs for the same CDS -
567 * each needs a distinct Mapping (as to a different sequence)
569 dnaToProteinMapping = new Mapping(dnaToProteinMapping);
574 * try to locate the protein mapped to (possibly by a
575 * previous CDS feature); if not found, construct it from
576 * the EMBL translation
578 SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
579 if (proteinSeq == null)
581 proteinSeq = new Sequence(proteinSeqName,
582 product.getSequenceAsString());
583 matcher.add(proteinSeq);
584 proteinSeq.setDescription(product.getDescription());
585 peptides.add(proteinSeq);
587 dnaToProteinMapping.setTo(proteinSeq);
588 dnaToProteinMapping.setMappedFromId(proteinId);
589 proteinSeq.addDBRef(proteinDbRef);
590 dbref.setMap(dnaToProteinMapping);
592 hasUniprotDbref = true;
597 * copy feature dbref to our protein product
599 DBRefEntry pref = proteinDbRef;
600 pref.setMap(null); // reference is direct
601 product.addDBRef(pref);
602 // Add converse mapping reference
603 if (dnaToProteinMapping != null)
605 Mapping pmap = new Mapping(dna,
606 dnaToProteinMapping.getMap().getInverse());
607 pref = new DBRefEntry(sourceDb, sequenceVersion, accession);
609 if (dnaToProteinMapping.getTo() != null)
611 dnaToProteinMapping.getTo().addDBRef(pref);
620 * if we have a product (translation) but no explicit Uniprot dbref
621 * (example: EMBL AAFI02000057 protein_id EAL65544.1)
622 * then construct mappings to an assumed EMBLCDSPROTEIN accession
624 if (!hasUniprotDbref && product != null)
626 if (proteinToEmblProteinRef == null)
628 // assuming CDSPROTEIN sequence version = dna version (?!)
629 proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct,
630 sequenceVersion, proteinId);
632 product.addDBRef(proteinToEmblProteinRef);
634 if (dnaToProteinMapping != null
635 && dnaToProteinMapping.getTo() != null)
637 DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
638 DBRefSource.EMBLCDSProduct, sequenceVersion, proteinId);
639 dnaToEmblProteinRef.setMap(dnaToProteinMapping);
640 dnaToProteinMapping.setMappedFromId(proteinId);
641 dna.addDBRef(dnaToEmblProteinRef);
647 public boolean isDnaCoding()
653 * Returns the CDS positions as a single array of [start, end, start, end...]
654 * positions. If on the reverse strand, these will be in descending order.
660 protected int[] getCdsRanges(String accession, Feature feature)
662 String location = feature.getLocation();
663 if (location == null)
670 List<int[]> ranges = DnaUtils.parseLocation(location);
671 return listToArray(ranges);
672 } catch (ParseException e)
675 String.format("Not parsing inexact CDS location %s in ENA %s",
676 location, accession));
682 * Converts a list of [start, end] ranges to a single array of [start, end,
688 int[] listToArray(List<int[]> ranges)
690 int[] result = new int[ranges.size() * 2];
692 for (int[] range : ranges)
694 result[i++] = range[0];
695 result[i++] = range[1];
701 * Helper method to construct a SequenceFeature for one cds range
704 * feature type ("CDS")
714 * map of 'miscellaneous values' for feature
717 protected SequenceFeature makeCdsFeature(String type, String desc,
718 int begin, int end, String group, Map<String, String> vals)
720 SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group);
723 for (Entry<String, String> val : vals.entrySet())
725 sf.setValue(val.getKey(), val.getValue());
732 public String getAccessionSeparator()
738 public Regex getAccessionValidator()
740 return ACCESSION_REGEX;
744 public String getDbVersion()
756 public boolean isValidReference(String accession)
758 if (accession == null || accession.length() < 2)
762 return getAccessionValidator().search(accession);
766 * Truncates (if necessary) the exon intervals to match 3 times the length of
767 * the protein (including truncation for stop codon included in exon)
769 * @param proteinLength
771 * an array of [start, end, start, end...] intervals
772 * @return the same array (if unchanged) or a truncated copy
774 static int[] adjustForProteinLength(int proteinLength, int[] exon)
776 if (proteinLength <= 0 || exon == null)
780 int expectedCdsLength = proteinLength * 3;
781 int exonLength = MappingUtils.getLength(Arrays.asList(exon));
784 * if exon length matches protein, or is shorter, then leave it unchanged
786 if (expectedCdsLength >= exonLength)
794 origxon = new int[exon.length];
795 System.arraycopy(exon, 0, origxon, 0, exon.length);
797 for (int x = 0; x < exon.length; x += 2)
799 cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
800 if (expectedCdsLength <= cdspos)
802 // advanced beyond last codon.
804 if (expectedCdsLength != cdspos)
807 // .println("Truncating final exon interval on region by "
808 // + (cdspos - cdslength));
812 * shrink the final exon - reduce end position if forward
813 * strand, increase it if reverse
815 if (exon[x + 1] >= exon[x])
817 endxon = exon[x + 1] - cdspos + expectedCdsLength;
821 endxon = exon[x + 1] + cdspos - expectedCdsLength;
829 // and trim the exon interval set if necessary
830 int[] nxon = new int[sxpos + 2];
831 System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
832 nxon[sxpos + 1] = endxon; // update the end boundary for the new exon