2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.ws.dbsources;
24 import java.io.FileInputStream;
25 import java.io.InputStream;
26 import java.text.ParseException;
27 import java.util.ArrayList;
28 import java.util.Arrays;
29 import java.util.Hashtable;
30 import java.util.List;
32 import java.util.Map.Entry;
34 import javax.xml.bind.JAXBContext;
35 import javax.xml.bind.JAXBElement;
36 import javax.xml.bind.JAXBException;
37 import javax.xml.stream.FactoryConfigurationError;
38 import javax.xml.stream.XMLInputFactory;
39 import javax.xml.stream.XMLStreamException;
40 import javax.xml.stream.XMLStreamReader;
42 import com.stevesoft.pat.Regex;
44 import jalview.analysis.SequenceIdMatcher;
45 import jalview.bin.Cache;
46 import jalview.datamodel.Alignment;
47 import jalview.datamodel.AlignmentI;
48 import jalview.datamodel.DBRefEntry;
49 import jalview.datamodel.DBRefSource;
50 import jalview.datamodel.FeatureProperties;
51 import jalview.datamodel.Mapping;
52 import jalview.datamodel.Sequence;
53 import jalview.datamodel.SequenceFeature;
54 import jalview.datamodel.SequenceI;
55 import jalview.util.DBRefUtils;
56 import jalview.util.DnaUtils;
57 import jalview.util.MapList;
58 import jalview.util.MappingUtils;
59 import jalview.ws.ebi.EBIFetchClient;
60 import jalview.xml.binding.embl.EntryType;
61 import jalview.xml.binding.embl.EntryType.Feature;
62 import jalview.xml.binding.embl.EntryType.Feature.Qualifier;
63 import jalview.xml.binding.embl.ROOT;
64 import jalview.xml.binding.embl.XrefType;
67 * Provides XML binding and parsing of EMBL or EMBLCDS records retrieved from
68 * (e.g.) {@code https://www.ebi.ac.uk/ena/data/view/x53828&display=xml}.
70 * @deprecated endpoint withdrawn August 2020 (JAL-3692), use EmblFlatfileSource
73 public abstract class EmblXmlSource extends EbiFileRetrievedProxy
75 private static final Regex ACCESSION_REGEX = new Regex("^[A-Z]+[0-9]+");
78 * JAL-1856 Embl returns this text for query not found
80 private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found.";
82 public EmblXmlSource()
88 * Retrieves and parses an emblxml file, and returns an alignment containing
89 * the parsed sequences, or null if none were found
92 * "EMBL" or "EMBLCDS" - anything else will not retrieve emblxml
97 protected AlignmentI getEmblSequenceRecords(String emprefx, String query)
101 EBIFetchClient dbFetch = new EBIFetchClient();
105 reply = dbFetch.fetchDataAsFile(
106 emprefx.toLowerCase() + ":" + query.trim(), "display=xml",
108 } catch (Exception e)
112 String.format("EBI EMBL XML retrieval failed for %s:%s",
113 emprefx.toLowerCase(), query.trim()),
116 return getEmblSequenceRecords(emprefx, query, reply);
120 * parse an emblxml file stored locally
123 * either EMBL or EMBLCDS strings are allowed - anything else will
124 * not retrieve emblxml
127 * the EMBL XML file containing the results of a query
131 protected AlignmentI getEmblSequenceRecords(String emprefx, String query,
132 File reply) throws Exception
134 List<EntryType> entries = null;
135 if (reply != null && reply.exists())
137 file = reply.getAbsolutePath();
138 if (reply.length() > EMBL_NOT_FOUND_REPLY.length())
140 InputStream is = new FileInputStream(reply);
141 entries = getEmblEntries(is);
146 * invalid accession gets a reply with no <entry> elements, text content of
147 * EmbFile reads something like (e.g.) this ungrammatical phrase
148 * Entry: <acc> display type is either not supported or entry is not found.
150 AlignmentI al = null;
151 List<SequenceI> seqs = new ArrayList<>();
152 List<SequenceI> peptides = new ArrayList<>();
155 for (EntryType entry : entries)
157 SequenceI seq = getSequence(emprefx, entry, peptides);
160 seqs.add(seq.deriveSequence());
161 // place DBReferences on dataset and refer
166 al = new Alignment(seqs.toArray(new SequenceI[seqs.size()]));
171 "No record found for '" + emprefx + ":" + query + "'");
180 * Reads the XML reply from file and unmarshals it to Java objects. Answers a
181 * (possibly empty) list of <code>EntryType</code> objects.
187 List<EntryType> getEmblEntries(InputStream is)
189 List<EntryType> entries = new ArrayList<>();
192 JAXBContext jc = JAXBContext.newInstance("jalview.xml.binding.embl");
193 XMLStreamReader streamReader = XMLInputFactory.newInstance()
194 .createXMLStreamReader(is);
195 javax.xml.bind.Unmarshaller um = jc.createUnmarshaller();
196 JAXBElement<ROOT> rootElement = um.unmarshal(streamReader,
198 ROOT root = rootElement.getValue();
201 * document root contains either "entry" or "entrySet"
207 if (root.getEntrySet() != null)
209 entries = root.getEntrySet().getEntry();
211 else if (root.getEntry() != null)
213 entries.add(root.getEntry());
215 } catch (JAXBException | XMLStreamException
216 | FactoryConfigurationError e)
224 * A helper method to parse XML data and construct a sequence, with any
225 * available database references and features
232 SequenceI getSequence(String sourceDb, EntryType entry,
233 List<SequenceI> peptides)
235 String seqString = entry.getSequence();
236 if (seqString == null)
240 seqString = seqString.replace(" ", "").replace("\n", "").replace("\t",
242 String accession = entry.getAccession();
243 SequenceI dna = new Sequence(sourceDb + "|" + accession, seqString);
245 dna.setDescription(entry.getDescription());
246 String sequenceVersion = String.valueOf(entry.getVersion().intValue());
247 DBRefEntry selfRref = new DBRefEntry(sourceDb, sequenceVersion,
249 dna.addDBRef(selfRref);
251 new Mapping(null, new int[]
252 { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1,
258 List<XrefType> xrefs = entry.getXref();
261 for (XrefType xref : xrefs)
263 String acc = xref.getId();
264 String source = DBRefUtils.getCanonicalName(xref.getDb());
265 String version = xref.getSecondaryId();
266 if (version == null || "".equals(version))
270 dna.addDBRef(new DBRefEntry(source, version, acc));
274 SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
277 List<Feature> features = entry.getFeature();
278 if (features != null)
280 for (Feature feature : features)
282 if (FeatureProperties.isCodingFeature(sourceDb,
285 parseCodingFeature(entry, feature, sourceDb, dna, peptides,
290 } catch (Exception e)
292 System.err.println("EMBL Record Features parsing error!");
294 .println("Please report the following to help@jalview.org :");
295 System.err.println("EMBL Record " + accession);
296 System.err.println("Resulted in exception: " + e.getMessage());
297 e.printStackTrace(System.err);
304 * Extracts coding region and product from a CDS feature and decorates it with
314 void parseCodingFeature(EntryType entry, Feature feature, String sourceDb,
315 SequenceI dna, List<SequenceI> peptides,
316 SequenceIdMatcher matcher)
318 final boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
319 final String accession = entry.getAccession();
320 final String sequenceVersion = entry.getVersion().toString();
322 int[] exons = getCdsRanges(entry.getAccession(), feature);
324 String translation = null;
325 String proteinName = "";
326 String proteinId = null;
327 Map<String, String> vals = new Hashtable<>();
330 * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
331 * (phase is required for CDS features in GFF3 format)
336 * parse qualifiers, saving protein translation, protein id,
337 * codon start position, product (name), and 'other values'
339 if (feature.getQualifier() != null)
341 for (Qualifier q : feature.getQualifier())
343 String qname = q.getName();
344 String value = q.getValue();
345 value = value == null ? ""
346 : value.trim().replace(" ", "").replace("\n", "")
348 if (qname.equals("translation"))
352 else if (qname.equals("protein_id"))
356 else if (qname.equals("codon_start"))
360 codonStart = Integer.parseInt(value.trim());
361 } catch (NumberFormatException e)
363 System.err.println("Invalid codon_start in XML for "
364 + entry.getAccession() + ": " + e.getMessage());
367 else if (qname.equals("product"))
369 // sometimes name is returned e.g. for V00488
374 // throw anything else into the additional properties hash
375 if (!"".equals(value))
377 vals.put(qname, value);
383 DBRefEntry proteinToEmblProteinRef = null;
384 exons = MappingUtils.removeStartPositions(codonStart - 1, exons);
386 SequenceI product = null;
387 Mapping dnaToProteinMapping = null;
388 if (translation != null && proteinName != null && proteinId != null)
390 int translationLength = translation.length();
393 * look for product in peptides list, if not found, add it
395 product = matcher.findIdMatch(proteinId);
398 product = new Sequence(proteinId, translation, 1,
400 product.setDescription(((proteinName.length() == 0)
401 ? "Protein Product from " + sourceDb
403 peptides.add(product);
404 matcher.add(product);
407 // we have everything - create the mapping and perhaps the protein
409 if (exons == null || exons.length == 0)
412 * workaround until we handle dna location for CDS sequence
413 * e.g. location="X53828.1:60..1058" correctly
416 "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
417 + sourceDb + ":" + entry.getAccession() + ")");
418 int dnaLength = dna.getLength();
419 if (translationLength * 3 == (1 - codonStart + dnaLength))
422 "Not allowing for additional stop codon at end of cDNA fragment... !");
423 // this might occur for CDS sequences where no features are marked
424 exons = new int[] { dna.getStart() + (codonStart - 1),
426 dnaToProteinMapping = new Mapping(product, exons,
428 { 1, translationLength }, 3, 1);
430 if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength))
433 "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
434 exons = new int[] { dna.getStart() + (codonStart - 1),
436 dnaToProteinMapping = new Mapping(product, exons,
438 { 1, translationLength }, 3, 1);
443 // Trim the exon mapping if necessary - the given product may only be a
444 // fragment of a larger protein. (EMBL:AY043181 is an example)
448 // TODO: Add a DbRef back to the parent EMBL sequence with the exon
450 // if given a dataset reference, search dataset for parent EMBL
451 // sequence if it exists and set its map
452 // make a new feature annotating the coding contig
456 // final product length truncation check
457 int [] exons2 = adjustForProteinLength(translationLength,
459 dnaToProteinMapping = new Mapping(product, exons2,
461 { 1, translationLength }, 3, 1);
465 * make xref with mapping from protein to EMBL dna
467 DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
468 sequenceVersion, proteinId,
469 new Mapping(dnaToProteinMapping.getMap().getInverse()));
470 product.addDBRef(proteinToEmblRef);
473 * make xref from protein to EMBLCDS; we assume here that the
474 * CDS sequence version is same as dna sequence (?!)
476 MapList proteinToCdsMapList = new MapList(
478 { 1, translationLength },
480 { 1 + (codonStart - 1),
481 (codonStart - 1) + 3 * translationLength },
483 DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
484 DBRefSource.EMBLCDS, sequenceVersion, proteinId,
485 new Mapping(proteinToCdsMapList));
486 product.addDBRef(proteinToEmblCdsRef);
489 * make 'direct' xref from protein to EMBLCDSPROTEIN
491 proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
492 proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
493 proteinToEmblProteinRef.setMap(null);
494 product.addDBRef(proteinToEmblProteinRef);
500 * add cds features to dna sequence
502 String cds = feature.getName(); // "CDS"
503 for (int xint = 0; exons != null
504 && xint < exons.length - 1; xint += 2)
506 int exonStart = exons[xint];
507 int exonEnd = exons[xint + 1];
508 int begin = Math.min(exonStart, exonEnd);
509 int end = Math.max(exonStart, exonEnd);
510 int exonNumber = xint / 2 + 1;
511 String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s",
512 exonNumber, proteinName, proteinId);
514 SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, sourceDb,
517 sf.setEnaLocation(feature.getLocation());
518 boolean forwardStrand = exonStart <= exonEnd;
519 sf.setStrand(forwardStrand ? "+" : "-");
520 sf.setPhase(String.valueOf(codonStart - 1));
521 sf.setValue(FeatureProperties.EXONPOS, exonNumber);
522 sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
524 dna.addSequenceFeature(sf);
529 * add feature dbRefs to sequence, and mappings for Uniprot xrefs
531 boolean hasUniprotDbref = false;
532 List<XrefType> xrefs = feature.getXref();
535 boolean mappingUsed = false;
536 for (XrefType xref : xrefs)
539 * ensure UniProtKB/Swiss-Prot converted to UNIPROT
541 String source = DBRefUtils.getCanonicalName(xref.getDb());
542 String version = xref.getSecondaryId();
543 if (version == null || "".equals(version))
547 DBRefEntry dbref = new DBRefEntry(source, version, xref.getId());
548 DBRefEntry proteinDbRef = new DBRefEntry(source, version,
549 dbref.getAccessionId());
550 if (source.equals(DBRefSource.UNIPROT))
552 String proteinSeqName = DBRefSource.UNIPROT + "|"
553 + dbref.getAccessionId();
554 if (dnaToProteinMapping != null
555 && dnaToProteinMapping.getTo() != null)
560 * two or more Uniprot xrefs for the same CDS -
561 * each needs a distinct Mapping (as to a different sequence)
563 dnaToProteinMapping = new Mapping(dnaToProteinMapping);
568 * try to locate the protein mapped to (possibly by a
569 * previous CDS feature); if not found, construct it from
570 * the EMBL translation
572 SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
573 if (proteinSeq == null)
575 proteinSeq = new Sequence(proteinSeqName,
576 product.getSequenceAsString());
577 matcher.add(proteinSeq);
578 proteinSeq.setDescription(product.getDescription());
579 peptides.add(proteinSeq);
581 dnaToProteinMapping.setTo(proteinSeq);
582 dnaToProteinMapping.setMappedFromId(proteinId);
583 proteinSeq.addDBRef(proteinDbRef);
584 dbref.setMap(dnaToProteinMapping);
586 hasUniprotDbref = true;
591 * copy feature dbref to our protein product
593 DBRefEntry pref = proteinDbRef;
594 pref.setMap(null); // reference is direct
595 product.addDBRef(pref);
596 // Add converse mapping reference
597 if (dnaToProteinMapping != null)
599 Mapping pmap = new Mapping(dna,
600 dnaToProteinMapping.getMap().getInverse());
601 pref = new DBRefEntry(sourceDb, sequenceVersion, accession);
603 if (dnaToProteinMapping.getTo() != null)
605 dnaToProteinMapping.getTo().addDBRef(pref);
614 * if we have a product (translation) but no explicit Uniprot dbref
615 * (example: EMBL AAFI02000057 protein_id EAL65544.1)
616 * then construct mappings to an assumed EMBLCDSPROTEIN accession
618 if (!hasUniprotDbref && product != null)
620 if (proteinToEmblProteinRef == null)
622 // assuming CDSPROTEIN sequence version = dna version (?!)
623 proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct,
624 sequenceVersion, proteinId);
626 product.addDBRef(proteinToEmblProteinRef);
628 if (dnaToProteinMapping != null
629 && dnaToProteinMapping.getTo() != null)
631 DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
632 DBRefSource.EMBLCDSProduct, sequenceVersion, proteinId);
633 dnaToEmblProteinRef.setMap(dnaToProteinMapping);
634 dnaToProteinMapping.setMappedFromId(proteinId);
635 dna.addDBRef(dnaToEmblProteinRef);
641 public boolean isDnaCoding()
647 * Returns the CDS positions as a single array of [start, end, start, end...]
648 * positions. If on the reverse strand, these will be in descending order.
654 protected int[] getCdsRanges(String accession, Feature feature)
656 String location = feature.getLocation();
657 if (location == null)
664 List<int[]> ranges = DnaUtils.parseLocation(location);
665 return listToArray(ranges);
666 } catch (ParseException e)
669 String.format("Not parsing inexact CDS location %s in ENA %s",
670 location, accession));
676 * Converts a list of [start, end] ranges to a single array of [start, end,
682 int[] listToArray(List<int[]> ranges)
684 int[] result = new int[ranges.size() * 2];
686 for (int[] range : ranges)
688 result[i++] = range[0];
689 result[i++] = range[1];
695 * Helper method to construct a SequenceFeature for one cds range
698 * feature type ("CDS")
708 * map of 'miscellaneous values' for feature
711 protected SequenceFeature makeCdsFeature(String type, String desc,
712 int begin, int end, String group, Map<String, String> vals)
714 SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group);
717 for (Entry<String, String> val : vals.entrySet())
719 sf.setValue(val.getKey(), val.getValue());
726 public String getAccessionSeparator()
732 public Regex getAccessionValidator()
734 return ACCESSION_REGEX;
738 public String getDbVersion()
750 public boolean isValidReference(String accession)
752 if (accession == null || accession.length() < 2)
756 return getAccessionValidator().search(accession);
760 * Truncates (if necessary) the exon intervals to match 3 times the length of
761 * the protein (including truncation for stop codon included in exon)
763 * @param proteinLength
765 * an array of [start, end, start, end...] intervals
766 * @return the same array (if unchanged) or a truncated copy
768 static int[] adjustForProteinLength(int proteinLength, int[] exon)
770 if (proteinLength <= 0 || exon == null)
774 int expectedCdsLength = proteinLength * 3;
775 int exonLength = MappingUtils.getLength(Arrays.asList(exon));
778 * if exon length matches protein, or is shorter, then leave it unchanged
780 if (expectedCdsLength >= exonLength)
788 origxon = new int[exon.length];
789 System.arraycopy(exon, 0, origxon, 0, exon.length);
791 for (int x = 0; x < exon.length; x += 2)
793 cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
794 if (expectedCdsLength <= cdspos)
796 // advanced beyond last codon.
798 if (expectedCdsLength != cdspos)
801 // .println("Truncating final exon interval on region by "
802 // + (cdspos - cdslength));
806 * shrink the final exon - reduce end position if forward
807 * strand, increase it if reverse
809 if (exon[x + 1] >= exon[x])
811 endxon = exon[x + 1] - cdspos + expectedCdsLength;
815 endxon = exon[x + 1] + cdspos - expectedCdsLength;
823 // and trim the exon interval set if necessary
824 int[] nxon = new int[sxpos + 2];
825 System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
826 nxon[sxpos + 1] = endxon; // update the end boundary for the new exon