2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.ws.dbsources;
23 import jalview.analysis.SequenceIdMatcher;
24 import jalview.bin.Cache;
25 import jalview.datamodel.Alignment;
26 import jalview.datamodel.AlignmentI;
27 import jalview.datamodel.DBRefEntry;
28 import jalview.datamodel.DBRefSource;
29 import jalview.datamodel.FeatureProperties;
30 import jalview.datamodel.Mapping;
31 import jalview.datamodel.Sequence;
32 import jalview.datamodel.SequenceFeature;
33 import jalview.datamodel.SequenceI;
34 import jalview.util.DBRefUtils;
35 import jalview.util.DnaUtils;
36 import jalview.util.MapList;
37 import jalview.util.MappingUtils;
38 import jalview.util.MessageManager;
39 import jalview.ws.ebi.EBIFetchClient;
40 import jalview.xml.binding.embl.EntryType;
41 import jalview.xml.binding.embl.EntryType.Feature;
42 import jalview.xml.binding.embl.EntryType.Feature.Qualifier;
43 import jalview.xml.binding.embl.ROOT;
44 import jalview.xml.binding.embl.XrefType;
47 import java.io.FileInputStream;
48 import java.io.InputStream;
49 import java.text.ParseException;
50 import java.util.ArrayList;
51 import java.util.Arrays;
52 import java.util.Hashtable;
53 import java.util.List;
55 import java.util.Map.Entry;
56 import java.util.regex.Pattern;
58 import javax.xml.bind.JAXBContext;
59 import javax.xml.bind.JAXBElement;
60 import javax.xml.bind.JAXBException;
61 import javax.xml.stream.FactoryConfigurationError;
62 import javax.xml.stream.XMLInputFactory;
63 import javax.xml.stream.XMLStreamException;
64 import javax.xml.stream.XMLStreamReader;
66 public abstract class EmblXmlSource extends EbiFileRetrievedProxy
69 * JAL-1856 Embl returns this text for query not found
71 private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found.";
73 private static final Pattern SPACE_PATTERN = Pattern.compile(" ");
75 public EmblXmlSource()
81 * Retrieves and parses an emblxml file, and returns an alignment containing
82 * the parsed sequences, or null if none were found
85 * "EMBL" or "EMBLCDS" - anything else will not retrieve emblxml
90 protected AlignmentI getEmblSequenceRecords(String emprefx, String query)
94 EBIFetchClient dbFetch = new EBIFetchClient();
98 reply = dbFetch.fetchDataAsFile(
99 emprefx.toLowerCase() + ":" + query.trim(), "display=xml",
101 } catch (Exception e)
104 throw new Exception(MessageManager.formatMessage(
105 "exception.ebiembl_retrieval_failed_on", new String[]
106 { emprefx.toLowerCase(), query.trim() }), e);
108 return getEmblSequenceRecords(emprefx, query, reply);
112 * parse an emblxml file stored locally
115 * either EMBL or EMBLCDS strings are allowed - anything else will
116 * not retrieve emblxml
119 * the EMBL XML file containing the results of a query
123 protected AlignmentI getEmblSequenceRecords(String emprefx, String query,
124 File reply) throws Exception
126 List<EntryType> entries = null;
127 if (reply != null && reply.exists())
129 file = reply.getAbsolutePath();
130 if (reply.length() > EMBL_NOT_FOUND_REPLY.length())
132 InputStream is = new FileInputStream(reply);
133 entries = getEmblEntries(is);
138 * invalid accession gets a reply with no <entry> elements, text content of
139 * EmbFile reads something like (e.g.) this ungrammatical phrase
140 * Entry: <acc> display type is either not supported or entry is not found.
142 AlignmentI al = null;
143 List<SequenceI> seqs = new ArrayList<>();
144 List<SequenceI> peptides = new ArrayList<>();
147 for (EntryType entry : entries)
149 SequenceI seq = getSequence(emprefx, entry, peptides);
152 seqs.add(seq.deriveSequence());
153 // place DBReferences on dataset and refer
158 al = new Alignment(seqs.toArray(new SequenceI[seqs.size()]));
163 "No record found for '" + emprefx + ":" + query + "'");
172 * Reads the XML reply from file and unmarshals it to Java objects. Answers a
173 * (possibly empty) list of <code>EntryType</code> objects.
179 List<EntryType> getEmblEntries(InputStream is)
181 List<EntryType> entries = new ArrayList<>();
184 JAXBContext jc = JAXBContext.newInstance("jalview.xml.binding.embl");
185 XMLStreamReader streamReader = XMLInputFactory.newInstance()
186 .createXMLStreamReader(is);
187 javax.xml.bind.Unmarshaller um = jc.createUnmarshaller();
188 JAXBElement<ROOT> rootElement = um.unmarshal(streamReader, ROOT.class);
189 ROOT root = rootElement.getValue();
192 * document root contains either "entry" or "entrySet"
198 if (root.getEntrySet() != null)
200 entries = root.getEntrySet().getEntry();
202 else if (root.getEntry() != null)
204 entries.add(root.getEntry());
206 } catch (JAXBException | XMLStreamException
207 | FactoryConfigurationError e)
215 * A helper method to parse XML data and construct a sequence, with any
216 * available database references and features
223 SequenceI getSequence(String sourceDb, EntryType entry,
224 List<SequenceI> peptides)
226 String seqString = entry.getSequence();
227 if (seqString == null)
231 seqString = seqString.replace(" ", "").replace("\n", "").replace("\t",
233 String accession = entry.getAccession();
234 SequenceI dna = new Sequence(sourceDb + "|" + accession, seqString);
236 dna.setDescription(entry.getDescription());
237 String sequenceVersion = String.valueOf(entry.getVersion().intValue());
238 DBRefEntry selfRref = new DBRefEntry(sourceDb, sequenceVersion,
240 dna.addDBRef(selfRref);
242 new Mapping(null, new int[]
243 { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1,
249 List<XrefType> xrefs = entry.getXref();
252 for (XrefType xref : xrefs)
254 String acc = xref.getId();
255 String source = DBRefUtils.getCanonicalName(xref.getDb());
256 String version = xref.getSecondaryId();
257 if (version == null || "".equals(version))
261 dna.addDBRef(new DBRefEntry(source, version, acc));
265 SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
268 List<Feature> features = entry.getFeature();
269 if (features != null)
271 for (Feature feature : features)
273 if (FeatureProperties.isCodingFeature(sourceDb,
276 parseCodingFeature(entry, feature, sourceDb, dna, peptides,
281 } catch (Exception e)
283 System.err.println("EMBL Record Features parsing error!");
285 .println("Please report the following to help@jalview.org :");
286 System.err.println("EMBL Record " + accession);
287 System.err.println("Resulted in exception: " + e.getMessage());
288 e.printStackTrace(System.err);
295 * Extracts coding region and product from a CDS feature and decorates it with
305 void parseCodingFeature(EntryType entry, Feature feature, String sourceDb,
306 SequenceI dna, List<SequenceI> peptides,
307 SequenceIdMatcher matcher)
309 final boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
310 final String accession = entry.getAccession();
311 final String sequenceVersion = entry.getVersion().toString();
313 int[] exons = getCdsRanges(entry.getAccession(), feature);
315 String translation = null;
316 String proteinName = "";
317 String proteinId = null;
318 Map<String, String> vals = new Hashtable<>();
321 * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
322 * (phase is required for CDS features in GFF3 format)
327 * parse qualifiers, saving protein translation, protein id,
328 * codon start position, product (name), and 'other values'
330 if (feature.getQualifier() != null)
332 for (Qualifier q : feature.getQualifier())
334 String qname = q.getName();
335 String value = q.getValue();
336 value = value == null ? ""
337 : value.trim().replace(" ", "").replace("\n", "")
339 if (qname.equals("translation"))
343 else if (qname.equals("protein_id"))
347 else if (qname.equals("codon_start"))
351 codonStart = Integer.parseInt(value.trim());
352 } catch (NumberFormatException e)
354 System.err.println("Invalid codon_start in XML for "
355 + entry.getAccession() + ": " + e.getMessage());
358 else if (qname.equals("product"))
360 // sometimes name is returned e.g. for V00488
365 // throw anything else into the additional properties hash
366 if (!"".equals(value))
368 vals.put(qname, value);
374 DBRefEntry proteinToEmblProteinRef = null;
375 exons = MappingUtils.removeStartPositions(codonStart - 1, exons);
377 SequenceI product = null;
378 Mapping dnaToProteinMapping = null;
379 if (translation != null && proteinName != null && proteinId != null)
381 int translationLength = translation.length();
384 * look for product in peptides list, if not found, add it
386 product = matcher.findIdMatch(proteinId);
389 product = new Sequence(proteinId, translation, 1,
391 product.setDescription(((proteinName.length() == 0)
392 ? "Protein Product from " + sourceDb
394 peptides.add(product);
395 matcher.add(product);
398 // we have everything - create the mapping and perhaps the protein
400 if (exons == null || exons.length == 0)
403 * workaround until we handle dna location for CDS sequence
404 * e.g. location="X53828.1:60..1058" correctly
407 "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
408 + sourceDb + ":" + entry.getAccession() + ")");
409 int dnaLength = dna.getLength();
410 if (translationLength * 3 == (1 - codonStart + dnaLength))
413 "Not allowing for additional stop codon at end of cDNA fragment... !");
414 // this might occur for CDS sequences where no features are marked
415 exons = new int[] { dna.getStart() + (codonStart - 1),
417 dnaToProteinMapping = new Mapping(product, exons,
419 { 1, translationLength }, 3, 1);
421 if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength))
424 "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
425 exons = new int[] { dna.getStart() + (codonStart - 1),
427 dnaToProteinMapping = new Mapping(product, exons,
429 { 1, translationLength }, 3, 1);
434 // Trim the exon mapping if necessary - the given product may only be a
435 // fragment of a larger protein. (EMBL:AY043181 is an example)
439 // TODO: Add a DbRef back to the parent EMBL sequence with the exon
441 // if given a dataset reference, search dataset for parent EMBL
442 // sequence if it exists and set its map
443 // make a new feature annotating the coding contig
447 // final product length truncation check
448 int[] cdsRanges = adjustForProteinLength(translationLength,
450 dnaToProteinMapping = new Mapping(product, cdsRanges,
452 { 1, translationLength }, 3, 1);
456 * make xref with mapping from protein to EMBL dna
458 DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
459 sequenceVersion, proteinId,
460 new Mapping(dnaToProteinMapping.getMap().getInverse()));
461 product.addDBRef(proteinToEmblRef);
464 * make xref from protein to EMBLCDS; we assume here that the
465 * CDS sequence version is same as dna sequence (?!)
467 MapList proteinToCdsMapList = new MapList(
469 { 1, translationLength },
471 { 1 + (codonStart - 1),
472 (codonStart - 1) + 3 * translationLength },
474 DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
475 DBRefSource.EMBLCDS, sequenceVersion, proteinId,
476 new Mapping(proteinToCdsMapList));
477 product.addDBRef(proteinToEmblCdsRef);
480 * make 'direct' xref from protein to EMBLCDSPROTEIN
482 proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
483 proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
484 proteinToEmblProteinRef.setMap(null);
485 product.addDBRef(proteinToEmblProteinRef);
491 * add cds features to dna sequence
493 String cds = feature.getName(); // "CDS"
494 for (int xint = 0; exons != null
495 && xint < exons.length - 1; xint += 2)
497 int exonStart = exons[xint];
498 int exonEnd = exons[xint + 1];
499 int begin = Math.min(exonStart, exonEnd);
500 int end = Math.max(exonStart, exonEnd);
501 int exonNumber = xint / 2 + 1;
502 String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s",
503 exonNumber, proteinName, proteinId);
505 SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, sourceDb,
508 sf.setEnaLocation(feature.getLocation());
509 boolean forwardStrand = exonStart <= exonEnd;
510 sf.setStrand(forwardStrand ? "+" : "-");
511 sf.setPhase(String.valueOf(codonStart - 1));
512 sf.setValue(FeatureProperties.EXONPOS, exonNumber);
513 sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
515 dna.addSequenceFeature(sf);
520 * add feature dbRefs to sequence, and mappings for Uniprot xrefs
522 boolean hasUniprotDbref = false;
523 List<XrefType> xrefs = feature.getXref();
526 boolean mappingUsed = false;
527 for (XrefType xref : xrefs)
530 * ensure UniProtKB/Swiss-Prot converted to UNIPROT
532 String source = DBRefUtils.getCanonicalName(xref.getDb());
533 String version = xref.getSecondaryId();
534 if (version == null || "".equals(version))
538 DBRefEntry dbref = new DBRefEntry(source, version, xref.getId());
539 DBRefEntry proteinDbRef = new DBRefEntry(source, version,
540 dbref.getAccessionId());
541 if (source.equals(DBRefSource.UNIPROT))
543 String proteinSeqName = DBRefSource.UNIPROT + "|"
544 + dbref.getAccessionId();
545 if (dnaToProteinMapping != null
546 && dnaToProteinMapping.getTo() != null)
551 * two or more Uniprot xrefs for the same CDS -
552 * each needs a distinct Mapping (as to a different sequence)
554 dnaToProteinMapping = new Mapping(dnaToProteinMapping);
559 * try to locate the protein mapped to (possibly by a
560 * previous CDS feature); if not found, construct it from
561 * the EMBL translation
563 SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
564 if (proteinSeq == null)
566 proteinSeq = new Sequence(proteinSeqName,
567 product.getSequenceAsString());
568 matcher.add(proteinSeq);
569 peptides.add(proteinSeq);
571 dnaToProteinMapping.setTo(proteinSeq);
572 dnaToProteinMapping.setMappedFromId(proteinId);
573 proteinSeq.addDBRef(proteinDbRef);
574 dbref.setMap(dnaToProteinMapping);
576 hasUniprotDbref = true;
581 * copy feature dbref to our protein product
583 DBRefEntry pref = proteinDbRef;
584 pref.setMap(null); // reference is direct
585 product.addDBRef(pref);
586 // Add converse mapping reference
587 if (dnaToProteinMapping != null)
589 Mapping pmap = new Mapping(dna,
590 dnaToProteinMapping.getMap().getInverse());
591 pref = new DBRefEntry(sourceDb, sequenceVersion, accession);
593 if (dnaToProteinMapping.getTo() != null)
595 dnaToProteinMapping.getTo().addDBRef(pref);
604 * if we have a product (translation) but no explicit Uniprot dbref
605 * (example: EMBL AAFI02000057 protein_id EAL65544.1)
606 * then construct mappings to an assumed EMBLCDSPROTEIN accession
608 if (!hasUniprotDbref && product != null)
610 if (proteinToEmblProteinRef == null)
612 // assuming CDSPROTEIN sequence version = dna version (?!)
613 proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct,
614 sequenceVersion, proteinId);
616 product.addDBRef(proteinToEmblProteinRef);
618 if (dnaToProteinMapping != null
619 && dnaToProteinMapping.getTo() != null)
621 DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
622 DBRefSource.EMBLCDSProduct, sequenceVersion,
624 dnaToEmblProteinRef.setMap(dnaToProteinMapping);
625 dnaToProteinMapping.setMappedFromId(proteinId);
626 dna.addDBRef(dnaToEmblProteinRef);
632 public boolean isDnaCoding()
638 * Returns the CDS positions as a single array of [start, end, start, end...]
639 * positions. If on the reverse strand, these will be in descending order.
645 protected int[] getCdsRanges(String accession, Feature feature)
647 String location = feature.getLocation();
648 if (location == null)
655 // BH 2019.10.06 note: exo-location of "X53828.1:60..1058" causes
656 // RemoteFormatTest to fail at this point.
657 List<int[]> ranges = DnaUtils.parseLocation(location);
658 return listToArray(ranges);
659 } catch (ParseException e)
662 String.format("Not parsing inexact CDS location %s in ENA %s",
663 location, accession));
669 * Converts a list of [start, end] ranges to a single array of [start, end,
675 int[] listToArray(List<int[]> ranges)
677 int[] result = new int[ranges.size() * 2];
679 for (int[] range : ranges)
681 result[i++] = range[0];
682 result[i++] = range[1];
688 * Helper method to construct a SequenceFeature for one cds range
691 * feature type ("CDS")
701 * map of 'miscellaneous values' for feature
704 protected SequenceFeature makeCdsFeature(String type, String desc,
705 int begin, int end, String group, Map<String, String> vals)
707 SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group);
710 StringBuilder sb = new StringBuilder();
711 boolean first = true;
712 for (Entry<String, String> val : vals.entrySet())
718 sb.append(val.getKey()).append("=").append(val.getValue());
720 sf.setValue(val.getKey(), val.getValue());
722 sf.setAttributes(sb.toString());
728 * Truncates (if necessary) the exon intervals to match 3 times the length of
729 * the protein; also accepts 3 bases longer (for stop codon not included in
732 * @param proteinLength
734 * an array of [start, end, start, end...] intervals
735 * @return the same array (if unchanged) or a truncated copy
737 static int[] adjustForProteinLength(int proteinLength, int[] exon)
739 if (proteinLength <= 0 || exon == null)
743 int expectedCdsLength = proteinLength * 3;
744 int exonLength = MappingUtils.getLength(Arrays.asList(exon));
747 * if exon length matches protein, or is shorter, or longer by the
748 * length of a stop codon (3 bases), then leave it unchanged
750 if (expectedCdsLength >= exonLength
751 || expectedCdsLength == exonLength - 3)
759 origxon = new int[exon.length];
760 System.arraycopy(exon, 0, origxon, 0, exon.length);
762 for (int x = 0; x < exon.length; x += 2)
764 cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
765 if (expectedCdsLength <= cdspos)
767 // advanced beyond last codon.
769 if (expectedCdsLength != cdspos)
772 // .println("Truncating final exon interval on region by "
773 // + (cdspos - cdslength));
777 * shrink the final exon - reduce end position if forward
778 * strand, increase it if reverse
780 if (exon[x + 1] >= exon[x])
782 endxon = exon[x + 1] - cdspos + expectedCdsLength;
786 endxon = exon[x + 1] + cdspos - expectedCdsLength;
794 // and trim the exon interval set if necessary
795 int[] nxon = new int[sxpos + 2];
796 System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
797 nxon[sxpos + 1] = endxon; // update the end boundary for the new exon