2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.ws.dbsources;
23 import jalview.analysis.SequenceIdMatcher;
24 import jalview.bin.Cache;
25 import jalview.datamodel.Alignment;
26 import jalview.datamodel.AlignmentI;
27 import jalview.datamodel.DBRefEntry;
28 import jalview.datamodel.DBRefSource;
29 import jalview.datamodel.FeatureProperties;
30 import jalview.datamodel.Mapping;
31 import jalview.datamodel.Sequence;
32 import jalview.datamodel.SequenceFeature;
33 import jalview.datamodel.SequenceI;
34 import jalview.util.DBRefUtils;
35 import jalview.util.DnaUtils;
36 import jalview.util.MapList;
37 import jalview.util.MappingUtils;
38 import jalview.util.MessageManager;
39 import jalview.ws.ebi.EBIFetchClient;
40 import jalview.xml.binding.embl.EntryType;
41 import jalview.xml.binding.embl.EntryType.Feature;
42 import jalview.xml.binding.embl.EntryType.Feature.Qualifier;
43 import jalview.xml.binding.embl.XrefType;
46 import java.io.FileInputStream;
47 import java.io.InputStream;
48 import java.text.ParseException;
49 import java.util.ArrayList;
50 import java.util.Arrays;
51 import java.util.Hashtable;
52 import java.util.List;
54 import java.util.Map.Entry;
55 import java.util.regex.Pattern;
57 import javax.xml.bind.JAXBContext;
58 import javax.xml.bind.JAXBException;
59 import javax.xml.stream.FactoryConfigurationError;
60 import javax.xml.stream.XMLInputFactory;
61 import javax.xml.stream.XMLStreamException;
62 import javax.xml.stream.XMLStreamReader;
64 public abstract class EmblXmlSource extends EbiFileRetrievedProxy
67 * JAL-1856 Embl returns this text for query not found
69 private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found.";
71 private static final Pattern SPACE_PATTERN = Pattern.compile(" ");
73 public EmblXmlSource()
79 * Retrieves and parses an emblxml file, and returns an alignment containing
80 * the parsed sequences, or null if none were found
83 * "EMBL" or "EMBLCDS" - anything else will not retrieve emblxml
88 protected AlignmentI getEmblSequenceRecords(String emprefx, String query)
92 EBIFetchClient dbFetch = new EBIFetchClient();
96 reply = dbFetch.fetchDataAsFile(
97 emprefx.toLowerCase() + ":" + query.trim(), "display=xml",
102 throw new Exception(MessageManager.formatMessage(
103 "exception.ebiembl_retrieval_failed_on", new String[]
104 { emprefx.toLowerCase(), query.trim() }), e);
106 return getEmblSequenceRecords(emprefx, query, reply);
110 * parse an emblxml file stored locally
113 * either EMBL or EMBLCDS strings are allowed - anything else will
114 * not retrieve emblxml
117 * the EMBL XML file containing the results of a query
121 protected AlignmentI getEmblSequenceRecords(String emprefx, String query,
122 File reply) throws Exception
124 List<EntryType> entries = null;
125 if (reply != null && reply.exists())
127 file = reply.getAbsolutePath();
128 if (reply.length() > EMBL_NOT_FOUND_REPLY.length())
130 InputStream is = new FileInputStream(reply);
131 entries = getEmblEntries(is);
136 * invalid accession gets a reply with no <entry> elements, text content of
137 * EmbFile reads something like (e.g.) this ungrammatical phrase
138 * Entry: <acc> display type is either not supported or entry is not found.
140 AlignmentI al = null;
141 List<SequenceI> seqs = new ArrayList<>();
142 List<SequenceI> peptides = new ArrayList<>();
145 for (EntryType entry : entries)
147 SequenceI seq = getSequence(emprefx, entry, peptides);
150 seqs.add(seq.deriveSequence());
151 // place DBReferences on dataset and refer
156 al = new Alignment(seqs.toArray(new SequenceI[seqs.size()]));
161 "No record found for '" + emprefx + ":" + query + "'");
170 * Reads the XML reply from file and unmarshals it to Java objects. Answers a
171 * (possibly empty) list of <code>EntryType</code> objects.
177 List<EntryType> getEmblEntries(InputStream is)
179 List<EntryType> entries = new ArrayList<>();
182 JAXBContext jc = JAXBContext.newInstance("jalview.xml.binding.embl");
183 XMLStreamReader streamReader = XMLInputFactory.newInstance()
184 .createXMLStreamReader(is);
185 javax.xml.bind.Unmarshaller um = jc.createUnmarshaller();
186 jalview.xml.binding.embl.ROOT root = (jalview.xml.binding.embl.ROOT) um
187 .unmarshal(streamReader);
190 * document root contains either "entry" or "entrySet"
196 if (root.getEntrySet() != null)
198 entries = root.getEntrySet().getEntry();
200 else if (root.getEntry() != null)
202 entries.add(root.getEntry());
204 } catch (JAXBException | XMLStreamException
205 | FactoryConfigurationError e)
213 * A helper method to parse XML data and construct a sequence, with any
214 * available database references and features
221 SequenceI getSequence(String sourceDb, EntryType entry,
222 List<SequenceI> peptides)
224 String seqString = entry.getSequence();
225 if (seqString == null)
229 seqString = seqString.replace(" ", "").replace("\n", "").replace("\t",
231 String accession = entry.getAccession();
232 SequenceI dna = new Sequence(sourceDb + "|" + accession, seqString);
234 dna.setDescription(entry.getDescription());
235 String sequenceVersion = String.valueOf(entry.getVersion().intValue());
236 DBRefEntry selfRref = new DBRefEntry(sourceDb, sequenceVersion,
238 dna.addDBRef(selfRref);
240 new Mapping(null, new int[]
241 { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1,
247 List<XrefType> xrefs = entry.getXref();
250 for (XrefType xref : xrefs)
252 String acc = xref.getId();
253 String source = DBRefUtils.getCanonicalName(xref.getDb());
254 String version = xref.getSecondaryId();
255 if (version == null || "".equals(version))
259 dna.addDBRef(new DBRefEntry(source, version, acc));
263 SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
266 List<Feature> features = entry.getFeature();
267 if (features != null)
269 for (Feature feature : features)
271 if (FeatureProperties.isCodingFeature(sourceDb,
274 parseCodingFeature(entry, feature, sourceDb, dna, peptides,
279 } catch (Exception e)
281 System.err.println("EMBL Record Features parsing error!");
283 .println("Please report the following to help@jalview.org :");
284 System.err.println("EMBL Record " + accession);
285 System.err.println("Resulted in exception: " + e.getMessage());
286 e.printStackTrace(System.err);
293 * Extracts coding region and product from a CDS feature and decorates it with
303 void parseCodingFeature(EntryType entry, Feature feature, String sourceDb,
304 SequenceI dna, List<SequenceI> peptides,
305 SequenceIdMatcher matcher)
307 final boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
308 final String accession = entry.getAccession();
309 final String sequenceVersion = entry.getVersion().toString();
311 int[] exons = getCdsRanges(entry.getAccession(), feature);
313 String translation = null;
314 String proteinName = "";
315 String proteinId = null;
316 Map<String, String> vals = new Hashtable<>();
319 * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
320 * (phase is required for CDS features in GFF3 format)
325 * parse qualifiers, saving protein translation, protein id,
326 * codon start position, product (name), and 'other values'
328 if (feature.getQualifier() != null)
330 for (Qualifier q : feature.getQualifier())
332 String qname = q.getName();
333 String value = q.getValue();
334 value = value == null ? ""
335 : value.trim().replace(" ", "").replace("\n", "")
337 if (qname.equals("translation"))
341 else if (qname.equals("protein_id"))
345 else if (qname.equals("codon_start"))
349 codonStart = Integer.parseInt(value.trim());
350 } catch (NumberFormatException e)
352 System.err.println("Invalid codon_start in XML for "
353 + entry.getAccession() + ": " + e.getMessage());
356 else if (qname.equals("product"))
358 // sometimes name is returned e.g. for V00488
363 // throw anything else into the additional properties hash
364 if (!"".equals(value))
366 vals.put(qname, value);
372 DBRefEntry proteinToEmblProteinRef = null;
373 exons = MappingUtils.removeStartPositions(codonStart - 1, exons);
375 SequenceI product = null;
376 Mapping dnaToProteinMapping = null;
377 if (translation != null && proteinName != null && proteinId != null)
379 int translationLength = translation.length();
382 * look for product in peptides list, if not found, add it
384 product = matcher.findIdMatch(proteinId);
387 product = new Sequence(proteinId, translation, 1,
389 product.setDescription(((proteinName.length() == 0)
390 ? "Protein Product from " + sourceDb
392 peptides.add(product);
393 matcher.add(product);
396 // we have everything - create the mapping and perhaps the protein
398 if (exons == null || exons.length == 0)
401 * workaround until we handle dna location for CDS sequence
402 * e.g. location="X53828.1:60..1058" correctly
405 "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
406 + sourceDb + ":" + entry.getAccession() + ")");
407 int dnaLength = dna.getLength();
408 if (translationLength * 3 == (1 - codonStart + dnaLength))
411 "Not allowing for additional stop codon at end of cDNA fragment... !");
412 // this might occur for CDS sequences where no features are marked
413 exons = new int[] { dna.getStart() + (codonStart - 1),
415 dnaToProteinMapping = new Mapping(product, exons,
417 { 1, translationLength }, 3, 1);
419 if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength))
422 "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
423 exons = new int[] { dna.getStart() + (codonStart - 1),
425 dnaToProteinMapping = new Mapping(product, exons,
427 { 1, translationLength }, 3, 1);
432 // Trim the exon mapping if necessary - the given product may only be a
433 // fragment of a larger protein. (EMBL:AY043181 is an example)
437 // TODO: Add a DbRef back to the parent EMBL sequence with the exon
439 // if given a dataset reference, search dataset for parent EMBL
440 // sequence if it exists and set its map
441 // make a new feature annotating the coding contig
445 // final product length truncation check
446 int[] cdsRanges = adjustForProteinLength(translationLength,
448 dnaToProteinMapping = new Mapping(product, cdsRanges,
450 { 1, translationLength }, 3, 1);
454 * make xref with mapping from protein to EMBL dna
456 DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
457 sequenceVersion, proteinId,
458 new Mapping(dnaToProteinMapping.getMap().getInverse()));
459 product.addDBRef(proteinToEmblRef);
462 * make xref from protein to EMBLCDS; we assume here that the
463 * CDS sequence version is same as dna sequence (?!)
465 MapList proteinToCdsMapList = new MapList(
467 { 1, translationLength },
469 { 1 + (codonStart - 1),
470 (codonStart - 1) + 3 * translationLength },
472 DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
473 DBRefSource.EMBLCDS, sequenceVersion, proteinId,
474 new Mapping(proteinToCdsMapList));
475 product.addDBRef(proteinToEmblCdsRef);
478 * make 'direct' xref from protein to EMBLCDSPROTEIN
480 proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
481 proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
482 proteinToEmblProteinRef.setMap(null);
483 product.addDBRef(proteinToEmblProteinRef);
489 * add cds features to dna sequence
491 String cds = feature.getName(); // "CDS"
492 for (int xint = 0; exons != null
493 && xint < exons.length - 1; xint += 2)
495 int exonStart = exons[xint];
496 int exonEnd = exons[xint + 1];
497 int begin = Math.min(exonStart, exonEnd);
498 int end = Math.max(exonStart, exonEnd);
499 int exonNumber = xint / 2 + 1;
500 String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s",
501 exonNumber, proteinName, proteinId);
503 SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, sourceDb,
506 sf.setEnaLocation(feature.getLocation());
507 boolean forwardStrand = exonStart <= exonEnd;
508 sf.setStrand(forwardStrand ? "+" : "-");
509 sf.setPhase(String.valueOf(codonStart - 1));
510 sf.setValue(FeatureProperties.EXONPOS, exonNumber);
511 sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
513 dna.addSequenceFeature(sf);
518 * add feature dbRefs to sequence, and mappings for Uniprot xrefs
520 boolean hasUniprotDbref = false;
521 List<XrefType> xrefs = feature.getXref();
524 boolean mappingUsed = false;
525 for (XrefType xref : xrefs)
528 * ensure UniProtKB/Swiss-Prot converted to UNIPROT
530 String source = DBRefUtils.getCanonicalName(xref.getDb());
531 String version = xref.getSecondaryId();
532 if (version == null || "".equals(version))
536 DBRefEntry dbref = new DBRefEntry(source, version, xref.getId());
537 DBRefEntry proteinDbRef = new DBRefEntry(source, version,
538 dbref.getAccessionId());
539 if (source.equals(DBRefSource.UNIPROT))
541 String proteinSeqName = DBRefSource.UNIPROT + "|"
542 + dbref.getAccessionId();
543 if (dnaToProteinMapping != null
544 && dnaToProteinMapping.getTo() != null)
549 * two or more Uniprot xrefs for the same CDS -
550 * each needs a distinct Mapping (as to a different sequence)
552 dnaToProteinMapping = new Mapping(dnaToProteinMapping);
557 * try to locate the protein mapped to (possibly by a
558 * previous CDS feature); if not found, construct it from
559 * the EMBL translation
561 SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
562 if (proteinSeq == null)
564 proteinSeq = new Sequence(proteinSeqName,
565 product.getSequenceAsString());
566 matcher.add(proteinSeq);
567 peptides.add(proteinSeq);
569 dnaToProteinMapping.setTo(proteinSeq);
570 dnaToProteinMapping.setMappedFromId(proteinId);
571 proteinSeq.addDBRef(proteinDbRef);
572 dbref.setMap(dnaToProteinMapping);
574 hasUniprotDbref = true;
579 * copy feature dbref to our protein product
581 DBRefEntry pref = proteinDbRef;
582 pref.setMap(null); // reference is direct
583 product.addDBRef(pref);
584 // Add converse mapping reference
585 if (dnaToProteinMapping != null)
587 Mapping pmap = new Mapping(dna,
588 dnaToProteinMapping.getMap().getInverse());
589 pref = new DBRefEntry(sourceDb, sequenceVersion, accession);
591 if (dnaToProteinMapping.getTo() != null)
593 dnaToProteinMapping.getTo().addDBRef(pref);
602 * if we have a product (translation) but no explicit Uniprot dbref
603 * (example: EMBL AAFI02000057 protein_id EAL65544.1)
604 * then construct mappings to an assumed EMBLCDSPROTEIN accession
606 if (!hasUniprotDbref && product != null)
608 if (proteinToEmblProteinRef == null)
610 // assuming CDSPROTEIN sequence version = dna version (?!)
611 proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct,
612 sequenceVersion, proteinId);
614 product.addDBRef(proteinToEmblProteinRef);
616 if (dnaToProteinMapping != null
617 && dnaToProteinMapping.getTo() != null)
619 DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
620 DBRefSource.EMBLCDSProduct, sequenceVersion,
622 dnaToEmblProteinRef.setMap(dnaToProteinMapping);
623 dnaToProteinMapping.setMappedFromId(proteinId);
624 dna.addDBRef(dnaToEmblProteinRef);
630 public boolean isDnaCoding()
636 * Returns the CDS positions as a single array of [start, end, start, end...]
637 * positions. If on the reverse strand, these will be in descending order.
643 protected int[] getCdsRanges(String accession, Feature feature)
645 String location = feature.getLocation();
646 if (location == null)
653 List<int[]> ranges = DnaUtils.parseLocation(location);
654 return listToArray(ranges);
655 } catch (ParseException e)
658 String.format("Not parsing inexact CDS location %s in ENA %s",
659 location, accession));
665 * Converts a list of [start, end] ranges to a single array of [start, end,
671 int[] listToArray(List<int[]> ranges)
673 int[] result = new int[ranges.size() * 2];
675 for (int[] range : ranges)
677 result[i++] = range[0];
678 result[i++] = range[1];
684 * Helper method to construct a SequenceFeature for one cds range
687 * feature type ("CDS")
697 * map of 'miscellaneous values' for feature
700 protected SequenceFeature makeCdsFeature(String type, String desc,
701 int begin, int end, String group, Map<String, String> vals)
703 SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group);
706 StringBuilder sb = new StringBuilder();
707 boolean first = true;
708 for (Entry<String, String> val : vals.entrySet())
714 sb.append(val.getKey()).append("=").append(val.getValue());
716 sf.setValue(val.getKey(), val.getValue());
718 sf.setAttributes(sb.toString());
724 * Truncates (if necessary) the exon intervals to match 3 times the length of
725 * the protein; also accepts 3 bases longer (for stop codon not included in
728 * @param proteinLength
730 * an array of [start, end, start, end...] intervals
731 * @return the same array (if unchanged) or a truncated copy
733 static int[] adjustForProteinLength(int proteinLength, int[] exon)
735 if (proteinLength <= 0 || exon == null)
739 int expectedCdsLength = proteinLength * 3;
740 int exonLength = MappingUtils.getLength(Arrays.asList(exon));
743 * if exon length matches protein, or is shorter, or longer by the
744 * length of a stop codon (3 bases), then leave it unchanged
746 if (expectedCdsLength >= exonLength
747 || expectedCdsLength == exonLength - 3)
755 origxon = new int[exon.length];
756 System.arraycopy(exon, 0, origxon, 0, exon.length);
758 for (int x = 0; x < exon.length; x += 2)
760 cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
761 if (expectedCdsLength <= cdspos)
763 // advanced beyond last codon.
765 if (expectedCdsLength != cdspos)
768 // .println("Truncating final exon interval on region by "
769 // + (cdspos - cdslength));
773 * shrink the final exon - reduce end position if forward
774 * strand, increase it if reverse
776 if (exon[x + 1] >= exon[x])
778 endxon = exon[x + 1] - cdspos + expectedCdsLength;
782 endxon = exon[x + 1] + cdspos - expectedCdsLength;
790 // and trim the exon interval set if necessary
791 int[] nxon = new int[sxpos + 2];
792 System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
793 nxon[sxpos + 1] = endxon; // update the end boundary for the new exon