2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.ws.dbsources;
23 import jalview.analysis.SequenceIdMatcher;
24 import jalview.bin.Cache;
25 import jalview.datamodel.Alignment;
26 import jalview.datamodel.AlignmentI;
27 import jalview.datamodel.DBRefEntry;
28 import jalview.datamodel.DBRefSource;
29 import jalview.datamodel.FeatureProperties;
30 import jalview.datamodel.Mapping;
31 import jalview.datamodel.Sequence;
32 import jalview.datamodel.SequenceFeature;
33 import jalview.datamodel.SequenceI;
34 import jalview.util.DBRefUtils;
35 import jalview.util.DnaUtils;
36 import jalview.util.MapList;
37 import jalview.util.MappingUtils;
38 import jalview.util.MessageManager;
39 import jalview.ws.ebi.EBIFetchClient;
40 import jalview.xml.binding.embl.EntryType;
41 import jalview.xml.binding.embl.EntryType.Feature;
42 import jalview.xml.binding.embl.EntryType.Feature.Qualifier;
43 import jalview.xml.binding.embl.XrefType;
46 import java.io.FileInputStream;
47 import java.io.InputStream;
48 import java.text.ParseException;
49 import java.util.ArrayList;
50 import java.util.Arrays;
51 import java.util.Hashtable;
52 import java.util.List;
54 import java.util.Map.Entry;
56 import javax.xml.bind.JAXBContext;
57 import javax.xml.bind.JAXBException;
58 import javax.xml.stream.FactoryConfigurationError;
59 import javax.xml.stream.XMLInputFactory;
60 import javax.xml.stream.XMLStreamException;
61 import javax.xml.stream.XMLStreamReader;
63 public abstract class EmblXmlSource extends EbiFileRetrievedProxy
66 * JAL-1856 Embl returns this text for query not found
68 private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found.";
70 public EmblXmlSource()
76 * Retrieves and parses an emblxml file, and returns an alignment containing
77 * the parsed sequences, or null if none were found
80 * "EMBL" or "EMBLCDS" - anything else will not retrieve emblxml
85 protected AlignmentI getEmblSequenceRecords(String emprefx, String query)
89 EBIFetchClient dbFetch = new EBIFetchClient();
93 reply = dbFetch.fetchDataAsFile(
94 emprefx.toLowerCase() + ":" + query.trim(), "display=xml",
99 throw new Exception(MessageManager.formatMessage(
100 "exception.ebiembl_retrieval_failed_on", new String[]
101 { emprefx.toLowerCase(), query.trim() }), e);
103 return getEmblSequenceRecords(emprefx, query, reply);
107 * parse an emblxml file stored locally
110 * either EMBL or EMBLCDS strings are allowed - anything else will
111 * not retrieve emblxml
114 * the EMBL XML file containing the results of a query
118 protected AlignmentI getEmblSequenceRecords(String emprefx, String query,
119 File reply) throws Exception
121 List<EntryType> entries = null;
122 if (reply != null && reply.exists())
124 file = reply.getAbsolutePath();
125 if (reply.length() > EMBL_NOT_FOUND_REPLY.length())
127 InputStream is = new FileInputStream(reply);
128 entries = getEmblEntries(is);
133 * invalid accession gets a reply with no <entry> elements, text content of
134 * EmbFile reads something like (e.g.) this ungrammatical phrase
135 * Entry: <acc> display type is either not supported or entry is not found.
137 AlignmentI al = null;
138 List<SequenceI> seqs = new ArrayList<>();
139 List<SequenceI> peptides = new ArrayList<>();
142 for (EntryType entry : entries)
144 SequenceI seq = getSequence(emprefx, entry, peptides);
147 seqs.add(seq.deriveSequence());
148 // place DBReferences on dataset and refer
153 al = new Alignment(seqs.toArray(new SequenceI[seqs.size()]));
158 "No record found for '" + emprefx + ":" + query + "'");
167 * Reads the XML reply from file and unmarshals it to Java objects. Answers a
168 * (possibly empty) list of <code>EntryType</code> objects.
174 List<EntryType> getEmblEntries(InputStream is)
176 List<EntryType> entries = new ArrayList<>();
179 JAXBContext jc = JAXBContext.newInstance("jalview.xml.binding.embl");
180 XMLStreamReader streamReader = XMLInputFactory.newInstance()
181 .createXMLStreamReader(is);
182 javax.xml.bind.Unmarshaller um = jc.createUnmarshaller();
183 jalview.xml.binding.embl.ROOT root = (jalview.xml.binding.embl.ROOT) um
184 .unmarshal(streamReader);
187 * document root contains either "entry" or "entrySet"
193 if (root.getEntrySet() != null)
195 entries = root.getEntrySet().getEntry();
197 else if (root.getEntry() != null)
199 entries.add(root.getEntry());
201 } catch (JAXBException | XMLStreamException
202 | FactoryConfigurationError e)
210 * A helper method to parse XML data and construct a sequence, with any
211 * available database references and features
218 SequenceI getSequence(String sourceDb, EntryType entry,
219 List<SequenceI> peptides)
221 String seqString = entry.getSequence();
222 if (seqString == null)
226 seqString = seqString.replace(" ", "").replace("\n", "").replace("\t",
228 String accession = entry.getAccession();
229 SequenceI dna = new Sequence(sourceDb + "|" + accession, seqString);
231 dna.setDescription(entry.getDescription());
232 String sequenceVersion = String.valueOf(entry.getVersion().intValue());
233 DBRefEntry selfRref = new DBRefEntry(sourceDb, sequenceVersion,
235 dna.addDBRef(selfRref);
237 new Mapping(null, new int[]
238 { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1,
244 List<XrefType> xrefs = entry.getXref();
247 for (XrefType xref : xrefs)
249 String acc = xref.getId();
250 String source = DBRefUtils.getCanonicalName(xref.getDb());
251 String version = xref.getSecondaryId();
252 if (version == null || "".equals(version))
256 dna.addDBRef(new DBRefEntry(source, version, acc));
260 SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
263 List<Feature> features = entry.getFeature();
264 if (features != null)
266 for (Feature feature : features)
268 if (FeatureProperties.isCodingFeature(sourceDb,
271 parseCodingFeature(entry, feature, sourceDb, dna, peptides,
276 } catch (Exception e)
278 System.err.println("EMBL Record Features parsing error!");
280 .println("Please report the following to help@jalview.org :");
281 System.err.println("EMBL Record " + accession);
282 System.err.println("Resulted in exception: " + e.getMessage());
283 e.printStackTrace(System.err);
290 * Extracts coding region and product from a CDS feature and decorates it with
300 void parseCodingFeature(EntryType entry, Feature feature, String sourceDb,
301 SequenceI dna, List<SequenceI> peptides,
302 SequenceIdMatcher matcher)
304 final boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
305 final String accession = entry.getAccession();
306 final String sequenceVersion = entry.getVersion().toString();
308 int[] exons = getCdsRanges(entry.getAccession(), feature);
310 String translation = null;
311 String proteinName = "";
312 String proteinId = null;
313 Map<String, String> vals = new Hashtable<>();
316 * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
317 * (phase is required for CDS features in GFF3 format)
322 * parse qualifiers, saving protein translation, protein id,
323 * codon start position, product (name), and 'other values'
325 if (feature.getQualifier() != null)
327 for (Qualifier q : feature.getQualifier())
329 String qname = q.getName();
330 String value = q.getValue();
331 value = value == null ? ""
332 : value.trim().replace(" ", "").replace("\n", "")
334 if (qname.equals("translation"))
338 else if (qname.equals("protein_id"))
342 else if (qname.equals("codon_start"))
346 codonStart = Integer.parseInt(value.trim());
347 } catch (NumberFormatException e)
349 System.err.println("Invalid codon_start in XML for "
350 + entry.getAccession() + ": " + e.getMessage());
353 else if (qname.equals("product"))
355 // sometimes name is returned e.g. for V00488
360 // throw anything else into the additional properties hash
361 if (!"".equals(value))
363 vals.put(qname, value);
369 DBRefEntry proteinToEmblProteinRef = null;
370 exons = MappingUtils.removeStartPositions(codonStart - 1, exons);
372 SequenceI product = null;
373 Mapping dnaToProteinMapping = null;
374 if (translation != null && proteinName != null && proteinId != null)
376 int translationLength = translation.length();
379 * look for product in peptides list, if not found, add it
381 product = matcher.findIdMatch(proteinId);
384 product = new Sequence(proteinId, translation, 1,
386 product.setDescription(((proteinName.length() == 0)
387 ? "Protein Product from " + sourceDb
389 peptides.add(product);
390 matcher.add(product);
393 // we have everything - create the mapping and perhaps the protein
395 if (exons == null || exons.length == 0)
398 * workaround until we handle dna location for CDS sequence
399 * e.g. location="X53828.1:60..1058" correctly
402 "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
403 + sourceDb + ":" + entry.getAccession() + ")");
404 int dnaLength = dna.getLength();
405 if (translationLength * 3 == (1 - codonStart + dnaLength))
408 "Not allowing for additional stop codon at end of cDNA fragment... !");
409 // this might occur for CDS sequences where no features are marked
410 exons = new int[] { dna.getStart() + (codonStart - 1),
412 dnaToProteinMapping = new Mapping(product, exons,
414 { 1, translationLength }, 3, 1);
416 if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength))
419 "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
420 exons = new int[] { dna.getStart() + (codonStart - 1),
422 dnaToProteinMapping = new Mapping(product, exons,
424 { 1, translationLength }, 3, 1);
429 // Trim the exon mapping if necessary - the given product may only be a
430 // fragment of a larger protein. (EMBL:AY043181 is an example)
434 // TODO: Add a DbRef back to the parent EMBL sequence with the exon
436 // if given a dataset reference, search dataset for parent EMBL
437 // sequence if it exists and set its map
438 // make a new feature annotating the coding contig
442 // final product length truncation check
443 int[] cdsRanges = adjustForProteinLength(translationLength,
445 dnaToProteinMapping = new Mapping(product, cdsRanges,
447 { 1, translationLength }, 3, 1);
451 * make xref with mapping from protein to EMBL dna
453 DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
454 sequenceVersion, proteinId,
455 new Mapping(dnaToProteinMapping.getMap().getInverse()));
456 product.addDBRef(proteinToEmblRef);
459 * make xref from protein to EMBLCDS; we assume here that the
460 * CDS sequence version is same as dna sequence (?!)
462 MapList proteinToCdsMapList = new MapList(
464 { 1, translationLength },
466 { 1 + (codonStart - 1),
467 (codonStart - 1) + 3 * translationLength },
469 DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
470 DBRefSource.EMBLCDS, sequenceVersion, proteinId,
471 new Mapping(proteinToCdsMapList));
472 product.addDBRef(proteinToEmblCdsRef);
475 * make 'direct' xref from protein to EMBLCDSPROTEIN
477 proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
478 proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
479 proteinToEmblProteinRef.setMap(null);
480 product.addDBRef(proteinToEmblProteinRef);
486 * add cds features to dna sequence
488 String cds = feature.getName(); // "CDS"
489 for (int xint = 0; exons != null
490 && xint < exons.length - 1; xint += 2)
492 int exonStart = exons[xint];
493 int exonEnd = exons[xint + 1];
494 int begin = Math.min(exonStart, exonEnd);
495 int end = Math.max(exonStart, exonEnd);
496 int exonNumber = xint / 2 + 1;
497 String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s",
498 exonNumber, proteinName, proteinId);
500 SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, sourceDb,
503 sf.setEnaLocation(feature.getLocation());
504 boolean forwardStrand = exonStart <= exonEnd;
505 sf.setStrand(forwardStrand ? "+" : "-");
506 sf.setPhase(String.valueOf(codonStart - 1));
507 sf.setValue(FeatureProperties.EXONPOS, exonNumber);
508 sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
510 dna.addSequenceFeature(sf);
515 * add feature dbRefs to sequence, and mappings for Uniprot xrefs
517 boolean hasUniprotDbref = false;
518 List<XrefType> xrefs = feature.getXref();
521 boolean mappingUsed = false;
522 for (XrefType xref : xrefs)
525 * ensure UniProtKB/Swiss-Prot converted to UNIPROT
527 String source = DBRefUtils.getCanonicalName(xref.getDb());
528 String version = xref.getSecondaryId();
529 if (version == null || "".equals(version))
533 DBRefEntry dbref = new DBRefEntry(source, version, xref.getId());
534 DBRefEntry proteinDbRef = new DBRefEntry(source, version,
535 dbref.getAccessionId());
536 if (source.equals(DBRefSource.UNIPROT))
538 String proteinSeqName = DBRefSource.UNIPROT + "|"
539 + dbref.getAccessionId();
540 if (dnaToProteinMapping != null
541 && dnaToProteinMapping.getTo() != null)
546 * two or more Uniprot xrefs for the same CDS -
547 * each needs a distinct Mapping (as to a different sequence)
549 dnaToProteinMapping = new Mapping(dnaToProteinMapping);
554 * try to locate the protein mapped to (possibly by a
555 * previous CDS feature); if not found, construct it from
556 * the EMBL translation
558 SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
559 if (proteinSeq == null)
561 proteinSeq = new Sequence(proteinSeqName,
562 product.getSequenceAsString());
563 matcher.add(proteinSeq);
564 peptides.add(proteinSeq);
566 dnaToProteinMapping.setTo(proteinSeq);
567 dnaToProteinMapping.setMappedFromId(proteinId);
568 proteinSeq.addDBRef(proteinDbRef);
569 dbref.setMap(dnaToProteinMapping);
571 hasUniprotDbref = true;
576 * copy feature dbref to our protein product
578 DBRefEntry pref = proteinDbRef;
579 pref.setMap(null); // reference is direct
580 product.addDBRef(pref);
581 // Add converse mapping reference
582 if (dnaToProteinMapping != null)
584 Mapping pmap = new Mapping(dna,
585 dnaToProteinMapping.getMap().getInverse());
586 pref = new DBRefEntry(sourceDb, sequenceVersion, accession);
588 if (dnaToProteinMapping.getTo() != null)
590 dnaToProteinMapping.getTo().addDBRef(pref);
599 * if we have a product (translation) but no explicit Uniprot dbref
600 * (example: EMBL AAFI02000057 protein_id EAL65544.1)
601 * then construct mappings to an assumed EMBLCDSPROTEIN accession
603 if (!hasUniprotDbref && product != null)
605 if (proteinToEmblProteinRef == null)
607 // assuming CDSPROTEIN sequence version = dna version (?!)
608 proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct,
609 sequenceVersion, proteinId);
611 product.addDBRef(proteinToEmblProteinRef);
613 if (dnaToProteinMapping != null
614 && dnaToProteinMapping.getTo() != null)
616 DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
617 DBRefSource.EMBLCDSProduct, sequenceVersion,
619 dnaToEmblProteinRef.setMap(dnaToProteinMapping);
620 dnaToProteinMapping.setMappedFromId(proteinId);
621 dna.addDBRef(dnaToEmblProteinRef);
627 public boolean isDnaCoding()
633 * Returns the CDS positions as a single array of [start, end, start, end...]
634 * positions. If on the reverse strand, these will be in descending order.
640 protected int[] getCdsRanges(String accession, Feature feature)
642 String location = feature.getLocation();
643 if (location == null)
650 List<int[]> ranges = DnaUtils.parseLocation(location);
651 return listToArray(ranges);
652 } catch (ParseException e)
655 String.format("Not parsing inexact CDS location %s in ENA %s",
656 location, accession));
662 * Converts a list of [start, end] ranges to a single array of [start, end,
668 int[] listToArray(List<int[]> ranges)
670 int[] result = new int[ranges.size() * 2];
672 for (int[] range : ranges)
674 result[i++] = range[0];
675 result[i++] = range[1];
681 * Helper method to construct a SequenceFeature for one cds range
684 * feature type ("CDS")
694 * map of 'miscellaneous values' for feature
697 protected SequenceFeature makeCdsFeature(String type, String desc,
698 int begin, int end, String group, Map<String, String> vals)
700 SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group);
703 for (Entry<String, String> val : vals.entrySet())
705 sf.setValue(val.getKey(), val.getValue());
712 * Truncates (if necessary) the exon intervals to match 3 times the length of
713 * the protein; also accepts 3 bases longer (for stop codon not included in
716 * @param proteinLength
718 * an array of [start, end, start, end...] intervals
719 * @return the same array (if unchanged) or a truncated copy
721 static int[] adjustForProteinLength(int proteinLength, int[] exon)
723 if (proteinLength <= 0 || exon == null)
727 int expectedCdsLength = proteinLength * 3;
728 int exonLength = MappingUtils.getLength(Arrays.asList(exon));
731 * if exon length matches protein, or is shorter, or longer by the
732 * length of a stop codon (3 bases), then leave it unchanged
734 if (expectedCdsLength >= exonLength
735 || expectedCdsLength == exonLength - 3)
743 origxon = new int[exon.length];
744 System.arraycopy(exon, 0, origxon, 0, exon.length);
746 for (int x = 0; x < exon.length; x += 2)
748 cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
749 if (expectedCdsLength <= cdspos)
751 // advanced beyond last codon.
753 if (expectedCdsLength != cdspos)
756 // .println("Truncating final exon interval on region by "
757 // + (cdspos - cdslength));
761 * shrink the final exon - reduce end position if forward
762 * strand, increase it if reverse
764 if (exon[x + 1] >= exon[x])
766 endxon = exon[x + 1] - cdspos + expectedCdsLength;
770 endxon = exon[x + 1] + cdspos - expectedCdsLength;
778 // and trim the exon interval set if necessary
779 int[] nxon = new int[sxpos + 2];
780 System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
781 nxon[sxpos + 1] = endxon; // update the end boundary for the new exon