/* * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) * Copyright (C) $$Year-Rel$$ The Jalview Authors * * This file is part of Jalview. * * Jalview is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * Jalview is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty * of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Jalview. If not, see . * The Jalview Authors are detailed in the 'AUTHORS' file. */ package jalview.ws.dbsources; import java.util.Locale; import java.io.File; import java.io.FileInputStream; import java.io.InputStream; import java.text.ParseException; import java.util.ArrayList; import java.util.Arrays; import java.util.Hashtable; import java.util.List; import java.util.Map; import java.util.Map.Entry; import javax.xml.bind.JAXBContext; import javax.xml.bind.JAXBElement; import javax.xml.bind.JAXBException; import javax.xml.stream.FactoryConfigurationError; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; import com.stevesoft.pat.Regex; import jalview.analysis.SequenceIdMatcher; import jalview.bin.Cache; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; import jalview.datamodel.FeatureProperties; import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.util.DBRefUtils; import jalview.util.DnaUtils; import jalview.util.MapList; import jalview.util.MappingUtils; import jalview.ws.ebi.EBIFetchClient; import jalview.xml.binding.embl.EntryType; import jalview.xml.binding.embl.EntryType.Feature; import jalview.xml.binding.embl.EntryType.Feature.Qualifier; import jalview.xml.binding.embl.ROOT; import jalview.xml.binding.embl.XrefType; /** * Provides XML binding and parsing of EMBL or EMBLCDS records retrieved from * (e.g.) {@code https://www.ebi.ac.uk/ena/data/view/x53828&display=xml}. * * @deprecated endpoint withdrawn August 2020 (JAL-3692), use EmblFlatfileSource */ public abstract class EmblXmlSource extends EbiFileRetrievedProxy { private static final Regex ACCESSION_REGEX = new Regex("^[A-Z]+[0-9]+"); /* * JAL-1856 Embl returns this text for query not found */ private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found."; public EmblXmlSource() { super(); } /** * Retrieves and parses an emblxml file, and returns an alignment containing * the parsed sequences, or null if none were found * * @param emprefx * "EMBL" or "EMBLCDS" - anything else will not retrieve emblxml * @param query * @return * @throws Exception */ protected AlignmentI getEmblSequenceRecords(String emprefx, String query) throws Exception { startQuery(); EBIFetchClient dbFetch = new EBIFetchClient(); File reply; try { reply = dbFetch.fetchDataAsFile( emprefx.toLowerCase(Locale.ROOT) + ":" + query.trim(), "display=xml", "xml"); } catch (Exception e) { stopQuery(); throw new Exception( String.format("EBI EMBL XML retrieval failed for %s:%s", emprefx.toLowerCase(Locale.ROOT), query.trim()), e); } return getEmblSequenceRecords(emprefx, query, reply); } /** * parse an emblxml file stored locally * * @param emprefx * either EMBL or EMBLCDS strings are allowed - anything else will * not retrieve emblxml * @param query * @param file * the EMBL XML file containing the results of a query * @return * @throws Exception */ protected AlignmentI getEmblSequenceRecords(String emprefx, String query, File reply) throws Exception { List entries = null; if (reply != null && reply.exists()) { file = reply.getAbsolutePath(); if (reply.length() > EMBL_NOT_FOUND_REPLY.length()) { InputStream is = new FileInputStream(reply); entries = getEmblEntries(is); } } /* * invalid accession gets a reply with no elements, text content of * EmbFile reads something like (e.g.) this ungrammatical phrase * Entry: display type is either not supported or entry is not found. */ AlignmentI al = null; List seqs = new ArrayList<>(); List peptides = new ArrayList<>(); if (entries != null) { for (EntryType entry : entries) { SequenceI seq = getSequence(emprefx, entry, peptides); if (seq != null) { seqs.add(seq.deriveSequence()); // place DBReferences on dataset and refer } } if (!seqs.isEmpty()) { al = new Alignment(seqs.toArray(new SequenceI[seqs.size()])); } else { System.out.println( "No record found for '" + emprefx + ":" + query + "'"); } } stopQuery(); return al; } /** * Reads the XML reply from file and unmarshals it to Java objects. Answers a * (possibly empty) list of EntryType objects. * * is * * @return */ List getEmblEntries(InputStream is) { List entries = new ArrayList<>(); try { JAXBContext jc = JAXBContext.newInstance("jalview.xml.binding.embl"); XMLStreamReader streamReader = XMLInputFactory.newInstance() .createXMLStreamReader(is); javax.xml.bind.Unmarshaller um = jc.createUnmarshaller(); JAXBElement rootElement = um.unmarshal(streamReader, ROOT.class); ROOT root = rootElement.getValue(); /* * document root contains either "entry" or "entrySet" */ if (root == null) { return entries; } if (root.getEntrySet() != null) { entries = root.getEntrySet().getEntry(); } else if (root.getEntry() != null) { entries.add(root.getEntry()); } } catch (JAXBException | XMLStreamException | FactoryConfigurationError e) { e.printStackTrace(); } return entries; } /** * A helper method to parse XML data and construct a sequence, with any * available database references and features * * @param emprefx * @param entry * @param peptides * @return */ SequenceI getSequence(String sourceDb, EntryType entry, List peptides) { String seqString = entry.getSequence(); if (seqString == null) { return null; } seqString = seqString.replace(" ", "").replace("\n", "").replace("\t", ""); String accession = entry.getAccession(); SequenceI dna = new Sequence(sourceDb + "|" + accession, seqString); dna.setDescription(entry.getDescription()); String sequenceVersion = String.valueOf(entry.getVersion().intValue()); DBRefEntry selfRref = new DBRefEntry(sourceDb, sequenceVersion, accession); dna.addDBRef(selfRref); selfRref.setMap( new Mapping(null, new int[] { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1, 1)); /* * add db references */ List xrefs = entry.getXref(); if (xrefs != null) { for (XrefType xref : xrefs) { String acc = xref.getId(); String source = DBRefUtils.getCanonicalName(xref.getDb()); String version = xref.getSecondaryId(); if (version == null || "".equals(version)) { version = "0"; } dna.addDBRef(new DBRefEntry(source, version, acc)); } } SequenceIdMatcher matcher = new SequenceIdMatcher(peptides); try { List features = entry.getFeature(); if (features != null) { for (Feature feature : features) { if (FeatureProperties.isCodingFeature(sourceDb, feature.getName())) { parseCodingFeature(entry, feature, sourceDb, dna, peptides, matcher); } } } } catch (Exception e) { System.err.println("EMBL Record Features parsing error!"); System.err .println("Please report the following to help@jalview.org :"); System.err.println("EMBL Record " + accession); System.err.println("Resulted in exception: " + e.getMessage()); e.printStackTrace(System.err); } return dna; } /** * Extracts coding region and product from a CDS feature and decorates it with * annotations * * @param entry * @param feature * @param sourceDb * @param dna * @param peptides * @param matcher */ void parseCodingFeature(EntryType entry, Feature feature, String sourceDb, SequenceI dna, List peptides, SequenceIdMatcher matcher) { final boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS); final String accession = entry.getAccession(); final String sequenceVersion = entry.getVersion().toString(); int[] exons = getCdsRanges(entry.getAccession(), feature); String translation = null; String proteinName = ""; String proteinId = null; Map vals = new Hashtable<>(); /* * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS * (phase is required for CDS features in GFF3 format) */ int codonStart = 1; /* * parse qualifiers, saving protein translation, protein id, * codon start position, product (name), and 'other values' */ if (feature.getQualifier() != null) { for (Qualifier q : feature.getQualifier()) { String qname = q.getName(); String value = q.getValue(); value = value == null ? "" : value.trim().replace(" ", "").replace("\n", "") .replace("\t", ""); if (qname.equals("translation")) { translation = value; } else if (qname.equals("protein_id")) { proteinId = value; } else if (qname.equals("codon_start")) { try { codonStart = Integer.parseInt(value.trim()); } catch (NumberFormatException e) { System.err.println("Invalid codon_start in XML for " + entry.getAccession() + ": " + e.getMessage()); } } else if (qname.equals("product")) { // sometimes name is returned e.g. for V00488 proteinName = value; } else { // throw anything else into the additional properties hash if (!"".equals(value)) { vals.put(qname, value); } } } } DBRefEntry proteinToEmblProteinRef = null; exons = MappingUtils.removeStartPositions(codonStart - 1, exons); SequenceI product = null; Mapping dnaToProteinMapping = null; if (translation != null && proteinName != null && proteinId != null) { int translationLength = translation.length(); /* * look for product in peptides list, if not found, add it */ product = matcher.findIdMatch(proteinId); if (product == null) { product = new Sequence(proteinId, translation, 1, translationLength); product.setDescription(((proteinName.length() == 0) ? "Protein Product from " + sourceDb : proteinName)); peptides.add(product); matcher.add(product); } // we have everything - create the mapping and perhaps the protein // sequence if (exons == null || exons.length == 0) { /* * workaround until we handle dna location for CDS sequence * e.g. location="X53828.1:60..1058" correctly */ System.err.println( "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect (" + sourceDb + ":" + entry.getAccession() + ")"); int dnaLength = dna.getLength(); if (translationLength * 3 == (1 - codonStart + dnaLength)) { System.err.println( "Not allowing for additional stop codon at end of cDNA fragment... !"); // this might occur for CDS sequences where no features are marked exons = new int[] { dna.getStart() + (codonStart - 1), dna.getEnd() }; dnaToProteinMapping = new Mapping(product, exons, new int[] { 1, translationLength }, 3, 1); } if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength)) { System.err.println( "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!"); exons = new int[] { dna.getStart() + (codonStart - 1), dna.getEnd() - 3 }; dnaToProteinMapping = new Mapping(product, exons, new int[] { 1, translationLength }, 3, 1); } } else { // Trim the exon mapping if necessary - the given product may only be a // fragment of a larger protein. (EMBL:AY043181 is an example) if (isEmblCdna) { // TODO: Add a DbRef back to the parent EMBL sequence with the exon // map // if given a dataset reference, search dataset for parent EMBL // sequence if it exists and set its map // make a new feature annotating the coding contig } else { // final product length truncation check int[] cdsRanges = adjustForProteinLength(translationLength, exons); dnaToProteinMapping = new Mapping(product, cdsRanges, new int[] { 1, translationLength }, 3, 1); if (product != null) { /* * make xref with mapping from protein to EMBL dna */ DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL, sequenceVersion, proteinId, new Mapping(dnaToProteinMapping.getMap().getInverse())); product.addDBRef(proteinToEmblRef); /* * make xref from protein to EMBLCDS; we assume here that the * CDS sequence version is same as dna sequence (?!) */ MapList proteinToCdsMapList = new MapList( new int[] { 1, translationLength }, new int[] { 1 + (codonStart - 1), (codonStart - 1) + 3 * translationLength }, 1, 3); DBRefEntry proteinToEmblCdsRef = new DBRefEntry( DBRefSource.EMBLCDS, sequenceVersion, proteinId, new Mapping(proteinToCdsMapList)); product.addDBRef(proteinToEmblCdsRef); /* * make 'direct' xref from protein to EMBLCDSPROTEIN */ proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef); proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct); proteinToEmblProteinRef.setMap(null); product.addDBRef(proteinToEmblProteinRef); } } } /* * add cds features to dna sequence */ String cds = feature.getName(); // "CDS" for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2) { int exonStart = exons[xint]; int exonEnd = exons[xint + 1]; int begin = Math.min(exonStart, exonEnd); int end = Math.max(exonStart, exonEnd); int exonNumber = xint / 2 + 1; String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s", exonNumber, proteinName, proteinId); SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, sourceDb, vals); sf.setEnaLocation(feature.getLocation()); boolean forwardStrand = exonStart <= exonEnd; sf.setStrand(forwardStrand ? "+" : "-"); sf.setPhase(String.valueOf(codonStart - 1)); sf.setValue(FeatureProperties.EXONPOS, exonNumber); sf.setValue(FeatureProperties.EXONPRODUCT, proteinName); dna.addSequenceFeature(sf); } } /* * add feature dbRefs to sequence, and mappings for Uniprot xrefs */ boolean hasUniprotDbref = false; List xrefs = feature.getXref(); if (xrefs != null) { boolean mappingUsed = false; for (XrefType xref : xrefs) { /* * ensure UniProtKB/Swiss-Prot converted to UNIPROT */ String source = DBRefUtils.getCanonicalName(xref.getDb()); String version = xref.getSecondaryId(); if (version == null || "".equals(version)) { version = "0"; } DBRefEntry dbref = new DBRefEntry(source, version, xref.getId()); DBRefEntry proteinDbRef = new DBRefEntry(source, version, dbref.getAccessionId()); if (source.equals(DBRefSource.UNIPROT)) { String proteinSeqName = DBRefSource.UNIPROT + "|" + dbref.getAccessionId(); if (dnaToProteinMapping != null && dnaToProteinMapping.getTo() != null) { if (mappingUsed) { /* * two or more Uniprot xrefs for the same CDS - * each needs a distinct Mapping (as to a different sequence) */ dnaToProteinMapping = new Mapping(dnaToProteinMapping); } mappingUsed = true; /* * try to locate the protein mapped to (possibly by a * previous CDS feature); if not found, construct it from * the EMBL translation */ SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName); if (proteinSeq == null) { proteinSeq = new Sequence(proteinSeqName, product.getSequenceAsString()); matcher.add(proteinSeq); proteinSeq.setDescription(product.getDescription()); peptides.add(proteinSeq); } dnaToProteinMapping.setTo(proteinSeq); dnaToProteinMapping.setMappedFromId(proteinId); proteinSeq.addDBRef(proteinDbRef); dbref.setMap(dnaToProteinMapping); } hasUniprotDbref = true; } if (product != null) { /* * copy feature dbref to our protein product */ DBRefEntry pref = proteinDbRef; pref.setMap(null); // reference is direct product.addDBRef(pref); // Add converse mapping reference if (dnaToProteinMapping != null) { Mapping pmap = new Mapping(dna, dnaToProteinMapping.getMap().getInverse()); pref = new DBRefEntry(sourceDb, sequenceVersion, accession); pref.setMap(pmap); if (dnaToProteinMapping.getTo() != null) { dnaToProteinMapping.getTo().addDBRef(pref); } } } dna.addDBRef(dbref); } } /* * if we have a product (translation) but no explicit Uniprot dbref * (example: EMBL AAFI02000057 protein_id EAL65544.1) * then construct mappings to an assumed EMBLCDSPROTEIN accession */ if (!hasUniprotDbref && product != null) { if (proteinToEmblProteinRef == null) { // assuming CDSPROTEIN sequence version = dna version (?!) proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct, sequenceVersion, proteinId); } product.addDBRef(proteinToEmblProteinRef); if (dnaToProteinMapping != null && dnaToProteinMapping.getTo() != null) { DBRefEntry dnaToEmblProteinRef = new DBRefEntry( DBRefSource.EMBLCDSProduct, sequenceVersion, proteinId); dnaToEmblProteinRef.setMap(dnaToProteinMapping); dnaToProteinMapping.setMappedFromId(proteinId); dna.addDBRef(dnaToEmblProteinRef); } } } @Override public boolean isDnaCoding() { return true; } /** * Returns the CDS positions as a single array of [start, end, start, end...] * positions. If on the reverse strand, these will be in descending order. * * @param accession * @param feature * @return */ protected int[] getCdsRanges(String accession, Feature feature) { String location = feature.getLocation(); if (location == null) { return new int[] {}; } try { List ranges = DnaUtils.parseLocation(location); return listToArray(ranges); } catch (ParseException e) { Cache.log.warn( String.format("Not parsing inexact CDS location %s in ENA %s", location, accession)); return new int[] {}; } } /** * Converts a list of [start, end] ranges to a single array of [start, end, * start, end ...] * * @param ranges * @return */ int[] listToArray(List ranges) { int[] result = new int[ranges.size() * 2]; int i = 0; for (int[] range : ranges) { result[i++] = range[0]; result[i++] = range[1]; } return result; } /** * Helper method to construct a SequenceFeature for one cds range * * @param type * feature type ("CDS") * @param desc * description * @param begin * start position * @param end * end position * @param group * feature group * @param vals * map of 'miscellaneous values' for feature * @return */ protected SequenceFeature makeCdsFeature(String type, String desc, int begin, int end, String group, Map vals) { SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group); if (!vals.isEmpty()) { for (Entry val : vals.entrySet()) { sf.setValue(val.getKey(), val.getValue()); } } return sf; } @Override public String getAccessionSeparator() { return null; } @Override public Regex getAccessionValidator() { return ACCESSION_REGEX; } @Override public String getDbVersion() { return "0"; } @Override public int getTier() { return 0; } @Override public boolean isValidReference(String accession) { if (accession == null || accession.length() < 2) { return false; } return getAccessionValidator().search(accession); } /** * Truncates (if necessary) the exon intervals to match 3 times the length of * the protein; also accepts 3 bases longer (for stop codon not included in * protein) * * @param proteinLength * @param exon * an array of [start, end, start, end...] intervals * @return the same array (if unchanged) or a truncated copy */ static int[] adjustForProteinLength(int proteinLength, int[] exon) { if (proteinLength <= 0 || exon == null) { return exon; } int expectedCdsLength = proteinLength * 3; int exonLength = MappingUtils.getLength(Arrays.asList(exon)); /* * if exon length matches protein, or is shorter, or longer by the * length of a stop codon (3 bases), then leave it unchanged */ if (expectedCdsLength >= exonLength || expectedCdsLength == exonLength - 3) { return exon; } int origxon[]; int sxpos = -1; int endxon = 0; origxon = new int[exon.length]; System.arraycopy(exon, 0, origxon, 0, exon.length); int cdspos = 0; for (int x = 0; x < exon.length; x += 2) { cdspos += Math.abs(exon[x + 1] - exon[x]) + 1; if (expectedCdsLength <= cdspos) { // advanced beyond last codon. sxpos = x; if (expectedCdsLength != cdspos) { // System.err // .println("Truncating final exon interval on region by " // + (cdspos - cdslength)); } /* * shrink the final exon - reduce end position if forward * strand, increase it if reverse */ if (exon[x + 1] >= exon[x]) { endxon = exon[x + 1] - cdspos + expectedCdsLength; } else { endxon = exon[x + 1] + cdspos - expectedCdsLength; } break; } } if (sxpos != -1) { // and trim the exon interval set if necessary int[] nxon = new int[sxpos + 2]; System.arraycopy(exon, 0, nxon, 0, sxpos + 2); nxon[sxpos + 1] = endxon; // update the end boundary for the new exon // set exon = nxon; } return exon; } }