X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fws%2Fdbsources%2FEmblXmlSource.java;h=8f55080532694eb96e52b2b7c8bd9e178807698c;hb=7e82e1fed011077e5cd4cc40ac8ad3519d7c47a8;hp=ec5c62dc1ba3d0c655f86096ba278a332dca1253;hpb=aced09c4feeaf3406269442c14e54abeeb4cad81;p=jalview.git diff --git a/src/jalview/ws/dbsources/EmblXmlSource.java b/src/jalview/ws/dbsources/EmblXmlSource.java index ec5c62d..8f55080 100644 --- a/src/jalview/ws/dbsources/EmblXmlSource.java +++ b/src/jalview/ws/dbsources/EmblXmlSource.java @@ -1,6 +1,6 @@ /* - * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.2) - * Copyright (C) 2014 The Jalview Authors + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors * * This file is part of Jalview. * @@ -20,23 +20,55 @@ */ package jalview.ws.dbsources; +import jalview.analysis.SequenceIdMatcher; +import jalview.bin.Cache; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; +import jalview.datamodel.DBRefEntry; +import jalview.datamodel.DBRefSource; +import jalview.datamodel.FeatureProperties; +import jalview.datamodel.Mapping; +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; -import jalview.datamodel.xdb.embl.EmblEntry; +import jalview.util.DBRefUtils; +import jalview.util.DnaUtils; +import jalview.util.MapList; +import jalview.util.MappingUtils; import jalview.util.MessageManager; import jalview.ws.ebi.EBIFetchClient; +import jalview.xml.binding.embl.EntryType; +import jalview.xml.binding.embl.EntryType.Feature; +import jalview.xml.binding.embl.EntryType.Feature.Qualifier; +import jalview.xml.binding.embl.XrefType; import java.io.File; -import java.util.Iterator; +import java.io.FileInputStream; +import java.io.InputStream; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Hashtable; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.regex.Pattern; + +import javax.xml.bind.JAXBContext; +import javax.xml.bind.JAXBException; +import javax.xml.stream.FactoryConfigurationError; +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.XMLStreamReader; public abstract class EmblXmlSource extends EbiFileRetrievedProxy { - - /** - * Last properly parsed embl file. + /* + * JAL-1856 Embl returns this text for query not found */ - public jalview.datamodel.xdb.embl.EmblFile efile = null; + private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found."; + + private static final Pattern SPACE_PATTERN = Pattern.compile(" "); public EmblXmlSource() { @@ -44,11 +76,11 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy } /** - * retrieve and parse an emblxml file + * Retrieves and parses an emblxml file, and returns an alignment containing + * the parsed sequences, or null if none were found * * @param emprefx - * either EMBL or EMBLCDS strings are allowed - anything else will - * not retrieve emblxml + * "EMBL" or "EMBLCDS" - anything else will not retrieve emblxml * @param query * @return * @throws Exception @@ -62,11 +94,14 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy try { reply = dbFetch.fetchDataAsFile( - emprefx.toLowerCase() + ":" + query.trim(), "emblxml", null); + emprefx.toLowerCase() + ":" + query.trim(), "display=xml", + "xml"); } catch (Exception e) { stopQuery(); - throw new Exception(MessageManager.formatMessage("exception.ebiembl_retrieval_failed_on", new String[]{emprefx.toLowerCase(),query.trim()}), e); + throw new Exception(MessageManager.formatMessage( + "exception.ebiembl_retrieval_failed_on", new String[] + { emprefx.toLowerCase(), query.trim() }), e); } return getEmblSequenceRecords(emprefx, query, reply); } @@ -86,77 +121,672 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy public AlignmentI getEmblSequenceRecords(String emprefx, String query, File reply) throws Exception { - SequenceI seqs[] = null; - StringBuffer result = new StringBuffer(); + List entries = null; if (reply != null && reply.exists()) { - efile = null; file = reply.getAbsolutePath(); - if (reply.length() > 25) + if (reply.length() > EMBL_NOT_FOUND_REPLY.length()) { - efile = jalview.datamodel.xdb.embl.EmblFile.getEmblFile(reply); + InputStream is = new FileInputStream(reply); + entries = getEmblEntries(is); + } + } + + /* + * invalid accession gets a reply with no elements, text content of + * EmbFile reads something like (e.g.) this ungrammatical phrase + * Entry: display type is either not supported or entry is not found. + */ + AlignmentI al = null; + List seqs = new ArrayList<>(); + List peptides = new ArrayList<>(); + if (entries != null) + { + for (EntryType entry : entries) + { + SequenceI seq = getSequence(emprefx, entry, peptides); + if (seq != null) + { + seqs.add(seq.deriveSequence()); + // place DBReferences on dataset and refer + } + } + if (!seqs.isEmpty()) + { + al = new Alignment(seqs.toArray(new SequenceI[seqs.size()])); } else { - result.append(MessageManager.formatMessage("label.no_embl_record_found", new String[]{emprefx.toLowerCase(),query.trim()})); + System.out.println( + "No record found for '" + emprefx + ":" + query + "'"); + } + } + + stopQuery(); + return al; + } + + /** + * Reads the XML reply from file and unmarshals it to Java objects. Answers a + * (possibly empty) list of EntryType objects. + * + * is + * + * @return + */ + List getEmblEntries(InputStream is) + { + List entries = new ArrayList<>(); + try + { + JAXBContext jc = JAXBContext.newInstance("jalview.xml.binding.embl"); + XMLStreamReader streamReader = XMLInputFactory.newInstance() + .createXMLStreamReader(is); + javax.xml.bind.Unmarshaller um = jc.createUnmarshaller(); + jalview.xml.binding.embl.ROOT root = (jalview.xml.binding.embl.ROOT) um + .unmarshal(streamReader); + + /* + * document root contains either "entry" or "entrySet" + */ + if (root == null) + { + return entries; + } + if (root.getEntrySet() != null) + { + entries = root.getEntrySet().getEntry(); + } + else if (root.getEntry() != null) + { + entries.add(root.getEntry()); + } + } catch (JAXBException | XMLStreamException + | FactoryConfigurationError e) + { + e.printStackTrace(); + } + return entries; + } + + /** + * A helper method to parse XML data and construct a sequence, with any + * available database references and features + * + * @param emprefx + * @param entry + * @param peptides + * @return + */ + SequenceI getSequence(String sourceDb, EntryType entry, + List peptides) + { + String seqString = entry.getSequence(); + if (seqString == null) + { + return null; + } + seqString = seqString.replace(" ", "").replace("\n", "").replace("\t", + ""); + String accession = entry.getAccession(); + SequenceI dna = new Sequence(sourceDb + "|" + accession, seqString); + + dna.setDescription(entry.getDescription()); + String sequenceVersion = String.valueOf(entry.getVersion().intValue()); + DBRefEntry selfRref = new DBRefEntry(sourceDb, sequenceVersion, + accession); + dna.addDBRef(selfRref); + selfRref.setMap( + new Mapping(null, new int[] + { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1, + 1)); + + /* + * add db references + */ + List dbRefs = entry.getXref(); + if (dbRefs != null) + { + for (XrefType dbref : dbRefs) + { + String acc = dbref.getId(); + String source = DBRefUtils.getCanonicalName(dbref.getDb()); + String version = dbref.getSecondaryId(); + dna.addDBRef(new DBRefEntry(source, version, acc)); } } - if (efile != null) + + SequenceIdMatcher matcher = new SequenceIdMatcher(peptides); + try + { + List features = entry.getFeature(); + if (features != null) + { + for (Feature feature : features) + { + if (FeatureProperties.isCodingFeature(sourceDb, + feature.getName())) + { + parseCodingFeature(entry, feature, sourceDb, dna, peptides, + matcher); + } + } + } + } catch (Exception e) + { + System.err.println("EMBL Record Features parsing error!"); + System.err + .println("Please report the following to help@jalview.org :"); + System.err.println("EMBL Record " + accession); + System.err.println("Resulted in exception: " + e.getMessage()); + e.printStackTrace(System.err); + } + + return dna; + } + + /** + * Extracts coding region and product from a CDS feature and decorates it with + * annotations + * + * @param entry + * @param feature + * @param sourceDb + * @param dna + * @param peptides + * @param matcher + */ + void parseCodingFeature(EntryType entry, Feature feature, String sourceDb, + SequenceI dna, List peptides, + SequenceIdMatcher matcher) + { + final boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS); + final String accession = entry.getAccession(); + final String sequenceVersion = entry.getVersion().toString(); + + int[] exons = getCdsRanges(entry.getAccession(), feature); + + String translation = null; + String proteinName = ""; + String proteinId = null; + Map vals = new Hashtable<>(); + + /* + * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS + * (phase is required for CDS features in GFF3 format) + */ + int codonStart = 1; + + /* + * parse qualifiers, saving protein translation, protein id, + * codon start position, product (name), and 'other values' + */ + if (feature.getQualifier() != null) { - for (Iterator i = efile.getEntries().iterator(); i.hasNext();) + for (Qualifier q : feature.getQualifier()) { - EmblEntry entry = (EmblEntry) i.next(); - SequenceI[] seqparts = entry.getSequences(false, true, emprefx); // TODO: - // use - // !fetchNa,!fetchPeptide - // here - // instead - // - - // see - // todo - // in - // emblEntry - if (seqparts != null) + String qname = q.getName(); + String value = q.getValue(); + value = value == null ? "" + : value.trim().replace(" ", "").replace("\n", "") + .replace("\t", ""); + if (qname.equals("translation")) + { + translation = value; + } + else if (qname.equals("protein_id")) { - SequenceI[] newseqs = null; - int si = 0; - if (seqs == null) + proteinId = value; + } + else if (qname.equals("codon_start")) + { + try + { + codonStart = Integer.parseInt(value.trim()); + } catch (NumberFormatException e) { - newseqs = new SequenceI[seqparts.length]; + System.err.println("Invalid codon_start in XML for " + + entry.getAccession() + ": " + e.getMessage()); } - else + } + else if (qname.equals("product")) + { + // sometimes name is returned e.g. for V00488 + proteinName = value; + } + else + { + // throw anything else into the additional properties hash + if (!"".equals(value)) { - newseqs = new SequenceI[seqs.length + seqparts.length]; + vals.put(qname, value); + } + } + } + } + + DBRefEntry proteinToEmblProteinRef = null; + exons = MappingUtils.removeStartPositions(codonStart - 1, exons); + + SequenceI product = null; + Mapping dnaToProteinMapping = null; + if (translation != null && proteinName != null && proteinId != null) + { + int translationLength = translation.length(); + + /* + * look for product in peptides list, if not found, add it + */ + product = matcher.findIdMatch(proteinId); + if (product == null) + { + product = new Sequence(proteinId, translation, 1, + translationLength); + product.setDescription(((proteinName.length() == 0) + ? "Protein Product from " + sourceDb + : proteinName)); + peptides.add(product); + matcher.add(product); + } - for (; si < seqs.length; si++) + // we have everything - create the mapping and perhaps the protein + // sequence + if (exons == null || exons.length == 0) + { + /* + * workaround until we handle dna location for CDS sequence + * e.g. location="X53828.1:60..1058" correctly + */ + System.err.println( + "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect (" + + sourceDb + ":" + entry.getAccession() + ")"); + int dnaLength = dna.getLength(); + if (translationLength * 3 == (1 - codonStart + dnaLength)) + { + System.err.println( + "Not allowing for additional stop codon at end of cDNA fragment... !"); + // this might occur for CDS sequences where no features are marked + exons = new int[] { dna.getStart() + (codonStart - 1), + dna.getEnd() }; + dnaToProteinMapping = new Mapping(product, exons, + new int[] + { 1, translationLength }, 3, 1); + } + if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength)) + { + System.err.println( + "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!"); + exons = new int[] { dna.getStart() + (codonStart - 1), + dna.getEnd() - 3 }; + dnaToProteinMapping = new Mapping(product, exons, + new int[] + { 1, translationLength }, 3, 1); + } + } + else + { + // Trim the exon mapping if necessary - the given product may only be a + // fragment of a larger protein. (EMBL:AY043181 is an example) + + if (isEmblCdna) + { + // TODO: Add a DbRef back to the parent EMBL sequence with the exon + // map + // if given a dataset reference, search dataset for parent EMBL + // sequence if it exists and set its map + // make a new feature annotating the coding contig + } + else + { + // final product length truncation check + int[] cdsRanges = adjustForProteinLength(translationLength, + exons); + dnaToProteinMapping = new Mapping(product, cdsRanges, + new int[] + { 1, translationLength }, 3, 1); + if (product != null) + { + /* + * make xref with mapping from protein to EMBL dna + */ + DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL, + sequenceVersion, proteinId, + new Mapping(dnaToProteinMapping.getMap().getInverse())); + product.addDBRef(proteinToEmblRef); + + /* + * make xref from protein to EMBLCDS; we assume here that the + * CDS sequence version is same as dna sequence (?!) + */ + MapList proteinToCdsMapList = new MapList( + new int[] + { 1, translationLength }, + new int[] + { 1 + (codonStart - 1), + (codonStart - 1) + 3 * translationLength }, + 1, 3); + DBRefEntry proteinToEmblCdsRef = new DBRefEntry( + DBRefSource.EMBLCDS, sequenceVersion, proteinId, + new Mapping(proteinToCdsMapList)); + product.addDBRef(proteinToEmblCdsRef); + + /* + * make 'direct' xref from protein to EMBLCDSPROTEIN + */ + proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef); + proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct); + proteinToEmblProteinRef.setMap(null); + product.addDBRef(proteinToEmblProteinRef); + } + } + } + + /* + * add cds features to dna sequence + */ + String cds = feature.getName(); // "CDS" + for (int xint = 0; exons != null + && xint < exons.length - 1; xint += 2) + { + int exonStart = exons[xint]; + int exonEnd = exons[xint + 1]; + int begin = Math.min(exonStart, exonEnd); + int end = Math.max(exonStart, exonEnd); + int exonNumber = xint / 2 + 1; + String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s", + exonNumber, proteinName, proteinId); + + SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, sourceDb, + vals); + + sf.setEnaLocation(feature.getLocation()); + boolean forwardStrand = exonStart <= exonEnd; + sf.setStrand(forwardStrand ? "+" : "-"); + sf.setPhase(String.valueOf(codonStart - 1)); + sf.setValue(FeatureProperties.EXONPOS, exonNumber); + sf.setValue(FeatureProperties.EXONPRODUCT, proteinName); + + dna.addSequenceFeature(sf); + } + } + + /* + * add feature dbRefs to sequence, and mappings for Uniprot xrefs + */ + boolean hasUniprotDbref = false; + List xrefs = feature.getXref(); + if (xrefs != null) + { + boolean mappingUsed = false; + for (XrefType xref : xrefs) + { + /* + * ensure UniProtKB/Swiss-Prot converted to UNIPROT + */ + String source = DBRefUtils.getCanonicalName(xref.getDb()); + DBRefEntry dbref = new DBRefEntry(source, xref.getSecondaryId(), + xref.getId()); + DBRefEntry proteinDbRef = new DBRefEntry(dbref.getSource(), + dbref.getVersion(), dbref.getAccessionId()); + if (source.equals(DBRefSource.UNIPROT)) + { + String proteinSeqName = DBRefSource.UNIPROT + "|" + + dbref.getAccessionId(); + if (dnaToProteinMapping != null + && dnaToProteinMapping.getTo() != null) + { + if (mappingUsed) + { + /* + * two or more Uniprot xrefs for the same CDS - + * each needs a distinct Mapping (as to a different sequence) + */ + dnaToProteinMapping = new Mapping(dnaToProteinMapping); + } + mappingUsed = true; + + /* + * try to locate the protein mapped to (possibly by a + * previous CDS feature); if not found, construct it from + * the EMBL translation + */ + SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName); + if (proteinSeq == null) { - newseqs[si] = seqs[si]; - seqs[si] = null; + proteinSeq = new Sequence(proteinSeqName, + product.getSequenceAsString()); + matcher.add(proteinSeq); + peptides.add(proteinSeq); } + dnaToProteinMapping.setTo(proteinSeq); + dnaToProteinMapping.setMappedFromId(proteinId); + proteinSeq.addDBRef(proteinDbRef); + dbref.setMap(dnaToProteinMapping); } - for (int j = 0; j < seqparts.length; si++, j++) + hasUniprotDbref = true; + } + if (product != null) + { + /* + * copy feature dbref to our protein product + */ + DBRefEntry pref = proteinDbRef; + pref.setMap(null); // reference is direct + product.addDBRef(pref); + // Add converse mapping reference + if (dnaToProteinMapping != null) { - newseqs[si] = seqparts[j].deriveSequence(); // place DBReferences on - // dataset and refer + Mapping pmap = new Mapping(dna, + dnaToProteinMapping.getMap().getInverse()); + pref = new DBRefEntry(sourceDb, sequenceVersion, accession); + pref.setMap(pmap); + if (dnaToProteinMapping.getTo() != null) + { + dnaToProteinMapping.getTo().addDBRef(pref); + } } - seqs = newseqs; + } + dna.addDBRef(dbref); + } + } + /* + * if we have a product (translation) but no explicit Uniprot dbref + * (example: EMBL AAFI02000057 protein_id EAL65544.1) + * then construct mappings to an assumed EMBLCDSPROTEIN accession + */ + if (!hasUniprotDbref && product != null) + { + if (proteinToEmblProteinRef == null) + { + // assuming CDSPROTEIN sequence version = dna version (?!) + proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct, + sequenceVersion, proteinId); + } + product.addDBRef(proteinToEmblProteinRef); + + if (dnaToProteinMapping != null + && dnaToProteinMapping.getTo() != null) + { + DBRefEntry dnaToEmblProteinRef = new DBRefEntry( + DBRefSource.EMBLCDSProduct, sequenceVersion, + proteinId); + dnaToEmblProteinRef.setMap(dnaToProteinMapping); + dnaToProteinMapping.setMappedFromId(proteinId); + dna.addDBRef(dnaToEmblProteinRef); + } + } + } + + @Override + public boolean isDnaCoding() + { + return true; + } + + /** + * Returns the CDS positions as a single array of [start, end, start, end...] + * positions. If on the reverse strand, these will be in descending order. + * + * @param accession + * @param feature + * @return + */ + protected int[] getCdsRanges(String accession, Feature feature) + { + String location = feature.getLocation(); + if (location == null) + { + return new int[] {}; + } + + try + { + List ranges = DnaUtils.parseLocation(location); + return listToArray(ranges); + } catch (ParseException e) + { + Cache.log.warn( + String.format("Not parsing inexact CDS location %s in ENA %s", + location, accession)); + return new int[] {}; + } + } + + /** + * Converts a list of [start, end] ranges to a single array of [start, end, + * start, end ...] + * + * @param ranges + * @return + */ + int[] listToArray(List ranges) + { + int[] result = new int[ranges.size() * 2]; + int i = 0; + for (int[] range : ranges) + { + result[i++] = range[0]; + result[i++] = range[1]; + } + return result; + } + + /** + * Helper method to construct a SequenceFeature for one cds range + * + * @param type + * feature type ("CDS") + * @param desc + * description + * @param begin + * start position + * @param end + * end position + * @param group + * feature group + * @param vals + * map of 'miscellaneous values' for feature + * @return + */ + protected SequenceFeature makeCdsFeature(String type, String desc, + int begin, int end, String group, Map vals) + { + SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group); + if (!vals.isEmpty()) + { + StringBuilder sb = new StringBuilder(); + boolean first = true; + for (Entry val : vals.entrySet()) + { + if (!first) + { + sb.append(";"); } + sb.append(val.getKey()).append("=").append(val.getValue()); + first = false; + sf.setValue(val.getKey(), val.getValue()); } + sf.setAttributes(sb.toString()); } - else + return sf; + } + + /** + * Truncates (if necessary) the exon intervals to match 3 times the length of + * the protein; also accepts 3 bases longer (for stop codon not included in + * protein) + * + * @param proteinLength + * @param exon + * an array of [start, end, start, end...] intervals + * @return the same array (if unchanged) or a truncated copy + */ + static int[] adjustForProteinLength(int proteinLength, int[] exon) + { + if (proteinLength <= 0 || exon == null) { - result = null; + return exon; } - AlignmentI al = null; - if (seqs != null && seqs.length > 0) + int expectedCdsLength = proteinLength * 3; + int exonLength = MappingUtils.getLength(Arrays.asList(exon)); + + /* + * if exon length matches protein, or is shorter, or longer by the + * length of a stop codon (3 bases), then leave it unchanged + */ + if (expectedCdsLength >= exonLength + || expectedCdsLength == exonLength - 3) { - al = new Alignment(seqs); - result.append(MessageManager.formatMessage("label.embl_successfully_parsed", new String[]{emprefx})); - results = result; + return exon; } - stopQuery(); - return al; + + int origxon[]; + int sxpos = -1; + int endxon = 0; + origxon = new int[exon.length]; + System.arraycopy(exon, 0, origxon, 0, exon.length); + int cdspos = 0; + for (int x = 0; x < exon.length; x += 2) + { + cdspos += Math.abs(exon[x + 1] - exon[x]) + 1; + if (expectedCdsLength <= cdspos) + { + // advanced beyond last codon. + sxpos = x; + if (expectedCdsLength != cdspos) + { + // System.err + // .println("Truncating final exon interval on region by " + // + (cdspos - cdslength)); + } + + /* + * shrink the final exon - reduce end position if forward + * strand, increase it if reverse + */ + if (exon[x + 1] >= exon[x]) + { + endxon = exon[x + 1] - cdspos + expectedCdsLength; + } + else + { + endxon = exon[x + 1] + cdspos - expectedCdsLength; + } + break; + } + } + + if (sxpos != -1) + { + // and trim the exon interval set if necessary + int[] nxon = new int[sxpos + 2]; + System.arraycopy(exon, 0, nxon, 0, sxpos + 2); + nxon[sxpos + 1] = endxon; // update the end boundary for the new exon + // set + exon = nxon; + } + return exon; } }