X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fws%2Fdbsources%2FUniprot.java;h=69ee379c6a0b5144884860703d962a2520662757;hb=cdc539ef6f32d74747ab857831268dd5954b28f5;hp=0d38ca66c766a760287ef82b1ebaed8867cad757;hpb=17e77c3f2949a0729322b4a8d907f3f34b6a9914;p=jalview.git diff --git a/src/jalview/ws/dbsources/Uniprot.java b/src/jalview/ws/dbsources/Uniprot.java index 0d38ca6..69ee379 100644 --- a/src/jalview/ws/dbsources/Uniprot.java +++ b/src/jalview/ws/dbsources/Uniprot.java @@ -1,6 +1,6 @@ /* - * Jalview - A Sequence Alignment Editor and Viewer (Version 2.9) - * Copyright (C) 2015 The Jalview Authors + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors * * This file is part of Jalview. * @@ -20,49 +20,69 @@ */ package jalview.ws.dbsources; +import java.io.InputStream; +import java.net.HttpURLConnection; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import java.util.Vector; + +import javax.xml.bind.JAXBContext; +import javax.xml.bind.JAXBElement; +import javax.xml.bind.JAXBException; +import javax.xml.stream.FactoryConfigurationError; +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.XMLStreamReader; + +import com.stevesoft.pat.Regex; + +import jalview.bin.Cache; +import jalview.bin.Console; +import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; import jalview.datamodel.PDBEntry; +import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; -import jalview.datamodel.UniprotEntry; -import jalview.datamodel.UniprotFile; -import jalview.ws.ebi.EBIFetchClient; -import jalview.ws.seqfetcher.DbSourceProxy; +import jalview.schemes.ResidueProperties; +import jalview.util.StringUtils; import jalview.ws.seqfetcher.DbSourceProxyImpl; - -import java.io.File; -import java.io.FileReader; -import java.io.Reader; -import java.util.Vector; - -import org.exolab.castor.xml.Unmarshaller; - -import com.stevesoft.pat.Regex; +import jalview.xml.binding.uniprot.DbReferenceType; +import jalview.xml.binding.uniprot.Entry; +import jalview.xml.binding.uniprot.FeatureType; +import jalview.xml.binding.uniprot.LocationType; +import jalview.xml.binding.uniprot.PositionType; +import jalview.xml.binding.uniprot.PropertyType; /** + * This class queries the Uniprot database for sequence data, unmarshals the + * returned XML, and converts it to Jalview Sequence records (including attached + * database references and sequence features) + * * @author JimP * */ -public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy +public class Uniprot extends DbSourceProxyImpl { + private static final String DEFAULT_UNIPROT_DOMAIN = "https://www.uniprot.org"; private static final String BAR_DELIMITER = "|"; - private static final String NEWLINE = "\n"; - - private static org.exolab.castor.mapping.Mapping map; - /** * Constructor */ public Uniprot() { super(); - addDbSourceProperty(DBRefSource.SEQDB, DBRefSource.SEQDB); - addDbSourceProperty(DBRefSource.PROTSEQDB); - // addDbSourceProperty(DBRefSource.MULTIACC, new Integer(50)); + } + + private String getDomain() + { + return Cache.getDefault("UNIPROT_DOMAIN", DEFAULT_UNIPROT_DOMAIN); } /* @@ -70,9 +90,10 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#getAccessionSeparator() */ + @Override public String getAccessionSeparator() { - return null; // ";"; + return null; } /* @@ -80,6 +101,7 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#getAccessionValidator() */ + @Override public Regex getAccessionValidator() { return new Regex("([A-Z]+[0-9]+[A-Z0-9]+|[A-Z0-9]+_[A-Z0-9]+)"); @@ -90,6 +112,7 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#getDbSource() */ + @Override public String getDbSource() { return DBRefSource.UNIPROT; @@ -100,201 +123,402 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#getDbVersion() */ + @Override public String getDbVersion() { return "0"; // we really don't know what version we're on. } - /** - * Reads a file containing the reply to the EBI Fetch Uniprot data query, - * unmarshals it to a UniprotFile object, and returns the list of UniprotEntry - * data models (mapped from <entry> elements) - * - * @param fileReader - * @return - */ - public Vector getUniprotEntries(Reader fileReader) - { - UniprotFile uni = new UniprotFile(); - try - { - if (map == null) - { - // 1. Load the mapping information from the file - map = new org.exolab.castor.mapping.Mapping(uni.getClass() - .getClassLoader()); - java.net.URL url = getClass().getResource("/uniprot_mapping.xml"); - map.loadMapping(url); - } - - // 2. Unmarshal the data - Unmarshaller unmar = new Unmarshaller(uni); - unmar.setIgnoreExtraElements(true); - unmar.setMapping(map); - if (fileReader != null) - { - uni = (UniprotFile) unmar.unmarshal(fileReader); - } - } catch (Exception e) - { - System.out.println("Error getUniprotEntries() " + e); - } - - return uni.getUniprotEntries(); - } - /* * (non-Javadoc) * * @see jalview.ws.DbSourceProxy#getSequenceRecords(java.lang.String[]) */ + @Override public AlignmentI getSequenceRecords(String queries) throws Exception { startQuery(); try { - queries = queries.toUpperCase().replaceAll( + queries = queries.toUpperCase(Locale.ROOT).replaceAll( "(UNIPROT\\|?|UNIPROT_|UNIREF\\d+_|UNIREF\\d+\\|?)", ""); AlignmentI al = null; - EBIFetchClient ebi = new EBIFetchClient(); - // uniprotxml parameter required since december 2007 - // uniprotkb dbname changed introduced december 2008 - File file = ebi.fetchDataAsFile("uniprotkb:" + queries, "uniprotxml", - null); - Vector entries = getUniprotEntries(new FileReader(file)); - - if (entries != null) - { - /* - * If Castor binding included sequence@length, we could guesstimate the - * size of buffer to hold the alignment - */ - StringBuffer result = new StringBuffer(128); - // First, make the new sequences - for (UniprotEntry entry : entries) - { - StringBuilder name = constructSequenceFastaHeader(entry); - result.append(name).append(NEWLINE) - .append(entry.getUniprotSequence().getContent()) - .append(NEWLINE); - } + String downloadstring = getDomain() + "/uniprot/" + queries + ".xml"; - // Then read in the features and apply them to the dataset - al = parseResult(result.toString()); - if (al != null) - { - // Decorate the alignment with database entries. - addUniprotXrefs(al, entries); - } - else + URL url = new URL(downloadstring); + HttpURLConnection urlconn = (HttpURLConnection) url.openConnection(); + // anything other than 200 means we don't have data + // TODO: JAL-3882 reuse the EnsemblRestClient's fair + // use/backoff logic to retry when the server tells us to go away + if (urlconn.getResponseCode() == 200) + { + InputStream istr = urlconn.getInputStream(); + List entries = getUniprotEntries(istr); + if (entries != null) { - results = result; + List seqs = new ArrayList<>(); + for (Entry entry : entries) + { + seqs.add(uniprotEntryToSequence(entry)); + } + al = new Alignment(seqs.toArray(new SequenceI[seqs.size()])); } } stopQuery(); return al; + } catch (Exception e) { - stopQuery(); throw (e); + } finally + { + stopQuery(); } } /** - * Construct a Fasta-format sequence header by concatenating the source, - * accession id(s) and name(s), delimited by '|', plus any protein names, now - * with space rather than bar delimiter + * Converts an Entry object (bound from Uniprot XML) to a Jalview Sequence * * @param entry * @return */ - public static StringBuilder constructSequenceFastaHeader( - UniprotEntry entry) + SequenceI uniprotEntryToSequence(Entry entry) { - StringBuilder name = new StringBuilder(32); - name.append(">UniProt/Swiss-Prot"); + String id = getUniprotEntryId(entry); + /* + * Sequence should not include any whitespace, but JAXB leaves these in + */ + String seqString = entry.getSequence().getValue().replaceAll("\\s*", + ""); + + SequenceI sequence = new Sequence(id, seqString); + sequence.setDescription(getUniprotEntryDescription(entry)); + final String uniprotRecordVersion = "" + entry.getVersion(); + /* + * add a 'self' DBRefEntry for each accession + */ + final String dbVersion = getDbVersion(); + List dbRefs = new ArrayList<>(); + boolean canonical = true; for (String accessionId : entry.getAccession()) { - name.append(BAR_DELIMITER); - name.append(accessionId); + DBRefEntry dbRef = new DBRefEntry(DBRefSource.UNIPROT, + uniprotRecordVersion, accessionId, null, canonical); + canonical = false; + dbRefs.add(dbRef); } - for (String n : entry.getName()) + + /* + * add a DBRefEntry for each dbReference element in the XML; + * also add a PDBEntry if type="PDB"; + * also add an EMBLCDS dbref if protein sequence id is given + * also add an Ensembl dbref " " " " " " + */ + Vector pdbRefs = new Vector<>(); + for (DbReferenceType dbref : entry.getDbReference()) { - name.append(BAR_DELIMITER); - name.append(n); + String type = dbref.getType(); + DBRefEntry dbr = new DBRefEntry(type, + DBRefSource.UNIPROT + ":" + dbVersion, dbref.getId()); + dbRefs.add(dbr); + if ("PDB".equals(type)) + { + pdbRefs.add(new PDBEntry(dbr)); + } + if ("EMBL".equals(type)) + { + /* + * e.g. Uniprot accession Q9BXM7 has + * + * + * + * + */ + String cdsId = getProperty(dbref.getProperty(), + "protein sequence ID"); + if (cdsId != null && cdsId.trim().length() > 0) + { + // remove version + String[] vrs = cdsId.split("\\."); + String version = vrs.length > 1 ? vrs[1] + : DBRefSource.UNIPROT + ":" + uniprotRecordVersion; + dbr = new DBRefEntry(DBRefSource.EMBLCDS, version, vrs[0]); + // TODO: process VARIANT features to allow EMBLCDS record's product to + // match Uniprot + dbr.setCanonical(true); + dbRefs.add(dbr); + } + } + if (type != null + && type.toLowerCase(Locale.ROOT).startsWith("ensembl")) + { + // remove version + String[] vrs = dbref.getId().split("\\."); + String version = vrs.length > 1 ? vrs[1] + : DBRefSource.UNIPROT + ":" + uniprotRecordVersion; + dbr.setAccessionId(vrs[0]); + dbr.setVersion(version); + /* + * e.g. Uniprot accession Q9BXM7 has + * + * + * + * + * + */ + String cdsId = getProperty(dbref.getProperty(), + "protein sequence ID"); + if (cdsId != null && cdsId.trim().length() > 0) + { + // remove version + String[] cdsVrs = cdsId.split("\\."); + String cdsVersion = cdsVrs.length > 1 ? cdsVrs[1] + : DBRefSource.UNIPROT + ":" + uniprotRecordVersion; + dbr = new DBRefEntry(DBRefSource.ENSEMBL, + DBRefSource.UNIPROT + ":" + cdsVersion, cdsVrs[0]); + dbRefs.add(dbr); + } + } } - if (entry.getProtein() != null && entry.getProtein().getName() != null) + /* + * create features; they have either begin and end, or position, in XML + */ + sequence.setPDBId(pdbRefs); + if (entry.getFeature() != null) { - for (String nm : entry.getProtein().getName()) + for (FeatureType uf : entry.getFeature()) { - name.append(" ").append(nm); + LocationType location = uf.getLocation(); + int start = 0; + int end = 0; + String uncertain_start = null, uncertain_end = null, + uncertain_pos = null; + if (location.getPosition() != null) + { + if (location.getPosition().getPosition() == null + || "unknown".equals(location.getPosition().getStatus())) + { + Console.warn( + "Ignoring single position feature with uncertain location " + + uf.getType() + ":" + getDescription(uf)); + uncertain_pos = location.getPosition().getStatus() == null + ? "unknown" + : location.getPosition().getStatus(); + } + else + { + start = location.getPosition().getPosition().intValue(); + end = start; + } + } + else + { + if (location.getBegin().getPosition() == null) + { + Console.warn( + "Setting start position of feature with uncertain start to 1: " + + uf.getType() + ":" + getDescription(uf)); + start = sequence.getStart(); + uncertain_start = location.getBegin().getStatus(); + } + else + { + start = location.getBegin().getPosition().intValue(); + } + if (location.getEnd().getPosition() == null) + { + Console.warn( + "Setting start position of feature with uncertain start to 1: " + + uf.getType() + ":" + getDescription(uf)); + end = sequence.getEnd(); + uncertain_end = location.getEnd().getStatus(); + } + else + { + end = location.getEnd().getPosition().intValue(); + } + } + SequenceFeature sf = new SequenceFeature(uf.getType(), + getDescription(uf), start, end, "Uniprot"); + sf.setStatus(uf.getStatus()); + if (uncertain_end != null) + { + sf.setValue("end_status", uncertain_end); + } + if (uncertain_start != null) + { + sf.setValue("start_status", uncertain_start); + } + if (uncertain_pos != null) + { + sf.setValue("pos_status", uncertain_pos); + } + sequence.addSequenceFeature(sf); } } - return name; + for (DBRefEntry dbr : dbRefs) + { + sequence.addDBRef(dbr); + } + return sequence; } /** - * add an ordered set of UniprotEntry objects to an ordered set of seuqences. + * A helper method that builds a sequence feature description * - * @param al - * - a sequence of n sequences - * @param entries - * a list of n uniprot entries to be analysed. + * @param feature + * @return */ - public void addUniprotXrefs(AlignmentI al, Vector entries) + static String getDescription(FeatureType feature) { - final String dbVersion = getDbVersion(); - - for (int i = 0; i < entries.size(); i++) + String orig = feature.getOriginal(); + List variants = feature.getVariation(); + StringBuilder sb = new StringBuilder(); + + /* + * append variant in standard format if present + * e.g. p.Arg59Lys + * multiple variants are split over lines using
+ */ + boolean asHtml = false; + if (orig != null && !orig.isEmpty() && variants != null + && !variants.isEmpty()) { - UniprotEntry entry = entries.elementAt(i); - Vector onlyPdbEntries = new Vector(); - Vector dbxrefs = new Vector(); - - for (PDBEntry pdb : entry.getDbReference()) + int p = 0; + for (String var : variants) { - DBRefEntry dbr = new DBRefEntry(); - dbr.setSource(pdb.getType()); - dbr.setAccessionId(pdb.getId()); - dbr.setVersion(DBRefSource.UNIPROT + ":" + dbVersion); - dbxrefs.addElement(dbr); - if ("PDB".equals(pdb.getType())) + // TODO proper HGVS nomenclature for delins structural variations + // http://varnomen.hgvs.org/recommendations/protein/variant/delins/ + // for now we are pragmatic - any orig/variant sequence longer than + // three characters is shown with single-character notation rather than + // three-letter notation + sb.append("p."); + if (orig.length() < 4) { - onlyPdbEntries.addElement(pdb); + for (int c = 0, clen = orig.length(); c < clen; c++) + { + char origchar = orig.charAt(c); + String orig3 = ResidueProperties.aa2Triplet.get("" + origchar); + sb.append(orig3 == null ? origchar + : StringUtils.toSentenceCase(orig3)); + } + } + else + { + sb.append(orig); } - } - SequenceI sq = al.getSequenceAt(i); - while (sq.getDatasetSequence() != null) - { - sq = sq.getDatasetSequence(); - } + LocationType location = feature.getLocation(); + PositionType start = location.getPosition() == null + ? location.getBegin() + : location.getPosition(); + sb.append(Integer.toString(start.getPosition().intValue())); - for (String accessionId : entry.getAccession()) - { - /* - * add as uniprot whether retrieved from uniprot or uniprot_name - */ - sq.addDBRef(new DBRefEntry(DBRefSource.UNIPROT, dbVersion, - accessionId)); + if (var.length() < 4) + { + for (int c = 0, clen = var.length(); c < clen; c++) + { + char varchar = var.charAt(c); + String var3 = ResidueProperties.aa2Triplet.get("" + varchar); + + sb.append(var3 != null ? StringUtils.toSentenceCase(var3) + : "" + varchar); + } + } + else + { + sb.append(var); + } + if (++p != variants.size()) + { + sb.append("
  "); + asHtml = true; + } + else + { + sb.append(" "); + } } + } + String description = feature.getDescription(); + if (description != null) + { + sb.append(description); + } + if (asHtml) + { + sb.insert(0, ""); + sb.append(""); + } - for (DBRefEntry dbRef : dbxrefs) - { - sq.addDBRef(dbRef); - } - sq.setPDBId(onlyPdbEntries); - if (entry.getFeature() != null) + return sb.toString(); + } + + /** + * A helper method that searches the list of properties for one with the given + * key, and if found returns the property value, else returns null + * + * @param properties + * @param key + * @return + */ + static String getProperty(List properties, String key) + { + String value = null; + if (properties != null) + { + for (PropertyType prop : properties) { - for (SequenceFeature sf : entry.getFeature()) + if (key.equals(prop.getType())) { - sf.setFeatureGroup("Uniprot"); - sq.addSequenceFeature(sf); + value = prop.getValue(); + break; } } } + return value; + } + + /** + * Extracts xml element entry/protein/recommendedName/fullName + * + * @param entry + * @return + */ + static String getUniprotEntryDescription(Entry entry) + { + String desc = ""; + if (entry.getProtein() != null + && entry.getProtein().getRecommendedName() != null) + { + // fullName is mandatory if recommendedName is present + desc = entry.getProtein().getRecommendedName().getFullName() + .getValue(); + } + return desc; + } + + /** + * Constructs a sequence id by concatenating all entry/name elements with '|' + * separator + * + * @param entry + * @return + */ + static String getUniprotEntryId(Entry entry) + { + StringBuilder name = new StringBuilder(32); + for (String n : entry.getName()) + { + if (name.length() > 0) + { + name.append(BAR_DELIMITER); + } + name.append(n); + } + return name.toString(); } /* @@ -302,6 +526,7 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#isValidReference(java.lang.String) */ + @Override public boolean isValidReference(String accession) { // TODO: make the following a standard validator @@ -312,11 +537,13 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy /** * return LDHA_CHICK uniprot entry */ + @Override public String getTestQuery() { return "P00340"; } + @Override public String getDbName() { return "Uniprot"; // getDbSource(); @@ -327,4 +554,48 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy { return 0; } + + /** + * Reads the reply to the EBI Fetch Uniprot data query, unmarshals it to an + * Uniprot object, and returns the enclosed Entry objects, or null on any + * failure + * + * @param is + * @return + */ + public List getUniprotEntries(InputStream is) + { + List entries = null; + try + { + JAXBContext jc = JAXBContext + .newInstance("jalview.xml.binding.uniprot"); + XMLStreamReader streamReader = XMLInputFactory.newInstance() + .createXMLStreamReader(is); + javax.xml.bind.Unmarshaller um = jc.createUnmarshaller(); + JAXBElement uniprotElement = um + .unmarshal(streamReader, + jalview.xml.binding.uniprot.Uniprot.class); + jalview.xml.binding.uniprot.Uniprot uniprot = uniprotElement + .getValue(); + + if (uniprot != null && !uniprot.getEntry().isEmpty()) + { + entries = uniprot.getEntry(); + } + } catch (JAXBException | XMLStreamException + | FactoryConfigurationError e) + { + if (e instanceof javax.xml.bind.UnmarshalException + && e.getCause() != null + && e.getCause() instanceof XMLStreamException + && e.getCause().getMessage().contains("[row,col]:[1,1]")) + { + // trying to parse an empty stream + return null; + } + e.printStackTrace(); + } + return entries; + } }