X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fws%2Fdbsources%2FUniprot.java;h=d97bf692168c93a8e770b8e77bdce563e69a977a;hb=0173acde216fc54fb54d935c2f7d2550e4c25016;hp=23bfbca24015f38fe00bec08a25c1650c912cd0f;hpb=0eda2ac2f334f55f6e5f137789453a9f4dfe4d73;p=jalview.git diff --git a/src/jalview/ws/dbsources/Uniprot.java b/src/jalview/ws/dbsources/Uniprot.java index 23bfbca..d97bf69 100644 --- a/src/jalview/ws/dbsources/Uniprot.java +++ b/src/jalview/ws/dbsources/Uniprot.java @@ -1,220 +1,516 @@ -/** - * - */ -package jalview.ws.dbsources; - -import java.io.File; -import java.io.IOException; -import java.util.Enumeration; -import java.util.Hashtable; -import java.util.Vector; - -import com.stevesoft.pat.Regex; - -import jalview.datamodel.Alignment; -import jalview.datamodel.AlignmentI; -import jalview.datamodel.DBRefEntry; -import jalview.datamodel.DBRefSource; -import jalview.datamodel.PDBEntry; -import jalview.datamodel.SequenceFeature; -import jalview.datamodel.SequenceI; -import jalview.datamodel.UniprotEntry; -import jalview.io.FormatAdapter; -import jalview.io.IdentifyFile; -import jalview.ws.DBRefFetcher; -import jalview.ws.ebi.EBIFetchClient; -import jalview.ws.seqfetcher.DbSourceProxy; -import jalview.ws.seqfetcher.DbSourceProxyImpl; - -/** - * @author JimP - * - */ -public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy -{ - public Uniprot() { - super(); - addDbSourceProperty(DBRefSource.SEQDB, DBRefSource.SEQDB); - addDbSourceProperty(DBRefSource.PROTSEQDB); - addDbSourceProperty(DBRefSource.MULTIACC); - } - - /* - * (non-Javadoc) - * - * @see jalview.ws.DbSourceProxy#getAccessionSeparator() - */ - public String getAccessionSeparator() - { - return ";"; - } - - /* - * (non-Javadoc) - * - * @see jalview.ws.DbSourceProxy#getAccessionValidator() - */ - public Regex getAccessionValidator() - { - return null; - } - - /* - * (non-Javadoc) - * - * @see jalview.ws.DbSourceProxy#getDbSource() - */ - public String getDbSource() - { - return DBRefSource.UNIPROT; - } - - /* - * (non-Javadoc) - * - * @see jalview.ws.DbSourceProxy#getDbVersion() - */ - public String getDbVersion() - { - return "0"; // we really don't know what version we're on. - } - - private EBIFetchClient ebi = null; - - /* - * (non-Javadoc) - * - * @see jalview.ws.DbSourceProxy#getSequenceRecords(java.lang.String[]) - */ - public AlignmentI getSequenceRecords(String queries) throws Exception - { - startQuery(); - try - { - Alignment al=null; - ebi = new EBIFetchClient(); - StringBuffer result=new StringBuffer(); - File file = ebi.fetchDataAsFile("uniprot:" + queries, "xml", null); - DBRefFetcher dbref = new DBRefFetcher(); - Vector entries = dbref.getUniprotEntries(file); - - if (entries != null) - { - // First, make the new sequences - Enumeration en = entries.elements(); - while (en.hasMoreElements()) - { - UniprotEntry entry = (UniprotEntry) en.nextElement(); - - StringBuffer name = new StringBuffer(">UniProt/Swiss-Prot"); - Enumeration en2 = entry.getAccession().elements(); - while (en2.hasMoreElements()) - { - name.append("|"); - name.append(en2.nextElement()); - } - en2 = entry.getName().elements(); - while (en2.hasMoreElements()) - { - name.append("|"); - name.append(en2.nextElement()); - } - - if (entry.getProtein() != null) - { - name.append(" " + entry.getProtein().getName().elementAt(0)); - } - - result.append(name + "\n" - + entry.getUniprotSequence().getContent() + "\n"); - - } - - // Then read in the features and apply them to the dataset - al = parseResult(result.toString()); - if (al!=null) - { - // Decorate the alignment with database entries. - addUniprotXrefs(al, entries); - } else { - results = result; - } - } - stopQuery(); - return al; - } catch (Exception e) - { - stopQuery(); - throw(e); - } - } - - /** - * add an ordered set of UniprotEntry objects to an ordered set of seuqences. - * - * @param al - - * a sequence of n sequences - * @param entries - * a seuqence of n uniprot entries to be analysed. - */ - public void addUniprotXrefs(Alignment al, Vector entries) - { - for (int i = 0; i < entries.size(); i++) - { - UniprotEntry entry = (UniprotEntry) entries.elementAt(i); - Enumeration e = entry.getDbReference().elements(); - Vector onlyPdbEntries = new Vector(); - while (e.hasMoreElements()) - { - PDBEntry pdb = (PDBEntry) e.nextElement(); - if (!pdb.getType().equals("PDB")) - { - continue; - } - - onlyPdbEntries.addElement(pdb); - } - SequenceI sq = al.getSequenceAt(i); - while (sq.getDatasetSequence()!=null) - { - sq = sq.getDatasetSequence(); - } - - Enumeration en2 = entry.getAccession().elements(); - while (en2.hasMoreElements()) - { - sq.addDBRef( - new DBRefEntry(getDbSource(), getDbVersion(), en2.nextElement() - .toString())); - } - sq.setPDBId(onlyPdbEntries); - if (entry.getFeature() != null) - { - e = entry.getFeature().elements(); - while (e.hasMoreElements()) - { - SequenceFeature sf = (SequenceFeature) e.nextElement(); - sf.setFeatureGroup("Uniprot"); - sq.addSequenceFeature(sf); - } - } - } - } - - /* - * (non-Javadoc) - * - * @see jalview.ws.DbSourceProxy#isValidReference(java.lang.String) - */ - public boolean isValidReference(String accession) - { - return true; - } - /** - * return LDHA_CHICK uniprot entry - */ - public String getTestQuery() - { - return "P00340"; - } -} +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see . + * The Jalview Authors are detailed in the 'AUTHORS' file. + */ +package jalview.ws.dbsources; + +import jalview.bin.Cache; +import jalview.datamodel.Alignment; +import jalview.datamodel.AlignmentI; +import jalview.datamodel.DBRefEntry; +import jalview.datamodel.DBRefSource; +import jalview.datamodel.PDBEntry; +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; +import jalview.schemes.ResidueProperties; +import jalview.util.StringUtils; +import jalview.ws.seqfetcher.DbSourceProxyImpl; +import jalview.xml.binding.uniprot.DbReferenceType; +import jalview.xml.binding.uniprot.Entry; +import jalview.xml.binding.uniprot.FeatureType; +import jalview.xml.binding.uniprot.LocationType; +import jalview.xml.binding.uniprot.PositionType; +import jalview.xml.binding.uniprot.PropertyType; + +import java.io.InputStream; +import java.net.URL; +import java.net.URLConnection; +import java.util.ArrayList; +import java.util.List; +import java.util.Vector; + +import javax.xml.bind.JAXBContext; +import javax.xml.bind.JAXBException; +import javax.xml.stream.FactoryConfigurationError; +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.XMLStreamReader; + +import com.stevesoft.pat.Regex; + +/** + * This class queries the Uniprot database for sequence data, unmarshals the + * returned XML, and converts it to Jalview Sequence records (including attached + * database references and sequence features) + * + * @author JimP + * + */ +public class Uniprot extends DbSourceProxyImpl +{ + private static final String DEFAULT_UNIPROT_DOMAIN = "https://www.uniprot.org"; + + private static final String BAR_DELIMITER = "|"; + + /** + * Constructor + */ + public Uniprot() + { + super(); + } + + private String getDomain() + { + return Cache.getDefault("UNIPROT_DOMAIN", DEFAULT_UNIPROT_DOMAIN); + } + + /* + * (non-Javadoc) + * + * @see jalview.ws.DbSourceProxy#getAccessionSeparator() + */ + @Override + public String getAccessionSeparator() + { + return null; + } + + /* + * (non-Javadoc) + * + * @see jalview.ws.DbSourceProxy#getAccessionValidator() + */ + @Override + public Regex getAccessionValidator() + { + return new Regex("([A-Z]+[0-9]+[A-Z0-9]+|[A-Z0-9]+_[A-Z0-9]+)"); + } + + /* + * (non-Javadoc) + * + * @see jalview.ws.DbSourceProxy#getDbSource() + */ + @Override + public String getDbSource() + { + return DBRefSource.UNIPROT; + } + + /* + * (non-Javadoc) + * + * @see jalview.ws.DbSourceProxy#getDbVersion() + */ + @Override + public String getDbVersion() + { + return "0"; // we really don't know what version we're on. + } + + /* + * (non-Javadoc) + * + * @see jalview.ws.DbSourceProxy#getSequenceRecords(java.lang.String[]) + */ + @Override + public AlignmentI getSequenceRecords(String queries) throws Exception + { + startQuery(); + try + { + queries = queries.toUpperCase().replaceAll( + "(UNIPROT\\|?|UNIPROT_|UNIREF\\d+_|UNIREF\\d+\\|?)", ""); + AlignmentI al = null; + + String downloadstring = getDomain() + "/uniprot/" + queries + + ".xml"; + + URL url = new URL(downloadstring); + URLConnection urlconn = url.openConnection(); + InputStream istr = urlconn.getInputStream(); + List entries = getUniprotEntries(istr); + if (entries != null) + { + List seqs = new ArrayList<>(); + for (Entry entry : entries) + { + seqs.add(uniprotEntryToSequence(entry)); + } + al = new Alignment(seqs.toArray(new SequenceI[seqs.size()])); + } + + stopQuery(); + return al; + } catch (Exception e) + { + throw (e); + } finally + { + stopQuery(); + } + } + + /** + * Converts an Entry object (bound from Uniprot XML) to a Jalview Sequence + * + * @param entry + * @return + */ + SequenceI uniprotEntryToSequence(Entry entry) + { + String id = getUniprotEntryId(entry); + /* + * Sequence should not include any whitespace, but JAXB leaves these in + */ + String seqString = entry.getSequence().getValue().replaceAll("\\s*", + ""); + + SequenceI sequence = new Sequence(id, + seqString); + sequence.setDescription(getUniprotEntryDescription(entry)); + + /* + * add a 'self' DBRefEntry for each accession + */ + final String dbVersion = getDbVersion(); + List dbRefs = new ArrayList<>(); + for (String accessionId : entry.getAccession()) + { + DBRefEntry dbRef = new DBRefEntry(DBRefSource.UNIPROT, dbVersion, + accessionId); + dbRefs.add(dbRef); + } + + /* + * add a DBRefEntry for each dbReference element in the XML; + * also add a PDBEntry if type="PDB"; + * also add an EMBLCDS dbref if protein sequence id is given + * also add an Ensembl dbref " " " " " " + */ + Vector pdbRefs = new Vector<>(); + for (DbReferenceType dbref : entry.getDbReference()) + { + String type = dbref.getType(); + DBRefEntry dbr = new DBRefEntry(type, + DBRefSource.UNIPROT + ":" + dbVersion, dbref.getId()); + dbRefs.add(dbr); + if ("PDB".equals(type)) + { + pdbRefs.add(new PDBEntry(dbr)); + } + if ("EMBL".equals(type)) + { + /* + * e.g. Uniprot accession Q9BXM7 has + * + * + * + * + */ + String cdsId = getProperty(dbref.getProperty(), + "protein sequence ID"); + if (cdsId != null && cdsId.trim().length() > 0) + { + // remove version + String[] vrs = cdsId.split("\\."); + String version = vrs.length > 1 ? vrs[1] + : DBRefSource.UNIPROT + ":" + dbVersion; + dbr = new DBRefEntry(DBRefSource.EMBLCDS, version, vrs[0]); + dbRefs.add(dbr); + } + } + if ("Ensembl".equals(type)) + { + /* + * e.g. Uniprot accession Q9BXM7 has + * + * + * + * + * + */ + String cdsId = getProperty(dbref.getProperty(), + "protein sequence ID"); + if (cdsId != null && cdsId.trim().length() > 0) + { + dbr = new DBRefEntry(DBRefSource.ENSEMBL, + DBRefSource.UNIPROT + ":" + dbVersion, cdsId.trim()); + dbRefs.add(dbr); + } + } + } + + /* + * create features; they have either begin and end, or position, in XML + */ + sequence.setPDBId(pdbRefs); + if (entry.getFeature() != null) + { + for (FeatureType uf : entry.getFeature()) + { + LocationType location = uf.getLocation(); + int start = 0; + int end = 0; + if (location.getPosition() != null) + { + start = location.getPosition().getPosition().intValue(); + end = start; + } + else + { + start = location.getBegin().getPosition().intValue(); + end = location.getEnd().getPosition().intValue(); + } + SequenceFeature sf = new SequenceFeature(uf.getType(), + getDescription(uf), start, end, "Uniprot"); + sf.setStatus(uf.getStatus()); + sequence.addSequenceFeature(sf); + } + } + for (DBRefEntry dbr : dbRefs) + { + sequence.addDBRef(dbr); + } + return sequence; + } + + /** + * A helper method that builds a sequence feature description + * + * @param feature + * @return + */ + static String getDescription(FeatureType feature) + { + String orig = feature.getOriginal(); + List variants = feature.getVariation(); + StringBuilder sb = new StringBuilder(); + + /* + * append variant in standard format if present + * e.g. p.Arg59Lys + * multiple variants are split over lines using
+ */ + boolean asHtml = false; + if (orig != null && !orig.isEmpty() && variants != null + && !variants.isEmpty()) + { + int p = 0; + for (String var : variants) + { + // TODO proper HGVS nomenclature for delins structural variations + // http://varnomen.hgvs.org/recommendations/protein/variant/delins/ + // for now we are pragmatic - any orig/variant sequence longer than + // three characters is shown with single-character notation rather than + // three-letter notation + sb.append("p."); + if (orig.length() < 4) + { + for (int c = 0, clen = orig.length(); c < clen; c++) + { + char origchar = orig.charAt(c); + String orig3 = ResidueProperties.aa2Triplet.get("" + origchar); + sb.append(orig3 == null ? origchar + : StringUtils.toSentenceCase(orig3)); + } + } + else + { + sb.append(orig); + } + + LocationType location = feature.getLocation(); + PositionType start = location.getPosition() == null + ? location.getBegin() + : location.getPosition(); + sb.append(Integer.toString(start.getPosition().intValue())); + + if (var.length() < 4) + { + for (int c = 0, clen = var.length(); c < clen; c++) + { + char varchar = var.charAt(c); + String var3 = ResidueProperties.aa2Triplet.get("" + varchar); + + sb.append(var3 != null ? StringUtils.toSentenceCase(var3) + : "" + varchar); + } + } + else + { + sb.append(var); + } + if (++p != variants.size()) + { + sb.append("
  "); + asHtml = true; + } + else + { + sb.append(" "); + } + } + } + String description = feature.getDescription(); + if (description != null) + { + sb.append(description); + } + if (asHtml) + { + sb.insert(0, ""); + sb.append(""); + } + + return sb.toString(); + } + + /** + * A helper method that searches the list of properties for one with the given + * key, and if found returns the property value, else returns null + * + * @param properties + * @param key + * @return + */ + static String getProperty(List properties, String key) + { + String value = null; + if (properties != null) + { + for (PropertyType prop : properties) + { + if (key.equals(prop.getType())) + { + value = prop.getValue(); + break; + } + } + } + return value; + } + + /** + * Extracts xml element entry/protein/recommendedName/fullName + * + * @param entry + * @return + */ + static String getUniprotEntryDescription(Entry entry) + { + String desc = ""; + if (entry.getProtein() != null + && entry.getProtein().getRecommendedName() != null) + { + // fullName is mandatory if recommendedName is present + desc = entry.getProtein().getRecommendedName().getFullName() + .getValue(); + } + return desc; + } + + /** + * Constructs a sequence id by concatenating all entry/name elements with '|' + * separator + * + * @param entry + * @return + */ + static String getUniprotEntryId(Entry entry) + { + StringBuilder name = new StringBuilder(32); + for (String n : entry.getName()) + { + if (name.length() > 0) + { + name.append(BAR_DELIMITER); + } + name.append(n); + } + return name.toString(); + } + + /* + * (non-Javadoc) + * + * @see jalview.ws.DbSourceProxy#isValidReference(java.lang.String) + */ + @Override + public boolean isValidReference(String accession) + { + // TODO: make the following a standard validator + return (accession == null || accession.length() < 2) ? false + : getAccessionValidator().search(accession); + } + + /** + * return LDHA_CHICK uniprot entry + */ + @Override + public String getTestQuery() + { + return "P00340"; + } + + @Override + public String getDbName() + { + return "Uniprot"; // getDbSource(); + } + + @Override + public int getTier() + { + return 0; + } + + /** + * Reads the reply to the EBI Fetch Uniprot data query, unmarshals it to an + * Uniprot object, and returns the enclosed Entry objects, or null on any + * failure + * + * @param is + * @return + */ + public List getUniprotEntries(InputStream is) + { + List entries = null; + try + { + JAXBContext jc = JAXBContext + .newInstance("jalview.xml.binding.uniprot"); + XMLStreamReader streamReader = XMLInputFactory.newInstance() + .createXMLStreamReader(is); + javax.xml.bind.Unmarshaller um = jc.createUnmarshaller(); + jalview.xml.binding.uniprot.Uniprot uniprot = (jalview.xml.binding.uniprot.Uniprot) um.unmarshal(streamReader); + if (uniprot != null && !uniprot.getEntry().isEmpty()) + { + entries = uniprot.getEntry(); + } + } catch (JAXBException | XMLStreamException + | FactoryConfigurationError e) + { + e.printStackTrace(); + } + return entries; + } +}