X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fws%2Fdbsources%2FUniprot.java;h=274ad3248e81f57142e07e2a956d93d2cbd7377c;hb=11f6e7a63cc627f5dffb0dd382343bd99d15121c;hp=6a871ee84cb9791dd6831250c2efbc0f33bd4f54;hpb=d62b90cb6effb7b380e5f7d590691dd884b024cf;p=jalview.git diff --git a/src/jalview/ws/dbsources/Uniprot.java b/src/jalview/ws/dbsources/Uniprot.java index 6a871ee..274ad32 100644 --- a/src/jalview/ws/dbsources/Uniprot.java +++ b/src/jalview/ws/dbsources/Uniprot.java @@ -20,40 +20,50 @@ */ package jalview.ws.dbsources; -import java.io.File; -import java.io.FileReader; -import java.io.Reader; -import java.util.Vector; - -import org.exolab.castor.xml.Unmarshaller; - -import com.stevesoft.pat.Regex; - +import jalview.bin.Cache; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; import jalview.datamodel.PDBEntry; +import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; -import jalview.datamodel.UniprotEntry; -import jalview.datamodel.UniprotFile; -import jalview.ws.ebi.EBIFetchClient; -import jalview.ws.seqfetcher.DbSourceProxy; +import jalview.datamodel.xdb.uniprot.UniprotEntry; +import jalview.datamodel.xdb.uniprot.UniprotFeature; +import jalview.datamodel.xdb.uniprot.UniprotFile; +import jalview.schemes.ResidueProperties; +import jalview.util.StringUtils; import jalview.ws.seqfetcher.DbSourceProxyImpl; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.net.URL; +import java.net.URLConnection; +import java.util.ArrayList; +import java.util.List; +import java.util.Vector; + +import org.exolab.castor.mapping.Mapping; +import org.exolab.castor.xml.Unmarshaller; + +import com.stevesoft.pat.Regex; + /** * @author JimP * */ -public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy +public class Uniprot extends DbSourceProxyImpl { + private static final String DEFAULT_UNIPROT_DOMAIN = "https://www.uniprot.org"; private static final String BAR_DELIMITER = "|"; - private static final String NEWLINE = "\n"; - - private static org.exolab.castor.mapping.Mapping map; + /* + * Castor mapping loaded from uniprot_mapping.xml + */ + private static Mapping map; /** * Constructor @@ -61,9 +71,11 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy public Uniprot() { super(); - addDbSourceProperty(DBRefSource.SEQDB, DBRefSource.SEQDB); - addDbSourceProperty(DBRefSource.PROTSEQDB); - // addDbSourceProperty(DBRefSource.MULTIACC, new Integer(50)); + } + + private String getDomain() + { + return Cache.getDefault("UNIPROT_DOMAIN", DEFAULT_UNIPROT_DOMAIN); } /* @@ -71,9 +83,10 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#getAccessionSeparator() */ + @Override public String getAccessionSeparator() { - return null; // ";"; + return null; } /* @@ -81,6 +94,7 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#getAccessionValidator() */ + @Override public Regex getAccessionValidator() { return new Regex("([A-Z]+[0-9]+[A-Z0-9]+|[A-Z0-9]+_[A-Z0-9]+)"); @@ -91,6 +105,7 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#getDbSource() */ + @Override public String getDbSource() { return DBRefSource.UNIPROT; @@ -101,6 +116,7 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#getDbVersion() */ + @Override public String getDbVersion() { return "0"; // we really don't know what version we're on. @@ -122,9 +138,8 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy if (map == null) { // 1. Load the mapping information from the file - map = new org.exolab.castor.mapping.Mapping(uni.getClass() - .getClassLoader()); - java.net.URL url = getClass().getResource("/uniprot_mapping.xml"); + map = new Mapping(uni.getClass().getClassLoader()); + URL url = getClass().getResource("/uniprot_mapping.xml"); map.loadMapping(url); } @@ -149,6 +164,7 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#getSequenceRecords(java.lang.String[]) */ + @Override public AlignmentI getSequenceRecords(String queries) throws Exception { startQuery(); @@ -156,147 +172,232 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy { queries = queries.toUpperCase().replaceAll( "(UNIPROT\\|?|UNIPROT_|UNIREF\\d+_|UNIREF\\d+\\|?)", ""); - Alignment al = null; - EBIFetchClient ebi = new EBIFetchClient(); - // uniprotxml parameter required since december 2007 - // uniprotkb dbname changed introduced december 2008 - File file = ebi.fetchDataAsFile("uniprotkb:" + queries, "uniprotxml", - null); - Vector entries = getUniprotEntries(new FileReader(file)); + AlignmentI al = null; + + String downloadstring = getDomain() + "/uniprot/" + queries + + ".xml"; + URL url = null; + URLConnection urlconn = null; + + url = new URL(downloadstring); + urlconn = url.openConnection(); + InputStream istr = urlconn.getInputStream(); + Vector entries = getUniprotEntries( + new InputStreamReader(istr, "UTF-8")); if (entries != null) { - /* - * If Castor binding included sequence@length, we could guesstimate the - * size of buffer to hold the alignment - */ - StringBuffer result = new StringBuffer(128); - // First, make the new sequences + ArrayList seqs = new ArrayList<>(); for (UniprotEntry entry : entries) { - StringBuilder name = constructSequenceFastaHeader(entry); - - result.append(name).append(NEWLINE) - .append(entry.getUniprotSequence().getContent()) - .append(NEWLINE); + seqs.add(uniprotEntryToSequenceI(entry)); } + al = new Alignment(seqs.toArray(new SequenceI[0])); - // Then read in the features and apply them to the dataset - al = parseResult(result.toString()); - if (al != null) - { - // Decorate the alignment with database entries. - addUniprotXrefs(al, entries); - } - else - { - results = result; - } } stopQuery(); return al; } catch (Exception e) { - stopQuery(); throw (e); + } finally + { + stopQuery(); } } /** - * Construct a Fasta-format sequence header by concatenating the source, - * accession id(s) and name(s), delimited by '|', plus any protein names, now - * with space rather than bar delimiter * * @param entry - * @return + * UniprotEntry + * @return SequenceI instance created from the UniprotEntry instance */ - public static StringBuilder constructSequenceFastaHeader( - UniprotEntry entry) + public SequenceI uniprotEntryToSequenceI(UniprotEntry entry) { - StringBuilder name = new StringBuilder(32); - name.append(">UniProt/Swiss-Prot"); + String id = getUniprotEntryId(entry); + SequenceI sequence = new Sequence(id, + entry.getUniprotSequence().getContent()); + sequence.setDescription(getUniprotEntryDescription(entry)); + + final String dbVersion = getDbVersion(); + ArrayList dbRefs = new ArrayList<>(); for (String accessionId : entry.getAccession()) { - name.append(BAR_DELIMITER); - name.append(accessionId); + DBRefEntry dbRef = new DBRefEntry(DBRefSource.UNIPROT, dbVersion, + accessionId); + + // mark dbRef as a primary reference for this sequence + dbRefs.add(dbRef); } - for (String n : entry.getName()) + + Vector onlyPdbEntries = new Vector<>(); + for (PDBEntry pdb : entry.getDbReference()) { - name.append(BAR_DELIMITER); - name.append(n); + DBRefEntry dbr = new DBRefEntry(); + dbr.setSource(pdb.getType()); + dbr.setAccessionId(pdb.getId()); + dbr.setVersion(DBRefSource.UNIPROT + ":" + dbVersion); + dbRefs.add(dbr); + if ("PDB".equals(pdb.getType())) + { + onlyPdbEntries.addElement(pdb); + } + if ("EMBL".equals(pdb.getType())) + { + // look for a CDS reference and add it, too. + String cdsId = (String) pdb.getProperty("protein sequence ID"); + if (cdsId != null && cdsId.trim().length() > 0) + { + // remove version + String[] vrs = cdsId.split("\\."); + dbr = new DBRefEntry(DBRefSource.EMBLCDS, vrs.length > 1 ? vrs[1] + : DBRefSource.UNIPROT + ":" + dbVersion, vrs[0]); + dbRefs.add(dbr); + } + } + if ("Ensembl".equals(pdb.getType())) + { + /*UniprotXML + * + * + * + * + * + */ + String cdsId = (String) pdb.getProperty("protein sequence ID"); + if (cdsId != null && cdsId.trim().length() > 0) + { + dbr = new DBRefEntry(DBRefSource.ENSEMBL, + DBRefSource.UNIPROT + ":" + dbVersion, cdsId.trim()); + dbRefs.add(dbr); + + } + } } - if (entry.getProtein() != null - && entry.getProtein().getName() != null) + sequence.setPDBId(onlyPdbEntries); + if (entry.getFeature() != null) { - for (String nm : entry.getProtein().getName()) + for (UniprotFeature uf : entry.getFeature()) { - name.append(" ").append(nm); + SequenceFeature copy = new SequenceFeature(uf.getType(), + getDescription(uf), uf.getBegin(), uf.getEnd(), "Uniprot"); + copy.setStatus(uf.getStatus()); + sequence.addSequenceFeature(copy); } } - return name; + for (DBRefEntry dbr : dbRefs) + { + sequence.addDBRef(dbr); + } + return sequence; } /** - * add an ordered set of UniprotEntry objects to an ordered set of seuqences. + * Constructs a feature description from the description and (optionally) + * original and variant fields of the Uniprot XML feature * - * @param al - * - a sequence of n sequences - * @param entries - * a list of n uniprot entries to be analysed. + * @param uf + * @return */ - public void addUniprotXrefs(Alignment al, Vector entries) + protected static String getDescription(UniprotFeature uf) { - final String dbVersion = getDbVersion(); - - for (int i = 0; i < entries.size(); i++) + String orig = uf.getOriginal(); + List variants = uf.getVariation(); + StringBuilder sb = new StringBuilder(); + + /* + * append variant in standard format if present + * e.g. p.Arg59Lys + */ + if (orig != null && !orig.isEmpty() && variants != null + && !variants.isEmpty()) { - UniprotEntry entry = entries.elementAt(i); - Vector onlyPdbEntries = new Vector(); - Vector dbxrefs = new Vector(); - - for (PDBEntry pdb : entry.getDbReference()) + int p = 0; + for (String var : variants) { - DBRefEntry dbr = new DBRefEntry(); - dbr.setSource(pdb.getType()); - dbr.setAccessionId(pdb.getId()); - dbr.setVersion(DBRefSource.UNIPROT + ":" + dbVersion); - dbxrefs.addElement(dbr); - if ("PDB".equals(pdb.getType())) + // TODO proper HGVC nomenclature for delins structural variations + sb.append("p."); + for (int c = 0, clen = orig.length(); c < clen; c++) { - onlyPdbEntries.addElement(pdb); + char origchar = orig.charAt(c); + String orig3 = ResidueProperties.aa2Triplet.get("" + origchar); + sb.append(orig3 == null ? origchar + : StringUtils.toSentenceCase(orig3)); } - } - SequenceI sq = al.getSequenceAt(i); - while (sq.getDatasetSequence() != null) - { - sq = sq.getDatasetSequence(); - } + sb.append(Integer.toString(uf.getPosition())); - for (String accessionId : entry.getAccession()) - { - /* - * add as uniprot whether retrieved from uniprot or uniprot_name - */ - sq.addDBRef(new DBRefEntry(DBRefSource.UNIPROT, dbVersion, - accessionId)); - } + for (int c = 0, clen = var.length(); c < clen; c++) + { + char varchar = var.charAt(c); + String var3 = ResidueProperties.aa2Triplet.get("" + varchar); - for (DBRefEntry dbRef : dbxrefs) - { - sq.addDBRef(dbRef); + sb.append(var3 != null ? StringUtils.toSentenceCase(var3) + : "" + varchar); + } + if (++p != variants.size()) + { + sb.append("\n"); + } + else + { + sb.append(" "); + } } - sq.setPDBId(onlyPdbEntries); - if (entry.getFeature() != null) + } + String description = uf.getDescription(); + if (description != null) + { + sb.append(description); + } + + return sb.toString(); + } + + /** + * + * @param entry + * UniportEntry + * @return protein name(s) delimited by a white space character + */ + public static String getUniprotEntryDescription(UniprotEntry entry) + { + StringBuilder desc = new StringBuilder(32); + if (entry.getProtein() != null && entry.getProtein().getName() != null) + { + boolean first = true; + for (String nm : entry.getProtein().getName()) { - for (SequenceFeature sf : entry.getFeature()) + if (!first) { - sf.setFeatureGroup("Uniprot"); - sq.addSequenceFeature(sf); + desc.append(" "); } + first = false; + desc.append(nm); + } + } + return desc.toString(); + } + + /** + * + * @param entry + * UniprotEntry + * @return The accession id(s) and name(s) delimited by '|'. + */ + public static String getUniprotEntryId(UniprotEntry entry) + { + StringBuilder name = new StringBuilder(32); + for (String n : entry.getName()) + { + if (name.length() > 0) + { + name.append(BAR_DELIMITER); } + name.append(n); } + return name.toString(); } /* @@ -304,6 +405,7 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#isValidReference(java.lang.String) */ + @Override public boolean isValidReference(String accession) { // TODO: make the following a standard validator @@ -314,11 +416,13 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy /** * return LDHA_CHICK uniprot entry */ + @Override public String getTestQuery() { return "P00340"; } + @Override public String getDbName() { return "Uniprot"; // getDbSource();