X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fws%2Fdbsources%2FUniprot.java;h=167cd977d325a06d6ddf2d2128053715ecb772d6;hb=743af46c3c50e67706009be4360fa504bfcd4fdc;hp=02da009c5f975a3d3d7f09d97db7c532a56908a7;hpb=a6b324e3f5edac3df0b968f0037b1cc8b651598e;p=jalview.git diff --git a/src/jalview/ws/dbsources/Uniprot.java b/src/jalview/ws/dbsources/Uniprot.java index 02da009..167cd97 100644 --- a/src/jalview/ws/dbsources/Uniprot.java +++ b/src/jalview/ws/dbsources/Uniprot.java @@ -20,6 +20,7 @@ */ package jalview.ws.dbsources; +import jalview.bin.Cache; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; @@ -28,16 +29,20 @@ import jalview.datamodel.PDBEntry; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; -import jalview.datamodel.UniprotEntry; -import jalview.datamodel.UniprotFile; -import jalview.ws.ebi.EBIFetchClient; +import jalview.datamodel.xdb.uniprot.UniprotEntry; +import jalview.datamodel.xdb.uniprot.UniprotFeature; +import jalview.datamodel.xdb.uniprot.UniprotFile; +import jalview.schemes.ResidueProperties; +import jalview.util.StringUtils; import jalview.ws.seqfetcher.DbSourceProxyImpl; -import java.io.File; -import java.io.FileReader; +import java.io.InputStream; +import java.io.InputStreamReader; import java.io.Reader; import java.net.URL; +import java.net.URLConnection; import java.util.ArrayList; +import java.util.List; import java.util.Vector; import org.exolab.castor.mapping.Mapping; @@ -51,6 +56,8 @@ import com.stevesoft.pat.Regex; */ public class Uniprot extends DbSourceProxyImpl { + private static final String DEFAULT_UNIPROT_DOMAIN = "https://www.uniprot.org"; + private static final String BAR_DELIMITER = "|"; /* @@ -66,6 +73,11 @@ public class Uniprot extends DbSourceProxyImpl super(); } + private String getDomain() + { + return Cache.getDefault("UNIPROT_DOMAIN", DEFAULT_UNIPROT_DOMAIN); + } + /* * (non-Javadoc) * @@ -161,16 +173,21 @@ public class Uniprot extends DbSourceProxyImpl queries = queries.toUpperCase().replaceAll( "(UNIPROT\\|?|UNIPROT_|UNIREF\\d+_|UNIREF\\d+\\|?)", ""); AlignmentI al = null; - EBIFetchClient ebi = new EBIFetchClient(); - // uniprotxml parameter required since december 2007 - // uniprotkb dbname changed introduced december 2008 - File file = ebi.fetchDataAsFile("uniprotkb:" + queries, "uniprotxml", - null); - Vector entries = getUniprotEntries(new FileReader(file)); + + String downloadstring = getDomain() + "/uniprot/" + queries + + ".xml"; + URL url = null; + URLConnection urlconn = null; + + url = new URL(downloadstring); + urlconn = url.openConnection(); + InputStream istr = urlconn.getInputStream(); + Vector entries = getUniprotEntries( + new InputStreamReader(istr, "UTF-8")); if (entries != null) { - ArrayList seqs = new ArrayList(); + ArrayList seqs = new ArrayList<>(); for (UniprotEntry entry : entries) { seqs.add(uniprotEntryToSequenceI(entry)); @@ -182,8 +199,10 @@ public class Uniprot extends DbSourceProxyImpl return al; } catch (Exception e) { - stopQuery(); throw (e); + } finally + { + stopQuery(); } } @@ -193,24 +212,25 @@ public class Uniprot extends DbSourceProxyImpl * UniprotEntry * @return SequenceI instance created from the UniprotEntry instance */ - public SequenceI uniprotEntryToSequenceI(UniprotEntry entry){ + public SequenceI uniprotEntryToSequenceI(UniprotEntry entry) + { String id = getUniprotEntryId(entry); - SequenceI sequence = new Sequence(id, entry.getUniprotSequence() - .getContent()); + SequenceI sequence = new Sequence(id, + entry.getUniprotSequence().getContent()); sequence.setDescription(getUniprotEntryDescription(entry)); final String dbVersion = getDbVersion(); - ArrayList dbRefs = new ArrayList(); + ArrayList dbRefs = new ArrayList<>(); for (String accessionId : entry.getAccession()) { DBRefEntry dbRef = new DBRefEntry(DBRefSource.UNIPROT, dbVersion, accessionId); + + // mark dbRef as a primary reference for this sequence dbRefs.add(dbRef); } - sequence.setSourceDBRef((dbRefs != null && dbRefs.size() > 0) ? dbRefs - .get(0) : null); - Vector onlyPdbEntries = new Vector(); + Vector onlyPdbEntries = new Vector<>(); for (PDBEntry pdb : entry.getDbReference()) { DBRefEntry dbr = new DBRefEntry(); @@ -222,22 +242,146 @@ public class Uniprot extends DbSourceProxyImpl { onlyPdbEntries.addElement(pdb); } + if ("EMBL".equals(pdb.getType())) + { + // look for a CDS reference and add it, too. + String cdsId = (String) pdb.getProperty("protein sequence ID"); + if (cdsId != null && cdsId.trim().length() > 0) + { + // remove version + String[] vrs = cdsId.split("\\."); + dbr = new DBRefEntry(DBRefSource.EMBLCDS, vrs.length > 1 ? vrs[1] + : DBRefSource.UNIPROT + ":" + dbVersion, vrs[0]); + dbRefs.add(dbr); + } + } + if ("Ensembl".equals(pdb.getType())) + { + /*UniprotXML + * + * + * + * + * + */ + String cdsId = (String) pdb.getProperty("protein sequence ID"); + if (cdsId != null && cdsId.trim().length() > 0) + { + dbr = new DBRefEntry(DBRefSource.ENSEMBL, + DBRefSource.UNIPROT + ":" + dbVersion, cdsId.trim()); + dbRefs.add(dbr); + + } + } } sequence.setPDBId(onlyPdbEntries); if (entry.getFeature() != null) { - for (SequenceFeature sf : entry.getFeature()) + for (UniprotFeature uf : entry.getFeature()) { - sf.setFeatureGroup("Uniprot"); - sequence.addSequenceFeature(sf); + SequenceFeature copy = new SequenceFeature(uf.getType(), + getDescription(uf), uf.getBegin(), uf.getEnd(), "Uniprot"); + copy.setStatus(uf.getStatus()); + sequence.addSequenceFeature(copy); } } - sequence.setDBRefs(dbRefs.toArray(new DBRefEntry[0])); + for (DBRefEntry dbr : dbRefs) + { + sequence.addDBRef(dbr); + } return sequence; } /** + * Constructs a feature description from the description and (optionally) + * original and variant fields of the Uniprot XML feature + * + * @param uf + * @return + */ + protected static String getDescription(UniprotFeature uf) + { + String orig = uf.getOriginal(); + List variants = uf.getVariation(); + StringBuilder sb = new StringBuilder(); + + /* + * append variant in standard format if present + * e.g. p.Arg59Lys + * multiple variants are split over lines using
+ */ + boolean asHtml = false; + if (orig != null && !orig.isEmpty() && variants != null + && !variants.isEmpty()) + { + int p = 0; + for (String var : variants) + { + // TODO proper HGVS nomenclature for delins structural variations + // http://varnomen.hgvs.org/recommendations/protein/variant/delins/ + // for now we are pragmatic - any orig/variant sequence longer than + // three characters is shown with single-character notation rather than + // three-letter notation + sb.append("p."); + if (orig.length() < 4) + { + for (int c = 0, clen = orig.length(); c < clen; c++) + { + char origchar = orig.charAt(c); + String orig3 = ResidueProperties.aa2Triplet.get("" + origchar); + sb.append(orig3 == null ? origchar + : StringUtils.toSentenceCase(orig3)); + } + } + else + { + sb.append(orig); + } + + sb.append(Integer.toString(uf.getPosition())); + + if (var.length() < 4) + { + for (int c = 0, clen = var.length(); c < clen; c++) + { + char varchar = var.charAt(c); + String var3 = ResidueProperties.aa2Triplet.get("" + varchar); + + sb.append(var3 != null ? StringUtils.toSentenceCase(var3) + : "" + varchar); + } + } + else + { + sb.append(var); + } + if (++p != variants.size()) + { + sb.append("
  "); + asHtml = true; + } + else + { + sb.append(" "); + } + } + } + String description = uf.getDescription(); + if (description != null) + { + sb.append(description); + } + if (asHtml) + { + sb.insert(0, ""); + sb.append(""); + } + + return sb.toString(); + } + + /** * * @param entry * UniportEntry @@ -248,9 +392,15 @@ public class Uniprot extends DbSourceProxyImpl StringBuilder desc = new StringBuilder(32); if (entry.getProtein() != null && entry.getProtein().getName() != null) { + boolean first = true; for (String nm : entry.getProtein().getName()) { - desc.append(nm).append(" "); + if (!first) + { + desc.append(" "); + } + first = false; + desc.append(nm); } } return desc.toString(); @@ -259,21 +409,18 @@ public class Uniprot extends DbSourceProxyImpl /** * * @param entry - * UniportEntry + * UniprotEntry * @return The accession id(s) and name(s) delimited by '|'. */ public static String getUniprotEntryId(UniprotEntry entry) { StringBuilder name = new StringBuilder(32); - name.append("UniProt/Swiss-Prot"); - for (String accessionId : entry.getAccession()) - { - name.append(BAR_DELIMITER); - name.append(accessionId); - } for (String n : entry.getName()) { - name.append(BAR_DELIMITER); + if (name.length() > 0) + { + name.append(BAR_DELIMITER); + } name.append(n); } return name.toString();