X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fws%2Fdbsources%2FUniprot.java;h=7261cba4910eb80a35d3db2141afd487416919ff;hb=136c0793b90b72b928c4d77dc109dd5c644e00d3;hp=ab0b10c408ae867a5ab90f0d00b752c23a40f3cd;hpb=506d60f0e188723ddc91c26824b41ac7034df3fe;p=jalview.git diff --git a/src/jalview/ws/dbsources/Uniprot.java b/src/jalview/ws/dbsources/Uniprot.java index ab0b10c..7261cba 100644 --- a/src/jalview/ws/dbsources/Uniprot.java +++ b/src/jalview/ws/dbsources/Uniprot.java @@ -1,62 +1,70 @@ /* - * Jalview - A Sequence Alignment Editor and Viewer (Version 2.4) - * Copyright (C) 2008 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. + * This file is part of Jalview. * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + * along with Jalview. If not, see . + * The Jalview Authors are detailed in the 'AUTHORS' file. */ package jalview.ws.dbsources; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.Enumeration; -import java.util.Hashtable; -import java.util.Vector; - -import org.exolab.castor.xml.Unmarshaller; - -import com.stevesoft.pat.Regex; - import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; import jalview.datamodel.PDBEntry; +import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; -import jalview.datamodel.UniprotEntry; -import jalview.datamodel.UniprotFile; -import jalview.io.FormatAdapter; -import jalview.io.IdentifyFile; -import jalview.ws.DBRefFetcher; +import jalview.datamodel.xdb.uniprot.UniprotEntry; +import jalview.datamodel.xdb.uniprot.UniprotFeature; +import jalview.datamodel.xdb.uniprot.UniprotFile; import jalview.ws.ebi.EBIFetchClient; -import jalview.ws.seqfetcher.DbSourceProxy; import jalview.ws.seqfetcher.DbSourceProxyImpl; +import java.io.File; +import java.io.FileReader; +import java.io.Reader; +import java.net.URL; +import java.util.ArrayList; +import java.util.Vector; + +import org.exolab.castor.mapping.Mapping; +import org.exolab.castor.xml.Unmarshaller; + +import com.stevesoft.pat.Regex; + /** * @author JimP * */ -public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy +public class Uniprot extends DbSourceProxyImpl { + private static final String BAR_DELIMITER = "|"; + + /* + * Castor mapping loaded from uniprot_mapping.xml + */ + private static Mapping map; + + /** + * Constructor + */ public Uniprot() { super(); - addDbSourceProperty(DBRefSource.SEQDB, DBRefSource.SEQDB); - addDbSourceProperty(DBRefSource.PROTSEQDB); - // addDbSourceProperty(DBRefSource.MULTIACC, new Integer(50)); } /* @@ -64,9 +72,10 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#getAccessionSeparator() */ + @Override public String getAccessionSeparator() { - return null; // ";"; + return null; } /* @@ -74,9 +83,10 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#getAccessionValidator() */ + @Override public Regex getAccessionValidator() { - return null; + return new Regex("([A-Z]+[0-9]+[A-Z0-9]+|[A-Z0-9]+_[A-Z0-9]+)"); } /* @@ -84,6 +94,7 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#getDbSource() */ + @Override public String getDbSource() { return DBRefSource.UNIPROT; @@ -94,30 +105,41 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#getDbVersion() */ + @Override public String getDbVersion() { return "0"; // we really don't know what version we're on. } - private EBIFetchClient ebi = null; - - public Vector getUniprotEntries(File file) + /** + * Reads a file containing the reply to the EBI Fetch Uniprot data query, + * unmarshals it to a UniprotFile object, and returns the list of UniprotEntry + * data models (mapped from <entry> elements) + * + * @param fileReader + * @return + */ + public Vector getUniprotEntries(Reader fileReader) { UniprotFile uni = new UniprotFile(); try { - // 1. Load the mapping information from the file - org.exolab.castor.mapping.Mapping map = new org.exolab.castor.mapping.Mapping( - uni.getClass().getClassLoader()); - java.net.URL url = getClass().getResource("/uniprot_mapping.xml"); - map.loadMapping(url); + if (map == null) + { + // 1. Load the mapping information from the file + map = new Mapping(uni.getClass().getClassLoader()); + URL url = getClass().getResource("/uniprot_mapping.xml"); + map.loadMapping(url); + } // 2. Unmarshal the data Unmarshaller unmar = new Unmarshaller(uni); unmar.setIgnoreExtraElements(true); unmar.setMapping(map); - - uni = (UniprotFile) unmar.unmarshal(new FileReader(file)); + if (fileReader != null) + { + uni = (UniprotFile) unmar.unmarshal(fileReader); + } } catch (Exception e) { System.out.println("Error getUniprotEntries() " + e); @@ -131,66 +153,31 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#getSequenceRecords(java.lang.String[]) */ + @Override public AlignmentI getSequenceRecords(String queries) throws Exception { startQuery(); try { - Alignment al = null; - ebi = new EBIFetchClient(); - StringBuffer result = new StringBuffer(); + queries = queries.toUpperCase().replaceAll( + "(UNIPROT\\|?|UNIPROT_|UNIREF\\d+_|UNIREF\\d+\\|?)", ""); + AlignmentI al = null; + EBIFetchClient ebi = new EBIFetchClient(); // uniprotxml parameter required since december 2007 - File file = ebi.fetchDataAsFile("uniprot:" + queries, "uniprotxml", - null); - Vector entries = getUniprotEntries(file); + // uniprotkb dbname changed introduced december 2008 + File file = ebi.fetchDataAsFile("uniprotkb:" + queries, "uniprotxml", + "xml"); + Vector entries = getUniprotEntries(new FileReader(file)); if (entries != null) { - // First, make the new sequences - Enumeration en = entries.elements(); - while (en.hasMoreElements()) + ArrayList seqs = new ArrayList(); + for (UniprotEntry entry : entries) { - UniprotEntry entry = (UniprotEntry) en.nextElement(); - - StringBuffer name = new StringBuffer(">UniProt/Swiss-Prot"); - Enumeration en2 = entry.getAccession().elements(); - while (en2.hasMoreElements()) - { - name.append("|"); - name.append(en2.nextElement()); - } - en2 = entry.getName().elements(); - while (en2.hasMoreElements()) - { - name.append("|"); - name.append(en2.nextElement()); - } - - if (entry.getProtein() != null - && entry.getProtein().getName() != null) - { - for (int nm = 0, nmSize = entry.getProtein().getName().size(); nm < nmSize; nm++) - { - name.append(" " + entry.getProtein().getName().elementAt(nm)); - } - } - - result.append(name + "\n" - + entry.getUniprotSequence().getContent() + "\n"); - + seqs.add(uniprotEntryToSequenceI(entry)); } + al = new Alignment(seqs.toArray(new SequenceI[0])); - // Then read in the features and apply them to the dataset - al = parseResult(result.toString()); - if (al != null) - { - // Decorate the alignment with database entries. - addUniprotXrefs(al, entries); - } - else - { - results = result; - } } stopQuery(); return al; @@ -202,68 +189,140 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy } /** - * add an ordered set of UniprotEntry objects to an ordered set of seuqences. * - * @param al - - * a sequence of n sequences - * @param entries - * a seuqence of n uniprot entries to be analysed. + * @param entry + * UniprotEntry + * @return SequenceI instance created from the UniprotEntry instance */ - public void addUniprotXrefs(Alignment al, Vector entries) + public SequenceI uniprotEntryToSequenceI(UniprotEntry entry) { - for (int i = 0; i < entries.size(); i++) + String id = getUniprotEntryId(entry); + SequenceI sequence = new Sequence(id, entry.getUniprotSequence() + .getContent()); + sequence.setDescription(getUniprotEntryDescription(entry)); + + final String dbVersion = getDbVersion(); + ArrayList dbRefs = new ArrayList(); + for (String accessionId : entry.getAccession()) + { + DBRefEntry dbRef = new DBRefEntry(DBRefSource.UNIPROT, dbVersion, + accessionId); + + // mark dbRef as a primary reference for this sequence + dbRefs.add(dbRef); + } + + Vector onlyPdbEntries = new Vector(); + for (PDBEntry pdb : entry.getDbReference()) { - UniprotEntry entry = (UniprotEntry) entries.elementAt(i); - Enumeration e = entry.getDbReference().elements(); - Vector onlyPdbEntries = new Vector(); - Vector dbxrefs = new Vector(); - while (e.hasMoreElements()) + DBRefEntry dbr = new DBRefEntry(); + dbr.setSource(pdb.getType()); + dbr.setAccessionId(pdb.getId()); + dbr.setVersion(DBRefSource.UNIPROT + ":" + dbVersion); + dbRefs.add(dbr); + if ("PDB".equals(pdb.getType())) + { + onlyPdbEntries.addElement(pdb); + } + if ("EMBL".equals(pdb.getType())) { - PDBEntry pdb = (PDBEntry) e.nextElement(); - DBRefEntry dbr = new DBRefEntry(); - dbr.setSource(pdb.getType()); - dbr.setAccessionId(pdb.getId()); - dbr.setVersion(DBRefSource.UNIPROT + ":" + getDbVersion()); - dbxrefs.addElement(dbr); - if (!pdb.getType().equals("PDB")) + // look for a CDS reference and add it, too. + String cdsId = (String) pdb.getProperty("protein sequence ID"); + if (cdsId != null && cdsId.trim().length() > 0) { - continue; + // remove version + String[] vrs = cdsId.split("\\."); + dbr = new DBRefEntry(DBRefSource.EMBLCDS, vrs.length > 1 ? vrs[1] + : DBRefSource.UNIPROT + ":" + dbVersion, vrs[0]); + dbRefs.add(dbr); } - - onlyPdbEntries.addElement(pdb); } - SequenceI sq = al.getSequenceAt(i); - while (sq.getDatasetSequence() != null) + if ("Ensembl".equals(pdb.getType())) { - sq = sq.getDatasetSequence(); + /*UniprotXML + * + * + * + * + * + */ + String cdsId = (String) pdb.getProperty("protein sequence ID"); + if (cdsId != null && cdsId.trim().length() > 0) + { + dbr = new DBRefEntry(DBRefSource.ENSEMBL, DBRefSource.UNIPROT + + ":" + dbVersion, cdsId.trim()); + dbRefs.add(dbr); + + } } + } - Enumeration en2 = entry.getAccession().elements(); - while (en2.hasMoreElements()) + sequence.setPDBId(onlyPdbEntries); + if (entry.getFeature() != null) + { + for (UniprotFeature uf : entry.getFeature()) { - // we always add as uniprot if we retrieved from uniprot or uniprot name - sq.addDBRef(new DBRefEntry(DBRefSource.UNIPROT, getDbVersion(), en2 - .nextElement().toString())); + SequenceFeature copy = new SequenceFeature(uf.getType(), + uf.getDescription(), uf.getBegin(), uf.getEnd(), "Uniprot"); + copy.setStatus(uf.getStatus()); + sequence.addSequenceFeature(copy); } - en2 = dbxrefs.elements(); - while (en2.hasMoreElements()) - { - // we always add as uniprot if we retrieved from uniprot or uniprot name - sq.addDBRef((DBRefEntry) en2.nextElement()); + } + for (DBRefEntry dbr : dbRefs) + { + sequence.addDBRef(dbr); + } + return sequence; + } - } - sq.setPDBId(onlyPdbEntries); - if (entry.getFeature() != null) + /** + * + * @param entry + * UniportEntry + * @return protein name(s) delimited by a white space character + */ + public static String getUniprotEntryDescription(UniprotEntry entry) + { + StringBuilder desc = new StringBuilder(32); + if (entry.getProtein() != null && entry.getProtein().getName() != null) + { + boolean first = true; + for (String nm : entry.getProtein().getName()) { - e = entry.getFeature().elements(); - while (e.hasMoreElements()) + if (!first) { - SequenceFeature sf = (SequenceFeature) e.nextElement(); - sf.setFeatureGroup("Uniprot"); - sq.addSequenceFeature(sf); + desc.append(" "); } + first = false; + desc.append(nm); } } + return desc.toString(); + } + + /** + * + * @param entry + * UniportEntry + * @return The accession id(s) and name(s) delimited by '|'. + */ + public static String getUniprotEntryId(UniprotEntry entry) + { + StringBuilder name = new StringBuilder(32); + // name.append("UniProt/Swiss-Prot"); + // use 'canonicalised' name for optimal id matching + name.append(DBRefSource.UNIPROT); + for (String accessionId : entry.getAccession()) + { + name.append(BAR_DELIMITER); + name.append(accessionId); + } + for (String n : entry.getName()) + { + name.append(BAR_DELIMITER); + name.append(n); + } + return name.toString(); } /* @@ -271,21 +330,32 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#isValidReference(java.lang.String) */ + @Override public boolean isValidReference(String accession) { - return true; + // TODO: make the following a standard validator + return (accession == null || accession.length() < 2) ? false + : getAccessionValidator().search(accession); } /** * return LDHA_CHICK uniprot entry */ + @Override public String getTestQuery() { return "P00340"; } + @Override public String getDbName() { return "Uniprot"; // getDbSource(); } + + @Override + public int getTier() + { + return 0; + } }