2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.ws.dbsources;
23 import jalview.datamodel.Alignment;
24 import jalview.datamodel.AlignmentI;
25 import jalview.datamodel.DBRefEntry;
26 import jalview.datamodel.DBRefSource;
27 import jalview.datamodel.PDBEntry;
28 import jalview.datamodel.Sequence;
29 import jalview.datamodel.SequenceFeature;
30 import jalview.datamodel.SequenceI;
31 import jalview.datamodel.xdb.uniprot.UniprotEntry;
32 import jalview.datamodel.xdb.uniprot.UniprotFeature;
33 import jalview.datamodel.xdb.uniprot.UniprotFile;
34 import jalview.ws.seqfetcher.DbSourceProxyImpl;
36 import java.io.InputStream;
37 import java.io.InputStreamReader;
38 import java.io.Reader;
40 import java.net.URLConnection;
41 import java.util.ArrayList;
42 import java.util.Vector;
44 import org.exolab.castor.mapping.Mapping;
45 import org.exolab.castor.xml.Unmarshaller;
47 import com.stevesoft.pat.Regex;
53 public class Uniprot extends DbSourceProxyImpl
55 private static final String BAR_DELIMITER = "|";
58 * Castor mapping loaded from uniprot_mapping.xml
60 private static Mapping map;
63 * configurable parameter controlling prefixing of entry names with accessions
65 private boolean includeAllIds = false;
78 * @see jalview.ws.DbSourceProxy#getAccessionSeparator()
81 public String getAccessionSeparator()
89 * @see jalview.ws.DbSourceProxy#getAccessionValidator()
92 public Regex getAccessionValidator()
94 return new Regex("([A-Z]+[0-9]+[A-Z0-9]+|[A-Z0-9]+_[A-Z0-9]+)");
100 * @see jalview.ws.DbSourceProxy#getDbSource()
103 public String getDbSource()
105 return DBRefSource.UNIPROT;
111 * @see jalview.ws.DbSourceProxy#getDbVersion()
114 public String getDbVersion()
116 return "0"; // we really don't know what version we're on.
120 * Reads a file containing the reply to the EBI Fetch Uniprot data query,
121 * unmarshals it to a UniprotFile object, and returns the list of UniprotEntry
122 * data models (mapped from <entry> elements)
127 public Vector<UniprotEntry> getUniprotEntries(Reader fileReader)
129 UniprotFile uni = new UniprotFile();
134 // 1. Load the mapping information from the file
135 map = new Mapping(uni.getClass().getClassLoader());
136 URL url = getClass().getResource("/uniprot_mapping.xml");
137 map.loadMapping(url);
140 // 2. Unmarshal the data
141 Unmarshaller unmar = new Unmarshaller(uni);
142 unmar.setIgnoreExtraElements(true);
143 unmar.setMapping(map);
144 if (fileReader != null)
146 uni = (UniprotFile) unmar.unmarshal(fileReader);
148 } catch (Exception e)
150 System.out.println("Error getUniprotEntries() " + e);
153 return uni.getUniprotEntries();
159 * @see jalview.ws.DbSourceProxy#getSequenceRecords(java.lang.String[])
// Retrieves Uniprot records for the given query string over HTTP, converts
// each returned entry to a sequence and collects them in an Alignment.
// NOTE(review): several lines of this method are absent from this listing
// (the end of the download URL expression, the declaration of 'url', the
// body of the catch clause and the return statement) - confirm the details
// below against the complete file.
162 public AlignmentI getSequenceRecords(String queries) throws Exception
// Upper-case the query, then delete any UNIPROT / UNIREF<digits> prefix
// together with a following '|' or '_' separator.
// NOTE(review): the first alternative (UNIPROT\|?) looks like it matches
// before UNIPROT_ is tried, which would leave the '_' behind - confirm.
167 queries = queries.toUpperCase().replaceAll(
168 "(UNIPROT\\|?|UNIPROT_|UNIREF\\d+_|UNIREF\\d+\\|?)", "");
169 AlignmentI al = null;
// The download URL starts with the uniprot.org address followed by the
// cleaned query; the remainder of the expression is not visible here.
171 String downloadstring = "http://www.uniprot.org/uniprot/" + queries
174 URLConnection urlconn = null;
176 url = new URL(downloadstring);
177 urlconn = url.openConnection();
178 InputStream istr = urlconn.getInputStream();
// Read the response as UTF-8 text and unmarshal it to UniprotEntry objects.
179 Vector<UniprotEntry> entries = getUniprotEntries(
180 new InputStreamReader(istr, "UTF-8"));
// Convert every entry to a SequenceI and build the alignment from them.
184 ArrayList<SequenceI> seqs = new ArrayList<>();
185 for (UniprotEntry entry : entries)
187 seqs.add(uniprotEntryToSequenceI(entry));
189 al = new Alignment(seqs.toArray(new SequenceI[0]));
194 } catch (Exception e)
207 * @return SequenceI instance created from the UniprotEntry instance
// Builds a Sequence from a UniprotEntry: id, residues and description first,
// then database cross-references, PDB entries and sequence features.
// NOTE(review): some lines of this method are absent from this listing
// (constructor-call continuations, the statements that add entries to
// 'dbRefs', and the return statement) - confirm against the complete file.
209 public SequenceI uniprotEntryToSequenceI(UniprotEntry entry)
// The id is built by getUniprotEntryId using this instance's includeAllIds
// setting; the residues come from the entry's sequence content.
211 String id = getUniprotEntryId(entry, includeAllIds);
212 SequenceI sequence = new Sequence(id,
213 entry.getUniprotSequence().getContent());
214 sequence.setDescription(getUniprotEntryDescription(entry));
216 final String dbVersion = getDbVersion();
217 ArrayList<DBRefEntry> dbRefs = new ArrayList<>();
// One UNIPROT reference is created per accession of the entry.
218 for (String accessionId : entry.getAccession())
220 DBRefEntry dbRef = new DBRefEntry(DBRefSource.UNIPROT, dbVersion,
223 // mark dbRef as a primary reference for this sequence
// Walk the entry's database references; those of type "PDB" are also
// collected separately so they can be set on the sequence below.
227 Vector<PDBEntry> onlyPdbEntries = new Vector<>();
228 for (PDBEntry pdb : entry.getDbReference())
// Generic reference: source = reference type, accession = reference id,
// version = DBRefSource.UNIPROT + ":" + dbVersion.
230 DBRefEntry dbr = new DBRefEntry();
231 dbr.setSource(pdb.getType());
232 dbr.setAccessionId(pdb.getId());
233 dbr.setVersion(DBRefSource.UNIPROT + ":" + dbVersion);
235 if ("PDB".equals(pdb.getType()))
237 onlyPdbEntries.addElement(pdb);
239 if ("EMBL".equals(pdb.getType()))
241 // look for a CDS reference and add it, too.
242 String cdsId = (String) pdb.getProperty("protein sequence ID");
243 if (cdsId != null && cdsId.trim().length() > 0)
// The id is split on '.': the first part is passed as the accession and
// the second part, when present, as the version; otherwise the version
// falls back to DBRefSource.UNIPROT + ":" + dbVersion.
246 String[] vrs = cdsId.split("\\.");
247 dbr = new DBRefEntry(DBRefSource.EMBLCDS, vrs.length > 1 ? vrs[1]
248 : DBRefSource.UNIPROT + ":" + dbVersion, vrs[0]);
252 if ("Ensembl".equals(pdb.getType()))
255 * <dbReference type="Ensembl" id="ENST00000321556">
256 * <molecule id="Q9BXM7-1"/>
257 * <property type="protein sequence ID" value="ENSP00000364204"/>
258 * <property type="gene ID" value="ENSG00000158828"/>
// For Ensembl references the trimmed "protein sequence ID" property is
// used as the accession of an ENSEMBL reference.
261 String cdsId = (String) pdb.getProperty("protein sequence ID");
262 if (cdsId != null && cdsId.trim().length() > 0)
264 dbr = new DBRefEntry(DBRefSource.ENSEMBL,
265 DBRefSource.UNIPROT + ":" + dbVersion, cdsId.trim());
272 sequence.setPDBId(onlyPdbEntries);
// Copy each Uniprot feature (type, description, begin, end, status) onto the
// sequence; "Uniprot" is passed as the final constructor argument.
273 if (entry.getFeature() != null)
275 for (UniprotFeature uf : entry.getFeature())
277 SequenceFeature copy = new SequenceFeature(uf.getType(),
278 uf.getDescription(), uf.getBegin(), uf.getEnd(), "Uniprot");
279 copy.setStatus(uf.getStatus());
280 sequence.addSequenceFeature(copy);
// Finally attach every collected database reference to the sequence.
283 for (DBRefEntry dbr : dbRefs)
285 sequence.addDBRef(dbr);
294 * @return protein name(s) delimited by a white space character
// Builds a description string for the entry from its protein name(s).
// When the entry has no protein, or the protein has no name list, the
// result is the empty string.
// NOTE(review): the body of the loop is absent from this listing; the
// 'first' flag suggests a separator is written between names - confirm
// against the complete file.
296 public static String getUniprotEntryDescription(UniprotEntry entry)
298 StringBuilder desc = new StringBuilder(32);
299 if (entry.getProtein() != null && entry.getProtein().getName() != null)
301 boolean first = true;
302 for (String nm : entry.getProtein().getName())
312 return desc.toString();
319 * @return The accession id(s) and name(s) delimited by '|'.
// Builds the sequence id for an entry from DBRefSource.UNIPROT, the entry's
// accession ids and its names, using BAR_DELIMITER ("|") as separator.
// NOTE(review): some lines are absent from this listing - presumably the
// test on includeAllIds that guards the accession section, and the append
// of each name inside the second loop - confirm against the complete file.
321 public static String getUniprotEntryId(UniprotEntry entry,
322 boolean includeAllIds)
324 StringBuilder name = new StringBuilder(32);
327 // // use 'canonicalised' name for optimal id matching
328 name.append(DBRefSource.UNIPROT);
// Each accession is written preceded by the delimiter.
329 for (String accessionId : entry.getAccession())
331 name.append(BAR_DELIMITER);
332 name.append(accessionId);
// For names, the delimiter is only written when the buffer already has
// content, so the id never starts with a delimiter.
336 for (String n : entry.getName())
338 if (name.length() > 0)
340 name.append(BAR_DELIMITER);
344 return name.toString();
350 * @see jalview.ws.DbSourceProxy#isValidReference(java.lang.String)
353 public boolean isValidReference(String accession)
355 // TODO: make the following a standard validator
356 return (accession == null || accession.length() < 2) ? false
357 : getAccessionValidator().search(accession);
361 * return LDHA_CHICK uniprot entry
364 public String getTestQuery()
370 public String getDbName()
372 return "Uniprot"; // getDbSource();