2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.ws.dbsources;
23 import jalview.bin.Cache;
24 import jalview.datamodel.Alignment;
25 import jalview.datamodel.AlignmentI;
26 import jalview.datamodel.DBRefEntry;
27 import jalview.datamodel.DBRefSource;
28 import jalview.datamodel.PDBEntry;
29 import jalview.datamodel.Sequence;
30 import jalview.datamodel.SequenceFeature;
31 import jalview.datamodel.SequenceI;
32 import jalview.datamodel.xdb.uniprot.UniprotEntry;
33 import jalview.datamodel.xdb.uniprot.UniprotFeature;
34 import jalview.datamodel.xdb.uniprot.UniprotFile;
35 import jalview.schemes.ResidueProperties;
36 import jalview.util.StringUtils;
37 import jalview.ws.seqfetcher.DbSourceProxyImpl;
39 import java.io.InputStream;
40 import java.io.InputStreamReader;
41 import java.io.Reader;
43 import java.net.URLConnection;
44 import java.util.ArrayList;
45 import java.util.List;
46 import java.util.Vector;
48 import org.exolab.castor.mapping.Mapping;
49 import org.exolab.castor.xml.Unmarshaller;
51 import com.stevesoft.pat.Regex;
57 public class Uniprot extends DbSourceProxyImpl
59 private static final String DEFAULT_UNIPROT_DOMAIN = "https://www.uniprot.org";
61 private static final String BAR_DELIMITER = "|";
64 * Castor mapping loaded from uniprot_mapping.xml
66 private static Mapping map;
76 private String getDomain()
78 return Cache.getDefault("UNIPROT_DOMAIN", DEFAULT_UNIPROT_DOMAIN);
// Accessor for the separator used between accessions, cross-referenced by the
// original comment to DbSourceProxy#getAccessionSeparator().
// NOTE(review): only the signature is visible in this excerpt; the method body
// (and therefore the value returned) is missing and must be confirmed against
// the complete file.
84 * @see jalview.ws.DbSourceProxy#getAccessionSeparator()
87 public String getAccessionSeparator()
95 * @see jalview.ws.DbSourceProxy#getAccessionValidator()
98 public Regex getAccessionValidator()
100 return new Regex("([A-Z]+[0-9]+[A-Z0-9]+|[A-Z0-9]+_[A-Z0-9]+)");
106 * @see jalview.ws.DbSourceProxy#getDbSource()
109 public String getDbSource()
111 return DBRefSource.UNIPROT;
117 * @see jalview.ws.DbSourceProxy#getDbVersion()
120 public String getDbVersion()
122 return "0"; // we really don't know what version we're on.
126 * Reads a file containing the reply to the EBI Fetch Uniprot data query,
127 * unmarshals it to a UniprotFile object, and returns the list of UniprotEntry
128 * data models (mapped from <entry> elements)
133 public Vector<UniprotEntry> getUniprotEntries(Reader fileReader)
135 UniprotFile uni = new UniprotFile();
140 // 1. Load the mapping information from the file
141 map = new Mapping(uni.getClass().getClassLoader());
142 URL url = getClass().getResource("/uniprot_mapping.xml");
143 map.loadMapping(url);
146 // 2. Unmarshal the data
147 Unmarshaller unmar = new Unmarshaller(uni);
148 unmar.setIgnoreExtraElements(true);
149 unmar.setMapping(map);
150 if (fileReader != null)
152 uni = (UniprotFile) unmar.unmarshal(fileReader);
154 } catch (Exception e)
156 System.out.println("Error getUniprotEntries() " + e);
159 return uni.getUniprotEntries();
// Downloads the UniProt record(s) for the given query from the configured
// domain and builds an alignment with one sequence per returned entry.
// NOTE(review): this excerpt is missing several lines of the method: the
// opening of the try block, the tail of the download-address expression, the
// declaration of 'url', the body of the catch clause and the return statement.
// Confirm those against the complete file before changing this method.
165 * @see jalview.ws.DbSourceProxy#getSequenceRecords(java.lang.String[])
168 public AlignmentI getSequenceRecords(String queries) throws Exception
// Normalise the query: upper-case it, then remove every occurrence of a
// database tag (UNIPROT, UNIPROT|, UNIPROT_, UNIREF<digits>_, UNIREF<digits>
// or UNIREF<digits>|).
173 queries = queries.toUpperCase().replaceAll(
174 "(UNIPROT\\|?|UNIPROT_|UNIREF\\d+_|UNIREF\\d+\\|?)", "");
175 AlignmentI al = null;
// The download address starts with <domain>/uniprot/<query>; the remainder of
// this expression is on a line that is not visible here.
177 String downloadstring = getDomain() + "/uniprot/" + queries
180 URLConnection urlconn = null;
182 url = new URL(downloadstring);
183 urlconn = url.openConnection();
184 InputStream istr = urlconn.getInputStream();
// The response stream is decoded as UTF-8 and unmarshalled into UniprotEntry
// objects.
185 Vector<UniprotEntry> entries = getUniprotEntries(
186 new InputStreamReader(istr, "UTF-8"));
// Convert every entry to a SequenceI and wrap them all in a new Alignment.
190 ArrayList<SequenceI> seqs = new ArrayList<>();
191 for (UniprotEntry entry : entries)
193 seqs.add(uniprotEntryToSequenceI(entry));
195 al = new Alignment(seqs.toArray(new SequenceI[0]));
200 } catch (Exception e)
// Converts one UniprotEntry into a SequenceI: the id and description come from
// getUniprotEntryId / getUniprotEntryDescription, the residues from the entry's
// sequence content; database cross-references, PDB entries and sequence
// features are then attached.
// NOTE(review): many lines of this method are not visible in this excerpt
// (e.g. the remaining DBRefEntry constructor arguments, the statements that
// add entries to 'dbRefs', and the return statement) - confirm against the
// complete file.
213 * @return SequenceI instance created from the UniprotEntry instance
215 public SequenceI uniprotEntryToSequenceI(UniprotEntry entry)
217 String id = getUniprotEntryId(entry);
218 SequenceI sequence = new Sequence(id,
219 entry.getUniprotSequence().getContent());
220 sequence.setDescription(getUniprotEntryDescription(entry));
// One UNIPROT cross-reference is created per accession of the entry, stamped
// with this source's database version.
222 final String dbVersion = getDbVersion();
223 ArrayList<DBRefEntry> dbRefs = new ArrayList<>();
224 for (String accessionId : entry.getAccession())
226 DBRefEntry dbRef = new DBRefEntry(DBRefSource.UNIPROT, dbVersion,
229 // mark dbRef as a primary reference for this sequence
// Walk the entry's dbReference elements (modelled as PDBEntry objects whatever
// their type). Each yields a DBRefEntry whose version is "<UNIPROT>:<dbVersion>".
233 Vector<PDBEntry> onlyPdbEntries = new Vector<>();
234 for (PDBEntry pdb : entry.getDbReference())
236 DBRefEntry dbr = new DBRefEntry();
237 dbr.setSource(pdb.getType());
238 dbr.setAccessionId(pdb.getId());
239 dbr.setVersion(DBRefSource.UNIPROT + ":" + dbVersion);
// References of type "PDB" are additionally collected for setPDBId() below.
241 if ("PDB".equals(pdb.getType()))
243 onlyPdbEntries.addElement(pdb);
245 if ("EMBL".equals(pdb.getType()))
247 // look for a CDS reference and add it, too.
248 String cdsId = (String) pdb.getProperty("protein sequence ID");
249 if (cdsId != null && cdsId.trim().length() > 0)
// The protein sequence ID is split on '.': the first part becomes the EMBLCDS
// accession; the second part, when present, becomes the version, otherwise
// "<UNIPROT>:<dbVersion>" is used.
252 String[] vrs = cdsId.split("\\.");
253 dbr = new DBRefEntry(DBRefSource.EMBLCDS, vrs.length > 1 ? vrs[1]
254 : DBRefSource.UNIPROT + ":" + dbVersion, vrs[0]);
258 if ("Ensembl".equals(pdb.getType()))
261 * <dbReference type="Ensembl" id="ENST00000321556">
262 * <molecule id="Q9BXM7-1"/>
263 * <property type="protein sequence ID" value="ENSP00000364204"/>
264 * <property type="gene ID" value="ENSG00000158828"/>
// For Ensembl references the trimmed protein sequence ID is used as the
// accession of an ENSEMBL cross-reference.
267 String cdsId = (String) pdb.getProperty("protein sequence ID");
268 if (cdsId != null && cdsId.trim().length() > 0)
270 dbr = new DBRefEntry(DBRefSource.ENSEMBL,
271 DBRefSource.UNIPROT + ":" + dbVersion, cdsId.trim());
278 sequence.setPDBId(onlyPdbEntries);
// Each Uniprot feature is copied to a SequenceFeature in feature group
// "Uniprot", carrying over type, begin, end and status, with the description
// built by getDescription().
279 if (entry.getFeature() != null)
281 for (UniprotFeature uf : entry.getFeature())
283 SequenceFeature copy = new SequenceFeature(uf.getType(),
284 getDescription(uf), uf.getBegin(), uf.getEnd(), "Uniprot");
285 copy.setStatus(uf.getStatus());
286 sequence.addSequenceFeature(copy);
// Finally attach every collected cross-reference to the sequence.
289 for (DBRefEntry dbr : dbRefs)
291 sequence.addDBRef(dbr);
// Builds the text shown for a Uniprot feature. When the feature has both an
// 'original' value and at least one variant, a variant notation (original,
// position, variant) is written for each variant; the feature's own
// description, if present, is appended afterwards.
// NOTE(review): this excerpt is missing lines of the method, including the
// declaration of 'p', the branches taken for values of four or more
// characters, the tail of the variant ternary, and the condition guarding the
// <html> wrapper (presumably 'asHtml') - confirm against the complete file.
297 * Constructs a feature description from the description and (optionally)
298 * original and variant fields of the Uniprot XML feature
303 protected static String getDescription(UniprotFeature uf)
305 String orig = uf.getOriginal();
306 List<String> variants = uf.getVariation();
307 StringBuilder sb = new StringBuilder();
310 * append variant in standard format if present
312 * multiple variants are split over lines using <br>
314 boolean asHtml = false;
315 if (orig != null && !orig.isEmpty() && variants != null
316 && !variants.isEmpty())
319 for (String var : variants)
321 // TODO proper HGVS nomenclature for delins structural variations
322 // http://varnomen.hgvs.org/recommendations/protein/variant/delins/
323 // for now we are pragmatic - any orig/variant sequence longer than
324 // three characters is shown with single-character notation rather than
325 // three-letter notation
// Originals of up to three characters: each character is looked up in
// ResidueProperties.aa2Triplet and written in sentence case; a character with
// no triplet is written unchanged.
327 if (orig.length() < 4)
329 for (int c = 0, clen = orig.length(); c < clen; c++)
331 char origchar = orig.charAt(c);
332 String orig3 = ResidueProperties.aa2Triplet.get("" + origchar);
333 sb.append(orig3 == null ? origchar
334 : StringUtils.toSentenceCase(orig3));
// The feature position is written between the original and the variant.
342 sb.append(Integer.toString(uf.getPosition()));
// Variants of up to three characters get the same triplet lookup.
344 if (var.length() < 4)
346 for (int c = 0, clen = var.length(); c < clen; c++)
348 char varchar = var.charAt(c);
349 String var3 = ResidueProperties.aa2Triplet.get("" + varchar);
351 sb.append(var3 != null ? StringUtils.toSentenceCase(var3)
// An html line break is written after every variant except the last.
359 if (++p != variants.size())
361 sb.append("<br/> ");
370 String description = uf.getDescription();
371 if (description != null)
373 sb.append(description);
// Wrap the whole text in <html> tags (the guarding condition is not visible).
377 sb.insert(0, "<html>");
378 sb.append("</html>");
381 return sb.toString();
// Builds a description for the entry from its protein name(s). If the entry
// has no protein element, or the protein has no names, the result is an empty
// string.
// NOTE(review): the body of the loop over the names is not visible in this
// excerpt; how 'first' and each name are used must be confirmed against the
// complete file.
388 * @return protein name(s) delimited by a white space character
390 public static String getUniprotEntryDescription(UniprotEntry entry)
392 StringBuilder desc = new StringBuilder(32);
393 if (entry.getProtein() != null && entry.getProtein().getName() != null)
395 boolean first = true;
396 for (String nm : entry.getProtein().getName())
406 return desc.toString();
413 * @return The accession id(s) and name(s) delimited by '|'.
415 public static String getUniprotEntryId(UniprotEntry entry)
417 StringBuilder name = new StringBuilder(32);
418 for (String n : entry.getName())
420 if (name.length() > 0)
422 name.append(BAR_DELIMITER);
426 return name.toString();
432 * @see jalview.ws.DbSourceProxy#isValidReference(java.lang.String)
435 public boolean isValidReference(String accession)
437 // TODO: make the following a standard validator
438 return (accession == null || accession.length() < 2) ? false
439 : getAccessionValidator().search(accession);
// Supplies a query string for this source; per the original comment it
// should identify the LDHA_CHICK uniprot entry.
// NOTE(review): only the signature is visible in this excerpt; the method body
// (and the exact query string returned) must be confirmed against the
// complete file.
443 * return LDHA_CHICK uniprot entry
446 public String getTestQuery()
452 public String getDbName()
454 return "Uniprot"; // getDbSource();