X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fws%2Fdbsources%2FUniprot.java;h=86282c7a5b151bf041b51f7beb1ccaba6bf1096b;hb=7e82e1fed011077e5cd4cc40ac8ad3519d7c47a8;hp=c311ea9e2d685ec997be71934814627c5bb58ea4;hpb=10cb8ffc129d24c1a4bc506199d2e144ec99c038;p=jalview.git diff --git a/src/jalview/ws/dbsources/Uniprot.java b/src/jalview/ws/dbsources/Uniprot.java index c311ea9..86282c7 100644 --- a/src/jalview/ws/dbsources/Uniprot.java +++ b/src/jalview/ws/dbsources/Uniprot.java @@ -29,28 +29,37 @@ import jalview.datamodel.PDBEntry; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; -import jalview.datamodel.xdb.uniprot.UniprotEntry; -import jalview.datamodel.xdb.uniprot.UniprotFeature; -import jalview.datamodel.xdb.uniprot.UniprotFile; import jalview.schemes.ResidueProperties; import jalview.util.StringUtils; import jalview.ws.seqfetcher.DbSourceProxyImpl; +import jalview.xml.binding.uniprot.DbReferenceType; +import jalview.xml.binding.uniprot.Entry; +import jalview.xml.binding.uniprot.FeatureType; +import jalview.xml.binding.uniprot.LocationType; +import jalview.xml.binding.uniprot.PositionType; +import jalview.xml.binding.uniprot.PropertyType; import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.Reader; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; import java.util.List; import java.util.Vector; -import org.exolab.castor.mapping.Mapping; -import org.exolab.castor.xml.Unmarshaller; +import javax.xml.bind.JAXBContext; +import javax.xml.bind.JAXBException; +import javax.xml.stream.FactoryConfigurationError; +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.XMLStreamReader; import com.stevesoft.pat.Regex; /** + * This class queries the Uniprot database for sequence data, unmarshals the + * returned XML, and converts it to Jalview Sequence records (including attached + * database references and sequence features) + * * @author JimP * */ @@ -60,11 +69,6 @@ public class Uniprot extends DbSourceProxyImpl private static final String BAR_DELIMITER = "|"; - /* - * Castor mapping loaded from uniprot_mapping.xml - */ - private static Mapping map; - /** * Constructor */ @@ -122,43 +126,6 @@ public class Uniprot extends DbSourceProxyImpl return "0"; // we really don't know what version we're on. } - /** - * Reads a file containing the reply to the EBI Fetch Uniprot data query, - * unmarshals it to a UniprotFile object, and returns the list of UniprotEntry - * data models (mapped from <entry> elements) - * - * @param fileReader - * @return - */ - public Vector getUniprotEntries(Reader fileReader) - { - UniprotFile uni = new UniprotFile(); - try - { - if (map == null) - { - // 1. Load the mapping information from the file - map = new Mapping(uni.getClass().getClassLoader()); - URL url = getClass().getResource("/uniprot_mapping.xml"); - map.loadMapping(url); - } - - // 2. Unmarshal the data - Unmarshaller unmar = new Unmarshaller(uni); - unmar.setIgnoreExtraElements(true); - unmar.setMapping(map); - if (fileReader != null) - { - uni = (UniprotFile) unmar.unmarshal(fileReader); - } - } catch (Exception e) - { - System.out.println("Error getUniprotEntries() " + e); - } - - return uni.getUniprotEntries(); - } - /* * (non-Javadoc) * @@ -176,25 +143,21 @@ public class Uniprot extends DbSourceProxyImpl String downloadstring = getDomain() + "/uniprot/" + queries + ".xml"; - URL url = null; - URLConnection urlconn = null; - url = new URL(downloadstring); - urlconn = url.openConnection(); + URL url = new URL(downloadstring); + URLConnection urlconn = url.openConnection(); InputStream istr = urlconn.getInputStream(); - Vector entries = getUniprotEntries( - new InputStreamReader(istr, "UTF-8")); - + List entries = getUniprotEntries(istr); if (entries != null) { - ArrayList seqs = new ArrayList<>(); - for (UniprotEntry entry : entries) + List seqs = new ArrayList<>(); + for (Entry entry : entries) { - seqs.add(uniprotEntryToSequenceI(entry)); + seqs.add(uniprotEntryToSequence(entry)); } - al = new Alignment(seqs.toArray(new SequenceI[0])); - + al = new Alignment(seqs.toArray(new SequenceI[seqs.size()])); } + stopQuery(); return al; } catch (Exception e) @@ -207,83 +170,124 @@ public class Uniprot extends DbSourceProxyImpl } /** + * Converts an Entry object (bound from Uniprot XML) to a Jalview Sequence * * @param entry - * UniprotEntry - * @return SequenceI instance created from the UniprotEntry instance + * @return */ - public SequenceI uniprotEntryToSequenceI(UniprotEntry entry) + SequenceI uniprotEntryToSequence(Entry entry) { String id = getUniprotEntryId(entry); + String seqString = entry.getSequence().getValue(); + + /* + * for backwards compatibility with Castor processing, + * remove any internal spaces + */ + if (seqString.indexOf(' ') > -1) + { + seqString = seqString.replace(" ", ""); + } SequenceI sequence = new Sequence(id, - entry.getUniprotSequence().getContent()); + seqString); sequence.setDescription(getUniprotEntryDescription(entry)); + /* + * add a 'self' DBRefEntry for each accession + */ final String dbVersion = getDbVersion(); - ArrayList dbRefs = new ArrayList<>(); + List dbRefs = new ArrayList<>(); for (String accessionId : entry.getAccession()) { DBRefEntry dbRef = new DBRefEntry(DBRefSource.UNIPROT, dbVersion, accessionId); - - // mark dbRef as a primary reference for this sequence dbRefs.add(dbRef); } - Vector onlyPdbEntries = new Vector<>(); - for (PDBEntry pdb : entry.getDbReference()) + /* + * add a DBRefEntry for each dbReference element in the XML; + * also add a PDBEntry if type="PDB"; + * also add an EMBLCDS dbref if protein sequence id is given + * also add an Ensembl dbref " " " " " " + */ + Vector pdbRefs = new Vector<>(); + for (DbReferenceType dbref : entry.getDbReference()) { - DBRefEntry dbr = new DBRefEntry(); - dbr.setSource(pdb.getType()); - dbr.setAccessionId(pdb.getId()); - dbr.setVersion(DBRefSource.UNIPROT + ":" + dbVersion); + String type = dbref.getType(); + DBRefEntry dbr = new DBRefEntry(type, + DBRefSource.UNIPROT + ":" + dbVersion, dbref.getId()); dbRefs.add(dbr); - if ("PDB".equals(pdb.getType())) + if ("PDB".equals(type)) { - onlyPdbEntries.addElement(pdb); + pdbRefs.add(new PDBEntry(dbr)); } - if ("EMBL".equals(pdb.getType())) + if ("EMBL".equals(type)) { - // look for a CDS reference and add it, too. - String cdsId = (String) pdb.getProperty("protein sequence ID"); + /* + * e.g. Uniprot accession Q9BXM7 has + * + * + * + * + */ + String cdsId = getProperty(dbref.getProperty(), + "protein sequence ID"); if (cdsId != null && cdsId.trim().length() > 0) { // remove version String[] vrs = cdsId.split("\\."); - dbr = new DBRefEntry(DBRefSource.EMBLCDS, vrs.length > 1 ? vrs[1] - : DBRefSource.UNIPROT + ":" + dbVersion, vrs[0]); + String version = vrs.length > 1 ? vrs[1] + : DBRefSource.UNIPROT + ":" + dbVersion; + dbr = new DBRefEntry(DBRefSource.EMBLCDS, version, vrs[0]); dbRefs.add(dbr); } } - if ("Ensembl".equals(pdb.getType())) + if ("Ensembl".equals(type)) { - /*UniprotXML + /* + * e.g. Uniprot accession Q9BXM7 has * - * - * - * - * + * + * + * + * */ - String cdsId = (String) pdb.getProperty("protein sequence ID"); + String cdsId = getProperty(dbref.getProperty(), + "protein sequence ID"); if (cdsId != null && cdsId.trim().length() > 0) { dbr = new DBRefEntry(DBRefSource.ENSEMBL, DBRefSource.UNIPROT + ":" + dbVersion, cdsId.trim()); dbRefs.add(dbr); - } } } - sequence.setPDBId(onlyPdbEntries); + /* + * create features; they have either begin and end, or position, in XML + */ + sequence.setPDBId(pdbRefs); if (entry.getFeature() != null) { - for (UniprotFeature uf : entry.getFeature()) + for (FeatureType uf : entry.getFeature()) { - SequenceFeature copy = new SequenceFeature(uf.getType(), - getDescription(uf), uf.getBegin(), uf.getEnd(), "Uniprot"); - copy.setStatus(uf.getStatus()); - sequence.addSequenceFeature(copy); + LocationType location = uf.getLocation(); + int start = 0; + int end = 0; + if (location.getPosition() != null) + { + start = location.getPosition().getPosition().intValue(); + end = start; + } + else + { + start = location.getBegin().getPosition().intValue(); + end = location.getEnd().getPosition().intValue(); + } + SequenceFeature sf = new SequenceFeature(uf.getType(), + getDescription(uf), start, end, "Uniprot"); + sf.setStatus(uf.getStatus()); + sequence.addSequenceFeature(sf); } } for (DBRefEntry dbr : dbRefs) @@ -294,22 +298,23 @@ public class Uniprot extends DbSourceProxyImpl } /** - * Constructs a feature description from the description and (optionally) - * original and variant fields of the Uniprot XML feature + * A helper method that builds a sequence feature description * - * @param uf + * @param feature * @return */ - protected static String getDescription(UniprotFeature uf) + static String getDescription(FeatureType feature) { - String orig = uf.getOriginal(); - List variants = uf.getVariation(); + String orig = feature.getOriginal(); + List variants = feature.getVariation(); StringBuilder sb = new StringBuilder(); /* * append variant in standard format if present * e.g. p.Arg59Lys + * multiple variants are split over lines using
*/ + boolean asHtml = false; if (orig != null && !orig.isEmpty() && variants != null && !variants.isEmpty()) { @@ -337,7 +342,11 @@ public class Uniprot extends DbSourceProxyImpl sb.append(orig); } - sb.append(Integer.toString(uf.getPosition())); + LocationType location = feature.getLocation(); + PositionType start = location.getPosition() == null + ? location.getBegin() + : location.getPosition(); + sb.append(Integer.toString(start.getPosition().intValue())); if (var.length() < 4) { @@ -356,7 +365,8 @@ public class Uniprot extends DbSourceProxyImpl } if (++p != variants.size()) { - sb.append("\n"); + sb.append("
  "); + asHtml = true; } else { @@ -364,47 +374,72 @@ public class Uniprot extends DbSourceProxyImpl } } } - String description = uf.getDescription(); + String description = feature.getDescription(); if (description != null) { sb.append(description); } + if (asHtml) + { + sb.insert(0, ""); + sb.append(""); + } return sb.toString(); } /** + * A helper method that searches the list of properties for one with the given + * key, and if found returns the property value, else returns null * - * @param entry - * UniportEntry - * @return protein name(s) delimited by a white space character + * @param properties + * @param key + * @return */ - public static String getUniprotEntryDescription(UniprotEntry entry) + static String getProperty(List properties, String key) { - StringBuilder desc = new StringBuilder(32); - if (entry.getProtein() != null && entry.getProtein().getName() != null) + String value = null; + if (properties != null) { - boolean first = true; - for (String nm : entry.getProtein().getName()) + for (PropertyType prop : properties) { - if (!first) + if (key.equals(prop.getType())) { - desc.append(" "); + value = prop.getValue(); + break; } - first = false; - desc.append(nm); } } - return desc.toString(); + return value; } /** - * + * Extracts xml element entry/protein/recommendedName/fullName + * * @param entry - * UniprotEntry - * @return The accession id(s) and name(s) delimited by '|'. + * @return */ - public static String getUniprotEntryId(UniprotEntry entry) + static String getUniprotEntryDescription(Entry entry) + { + String desc = ""; + if (entry.getProtein() != null + && entry.getProtein().getRecommendedName() != null) + { + // fullName is mandatory if recommendedName is present + desc = entry.getProtein().getRecommendedName().getFullName() + .getValue(); + } + return desc; + } + + /** + * Constructs a sequence id by concatenating all entry/name elements with '|' + * separator + * + * @param entry + * @return + */ + static String getUniprotEntryId(Entry entry) { StringBuilder name = new StringBuilder(32); for (String n : entry.getName()) @@ -451,4 +486,35 @@ public class Uniprot extends DbSourceProxyImpl { return 0; } + + /** + * Reads the reply to the EBI Fetch Uniprot data query, unmarshals it to an + * Uniprot object, and returns the enclosed Entry objects, or null on any + * failure + * + * @param is + * @return + */ + public List getUniprotEntries(InputStream is) + { + List entries = null; + try + { + JAXBContext jc = JAXBContext + .newInstance("jalview.xml.binding.uniprot"); + XMLStreamReader streamReader = XMLInputFactory.newInstance() + .createXMLStreamReader(is); + javax.xml.bind.Unmarshaller um = jc.createUnmarshaller(); + jalview.xml.binding.uniprot.Uniprot uniprot = (jalview.xml.binding.uniprot.Uniprot) um.unmarshal(streamReader); + if (uniprot != null && !uniprot.getEntry().isEmpty()) + { + entries = uniprot.getEntry(); + } + } catch (JAXBException | XMLStreamException + | FactoryConfigurationError e) + { + e.printStackTrace(); + } + return entries; + } }