*/
package jalview.ws.dbsources;
+import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.DBRefEntry;
import jalview.datamodel.DBRefSource;
import jalview.datamodel.PDBEntry;
+import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
-import jalview.datamodel.UniprotEntry;
-import jalview.datamodel.UniprotFile;
-import jalview.ws.ebi.EBIFetchClient;
+import jalview.datamodel.xdb.uniprot.UniprotEntry;
+import jalview.datamodel.xdb.uniprot.UniprotFeature;
+import jalview.datamodel.xdb.uniprot.UniprotFile;
import jalview.ws.seqfetcher.DbSourceProxyImpl;
-import java.io.File;
-import java.io.FileReader;
+import java.io.InputStream;
+import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
+import java.net.URLConnection;
+import java.util.ArrayList;
import java.util.Vector;
import org.exolab.castor.mapping.Mapping;
*/
public class Uniprot extends DbSourceProxyImpl
{
-
+ // delimiter placed between successive entry names by getUniprotEntryId
private static final String BAR_DELIMITER = "|";
- private static final String NEWLINE = "\n";
-
+ /*
+ * Castor mapping loaded from uniprot_mapping.xml
+ */
private static Mapping map;
/**
@Override
public String getAccessionSeparator()
{
- return null; // ";";
+ return null; // NOTE(review): presumably null means no separator is supported (one accession per query) - confirm against the DbSourceProxy contract
}
/*
queries = queries.toUpperCase().replaceAll(
"(UNIPROT\\|?|UNIPROT_|UNIREF\\d+_|UNIREF\\d+\\|?)", "");
AlignmentI al = null;
- EBIFetchClient ebi = new EBIFetchClient();
- // uniprotxml parameter required since december 2007
- // uniprotkb dbname changed introduced december 2008
- File file = ebi.fetchDataAsFile("uniprotkb:" + queries, "uniprotxml",
- null);
- Vector<UniprotEntry> entries = getUniprotEntries(new FileReader(file));
+
+ String downloadstring = "http://www.uniprot.org/uniprot/" + queries
+ + ".xml";
+ URL url = null;
+ URLConnection urlconn = null;
+
+ url = new URL(downloadstring);
+ urlconn = url.openConnection();
+ InputStream istr = urlconn.getInputStream();
+ Vector<UniprotEntry> entries = getUniprotEntries(
+ new InputStreamReader(istr, "UTF-8"));
if (entries != null)
{
- /*
- * If Castor binding included sequence@length, we could guesstimate the
- * size of buffer to hold the alignment
- */
- StringBuffer result = new StringBuffer(128);
- // First, make the new sequences
+ ArrayList<SequenceI> seqs = new ArrayList<>();
for (UniprotEntry entry : entries)
{
- StringBuilder name = constructSequenceFastaHeader(entry);
-
- result.append(name).append(NEWLINE)
- .append(entry.getUniprotSequence().getContent())
- .append(NEWLINE);
+ seqs.add(uniprotEntryToSequenceI(entry));
}
+ al = new Alignment(seqs.toArray(new SequenceI[0]));
- // Then read in the features and apply them to the dataset
- al = parseResult(result.toString());
- if (al != null)
- {
- // Decorate the alignment with database entries.
- addUniprotXrefs(al, entries);
- }
- else
- {
- results = result;
- }
}
stopQuery();
return al;
} catch (Exception e)
{
- stopQuery();
throw (e);
+ } finally
+ {
+ stopQuery();
}
}
/**
- * Construct a Fasta-format sequence header by concatenating the source,
- * accession id(s) and name(s), delimited by '|', plus any protein names, now
- * with space rather than bar delimiter
+ * Builds a Sequence from a parsed UniProt entry. The sequence id comes from
+ * getUniprotEntryId and the description from getUniprotEntryDescription;
+ * database references, PDB entries and sequence features found on the entry
+ * are then attached to the new sequence.
*
* @param entry
- * @return
+ * the UniprotEntry to convert
+ * @return SequenceI instance created from the UniprotEntry instance
*/
- public static StringBuilder constructSequenceFastaHeader(
- UniprotEntry entry)
+ public SequenceI uniprotEntryToSequenceI(UniprotEntry entry)
{
- StringBuilder name = new StringBuilder(32);
- name.append(">UniProt/Swiss-Prot");
+ String id = getUniprotEntryId(entry);
+ SequenceI sequence = new Sequence(id,
+ entry.getUniprotSequence().getContent());
+ sequence.setDescription(getUniprotEntryDescription(entry));
+
+ final String dbVersion = getDbVersion();
+ // references are collected here and added to the sequence at the end
+ ArrayList<DBRefEntry> dbRefs = new ArrayList<>();
for (String accessionId : entry.getAccession())
{
- name.append(BAR_DELIMITER);
- name.append(accessionId);
+ DBRefEntry dbRef = new DBRefEntry(DBRefSource.UNIPROT, dbVersion,
+ accessionId);
+
+ // one UNIPROT reference per accession id
+ // NOTE(review): the earlier comment said this marks dbRef as a primary
+ // reference, but the visible code only queues it in dbRefs - confirm
+ dbRefs.add(dbRef);
}
- for (String n : entry.getName())
+
+ Vector<PDBEntry> onlyPdbEntries = new Vector<>();
+ // despite the PDBEntry type, these cross-references carry other types too
+ // ("EMBL" and "Ensembl" are tested below); each one becomes a DBRefEntry
+ for (PDBEntry pdb : entry.getDbReference())
{
- name.append(BAR_DELIMITER);
- name.append(n);
+ DBRefEntry dbr = new DBRefEntry();
+ dbr.setSource(pdb.getType());
+ dbr.setAccessionId(pdb.getId());
+ dbr.setVersion(DBRefSource.UNIPROT + ":" + dbVersion);
+ dbRefs.add(dbr);
+ if ("PDB".equals(pdb.getType()))
+ {
+ // only genuine PDB references are kept as structure entries
+ onlyPdbEntries.addElement(pdb);
+ }
+ if ("EMBL".equals(pdb.getType()))
+ {
+ // look for a CDS reference and add it, too.
+ String cdsId = (String) pdb.getProperty("protein sequence ID");
+ if (cdsId != null && cdsId.trim().length() > 0)
+ {
+ // split "id.version": the part before the dot is the accession; the
+ // part after it, when present, is used as the version, otherwise
+ // the version falls back to UNIPROT:<dbVersion>
+ // NOTE(review): cdsId is split untrimmed here, whereas the Ensembl
+ // branch below trims it - confirm whether that is intended
+ String[] vrs = cdsId.split("\\.");
+ dbr = new DBRefEntry(DBRefSource.EMBLCDS, vrs.length > 1 ? vrs[1]
+ : DBRefSource.UNIPROT + ":" + dbVersion, vrs[0]);
+ dbRefs.add(dbr);
+ }
+ }
+ if ("Ensembl".equals(pdb.getType()))
+ {
+ /* UniprotXML example:
+ * <dbReference type="Ensembl" id="ENST00000321556">
+ * <molecule id="Q9BXM7-1"/>
+ * <property type="protein sequence ID" value="ENSP00000364204"/>
+ * <property type="gene ID" value="ENSG00000158828"/>
+ * </dbReference>
+ */
+ String cdsId = (String) pdb.getProperty("protein sequence ID");
+ if (cdsId != null && cdsId.trim().length() > 0)
+ {
+ // add a second ENSEMBL reference keyed on the protein sequence id
+ dbr = new DBRefEntry(DBRefSource.ENSEMBL,
+ DBRefSource.UNIPROT + ":" + dbVersion, cdsId.trim());
+ dbRefs.add(dbr);
+
+ }
+ }
}
- if (entry.getProtein() != null && entry.getProtein().getName() != null)
+ sequence.setPDBId(onlyPdbEntries);
+ if (entry.getFeature() != null)
{
- for (String nm : entry.getProtein().getName())
+ for (UniprotFeature uf : entry.getFeature())
{
- name.append(" ").append(nm);
+ // build a new SequenceFeature from the UniprotFeature's fields
+ // NOTE(review): the last argument is presumably the feature group
+ // (the replaced code called setFeatureGroup("Uniprot")) - confirm
+ SequenceFeature copy = new SequenceFeature(uf.getType(),
+ uf.getDescription(), uf.getBegin(), uf.getEnd(), "Uniprot");
+ copy.setStatus(uf.getStatus());
+ sequence.addSequenceFeature(copy);
}
}
- return name;
+ for (DBRefEntry dbr : dbRefs)
+ {
+ sequence.addDBRef(dbr);
+ }
+ return sequence;
}
/**
- * add an ordered set of UniprotEntry objects to an ordered set of seuqences.
+ * Joins the protein names of a UniProt entry into one description string.
*
- * @param al
- * - a sequence of n sequences
- * @param entries
- * a list of n uniprot entries to be analysed.
+ * @param entry
+ * UniprotEntry
+ * @return protein name(s) delimited by a single space character; an empty
+ * string when the entry has no protein or no protein names
*/
- public void addUniprotXrefs(AlignmentI al, Vector<UniprotEntry> entries)
+ public static String getUniprotEntryDescription(UniprotEntry entry)
{
- final String dbVersion = getDbVersion();
-
- for (int i = 0; i < entries.size(); i++)
+ StringBuilder desc = new StringBuilder(32);
+ if (entry.getProtein() != null && entry.getProtein().getName() != null)
{
- UniprotEntry entry = entries.elementAt(i);
- Vector<PDBEntry> onlyPdbEntries = new Vector<PDBEntry>();
- Vector<DBRefEntry> dbxrefs = new Vector<DBRefEntry>();
-
- for (PDBEntry pdb : entry.getDbReference())
+ boolean first = true;
+ for (String nm : entry.getProtein().getName())
{
- DBRefEntry dbr = new DBRefEntry();
- dbr.setSource(pdb.getType());
- dbr.setAccessionId(pdb.getId());
- dbr.setVersion(DBRefSource.UNIPROT + ":" + dbVersion);
- dbxrefs.addElement(dbr);
- if ("PDB".equals(pdb.getType()))
+ // the separator goes before every name except the first
+ if (!first)
{
- onlyPdbEntries.addElement(pdb);
+ desc.append(" ");
}
+ first = false;
+ desc.append(nm);
}
+ }
+ return desc.toString();
+ }
- SequenceI sq = al.getSequenceAt(i);
- while (sq.getDatasetSequence() != null)
- {
- sq = sq.getDatasetSequence();
- }
-
- for (String accessionId : entry.getAccession())
- {
- /*
- * add as uniprot whether retrieved from uniprot or uniprot_name
- */
- sq.addDBRef(new DBRefEntry(DBRefSource.UNIPROT, dbVersion,
- accessionId));
- }
-
- for (DBRefEntry dbRef : dbxrefs)
- {
- sq.addDBRef(dbRef);
- }
- sq.setPDBId(onlyPdbEntries);
- if (entry.getFeature() != null)
+ /**
+ * Builds the sequence id for a UniProt entry by joining its names.
+ *
+ * @param entry
+ * UniprotEntry
+ * @return the entry name(s) delimited by '|'; an empty string when the entry
+ * has no names. NOTE(review): accession ids are not included here,
+ * although the earlier wording of this comment said they were.
+ */
+ public static String getUniprotEntryId(UniprotEntry entry)
+ {
+ StringBuilder name = new StringBuilder(32);
+ for (String n : entry.getName())
+ {
+ // only insert the delimiter once something has already been appended
+ if (name.length() > 0)
{
- for (SequenceFeature sf : entry.getFeature())
- {
- sf.setFeatureGroup("Uniprot");
- sq.addSequenceFeature(sf);
- }
+ name.append(BAR_DELIMITER);
}
+ name.append(n);
}
+ return name.toString();
}
/*
{
return 0;
}
-
- @Override
- public int getMaximumQueryCount()
- {
- // relocated this commented out code...
- // addDbSourceProperty(DBRefSource.MULTIACC, new Integer(50));
- // return 50;
- return super.getMaximumQueryCount();
- }
}