import jalview.datamodel.DBRefEntry;
import jalview.datamodel.DBRefSource;
import jalview.datamodel.PDBEntry;
+import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.datamodel.UniprotEntry;
import jalview.datamodel.UniprotFile;
import jalview.ws.ebi.EBIFetchClient;
-import jalview.ws.seqfetcher.DbSourceProxy;
import jalview.ws.seqfetcher.DbSourceProxyImpl;
import java.io.File;
import java.io.FileReader;
import java.io.Reader;
+import java.net.URL;
+import java.util.ArrayList;
import java.util.Vector;
+import org.exolab.castor.mapping.Mapping;
import org.exolab.castor.xml.Unmarshaller;
import com.stevesoft.pat.Regex;
* @author JimP
*
*/
-public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy
+public class Uniprot extends DbSourceProxyImpl
{
-
private static final String BAR_DELIMITER = "|";
- private static final String NEWLINE = "\n";
-
- private static org.exolab.castor.mapping.Mapping map;
+ /*
+ * Castor mapping loaded from uniprot_mapping.xml
+ */
+ private static Mapping map;
/**
* Constructor
public Uniprot()
{
super();
- addDbSourceProperty(DBRefSource.SEQDB, DBRefSource.SEQDB);
- addDbSourceProperty(DBRefSource.PROTSEQDB);
- // addDbSourceProperty(DBRefSource.MULTIACC, new Integer(50));
}
/*
+ * Always returns null. NOTE(review): presumably this means the source
+ * defines no separator for multiple accessions in one query - confirm
+ * against DbSourceProxy.
*
* @see jalview.ws.DbSourceProxy#getAccessionSeparator()
*/
+ @Override
public String getAccessionSeparator()
{
- return null; // ";";
+ return null;
}
/*
*
* @see jalview.ws.DbSourceProxy#getAccessionValidator()
*/
+ @Override
public Regex getAccessionValidator()
{
return new Regex("([A-Z]+[0-9]+[A-Z0-9]+|[A-Z0-9]+_[A-Z0-9]+)");
*
* @see jalview.ws.DbSourceProxy#getDbSource()
*/
+ @Override
public String getDbSource()
{
return DBRefSource.UNIPROT;
*
* @see jalview.ws.DbSourceProxy#getDbVersion()
*/
+ @Override
public String getDbVersion()
{
return "0"; // we really don't know what version we're on.
if (map == null)
{
// 1. Load the mapping information from the file
- map = new org.exolab.castor.mapping.Mapping(uni.getClass()
- .getClassLoader());
- java.net.URL url = getClass().getResource("/uniprot_mapping.xml");
+ map = new Mapping(uni.getClass().getClassLoader());
+ URL url = getClass().getResource("/uniprot_mapping.xml");
map.loadMapping(url);
}
*
* @see jalview.ws.DbSourceProxy#getSequenceRecords(java.lang.String[])
*/
+ @Override
public AlignmentI getSequenceRecords(String queries) throws Exception
{
startQuery();
{
queries = queries.toUpperCase().replaceAll(
"(UNIPROT\\|?|UNIPROT_|UNIREF\\d+_|UNIREF\\d+\\|?)", "");
- Alignment al = null;
+ AlignmentI al = null;
EBIFetchClient ebi = new EBIFetchClient();
// uniprotxml parameter required since december 2007
// uniprotkb dbname changed introduced december 2008
File file = ebi.fetchDataAsFile("uniprotkb:" + queries, "uniprotxml",
- null);
+ ".xml");
Vector<UniprotEntry> entries = getUniprotEntries(new FileReader(file));
if (entries != null)
{
- /*
- * If Castor binding included sequence@length, we could guesstimate the
- * size of buffer to hold the alignment
- */
- StringBuffer result = new StringBuffer(128);
- // First, make the new sequences
+ ArrayList<SequenceI> seqs = new ArrayList<SequenceI>();
for (UniprotEntry entry : entries)
{
- StringBuilder name = constructSequenceFastaHeader(entry);
-
- result.append(name).append(NEWLINE)
- .append(entry.getUniprotSequence().getContent())
- .append(NEWLINE);
+ seqs.add(uniprotEntryToSequenceI(entry));
}
+ al = new Alignment(seqs.toArray(new SequenceI[0]));
- // Then read in the features and apply them to the dataset
- al = parseResult(result.toString());
- if (al != null)
- {
- // Decorate the alignment with database entries.
- addUniprotXrefs(al, entries);
- }
- else
- {
- results = result;
- }
}
stopQuery();
return al;
}
/**
+ * Converts one parsed Uniprot entry into a sequence. The id comes from
+ * getUniprotEntryId, the residues from the entry's sequence content and the
+ * description from getUniprotEntryDescription. A DBRefEntry is created for
+ * every accession and for every dbReference of the entry; an additional
+ * EMBLCDS or ENSEMBL ref is created when an EMBL or Ensembl reference
+ * carries a non-blank "protein sequence ID" property. References of type
+ * "PDB" are also attached through setPDBId, and any features of the entry
+ * are added to the sequence under the feature group "Uniprot".
- * Construct a Fasta-format sequence header by concatenating the source,
- * accession id(s) and name(s), delimited by '|', plus any protein names, now
- * with space rather than bar delimiter
*
* @param entry
- * @return
+ * the UniprotEntry to convert
+ * @return a new SequenceI carrying the id, description, database refs,
+ * PDB entries and features taken from the UniprotEntry
*/
- public static StringBuilder constructSequenceFastaHeader(
- UniprotEntry entry)
- {
- StringBuilder name = new StringBuilder(32);
- name.append(">UniProt/Swiss-Prot");
+ public SequenceI uniprotEntryToSequenceI(UniprotEntry entry){
+ String id = getUniprotEntryId(entry);
+ SequenceI sequence = new Sequence(id, entry.getUniprotSequence()
+ .getContent());
+ sequence.setDescription(getUniprotEntryDescription(entry));
+
+ final String dbVersion = getDbVersion();
+ ArrayList<DBRefEntry> dbRefs = new ArrayList<DBRefEntry>();
for (String accessionId : entry.getAccession())
{
- name.append(BAR_DELIMITER);
- name.append(accessionId);
+ DBRefEntry dbRef = new DBRefEntry(DBRefSource.UNIPROT, dbVersion,
+ accessionId);
+
+ // only collected here; promotion to primary is left to ensurePrimaries below
+ dbRefs.add(dbRef);
}
- for (String n : entry.getName())
+
+ Vector<PDBEntry> onlyPdbEntries = new Vector<PDBEntry>();
+ for (PDBEntry pdb : entry.getDbReference())
{
- name.append(BAR_DELIMITER);
- name.append(n);
+ DBRefEntry dbr = new DBRefEntry();
+ dbr.setSource(pdb.getType());
+ dbr.setAccessionId(pdb.getId());
+ dbr.setVersion(DBRefSource.UNIPROT + ":" + dbVersion);
+ dbRefs.add(dbr);
+ if ("PDB".equals(pdb.getType()))
+ {
+ onlyPdbEntries.addElement(pdb);
+ }
+ if ("EMBL".equals(pdb.getType()))
+ {
+ // an EMBL ref may name a protein (CDS) id: add an EMBLCDS ref for it, too.
+ String cdsId = (String) pdb.getProperty()
+ .get("protein sequence ID");
+ if (cdsId != null && cdsId.trim().length() > 0)
+ {
+ // split on '.': vrs[0] is the accession, vrs[1] (when present) the version
+ String[] vrs = cdsId.split("\\.");
+ dbr = new DBRefEntry(DBRefSource.EMBLCDS, vrs.length > 1 ? vrs[1]
+ : DBRefSource.UNIPROT + ":" + dbVersion, vrs[0]);
+ dbRefs.add(dbr);
+ }
+ }
+ if ("Ensembl".equals(pdb.getType()))
+ {
+ /* Example UniprotXML record for an Ensembl reference:
+ * <dbReference type="Ensembl" id="ENST00000321556">
+ * <molecule id="Q9BXM7-1"/>
+ * <property type="protein sequence ID" value="ENSP00000364204"/>
+ * <property type="gene ID" value="ENSG00000158828"/>
+ * </dbReference>
+ */
+ String cdsId = (String) pdb.getProperty()
+ .get("protein sequence ID");
+ if (cdsId != null && cdsId.trim().length() > 0)
+ {
+ dbr = new DBRefEntry(DBRefSource.ENSEMBL, DBRefSource.UNIPROT
+ + ":" + dbVersion, cdsId.trim());
+ dbRefs.add(dbr);
+
+ }
+ }
+
}
- if (entry.getProtein() != null
- && entry.getProtein().getName() != null)
+ sequence.setPDBId(onlyPdbEntries);
+ if (entry.getFeature() != null)
{
- for (String nm : entry.getProtein().getName())
+ for (SequenceFeature sf : entry.getFeature())
{
- name.append(" ").append(nm);
+ sf.setFeatureGroup("Uniprot");
+ sequence.addSequenceFeature(sf);
}
}
- return name;
+ // all collected refs are assigned in a single setDBRefs call.
+ sequence.setDBRefs(dbRefs.toArray(new DBRefEntry[0]));
+ // ensurePrimaries is then needed to reify any refs that should become
+ // primary refs
+ DBRefUtils.ensurePrimaries(sequence); // promote any direct refs to primary
+ // source dbs
+ return sequence;
}
/**
+ * Joins the protein name(s) of a Uniprot entry into one description string.
- * add an ordered set of UniprotEntry objects to an ordered set of seuqences.
*
- * @param al
- * - a sequence of n sequences
- * @param entries
- * a list of n uniprot entries to be analysed.
+ * @param entry
+ * UniprotEntry whose protein name(s) are read
+ * @return protein name(s) separated by a single space character, or an
+ * empty string when the entry has no protein or no protein names
*/
- public void addUniprotXrefs(Alignment al, Vector<UniprotEntry> entries)
+ public static String getUniprotEntryDescription(UniprotEntry entry)
{
- final String dbVersion = getDbVersion();
-
- for (int i = 0; i < entries.size(); i++)
+ StringBuilder desc = new StringBuilder(32);
+ if (entry.getProtein() != null && entry.getProtein().getName() != null)
{
- UniprotEntry entry = entries.elementAt(i);
- Vector<PDBEntry> onlyPdbEntries = new Vector<PDBEntry>();
- Vector<DBRefEntry> dbxrefs = new Vector<DBRefEntry>();
-
- for (PDBEntry pdb : entry.getDbReference())
+ boolean first = true;
+ for (String nm : entry.getProtein().getName())
{
- DBRefEntry dbr = new DBRefEntry();
- dbr.setSource(pdb.getType());
- dbr.setAccessionId(pdb.getId());
- dbr.setVersion(DBRefSource.UNIPROT + ":" + dbVersion);
- dbxrefs.addElement(dbr);
- if ("PDB".equals(pdb.getType()))
+ if (!first)
{
- onlyPdbEntries.addElement(pdb);
+ desc.append(" ");
}
+ first = false;
+ desc.append(nm);
}
+ }
+ return desc.toString();
+ }
- SequenceI sq = al.getSequenceAt(i);
- while (sq.getDatasetSequence() != null)
- {
- sq = sq.getDatasetSequence();
- }
-
- for (String accessionId : entry.getAccession())
- {
- /*
- * add as uniprot whether retrieved from uniprot or uniprot_name
- */
- sq.addDBRef(new DBRefEntry(DBRefSource.UNIPROT, dbVersion,
- accessionId));
- }
-
- for (DBRefEntry dbRef : dbxrefs)
- {
- sq.addDBRef(dbRef);
- }
- sq.setPDBId(onlyPdbEntries);
- if (entry.getFeature() != null)
- {
- for (SequenceFeature sf : entry.getFeature())
- {
- sf.setFeatureGroup("Uniprot");
- sq.addSequenceFeature(sf);
- }
- }
+ /**
+ * Builds the sequence id for a Uniprot entry: DBRefSource.UNIPROT followed
+ * by every accession id and then every entry name, each preceded by
+ * BAR_DELIMITER ('|').
+ *
+ * @param entry
+ * UniprotEntry supplying the accession id(s) and name(s)
+ * @return the source prefix, accession id(s) and name(s) delimited by '|'
+ */
+ public static String getUniprotEntryId(UniprotEntry entry)
+ {
+ StringBuilder name = new StringBuilder(32);
+ // the literal prefix "UniProt/Swiss-Prot" is no longer used here;
+ // use 'canonicalised' name for optimal id matching
+ name.append(DBRefSource.UNIPROT);
+ for (String accessionId : entry.getAccession())
{
+ name.append(BAR_DELIMITER);
+ name.append(accessionId);
}
+ for (String n : entry.getName())
+ {
+ name.append(BAR_DELIMITER);
+ name.append(n);
+ }
+ return name.toString();
}
/*
*
* @see jalview.ws.DbSourceProxy#isValidReference(java.lang.String)
*/
+ @Override
public boolean isValidReference(String accession)
{
// TODO: make the following a standard validator
/**
* return LDHA_CHICK uniprot entry
+ *
+ * @return the fixed accession "P00340"
*/
+ @Override
public String getTestQuery()
{
return "P00340";
}
+ @Override
public String getDbName()
{
return "Uniprot"; // getDbSource();