From 08fbe7c3d40a4077ab519334952a079968c07cd1 Mon Sep 17 00:00:00 2001 From: jprocter Date: Fri, 21 Dec 2007 15:03:56 +0000 Subject: [PATCH] sequence db fetcher and db reference validation/annotation transfer --- src/jalview/ws/DBRefFetcher.java | 433 +++++++++++++++----------- src/jalview/ws/dbsources/EmblCdsSouce.java | 4 + src/jalview/ws/dbsources/EmblSource.java | 4 + src/jalview/ws/dbsources/EmblXmlSource.java | 17 +- src/jalview/ws/dbsources/GeneDbSource.java | 98 ++++++ src/jalview/ws/dbsources/Pdb.java | 15 +- src/jalview/ws/dbsources/Pfam.java | 50 +-- src/jalview/ws/dbsources/Uniprot.java | 43 ++- src/jalview/ws/seqfetcher/DbSourceProxy.java | 6 + 9 files changed, 461 insertions(+), 209 deletions(-) create mode 100644 src/jalview/ws/dbsources/GeneDbSource.java diff --git a/src/jalview/ws/DBRefFetcher.java b/src/jalview/ws/DBRefFetcher.java index 109f4b4..c2d7ad5 100644 --- a/src/jalview/ws/DBRefFetcher.java +++ b/src/jalview/ws/DBRefFetcher.java @@ -25,77 +25,80 @@ import org.exolab.castor.mapping.*; import org.exolab.castor.xml.*; import jalview.analysis.*; import jalview.datamodel.*; +import jalview.datamodel.Mapping; import jalview.gui.*; +import jalview.ws.dbsources.Uniprot; import jalview.ws.ebi.EBIFetchClient; /** * DOCUMENT ME! - * + * * @author $author$ * @version $Revision$ */ -public class DBRefFetcher - implements Runnable +public class DBRefFetcher implements Runnable { - SequenceI [] dataset; + SequenceI[] dataset; + AlignFrame af; + CutAndPasteTransfer output = new CutAndPasteTransfer(); + StringBuffer sbuffer = new StringBuffer(); + boolean running = false; - ///This will be a collection of Vectors of sequenceI refs. - //The key will be the seq name or accession id of the seq + // /This will be a collection of Vectors of sequenceI refs. + // The key will be the seq name or accession id of the seq Hashtable seqRefs; - public DBRefFetcher() - {} + String[] dbSources; - public Vector getUniprotEntries(File file) - { - UniprotFile uni = new UniprotFile(); - try - { - // 1. Load the mapping information from the file - org.exolab.castor.mapping.Mapping map = new org.exolab.castor.mapping.Mapping(uni.getClass().getClassLoader()); - java.net.URL url = getClass().getResource("/uniprot_mapping.xml"); - map.loadMapping(url); - - // 2. Unmarshal the data - Unmarshaller unmar = new Unmarshaller(uni); - unmar.setIgnoreExtraElements(true); - unmar.setMapping(map); - - uni = (UniprotFile) unmar.unmarshal(new FileReader(file)); - } - catch (Exception e) - { - System.out.println("Error getUniprotEntries() " + e); - } + SequenceFetcher sfetcher; - return uni.getUniprotEntries(); + public DBRefFetcher() + { } /** * Creates a new SequenceFeatureFetcher object. - * - * @param align DOCUMENT ME! - * @param ap DOCUMENT ME! + * + * @param seqs + * fetch references for these sequences + * @param af + * the parent alignframe for progress bar monitoring. */ - public DBRefFetcher(SequenceI [] seqs, AlignFrame af) + public DBRefFetcher(SequenceI[] seqs, AlignFrame af) { this.af = af; - SequenceI [] ds = new SequenceI[seqs.length]; + SequenceI[] ds = new SequenceI[seqs.length]; for (int i = 0; i < seqs.length; i++) { - if(seqs[i].getDatasetSequence()!=null) + if (seqs[i].getDatasetSequence() != null) ds[i] = seqs[i].getDatasetSequence(); else ds[i] = seqs[i]; } this.dataset = ds; + sfetcher = new SequenceFetcher(); + // select appropriate databases based on alignFrame context. + if (af.getViewport().getAlignment().isNucleotide()) + { + dbSources = DBRefSource.DNACODINGDBS; + } + else + { + dbSources = DBRefSource.PROTEINDBS; + } } - public boolean fetchDBRefs(boolean waitTillFinished) + /** + * start the fetcher thread + * + * @param waitTillFinished + * true to block until the fetcher has finished + */ + public void fetchDBRefs(boolean waitTillFinished) { Thread thread = new Thread(this); thread.start(); @@ -108,20 +111,21 @@ public class DBRefFetcher try { Thread.sleep(500); + } catch (Exception ex) + { } - catch (Exception ex) - {} } } - - return true; } /** - * The sequence will be added to a vector of sequences - * belonging to key which could be either seq name or dbref id - * @param seq SequenceI - * @param key String + * The sequence will be added to a vector of sequences belonging to key which + * could be either seq name or dbref id + * + * @param seq + * SequenceI + * @param key + * String */ void addSeqId(SequenceI seq, String key) { @@ -157,83 +161,129 @@ public class DBRefFetcher */ public void run() { + if (dbSources == null) + { + throw new Error("Implementation error. Must initialise dbSources"); + } long startTime = System.currentTimeMillis(); af.setProgressBar("Fetching db refs", startTime); running = true; - - seqRefs = new Hashtable(); - - try + int db = 0; + Vector sdataset = new Vector(); + for (int s = 0; s < dataset.length; s++) { + sdataset.addElement(dataset[s]); + } + while (sdataset.size() > 0 && db < dbSources.length) + { + int maxqlen = 1; // default number of queries made to at one time + System.err.println("Verifying against " + dbSources[db]); + jalview.ws.seqfetcher.DbSourceProxy dbsource = sfetcher + .getSourceProxy(dbSources[db]); + if (dbsource == null) + { + System.err.println("No proxy for " + dbSources[db]); + db++; + continue; + } + if (dbsource.getDbSourceProperties() + .containsKey(DBRefSource.MULTIACC)) + { + maxqlen = ((Integer) dbsource.getDbSourceProperties().get( + DBRefSource.MULTIACC)).intValue(); + } + // iterate through db for each remaining un-verified sequence + SequenceI[] currSeqs = new SequenceI[sdataset.size()]; + sdataset.copyInto(currSeqs);// seqs that are to be validated against + // dbSources[db] + Vector queries = new Vector(); // generated queries curSeq + seqRefs = new Hashtable(); + int seqIndex = 0; - while (seqIndex < dataset.length) + while (queries.size() > 0 || seqIndex < currSeqs.length) { - StringBuffer queryString = new StringBuffer("uniprot:"); - - for (int i = 0; (seqIndex < dataset.length) && (i < 50); - seqIndex++, i++) + if (queries.size() > 0) { - SequenceI sequence = dataset[seqIndex]; - DBRefEntry[] uprefs = jalview.util.DBRefUtils.selectRefs(sequence. - getDBRef(), new String[] - { - jalview.datamodel.DBRefSource.UNIPROT}); - if (uprefs != null) + // Still queries to make for current seqIndex + StringBuffer queryString = new StringBuffer(""); + int nqSize = (maxqlen > queries.size()) ? queries.size() + : maxqlen; + for (int nq = 0, numq = 0; nq < nqSize; nq++) { - if (uprefs.length + i > 50) + String query = (String) queries.elementAt(nq); + if (dbsource.isValidReference(query)) { - break; - } - - for (int j = 0; j < uprefs.length; j++) - { - addSeqId(sequence, uprefs[j].getAccessionId()); - queryString.append(uprefs[j].getAccessionId() + ";"); + queryString.append((nq == 0) ? "" : dbsource + .getAccessionSeparator()); + queryString.append(query); + numq++; } } - else + for (int nq = 0; nq < nqSize; nq++) + { + queries.removeElementAt(0); + } + // make the queries and process the response + AlignmentI retrieved = null; + try + { + retrieved = dbsource.getSequenceRecords(queryString.toString()); + } catch (Exception ex) + { + ex.printStackTrace(); + } + if (retrieved != null) + { + transferReferences(sdataset, dbSources[db], retrieved); + } + } + else + { + // make some more strings for use as queries + for (int i = 0; (seqIndex < dataset.length) && (i < 50); seqIndex++, i++) { - StringTokenizer st = new StringTokenizer(sequence.getName(), "|"); - if (st.countTokens() + i > 50) + SequenceI sequence = dataset[seqIndex]; + DBRefEntry[] uprefs = jalview.util.DBRefUtils.selectRefs( + sequence.getDBRef(), new String[] + { dbSources[db] }); // jalview.datamodel.DBRefSource.UNIPROT + // }); + // check for existing dbrefs to use + if (uprefs != null) { - //Dont send more than 50 id strings to dbFetch!! - seqIndex--; + for (int j = 0; j < uprefs.length; j++) + { + addSeqId(sequence, uprefs[j].getAccessionId()); + queries + .addElement(uprefs[j].getAccessionId() + .toUpperCase()); + } } else { + // generate queries from sequence ID string + StringTokenizer st = new StringTokenizer(sequence.getName(), + "|"); while (st.hasMoreTokens()) { String token = st.nextToken(); addSeqId(sequence, token); - queryString.append(token + ";"); + queries.addElement(token.toUpperCase()); } } } } - - /////////////////////////////////// - ///READ FROM EBI - EBIFetchClient ebi = new EBIFetchClient(); - File file = ebi.fetchDataAsFile(queryString.toString(), "xml", "raw"); - if (file != null) - { - ReadUniprotFile(file); - } } - } - catch (Exception ex) - { - ex.printStackTrace(); - } - + // advance to next database + db++; + } // all databases have been queries. if (sbuffer.length() > 0) { - output.setText( - "Your sequences have been matched to Uniprot. Some of the ids have been\n" + - "altered, most likely the start/end residue will have been updated.\n" + - "Save your alignment to maintain the updated id.\n\n" + - sbuffer.toString()); + output + .setText("Your sequences have been verified against known sequence databases. Some of the ids have been\n" + + "altered, most likely the start/end residue will have been updated.\n" + + "Save your alignment to maintain the updated id.\n\n" + + sbuffer.toString()); Desktop.addInternalFrame(output, "Sequence names updated ", 600, 300); // The above is the dataset, we must now find out the index // of the viewed sequence @@ -248,36 +298,42 @@ public class DBRefFetcher } /** - * DOCUMENT ME! - * - * @param result DOCUMENT ME! - * @param out DOCUMENT ME! - * @param align DOCUMENT ME! + * Verify local sequences in seqRefs against the retrieved sequence database + * records. + * */ - void ReadUniprotFile(File file) + void transferReferences(Vector sdataset, String dbSource, + AlignmentI retrievedAl) // File + // file) { - if (!file.exists()) + + if (retrievedAl == null || retrievedAl.getHeight() == 0) { return; } - + SequenceI[] retrieved = retrievedAl.getSequencesArray(); SequenceI sequence = null; - Vector entries = getUniprotEntries(file); + // Vector entries = new Uniprot().getUniprotEntries(file); - int i, iSize = entries == null ? 0 : entries.size(); - UniprotEntry entry; + int i, iSize = retrieved.length; // entries == null ? 0 : entries.size(); + // UniprotEntry entry; for (i = 0; i < iSize; i++) { - entry = (UniprotEntry) entries.elementAt(i); + SequenceI entry = retrieved[i]; // (UniprotEntry) entries.elementAt(i); - //Work out which sequences this Uniprot file has matches to, - //taking into account all accessionIds and names in the file + // Work out which sequences this sequence matches, + // taking into account all accessionIds and names in the file Vector sequenceMatches = new Vector(); - for (int j = 0; j < entry.getAccession().size(); j++) + // look for corresponding accession ids + DBRefEntry[] entryRefs = jalview.util.DBRefUtils.selectRefs(entry + .getDBRef(), new String[] + { dbSource }); + for (int j = 0; j < entryRefs.length; j++) { - String accessionId = entry.getAccession().elementAt(j).toString(); - if (seqRefs.containsKey(accessionId)) + String accessionId = entryRefs[j].getAccessionId(); // .getAccession().elementAt(j).toString(); + // match up on accessionId + if (seqRefs.containsKey(accessionId.toUpperCase())) { Vector seqs = (Vector) seqRefs.get(accessionId); for (int jj = 0; jj < seqs.size(); jj++) @@ -290,100 +346,125 @@ public class DBRefFetcher } } } - for (int j = 0; j < entry.getName().size(); j++) + if (sequenceMatches.size()==0) { - String name = entry.getName().elementAt(j).toString(); - if (seqRefs.containsKey(name)) + // failed to match directly on accessionId==query so just compare all sequences to entry + Enumeration e = seqRefs.keys(); + while (e.hasMoreElements()) { - Vector seqs = (Vector) seqRefs.get(name); - for (int jj = 0; jj < seqs.size(); jj++) + Vector sqs = (Vector) seqRefs.get(e.nextElement()); + if (sqs!=null && sqs.size()>0) { - sequence = (SequenceI) seqs.elementAt(jj); - if (!sequenceMatches.contains(sequence)) + Enumeration sqe = sqs.elements(); + while (sqe.hasMoreElements()) { - sequenceMatches.addElement(sequence); + sequenceMatches.addElement(sqe.nextElement()); } } } } - + // look for corresponding names + // this is uniprot specific ? + // could be useful to extend this so we try to find any 'significant' + // information in common between two sequence objects. + /* + * DBRefEntry[] entryRefs = + * jalview.util.DBRefUtils.selectRefs(entry.getDBRef(), new String[] { + * dbSource }); for (int j = 0; j < entry.getName().size(); j++) { String + * name = entry.getName().elementAt(j).toString(); if + * (seqRefs.containsKey(name)) { Vector seqs = (Vector) seqRefs.get(name); + * for (int jj = 0; jj < seqs.size(); jj++) { sequence = (SequenceI) + * seqs.elementAt(jj); if (!sequenceMatches.contains(sequence)) { + * sequenceMatches.addElement(sequence); } } } } + */ + // sequenceMatches now contains the set of all sequences associated with + // the returned db record + String entrySeq = entry.getSequenceAsString().toUpperCase(); for (int m = 0; m < sequenceMatches.size(); m++) { sequence = (SequenceI) sequenceMatches.elementAt(m); - sequence.addDBRef(new DBRefEntry(DBRefSource.UNIPROT, - "0", // TODO: VERSION FROM UNIPROT - entry.getAccession().elementAt(0). - toString())); - - System.out.println("Adding dbref to " + sequence.getName() + " : " + - entry.getAccession().elementAt(0).toString()); + // only update start and end positions and shift features if there are no existing references + // TODO: test for legacy where uniprot or EMBL refs exist but no mappings are made (but content matches retrieved set) + boolean updateRefFrame = sequence.getDBRef()==null || sequence.getDBRef().length==0; + // verify sequence against the entry sequence String nonGapped = AlignSeq.extractGaps("-. ", - sequence.getSequenceAsString()). - toUpperCase(); + sequence.getSequenceAsString()).toUpperCase(); - int absStart = entry.getUniprotSequence().getContent().indexOf( - nonGapped.toString()); + int absStart = entrySeq.indexOf(nonGapped); + int mapStart = entry.getStart(); + jalview.datamodel.Mapping mp; if (absStart == -1) { - // Is UniprotSequence contained in dataset sequence? - absStart = nonGapped.toString().indexOf(entry.getUniprotSequence(). - getContent()); + // Is local sequence contained in dataset sequence? + absStart = nonGapped.indexOf(entrySeq); if (absStart == -1) - { - sbuffer.append(sequence.getName() + " SEQUENCE NOT %100 MATCH \n"); + { // verification failed. + sbuffer.append(sequence.getName() + + " SEQUENCE NOT %100 MATCH \n"); continue; } - - if (entry.getFeature() != null) + + sbuffer.append(sequence.getName() + " HAS " + absStart + + " PREFIXED RESIDUES COMPARED TO " + dbSource+"\n"); + // + // + " - ANY SEQUENCE FEATURES" + // + " HAVE BEEN ADJUSTED ACCORDINGLY \n"); + // absStart = 0; + // create valid mapping between matching region of local sequence and + // the mapped sequence + mp = new Mapping(null, new int[] + { sequence.getStart()+absStart, sequence.getStart()+absStart+entrySeq.length()-1 }, new int[] + { entry.getStart(), + entry.getStart() + entrySeq.length() - 1 }, 1, 1); + updateRefFrame=false; // mapping is based on current start/end so don't modify start and end + } + else + { + // update start and end of local sequence to place it in entry's + // reference frame. + // apply identity map map from whole of local sequence to matching + // region of database + // sequence + mp = null; // Mapping.getIdentityMap(); + // new Mapping(null, + // new int[] { absStart+sequence.getStart(), + // absStart+sequence.getStart()+entrySeq.length()-1}, + // new int[] { entry.getStart(), entry.getEnd() }, 1, 1); + // relocate local features for updated start + if (updateRefFrame && sequence.getSequenceFeatures() != null) { - Enumeration e = entry.getFeature().elements(); - while (e.hasMoreElements()) + SequenceFeature[] sf = sequence.getSequenceFeatures(); + int start = sequence.getStart(); + int end = sequence.getEnd(); + for (int sfi = 0; sfi < sf.length; sfi++) { - SequenceFeature sf = (SequenceFeature) e.nextElement(); - sf.setBegin(sf.getBegin() + absStart + 1); - sf.setEnd(sf.getEnd() + absStart + 1); + if (sf[sfi].getBegin() >= start && sf[sfi].getEnd() <= end) + { + // shift feature along by absstart + sf[sfi].setBegin(sf[sfi].getBegin() + absStart); + sf[sfi].setEnd(sf[sfi].getEnd() + absStart); + } } - - sbuffer.append(sequence.getName() + - " HAS " + absStart + - " PREFIXED RESIDUES COMPARED TO UNIPROT - ANY SEQUENCE FEATURES" - + " HAVE BEEN ADJUSTED ACCORDINGLY \n"); - absStart = 0; } - } - //unknownSequences.remove(sequence); - - int absEnd = absStart + nonGapped.toString().length(); + System.out.println("Adding dbrefs to " + sequence.getName() + + " from " + dbSource + " sequence : " + entry.getName()); + sequence.transferAnnotation(entry, mp); + // unknownSequences.remove(sequence); + int absEnd = absStart + nonGapped.length(); absStart += 1; - - Enumeration e = entry.getDbReference().elements(); - Vector onlyPdbEntries = new Vector(); - while (e.hasMoreElements()) + if (updateRefFrame) { - PDBEntry pdb = (PDBEntry) e.nextElement(); - if (!pdb.getType().equals(DBRefSource.PDB)) - { - DBRefEntry xref = new DBRefEntry(pdb.getType(), DBRefSource.UNIPROT, pdb.getId()); - sequence.addDBRef(xref); - continue; - } - - sequence.addDBRef(new DBRefEntry(DBRefSource.PDB, - "0", - pdb.getId())); - - onlyPdbEntries.addElement(pdb); + // finally, update local sequence reference frame if we're allowed + sequence.setStart(absStart); + sequence.setEnd(absEnd); } - - sequence.setPDBId(onlyPdbEntries); - - sequence.setStart(absStart); - sequence.setEnd(absEnd); - + // and remove it from the rest + // TODO: decide if we should remove annotated sequence from set + sdataset.remove(sequence); } } } diff --git a/src/jalview/ws/dbsources/EmblCdsSouce.java b/src/jalview/ws/dbsources/EmblCdsSouce.java index d448830..65b4e21 100644 --- a/src/jalview/ws/dbsources/EmblCdsSouce.java +++ b/src/jalview/ws/dbsources/EmblCdsSouce.java @@ -58,5 +58,9 @@ public class EmblCdsSouce extends EmblXmlSource implements DbSourceProxy { return "CAA37824"; } + public String getDbName() + { + return "EMBL (CDS)"; + } } diff --git a/src/jalview/ws/dbsources/EmblSource.java b/src/jalview/ws/dbsources/EmblSource.java index 5ae7a72..f245797 100644 --- a/src/jalview/ws/dbsources/EmblSource.java +++ b/src/jalview/ws/dbsources/EmblSource.java @@ -89,4 +89,8 @@ public class EmblSource extends EmblXmlSource implements DbSourceProxy return "X53828"; } + public String getDbName() + { + return getDbSource(); + } } diff --git a/src/jalview/ws/dbsources/EmblXmlSource.java b/src/jalview/ws/dbsources/EmblXmlSource.java index 68fa87c..68a2424 100644 --- a/src/jalview/ws/dbsources/EmblXmlSource.java +++ b/src/jalview/ws/dbsources/EmblXmlSource.java @@ -32,9 +32,6 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy public AlignmentI getEmblSequenceRecords(String emprefx, String query) throws Exception { startQuery(); - SequenceI seqs[] = null; - Vector alseq = new Vector(); // the sequences that will actually be presented in the alignment - StringBuffer result = new StringBuffer(); EBIFetchClient dbFetch = new EBIFetchClient(); File reply; try { @@ -47,6 +44,20 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy stopQuery(); throw new Exception("EBI EMBL XML retrieval failed on "+emprefx.toLowerCase()+":"+query.trim(),e); } + return getEmblSequenceRecords(emprefx, query, reply); + } + /** + * parse an emblxml file stored locally + * @param emprefx either EMBL or EMBLCDS strings are allowed - anything else will not retrieve emblxml + * @param query + * @param file the EMBL XML file containing the results of a query + * @return + * @throws Exception + */ + public AlignmentI getEmblSequenceRecords(String emprefx, String query, File reply) throws Exception + { + SequenceI seqs[] = null; + StringBuffer result = new StringBuffer(); if (reply != null && reply.exists()) { efile=null; diff --git a/src/jalview/ws/dbsources/GeneDbSource.java b/src/jalview/ws/dbsources/GeneDbSource.java new file mode 100644 index 0000000..7cea532 --- /dev/null +++ b/src/jalview/ws/dbsources/GeneDbSource.java @@ -0,0 +1,98 @@ +/** + * + */ +package jalview.ws.dbsources; + +import java.io.File; +import java.util.Hashtable; +import java.util.Iterator; +import java.util.StringTokenizer; + +import com.stevesoft.pat.Regex; + +import jalview.datamodel.Alignment; +import jalview.datamodel.AlignmentI; +import jalview.datamodel.DBRefSource; +import jalview.datamodel.SequenceI; +import jalview.datamodel.xdb.embl.EmblEntry; +import jalview.ws.ebi.EBIFetchClient; +import jalview.ws.seqfetcher.DbSourceProxy; +import jalview.ws.seqfetcher.DbSourceProxyImpl; + +/** + * @author JimP + * + */ +public class GeneDbSource extends EmblXmlSource implements DbSourceProxy +{ + + public GeneDbSource() { + addDbSourceProperty(DBRefSource.DNASEQDB); + addDbSourceProperty(DBRefSource.CODINGSEQDB); + } + + /* (non-Javadoc) + * @see jalview.ws.DbSourceProxy#getAccessionSeparator() + */ + public String getAccessionSeparator() + { + // TODO Auto-generated method stub + return null; + } + + /* (non-Javadoc) + * @see jalview.ws.DbSourceProxy#getAccessionValidator() + */ + public Regex getAccessionValidator() + { + // TODO Auto-generated method stub + return null; + } + + /* (non-Javadoc) + * @see jalview.ws.DbSourceProxy#getDbSource() + */ + public String getDbSource() + { + return DBRefSource.GENEDB; + } + /* (non-Javadoc) + * @see jalview.ws.DbSourceProxy#getDbVersion() + */ + public String getDbVersion() + { + // TODO Auto-generated method stub + return "0"; + } + + /* (non-Javadoc) + * @see jalview.ws.DbSourceProxy#getSequenceRecords(java.lang.String[]) + */ + public AlignmentI getSequenceRecords(String queries) throws Exception + { + // query of form http://www.genedb.org/genedb/ArtemisFormHandler?id=&dest=EMBL + // + return getEmblSequenceRecords(DBRefSource.GENEDB, queries); + } + /* (non-Javadoc) + * @see jalview.ws.DbSourceProxy#isValidReference(java.lang.String) + */ + public boolean isValidReference(String accession) + { + // TODO Auto-generated method stub + return false; + } + + /** + * return T.Brucei Mannosyl-Transferase TbPIG-M + */ + public String getTestQuery() + { + return "Tb927.6.3300"; + } + + public String getDbName() + { + return getDbSource(); + } +} diff --git a/src/jalview/ws/dbsources/Pdb.java b/src/jalview/ws/dbsources/Pdb.java index 0548583..5773171 100644 --- a/src/jalview/ws/dbsources/Pdb.java +++ b/src/jalview/ws/dbsources/Pdb.java @@ -50,7 +50,7 @@ public class Pdb extends EbiFileRetrievedProxy implements DbSourceProxy */ public Regex getAccessionValidator() { - return new Regex("[1-9][0-9A-Za-z]{3}[ _A-Za-z0-9]?"); + return new Regex("([1-9][0-9A-Za-z]{3}):?([ _A-Za-z0-9]?)"); } /* (non-Javadoc) @@ -88,9 +88,15 @@ public class Pdb extends EbiFileRetrievedProxy implements DbSourceProxy } if (queries.length() > 4 && chain == null) { - chain = queries.substring(4); + chain = queries.substring(4,5); id = queries.substring(0, 4); } + if (!isValidReference(id)) + { + System.err.println("Ignoring invalid pdb query: '"+id+"'"); + stopQuery(); + return null; + } EBIFetchClient ebi = new EBIFetchClient(); file = ebi.fetchDataAsFile("pdb:" + id, "pdb", "raw") .getAbsolutePath(); @@ -177,4 +183,9 @@ public class Pdb extends EbiFileRetrievedProxy implements DbSourceProxy return "1QIPA"; } + public String getDbName() + { + return getDbSource(); + } + } diff --git a/src/jalview/ws/dbsources/Pfam.java b/src/jalview/ws/dbsources/Pfam.java index 4077c51..d935db9 100644 --- a/src/jalview/ws/dbsources/Pfam.java +++ b/src/jalview/ws/dbsources/Pfam.java @@ -8,6 +8,8 @@ import java.util.Hashtable; import com.stevesoft.pat.Regex; import jalview.datamodel.AlignmentI; +import jalview.datamodel.DBRefEntry; +import jalview.io.FastaFile; import jalview.ws.seqfetcher.DbSourceProxy; import jalview.ws.seqfetcher.DbSourceProxyImpl; /** @@ -20,6 +22,12 @@ import jalview.ws.seqfetcher.DbSourceProxyImpl; public class Pfam extends DbSourceProxyImpl implements DbSourceProxy { + public Pfam() + { + super(); + + } + /* (non-Javadoc) * @see jalview.ws.DbSourceProxy#getAccessionSeparator() */ @@ -43,7 +51,6 @@ public class Pfam extends DbSourceProxyImpl implements DbSourceProxy */ public String getDbSource() { - // TODO Auto-generated method stub return jalview.datamodel.DBRefSource.PFAM; } @@ -64,22 +71,25 @@ public class Pfam extends DbSourceProxyImpl implements DbSourceProxy // TODO Auto-generated method stub return null; } - - /* (non-Javadoc) - * @see jalview.ws.DbSourceProxy#getRawRecords() - */ - public StringBuffer getRawRecords() - { - // TODO Auto-generated method stub - return null; - } - + public static String PFAMURL = "http://www.sanger.ac.uk/cgi-bin/Pfam/getalignment.pl?format=fal&acc="; /* (non-Javadoc) * @see jalview.ws.DbSourceProxy#getSequenceRecords(java.lang.String[]) */ public AlignmentI getSequenceRecords(String queries) throws Exception { - throw new Exception("PFAM Retrieval not yet implemented - see jalview.gui.SequenceFetcher for current implementation"); + // TODO: this is not a perfect implementation. We need to be able to add individual references to each sequence in each family alignment that's retrieved. + startQuery(); + results = new StringBuffer(); + // split queries into many little ones. + results.append(new FastaFile( + PFAMURL+queries.trim().toUpperCase(), "URL").print()); + stopQuery(); + AlignmentI rcds = parseResult(results.toString()); + for (int s=0,sNum=rcds.getHeight(); s