/*
 * Jalview - A Sequence Alignment Editor and Viewer
 * Copyright (C) 2007 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
 */
package jalview.io;

import java.io.*;
import java.util.*;

import org.exolab.castor.mapping.*;
import org.exolab.castor.xml.*;
import jalview.analysis.*;
import jalview.datamodel.*;
import jalview.gui.*;

/**
 * Queries the EBI fetch service for Uniprot records matching the sequences of
 * an alignment's dataset and, for every match, adds Uniprot and PDB database
 * references to the sequence and updates its start/end positions.
 *
 * The work is done in {@link #run()}, normally on a background thread started
 * by {@link #fetchDBRefs(boolean)}.
 *
 * @author $author$
 * @version $Revision$
 */
public class DBRefFetcher
    implements Runnable
{
  /** Upper bound on the number of id strings sent in a single query. */
  private static final int MAX_IDS_PER_QUERY = 50;

  AlignmentI dataset;
  AlignFrame af;
  CutAndPasteTransfer output = new CutAndPasteTransfer();

  /** Accumulates the human readable report shown when the fetch finishes. */
  StringBuffer sbuffer = new StringBuffer();

  /** True while a fetch is in progress; written by the worker thread. */
  volatile boolean running = false;

  /**
   * Maps an upper-cased sequence name token or accession id to the Vector of
   * SequenceI objects that were registered under that key.
   */
  Hashtable seqRefs;

  /**
   * Creates a fetcher with no alignment attached. Only
   * {@link #getUniprotEntries(File)} is usable on such an instance.
   */
  public DBRefFetcher()
  {}

  /**
   * Unmarshals a Uniprot XML file using the Castor mapping found on the
   * classpath at /uniprot_mapping.xml.
   *
   * Any failure is reported on standard output and the entries of the
   * (then still empty) UniprotFile are returned.
   *
   * @param file the Uniprot XML file to read
   * @return the entries of the unmarshalled file
   */
  public Vector getUniprotEntries(File file)
  {
    UniprotFile uni = new UniprotFile();
    FileReader reader = null;
    try
    {
      // 1. Load the mapping information from the file
      Mapping map = new Mapping(uni.getClass().getClassLoader());
      java.net.URL url = getClass().getResource("/uniprot_mapping.xml");
      map.loadMapping(url);

      // 2. Unmarshal the data
      Unmarshaller unmar = new Unmarshaller(uni);
      unmar.setIgnoreExtraElements(true);
      unmar.setMapping(map);

      reader = new FileReader(file);
      uni = (UniprotFile) unmar.unmarshal(reader);
    }
    catch (Exception e)
    {
      System.out.println("Error getUniprotEntries() " + e);
    }
    finally
    {
      // The reader used to be leaked; always release the file handle.
      if (reader != null)
      {
        try
        {
          reader.close();
        }
        catch (java.io.IOException ignored)
        {
          // Nothing useful can be done if closing a read-only stream fails.
        }
      }
    }

    return uni.getUniprotEntries();
  }

  /**
   * Creates a new DBRefFetcher object.
   *
   * @param align alignment whose dataset sequences will be looked up
   * @param af frame whose progress bar reports the state of the fetch
   */
  public DBRefFetcher(AlignmentI align, AlignFrame af)
  {
    this.af = af;
    this.dataset = align.getDataset();
  }

  /**
   * Starts the fetch on a new thread.
   *
   * @param waitTillFinished if true, block until the worker thread has ended
   *        (or the calling thread is interrupted, in which case the interrupt
   *        status is restored and the method returns early)
   * @return always true
   */
  public boolean fetchDBRefs(boolean waitTillFinished)
  {
    // Raise the flag before the worker starts: setting it afterwards could
    // overwrite the worker's "finished" state if the worker was quick.
    running = true;

    Thread thread = new Thread(this);
    thread.start();

    if (waitTillFinished)
    {
      try
      {
        thread.join();
      }
      catch (InterruptedException ex)
      {
        Thread.currentThread().interrupt();
      }
    }

    return true;
  }

  /**
   * The sequence will be added to a vector of sequences
   * belonging to key which could be either seq name or dbref id.
   * Keys are stored upper-cased; a sequence is stored at most once per key.
   *
   * @param seq SequenceI
   * @param key String
   */
  void addSeqId(SequenceI seq, String key)
  {
    key = key.toUpperCase();

    Vector seqs = (Vector) seqRefs.get(key);
    if (seqs == null)
    {
      seqs = new Vector();
      seqRefs.put(key, seqs);
    }

    if (!seqs.contains(seq))
    {
      seqs.addElement(seq);
    }
  }

  /**
   * Returns the id strings to query for a sequence: the accession ids of its
   * Uniprot database references if it has any, otherwise the "|" separated
   * tokens of its name.
   *
   * @param sequence the sequence to derive ids from
   * @return the ids, possibly an empty array
   */
  private String[] queryIdsFor(Sequence sequence)
  {
    DBRefEntry[] uprefs = jalview.util.DBRefUtils.selectRefs(
        sequence.getDBRef(),
        new String[]
        {jalview.datamodel.DBRefSource.UNIPROT});

    String[] ids;
    if (uprefs != null)
    {
      ids = new String[uprefs.length];
      for (int j = 0; j < uprefs.length; j++)
      {
        ids[j] = uprefs[j].getAccessionId();
      }
    }
    else
    {
      StringTokenizer st = new StringTokenizer(sequence.getName(), "|");
      ids = new String[st.countTokens()];
      for (int j = 0; st.hasMoreTokens(); j++)
      {
        ids[j] = st.nextToken();
      }
    }
    return ids;
  }

  /**
   * Fetches Uniprot records for every dataset sequence in batches, applies
   * them via {@link #ReadUniprotFile(File)}, and finally shows a report window
   * if any sequence was flagged. The running flag is cleared when the method
   * exits, however it exits.
   */
  public void run()
  {
    long startTime = System.currentTimeMillis();
    running = true;

    try
    {
      af.setProgressBar("Fetching db refs", startTime);
      seqRefs = new Hashtable();

      try
      {
        int seqIndex = 0;
        Vector sequences = dataset.getSequences();

        while (seqIndex < sequences.size())
        {
          StringBuffer queryString = new StringBuffer("uniprot:");
          int idsQueued = 0;

          while (seqIndex < sequences.size()
                 && idsQueued < MAX_IDS_PER_QUERY)
          {
            Sequence sequence = (Sequence) sequences.get(seqIndex);
            String[] ids = queryIdsFor(sequence);

            // Dont send more than 50 id strings to dbFetch!!
            // A sequence that would overflow the batch is deferred to the
            // next one. If the batch is still empty the sequence is sent
            // anyway, otherwise the outer loop could never advance.
            if (idsQueued > 0
                && idsQueued + ids.length > MAX_IDS_PER_QUERY)
            {
              break;
            }

            for (int j = 0; j < ids.length; j++)
            {
              addSeqId(sequence, ids[j]);
              queryString.append(ids[j] + ";");
            }
            idsQueued += ids.length;
            seqIndex++;
          }

          if (idsQueued == 0)
          {
            // Nothing to ask for in this batch; avoid an empty request.
            continue;
          }

          ///////////////////////////////////
          ///READ FROM EBI
          EBIFetchClient ebi = new EBIFetchClient();
          File file = ebi.fetchDataAsFile(queryString.toString(), "xml",
                                          "raw");
          if (file != null)
          {
            ReadUniprotFile(file);
          }
        }
      }
      catch (Exception ex)
      {
        ex.printStackTrace();
      }

      if (sbuffer.length() > 0)
      {
        output.setText(
            "Your sequences have been matched to Uniprot. Some of the ids have been\n" +
            "altered, most likely the start/end residue will have been updated.\n" +
            "Save your alignment to maintain the updated id.\n\n" +
            sbuffer.toString());
        Desktop.addInternalFrame(output, "Sequence names updated ", 600, 300);
      }

      af.setProgressBar("DBRef search completed", startTime);
    }
    finally
    {
      // Always clear the flag so that nothing waiting on it can hang.
      running = false;
    }
  }

  /**
   * Adds to matches every sequence registered in seqRefs under one of the
   * given ids. Ids are upper-cased before lookup because addSeqId stores its
   * keys upper-cased.
   *
   * @param ids accession ids or names of a Uniprot entry; may be null
   * @param matches receives the matching sequences, without duplicates
   */
  private void collectMatches(Vector ids, Vector matches)
  {
    if (ids == null)
    {
      return;
    }

    for (int j = 0; j < ids.size(); j++)
    {
      String key = ids.elementAt(j).toString().toUpperCase();
      Vector seqs = (Vector) seqRefs.get(key);
      if (seqs == null)
      {
        continue;
      }

      for (int jj = 0; jj < seqs.size(); jj++)
      {
        Object candidate = seqs.elementAt(jj);
        if (!matches.contains(candidate))
        {
          matches.addElement(candidate);
        }
      }
    }
  }

  /**
   * Reads the Uniprot entries in a file and applies each one to the
   * registered sequences whose name token or accession id it matches: a
   * Uniprot dbref is added, the sequence start/end are set from the position
   * of the ungapped sequence within the Uniprot sequence, and the entry's PDB
   * references are attached. Sequences that cannot be located are reported in
   * sbuffer and left otherwise unchanged.
   *
   * @param file the Uniprot XML file; nothing happens if it does not exist
   */
  void ReadUniprotFile(File file)
  {
    if (!file.exists())
    {
      return;
    }

    Vector entries = getUniprotEntries(file);
    int entryCount = (entries == null) ? 0 : entries.size();

    for (int i = 0; i < entryCount; i++)
    {
      UniprotEntry entry = (UniprotEntry) entries.elementAt(i);

      //Work out which sequences this Uniprot file has matches to,
      //taking into account all accessionIds and names in the file
      Vector sequenceMatches = new Vector();
      collectMatches(entry.getAccession(), sequenceMatches);
      collectMatches(entry.getName(), sequenceMatches);

      for (int m = 0; m < sequenceMatches.size(); m++)
      {
        SequenceI sequence = (SequenceI) sequenceMatches.elementAt(m);
        String primaryAccession = entry.getAccession().elementAt(0).toString();

        sequence.addDBRef(new DBRefEntry(DBRefSource.UNIPROT, "0",
                                         primaryAccession));

        System.out.println("Adding dbref to " + sequence.getName() + " : " +
                           primaryAccession);

        String nonGapped = AlignSeq.extractGaps("-. ",
                                                sequence.getSequenceAsString()).
            toUpperCase();

        int absStart = entry.getUniprotSequence().getContent().indexOf(
            nonGapped);

        if (absStart == -1)
        {
          // Is UniprotSequence contained in dataset sequence?
          absStart = nonGapped.indexOf(
              entry.getUniprotSequence().getContent());
          if (absStart == -1)
          {
            sbuffer.append(sequence.getName() + " SEQUENCE NOT %100 MATCH \n");
            continue;
          }

          if (entry.getFeature() != null)
          {
            // NOTE(review): this shifts the entry's own feature objects, so
            // the shift accumulates if several sequences hit this branch for
            // the same entry - confirm that is intended.
            Enumeration features = entry.getFeature().elements();
            while (features.hasMoreElements())
            {
              SequenceFeature sf = (SequenceFeature) features.nextElement();
              sf.setBegin(sf.getBegin() + absStart + 1);
              sf.setEnd(sf.getEnd() + absStart + 1);
            }

            sbuffer.append(sequence.getName() +
                           " HAS " + absStart +
                           " PREFIXED RESIDUES COMPARED TO UNIPROT - ANY SEQUENCE FEATURES"
                           + " HAVE BEEN ADJUSTED ACCORDINGLY \n");
            absStart = 0;
          }
        }

        int absEnd = absStart + nonGapped.length();
        absStart += 1; // sequence positions are reported 1-based

        Vector onlyPdbEntries = new Vector();
        // An entry without database references must not abort the whole fetch.
        if (entry.getDbReference() != null)
        {
          Enumeration refs = entry.getDbReference().elements();
          while (refs.hasMoreElements())
          {
            PDBEntry pdb = (PDBEntry) refs.nextElement();
            if (!pdb.getType().equals(DBRefSource.PDB))
            {
              continue;
            }

            sequence.addDBRef(new DBRefEntry(DBRefSource.PDB, "0",
                                             pdb.getId()));
            onlyPdbEntries.addElement(pdb);
          }
        }

        sequence.setPDBId(onlyPdbEntries);
        sequence.setStart(absStart);
        sequence.setEnd(absEnd);
      }
    }
  }
}