From 56e5bdc625697d50d7d3f422616f0f1b40ca2828 Mon Sep 17 00:00:00 2001 From: amwaterhouse Date: Thu, 22 Jun 2006 08:50:07 +0000 Subject: [PATCH] Ye olde SequenceFeatureFetcher --- src/jalview/io/DBRefFetcher.java | 369 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 369 insertions(+) create mode 100644 src/jalview/io/DBRefFetcher.java diff --git a/src/jalview/io/DBRefFetcher.java b/src/jalview/io/DBRefFetcher.java new file mode 100644 index 0000000..6e44910 --- /dev/null +++ b/src/jalview/io/DBRefFetcher.java @@ -0,0 +1,369 @@ +/* +* Jalview - A Sequence Alignment Editor and Viewer +* Copyright (C) 2005 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License +* as published by the Free Software Foundation; either version 2 +* of the License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +*/ +package jalview.io; + +import jalview.datamodel.*; + +import jalview.gui.*; + +import java.io.*; + +import java.util.*; + +import org.exolab.castor.mapping.Mapping; + +import org.exolab.castor.xml.*; +import jalview.analysis.AlignSeq; + + + +/** + * DOCUMENT ME! + * + * @author $author$ + * @version $Revision$ + */ +public class DBRefFetcher implements Runnable +{ + AlignmentI align; + AlignmentI dataset; + AlignFrame af; + ArrayList unknownSequences; + CutAndPasteTransfer output = new CutAndPasteTransfer(); + StringBuffer sbuffer = new StringBuffer(); + boolean uniprotFlag = false; + + public DBRefFetcher() + {} + + public Vector getUniprotEntries(File file) + { + UniprotFile uni = new UniprotFile(); + try + { + // 1. Load the mapping information from the file + Mapping map = new Mapping(uni.getClass().getClassLoader()); + java.net.URL url = getClass().getResource("/uniprot_mapping.xml"); + map.loadMapping(url); + + // 2. Unmarshal the data + Unmarshaller unmar = new Unmarshaller(uni); + unmar.setIgnoreExtraElements(true); + unmar.setMapping(map); + // unmar.setDebug(true); + + uni = (UniprotFile) unmar.unmarshal(new FileReader(file)); + } + catch (Exception e) + { + System.out.println("Error getUniprotEntries() "+e); + } + + + return uni.getUniprotEntries(); + } + + /** + * Creates a new SequenceFeatureFetcher object. + * + * @param align DOCUMENT ME! + * @param ap DOCUMENT ME! + */ + public DBRefFetcher(AlignmentI align, AlignFrame af) + { + this.af = af; + unknownSequences = new ArrayList(); + this.align = align; + this.dataset = align.getDataset(); + + Thread thread = new Thread(this); + thread.start(); + } + + /** + * DOCUMENT ME! + */ + public void run() + { + long startTime = System.currentTimeMillis(); + af.setProgressBar("Fetching db refs", startTime); + + try + { + int seqIndex = 0; + Vector sequences = dataset.getSequences(); + + while (seqIndex < sequences.size()) + { + Vector ids = new Vector(); + + for (int i = 0; (seqIndex < sequences.size()) && (i < 50); + seqIndex++, i++) + { + Sequence sequence = (Sequence) sequences.get(seqIndex); + Vector uprefs = jalview.util.DBRefUtils.selectRefs(sequence.getDBRef(), new String[] { + jalview.datamodel.DBRefSource.UNIPROT}); + if (uprefs!=null) + { + // we know the id for this entry, so don't note its ID in the unknownSequences list + for (int j=0,k=uprefs.size(); j 0) + { + StringBuffer remainingIds = new StringBuffer("uniprot:"); + for (int i = 0; i < ids.size(); i++) + { + if(ids.get(i).toString().indexOf("|")>-1) + { + remainingIds.append(ids.get(i).toString().substring( + ids.get(i).toString().lastIndexOf("|") + 1)); + uniprotFlag = true; + } + remainingIds.append(ids.get(i) + ";"); + } + EBIFetchClient ebi = new EBIFetchClient(); + File file = ebi.fetchDataAsFile(remainingIds.toString(), + "xml", "raw"); + + + + if (file != null) + { + ReadUniprotFile(file, ids); + } + } + } + } + catch (Exception ex) + { + ex.printStackTrace(); + } + + if (sbuffer.length() > 0) + { + output.setText( + "Your sequences have been matched to Uniprot. Some of the ids have been\n" + + "altered, most likely the start/end residue will have been updated.\n" + + "Save your alignment to maintain the updated id.\n\n" + + sbuffer.toString()); + Desktop.addInternalFrame(output, "Sequence names updated ", 600, 300); + // The above is the dataset, we must now find out the index + // of the viewed sequence + + } + + af.setProgressBar("DBRef search completed", startTime); + // promptBeforeBlast(); + + } + + + void promptBeforeBlast() + { + // This must be outside the run() body as java 1.5 + // will not return any value from the OptionPane to the expired thread. + if (unknownSequences.size() > 0) + { + // int reply = javax.swing.JOptionPane.showConfirmDialog( + // Desktop.desktop, "Couldn't find a match for "+unknownSequences.size()+" sequences." + // +"\nPerform blast for unknown sequences?", + // "Blast for Unidentified Sequences", + // javax.swing.JOptionPane.YES_NO_OPTION, javax.swing.JOptionPane.QUESTION_MESSAGE); + javax.swing.JOptionPane.showMessageDialog( + Desktop.desktop, "Couldn't find a match for "+unknownSequences.size()+" sequences.", + "Unidentified Sequences", + javax.swing.JOptionPane.WARNING_MESSAGE); + + + // if(reply == javax.swing.JOptionPane.YES_OPTION) + // new WSWUBlastClient(ap, align, unknownSequences); + } + } + + /** + * DOCUMENT ME! + * + * @param result DOCUMENT ME! + * @param out DOCUMENT ME! + * @param align DOCUMENT ME! + */ + void ReadUniprotFile(File file, Vector ids) + { + if(!file.exists()) + return; + + SequenceI sequence = null; + + Vector entries = getUniprotEntries(file); + + int i, iSize = entries==null?0:entries.size(); + UniprotEntry entry; + for (i = 0; i < iSize; i++) + { + entry = (UniprotEntry) entries.elementAt(i); + String idmatch = entry.getAccession().elementAt(0).toString(); + sequence = dataset.findName(idmatch); + + if (sequence == null) + { + //Sequence maybe Name, not Accession + idmatch = entry.getName().elementAt(0).toString(); + sequence = dataset.findName(idmatch); + } + + if(sequence!=null) + ids.remove(sequence.getName()); + + else if (sequence == null && uniprotFlag) + { + sequence = dataset.findName("UniProt/Swiss-Prot|"+entry.getAccession().elementAt(0)+"|"+idmatch); + ids.remove(idmatch); + } + + if(sequence ==null) + { + System.out.println(idmatch+" not found"); + continue; + } + + + sequence.addDBRef(new DBRefEntry(DBRefSource.UNIPROT, + "1.0", + entry.getAccession().elementAt(0).toString())); + + System.out.println("Adding dbref to "+sequence.getName()+" : "+ + entry.getAccession().elementAt(0).toString()); + + String nonGapped = AlignSeq.extractGaps("-. ", sequence.getSequence()); + + int absStart = entry.getUniprotSequence().getContent().indexOf( + nonGapped.toString()); + + if (absStart == -1) + { + // Is UniprotSequence contained in dataset sequence? + absStart = nonGapped.toString().indexOf(entry.getUniprotSequence().getContent()); + if(absStart == -1) + { + sbuffer.append(sequence.getName() + + " SEQUENCE NOT %100 MATCH \n"); + + continue; + } + else + { + + if(entry.getFeature()!=null) + { + Enumeration e = entry.getFeature().elements(); + while (e.hasMoreElements()) + { + SequenceFeature sf = (SequenceFeature) e.nextElement(); + sf.setBegin(sf.getBegin() + absStart + 1); + sf.setEnd(sf.getEnd() + absStart + 1); + } + } + + sbuffer.append(sequence.getName() + + " HAS "+absStart+" PREFIXED RESIDUES COMPARED TO UNIPROT - ANY SEQUENCE FEATURES" + +" HAVE BEEN ADJUSTED ACCORDINGLY \n"); + absStart = 0; + } + + } + + unknownSequences.remove(sequence); + + int absEnd = absStart + nonGapped.toString().length(); + absStart += 1; + + Enumeration e = entry.getDbReference().elements(); + Vector onlyPdbEntries = new Vector(); + while(e.hasMoreElements()) + { + PDBEntry pdb = (PDBEntry)e.nextElement(); + if(!pdb.getType().equals("PDB")) + continue; + + onlyPdbEntries.addElement(pdb); + } + + sequence.setPDBId(onlyPdbEntries); + + if (entry.getFeature()!=null) { + //e = entry.getFeature().elements(); + // while (e.hasMoreElements()) + { + // SequenceFeature sf = (SequenceFeature) e.nextElement(); + // sf.setFeatureGroup("Uniprot"); + // sequence.addSequenceFeature( sf ); + } + } + sequence.setStart(absStart); + sequence.setEnd(absEnd); + + + int n = 0; + SequenceI seq2; + while (n < align.getHeight()) + { + //This loop enables multiple sequences with the same + //id to have features added and seq limits updated + seq2 = align.getSequenceAt(n); + if (seq2.getName().equals(idmatch)) + { + nonGapped = AlignSeq.extractGaps("-. ", seq2.getSequence()); + + absStart = sequence.getSequence().indexOf(nonGapped); + absEnd = absStart + nonGapped.toString().length() - 1; + + // This is the Viewd alignment sequences + // No need to tell the user of the dataset updates + if ( (seq2.getStart() != absStart+sequence.getStart()) + || (seq2.getEnd() != absEnd+sequence.getStart())) + { + sbuffer.append("Updated: " + seq2.getName() + " " + + seq2.getStart() + "/" + seq2.getEnd() + + " to " + (absStart + sequence.getStart()) + "/" + + (absEnd + sequence.getStart()) + "\n"); + + seq2.setStart(absStart + sequence.getStart()); + seq2.setEnd(absEnd + sequence.getStart()); + } + } + + n++; + } + } + } +} + + -- 1.7.10.2