X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2FDBRefFetcher.java;h=6d589f9df4c3c75d54639e91f0db56375ca41544;hb=4dd44779e0d1ffa07b52b67bd8178ad45801bb4f;hp=6e4491059267415a949777d7c87eec82235ab77d;hpb=56e5bdc625697d50d7d3f422616f0f1b40ca2828;p=jalview.git diff --git a/src/jalview/io/DBRefFetcher.java b/src/jalview/io/DBRefFetcher.java index 6e44910..6d589f9 100644 --- a/src/jalview/io/DBRefFetcher.java +++ b/src/jalview/io/DBRefFetcher.java @@ -1,37 +1,31 @@ /* -* Jalview - A Sequence Alignment Editor and Viewer -* Copyright (C) 2005 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle -* -* This program is free software; you can redistribute it and/or -* modify it under the terms of the GNU General Public License -* as published by the Free Software Foundation; either version 2 -* of the License, or (at your option) any later version. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. -* -* You should have received a copy of the GNU General Public License -* along with this program; if not, write to the Free Software -* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA -*/ + * Jalview - A Sequence Alignment Editor and Viewer + * Copyright (C) 2007 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + */ package jalview.io; -import jalview.datamodel.*; - -import jalview.gui.*; - import java.io.*; - import java.util.*; -import org.exolab.castor.mapping.Mapping; - +import org.exolab.castor.mapping.*; import org.exolab.castor.xml.*; -import jalview.analysis.AlignSeq; - - +import jalview.analysis.*; +import jalview.datamodel.*; +import jalview.gui.*; /** * DOCUMENT ME! @@ -39,15 +33,18 @@ import jalview.analysis.AlignSeq; * @author $author$ * @version $Revision$ */ -public class DBRefFetcher implements Runnable +public class DBRefFetcher + implements Runnable { - AlignmentI align; - AlignmentI dataset; + SequenceI [] dataset; AlignFrame af; - ArrayList unknownSequences; CutAndPasteTransfer output = new CutAndPasteTransfer(); StringBuffer sbuffer = new StringBuffer(); - boolean uniprotFlag = false; + boolean running = false; + + ///This will be a collection of Vectors of sequenceI refs. + //The key will be the seq name or accession id of the seq + Hashtable seqRefs; public DBRefFetcher() {} @@ -58,7 +55,7 @@ public class DBRefFetcher implements Runnable try { // 1. Load the mapping information from the file - Mapping map = new Mapping(uni.getClass().getClassLoader()); + org.exolab.castor.mapping.Mapping map = new org.exolab.castor.mapping.Mapping(uni.getClass().getClassLoader()); java.net.URL url = getClass().getResource("/uniprot_mapping.xml"); map.loadMapping(url); @@ -66,16 +63,14 @@ public class DBRefFetcher implements Runnable Unmarshaller unmar = new Unmarshaller(uni); unmar.setIgnoreExtraElements(true); unmar.setMapping(map); - // unmar.setDebug(true); uni = (UniprotFile) unmar.unmarshal(new FileReader(file)); } catch (Exception e) { - System.out.println("Error getUniprotEntries() "+e); + System.out.println("Error getUniprotEntries() " + e); } - return uni.getUniprotEntries(); } @@ -85,15 +80,75 @@ public class DBRefFetcher implements Runnable * @param align DOCUMENT ME! * @param ap DOCUMENT ME! */ - public DBRefFetcher(AlignmentI align, AlignFrame af) + public DBRefFetcher(SequenceI [] seqs, AlignFrame af) { this.af = af; - unknownSequences = new ArrayList(); - this.align = align; - this.dataset = align.getDataset(); + SequenceI [] ds = new SequenceI[seqs.length]; + for (int i = 0; i < seqs.length; i++) + { + if(seqs[i].getDatasetSequence()!=null) + ds[i] = seqs[i].getDatasetSequence(); + else + ds[i] = seqs[i]; + } + this.dataset = ds; + } + public boolean fetchDBRefs(boolean waitTillFinished) + { Thread thread = new Thread(this); thread.start(); + running = true; + + if (waitTillFinished) + { + while (running) + { + try + { + Thread.sleep(500); + } + catch (Exception ex) + {} + } + } + + return true; + } + + /** + * The sequence will be added to a vector of sequences + * belonging to key which could be either seq name or dbref id + * @param seq SequenceI + * @param key String + */ + void addSeqId(SequenceI seq, String key) + { + key = key.toUpperCase(); + + Vector seqs; + if (seqRefs.containsKey(key)) + { + seqs = (Vector) seqRefs.get(key); + + if (seqs != null && !seqs.contains(seq)) + { + seqs.addElement(seq); + } + else if (seqs == null) + { + seqs = new Vector(); + seqs.addElement(seq); + } + + } + else + { + seqs = new Vector(); + seqs.addElement(seq); + } + + seqRefs.put(key, seqs); } /** @@ -103,62 +158,66 @@ public class DBRefFetcher implements Runnable { long startTime = System.currentTimeMillis(); af.setProgressBar("Fetching db refs", startTime); + running = true; + + seqRefs = new Hashtable(); try { int seqIndex = 0; - Vector sequences = dataset.getSequences(); - while (seqIndex < sequences.size()) + while (seqIndex < dataset.length) { - Vector ids = new Vector(); + StringBuffer queryString = new StringBuffer("uniprot:"); - for (int i = 0; (seqIndex < sequences.size()) && (i < 50); + for (int i = 0; (seqIndex < dataset.length) && (i < 50); seqIndex++, i++) { - Sequence sequence = (Sequence) sequences.get(seqIndex); - Vector uprefs = jalview.util.DBRefUtils.selectRefs(sequence.getDBRef(), new String[] { + SequenceI sequence = dataset[seqIndex]; + DBRefEntry[] uprefs = jalview.util.DBRefUtils.selectRefs(sequence. + getDBRef(), new String[] + { jalview.datamodel.DBRefSource.UNIPROT}); - if (uprefs!=null) + if (uprefs != null) { - // we know the id for this entry, so don't note its ID in the unknownSequences list - for (int j=0,k=uprefs.size(); j 50) { - ids.add(sequence.getName()); - unknownSequences.add(sequence); + break; + } + + for (int j = 0; j < uprefs.length; j++) + { + addSeqId(sequence, uprefs[j].getAccessionId()); + queryString.append(uprefs[j].getAccessionId() + ";"); + } + } + else + { + StringTokenizer st = new StringTokenizer(sequence.getName(), "|"); + if (st.countTokens() + i > 50) + { + //Dont send more than 50 id strings to dbFetch!! + seqIndex--; + } + else + { + while (st.hasMoreTokens()) + { + String token = st.nextToken(); + addSeqId(sequence, token); + queryString.append(token + ";"); + } } } } /////////////////////////////////// ///READ FROM EBI - if (ids.size() > 0) + EBIFetchClient ebi = new EBIFetchClient(); + File file = ebi.fetchDataAsFile(queryString.toString(), "xml", "raw"); + if (file != null) { - StringBuffer remainingIds = new StringBuffer("uniprot:"); - for (int i = 0; i < ids.size(); i++) - { - if(ids.get(i).toString().indexOf("|")>-1) - { - remainingIds.append(ids.get(i).toString().substring( - ids.get(i).toString().lastIndexOf("|") + 1)); - uniprotFlag = true; - } - remainingIds.append(ids.get(i) + ";"); - } - EBIFetchClient ebi = new EBIFetchClient(); - File file = ebi.fetchDataAsFile(remainingIds.toString(), - "xml", "raw"); - - - - if (file != null) - { - ReadUniprotFile(file, ids); - } + ReadUniprotFile(file); } } } @@ -181,31 +240,10 @@ public class DBRefFetcher implements Runnable } af.setProgressBar("DBRef search completed", startTime); - // promptBeforeBlast(); + // promptBeforeBlast(); - } + running = false; - - void promptBeforeBlast() - { - // This must be outside the run() body as java 1.5 - // will not return any value from the OptionPane to the expired thread. - if (unknownSequences.size() > 0) - { - // int reply = javax.swing.JOptionPane.showConfirmDialog( - // Desktop.desktop, "Couldn't find a match for "+unknownSequences.size()+" sequences." - // +"\nPerform blast for unknown sequences?", - // "Blast for Unidentified Sequences", - // javax.swing.JOptionPane.YES_NO_OPTION, javax.swing.JOptionPane.QUESTION_MESSAGE); - javax.swing.JOptionPane.showMessageDialog( - Desktop.desktop, "Couldn't find a match for "+unknownSequences.size()+" sequences.", - "Unidentified Sequences", - javax.swing.JOptionPane.WARNING_MESSAGE); - - - // if(reply == javax.swing.JOptionPane.YES_OPTION) - // new WSWUBlastClient(ap, align, unknownSequences); - } } /** @@ -215,73 +253,89 @@ public class DBRefFetcher implements Runnable * @param out DOCUMENT ME! * @param align DOCUMENT ME! */ - void ReadUniprotFile(File file, Vector ids) + void ReadUniprotFile(File file) { - if(!file.exists()) + if (!file.exists()) + { return; + } SequenceI sequence = null; Vector entries = getUniprotEntries(file); - int i, iSize = entries==null?0:entries.size(); + int i, iSize = entries == null ? 0 : entries.size(); UniprotEntry entry; for (i = 0; i < iSize; i++) { entry = (UniprotEntry) entries.elementAt(i); - String idmatch = entry.getAccession().elementAt(0).toString(); - sequence = dataset.findName(idmatch); - if (sequence == null) + //Work out which sequences this Uniprot file has matches to, + //taking into account all accessionIds and names in the file + Vector sequenceMatches = new Vector(); + for (int j = 0; j < entry.getAccession().size(); j++) { - //Sequence maybe Name, not Accession - idmatch = entry.getName().elementAt(0).toString(); - sequence = dataset.findName(idmatch); + String accessionId = entry.getAccession().elementAt(j).toString(); + if (seqRefs.containsKey(accessionId)) + { + Vector seqs = (Vector) seqRefs.get(accessionId); + for (int jj = 0; jj < seqs.size(); jj++) + { + sequence = (SequenceI) seqs.elementAt(jj); + if (!sequenceMatches.contains(sequence)) + { + sequenceMatches.addElement(sequence); + } + } + } } - - if(sequence!=null) - ids.remove(sequence.getName()); - - else if (sequence == null && uniprotFlag) + for (int j = 0; j < entry.getName().size(); j++) { - sequence = dataset.findName("UniProt/Swiss-Prot|"+entry.getAccession().elementAt(0)+"|"+idmatch); - ids.remove(idmatch); + String name = entry.getName().elementAt(j).toString(); + if (seqRefs.containsKey(name)) + { + Vector seqs = (Vector) seqRefs.get(name); + for (int jj = 0; jj < seqs.size(); jj++) + { + sequence = (SequenceI) seqs.elementAt(jj); + if (!sequenceMatches.contains(sequence)) + { + sequenceMatches.addElement(sequence); + } + } + } } - if(sequence ==null) + for (int m = 0; m < sequenceMatches.size(); m++) { - System.out.println(idmatch+" not found"); - continue; - } + sequence = (SequenceI) sequenceMatches.elementAt(m); + sequence.addDBRef(new DBRefEntry(DBRefSource.UNIPROT, + "0", + entry.getAccession().elementAt(0). + toString())); + System.out.println("Adding dbref to " + sequence.getName() + " : " + + entry.getAccession().elementAt(0).toString()); - sequence.addDBRef(new DBRefEntry(DBRefSource.UNIPROT, - "1.0", - entry.getAccession().elementAt(0).toString())); + String nonGapped = AlignSeq.extractGaps("-. ", + sequence.getSequenceAsString()). + toUpperCase(); - System.out.println("Adding dbref to "+sequence.getName()+" : "+ - entry.getAccession().elementAt(0).toString()); + int absStart = entry.getUniprotSequence().getContent().indexOf( + nonGapped.toString()); - String nonGapped = AlignSeq.extractGaps("-. ", sequence.getSequence()); - - int absStart = entry.getUniprotSequence().getContent().indexOf( - nonGapped.toString()); - - if (absStart == -1) - { - // Is UniprotSequence contained in dataset sequence? - absStart = nonGapped.toString().indexOf(entry.getUniprotSequence().getContent()); - if(absStart == -1) - { - sbuffer.append(sequence.getName() + - " SEQUENCE NOT %100 MATCH \n"); - - continue; - } - else + if (absStart == -1) { + // Is UniprotSequence contained in dataset sequence? + absStart = nonGapped.toString().indexOf(entry.getUniprotSequence(). + getContent()); + if (absStart == -1) + { + sbuffer.append(sequence.getName() + " SEQUENCE NOT %100 MATCH \n"); + continue; + } - if(entry.getFeature()!=null) + if (entry.getFeature() != null) { Enumeration e = entry.getFeature().elements(); while (e.hasMoreElements()) @@ -290,80 +344,44 @@ public class DBRefFetcher implements Runnable sf.setBegin(sf.getBegin() + absStart + 1); sf.setEnd(sf.getEnd() + absStart + 1); } + + sbuffer.append(sequence.getName() + + " HAS " + absStart + + " PREFIXED RESIDUES COMPARED TO UNIPROT - ANY SEQUENCE FEATURES" + + " HAVE BEEN ADJUSTED ACCORDINGLY \n"); + absStart = 0; } - sbuffer.append(sequence.getName() + - " HAS "+absStart+" PREFIXED RESIDUES COMPARED TO UNIPROT - ANY SEQUENCE FEATURES" - +" HAVE BEEN ADJUSTED ACCORDINGLY \n"); - absStart = 0; } - } - - unknownSequences.remove(sequence); - - int absEnd = absStart + nonGapped.toString().length(); - absStart += 1; - - Enumeration e = entry.getDbReference().elements(); - Vector onlyPdbEntries = new Vector(); - while(e.hasMoreElements()) - { - PDBEntry pdb = (PDBEntry)e.nextElement(); - if(!pdb.getType().equals("PDB")) - continue; - - onlyPdbEntries.addElement(pdb); - } + //unknownSequences.remove(sequence); - sequence.setPDBId(onlyPdbEntries); + int absEnd = absStart + nonGapped.toString().length(); + absStart += 1; - if (entry.getFeature()!=null) { - //e = entry.getFeature().elements(); - // while (e.hasMoreElements()) + Enumeration e = entry.getDbReference().elements(); + Vector onlyPdbEntries = new Vector(); + while (e.hasMoreElements()) { - // SequenceFeature sf = (SequenceFeature) e.nextElement(); - // sf.setFeatureGroup("Uniprot"); - // sequence.addSequenceFeature( sf ); - } - } - sequence.setStart(absStart); - sequence.setEnd(absEnd); - + PDBEntry pdb = (PDBEntry) e.nextElement(); + if (!pdb.getType().equals(DBRefSource.PDB)) + { + continue; + } - int n = 0; - SequenceI seq2; - while (n < align.getHeight()) - { - //This loop enables multiple sequences with the same - //id to have features added and seq limits updated - seq2 = align.getSequenceAt(n); - if (seq2.getName().equals(idmatch)) - { - nonGapped = AlignSeq.extractGaps("-. ", seq2.getSequence()); + sequence.addDBRef(new DBRefEntry(DBRefSource.PDB, + "0", + pdb.getId())); - absStart = sequence.getSequence().indexOf(nonGapped); - absEnd = absStart + nonGapped.toString().length() - 1; + onlyPdbEntries.addElement(pdb); + } - // This is the Viewd alignment sequences - // No need to tell the user of the dataset updates - if ( (seq2.getStart() != absStart+sequence.getStart()) - || (seq2.getEnd() != absEnd+sequence.getStart())) - { - sbuffer.append("Updated: " + seq2.getName() + " " + - seq2.getStart() + "/" + seq2.getEnd() + - " to " + (absStart + sequence.getStart()) + "/" + - (absEnd + sequence.getStart()) + "\n"); + sequence.setPDBId(onlyPdbEntries); - seq2.setStart(absStart + sequence.getStart()); - seq2.setEnd(absEnd + sequence.getStart()); - } - } + sequence.setStart(absStart); + sequence.setEnd(absEnd); - n++; } } } } - -