X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2FSequenceFeatureFetcher.java;h=edbf86210970a76d083e2e34dc88b84a1223bc2c;hb=91416b3038a6f3fc655791512770da07cb6cb251;hp=8cfa6b89c050cb8404c25d64f6ca5c07caa83fe5;hpb=af0a8883d743fc5f3e0382ceccac903d380f0ee6;p=jalview.git diff --git a/src/jalview/io/SequenceFeatureFetcher.java b/src/jalview/io/SequenceFeatureFetcher.java index 8cfa6b8..edbf862 100755 --- a/src/jalview/io/SequenceFeatureFetcher.java +++ b/src/jalview/io/SequenceFeatureFetcher.java @@ -1,368 +1,363 @@ +/* +* Jalview - A Sequence Alignment Editor and Viewer +* Copyright (C) 2005 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License +* as published by the Free Software Foundation; either version 2 +* of the License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +*/ package jalview.io; +import jalview.datamodel.*; + +import jalview.gui.*; + import java.io.*; + import java.util.*; -import javax.swing.*; -import jalview.io.*; -import jalview.gui.*; -import jalview.datamodel.*; +import org.exolab.castor.mapping.Mapping; + +import org.exolab.castor.xml.*; +import jalview.analysis.AlignSeq; + + + +/** + * DOCUMENT ME! + * + * @author $author$ + * @version $Revision$ + */ public class SequenceFeatureFetcher implements Runnable { + AlignmentI align; + AlignmentI dataset; AlignmentPanel ap; ArrayList unknownSequences; CutAndPasteTransfer output = new CutAndPasteTransfer(); StringBuffer sbuffer = new StringBuffer(); + boolean uniprotFlag = false; + + public SequenceFeatureFetcher() + {} + + public Vector getUniprotEntries(File file) + { + UniprotFile uni = new UniprotFile(); + try + { + // 1. Load the mapping information from the file + Mapping map = new Mapping(uni.getClass().getClassLoader()); + java.net.URL url = getClass().getResource("/uniprot_mapping.xml"); + map.loadMapping(url); + + // 2. Unmarshal the data + Unmarshaller unmar = new Unmarshaller(uni); + unmar.setIgnoreExtraElements(true); + unmar.setMapping(map); + // unmar.setDebug(true); + + uni = (UniprotFile) unmar.unmarshal(new FileReader(file)); + } + catch (Exception e) + { + System.out.println("Error getUniprotEntries() "+e); + } + + + return uni.getUniprotEntries(); + } + + /** + * Creates a new SequenceFeatureFetcher object. + * + * @param align DOCUMENT ME! + * @param ap DOCUMENT ME! + */ public SequenceFeatureFetcher(AlignmentI align, AlignmentPanel ap) { unknownSequences = new ArrayList(); this.align = align; + this.dataset = align.getDataset(); this.ap = ap; + Thread thread = new Thread(this); thread.start(); } + /** + * DOCUMENT ME! + */ public void run() -{ - - String cache = jalview.bin.Cache.getProperty("UNIPROT_CACHE"); - - RandomAccessFile out = null; - - try{ - if (cache == null) + { + try { - jalview.bin.Cache.setProperty("UNIPROT_CACHE", System.getProperty("user.home")+"/uniprot.xml"); - cache = jalview.bin.Cache.getProperty("UNIPROT_CACHE"); - } + int seqIndex = 0; + Vector sequences = dataset.getSequences(); + while (seqIndex < sequences.size()) + { + Vector ids = new Vector(); + for (int i = 0; (seqIndex < sequences.size()) && (i < 50); + seqIndex++, i++) + { + Sequence sequence = (Sequence) sequences.get(seqIndex); + Vector uprefs = jalview.util.DBRefUtils.selectRefs(sequence.getDBRef(), new String[] { + jalview.datamodel.DBRefSource.UNIPROT}); + if (uprefs!=null) + { + // we know the id for this entry, so don't note its ID in the unknownSequences list + for (int j=0,k=uprefs.size(); j 0) + { + StringBuffer remainingIds = new StringBuffer("uniprot:"); + for (int i = 0; i < ids.size(); i++) + { + if(ids.get(i).toString().indexOf("|")>-1) + { + remainingIds.append(ids.get(i).toString().substring( + ids.get(i).toString().lastIndexOf("|") + 1)); + uniprotFlag = true; + } + remainingIds.append(ids.get(i) + ";"); + } + EBIFetchClient ebi = new EBIFetchClient(); + File file = ebi.fetchDataAsFile(remainingIds.toString(), + "xml", "raw"); + + + + if (file != null) + { + ReadUniprotFile(file, ids); + } + } + } + } + catch (Exception ex) { - out = new RandomAccessFile(cache, "rw"); - out.writeBytes("\n"); - out.writeBytes("\n"); + ex.printStackTrace(); } - else + + if (sbuffer.length() > 0) { - out = new RandomAccessFile(cache, "rw"); - // open exisiting cache and remove from the end - long lastLine = 0; - String data; - while ( (data = out.readLine()) != null) - { - if (data.indexOf("") > -1) - lastLine = out.getFilePointer(); + output.setText( + "Your sequences have been matched to Uniprot. Some of the ids have been\n" + + "altered, most likely the start/end residue will have been updated.\n" + + "Save your alignment to maintain the updated id.\n\n" + + sbuffer.toString()); + Desktop.addInternalFrame(output, "Sequence names updated ", 600, 300); + // The above is the dataset, we must now find out the index + // of the viewed sequence - } - out.seek(lastLine); } - int seqIndex = 0; - Vector sequences = align.getSequences(); + promptBeforeBlast(); - while (seqIndex < sequences.size()) - { - ArrayList ids = new ArrayList(); - for (int i = 0; seqIndex < sequences.size() && i < 50; seqIndex++, i++) - { - SequenceI sequence = (SequenceI) sequences.get(seqIndex); - ids.add(sequence.getName()); - } + } - tryLocalCacheFirst(ids, align); - if (ids.size() > 0) + void promptBeforeBlast() + { + // This must be outside the run() body as java 1.5 + // will not return any value from the OptionPane to the expired thread. + if (unknownSequences.size() > 0) { - StringBuffer remainingIds = new StringBuffer("uniprot:"); - for (int i = 0; i < ids.size(); i++) - remainingIds.append(ids.get(i) + ";"); - - EBIFetchClient ebi = new EBIFetchClient(); - String[] result = ebi.fetchData(remainingIds.toString(), "xml", null); - - if(result!=null) - ReadUniprotFile(result, out, align); + // int reply = javax.swing.JOptionPane.showConfirmDialog( + // Desktop.desktop, "Couldn't find a match for "+unknownSequences.size()+" sequences." + // +"\nPerform blast for unknown sequences?", + // "Blast for Unidentified Sequences", + // javax.swing.JOptionPane.YES_NO_OPTION, javax.swing.JOptionPane.QUESTION_MESSAGE); + javax.swing.JOptionPane.showMessageDialog( + Desktop.desktop, "Couldn't find a match for "+unknownSequences.size()+" sequences.", + "Unidentified Sequences", + javax.swing.JOptionPane.WARNING_MESSAGE); + + + // if(reply == javax.swing.JOptionPane.YES_OPTION) + // new WSWUBlastClient(ap, align, unknownSequences); } - } - - if (out != null) - { - out.writeBytes("\n"); - out.close(); - } - }catch(Exception ex){ex.printStackTrace();} - - ap.repaint(); - findMissingIds(align); - if(sbuffer.length()>0) - { - output.setText("Your sequences have been matched to Uniprot. Some of the ids have been\n" - +"altered, most likely the start/end residue will have been updated.\n" - +"Save your alignment to maintain the updated id.\n\n"+sbuffer.toString()); - Desktop.addInternalFrame(output, "Sequence names updated ", 600,300); + ap.repaint(); } - if(unknownSequences.size()>0) + /** + * DOCUMENT ME! + * + * @param result DOCUMENT ME! + * @param out DOCUMENT ME! + * @param align DOCUMENT ME! + */ + void ReadUniprotFile(File file, Vector ids) { - //ignore for now!!!!!!!!!! - // WSWUBlastClient blastClient = new WSWUBlastClient(align, unknownSequences); - } - -} + if(!file.exists()) + return; -void ReadUniprotFile(String [] result, RandomAccessFile out, AlignmentI align) -{ - SequenceI sequence = null; - Vector features = null; - String type, description, status, start, end, pdb = null; + SequenceI sequence = null; + Vector entries = getUniprotEntries(file); - for (int r = 0; r < result.length; r++) - { - if(sequence==null && result[r].indexOf("")>-1) + int i, iSize = entries==null?0:entries.size(); + UniprotEntry entry; + for (i = 0; i < iSize; i++) { - long filePointer = 0; + entry = (UniprotEntry) entries.elementAt(i); + String idmatch = entry.getAccession().elementAt(0).toString(); + sequence = dataset.findName(idmatch); - if(out!=null) - try{ - filePointer=out.getFilePointer(); - out.writeBytes("\n"); - }catch(Exception ex){} - - String seqName = parseElement( result[r], "" , out); - sequence = align.findName( seqName ) ; - if(sequence==null) + if (sequence == null) { - sequence = align.findName( seqName.substring(0, seqName.indexOf('_'))); - if(sequence!=null) - { - sbuffer.append("changing "+sequence.getName()+" to "+seqName+"\n"); - sequence.setName(seqName); - } - } - if(sequence==null) - { - sbuffer.append("UNIPROT updated suggestion is "+result[r]+"\n"); - sequence = align.findName( result[r] ) ; - - // this entry has been suggested by ebi. - // doesn't match id in alignment file - try { out.setLength(filePointer); } catch (Exception ex) {} - // now skip to next entry - while( result[r].indexOf("")==-1) - r++; + //Sequence maybe Name, not Accession + idmatch = entry.getName().elementAt(0).toString(); + sequence = dataset.findName(idmatch); } - features = new Vector(); - type=""; start="0"; end="0"; description=""; status=""; pdb=""; + if(sequence!=null) + ids.remove(sequence.getName()); - } - - if(sequence==null) - continue; - - if( result[r].indexOf("-1) - { - pdb = parseValue( result[r], "value=" , out); - sequence.setPDBId(pdb); - } - - if(result[r].indexOf("feature type")>-1) - { - type = parseValue( result[r], "type=" , out); - description = parseValue( result[r], "description=" , null ); - status = parseValue ( result[r], "status=", null); - - while( result[r].indexOf("position")==-1) - { - r++; // - } - // r++; - if(result[r].indexOf("begin")>-1) - { - start = parseValue( result[r], "position=" , out); - end = parseValue( result[++r], "position=" , out); - } - else - { - start = parseValue( result[r], "position=" , out); - end = parseValue( result[r], "position=" , null); - } - int sstart = Integer.parseInt(start); - int eend = Integer.parseInt(end); - if(out!=null) - try{ out.writeBytes("\n"); }catch(Exception ex){} - - SequenceFeature sf = new SequenceFeature(type, - sstart, - eend, - description, - status); - features.add(sf); - } - - if(result[r].indexOf("-1) - { - StringBuffer seqString = new StringBuffer(); - - if(out!=null) - try { out.writeBytes(result[r]+"\n"); } catch (Exception ex){} - - while(result[++r].indexOf("")==-1) - { - seqString.append(result[r]); - if(out!=null) - try { out.writeBytes(result[r]+"\n"); } catch (Exception ex){} - } - - if(out!=null) - try { out.writeBytes(result[r]+"\n"); } catch (Exception ex){} - - StringBuffer nonGapped = new StringBuffer(); - for (int i = 0; i < sequence.getSequence().length(); i++) - { - if (!jalview.util.Comparison.isGap(sequence.getCharAt(i))) - nonGapped.append(sequence.getCharAt(i)); - } - - int absStart = seqString.toString().indexOf(nonGapped.toString()); - if(absStart==-1) - { - unknownSequences.add(sequence.getName()); - features = null; - sbuffer.append(sequence.getName()+ " SEQUENCE NOT %100 MATCH \n"); - continue; - } - - int absEnd = absStart + nonGapped.toString().length(); - absStart+=1; - - if(absStart!=sequence.getStart() || absEnd!=sequence.getEnd()) - sbuffer.append("Updated: "+sequence.getName()+" "+ - sequence.getStart()+"/"+sequence.getEnd()+" to "+ absStart+"/"+absEnd+"\n"); - - - sequence.setStart(absStart); - sequence.setEnd(absEnd); - - } - - if(result[r].indexOf("")>-1) - { - if(features!=null) - sequence.setSequenceFeatures( features ); - features = null; - sequence = null; - if(out!=null) - try{ out.writeBytes("\n"); }catch(Exception ex){} - - } - } -} - -void findMissingIds(AlignmentI align) -{ - String data; - ArrayList cachedIds = new ArrayList(); - - try - { - BufferedReader in = new BufferedReader( - new FileReader(jalview.bin.Cache.getProperty("UNIPROT_CACHE"))); + else if (sequence == null && uniprotFlag) + { + sequence = dataset.findName("UniProt/Swiss-Prot|"+entry.getAccession().elementAt(0)+"|"+idmatch); + ids.remove(idmatch); + } - while ( (data = in.readLine()) != null) - { - if (data.indexOf("name") > -1) + if(sequence ==null) { - String name = parseElement(data, "", null); - cachedIds.add(name); + System.out.println(idmatch+" not found"); + continue; } - } - } - catch (Exception ex) - { ex.printStackTrace(); } - for(int i=0; i-1) + if (absStart == -1) { - String name = parseElement( data, "" , null) ; - if(ids.contains( name ) ) + // Is UniprotSequence contained in dataset sequence? + absStart = nonGapped.toString().indexOf(entry.getUniprotSequence().getContent()); + if(absStart == -1) + { + sbuffer.append(sequence.getName() + + " SEQUENCE NOT %100 MATCH \n"); + + continue; + } + else { - cacheData.add(""); - cacheData.add(data); - while( data.indexOf("")==-1) + + if(entry.getFeature()!=null) { - data = in.readLine(); - cacheData.add(data); + Enumeration e = entry.getFeature().elements(); + while (e.hasMoreElements()) + { + SequenceFeature sf = (SequenceFeature) e.nextElement(); + sf.setBegin(sf.getBegin() + absStart + 1); + sf.setEnd(sf.getEnd() + absStart + 1); + } } - cacheData.add(data); - ids.remove( name ); + sbuffer.append(sequence.getName() + + " HAS "+absStart+" PREFIXED RESIDUES COMPARED TO UNIPROT - ANY SEQUENCE FEATURES" + +" HAVE BEEN ADJUSTED ACCORDINGLY \n"); + absStart = 0; } + } - } - } - catch(Exception ex){ex.printStackTrace();} - String [] localData = new String[cacheData.size()]; - cacheData.toArray( localData ); - if(localData!=null && localData.length>0) - ReadUniprotFile(localData, null, align); -} + unknownSequences.remove(sequence); + int absEnd = absStart + nonGapped.toString().length(); + absStart += 1; -String parseValue(String line, String tag, RandomAccessFile out) -{ - if(out!=null) - try{ out.writeBytes(line+"\n"); }catch(Exception ex){} + Enumeration e = entry.getDbReference().elements(); + Vector onlyPdbEntries = new Vector(); + while(e.hasMoreElements()) + { + PDBEntry pdb = (PDBEntry)e.nextElement(); + if(!pdb.getType().equals("PDB")) + continue; + onlyPdbEntries.addElement(pdb); + } - int index = line.indexOf(tag)+tag.length()+1; - if(index==tag.length()) - return ""; + sequence.setPDBId(onlyPdbEntries); + if (entry.getFeature()!=null) { + e = entry.getFeature().elements(); + while (e.hasMoreElements()) + { + SequenceFeature sf = (SequenceFeature) e.nextElement(); + sf.setFeatureGroup("Uniprot"); + sequence.addSequenceFeature( sf ); + } + } + sequence.setStart(absStart); + sequence.setEnd(absEnd); - return line.substring( index, line.indexOf("\"", index+1) ); -} + int n = 0; + SequenceI seq2; + while (n < align.getHeight()) + { + //This loop enables multiple sequences with the same + //id to have features added and seq limits updated + seq2 = align.getSequenceAt(n); + if (seq2.getName().equals(idmatch)) + { -String parseElement(String line, String tag, RandomAccessFile out) -{ - if (out != null) - try - { - out.writeBytes(line + "\n"); - } - catch (Exception ex) - {} + nonGapped = AlignSeq.extractGaps("-. ", seq2.getSequence()); + + absStart = sequence.getSequence().indexOf(nonGapped); + absEnd = absStart + nonGapped.toString().length() - 1; + + // This is the Viewd alignment sequences + // No need to tell the user of the dataset updates + if ( (seq2.getStart() != absStart+sequence.getStart()) + || (seq2.getEnd() != absEnd+sequence.getStart())) + { + sbuffer.append("Updated: " + seq2.getName() + " " + + seq2.getStart() + "/" + seq2.getEnd() + + " to " + (absStart + sequence.getStart()) + "/" + + (absEnd + sequence.getStart()) + "\n"); - int index = line.indexOf(tag) + tag.length(); - return line.substring(index, line.indexOf("