X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=src%2Fjalview%2Fio%2FSequenceFeatureFetcher.java;h=edbf86210970a76d083e2e34dc88b84a1223bc2c;hb=91416b3038a6f3fc655791512770da07cb6cb251;hp=89d84b03b12b760a0474b4e40953beaf705aff3c;hpb=efc31b4a8d5cee63555586804a2b79c06bdb5a14;p=jalview.git diff --git a/src/jalview/io/SequenceFeatureFetcher.java b/src/jalview/io/SequenceFeatureFetcher.java index 89d84b0..edbf862 100755 --- a/src/jalview/io/SequenceFeatureFetcher.java +++ b/src/jalview/io/SequenceFeatureFetcher.java @@ -22,12 +22,16 @@ import jalview.datamodel.*; import jalview.gui.*; -import jalview.io.*; - import java.io.*; import java.util.*; +import org.exolab.castor.mapping.Mapping; + +import org.exolab.castor.xml.*; +import jalview.analysis.AlignSeq; + + /** * DOCUMENT ME! @@ -37,521 +41,323 @@ import java.util.*; */ public class SequenceFeatureFetcher implements Runnable { - AlignmentI align; - AlignmentPanel ap; - ArrayList unknownSequences; - CutAndPasteTransfer output = new CutAndPasteTransfer(); - StringBuffer sbuffer = new StringBuffer(); - - /** - * Creates a new SequenceFeatureFetcher object. - * - * @param align DOCUMENT ME! - * @param ap DOCUMENT ME! - */ - public SequenceFeatureFetcher(AlignmentI align, AlignmentPanel ap) - { - unknownSequences = new ArrayList(); - this.align = align; - this.ap = ap; - - Thread thread = new Thread(this); - thread.start(); - } - - /** - * DOCUMENT ME! - */ - public void run() - { - RandomAccessFile out = null; - - try - { - String cache = System.getProperty("user.home") + - "/.jalview.uniprot.xml"; - - File test = new File(cache); - if (!test.exists()) - { - out = new RandomAccessFile(cache, "rw"); - out.writeBytes("\n"); - out.writeBytes("\n"); - } - else - { - out = new RandomAccessFile(cache, "rw"); - - // open exisiting cache and remove from the end - long lastLine = 0; - String data; - - while ((data = out.readLine()) != null) - { - if (data.indexOf("") > -1) - { - lastLine = out.getFilePointer(); - } - } - - out.seek(lastLine); - } - - int seqIndex = 0; - Vector sequences = align.getSequences(); - - while (seqIndex < sequences.size()) - { - ArrayList ids = new ArrayList(); + AlignmentI align; + AlignmentI dataset; + AlignmentPanel ap; + ArrayList unknownSequences; + CutAndPasteTransfer output = new CutAndPasteTransfer(); + StringBuffer sbuffer = new StringBuffer(); + boolean uniprotFlag = false; - for (int i = 0; (seqIndex < sequences.size()) && (i < 50); - seqIndex++, i++) - { - SequenceI sequence = (SequenceI) sequences.get(seqIndex); - ids.add(sequence.getName()); - } + public SequenceFeatureFetcher() + {} - tryLocalCacheFirst(ids, align); + public Vector getUniprotEntries(File file) + { - if (ids.size() > 0) - { - StringBuffer remainingIds = new StringBuffer("uniprot:"); + UniprotFile uni = new UniprotFile(); + try + { + // 1. Load the mapping information from the file + Mapping map = new Mapping(uni.getClass().getClassLoader()); + java.net.URL url = getClass().getResource("/uniprot_mapping.xml"); + map.loadMapping(url); + + // 2. Unmarshal the data + Unmarshaller unmar = new Unmarshaller(uni); + unmar.setIgnoreExtraElements(true); + unmar.setMapping(map); + // unmar.setDebug(true); + + uni = (UniprotFile) unmar.unmarshal(new FileReader(file)); + } + catch (Exception e) + { + System.out.println("Error getUniprotEntries() "+e); + } - for (int i = 0; i < ids.size(); i++) - remainingIds.append(ids.get(i) + ";"); - EBIFetchClient ebi = new EBIFetchClient(); - String[] result = ebi.fetchData(remainingIds.toString(), - "xml", null); + return uni.getUniprotEntries(); + } + + /** + * Creates a new SequenceFeatureFetcher object. + * + * @param align DOCUMENT ME! + * @param ap DOCUMENT ME! + */ + public SequenceFeatureFetcher(AlignmentI align, AlignmentPanel ap) + { + unknownSequences = new ArrayList(); + this.align = align; + this.dataset = align.getDataset(); + this.ap = ap; + + Thread thread = new Thread(this); + thread.start(); + } + + /** + * DOCUMENT ME! + */ + public void run() + { + try + { + int seqIndex = 0; + Vector sequences = dataset.getSequences(); - if (result != null) - { - ReadUniprotFile(result, out, align); - } - } - } + while (seqIndex < sequences.size()) + { + Vector ids = new Vector(); - if (out != null) + for (int i = 0; (seqIndex < sequences.size()) && (i < 50); + seqIndex++, i++) + { + Sequence sequence = (Sequence) sequences.get(seqIndex); + Vector uprefs = jalview.util.DBRefUtils.selectRefs(sequence.getDBRef(), new String[] { + jalview.datamodel.DBRefSource.UNIPROT}); + if (uprefs!=null) + { + // we know the id for this entry, so don't note its ID in the unknownSequences list + for (int j=0,k=uprefs.size(); j\n"); - out.close(); + ids.add(sequence.getName()); + unknownSequences.add(sequence); } + } } - catch (Exception ex) - { - ex.printStackTrace(); - } - - findMissingIds(align); - if (sbuffer.length() > 0) + /////////////////////////////////// + ///READ FROM EBI + if (ids.size() > 0) { - output.setText( - "Your sequences have been matched to Uniprot. Some of the ids have been\n" + - "altered, most likely the start/end residue will have been updated.\n" + - "Save your alignment to maintain the updated id.\n\n" + - sbuffer.toString()); - Desktop.addInternalFrame(output, "Sequence names updated ", 600, 300); + StringBuffer remainingIds = new StringBuffer("uniprot:"); + for (int i = 0; i < ids.size(); i++) + { + if(ids.get(i).toString().indexOf("|")>-1) + { + remainingIds.append(ids.get(i).toString().substring( + ids.get(i).toString().lastIndexOf("|") + 1)); + uniprotFlag = true; + } + remainingIds.append(ids.get(i) + ";"); + } + EBIFetchClient ebi = new EBIFetchClient(); + File file = ebi.fetchDataAsFile(remainingIds.toString(), + "xml", "raw"); + + + + if (file != null) + { + ReadUniprotFile(file, ids); + } } + } + } + catch (Exception ex) + { + ex.printStackTrace(); + } - if (unknownSequences.size() > 0) - { - //ignore for now!!!!!!!!!! - // WSWUBlastClient blastClient = new WSWUBlastClient(align, unknownSequences); - } + if (sbuffer.length() > 0) + { + output.setText( + "Your sequences have been matched to Uniprot. Some of the ids have been\n" + + "altered, most likely the start/end residue will have been updated.\n" + + "Save your alignment to maintain the updated id.\n\n" + + sbuffer.toString()); + Desktop.addInternalFrame(output, "Sequence names updated ", 600, 300); + // The above is the dataset, we must now find out the index + // of the viewed sequence - jalview.gui.PaintRefresher.Refresh(null, align); } - /** - * DOCUMENT ME! - * - * @param result DOCUMENT ME! - * @param out DOCUMENT ME! - * @param align DOCUMENT ME! - */ - void ReadUniprotFile(String[] result, RandomAccessFile out, AlignmentI align) - { - SequenceI sequence = null; - Vector features = null; - String type; - String description; - String status; - String start; - String end; - String pdb = null; - - for (int r = 0; r < result.length; r++) - { - if ((sequence == null) && (result[r].indexOf("") > -1)) - { - long filePointer = 0; - - if (out != null) - { - try - { - filePointer = out.getFilePointer(); - out.writeBytes("\n"); - } - catch (Exception ex) - { - } - } - - String seqName = parseElement(result[r], "", out); - sequence = align.findName(seqName); - - if (sequence == null) - { - sequence = align.findName(seqName.substring(0, - seqName.indexOf('_'))); - - if (sequence != null) - { - sbuffer.append("changing " + sequence.getName() + - " to " + seqName + "\n"); - sequence.setName(seqName); - } - } - - if (sequence == null) - { - sbuffer.append("UNIPROT updated suggestion is " + - result[r] + "\n"); - sequence = align.findName(result[r]); - - // this entry has been suggested by ebi. - // doesn't match id in alignment file - try - { - out.setLength(filePointer); - } - catch (Exception ex) - { - } - - // now skip to next entry - while (result[r].indexOf("") == -1) - r++; - } - - features = new Vector(); - type = ""; - start = "0"; - end = "0"; - description = ""; - status = ""; - pdb = ""; - } + promptBeforeBlast(); - if (sequence == null) - { - continue; - } + } - if (result[r].indexOf(" -1) - { - pdb = parseValue(result[r], "value=", out); - sequence.setPDBId(pdb); - } - if (result[r].indexOf("feature type") > -1) - { - type = parseValue(result[r], "type=", out); - description = parseValue(result[r], "description=", null); - status = parseValue(result[r], "status=", null); - - while (result[r].indexOf("position") == -1) - { - r++; // - } - - // r++; - if (result[r].indexOf("begin") > -1) - { - start = parseValue(result[r], "position=", out); - end = parseValue(result[++r], "position=", out); - } - else - { - start = parseValue(result[r], "position=", out); - end = parseValue(result[r], "position=", null); - } - - int sstart = Integer.parseInt(start); - int eend = Integer.parseInt(end); - - if (out != null) - { - try - { - out.writeBytes("\n"); - } - catch (Exception ex) - { - } - } - - SequenceFeature sf = new SequenceFeature(type, sstart, eend, - description, status); - features.add(sf); - } + void promptBeforeBlast() + { + // This must be outside the run() body as java 1.5 + // will not return any value from the OptionPane to the expired thread. + if (unknownSequences.size() > 0) + { + // int reply = javax.swing.JOptionPane.showConfirmDialog( + // Desktop.desktop, "Couldn't find a match for "+unknownSequences.size()+" sequences." + // +"\nPerform blast for unknown sequences?", + // "Blast for Unidentified Sequences", + // javax.swing.JOptionPane.YES_NO_OPTION, javax.swing.JOptionPane.QUESTION_MESSAGE); + javax.swing.JOptionPane.showMessageDialog( + Desktop.desktop, "Couldn't find a match for "+unknownSequences.size()+" sequences.", + "Unidentified Sequences", + javax.swing.JOptionPane.WARNING_MESSAGE); - if (result[r].indexOf(" -1) - { - StringBuffer seqString = new StringBuffer(); - - if (out != null) - { - try - { - out.writeBytes(result[r] + "\n"); - } - catch (Exception ex) - { - } - } - - while (result[++r].indexOf("") == -1) - { - seqString.append(result[r]); - - if (out != null) - { - try - { - out.writeBytes(result[r] + "\n"); - } - catch (Exception ex) - { - } - } - } - - if (out != null) - { - try - { - out.writeBytes(result[r] + "\n"); - } - catch (Exception ex) - { - } - } - - StringBuffer nonGapped = new StringBuffer(); - - for (int i = 0; i < sequence.getSequence().length(); i++) - { - if (!jalview.util.Comparison.isGap(sequence.getCharAt(i))) - { - nonGapped.append(sequence.getCharAt(i)); - } - } - - int absStart = seqString.toString().indexOf(nonGapped.toString()); - - if (absStart == -1) - { - unknownSequences.add(sequence.getName()); - features = null; - sbuffer.append(sequence.getName() + - " SEQUENCE NOT %100 MATCH \n"); - - continue; - } - - int absEnd = absStart + nonGapped.toString().length(); - absStart += 1; - - if ((absStart != sequence.getStart()) || - (absEnd != sequence.getEnd())) - { - sbuffer.append("Updated: " + sequence.getName() + " " + - sequence.getStart() + "/" + sequence.getEnd() + - " to " + absStart + "/" + absEnd + "\n"); - } - - sequence.setStart(absStart); - sequence.setEnd(absEnd); - } - if (result[r].indexOf("") > -1) - { - if (features != null) - { - sequence.setSequenceFeatures(features); - } - - features = null; - sequence = null; - - if (out != null) - { - try - { - out.writeBytes("\n"); - } - catch (Exception ex) - { - } - } - } - } - } + // if(reply == javax.swing.JOptionPane.YES_OPTION) + // new WSWUBlastClient(ap, align, unknownSequences); + } - /** - * DOCUMENT ME! - * - * @param align DOCUMENT ME! - */ - void findMissingIds(AlignmentI align) - { - String data; - ArrayList cachedIds = new ArrayList(); - try + ap.repaint(); + } + + /** + * DOCUMENT ME! + * + * @param result DOCUMENT ME! + * @param out DOCUMENT ME! + * @param align DOCUMENT ME! + */ + void ReadUniprotFile(File file, Vector ids) + { + if(!file.exists()) + return; + + SequenceI sequence = null; + + Vector entries = getUniprotEntries(file); + + int i, iSize = entries==null?0:entries.size(); + UniprotEntry entry; + for (i = 0; i < iSize; i++) + { + entry = (UniprotEntry) entries.elementAt(i); + String idmatch = entry.getAccession().elementAt(0).toString(); + sequence = dataset.findName(idmatch); + + if (sequence == null) + { + //Sequence maybe Name, not Accession + idmatch = entry.getName().elementAt(0).toString(); + sequence = dataset.findName(idmatch); + } + + if(sequence!=null) + ids.remove(sequence.getName()); + + else if (sequence == null && uniprotFlag) + { + sequence = dataset.findName("UniProt/Swiss-Prot|"+entry.getAccession().elementAt(0)+"|"+idmatch); + ids.remove(idmatch); + } + + if(sequence ==null) + { + System.out.println(idmatch+" not found"); + continue; + } + + + String nonGapped = AlignSeq.extractGaps("-. ", sequence.getSequence()); + + int absStart = entry.getUniprotSequence().getContent().indexOf( + nonGapped.toString()); + + if (absStart == -1) + { + // Is UniprotSequence contained in dataset sequence? + absStart = nonGapped.toString().indexOf(entry.getUniprotSequence().getContent()); + if(absStart == -1) { - BufferedReader in = new BufferedReader(new FileReader( - jalview.bin.Cache.getProperty("UNIPROT_CACHE"))); + sbuffer.append(sequence.getName() + + " SEQUENCE NOT %100 MATCH \n"); - while ((data = in.readLine()) != null) - { - if (data.indexOf("name") > -1) - { - String name = parseElement(data, "", null); - cachedIds.add(name); - } - } + continue; } - catch (Exception ex) + else { - ex.printStackTrace(); - } - for (int i = 0; i < align.getHeight(); i++) - if (!cachedIds.contains(align.getSequenceAt(i).getName())) + if(entry.getFeature()!=null) + { + Enumeration e = entry.getFeature().elements(); + while (e.hasMoreElements()) { - unknownSequences.add(align.getSequenceAt(i).getName()); + SequenceFeature sf = (SequenceFeature) e.nextElement(); + sf.setBegin(sf.getBegin() + absStart + 1); + sf.setEnd(sf.getEnd() + absStart + 1); } - } + } - /** - * DOCUMENT ME! - * - * @param ids DOCUMENT ME! - * @param align DOCUMENT ME! - */ - void tryLocalCacheFirst(ArrayList ids, AlignmentI align) - { - ArrayList cacheData = new ArrayList(); + sbuffer.append(sequence.getName() + + " HAS "+absStart+" PREFIXED RESIDUES COMPARED TO UNIPROT - ANY SEQUENCE FEATURES" + +" HAVE BEEN ADJUSTED ACCORDINGLY \n"); + absStart = 0; + } - try - { - BufferedReader in = new BufferedReader(new FileReader( - jalview.bin.Cache.getProperty("UNIPROT_CACHE"))); + } - // read through cache file, if the cache has sequences we're looking for - // add the lines to a new String array, Readthis new array and - // make sure we remove the ids from the list to retrieve from EBI - String data; + unknownSequences.remove(sequence); - while ((data = in.readLine()) != null) - { - if (data.indexOf("name") > -1) - { - String name = parseElement(data, "", null); - - if (ids.contains(name)) - { - cacheData.add(""); - cacheData.add(data); - - while (data.indexOf("") == -1) - { - data = in.readLine(); - cacheData.add(data); - } - - cacheData.add(data); - - ids.remove(name); - } - } - } - } - catch (Exception ex) - { - ex.printStackTrace(); - } + int absEnd = absStart + nonGapped.toString().length(); + absStart += 1; - String[] localData = new String[cacheData.size()]; - cacheData.toArray(localData); + Enumeration e = entry.getDbReference().elements(); + Vector onlyPdbEntries = new Vector(); + while(e.hasMoreElements()) + { + PDBEntry pdb = (PDBEntry)e.nextElement(); + if(!pdb.getType().equals("PDB")) + continue; - if ((localData != null) && (localData.length > 0)) - { - ReadUniprotFile(localData, null, align); - } - } + onlyPdbEntries.addElement(pdb); + } - /** - * DOCUMENT ME! - * - * @param line DOCUMENT ME! - * @param tag DOCUMENT ME! - * @param out DOCUMENT ME! - * - * @return DOCUMENT ME! - */ - String parseValue(String line, String tag, RandomAccessFile out) - { - if (out != null) + sequence.setPDBId(onlyPdbEntries); + if (entry.getFeature()!=null) { + e = entry.getFeature().elements(); + while (e.hasMoreElements()) { - try - { - out.writeBytes(line + "\n"); - } - catch (Exception ex) - { - } + SequenceFeature sf = (SequenceFeature) e.nextElement(); + sf.setFeatureGroup("Uniprot"); + sequence.addSequenceFeature( sf ); } + } + sequence.setStart(absStart); + sequence.setEnd(absEnd); + + + int n = 0; + SequenceI seq2; + while (n < align.getHeight()) + { + //This loop enables multiple sequences with the same + //id to have features added and seq limits updated + seq2 = align.getSequenceAt(n); + if (seq2.getName().equals(idmatch)) + { - int index = line.indexOf(tag) + tag.length() + 1; + nonGapped = AlignSeq.extractGaps("-. ", seq2.getSequence()); - if (index == tag.length()) - { - return ""; - } + absStart = sequence.getSequence().indexOf(nonGapped); + absEnd = absStart + nonGapped.toString().length() - 1; - return line.substring(index, line.indexOf("\"", index + 1)); - } + // This is the Viewd alignment sequences + // No need to tell the user of the dataset updates + if ( (seq2.getStart() != absStart+sequence.getStart()) + || (seq2.getEnd() != absEnd+sequence.getStart())) + { + sbuffer.append("Updated: " + seq2.getName() + " " + + seq2.getStart() + "/" + seq2.getEnd() + + " to " + (absStart + sequence.getStart()) + "/" + + (absEnd + sequence.getStart()) + "\n"); - /** - * DOCUMENT ME! - * - * @param line DOCUMENT ME! - * @param tag DOCUMENT ME! - * @param out DOCUMENT ME! - * - * @return DOCUMENT ME! - */ - String parseElement(String line, String tag, RandomAccessFile out) - { - if (out != null) - { - try - { - out.writeBytes(line + "\n"); - } - catch (Exception ex) - { - } + seq2.setStart(absStart + sequence.getStart()); + seq2.setEnd(absEnd + sequence.getStart()); + } } - int index = line.indexOf(tag) + tag.length(); - - return line.substring(index, line.indexOf("