X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=src%2Fjalview%2Fio%2FSequenceFeatureFetcher.java;h=61b25f932b70ed01b65d03ddc7dbc53b3e7a218d;hb=b01c4edc00dbeedd6b53777310d4232a4104a9ec;hp=89d84b03b12b760a0474b4e40953beaf705aff3c;hpb=efc31b4a8d5cee63555586804a2b79c06bdb5a14;p=jalview.git
diff --git a/src/jalview/io/SequenceFeatureFetcher.java b/src/jalview/io/SequenceFeatureFetcher.java
index 89d84b0..61b25f9 100755
--- a/src/jalview/io/SequenceFeatureFetcher.java
+++ b/src/jalview/io/SequenceFeatureFetcher.java
@@ -22,12 +22,16 @@ import jalview.datamodel.*;
import jalview.gui.*;
-import jalview.io.*;
-
import java.io.*;
import java.util.*;
+import org.exolab.castor.mapping.Mapping;
+
+import org.exolab.castor.xml.*;
+import jalview.analysis.AlignSeq;
+
+
/**
* DOCUMENT ME!
@@ -37,521 +41,319 @@ import java.util.*;
*/
public class SequenceFeatureFetcher implements Runnable
{
- AlignmentI align;
- AlignmentPanel ap;
- ArrayList unknownSequences;
- CutAndPasteTransfer output = new CutAndPasteTransfer();
- StringBuffer sbuffer = new StringBuffer();
-
- /**
- * Creates a new SequenceFeatureFetcher object.
- *
- * @param align DOCUMENT ME!
- * @param ap DOCUMENT ME!
- */
- public SequenceFeatureFetcher(AlignmentI align, AlignmentPanel ap)
- {
- unknownSequences = new ArrayList();
- this.align = align;
- this.ap = ap;
-
- Thread thread = new Thread(this);
- thread.start();
- }
-
- /**
- * DOCUMENT ME!
- */
- public void run()
- {
- RandomAccessFile out = null;
-
- try
- {
- String cache = System.getProperty("user.home") +
- "/.jalview.uniprot.xml";
-
- File test = new File(cache);
- if (!test.exists())
- {
- out = new RandomAccessFile(cache, "rw");
- out.writeBytes("\n");
- out.writeBytes("\n");
- }
- else
- {
- out = new RandomAccessFile(cache, "rw");
-
- // open exisiting cache and remove from the end
- long lastLine = 0;
- String data;
-
- while ((data = out.readLine()) != null)
- {
- if (data.indexOf("") > -1)
- {
- lastLine = out.getFilePointer();
- }
- }
-
- out.seek(lastLine);
- }
-
- int seqIndex = 0;
- Vector sequences = align.getSequences();
-
- while (seqIndex < sequences.size())
- {
- ArrayList ids = new ArrayList();
+ AlignmentI align;
+ AlignmentI dataset;
+ AlignmentPanel ap;
+ ArrayList unknownSequences;
+ CutAndPasteTransfer output = new CutAndPasteTransfer();
+ StringBuffer sbuffer = new StringBuffer();
+ boolean uniprotFlag = false;
- for (int i = 0; (seqIndex < sequences.size()) && (i < 50);
- seqIndex++, i++)
- {
- SequenceI sequence = (SequenceI) sequences.get(seqIndex);
- ids.add(sequence.getName());
- }
+ public SequenceFeatureFetcher()
+ {} // intentionally empty: unlike the (AlignmentI, AlignmentPanel) constructor, no thread is started
- tryLocalCacheFirst(ids, align);
+ public Vector getUniprotEntries(File file)
+ {
- if (ids.size() > 0)
- {
- StringBuffer remainingIds = new StringBuffer("uniprot:");
+ UniprotFile uni = new UniprotFile();
+ try
+ {
+ // 1. Load the mapping information from the file
+ Mapping map = new Mapping(uni.getClass().getClassLoader());
+ java.net.URL url = getClass().getResource("/uniprot_mapping.xml");
+ map.loadMapping(url);
- for (int i = 0; i < ids.size(); i++)
- remainingIds.append(ids.get(i) + ";");
+ // 2. Unmarshal the data
+ Unmarshaller unmar = new Unmarshaller();
+ unmar.setIgnoreExtraElements(true);
+ unmar.setMapping(map);
+ uni = (UniprotFile) unmar.unmarshal(new FileReader(file));
- EBIFetchClient ebi = new EBIFetchClient();
- String[] result = ebi.fetchData(remainingIds.toString(),
- "xml", null);
+ }
+ catch (Exception e)
+ {
+ System.out.println("Error getUniprotEntries() "+e);
+ }
+ return uni.getUniprotEntries();
+ }
+
+ /**
+ * Creates a fetcher and immediately starts a new thread that executes run().
+ *
+ * @param align alignment to process; its dataset (align.getDataset()) is also stored
+ * @param ap alignment panel, stored in the ap field (repainted by promptBeforeBlast())
+ */
+ public SequenceFeatureFetcher(AlignmentI align, AlignmentPanel ap)
+ {
+ unknownSequences = new ArrayList();
+ this.align = align;
+ this.dataset = align.getDataset();
+ this.ap = ap;
+
+ Thread thread = new Thread(this);
+ thread.start();
+ }
+
+ /**
+ * Thread body: queries EBI for the dataset sequence names, at most 50 per request.
+ */
+ public void run()
+ {
+ try
+ {
+ int seqIndex = 0;
+ Vector sequences = dataset.getSequences();
- if (result != null)
- {
- ReadUniprotFile(result, out, align);
- }
- }
- }
+ while (seqIndex < sequences.size())
+ {
+ Vector ids = new Vector();
- if (out != null)
+ for (int i = 0; (seqIndex < sequences.size()) && (i < 50);
+ seqIndex++, i++)
+ {
+ Sequence sequence = (Sequence) sequences.get(seqIndex);
+ Vector uprefs = jalview.util.DBRefUtils.selectRefs(sequence.getDBRef(), new String[] { "Uniprot"});
+ if (uprefs!=null)
+ {
+ // we know the id for this entry, so don't note its ID in the unknownSequences list
+ for (int j=0,k=uprefs.size(); j\n");
- out.close();
+ ids.add(sequence.getName());
+ unknownSequences.add(sequence);
}
+ }
}
- catch (Exception ex)
- {
- ex.printStackTrace();
- }
-
- findMissingIds(align);
- if (sbuffer.length() > 0)
+ ///////////////////////////////////
+ ///READ FROM EBI
+ if (ids.size() > 0)
{
- output.setText(
- "Your sequences have been matched to Uniprot. Some of the ids have been\n" +
- "altered, most likely the start/end residue will have been updated.\n" +
- "Save your alignment to maintain the updated id.\n\n" +
- sbuffer.toString());
- Desktop.addInternalFrame(output, "Sequence names updated ", 600, 300);
+ StringBuffer remainingIds = new StringBuffer("uniprot:");
+ for (int i = 0; i < ids.size(); i++)
+ {
+ if(ids.get(i).toString().indexOf("|")>-1)
+ {
+ remainingIds.append(ids.get(i).toString().substring(
+ ids.get(i).toString().lastIndexOf("|") + 1));
+ uniprotFlag = true;
+ }
+ remainingIds.append(ids.get(i) + ";");
+ }
+ EBIFetchClient ebi = new EBIFetchClient();
+ File file = ebi.fetchDataAsFile(remainingIds.toString(),
+ "xml", "raw");
+
+
+
+ if (file != null)
+ {
+ ReadUniprotFile(file, ids);
+ }
}
+ }
+ }
+ catch (Exception ex)
+ {
+ ex.printStackTrace();
+ }
- if (unknownSequences.size() > 0)
- {
- //ignore for now!!!!!!!!!!
- // WSWUBlastClient blastClient = new WSWUBlastClient(align, unknownSequences);
- }
+ if (sbuffer.length() > 0)
+ {
+ output.setText(
+ "Your sequences have been matched to Uniprot. Some of the ids have been\n" +
+ "altered, most likely the start/end residue will have been updated.\n" +
+ "Save your alignment to maintain the updated id.\n\n" +
+ sbuffer.toString());
+ Desktop.addInternalFrame(output, "Sequence names updated ", 600, 300);
+ // The above is the dataset, we must now find out the index
+ // of the viewed sequence
- jalview.gui.PaintRefresher.Refresh(null, align);
}
- /**
- * DOCUMENT ME!
- *
- * @param result DOCUMENT ME!
- * @param out DOCUMENT ME!
- * @param align DOCUMENT ME!
- */
- void ReadUniprotFile(String[] result, RandomAccessFile out, AlignmentI align)
- {
- SequenceI sequence = null;
- Vector features = null;
- String type;
- String description;
- String status;
- String start;
- String end;
- String pdb = null;
-
- for (int r = 0; r < result.length; r++)
- {
- if ((sequence == null) && (result[r].indexOf("") > -1))
- {
- long filePointer = 0;
-
- if (out != null)
- {
- try
- {
- filePointer = out.getFilePointer();
- out.writeBytes("\n");
- }
- catch (Exception ex)
- {
- }
- }
-
- String seqName = parseElement(result[r], "", out);
- sequence = align.findName(seqName);
-
- if (sequence == null)
- {
- sequence = align.findName(seqName.substring(0,
- seqName.indexOf('_')));
-
- if (sequence != null)
- {
- sbuffer.append("changing " + sequence.getName() +
- " to " + seqName + "\n");
- sequence.setName(seqName);
- }
- }
-
- if (sequence == null)
- {
- sbuffer.append("UNIPROT updated suggestion is " +
- result[r] + "\n");
- sequence = align.findName(result[r]);
-
- // this entry has been suggested by ebi.
- // doesn't match id in alignment file
- try
- {
- out.setLength(filePointer);
- }
- catch (Exception ex)
- {
- }
-
- // now skip to next entry
- while (result[r].indexOf("") == -1)
- r++;
- }
-
- features = new Vector();
- type = "";
- start = "0";
- end = "0";
- description = "";
- status = "";
- pdb = "";
- }
+ promptBeforeBlast();
- if (sequence == null)
- {
- continue;
- }
+ }
- if (result[r].indexOf(" -1)
- {
- pdb = parseValue(result[r], "value=", out);
- sequence.setPDBId(pdb);
- }
- if (result[r].indexOf("feature type") > -1)
- {
- type = parseValue(result[r], "type=", out);
- description = parseValue(result[r], "description=", null);
- status = parseValue(result[r], "status=", null);
-
- while (result[r].indexOf("position") == -1)
- {
- r++; //
- }
-
- // r++;
- if (result[r].indexOf("begin") > -1)
- {
- start = parseValue(result[r], "position=", out);
- end = parseValue(result[++r], "position=", out);
- }
- else
- {
- start = parseValue(result[r], "position=", out);
- end = parseValue(result[r], "position=", null);
- }
-
- int sstart = Integer.parseInt(start);
- int eend = Integer.parseInt(end);
-
- if (out != null)
- {
- try
- {
- out.writeBytes("\n");
- }
- catch (Exception ex)
- {
- }
- }
-
- SequenceFeature sf = new SequenceFeature(type, sstart, eend,
- description, status);
- features.add(sf);
- }
+ void promptBeforeBlast()
+ {
+ // This must be outside the run() body as java 1.5
+ // will not return any value from the OptionPane to the expired thread.
+ if (unknownSequences.size() > 0)
+ {
+ // int reply = javax.swing.JOptionPane.showConfirmDialog(
+ // Desktop.desktop, "Couldn't find a match for "+unknownSequences.size()+" sequences."
+ // +"\nPerform blast for unknown sequences?",
+ // "Blast for Unidentified Sequences",
+ // javax.swing.JOptionPane.YES_NO_OPTION, javax.swing.JOptionPane.QUESTION_MESSAGE);
+ javax.swing.JOptionPane.showMessageDialog(
+ Desktop.desktop, "Couldn't find a match for "+unknownSequences.size()+" sequences.",
+ "Unidentified Sequences",
+ javax.swing.JOptionPane.WARNING_MESSAGE);
- if (result[r].indexOf(" -1)
- {
- StringBuffer seqString = new StringBuffer();
-
- if (out != null)
- {
- try
- {
- out.writeBytes(result[r] + "\n");
- }
- catch (Exception ex)
- {
- }
- }
-
- while (result[++r].indexOf("") == -1)
- {
- seqString.append(result[r]);
-
- if (out != null)
- {
- try
- {
- out.writeBytes(result[r] + "\n");
- }
- catch (Exception ex)
- {
- }
- }
- }
-
- if (out != null)
- {
- try
- {
- out.writeBytes(result[r] + "\n");
- }
- catch (Exception ex)
- {
- }
- }
-
- StringBuffer nonGapped = new StringBuffer();
-
- for (int i = 0; i < sequence.getSequence().length(); i++)
- {
- if (!jalview.util.Comparison.isGap(sequence.getCharAt(i)))
- {
- nonGapped.append(sequence.getCharAt(i));
- }
- }
-
- int absStart = seqString.toString().indexOf(nonGapped.toString());
-
- if (absStart == -1)
- {
- unknownSequences.add(sequence.getName());
- features = null;
- sbuffer.append(sequence.getName() +
- " SEQUENCE NOT %100 MATCH \n");
-
- continue;
- }
-
- int absEnd = absStart + nonGapped.toString().length();
- absStart += 1;
-
- if ((absStart != sequence.getStart()) ||
- (absEnd != sequence.getEnd()))
- {
- sbuffer.append("Updated: " + sequence.getName() + " " +
- sequence.getStart() + "/" + sequence.getEnd() +
- " to " + absStart + "/" + absEnd + "\n");
- }
-
- sequence.setStart(absStart);
- sequence.setEnd(absEnd);
- }
- if (result[r].indexOf("") > -1)
- {
- if (features != null)
- {
- sequence.setSequenceFeatures(features);
- }
-
- features = null;
- sequence = null;
-
- if (out != null)
- {
- try
- {
- out.writeBytes("\n");
- }
- catch (Exception ex)
- {
- }
- }
- }
- }
- }
+ // if(reply == javax.swing.JOptionPane.YES_OPTION)
+ // new WSWUBlastClient(ap, align, unknownSequences);
+ }
- /**
- * DOCUMENT ME!
- *
- * @param align DOCUMENT ME!
- */
- void findMissingIds(AlignmentI align)
- {
- String data;
- ArrayList cachedIds = new ArrayList();
- try
+ ap.repaint();
+ }
+
+ /**
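+ * Matches the Uniprot entries read from a fetched file to dataset sequences,
+ * adding features and PDB ids and updating start/end of each match.
+ *
+ * @param file file passed to getUniprotEntries(); nothing is done if it does not exist
+ * @param ids names submitted for lookup; the name of each matched sequence is removed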
+ */
+ void ReadUniprotFile(File file, Vector ids)
+ {
+ if(!file.exists())
+ return;
+
+ SequenceI sequence = null;
+
+ Vector entries = getUniprotEntries(file);
+
+ int i, iSize = entries==null?0:entries.size();
+ UniprotEntry entry;
+ for (i = 0; i < iSize; i++)
+ {
+ entry = (UniprotEntry) entries.elementAt(i);
+ String idmatch = entry.getAccession().elementAt(0).toString();
+ sequence = dataset.findName(idmatch);
+
+ if (sequence == null)
+ {
+ //Sequence maybe Name, not Accession
+ idmatch = entry.getName().elementAt(0).toString();
+ sequence = dataset.findName(idmatch);
+ }
+
+ if(sequence!=null)
+ ids.remove(sequence.getName());
+
+ else if (sequence == null && uniprotFlag)
+ {
+ sequence = dataset.findName("UniProt/Swiss-Prot|"+entry.getAccession().elementAt(0)+"|"+idmatch);
+ ids.remove(idmatch);
+ }
+
+ if(sequence ==null)
+ {
+ System.out.println(idmatch+" not found");
+ continue;
+ }
+
+
+ String nonGapped = AlignSeq.extractGaps("-. ", sequence.getSequence());
+
+ int absStart = entry.getUniprotSequence().getContent().indexOf(
+ nonGapped.toString());
+
+ if (absStart == -1)
+ {
+ // Is UniprotSequence contained in dataset sequence?
+ absStart = nonGapped.toString().indexOf(entry.getUniprotSequence().getContent());
+ if(absStart == -1)
{
- BufferedReader in = new BufferedReader(new FileReader(
- jalview.bin.Cache.getProperty("UNIPROT_CACHE")));
+ sbuffer.append(sequence.getName() +
+ " SEQUENCE NOT %100 MATCH \n");
- while ((data = in.readLine()) != null)
- {
- if (data.indexOf("name") > -1)
- {
- String name = parseElement(data, "", null);
- cachedIds.add(name);
- }
- }
+ continue;
}
- catch (Exception ex)
+ else
{
- ex.printStackTrace();
- }
- for (int i = 0; i < align.getHeight(); i++)
- if (!cachedIds.contains(align.getSequenceAt(i).getName()))
+ if(entry.getFeature()!=null)
+ {
+ Enumeration e = entry.getFeature().elements();
+ while (e.hasMoreElements())
{
- unknownSequences.add(align.getSequenceAt(i).getName());
+ SequenceFeature sf = (SequenceFeature) e.nextElement();
+ sf.setBegin(sf.getBegin() + absStart + 1);
+ sf.setEnd(sf.getEnd() + absStart + 1);
}
- }
+ }
- /**
- * DOCUMENT ME!
- *
- * @param ids DOCUMENT ME!
- * @param align DOCUMENT ME!
- */
- void tryLocalCacheFirst(ArrayList ids, AlignmentI align)
- {
- ArrayList cacheData = new ArrayList();
+ sbuffer.append(sequence.getName() +
+ " HAS "+absStart+" PREFIXED RESIDUES COMPARED TO UNIPROT - ANY SEQUENCE FEATURES"
+ +" HAVE BEEN ADJUSTED ACCORDINGLY \n");
+ absStart = 0;
+ }
- try
- {
- BufferedReader in = new BufferedReader(new FileReader(
- jalview.bin.Cache.getProperty("UNIPROT_CACHE")));
+ }
- // read through cache file, if the cache has sequences we're looking for
- // add the lines to a new String array, Readthis new array and
- // make sure we remove the ids from the list to retrieve from EBI
- String data;
+ unknownSequences.remove(sequence);
- while ((data = in.readLine()) != null)
- {
- if (data.indexOf("name") > -1)
- {
- String name = parseElement(data, "", null);
-
- if (ids.contains(name))
- {
- cacheData.add("");
- cacheData.add(data);
-
- while (data.indexOf("") == -1)
- {
- data = in.readLine();
- cacheData.add(data);
- }
-
- cacheData.add(data);
-
- ids.remove(name);
- }
- }
- }
- }
- catch (Exception ex)
- {
- ex.printStackTrace();
- }
+ int absEnd = absStart + nonGapped.toString().length();
+ absStart += 1;
- String[] localData = new String[cacheData.size()];
- cacheData.toArray(localData);
+ Enumeration e = entry.getDbReference().elements();
+ Vector onlyPdbEntries = new Vector();
+ while(e.hasMoreElements())
+ {
+ PDBEntry pdb = (PDBEntry)e.nextElement();
+ if(!pdb.getType().equals("PDB"))
+ continue;
- if ((localData != null) && (localData.length > 0))
- {
- ReadUniprotFile(localData, null, align);
- }
- }
+ onlyPdbEntries.addElement(pdb);
+ }
- /**
- * DOCUMENT ME!
- *
- * @param line DOCUMENT ME!
- * @param tag DOCUMENT ME!
- * @param out DOCUMENT ME!
- *
- * @return DOCUMENT ME!
- */
- String parseValue(String line, String tag, RandomAccessFile out)
- {
- if (out != null)
+ sequence.setPDBId(onlyPdbEntries);
+ if (entry.getFeature()!=null) {
+ e = entry.getFeature().elements();
+ while (e.hasMoreElements())
{
- try
- {
- out.writeBytes(line + "\n");
- }
- catch (Exception ex)
- {
- }
+ SequenceFeature sf = (SequenceFeature) e.nextElement();
+ sf.setFeatureGroup("Uniprot");
+ sequence.addSequenceFeature( sf );
}
+ }
+ sequence.setStart(absStart);
+ sequence.setEnd(absEnd);
+
+
+ int n = 0;
+ SequenceI seq2;
+ while (n < align.getHeight())
+ {
+ //This loop enables multiple sequences with the same
+ //id to have features added and seq limits updated
+ seq2 = align.getSequenceAt(n);
+ if (seq2.getName().equals(idmatch))
+ {
- int index = line.indexOf(tag) + tag.length() + 1;
+ nonGapped = AlignSeq.extractGaps("-. ", seq2.getSequence());
- if (index == tag.length())
- {
- return "";
- }
+ absStart = sequence.getSequence().indexOf(nonGapped);
+ absEnd = absStart + nonGapped.toString().length() - 1;
- return line.substring(index, line.indexOf("\"", index + 1));
- }
+ // These are the viewed alignment sequences
+ // No need to tell the user of the dataset updates
+ if ( (seq2.getStart() != absStart+sequence.getStart())
+ || (seq2.getEnd() != absEnd+sequence.getStart()))
+ {
+ sbuffer.append("Updated: " + seq2.getName() + " " +
+ seq2.getStart() + "/" + seq2.getEnd() +
+ " to " + (absStart + sequence.getStart()) + "/" +
+ (absEnd + sequence.getStart()) + "\n");
- /**
- * DOCUMENT ME!
- *
- * @param line DOCUMENT ME!
- * @param tag DOCUMENT ME!
- * @param out DOCUMENT ME!
- *
- * @return DOCUMENT ME!
- */
- String parseElement(String line, String tag, RandomAccessFile out)
- {
- if (out != null)
- {
- try
- {
- out.writeBytes(line + "\n");
- }
- catch (Exception ex)
- {
- }
+ seq2.setStart(absStart + sequence.getStart());
+ seq2.setEnd(absEnd + sequence.getStart());
+ }
}
- int index = line.indexOf(tag) + tag.length();
-
- return line.substring(index, line.indexOf(""));
+ n++;
+ }
}
+ }
}
+
+