Not used in Jalview
[jalview.git] / src / jalview / io / SequenceFeatureFetcher.java
1 /*\r
2 * Jalview - A Sequence Alignment Editor and Viewer\r
3 * Copyright (C) 2005 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle\r
4 *\r
5 * This program is free software; you can redistribute it and/or\r
6 * modify it under the terms of the GNU General Public License\r
7 * as published by the Free Software Foundation; either version 2\r
8 * of the License, or (at your option) any later version.\r
9 *\r
10 * This program is distributed in the hope that it will be useful,\r
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of\r
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\r
13 * GNU General Public License for more details.\r
14 *\r
15 * You should have received a copy of the GNU General Public License\r
16 * along with this program; if not, write to the Free Software\r
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA\r
18 */\r
19 package jalview.io;\r
20 \r
21 import jalview.datamodel.*;\r
22 \r
23 import jalview.gui.*;\r
24 \r
25 import jalview.io.*;\r
26 \r
27 import java.io.*;\r
28 \r
29 import java.util.*;\r
30 \r
31 \r
32 public class SequenceFeatureFetcher implements Runnable {\r
33     AlignmentI align;\r
34     AlignmentPanel ap;\r
35     ArrayList unknownSequences;\r
36     CutAndPasteTransfer output = new CutAndPasteTransfer();\r
37     StringBuffer sbuffer = new StringBuffer();\r
38 \r
39     public SequenceFeatureFetcher(AlignmentI align, AlignmentPanel ap) {\r
40         unknownSequences = new ArrayList();\r
41         this.align = align;\r
42         this.ap = ap;\r
43 \r
44         Thread thread = new Thread(this);\r
45         thread.start();\r
46     }\r
47 \r
48     public void run() {\r
49 \r
50         RandomAccessFile out = null;\r
51 \r
52         try {\r
53             String cache = System.getProperty("user.home") + "/.jalview.uniprot.xml";\r
54 \r
55             File test = new File(cache);\r
56 \r
57             if (!test.exists()) {\r
58                 out = new RandomAccessFile(cache, "rw");\r
59                 out.writeBytes("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");\r
60                 out.writeBytes("<UNIPROT_CACHE>\n");\r
61             } else {\r
62                 out = new RandomAccessFile(cache, "rw");\r
63 \r
64                 // open exisiting cache and remove </UNIPROT_CACHE> from the end\r
65                 long lastLine = 0;\r
66                 String data;\r
67 \r
68                 while ((data = out.readLine()) != null) {\r
69                     if (data.indexOf("</entry>") > -1) {\r
70                         lastLine = out.getFilePointer();\r
71                     }\r
72                 }\r
73 \r
74                 out.seek(lastLine);\r
75             }\r
76 \r
77             int seqIndex = 0;\r
78             Vector sequences = align.getSequences();\r
79 \r
80             while (seqIndex < sequences.size()) {\r
81                 ArrayList ids = new ArrayList();\r
82 \r
83                 for (int i = 0; (seqIndex < sequences.size()) && (i < 50);\r
84                         seqIndex++, i++) {\r
85                     SequenceI sequence = (SequenceI) sequences.get(seqIndex);\r
86                     ids.add(sequence.getName());\r
87                 }\r
88 \r
89                 tryLocalCacheFirst(ids, align);\r
90 \r
91                 if (ids.size() > 0) {\r
92                     StringBuffer remainingIds = new StringBuffer("uniprot:");\r
93 \r
94                     for (int i = 0; i < ids.size(); i++)\r
95                         remainingIds.append(ids.get(i) + ";");\r
96 \r
97                     EBIFetchClient ebi = new EBIFetchClient();\r
98                     String[] result = ebi.fetchData(remainingIds.toString(),\r
99                             "xml", null);\r
100 \r
101                     if (result != null) {\r
102                         ReadUniprotFile(result, out, align);\r
103                     }\r
104                 }\r
105             }\r
106 \r
107             if (out != null) {\r
108                 out.writeBytes("</UNIPROT_CACHE>\n");\r
109                 out.close();\r
110             }\r
111         } catch (Exception ex) {\r
112             ex.printStackTrace();\r
113         }\r
114 \r
115         findMissingIds(align);\r
116 \r
117         if (sbuffer.length() > 0) {\r
118             output.setText(\r
119                 "Your sequences have been matched to Uniprot. Some of the ids have been\n" +\r
120                 "altered, most likely the start/end residue will have been updated.\n" +\r
121                 "Save your alignment to maintain the updated id.\n\n" +\r
122                 sbuffer.toString());\r
123             Desktop.addInternalFrame(output, "Sequence names updated ", 600, 300);\r
124         }\r
125 \r
126         if (unknownSequences.size() > 0) {\r
127             //ignore for now!!!!!!!!!!\r
128             //  WSWUBlastClient blastClient = new WSWUBlastClient(align, unknownSequences);\r
129         }\r
130 \r
131         jalview.gui.PaintRefresher.Refresh(null, align);\r
132     }\r
133 \r
134     void ReadUniprotFile(String[] result, RandomAccessFile out, AlignmentI align) {\r
135         SequenceI sequence = null;\r
136         Vector features = null;\r
137         String type;\r
138         String description;\r
139         String status;\r
140         String start;\r
141         String end;\r
142         String pdb = null;\r
143 \r
144         for (int r = 0; r < result.length; r++) {\r
145             if ((sequence == null) && (result[r].indexOf("<name>") > -1)) {\r
146                 long filePointer = 0;\r
147 \r
148                 if (out != null) {\r
149                     try {\r
150                         filePointer = out.getFilePointer();\r
151                         out.writeBytes("<entry>\n");\r
152                     } catch (Exception ex) {\r
153                     }\r
154                 }\r
155 \r
156                 String seqName = parseElement(result[r], "<name>", out);\r
157                 sequence = align.findName(seqName);\r
158 \r
159                 if (sequence == null) {\r
160                     sequence = align.findName(seqName.substring(0,\r
161                                 seqName.indexOf('_')));\r
162 \r
163                     if (sequence != null) {\r
164                         sbuffer.append("changing " + sequence.getName() +\r
165                             " to " + seqName + "\n");\r
166                         sequence.setName(seqName);\r
167                     }\r
168                 }\r
169 \r
170                 if (sequence == null) {\r
171                     sbuffer.append("UNIPROT updated suggestion is " +\r
172                         result[r] + "\n");\r
173                     sequence = align.findName(result[r]);\r
174 \r
175                     // this entry has been suggested by ebi.\r
176                     // doesn't match id in alignment file\r
177                     try {\r
178                         out.setLength(filePointer);\r
179                     } catch (Exception ex) {\r
180                     }\r
181 \r
182                     // now skip to next entry\r
183                     while (result[r].indexOf("</entry>") == -1)\r
184                         r++;\r
185                 }\r
186 \r
187                 features = new Vector();\r
188                 type = "";\r
189                 start = "0";\r
190                 end = "0";\r
191                 description = "";\r
192                 status = "";\r
193                 pdb = "";\r
194             }\r
195 \r
196             if (sequence == null) {\r
197                 continue;\r
198             }\r
199 \r
200             if (result[r].indexOf("<property type=\"pdb accession\"") > -1) {\r
201                 pdb = parseValue(result[r], "value=", out);\r
202                 sequence.setPDBId(pdb);\r
203             }\r
204 \r
205             if (result[r].indexOf("feature type") > -1) {\r
206                 type = parseValue(result[r], "type=", out);\r
207                 description = parseValue(result[r], "description=", null);\r
208                 status = parseValue(result[r], "status=", null);\r
209 \r
210                 while (result[r].indexOf("position") == -1) {\r
211                     r++; //<location>\r
212                 }\r
213 \r
214                 // r++;\r
215                 if (result[r].indexOf("begin") > -1) {\r
216                     start = parseValue(result[r], "position=", out);\r
217                     end = parseValue(result[++r], "position=", out);\r
218                 } else {\r
219                     start = parseValue(result[r], "position=", out);\r
220                     end = parseValue(result[r], "position=", null);\r
221                 }\r
222 \r
223                 int sstart = Integer.parseInt(start);\r
224                 int eend = Integer.parseInt(end);\r
225 \r
226                 if (out != null) {\r
227                     try {\r
228                         out.writeBytes("</feature>\n");\r
229                     } catch (Exception ex) {\r
230                     }\r
231                 }\r
232 \r
233                 SequenceFeature sf = new SequenceFeature(type, sstart, eend,\r
234                         description, status);\r
235                 features.add(sf);\r
236             }\r
237 \r
238             if (result[r].indexOf("<sequence") > -1) {\r
239                 StringBuffer seqString = new StringBuffer();\r
240 \r
241                 if (out != null) {\r
242                     try {\r
243                         out.writeBytes(result[r] + "\n");\r
244                     } catch (Exception ex) {\r
245                     }\r
246                 }\r
247 \r
248                 while (result[++r].indexOf("</sequence>") == -1) {\r
249                     seqString.append(result[r]);\r
250 \r
251                     if (out != null) {\r
252                         try {\r
253                             out.writeBytes(result[r] + "\n");\r
254                         } catch (Exception ex) {\r
255                         }\r
256                     }\r
257                 }\r
258 \r
259                 if (out != null) {\r
260                     try {\r
261                         out.writeBytes(result[r] + "\n");\r
262                     } catch (Exception ex) {\r
263                     }\r
264                 }\r
265 \r
266                 StringBuffer nonGapped = new StringBuffer();\r
267 \r
268                 for (int i = 0; i < sequence.getSequence().length(); i++) {\r
269                     if (!jalview.util.Comparison.isGap(sequence.getCharAt(i))) {\r
270                         nonGapped.append(sequence.getCharAt(i));\r
271                     }\r
272                 }\r
273 \r
274                 int absStart = seqString.toString().indexOf(nonGapped.toString());\r
275 \r
276                 if (absStart == -1) {\r
277                     unknownSequences.add(sequence.getName());\r
278                     features = null;\r
279                     sbuffer.append(sequence.getName() +\r
280                         " SEQUENCE NOT %100 MATCH \n");\r
281 \r
282                     continue;\r
283                 }\r
284 \r
285                 int absEnd = absStart + nonGapped.toString().length();\r
286                 absStart += 1;\r
287 \r
288                 if ((absStart != sequence.getStart()) ||\r
289                         (absEnd != sequence.getEnd())) {\r
290                     sbuffer.append("Updated: " + sequence.getName() + " " +\r
291                         sequence.getStart() + "/" + sequence.getEnd() +\r
292                         "  to  " + absStart + "/" + absEnd + "\n");\r
293                 }\r
294 \r
295                 sequence.setStart(absStart);\r
296                 sequence.setEnd(absEnd);\r
297             }\r
298 \r
299             if (result[r].indexOf("</entry>") > -1) {\r
300                 if (features != null) {\r
301                     sequence.setSequenceFeatures(features);\r
302                 }\r
303 \r
304                 features = null;\r
305                 sequence = null;\r
306 \r
307                 if (out != null) {\r
308                     try {\r
309                         out.writeBytes("</entry>\n");\r
310                     } catch (Exception ex) {\r
311                     }\r
312                 }\r
313             }\r
314         }\r
315     }\r
316 \r
317     void findMissingIds(AlignmentI align) {\r
318         String data;\r
319         ArrayList cachedIds = new ArrayList();\r
320 \r
321         try {\r
322             BufferedReader in = new BufferedReader(new FileReader(\r
323                         jalview.bin.Cache.getProperty("UNIPROT_CACHE")));\r
324 \r
325             while ((data = in.readLine()) != null) {\r
326                 if (data.indexOf("name") > -1) {\r
327                     String name = parseElement(data, "<name>", null);\r
328                     cachedIds.add(name);\r
329                 }\r
330             }\r
331         } catch (Exception ex) {\r
332             ex.printStackTrace();\r
333         }\r
334 \r
335         for (int i = 0; i < align.getHeight(); i++)\r
336             if (!cachedIds.contains(align.getSequenceAt(i).getName())) {\r
337                 unknownSequences.add(align.getSequenceAt(i).getName());\r
338             }\r
339     }\r
340 \r
341     void tryLocalCacheFirst(ArrayList ids, AlignmentI align) {\r
342         ArrayList cacheData = new ArrayList();\r
343 \r
344         try {\r
345             BufferedReader in = new BufferedReader(new FileReader(\r
346                         jalview.bin.Cache.getProperty("UNIPROT_CACHE")));\r
347 \r
348             // read through cache file, if the cache has sequences we're looking for\r
349             // add the lines to a new String array, Readthis new array and\r
350             // make sure we remove the ids from the list to retrieve from EBI\r
351             String data;\r
352 \r
353             while ((data = in.readLine()) != null) {\r
354                 if (data.indexOf("name") > -1) {\r
355                     String name = parseElement(data, "<name>", null);\r
356 \r
357                     if (ids.contains(name)) {\r
358                         cacheData.add("<entry>");\r
359                         cacheData.add(data);\r
360 \r
361                         while (data.indexOf("</entry>") == -1) {\r
362                             data = in.readLine();\r
363                             cacheData.add(data);\r
364                         }\r
365 \r
366                         cacheData.add(data);\r
367 \r
368                         ids.remove(name);\r
369                     }\r
370                 }\r
371             }\r
372         } catch (Exception ex) {\r
373             ex.printStackTrace();\r
374         }\r
375 \r
376         String[] localData = new String[cacheData.size()];\r
377         cacheData.toArray(localData);\r
378 \r
379         if ((localData != null) && (localData.length > 0)) {\r
380             ReadUniprotFile(localData, null, align);\r
381         }\r
382     }\r
383 \r
384     String parseValue(String line, String tag, RandomAccessFile out) {\r
385         if (out != null) {\r
386             try {\r
387                 out.writeBytes(line + "\n");\r
388             } catch (Exception ex) {\r
389             }\r
390         }\r
391 \r
392         int index = line.indexOf(tag) + tag.length() + 1;\r
393 \r
394         if (index == tag.length()) {\r
395             return "";\r
396         }\r
397 \r
398         return line.substring(index, line.indexOf("\"", index + 1));\r
399     }\r
400 \r
401     String parseElement(String line, String tag, RandomAccessFile out) {\r
402         if (out != null) {\r
403             try {\r
404                 out.writeBytes(line + "\n");\r
405             } catch (Exception ex) {\r
406             }\r
407         }\r
408 \r
409         int index = line.indexOf(tag) + tag.length();\r
410 \r
411         return line.substring(index, line.indexOf("</"));\r
412     }\r
413 }\r