0c82f6fb41cd813c1a9ae04d21bc5dacb7bbe5f3
[jalview.git] / src / jalview / io / SequenceFeatureFetcher.java
1 /*\r
2 * Jalview - A Sequence Alignment Editor and Viewer\r
3 * Copyright (C) 2005 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle\r
4 *\r
5 * This program is free software; you can redistribute it and/or\r
6 * modify it under the terms of the GNU General Public License\r
7 * as published by the Free Software Foundation; either version 2\r
8 * of the License, or (at your option) any later version.\r
9 *\r
10 * This program is distributed in the hope that it will be useful,\r
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of\r
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\r
13 * GNU General Public License for more details.\r
14 *\r
15 * You should have received a copy of the GNU General Public License\r
16 * along with this program; if not, write to the Free Software\r
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA\r
18 */\r
19 package jalview.io;\r
20 \r
21 import jalview.datamodel.*;\r
22 \r
23 import jalview.gui.*;\r
24 \r
25 import jalview.io.*;\r
26 \r
27 import java.io.*;\r
28 \r
29 import java.util.*;\r
30 \r
31 \r
32 /**\r
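 * Looks up the sequences of an alignment in Uniprot, consulting a local
 * cache file first and the EBI fetch client for the remainder, and applies
 * the returned PDB ids, sequence features and start/end positions to the
 * matching sequences. The work runs on a thread started by the constructor.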
34  *\r
35  * @author $author$\r
36  * @version $Revision$\r
37  */\r
38 public class SequenceFeatureFetcher implements Runnable\r
39 {\r
    /** Alignment whose sequences are looked up and updated. */
    AlignmentI align;
    /** Panel supplied to the constructor; stored but not read by the code in this class. */
    AlignmentPanel ap;
    /** Names of sequences that were not found in the cache or whose residues did not match. */
    ArrayList unknownSequences;
    /** Component handed to Desktop.addInternalFrame to display the change report. */
    CutAndPasteTransfer output = new CutAndPasteTransfer();
    /** Accumulates the report of renamed / repositioned / unmatched sequences. */
    StringBuffer sbuffer = new StringBuffer();
45 \r
46     /**\r
47      * Creates a new SequenceFeatureFetcher object.\r
48      *\r
49      * @param align DOCUMENT ME!\r
50      * @param ap DOCUMENT ME!\r
51      */\r
52     public SequenceFeatureFetcher(AlignmentI align, AlignmentPanel ap)\r
53     {\r
54         unknownSequences = new ArrayList();\r
55         this.align = align;\r
56         this.ap = ap;\r
57 \r
58         Thread thread = new Thread(this);\r
59         thread.start();\r
60     }\r
61 \r
62     /**\r
63      * DOCUMENT ME!\r
64      */\r
65     public void run()\r
66     {\r
67         RandomAccessFile out = null;\r
68 \r
69         try\r
70         {\r
71             String cache = System.getProperty("user.home") +\r
72                 "/.jalview.uniprot.xml";\r
73 \r
74             File test = new File(cache);\r
75 \r
76             if (!test.exists())\r
77             {\r
78                 out = new RandomAccessFile(cache, "rw");\r
79                 out.writeBytes("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");\r
80                 out.writeBytes("<UNIPROT_CACHE>\n");\r
81             }\r
82             else\r
83             {\r
84                 out = new RandomAccessFile(cache, "rw");\r
85 \r
86                 // open exisiting cache and remove </UNIPROT_CACHE> from the end\r
87                 long lastLine = 0;\r
88                 String data;\r
89 \r
90                 while ((data = out.readLine()) != null)\r
91                 {\r
92                     if (data.indexOf("</entry>") > -1)\r
93                     {\r
94                         lastLine = out.getFilePointer();\r
95                     }\r
96                 }\r
97 \r
98                 out.seek(lastLine);\r
99             }\r
100 \r
101             int seqIndex = 0;\r
102             Vector sequences = align.getSequences();\r
103 \r
104             while (seqIndex < sequences.size())\r
105             {\r
106                 ArrayList ids = new ArrayList();\r
107 \r
108                 for (int i = 0; (seqIndex < sequences.size()) && (i < 50);\r
109                         seqIndex++, i++)\r
110                 {\r
111                     SequenceI sequence = (SequenceI) sequences.get(seqIndex);\r
112                     ids.add(sequence.getName());\r
113                 }\r
114 \r
115                 tryLocalCacheFirst(ids, align);\r
116 \r
117                 if (ids.size() > 0)\r
118                 {\r
119                     StringBuffer remainingIds = new StringBuffer("uniprot:");\r
120 \r
121                     for (int i = 0; i < ids.size(); i++)\r
122                         remainingIds.append(ids.get(i) + ";");\r
123 \r
124                     EBIFetchClient ebi = new EBIFetchClient();\r
125                     String[] result = ebi.fetchData(remainingIds.toString(),\r
126                             "xml", null);\r
127 \r
128                     if (result != null)\r
129                     {\r
130                         ReadUniprotFile(result, out, align);\r
131                     }\r
132                 }\r
133             }\r
134 \r
135             if (out != null)\r
136             {\r
137                 out.writeBytes("</UNIPROT_CACHE>\n");\r
138                 out.close();\r
139             }\r
140         }\r
141         catch (Exception ex)\r
142         {\r
143             ex.printStackTrace();\r
144         }\r
145 \r
146         findMissingIds(align);\r
147 \r
148         if (sbuffer.length() > 0)\r
149         {\r
150             output.setText(\r
151                 "Your sequences have been matched to Uniprot. Some of the ids have been\n" +\r
152                 "altered, most likely the start/end residue will have been updated.\n" +\r
153                 "Save your alignment to maintain the updated id.\n\n" +\r
154                 sbuffer.toString());\r
155             Desktop.addInternalFrame(output, "Sequence names updated ", 600, 300);\r
156         }\r
157 \r
158         if (unknownSequences.size() > 0)\r
159         {\r
160             //ignore for now!!!!!!!!!!\r
161             //  WSWUBlastClient blastClient = new WSWUBlastClient(align, unknownSequences);\r
162         }\r
163 \r
164         jalview.gui.PaintRefresher.Refresh(null, align);\r
165     }\r
166 \r
167     /**\r
168      * DOCUMENT ME!\r
169      *\r
170      * @param result DOCUMENT ME!\r
171      * @param out DOCUMENT ME!\r
172      * @param align DOCUMENT ME!\r
173      */\r
174     void ReadUniprotFile(String[] result, RandomAccessFile out, AlignmentI align)\r
175     {\r
176         SequenceI sequence = null;\r
177         Vector features = null;\r
178         String type;\r
179         String description;\r
180         String status;\r
181         String start;\r
182         String end;\r
183         String pdb = null;\r
184 \r
185         for (int r = 0; r < result.length; r++)\r
186         {\r
187             if ((sequence == null) && (result[r].indexOf("<name>") > -1))\r
188             {\r
189                 long filePointer = 0;\r
190 \r
191                 if (out != null)\r
192                 {\r
193                     try\r
194                     {\r
195                         filePointer = out.getFilePointer();\r
196                         out.writeBytes("<entry>\n");\r
197                     }\r
198                     catch (Exception ex)\r
199                     {\r
200                     }\r
201                 }\r
202 \r
203                 String seqName = parseElement(result[r], "<name>", out);\r
204                 sequence = align.findName(seqName);\r
205 \r
206                 if (sequence == null)\r
207                 {\r
208                     sequence = align.findName(seqName.substring(0,\r
209                                 seqName.indexOf('_')));\r
210 \r
211                     if (sequence != null)\r
212                     {\r
213                         sbuffer.append("changing " + sequence.getName() +\r
214                             " to " + seqName + "\n");\r
215                         sequence.setName(seqName);\r
216                     }\r
217                 }\r
218 \r
219                 if (sequence == null)\r
220                 {\r
221                     sbuffer.append("UNIPROT updated suggestion is " +\r
222                         result[r] + "\n");\r
223                     sequence = align.findName(result[r]);\r
224 \r
225                     // this entry has been suggested by ebi.\r
226                     // doesn't match id in alignment file\r
227                     try\r
228                     {\r
229                         out.setLength(filePointer);\r
230                     }\r
231                     catch (Exception ex)\r
232                     {\r
233                     }\r
234 \r
235                     // now skip to next entry\r
236                     while (result[r].indexOf("</entry>") == -1)\r
237                         r++;\r
238                 }\r
239 \r
240                 features = new Vector();\r
241                 type = "";\r
242                 start = "0";\r
243                 end = "0";\r
244                 description = "";\r
245                 status = "";\r
246                 pdb = "";\r
247             }\r
248 \r
249             if (sequence == null)\r
250             {\r
251                 continue;\r
252             }\r
253 \r
254             if (result[r].indexOf("<property type=\"pdb accession\"") > -1)\r
255             {\r
256                 pdb = parseValue(result[r], "value=", out);\r
257                 sequence.setPDBId(pdb);\r
258             }\r
259 \r
260             if (result[r].indexOf("feature type") > -1)\r
261             {\r
262                 type = parseValue(result[r], "type=", out);\r
263                 description = parseValue(result[r], "description=", null);\r
264                 status = parseValue(result[r], "status=", null);\r
265 \r
266                 while (result[r].indexOf("position") == -1)\r
267                 {\r
268                     r++; //<location>\r
269                 }\r
270 \r
271                 // r++;\r
272                 if (result[r].indexOf("begin") > -1)\r
273                 {\r
274                     start = parseValue(result[r], "position=", out);\r
275                     end = parseValue(result[++r], "position=", out);\r
276                 }\r
277                 else\r
278                 {\r
279                     start = parseValue(result[r], "position=", out);\r
280                     end = parseValue(result[r], "position=", null);\r
281                 }\r
282 \r
283                 int sstart = Integer.parseInt(start);\r
284                 int eend = Integer.parseInt(end);\r
285 \r
286                 if (out != null)\r
287                 {\r
288                     try\r
289                     {\r
290                         out.writeBytes("</feature>\n");\r
291                     }\r
292                     catch (Exception ex)\r
293                     {\r
294                     }\r
295                 }\r
296 \r
297                 SequenceFeature sf = new SequenceFeature(type, sstart, eend,\r
298                         description, status);\r
299                 features.add(sf);\r
300             }\r
301 \r
302             if (result[r].indexOf("<sequence") > -1)\r
303             {\r
304                 StringBuffer seqString = new StringBuffer();\r
305 \r
306                 if (out != null)\r
307                 {\r
308                     try\r
309                     {\r
310                         out.writeBytes(result[r] + "\n");\r
311                     }\r
312                     catch (Exception ex)\r
313                     {\r
314                     }\r
315                 }\r
316 \r
317                 while (result[++r].indexOf("</sequence>") == -1)\r
318                 {\r
319                     seqString.append(result[r]);\r
320 \r
321                     if (out != null)\r
322                     {\r
323                         try\r
324                         {\r
325                             out.writeBytes(result[r] + "\n");\r
326                         }\r
327                         catch (Exception ex)\r
328                         {\r
329                         }\r
330                     }\r
331                 }\r
332 \r
333                 if (out != null)\r
334                 {\r
335                     try\r
336                     {\r
337                         out.writeBytes(result[r] + "\n");\r
338                     }\r
339                     catch (Exception ex)\r
340                     {\r
341                     }\r
342                 }\r
343 \r
344                 StringBuffer nonGapped = new StringBuffer();\r
345 \r
346                 for (int i = 0; i < sequence.getSequence().length(); i++)\r
347                 {\r
348                     if (!jalview.util.Comparison.isGap(sequence.getCharAt(i)))\r
349                     {\r
350                         nonGapped.append(sequence.getCharAt(i));\r
351                     }\r
352                 }\r
353 \r
354                 int absStart = seqString.toString().indexOf(nonGapped.toString());\r
355 \r
356                 if (absStart == -1)\r
357                 {\r
358                     unknownSequences.add(sequence.getName());\r
359                     features = null;\r
360                     sbuffer.append(sequence.getName() +\r
361                         " SEQUENCE NOT %100 MATCH \n");\r
362 \r
363                     continue;\r
364                 }\r
365 \r
366                 int absEnd = absStart + nonGapped.toString().length();\r
367                 absStart += 1;\r
368 \r
369                 if ((absStart != sequence.getStart()) ||\r
370                         (absEnd != sequence.getEnd()))\r
371                 {\r
372                     sbuffer.append("Updated: " + sequence.getName() + " " +\r
373                         sequence.getStart() + "/" + sequence.getEnd() +\r
374                         "  to  " + absStart + "/" + absEnd + "\n");\r
375                 }\r
376 \r
377                 sequence.setStart(absStart);\r
378                 sequence.setEnd(absEnd);\r
379             }\r
380 \r
381             if (result[r].indexOf("</entry>") > -1)\r
382             {\r
383                 if (features != null)\r
384                 {\r
385                     sequence.setSequenceFeatures(features);\r
386                 }\r
387 \r
388                 features = null;\r
389                 sequence = null;\r
390 \r
391                 if (out != null)\r
392                 {\r
393                     try\r
394                     {\r
395                         out.writeBytes("</entry>\n");\r
396                     }\r
397                     catch (Exception ex)\r
398                     {\r
399                     }\r
400                 }\r
401             }\r
402         }\r
403     }\r
404 \r
405     /**\r
406      * DOCUMENT ME!\r
407      *\r
408      * @param align DOCUMENT ME!\r
409      */\r
410     void findMissingIds(AlignmentI align)\r
411     {\r
412         String data;\r
413         ArrayList cachedIds = new ArrayList();\r
414 \r
415         try\r
416         {\r
417             if(jalview.bin.Cache.getProperty("UNIPROT_CACHE")==null)\r
418               return;\r
419 \r
420             BufferedReader in = new BufferedReader(new FileReader(\r
421                         jalview.bin.Cache.getProperty("UNIPROT_CACHE")));\r
422 \r
423             while ((data = in.readLine()) != null)\r
424             {\r
425                 if (data.indexOf("name") > -1)\r
426                 {\r
427                     String name = parseElement(data, "<name>", null);\r
428                     cachedIds.add(name);\r
429                 }\r
430             }\r
431         }\r
432         catch (Exception ex)\r
433         {\r
434             ex.printStackTrace();\r
435         }\r
436 \r
437         for (int i = 0; i < align.getHeight(); i++)\r
438             if (!cachedIds.contains(align.getSequenceAt(i).getName()))\r
439             {\r
440                 unknownSequences.add(align.getSequenceAt(i).getName());\r
441             }\r
442     }\r
443 \r
444     /**\r
445      * DOCUMENT ME!\r
446      *\r
447      * @param ids DOCUMENT ME!\r
448      * @param align DOCUMENT ME!\r
449      */\r
450     void tryLocalCacheFirst(ArrayList ids, AlignmentI align)\r
451     {\r
452         ArrayList cacheData = new ArrayList();\r
453 \r
454         try\r
455         {\r
456             if(jalview.bin.Cache.getProperty("UNIPROT_CACHE")==null)\r
457               return;\r
458 \r
459             BufferedReader in = new BufferedReader(new FileReader(\r
460                         jalview.bin.Cache.getProperty("UNIPROT_CACHE")));\r
461 \r
462             // read through cache file, if the cache has sequences we're looking for\r
463             // add the lines to a new String array, Readthis new array and\r
464             // make sure we remove the ids from the list to retrieve from EBI\r
465             String data;\r
466 \r
467             while ((data = in.readLine()) != null)\r
468             {\r
469                 if (data.indexOf("name") > -1)\r
470                 {\r
471                     String name = parseElement(data, "<name>", null);\r
472 \r
473                     if (ids.contains(name))\r
474                     {\r
475                         cacheData.add("<entry>");\r
476                         cacheData.add(data);\r
477 \r
478                         while (data.indexOf("</entry>") == -1)\r
479                         {\r
480                             data = in.readLine();\r
481                             cacheData.add(data);\r
482                         }\r
483 \r
484                         cacheData.add(data);\r
485 \r
486                         ids.remove(name);\r
487                     }\r
488                 }\r
489             }\r
490         }\r
491         catch (Exception ex)\r
492         {\r
493             ex.printStackTrace();\r
494         }\r
495 \r
496         String[] localData = new String[cacheData.size()];\r
497         cacheData.toArray(localData);\r
498 \r
499         if ((localData != null) && (localData.length > 0))\r
500         {\r
501             ReadUniprotFile(localData, null, align);\r
502         }\r
503     }\r
504 \r
505     /**\r
506      * DOCUMENT ME!\r
507      *\r
508      * @param line DOCUMENT ME!\r
509      * @param tag DOCUMENT ME!\r
510      * @param out DOCUMENT ME!\r
511      *\r
512      * @return DOCUMENT ME!\r
513      */\r
514     String parseValue(String line, String tag, RandomAccessFile out)\r
515     {\r
516         if (out != null)\r
517         {\r
518             try\r
519             {\r
520                 out.writeBytes(line + "\n");\r
521             }\r
522             catch (Exception ex)\r
523             {\r
524             }\r
525         }\r
526 \r
527         int index = line.indexOf(tag) + tag.length() + 1;\r
528 \r
529         if (index == tag.length())\r
530         {\r
531             return "";\r
532         }\r
533 \r
534         return line.substring(index, line.indexOf("\"", index + 1));\r
535     }\r
536 \r
537     /**\r
538      * DOCUMENT ME!\r
539      *\r
540      * @param line DOCUMENT ME!\r
541      * @param tag DOCUMENT ME!\r
542      * @param out DOCUMENT ME!\r
543      *\r
544      * @return DOCUMENT ME!\r
545      */\r
546     String parseElement(String line, String tag, RandomAccessFile out)\r
547     {\r
548         if (out != null)\r
549         {\r
550             try\r
551             {\r
552                 out.writeBytes(line + "\n");\r
553             }\r
554             catch (Exception ex)\r
555             {\r
556             }\r
557         }\r
558 \r
559         int index = line.indexOf(tag) + tag.length();\r
560 \r
561         return line.substring(index, line.indexOf("</"));\r
562     }\r
563 }\r