Quick fix for reading sequence
[jalview.git] / src / jalview / io / SequenceFeatureFetcher.java
1 /*\r
2 * Jalview - A Sequence Alignment Editor and Viewer\r
3 * Copyright (C) 2005 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle\r
4 *\r
5 * This program is free software; you can redistribute it and/or\r
6 * modify it under the terms of the GNU General Public License\r
7 * as published by the Free Software Foundation; either version 2\r
8 * of the License, or (at your option) any later version.\r
9 *\r
10 * This program is distributed in the hope that it will be useful,\r
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of\r
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\r
13 * GNU General Public License for more details.\r
14 *\r
15 * You should have received a copy of the GNU General Public License\r
16 * along with this program; if not, write to the Free Software\r
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA\r
18 */\r
19 package jalview.io;\r
20 \r
21 import jalview.datamodel.*;\r
22 \r
23 import jalview.gui.*;\r
24 \r
25 import java.io.*;\r
26 \r
27 import java.util.*;\r
28 \r
29 \r
30 /**\r
31  * DOCUMENT ME!\r
32  *\r
33  * @author $author$\r
34  * @version $Revision$\r
35  */\r
36 public class SequenceFeatureFetcher implements Runnable\r
37 {\r
38     AlignmentI align;\r
39     AlignmentPanel ap;\r
40     ArrayList unknownSequences;\r
41     CutAndPasteTransfer output = new CutAndPasteTransfer();\r
42     StringBuffer sbuffer = new StringBuffer();\r
43 \r
44     /**\r
45      * Creates a new SequenceFeatureFetcher object.\r
46      *\r
47      * @param align DOCUMENT ME!\r
48      * @param ap DOCUMENT ME!\r
49      */\r
50     public SequenceFeatureFetcher(AlignmentI align, AlignmentPanel ap)\r
51     {\r
52         unknownSequences = new ArrayList();\r
53         this.align = align;\r
54         this.ap = ap;\r
55 \r
56         Thread thread = new Thread(this);\r
57         thread.start();\r
58     }\r
59 \r
60     /**\r
61      * DOCUMENT ME!\r
62      */\r
63     public void run()\r
64     {\r
65         RandomAccessFile out = null;\r
66 \r
67         try\r
68         {\r
69             String cache = System.getProperty("user.home") +\r
70                 "/.jalview.uniprot.xml";\r
71 \r
72             File test = new File(cache);\r
73 \r
74             if (!test.exists())\r
75             {\r
76                 out = new RandomAccessFile(cache, "rw");\r
77                 out.writeBytes("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");\r
78                 out.writeBytes("<UNIPROT_CACHE>\n");\r
79             }\r
80             else\r
81             {\r
82                 out = new RandomAccessFile(cache, "rw");\r
83 \r
84                 // open exisiting cache and remove </UNIPROT_CACHE> from the end\r
85                 long lastLine = 0;\r
86                 String data;\r
87 \r
88                 while ((data = out.readLine()) != null)\r
89                 {\r
90                     if (data.indexOf("</entry>") > -1)\r
91                     {\r
92                         lastLine = out.getFilePointer();\r
93                     }\r
94                 }\r
95 \r
96                 out.seek(lastLine);\r
97             }\r
98 \r
99             int seqIndex = 0;\r
100             Vector sequences = align.getSequences();\r
101 \r
102             while (seqIndex < sequences.size())\r
103             {\r
104                 ArrayList ids = new ArrayList();\r
105 \r
106                 for (int i = 0; (seqIndex < sequences.size()) && (i < 50);\r
107                         seqIndex++, i++)\r
108                 {\r
109                     SequenceI sequence = (SequenceI) sequences.get(seqIndex);\r
110                     ids.add(sequence.getName());\r
111                 }\r
112 \r
113                 tryLocalCacheFirst(ids, align);\r
114 \r
115                 if (ids.size() > 0)\r
116                 {\r
117                     StringBuffer remainingIds = new StringBuffer("uniprot:");\r
118 \r
119                     for (int i = 0; i < ids.size(); i++)\r
120                         remainingIds.append(ids.get(i) + ";");\r
121 \r
122                     EBIFetchClient ebi = new EBIFetchClient();\r
123                     String[] result = ebi.fetchData(remainingIds.toString(),\r
124                             "xml", null);\r
125 \r
126                     if (result != null)\r
127                     {\r
128                         ReadUniprotFile(result, out, align);\r
129                     }\r
130                 }\r
131             }\r
132 \r
133             if (out != null)\r
134             {\r
135                 out.writeBytes("</UNIPROT_CACHE>\n");\r
136                 out.close();\r
137             }\r
138         }\r
139         catch (Exception ex)\r
140         {\r
141             ex.printStackTrace();\r
142         }\r
143 \r
144         findMissingIds(align);\r
145 \r
146         if (sbuffer.length() > 0)\r
147         {\r
148             output.setText(\r
149                 "Your sequences have been matched to Uniprot. Some of the ids have been\n" +\r
150                 "altered, most likely the start/end residue will have been updated.\n" +\r
151                 "Save your alignment to maintain the updated id.\n\n" +\r
152                 sbuffer.toString());\r
153             Desktop.addInternalFrame(output, "Sequence names updated ", 600, 300);\r
154         }\r
155 \r
156         if (unknownSequences.size() > 0)\r
157         {\r
158             //ignore for now!!!!!!!!!!\r
159             //  WSWUBlastClient blastClient = new WSWUBlastClient(align, unknownSequences);\r
160         }\r
161 \r
162         jalview.gui.PaintRefresher.Refresh(null, align);\r
163     }\r
164 \r
165     /**\r
166      * DOCUMENT ME!\r
167      *\r
168      * @param result DOCUMENT ME!\r
169      * @param out DOCUMENT ME!\r
170      * @param align DOCUMENT ME!\r
171      */\r
172     void ReadUniprotFile(String[] result, RandomAccessFile out, AlignmentI align)\r
173     {\r
174         SequenceI sequence = null;\r
175         Vector features = null;\r
176         String type;\r
177         String description;\r
178         String status;\r
179         String start;\r
180         String end;\r
181         String pdb = null;\r
182 \r
183         for (int r = 0; r < result.length; r++)\r
184         {\r
185             if ((sequence == null) && (result[r].indexOf("<name>") > -1))\r
186             {\r
187                 long filePointer = 0;\r
188 \r
189                 if (out != null)\r
190                 {\r
191                     try\r
192                     {\r
193                         filePointer = out.getFilePointer();\r
194                         out.writeBytes("<entry>\n");\r
195                     }\r
196                     catch (Exception ex)\r
197                     {\r
198                     }\r
199                 }\r
200 \r
201                 String seqName = parseElement(result[r], "<name>", out);\r
202                 sequence = align.findName(seqName);\r
203 \r
204                 if (sequence == null)\r
205                 {\r
206                     sequence = align.findName(seqName.substring(0,\r
207                                 seqName.indexOf('_')));\r
208 \r
209                     if (sequence != null)\r
210                     {\r
211                         sbuffer.append("changing " + sequence.getName() +\r
212                             " to " + seqName + "\n");\r
213                         sequence.setName(seqName);\r
214                     }\r
215                 }\r
216 \r
217                 if (sequence == null)\r
218                 {\r
219                     sbuffer.append("UNIPROT updated suggestion is " +\r
220                         result[r] + "\n");\r
221                     sequence = align.findName(result[r]);\r
222 \r
223                     // this entry has been suggested by ebi.\r
224                     // doesn't match id in alignment file\r
225                     try\r
226                     {\r
227                         out.setLength(filePointer);\r
228                     }\r
229                     catch (Exception ex)\r
230                     {\r
231                     }\r
232 \r
233                     // now skip to next entry\r
234                     while (result[r].indexOf("</entry>") == -1)\r
235                         r++;\r
236                 }\r
237 \r
238                 features = new Vector();\r
239                 type = "";\r
240                 start = "0";\r
241                 end = "0";\r
242                 description = "";\r
243                 status = "";\r
244                 pdb = "";\r
245             }\r
246 \r
247             if (sequence == null)\r
248             {\r
249                 continue;\r
250             }\r
251 \r
252             if (result[r].indexOf("<property type=\"pdb accession\"") > -1)\r
253             {\r
254                 pdb = parseValue(result[r], "value=", out);\r
255                 sequence.setPDBId(pdb);\r
256             }\r
257 \r
258             if (result[r].indexOf("feature type") > -1)\r
259             {\r
260                 type = parseValue(result[r], "type=", out);\r
261                 description = parseValue(result[r], "description=", null);\r
262                 status = parseValue(result[r], "status=", null);\r
263 \r
264                 while (result[r].indexOf("position") == -1)\r
265                 {\r
266                     r++; //<location>\r
267                 }\r
268 \r
269                 // r++;\r
270                 if (result[r].indexOf("begin") > -1)\r
271                 {\r
272                     start = parseValue(result[r], "position=", out);\r
273                     end = parseValue(result[++r], "position=", out);\r
274                 }\r
275                 else\r
276                 {\r
277                     start = parseValue(result[r], "position=", out);\r
278                     end = parseValue(result[r], "position=", null);\r
279                 }\r
280 \r
281                 int sstart = Integer.parseInt(start);\r
282                 int eend = Integer.parseInt(end);\r
283 \r
284                 if (out != null)\r
285                 {\r
286                     try\r
287                     {\r
288                         out.writeBytes("</feature>\n");\r
289                     }\r
290                     catch (Exception ex)\r
291                     {\r
292                     }\r
293                 }\r
294 \r
295                 SequenceFeature sf = new SequenceFeature(type, sstart, eend,\r
296                         description, status);\r
297                 features.add(sf);\r
298             }\r
299 \r
300             if (result[r].indexOf("<sequence length=") > -1)\r
301             {\r
302                 StringBuffer seqString = new StringBuffer();\r
303 \r
304                 if (out != null)\r
305                 {\r
306                     try\r
307                     {\r
308                         out.writeBytes(result[r] + "\n");\r
309                     }\r
310                     catch (Exception ex)\r
311                     {\r
312                     }\r
313                 }\r
314 \r
315                 while (result[++r].indexOf("</sequence>") == -1)\r
316                 {\r
317                     seqString.append(result[r]);\r
318 \r
319                     if (out != null)\r
320                     {\r
321                         try\r
322                         {\r
323                             out.writeBytes(result[r] + "\n");\r
324                         }\r
325                         catch (Exception ex)\r
326                         {\r
327                         }\r
328                     }\r
329                 }\r
330 \r
331                 if (out != null)\r
332                 {\r
333                     try\r
334                     {\r
335                         out.writeBytes(result[r] + "\n");\r
336                     }\r
337                     catch (Exception ex)\r
338                     {\r
339                     }\r
340                 }\r
341 \r
342                 StringBuffer nonGapped = new StringBuffer();\r
343 \r
344                 for (int i = 0; i < sequence.getSequence().length(); i++)\r
345                 {\r
346                     if (!jalview.util.Comparison.isGap(sequence.getCharAt(i)))\r
347                     {\r
348                         nonGapped.append(sequence.getCharAt(i));\r
349                     }\r
350                 }\r
351 \r
352                 int absStart = seqString.toString().indexOf(nonGapped.toString());\r
353 \r
354                 if (absStart == -1)\r
355                 {\r
356                     unknownSequences.add(sequence.getName());\r
357                     features = null;\r
358                     sbuffer.append(sequence.getName() +\r
359                         " SEQUENCE NOT %100 MATCH \n");\r
360 \r
361                     continue;\r
362                 }\r
363 \r
364                 int absEnd = absStart + nonGapped.toString().length();\r
365                 absStart += 1;\r
366 \r
367                 if ((absStart != sequence.getStart()) ||\r
368                         (absEnd != sequence.getEnd()))\r
369                 {\r
370                     sbuffer.append("Updated: " + sequence.getName() + " " +\r
371                         sequence.getStart() + "/" + sequence.getEnd() +\r
372                         "  to  " + absStart + "/" + absEnd + "\n");\r
373                 }\r
374 \r
375                 sequence.setStart(absStart);\r
376                 sequence.setEnd(absEnd);\r
377             }\r
378 \r
379             if (result[r].indexOf("</entry>") > -1)\r
380             {\r
381                 if (features != null)\r
382                 {\r
383                     sequence.setSequenceFeatures(features);\r
384                 }\r
385 \r
386                 features = null;\r
387                 sequence = null;\r
388 \r
389                 if (out != null)\r
390                 {\r
391                     try\r
392                     {\r
393                         out.writeBytes("</entry>\n");\r
394                     }\r
395                     catch (Exception ex)\r
396                     {\r
397                     }\r
398                 }\r
399             }\r
400         }\r
401     }\r
402 \r
403     /**\r
404      * DOCUMENT ME!\r
405      *\r
406      * @param align DOCUMENT ME!\r
407      */\r
408     void findMissingIds(AlignmentI align)\r
409     {\r
410         String data;\r
411         ArrayList cachedIds = new ArrayList();\r
412 \r
413         try\r
414         {\r
415             if(jalview.bin.Cache.getProperty("UNIPROT_CACHE")==null)\r
416               return;\r
417 \r
418             BufferedReader in = new BufferedReader(new FileReader(\r
419                         jalview.bin.Cache.getProperty("UNIPROT_CACHE")));\r
420 \r
421             while ((data = in.readLine()) != null)\r
422             {\r
423                 if (data.indexOf("name") > -1)\r
424                 {\r
425                     String name = parseElement(data, "<name>", null);\r
426                     cachedIds.add(name);\r
427                 }\r
428             }\r
429         }\r
430         catch (Exception ex)\r
431         {\r
432             ex.printStackTrace();\r
433         }\r
434 \r
435         for (int i = 0; i < align.getHeight(); i++)\r
436             if (!cachedIds.contains(align.getSequenceAt(i).getName()))\r
437             {\r
438                 unknownSequences.add(align.getSequenceAt(i).getName());\r
439             }\r
440     }\r
441 \r
442     /**\r
443      * DOCUMENT ME!\r
444      *\r
445      * @param ids DOCUMENT ME!\r
446      * @param align DOCUMENT ME!\r
447      */\r
448     void tryLocalCacheFirst(ArrayList ids, AlignmentI align)\r
449     {\r
450         ArrayList cacheData = new ArrayList();\r
451 \r
452         try\r
453         {\r
454             if(jalview.bin.Cache.getProperty("UNIPROT_CACHE")==null)\r
455               return;\r
456 \r
457             BufferedReader in = new BufferedReader(new FileReader(\r
458                         jalview.bin.Cache.getProperty("UNIPROT_CACHE")));\r
459 \r
460             // read through cache file, if the cache has sequences we're looking for\r
461             // add the lines to a new String array, Readthis new array and\r
462             // make sure we remove the ids from the list to retrieve from EBI\r
463             String data;\r
464 \r
465             while ((data = in.readLine()) != null)\r
466             {\r
467                 if (data.indexOf("name") > -1)\r
468                 {\r
469                     String name = parseElement(data, "<name>", null);\r
470 \r
471                     if (ids.contains(name))\r
472                     {\r
473                         cacheData.add("<entry>");\r
474                         cacheData.add(data);\r
475 \r
476                         while (data.indexOf("</entry>") == -1)\r
477                         {\r
478                             data = in.readLine();\r
479                             cacheData.add(data);\r
480                         }\r
481 \r
482                         cacheData.add(data);\r
483 \r
484                         ids.remove(name);\r
485                     }\r
486                 }\r
487             }\r
488         }\r
489         catch (Exception ex)\r
490         {\r
491             ex.printStackTrace();\r
492         }\r
493 \r
494         String[] localData = new String[cacheData.size()];\r
495         cacheData.toArray(localData);\r
496 \r
497         if ((localData != null) && (localData.length > 0))\r
498         {\r
499             ReadUniprotFile(localData, null, align);\r
500         }\r
501     }\r
502 \r
503     /**\r
504      * DOCUMENT ME!\r
505      *\r
506      * @param line DOCUMENT ME!\r
507      * @param tag DOCUMENT ME!\r
508      * @param out DOCUMENT ME!\r
509      *\r
510      * @return DOCUMENT ME!\r
511      */\r
512     String parseValue(String line, String tag, RandomAccessFile out)\r
513     {\r
514         if (out != null)\r
515         {\r
516             try\r
517             {\r
518                 out.writeBytes(line + "\n");\r
519             }\r
520             catch (Exception ex)\r
521             {\r
522             }\r
523         }\r
524 \r
525         int index = line.indexOf(tag) + tag.length() + 1;\r
526 \r
527         if (index == tag.length())\r
528         {\r
529             return "";\r
530         }\r
531 \r
532         return line.substring(index, line.indexOf("\"", index + 1));\r
533     }\r
534 \r
535     /**\r
536      * DOCUMENT ME!\r
537      *\r
538      * @param line DOCUMENT ME!\r
539      * @param tag DOCUMENT ME!\r
540      * @param out DOCUMENT ME!\r
541      *\r
542      * @return DOCUMENT ME!\r
543      */\r
544     String parseElement(String line, String tag, RandomAccessFile out)\r
545     {\r
546         if (out != null)\r
547         {\r
548             try\r
549             {\r
550                 out.writeBytes(line + "\n");\r
551             }\r
552             catch (Exception ex)\r
553             {\r
554             }\r
555         }\r
556 \r
557         int index = line.indexOf(tag) + tag.length();\r
558 \r
559         return line.substring(index, line.indexOf("</"));\r
560     }\r
561 }\r