c8c39dc893c0c0009ef75bad39806d647e206170
[jalview.git] / src / jalview / io / SequenceFeatureFetcher.java
/*
* Jalview - A Sequence Alignment Editor and Viewer
* Copyright (C) 2005 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
*/
19 package jalview.io;\r
20 \r
21 import jalview.datamodel.*;\r
22 \r
23 import jalview.gui.*;\r
24 \r
25 import jalview.io.*;\r
26 \r
27 import java.io.*;\r
28 \r
29 import java.util.*;\r
30 \r
31 import javax.swing.*;\r
32 \r
33 \r
34 public class SequenceFeatureFetcher implements Runnable {\r
35     AlignmentI align;\r
36     AlignmentPanel ap;\r
37     ArrayList unknownSequences;\r
38     CutAndPasteTransfer output = new CutAndPasteTransfer();\r
39     StringBuffer sbuffer = new StringBuffer();\r
40 \r
41     public SequenceFeatureFetcher(AlignmentI align, AlignmentPanel ap) {\r
42         unknownSequences = new ArrayList();\r
43         this.align = align;\r
44         this.ap = ap;\r
45 \r
46         Thread thread = new Thread(this);\r
47         thread.start();\r
48     }\r
49 \r
50     public void run() {\r
51         String cache = jalview.bin.Cache.getProperty("UNIPROT_CACHE");\r
52 \r
53         RandomAccessFile out = null;\r
54 \r
55         try {\r
56             if (cache == null) {\r
57                 jalview.bin.Cache.setProperty("UNIPROT_CACHE",\r
58                     System.getProperty("user.home") + "/uniprot.xml");\r
59                 cache = jalview.bin.Cache.getProperty("UNIPROT_CACHE");\r
60             }\r
61 \r
62             File test = new File(cache);\r
63 \r
64             if (!test.exists()) {\r
65                 out = new RandomAccessFile(cache, "rw");\r
66                 out.writeBytes("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");\r
67                 out.writeBytes("<UNIPROT_CACHE>\n");\r
68             } else {\r
69                 out = new RandomAccessFile(cache, "rw");\r
70 \r
71                 // open exisiting cache and remove </UNIPROT_CACHE> from the end\r
72                 long lastLine = 0;\r
73                 String data;\r
74 \r
75                 while ((data = out.readLine()) != null) {\r
76                     if (data.indexOf("</entry>") > -1) {\r
77                         lastLine = out.getFilePointer();\r
78                     }\r
79                 }\r
80 \r
81                 out.seek(lastLine);\r
82             }\r
83 \r
84             int seqIndex = 0;\r
85             Vector sequences = align.getSequences();\r
86 \r
87             while (seqIndex < sequences.size()) {\r
88                 ArrayList ids = new ArrayList();\r
89 \r
90                 for (int i = 0; (seqIndex < sequences.size()) && (i < 50);\r
91                         seqIndex++, i++) {\r
92                     SequenceI sequence = (SequenceI) sequences.get(seqIndex);\r
93                     ids.add(sequence.getName());\r
94                 }\r
95 \r
96                 tryLocalCacheFirst(ids, align);\r
97 \r
98                 if (ids.size() > 0) {\r
99                     StringBuffer remainingIds = new StringBuffer("uniprot:");\r
100 \r
101                     for (int i = 0; i < ids.size(); i++)\r
102                         remainingIds.append(ids.get(i) + ";");\r
103 \r
104                     EBIFetchClient ebi = new EBIFetchClient();\r
105                     String[] result = ebi.fetchData(remainingIds.toString(),\r
106                             "xml", null);\r
107 \r
108                     if (result != null) {\r
109                         ReadUniprotFile(result, out, align);\r
110                     }\r
111                 }\r
112             }\r
113 \r
114             if (out != null) {\r
115                 out.writeBytes("</UNIPROT_CACHE>\n");\r
116                 out.close();\r
117             }\r
118         } catch (Exception ex) {\r
119             ex.printStackTrace();\r
120         }\r
121 \r
122         ap.repaint();\r
123         findMissingIds(align);\r
124 \r
125         if (sbuffer.length() > 0) {\r
126             output.setText(\r
127                 "Your sequences have been matched to Uniprot. Some of the ids have been\n" +\r
128                 "altered, most likely the start/end residue will have been updated.\n" +\r
129                 "Save your alignment to maintain the updated id.\n\n" +\r
130                 sbuffer.toString());\r
131             Desktop.addInternalFrame(output, "Sequence names updated ", 600, 300);\r
132         }\r
133 \r
134         if (unknownSequences.size() > 0) {\r
135             //ignore for now!!!!!!!!!!\r
136             //  WSWUBlastClient blastClient = new WSWUBlastClient(align, unknownSequences);\r
137         }\r
138     }\r
139 \r
140     void ReadUniprotFile(String[] result, RandomAccessFile out, AlignmentI align) {\r
141         SequenceI sequence = null;\r
142         Vector features = null;\r
143         String type;\r
144         String description;\r
145         String status;\r
146         String start;\r
147         String end;\r
148         String pdb = null;\r
149 \r
150         for (int r = 0; r < result.length; r++) {\r
151             if ((sequence == null) && (result[r].indexOf("<name>") > -1)) {\r
152                 long filePointer = 0;\r
153 \r
154                 if (out != null) {\r
155                     try {\r
156                         filePointer = out.getFilePointer();\r
157                         out.writeBytes("<entry>\n");\r
158                     } catch (Exception ex) {\r
159                     }\r
160                 }\r
161 \r
162                 String seqName = parseElement(result[r], "<name>", out);\r
163                 sequence = align.findName(seqName);\r
164 \r
165                 if (sequence == null) {\r
166                     sequence = align.findName(seqName.substring(0,\r
167                                 seqName.indexOf('_')));\r
168 \r
169                     if (sequence != null) {\r
170                         sbuffer.append("changing " + sequence.getName() +\r
171                             " to " + seqName + "\n");\r
172                         sequence.setName(seqName);\r
173                     }\r
174                 }\r
175 \r
176                 if (sequence == null) {\r
177                     sbuffer.append("UNIPROT updated suggestion is " +\r
178                         result[r] + "\n");\r
179                     sequence = align.findName(result[r]);\r
180 \r
181                     // this entry has been suggested by ebi.\r
182                     // doesn't match id in alignment file\r
183                     try {\r
184                         out.setLength(filePointer);\r
185                     } catch (Exception ex) {\r
186                     }\r
187 \r
188                     // now skip to next entry\r
189                     while (result[r].indexOf("</entry>") == -1)\r
190                         r++;\r
191                 }\r
192 \r
193                 features = new Vector();\r
194                 type = "";\r
195                 start = "0";\r
196                 end = "0";\r
197                 description = "";\r
198                 status = "";\r
199                 pdb = "";\r
200             }\r
201 \r
202             if (sequence == null) {\r
203                 continue;\r
204             }\r
205 \r
206             if (result[r].indexOf("<property type=\"pdb accession\"") > -1) {\r
207                 pdb = parseValue(result[r], "value=", out);\r
208                 sequence.setPDBId(pdb);\r
209             }\r
210 \r
211             if (result[r].indexOf("feature type") > -1) {\r
212                 type = parseValue(result[r], "type=", out);\r
213                 description = parseValue(result[r], "description=", null);\r
214                 status = parseValue(result[r], "status=", null);\r
215 \r
216                 while (result[r].indexOf("position") == -1) {\r
217                     r++; //<location>\r
218                 }\r
219 \r
220                 // r++;\r
221                 if (result[r].indexOf("begin") > -1) {\r
222                     start = parseValue(result[r], "position=", out);\r
223                     end = parseValue(result[++r], "position=", out);\r
224                 } else {\r
225                     start = parseValue(result[r], "position=", out);\r
226                     end = parseValue(result[r], "position=", null);\r
227                 }\r
228 \r
229                 int sstart = Integer.parseInt(start);\r
230                 int eend = Integer.parseInt(end);\r
231 \r
232                 if (out != null) {\r
233                     try {\r
234                         out.writeBytes("</feature>\n");\r
235                     } catch (Exception ex) {\r
236                     }\r
237                 }\r
238 \r
239                 SequenceFeature sf = new SequenceFeature(type, sstart, eend,\r
240                         description, status);\r
241                 features.add(sf);\r
242             }\r
243 \r
244             if (result[r].indexOf("<sequence") > -1) {\r
245                 StringBuffer seqString = new StringBuffer();\r
246 \r
247                 if (out != null) {\r
248                     try {\r
249                         out.writeBytes(result[r] + "\n");\r
250                     } catch (Exception ex) {\r
251                     }\r
252                 }\r
253 \r
254                 while (result[++r].indexOf("</sequence>") == -1) {\r
255                     seqString.append(result[r]);\r
256 \r
257                     if (out != null) {\r
258                         try {\r
259                             out.writeBytes(result[r] + "\n");\r
260                         } catch (Exception ex) {\r
261                         }\r
262                     }\r
263                 }\r
264 \r
265                 if (out != null) {\r
266                     try {\r
267                         out.writeBytes(result[r] + "\n");\r
268                     } catch (Exception ex) {\r
269                     }\r
270                 }\r
271 \r
272                 StringBuffer nonGapped = new StringBuffer();\r
273 \r
274                 for (int i = 0; i < sequence.getSequence().length(); i++) {\r
275                     if (!jalview.util.Comparison.isGap(sequence.getCharAt(i))) {\r
276                         nonGapped.append(sequence.getCharAt(i));\r
277                     }\r
278                 }\r
279 \r
280                 int absStart = seqString.toString().indexOf(nonGapped.toString());\r
281 \r
282                 if (absStart == -1) {\r
283                     unknownSequences.add(sequence.getName());\r
284                     features = null;\r
285                     sbuffer.append(sequence.getName() +\r
286                         " SEQUENCE NOT %100 MATCH \n");\r
287 \r
288                     continue;\r
289                 }\r
290 \r
291                 int absEnd = absStart + nonGapped.toString().length();\r
292                 absStart += 1;\r
293 \r
294                 if ((absStart != sequence.getStart()) ||\r
295                         (absEnd != sequence.getEnd())) {\r
296                     sbuffer.append("Updated: " + sequence.getName() + " " +\r
297                         sequence.getStart() + "/" + sequence.getEnd() +\r
298                         "  to  " + absStart + "/" + absEnd + "\n");\r
299                 }\r
300 \r
301                 sequence.setStart(absStart);\r
302                 sequence.setEnd(absEnd);\r
303             }\r
304 \r
305             if (result[r].indexOf("</entry>") > -1) {\r
306                 if (features != null) {\r
307                     sequence.setSequenceFeatures(features);\r
308                 }\r
309 \r
310                 features = null;\r
311                 sequence = null;\r
312 \r
313                 if (out != null) {\r
314                     try {\r
315                         out.writeBytes("</entry>\n");\r
316                     } catch (Exception ex) {\r
317                     }\r
318                 }\r
319             }\r
320         }\r
321     }\r
322 \r
323     void findMissingIds(AlignmentI align) {\r
324         String data;\r
325         ArrayList cachedIds = new ArrayList();\r
326 \r
327         try {\r
328             BufferedReader in = new BufferedReader(new FileReader(\r
329                         jalview.bin.Cache.getProperty("UNIPROT_CACHE")));\r
330 \r
331             while ((data = in.readLine()) != null) {\r
332                 if (data.indexOf("name") > -1) {\r
333                     String name = parseElement(data, "<name>", null);\r
334                     cachedIds.add(name);\r
335                 }\r
336             }\r
337         } catch (Exception ex) {\r
338             ex.printStackTrace();\r
339         }\r
340 \r
341         for (int i = 0; i < align.getHeight(); i++)\r
342             if (!cachedIds.contains(align.getSequenceAt(i).getName())) {\r
343                 unknownSequences.add(align.getSequenceAt(i).getName());\r
344             }\r
345     }\r
346 \r
347     void tryLocalCacheFirst(ArrayList ids, AlignmentI align) {\r
348         ArrayList cacheData = new ArrayList();\r
349 \r
350         try {\r
351             BufferedReader in = new BufferedReader(new FileReader(\r
352                         jalview.bin.Cache.getProperty("UNIPROT_CACHE")));\r
353 \r
354             // read through cache file, if the cache has sequences we're looking for\r
355             // add the lines to a new String array, Readthis new array and\r
356             // make sure we remove the ids from the list to retrieve from EBI\r
357             String data;\r
358 \r
359             while ((data = in.readLine()) != null) {\r
360                 if (data.indexOf("name") > -1) {\r
361                     String name = parseElement(data, "<name>", null);\r
362 \r
363                     if (ids.contains(name)) {\r
364                         cacheData.add("<entry>");\r
365                         cacheData.add(data);\r
366 \r
367                         while (data.indexOf("</entry>") == -1) {\r
368                             data = in.readLine();\r
369                             cacheData.add(data);\r
370                         }\r
371 \r
372                         cacheData.add(data);\r
373 \r
374                         ids.remove(name);\r
375                     }\r
376                 }\r
377             }\r
378         } catch (Exception ex) {\r
379             ex.printStackTrace();\r
380         }\r
381 \r
382         String[] localData = new String[cacheData.size()];\r
383         cacheData.toArray(localData);\r
384 \r
385         if ((localData != null) && (localData.length > 0)) {\r
386             ReadUniprotFile(localData, null, align);\r
387         }\r
388     }\r
389 \r
390     String parseValue(String line, String tag, RandomAccessFile out) {\r
391         if (out != null) {\r
392             try {\r
393                 out.writeBytes(line + "\n");\r
394             } catch (Exception ex) {\r
395             }\r
396         }\r
397 \r
398         int index = line.indexOf(tag) + tag.length() + 1;\r
399 \r
400         if (index == tag.length()) {\r
401             return "";\r
402         }\r
403 \r
404         return line.substring(index, line.indexOf("\"", index + 1));\r
405     }\r
406 \r
407     String parseElement(String line, String tag, RandomAccessFile out) {\r
408         if (out != null) {\r
409             try {\r
410                 out.writeBytes(line + "\n");\r
411             } catch (Exception ex) {\r
412             }\r
413         }\r
414 \r
415         int index = line.indexOf(tag) + tag.length();\r
416 \r
417         return line.substring(index, line.indexOf("</"));\r
418     }\r
419 }\r