AnnotationId is hashcode of object
[jalview.git] / src / jalview / io / StockholmFile.java
1 /*\r
2  * Jalview - A Sequence Alignment Editor and Viewer\r
3  * Copyright (C) 2006 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle\r
4  *\r
5  * This program is free software; you can redistribute it and/or\r
6  * modify it under the terms of the GNU General Public License\r
7  * as published by the Free Software Foundation; either version 2\r
8  * of the License, or (at your option) any later version.\r
9  *\r
10  * This program is distributed in the hope that it will be useful,\r
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of\r
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\r
13  * GNU General Public License for more details.\r
14  *\r
15  * You should have received a copy of the GNU General Public License\r
16  * along with this program; if not, write to the Free Software\r
17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA\r
18  */\r
19 /*\r
20  * This extension was written by Benjamin Schuster-Boeckler at sanger.ac.uk\r
21  */\r
22 package jalview.io;\r
23 import java.io.*;\r
24 import java.util.*;\r
25 import jalview.datamodel.*;\r
26 import com.stevesoft.pat.*;\r
27 //import org.apache.log4j.*;\r
28 \r
29 /**\r
30  * This class is supposed to parse a Stockholm format file into Jalview\r
31  * @author bsb at sanger.ac.uk\r
32  * @version 0.3\r
33  */\r
34 public class StockholmFile extends AlignFile\r
35 {\r
36         //static Logger logger = Logger.getLogger("jalview.io.StockholmFile");\r
37 \r
38     public StockholmFile()\r
39     {\r
40     }\r
41 \r
42 \r
43     public StockholmFile(String inFile, String type) throws IOException\r
44     {\r
45         super(inFile, type);\r
46     }\r
47 \r
48     public void initData()\r
49     {\r
50          super.initData();\r
51     }\r
52 \r
53     /**\r
54      * Parse a file in Stockholm format into Jalview's data model. The file has\r
55      * to be passed at construction time\r
56      * @throws IOException If there is an error with the input file\r
57      */\r
58     public void parse() throws IOException\r
59     {\r
60         // --------------- Variable Definitions -------------------\r
61         String line;\r
62         String version;\r
63       //  String id;\r
64         Hashtable alAnn = new Hashtable(); // Alignment wide annotations\r
65         Hashtable seqAnn = new Hashtable(); // Sequence related annotations\r
66         Hashtable seqs = new Hashtable();\r
67         Regex p, r, rend, s, x;\r
68 \r
69         // ------------------ Parsing File ----------------------\r
70         // First, we have to check that this file has STOCKHOLM format, i.e. the first line must match\r
71         r = new Regex("# STOCKHOLM ([\\d\\.]+)");\r
72         if(!r.search(nextLine()))\r
73         {\r
74             throw new IOException("This file is not in valid STOCKHOLM format: First line does not contain '# STOCKHOLM'");\r
75         }\r
76         else\r
77         {\r
78             version = r.stringMatched(1);\r
79             //logger.debug("Stockholm version: " + version);\r
80         }\r
81 \r
82 //      We define some Regexes here that will be used regularily later\r
83         rend = new Regex("\\/\\/"); // Find the end of an alignment\r
84         p = new Regex("(\\S+)\\/(\\d+)\\-(\\d+)"); // split sequence id in id/from/to\r
85         s = new Regex("(\\S+)\\s+(\\w{2})\\s+(.*)"); // Parses annotation subtype\r
86         r = new Regex("#=(G[FSRC]?)\\s+(.*)"); // Finds any annotation line\r
87         x = new Regex("(\\S+)\\s+(\\S+)"); //split id from sequence\r
88 \r
89         rend.optimize();\r
90         p.optimize();\r
91         s.optimize();\r
92         r.optimize();\r
93         x.optimize();\r
94 \r
95         while ( (line = nextLine()) != null)\r
96         {\r
97             if (line.length() == 0) continue;\r
98             if(rend.search(line))\r
99             {\r
100 //              End of the alignment, pass stuff back\r
101 \r
102                  this.noSeqs = seqs.size();\r
103                 //logger.debug("Number of sequences: " + this.noSeqs);\r
104                 Enumeration accs = seqs.keys();\r
105                 while (accs.hasMoreElements())\r
106                 {\r
107                     String acc = (String) accs.nextElement();\r
108                     //logger.debug("Processing sequence " + acc);\r
109                     String seq = (String) seqs.get(acc);\r
110                     if (maxLength < seq.length())\r
111                     {\r
112                         maxLength = seq.length();\r
113                     }\r
114                     int start = 1;\r
115                     int end = -1;\r
116                     String sid = acc;\r
117                     // Split accession in id and from/to\r
118                     if (p.search(acc))\r
119                     {\r
120                         sid = p.stringMatched(1);\r
121                         start = Integer.parseInt(p.stringMatched(2));\r
122                         end = Integer.parseInt(p.stringMatched(3));\r
123                     }\r
124                     //logger.debug(sid + ", " + start + ", " + end);\r
125 \r
126                     Sequence seqO = new Sequence(sid, seq, start, end);\r
127                     Hashtable features = null;\r
128                     // We need to adjust the positions of all features to account for gaps\r
129                     try\r
130                     {\r
131                          features = (Hashtable) ((Hashtable) seqAnn.get(acc)).get("features");\r
132                     }\r
133                     catch (java.lang.NullPointerException e)\r
134                     {\r
135                        //loggerwarn("Getting Features for " + acc + ": " + e.getMessage());\r
136                        //continue;\r
137                     }\r
138                     // if we have features\r
139                     if (features != null)\r
140                     {\r
141                         Enumeration i = features.keys();\r
142                         while(i.hasMoreElements())\r
143                         {\r
144                             String type = i.nextElement().toString();\r
145                             Hashtable content = (Hashtable) features.get(type);\r
146 \r
147                             Enumeration j = content.keys();\r
148                             while(j.hasMoreElements())\r
149                             {\r
150                                         String desc = j.nextElement().toString();\r
151                                         String ns = content.get(desc).toString();\r
152                                         char[] byChar = ns.toCharArray();\r
153                                         for (int k = 0; k < byChar.length; k++)\r
154                                         {\r
155                                                 char c = byChar[k];\r
156                                                 if (! (c == ' ' || c == '_' ||\r
157                                                        c == '-'))\r
158                                                 {\r
159                                                   int new_pos = seqO.findPosition(k);\r
160                                                   SequenceFeature feat =\r
161                                                       new SequenceFeature(type,\r
162                                                       desc, new_pos, new_pos, 0f, null);\r
163 \r
164                                                   seqO.addSequenceFeature(feat);\r
165                                                 }\r
166                                         }\r
167                                 }\r
168 \r
169                         }\r
170 \r
171                     }\r
172                     //logger.debug("Adding seq " + acc + " from "  + start + " to " + end + ": " + seq);\r
173                     this.seqs.addElement(seqO);\r
174                 }\r
175             }\r
176             else if (!r.search(line))\r
177             {\r
178                 //System.err.println("Found sequence line: " + line);\r
179 \r
180                 // Split sequence in sequence and accession parts\r
181                 if(!x.search(line))\r
182                 {\r
183                                 //logger.error("Could not parse sequence line: " + line);\r
184                                 throw new IOException("Could not parse sequence line: " + line);\r
185                 }\r
186                 String ns  = (String) seqs.get(x.stringMatched(1));\r
187                 if (ns == null) ns = "";\r
188                 ns += x.stringMatched(2);\r
189 \r
190                 seqs.put(x.stringMatched(1), ns);\r
191             }\r
192             else\r
193             {\r
194                 String annType = r.stringMatched(1);\r
195                 String annContent = r.stringMatched(2);\r
196 \r
197                 //System.err.println("type:" + annType + " content: " + annContent);\r
198 \r
199                 if (annType.equals("GF"))\r
200                 {\r
201                     /* Generic per-File annotation, free text\r
202                      * Magic features:\r
203                      * #=GF NH <tree in New Hampshire eXtended format>\r
204                      * #=GF TN <Unique identifier for the next tree>\r
205                      * Pfam descriptions:\r
206                         7. DESCRIPTION OF FIELDS\r
207 \r
208                            Compulsory fields:\r
209                            ------------------\r
210 \r
211                            AC   Accession number:           Accession number in form PFxxxxx.version or PBxxxxxx.\r
212                            ID   Identification:             One word name for family.\r
213                            DE   Definition:                 Short description of family.\r
214                            AU   Author:                     Authors of the entry.\r
215                            SE   Source of seed:             The source suggesting the seed members belong to one family.\r
216                            GA   Gathering method:           Search threshold to build the full alignment.\r
217                            TC   Trusted Cutoff:             Lowest sequence score and domain score of match in the full alignment.\r
218                            NC   Noise Cutoff:               Highest sequence score and domain score of match not in full alignment.\r
219                            TP   Type:                       Type of family -- presently Family, Domain, Motif or Repeat.\r
220                            SQ   Sequence:                   Number of sequences in alignment.\r
221                            AM   Alignment Method        The order ls and fs hits are aligned to the model to build the full align.\r
222                            //                               End of alignment.\r
223 \r
224                            Optional fields:\r
225                            ----------------\r
226 \r
227                            DC   Database Comment:           Comment about database reference.\r
228                            DR   Database Reference:         Reference to external database.\r
229                            RC   Reference Comment:          Comment about literature reference.\r
230                            RN   Reference Number:           Reference Number.\r
231                            RM   Reference Medline:          Eight digit medline UI number.\r
232                            RT   Reference Title:            Reference Title.\r
233                            RA   Reference Author:           Reference Author\r
234                            RL   Reference Location:         Journal location.\r
235                            PI   Previous identifier:        Record of all previous ID lines.\r
236                            KW   Keywords:                   Keywords.\r
237                            CC   Comment:                    Comments.\r
238                            NE   Pfam accession:         Indicates a nested domain.\r
239                            NL   Location:                   Location of nested domains - sequence ID, start and end of insert.\r
240 \r
241                            Obsolete fields:\r
242                            -----------\r
243                            AL   Alignment method of seed:   The method used to align the seed members.\r
244                      */\r
245                     // Let's save the annotations, maybe we'll be able to do something with them later...\r
246                     Regex an = new Regex("(\\w+)\\s*(.*)");\r
247                     if (an.search(annContent)) alAnn.put(an.stringMatched(1), an.stringMatched(2));\r
248                 }\r
249                 else if(annType.equals("GS"))\r
250                 {\r
251                     // Generic per-Sequence annotation, free text\r
252                     /* Pfam uses these features:\r
253                         Feature                    Description\r
254                         ---------------------      -----------\r
255                         AC <accession>             ACcession number\r
256                         DE <freetext>              DEscription\r
257                         DR <db>; <accession>;      Database Reference\r
258                         OS <organism>              OrganiSm (species)\r
259                         OC <clade>                 Organism Classification (clade, etc.)\r
260                         LO <look>                  Look (Color, etc.)\r
261                     */\r
262                     if (s.search(annContent))\r
263                     {\r
264                         String acc = s.stringMatched(1);\r
265                         String type = s.stringMatched(2);\r
266                         String content = s.stringMatched(3);\r
267 \r
268                         Hashtable ann;\r
269                         if (seqAnn.containsKey(acc))\r
270                         {\r
271                             ann = (Hashtable) seqAnn.get(acc);\r
272                         }\r
273                         else\r
274                         {\r
275                             ann = new Hashtable();\r
276                         }\r
277                         ann.put(type, content);\r
278                         seqAnn.put(acc, ann);\r
279                     }\r
280                     else\r
281                     {\r
282                         throw new IOException("Error parsing " + line);\r
283                     }\r
284                 }\r
285                 else if(annType.equals("GC"))\r
286                 {\r
287                   System.out.println(annContent);\r
288                     // Generic per-Column annotation, exactly 1 char per column\r
289                 }\r
290                 else if(annType.equals("GR"))\r
291                 {\r
292                     // Generic per-Sequence AND per-Column markup, exactly 1 char per column\r
293                     /*\r
294                         Feature   Description            Markup letters\r
295                         -------   -----------            --------------\r
296                         SS        Secondary Structure    [HGIEBTSCX]\r
297                         SA        Surface Accessibility  [0-9X]\r
298                                       (0=0%-10%; ...; 9=90%-100%)\r
299                         TM        TransMembrane          [Mio]\r
300                         PP        Posterior Probability  [0-9*]\r
301                                       (0=0.00-0.05; 1=0.05-0.15; *=0.95-1.00)\r
302                         LI        LIgand binding         [*]\r
303                         AS        Active Site            [*]\r
304                         IN        INtron (in or after)   [0-2]\r
305                      */\r
306                     if (s.search(annContent))\r
307                     {\r
308                         String acc = s.stringMatched(1);\r
309                         String type = s.stringMatched(2);\r
310                         String seq = s.stringMatched(3);\r
311                         String description = new String();\r
312 \r
313                         // Check for additional information about the current annotation\r
314                         if (x.search(seq))\r
315                         {\r
316                             description = x.stringMatched(1);\r
317                             seq = x.stringMatched(2);\r
318                         }\r
319                         // sequence id with from-to fields\r
320 \r
321                         Hashtable ann;\r
322                         // Get an object with all the annotations for this sequence\r
323                         if (seqAnn.containsKey(acc))\r
324                         {\r
325                             //logger.debug("Found annotations for " + acc);\r
326                             ann = (Hashtable) seqAnn.get(acc);\r
327                         }\r
328                         else\r
329                         {\r
330                             //logger.debug("Creating new annotations holder for " + acc);\r
331                             ann = new Hashtable();\r
332                             seqAnn.put(acc, ann);\r
333                         }\r
334 \r
335                         Hashtable features;\r
336                         // Get an object with all the content for an annotation\r
337                         if (ann.containsKey("features"))\r
338                         {\r
339                             //logger.debug("Found features for " + acc);\r
340                             features = (Hashtable) ann.get("features");\r
341                         }\r
342                         else\r
343                         {\r
344                             //logger.debug("Creating new features holder for " + acc);\r
345                             features = new Hashtable();\r
346                             ann.put("features", features);\r
347                         }\r
348 \r
349                         Hashtable content;\r
350                         if (features.containsKey(this.id2type(type)))\r
351                         {\r
352                             //logger.debug("Found content for " + this.id2type(type));\r
353                             content = (Hashtable) features.get(this.id2type(type));\r
354                         }\r
355                         else\r
356                         {\r
357                             //logger.debug("Creating new content holder for " + this.id2type(type));\r
358                             content = new Hashtable();\r
359                             features.put(this.id2type(type), content);\r
360                         }\r
361                         String ns = (String) content.get(description);\r
362                         if (ns == null) ns = "";\r
363                         ns += seq;\r
364                         content.put(description, seq);\r
365                     }\r
366                     else\r
367                     {\r
368                         throw new IOException("Error parsing " + line);\r
369                     }\r
370                 }\r
371                 else\r
372                 {\r
373                     throw new IOException("Unknown annotation detected: " + annType + " " + annContent);\r
374                 }\r
375             }\r
376         }\r
377     }\r
378 \r
379     public static String print(SequenceI[] s)\r
380     {\r
381         return "not yet implemented";\r
382     }\r
383 \r
384     public String print()\r
385     {\r
386         return print(getSeqsAsArray());\r
387     }\r
388 \r
389     private String id2type(String id)\r
390     {\r
391         // GR ids\r
392         if (id.equals("SS")) return "secondary structure";\r
393         else if (id.equals("SA")) return "surface accessibility";\r
394         else if (id.equals("TM")) return "transmembrane";\r
395         else if (id.equals("PP")) return "posterior probability";\r
396         else if (id.equals("LI")) return "ligand binding";\r
397         else if (id.equals("AS")) return "active site";\r
398         else if (id.equals("IN")) return "intron";\r
399         else if (id.equals("IR")) return "interacting residue";\r
400         // GS ids\r
401         else if (id.equals("AC")) return "accession";\r
402         else if (id.equals("OS")) return "organism";\r
403         else if (id.equals("CL")) return "class";\r
404         else if (id.equals("DE")) return "description";\r
405         else if (id.equals("DR")) return "reference";\r
406         else if (id.equals("LO")) return "look";\r
407         else return null;\r
408     }\r
409 }\r