src/jalview/io/StockholmFile.java

   1 /*\r
   2  * Jalview - A Sequence Alignment Editor and Viewer\r
   3  * Copyright (C) 2005 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle\r
   4  *\r
   5  * This program is free software; you can redistribute it and/or\r
   6  * modify it under the terms of the GNU General Public License\r
   7  * as published by the Free Software Foundation; either version 2\r
   8  * of the License, or (at your option) any later version.\r
   9  *\r
  10  * This program is distributed in the hope that it will be useful,\r
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of\r
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\r
  13  * GNU General Public License for more details.\r
  14  *\r
  15  * You should have received a copy of the GNU General Public License\r
  16  * along with this program; if not, write to the Free Software\r
  17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA\r
  18  */\r
  19 /*\r
  20  * This extension was written by Benjamin Schuster-Boeckler at sanger.ac.uk\r
  21  */\r
  22 package jalview.io;\r
  23 import java.io.*;\r
  24 import java.util.*;\r
  25 import jalview.datamodel.*;\r
  26 import com.stevesoft.pat.*;\r
  27 //import org.apache.log4j.*;\r
  28 \r
  29 /**\r
  30  * This class is supposed to parse a Stockholm format file into Jalview\r
  31  * @author bsb at sanger.ac.uk\r
  32  * @version 0.3\r
  33  */\r
  34 public class StockholmFile extends AlignFile\r
  35 {\r
  36         //static Logger logger = Logger.getLogger("jalview.io.StockholmFile");\r
  37 \r
  38     public StockholmFile()\r
  39     {\r
  40     }\r
  41 \r
  42 \r
  43     public StockholmFile(String inFile, String type) throws IOException\r
  44     {\r
  45         super(inFile, type);\r
  46     }\r
  47 \r
  48     public void initData()\r
  49     {\r
  50          super.initData();\r
  51     }\r
  52 \r
  53     /**\r
  54      * Parse a file in Stockholm format into Jalview's data model. The file has\r
  55      * to be passed at construction time\r
  56      * @throws IOException If there is an error with the input file\r
  57      */\r
  58     public void parse() throws IOException\r
  59     {\r
  60         // --------------- Variable Definitions -------------------\r
  61         String line;\r
  62         String version;\r
  63       //  String id;\r
  64         Hashtable alAnn = new Hashtable(); // Alignment wide annotations\r
  65         Hashtable seqAnn = new Hashtable(); // Sequence related annotations\r
  66         Hashtable seqs = new Hashtable();\r
  67         Regex p, r, rend, s, x;\r
  68 \r
  69         // ------------------ Parsing File ----------------------\r
  70         // First, we have to check that this file has STOCKHOLM format, i.e. the first line must match\r
  71         r = new Regex("# STOCKHOLM ([\\d\\.]+)");\r
  72         if(!r.search(nextLine()))\r
  73         {\r
  74             throw new IOException("This file is not in valid STOCKHOLM format: First line does not contain '# STOCKHOLM'");\r
  75         }\r
  76         else\r
  77         {\r
  78             version = r.stringMatched(1);\r
  79             //logger.debug("Stockholm version: " + version);\r
  80         }\r
  81 \r
  82 //      We define some Regexes here that will be used regularily later\r
  83         rend = new Regex("\\/\\/"); // Find the end of an alignment\r
  84         p = new Regex("(\\S+)\\/(\\d+)\\-(\\d+)"); // split sequence id in id/from/to\r
  85         s = new Regex("(\\S+)\\s+(\\w{2})\\s+(.*)"); // Parses annotation subtype\r
  86         r = new Regex("#=(G[FSRC]?)\\s+(.*)"); // Finds any annotation line\r
  87         x = new Regex("(\\S+)\\s+(\\S+)"); //split id from sequence\r
  88 \r
  89         rend.optimize();\r
  90         p.optimize();\r
  91         s.optimize();\r
  92         r.optimize();\r
  93         x.optimize();\r
  94 \r
  95         while ( (line = nextLine()) != null)\r
  96         {\r
  97             if (line.length() == 0) continue;\r
  98             if(rend.search(line))\r
  99             {\r
 100 //              End of the alignment, pass stuff back\r
 101 \r
 102                  this.noSeqs = seqs.size();\r
 103                 //logger.debug("Number of sequences: " + this.noSeqs);\r
 104                 Enumeration accs = seqs.keys();\r
 105                 while (accs.hasMoreElements())\r
 106                 {\r
 107                     String acc = (String) accs.nextElement();\r
 108                     //logger.debug("Processing sequence " + acc);\r
 109                     String seq = (String) seqs.get(acc);\r
 110                     if (maxLength < seq.length())\r
 111                     {\r
 112                         maxLength = seq.length();\r
 113                     }\r
 114                     int start = 1;\r
 115                     int end = -1;\r
 116                     String sid = acc;\r
 117                     // Split accession in id and from/to\r
 118                     if (p.search(acc))\r
 119                     {\r
 120                         sid = p.stringMatched(1);\r
 121                         start = Integer.parseInt(p.stringMatched(2));\r
 122                         end = Integer.parseInt(p.stringMatched(3));\r
 123                     }\r
 124                     //logger.debug(sid + ", " + start + ", " + end);\r
 125 \r
 126                     Sequence seqO = new Sequence(sid, seq, start, end);\r
 127                     Hashtable features = null;\r
 128                     // We need to adjust the positions of all features to account for gaps\r
 129                     try\r
 130                     {\r
 131                          features = (Hashtable) ((Hashtable) seqAnn.get(acc)).get("features");\r
 132                     }\r
 133                     catch (java.lang.NullPointerException e)\r
 134                     {\r
 135                        //loggerwarn("Getting Features for " + acc + ": " + e.getMessage());\r
 136                        //continue;\r
 137                     }\r
 138                     // if we have features\r
 139                     if (features != null)\r
 140                     {\r
 141                         Enumeration i = features.keys();\r
 142                         while(i.hasMoreElements())\r
 143                         {\r
 144                             String type = i.nextElement().toString();\r
 145                             Hashtable content = (Hashtable) features.get(type);\r
 146 \r
 147                             Enumeration j = content.keys();\r
 148                             while(j.hasMoreElements())\r
 149                             {\r
 150                                         String desc = j.nextElement().toString();\r
 151                                         String ns = content.get(desc).toString();\r
 152                                         char[] byChar = ns.toCharArray();\r
 153                                         for (int k = 0; k < byChar.length; k++)\r
 154                                         {\r
 155                                                 char c = byChar[k];\r
 156                                                 if (! (c == ' ' || c == '_' ||\r
 157                                                        c == '-'))\r
 158                                                 {\r
 159                                                   int new_pos = seqO.findPosition(k);\r
 160                                                   SequenceFeature feat =\r
 161                                                       new SequenceFeature(type,\r
 162                                                       desc, new_pos, new_pos, 0f, null);\r
 163 \r
 164                                                   seqO.addSequenceFeature(feat);\r
 165                                                 }\r
 166                                         }\r
 167                                 }\r
 168 \r
 169                         }\r
 170 \r
 171                     }\r
 172                     //logger.debug("Adding seq " + acc + " from "  + start + " to " + end + ": " + seq);\r
 173                     this.seqs.addElement(seqO);\r
 174                 }\r
 175             }\r
 176             else if (!r.search(line))\r
 177             {\r
 178                 //System.err.println("Found sequence line: " + line);\r
 179 \r
 180                 // Split sequence in sequence and accession parts\r
 181                 if(!x.search(line))\r
 182                 {\r
 183                                 //logger.error("Could not parse sequence line: " + line);\r
 184                                 throw new IOException("Could not parse sequence line: " + line);\r
 185                 }\r
 186                 String ns  = (String) seqs.get(x.stringMatched(1));\r
 187                 if (ns == null) ns = "";\r
 188                 ns += x.stringMatched(2);\r
 189 \r
 190                 seqs.put(x.stringMatched(1), ns);\r
 191             }\r
 192             else\r
 193             {\r
 194                 String annType = r.stringMatched(1);\r
 195                 String annContent = r.stringMatched(2);\r
 196 \r
 197                 //System.err.println("type:" + annType + " content: " + annContent);\r
 198 \r
 199                 if (annType.equals("GF"))\r
 200                 {\r
 201                     /* Generic per-File annotation, free text\r
 202                      * Magic features:\r
 203                      * #=GF NH <tree in New Hampshire eXtended format>\r
 204                      * #=GF TN <Unique identifier for the next tree>\r
 205                      * Pfam descriptions:\r
 206                         7. DESCRIPTION OF FIELDS\r
 207 \r
 208                            Compulsory fields:\r
 209                            ------------------\r
 210 \r
 211                            AC   Accession number:           Accession number in form PFxxxxx.version or PBxxxxxx.\r
 212                            ID   Identification:             One word name for family.\r
 213                            DE   Definition:                 Short description of family.\r
 214                            AU   Author:                     Authors of the entry.\r
 215                            SE   Source of seed:             The source suggesting the seed members belong to one family.\r
 216                            GA   Gathering method:           Search threshold to build the full alignment.\r
 217                            TC   Trusted Cutoff:             Lowest sequence score and domain score of match in the full alignment.\r
 218                            NC   Noise Cutoff:               Highest sequence score and domain score of match not in full alignment.\r
 219                            TP   Type:                       Type of family -- presently Family, Domain, Motif or Repeat.\r
 220                            SQ   Sequence:                   Number of sequences in alignment.\r
 221                            AM   Alignment Method        The order ls and fs hits are aligned to the model to build the full align.\r
 222                            //                               End of alignment.\r
 223 \r
 224                            Optional fields:\r
 225                            ----------------\r
 226 \r
 227                            DC   Database Comment:           Comment about database reference.\r
 228                            DR   Database Reference:         Reference to external database.\r
 229                            RC   Reference Comment:          Comment about literature reference.\r
 230                            RN   Reference Number:           Reference Number.\r
 231                            RM   Reference Medline:          Eight digit medline UI number.\r
 232                            RT   Reference Title:            Reference Title.\r
 233                            RA   Reference Author:           Reference Author\r
 234                            RL   Reference Location:         Journal location.\r
 235                            PI   Previous identifier:        Record of all previous ID lines.\r
 236                            KW   Keywords:                   Keywords.\r
 237                            CC   Comment:                    Comments.\r
 238                            NE   Pfam accession:         Indicates a nested domain.\r
 239                            NL   Location:                   Location of nested domains - sequence ID, start and end of insert.\r
 240 \r
 241                            Obsolete fields:\r
 242                            -----------\r
 243                            AL   Alignment method of seed:   The method used to align the seed members.\r
 244                      */\r
 245                     // Let's save the annotations, maybe we'll be able to do something with them later...\r
 246                     Regex an = new Regex("(\\w+)\\s*(.*)");\r
 247                     if (an.search(annContent)) alAnn.put(an.stringMatched(1), an.stringMatched(2));\r
 248                 }\r
 249                 else if(annType.equals("GS"))\r
 250                 {\r
 251                     // Generic per-Sequence annotation, free text\r
 252                     /* Pfam uses these features:\r
 253                         Feature                    Description\r
 254                         ---------------------      -----------\r
 255                         AC <accession>             ACcession number\r
 256                         DE <freetext>              DEscription\r
 257                         DR <db>; <accession>;      Database Reference\r
 258                         OS <organism>              OrganiSm (species)\r
 259                         OC <clade>                 Organism Classification (clade, etc.)\r
 260                         LO <look>                  Look (Color, etc.)\r
 261                     */\r
 262                     if (s.search(annContent))\r
 263                     {\r
 264                         String acc = s.stringMatched(1);\r
 265                         String type = s.stringMatched(2);\r
 266                         String content = s.stringMatched(3);\r
 267 \r
 268                         Hashtable ann;\r
 269                         if (seqAnn.containsKey(acc))\r
 270                         {\r
 271                             ann = (Hashtable) seqAnn.get(acc);\r
 272                         }\r
 273                         else\r
 274                         {\r
 275                             ann = new Hashtable();\r
 276                         }\r
 277                         ann.put(type, content);\r
 278                         seqAnn.put(acc, ann);\r
 279                     }\r
 280                     else\r
 281                     {\r
 282                         throw new IOException("Error parsing " + line);\r
 283                     }\r
 284                 }\r
 285                 else if(annType.equals("GC"))\r
 286                 {\r
 287                     // Generic per-Column annotation, exactly 1 char per column\r
 288                 }\r
 289                 else if(annType.equals("GR"))\r
 290                 {\r
 291                     // Generic per-Sequence AND per-Column markup, exactly 1 char per column\r
 292                     /*\r
 293                         Feature   Description            Markup letters\r
 294                         -------   -----------            --------------\r
 295                         SS        Secondary Structure    [HGIEBTSCX]\r
 296                         SA        Surface Accessibility  [0-9X]\r
 297                                       (0=0%-10%; ...; 9=90%-100%)\r
 298                         TM        TransMembrane          [Mio]\r
 299                         PP        Posterior Probability  [0-9*]\r
 300                                       (0=0.00-0.05; 1=0.05-0.15; *=0.95-1.00)\r
 301                         LI        LIgand binding         [*]\r
 302                         AS        Active Site            [*]\r
 303                         IN        INtron (in or after)   [0-2]\r
 304                      */\r
 305                     if (s.search(annContent))\r
 306                     {\r
 307                         String acc = s.stringMatched(1);\r
 308                         String type = s.stringMatched(2);\r
 309                         String seq = s.stringMatched(3);\r
 310                         String description = new String();\r
 311 \r
 312                         // Check for additional information about the current annotation\r
 313                         if (x.search(seq))\r
 314                         {\r
 315                             description = x.stringMatched(1);\r
 316                             seq = x.stringMatched(2);\r
 317                         }\r
 318                         // sequence id with from-to fields\r
 319 \r
 320                         Hashtable ann;\r
 321                         // Get an object with all the annotations for this sequence\r
 322                         if (seqAnn.containsKey(acc))\r
 323                         {\r
 324                             //logger.debug("Found annotations for " + acc);\r
 325                             ann = (Hashtable) seqAnn.get(acc);\r
 326                         }\r
 327                         else\r
 328                         {\r
 329                             //logger.debug("Creating new annotations holder for " + acc);\r
 330                             ann = new Hashtable();\r
 331                             seqAnn.put(acc, ann);\r
 332                         }\r
 333 \r
 334                         Hashtable features;\r
 335                         // Get an object with all the content for an annotation\r
 336                         if (ann.containsKey("features"))\r
 337                         {\r
 338                             //logger.debug("Found features for " + acc);\r
 339                             features = (Hashtable) ann.get("features");\r
 340                         }\r
 341                         else\r
 342                         {\r
 343                             //logger.debug("Creating new features holder for " + acc);\r
 344                             features = new Hashtable();\r
 345                             ann.put("features", features);\r
 346                         }\r
 347 \r
 348                         Hashtable content;\r
 349                         if (features.containsKey(this.id2type(type)))\r
 350                         {\r
 351                             //logger.debug("Found content for " + this.id2type(type));\r
 352                             content = (Hashtable) features.get(this.id2type(type));\r
 353                         }\r
 354                         else\r
 355                         {\r
 356                             //logger.debug("Creating new content holder for " + this.id2type(type));\r
 357                             content = new Hashtable();\r
 358                             features.put(this.id2type(type), content);\r
 359                         }\r
 360                         String ns = (String) content.get(description);\r
 361                         if (ns == null) ns = "";\r
 362                         ns += seq;\r
 363                         content.put(description, seq);\r
 364                     }\r
 365                     else\r
 366                     {\r
 367                         throw new IOException("Error parsing " + line);\r
 368                     }\r
 369                 }\r
 370                 else\r
 371                 {\r
 372                     throw new IOException("Unknown annotation detected: " + annType + " " + annContent);\r
 373                 }\r
 374             }\r
 375         }\r
 376     }\r
 377 \r
 378     public static String print(SequenceI[] s)\r
 379     {\r
 380         return "not yet implemented";\r
 381     }\r
 382 \r
 383     public String print()\r
 384     {\r
 385         return print(getSeqsAsArray());\r
 386     }\r
 387 \r
 388     private String id2type(String id)\r
 389     {\r
 390         // GR ids\r
 391         if (id.equals("SS")) return "secondary structure";\r
 392         else if (id.equals("SA")) return "surface accessibility";\r
 393         else if (id.equals("TM")) return "transmembrane";\r
 394         else if (id.equals("PP")) return "posterior probability";\r
 395         else if (id.equals("LI")) return "ligand binding";\r
 396         else if (id.equals("AS")) return "active site";\r
 397         else if (id.equals("IN")) return "intron";\r
 398         else if (id.equals("IR")) return "interacting residue";\r
 399         // GS ids\r
 400         else if (id.equals("AC")) return "accession";\r
 401         else if (id.equals("OS")) return "organism";\r
 402         else if (id.equals("CL")) return "class";\r
 403         else if (id.equals("DE")) return "description";\r
 404         else if (id.equals("DR")) return "reference";\r
 405         else if (id.equals("LO")) return "look";\r
 406         else return null;\r
 407     }\r
 408 }\r