src/jalview/io/StockholmFile.java

   1 /*\r
   2  * Jalview - A Sequence Alignment Editor and Viewer\r
   3  * Copyright (C) 2007 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle\r
   4  *\r
   5  * This program is free software; you can redistribute it and/or\r
   6  * modify it under the terms of the GNU General Public License\r
   7  * as published by the Free Software Foundation; either version 2\r
   8  * of the License, or (at your option) any later version.\r
   9  *\r
  10  * This program is distributed in the hope that it will be useful,\r
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of\r
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\r
  13  * GNU General Public License for more details.\r
  14  *\r
  15  * You should have received a copy of the GNU General Public License\r
  16  * along with this program; if not, write to the Free Software\r
  17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA\r
  18  */\r
  19 /*\r
  20  * This extension was written by Benjamin Schuster-Boeckler at sanger.ac.uk\r
  21  */\r
  22 package jalview.io;\r
  23 \r
  24 import java.io.*;\r
  25 import java.util.*;\r
  26 \r
  27 import com.stevesoft.pat.*;\r
  28 import jalview.datamodel.*;\r
  29 \r
  30 //import org.apache.log4j.*;\r
  31 \r
  32 /**\r
  33  * This class is supposed to parse a Stockholm format file into Jalview\r
  34  * @author bsb at sanger.ac.uk\r
  35  * @version 0.3\r
  36  */\r
  37 public class StockholmFile\r
  38     extends AlignFile\r
  39 {\r
  40   //static Logger logger = Logger.getLogger("jalview.io.StockholmFile");\r
  41 \r
  42   public StockholmFile()\r
  43   {\r
  44   }\r
  45 \r
  46   public StockholmFile(String inFile, String type)\r
  47       throws IOException\r
  48   {\r
  49     super(inFile, type);\r
  50   }\r
  51 \r
  52   public void initData()\r
  53   {\r
  54     super.initData();\r
  55   }\r
  56 \r
  57   /**\r
  58    * Parse a file in Stockholm format into Jalview's data model. The file has\r
  59    * to be passed at construction time\r
  60    * @throws IOException If there is an error with the input file\r
  61    */\r
  62   public void parse()\r
  63       throws IOException\r
  64   {\r
  65     // --------------- Variable Definitions -------------------\r
  66     String line;\r
  67     String version;\r
  68     //  String id;\r
  69     Hashtable alAnn = new Hashtable(); // Alignment wide annotations\r
  70     Hashtable seqAnn = new Hashtable(); // Sequence related annotations\r
  71     Hashtable seqs = new Hashtable();\r
  72     Regex p, r, rend, s, x;\r
  73 \r
  74     // ------------------ Parsing File ----------------------\r
  75     // First, we have to check that this file has STOCKHOLM format, i.e. the first line must match\r
  76     r = new Regex("# STOCKHOLM ([\\d\\.]+)");\r
  77     if (!r.search(nextLine()))\r
  78     {\r
  79       throw new IOException("This file is not in valid STOCKHOLM format: First line does not contain '# STOCKHOLM'");\r
  80     }\r
  81     else\r
  82     {\r
  83       version = r.stringMatched(1);\r
  84       //logger.debug("Stockholm version: " + version);\r
  85     }\r
  86 \r
  87 //      We define some Regexes here that will be used regularily later\r
  88     rend = new Regex("\\/\\/"); // Find the end of an alignment\r
  89     p = new Regex("(\\S+)\\/(\\d+)\\-(\\d+)"); // split sequence id in id/from/to\r
  90     s = new Regex("(\\S+)\\s+(\\w{2})\\s+(.*)"); // Parses annotation subtype\r
  91     r = new Regex("#=(G[FSRC]?)\\s+(.*)"); // Finds any annotation line\r
  92     x = new Regex("(\\S+)\\s+(\\S+)"); //split id from sequence\r
  93 \r
  94     rend.optimize();\r
  95     p.optimize();\r
  96     s.optimize();\r
  97     r.optimize();\r
  98     x.optimize();\r
  99 \r
 100     while ( (line = nextLine()) != null)\r
 101     {\r
 102       if (line.length() == 0)\r
 103       {\r
 104         continue;\r
 105       }\r
 106       if (rend.search(line))\r
 107       {\r
 108 //              End of the alignment, pass stuff back\r
 109 \r
 110         this.noSeqs = seqs.size();\r
 111         //logger.debug("Number of sequences: " + this.noSeqs);\r
 112         Enumeration accs = seqs.keys();\r
 113         while (accs.hasMoreElements())\r
 114         {\r
 115           String acc = (String) accs.nextElement();\r
 116           //logger.debug("Processing sequence " + acc);\r
 117           String seq = (String) seqs.get(acc);\r
 118           if (maxLength < seq.length())\r
 119           {\r
 120             maxLength = seq.length();\r
 121           }\r
 122           int start = 1;\r
 123           int end = -1;\r
 124           String sid = acc;\r
 125           // Split accession in id and from/to\r
 126           if (p.search(acc))\r
 127           {\r
 128             sid = p.stringMatched(1);\r
 129             start = Integer.parseInt(p.stringMatched(2));\r
 130             end = Integer.parseInt(p.stringMatched(3));\r
 131           }\r
 132           //logger.debug(sid + ", " + start + ", " + end);\r
 133 \r
 134           Sequence seqO = new Sequence(sid, seq, start, end);\r
 135           Hashtable features = null;\r
 136           // We need to adjust the positions of all features to account for gaps\r
 137           try\r
 138           {\r
 139             features = (Hashtable) ( (Hashtable) seqAnn.get(acc)).get(\r
 140                 "features");\r
 141           }\r
 142           catch (java.lang.NullPointerException e)\r
 143           {\r
 144             //loggerwarn("Getting Features for " + acc + ": " + e.getMessage());\r
 145             //continue;\r
 146           }\r
 147           // if we have features\r
 148           if (features != null)\r
 149           {\r
 150             Enumeration i = features.keys();\r
 151             while (i.hasMoreElements())\r
 152             {\r
 153               String type = i.nextElement().toString();\r
 154               Hashtable content = (Hashtable) features.get(type);\r
 155 \r
 156               Enumeration j = content.keys();\r
 157               while (j.hasMoreElements())\r
 158               {\r
 159                 String desc = j.nextElement().toString();\r
 160                 String ns = content.get(desc).toString();\r
 161                 char[] byChar = ns.toCharArray();\r
 162                 for (int k = 0; k < byChar.length; k++)\r
 163                 {\r
 164                   char c = byChar[k];\r
 165                   if (! (c == ' ' || c == '_' ||\r
 166                          c == '-'))\r
 167                   {\r
 168                     int new_pos = seqO.findPosition(k);\r
 169                     SequenceFeature feat =\r
 170                         new SequenceFeature(type,\r
 171                                             desc, new_pos, new_pos, 0f, null);\r
 172 \r
 173                     seqO.addSequenceFeature(feat);\r
 174                   }\r
 175                 }\r
 176               }\r
 177 \r
 178             }\r
 179 \r
 180           }\r
 181           //logger.debug("Adding seq " + acc + " from "  + start + " to " + end + ": " + seq);\r
 182           this.seqs.addElement(seqO);\r
 183         }\r
 184       }\r
 185       else if (!r.search(line))\r
 186       {\r
 187         //System.err.println("Found sequence line: " + line);\r
 188 \r
 189         // Split sequence in sequence and accession parts\r
 190         if (!x.search(line))\r
 191         {\r
 192           //logger.error("Could not parse sequence line: " + line);\r
 193           throw new IOException("Could not parse sequence line: " + line);\r
 194         }\r
 195         String ns = (String) seqs.get(x.stringMatched(1));\r
 196         if (ns == null)\r
 197         {\r
 198           ns = "";\r
 199         }\r
 200         ns += x.stringMatched(2);\r
 201 \r
 202         seqs.put(x.stringMatched(1), ns);\r
 203       }\r
 204       else\r
 205       {\r
 206         String annType = r.stringMatched(1);\r
 207         String annContent = r.stringMatched(2);\r
 208 \r
 209         //System.err.println("type:" + annType + " content: " + annContent);\r
 210 \r
 211         if (annType.equals("GF"))\r
 212         {\r
 213           /* Generic per-File annotation, free text\r
 214            * Magic features:\r
 215            * #=GF NH <tree in New Hampshire eXtended format>\r
 216            * #=GF TN <Unique identifier for the next tree>\r
 217            * Pfam descriptions:\r
 218               7. DESCRIPTION OF FIELDS\r
 219 \r
 220                  Compulsory fields:\r
 221                  ------------------\r
 222 \r
 223                  AC   Accession number:           Accession number in form PFxxxxx.version or PBxxxxxx.\r
 224                  ID   Identification:             One word name for family.\r
 225                  DE   Definition:                 Short description of family.\r
 226                  AU   Author:                     Authors of the entry.\r
 227                  SE   Source of seed:             The source suggesting the seed members belong to one family.\r
 228            GA   Gathering method:           Search threshold to build the full alignment.\r
 229                  TC   Trusted Cutoff:             Lowest sequence score and domain score of match in the full alignment.\r
 230                  NC   Noise Cutoff:               Highest sequence score and domain score of match not in full alignment.\r
 231                  TP   Type:                       Type of family -- presently Family, Domain, Motif or Repeat.\r
 232            SQ   Sequence:                   Number of sequences in alignment.\r
 233                  AM   Alignment Method        The order ls and fs hits are aligned to the model to build the full align.\r
 234                  //                               End of alignment.\r
 235 \r
 236                  Optional fields:\r
 237                  ----------------\r
 238 \r
 239            DC   Database Comment:           Comment about database reference.\r
 240            DR   Database Reference:         Reference to external database.\r
 241            RC   Reference Comment:          Comment about literature reference.\r
 242                  RN   Reference Number:           Reference Number.\r
 243            RM   Reference Medline:          Eight digit medline UI number.\r
 244                  RT   Reference Title:            Reference Title.\r
 245                  RA   Reference Author:           Reference Author\r
 246                  RL   Reference Location:         Journal location.\r
 247            PI   Previous identifier:        Record of all previous ID lines.\r
 248                  KW   Keywords:                   Keywords.\r
 249                  CC   Comment:                    Comments.\r
 250                  NE   Pfam accession:         Indicates a nested domain.\r
 251                  NL   Location:                   Location of nested domains - sequence ID, start and end of insert.\r
 252 \r
 253                  Obsolete fields:\r
 254                  -----------\r
 255            AL   Alignment method of seed:   The method used to align the seed members.\r
 256            */\r
 257           // Let's save the annotations, maybe we'll be able to do something with them later...\r
 258           Regex an = new Regex("(\\w+)\\s*(.*)");\r
 259           if (an.search(annContent))\r
 260           {\r
 261             alAnn.put(an.stringMatched(1), an.stringMatched(2));\r
 262           }\r
 263         }\r
 264         else if (annType.equals("GS"))\r
 265         {\r
 266           // Generic per-Sequence annotation, free text\r
 267           /* Pfam uses these features:\r
 268               Feature                    Description\r
 269               ---------------------      -----------\r
 270               AC <accession>             ACcession number\r
 271               DE <freetext>              DEscription\r
 272               DR <db>; <accession>;      Database Reference\r
 273               OS <organism>              OrganiSm (species)\r
 274               OC <clade>                 Organism Classification (clade, etc.)\r
 275               LO <look>                  Look (Color, etc.)\r
 276            */\r
 277           if (s.search(annContent))\r
 278           {\r
 279             String acc = s.stringMatched(1);\r
 280             String type = s.stringMatched(2);\r
 281             String content = s.stringMatched(3);\r
 282 \r
 283             Hashtable ann;\r
 284             if (seqAnn.containsKey(acc))\r
 285             {\r
 286               ann = (Hashtable) seqAnn.get(acc);\r
 287             }\r
 288             else\r
 289             {\r
 290               ann = new Hashtable();\r
 291             }\r
 292             ann.put(type, content);\r
 293             seqAnn.put(acc, ann);\r
 294           }\r
 295           else\r
 296           {\r
 297             throw new IOException("Error parsing " + line);\r
 298           }\r
 299         }\r
 300         else if (annType.equals("GC"))\r
 301         {\r
 302           System.out.println(annContent);\r
 303           // Generic per-Column annotation, exactly 1 char per column\r
 304         }\r
 305         else if (annType.equals("GR"))\r
 306         {\r
 307           // Generic per-Sequence AND per-Column markup, exactly 1 char per column\r
 308           /*\r
 309               Feature   Description            Markup letters\r
 310               -------   -----------            --------------\r
 311               SS        Secondary Structure    [HGIEBTSCX]\r
 312               SA        Surface Accessibility  [0-9X]\r
 313                             (0=0%-10%; ...; 9=90%-100%)\r
 314               TM        TransMembrane          [Mio]\r
 315               PP        Posterior Probability  [0-9*]\r
 316                             (0=0.00-0.05; 1=0.05-0.15; *=0.95-1.00)\r
 317               LI        LIgand binding         [*]\r
 318               AS        Active Site            [*]\r
 319               IN        INtron (in or after)   [0-2]\r
 320            */\r
 321           if (s.search(annContent))\r
 322           {\r
 323             String acc = s.stringMatched(1);\r
 324             String type = s.stringMatched(2);\r
 325             String seq = s.stringMatched(3);\r
 326             String description = new String();\r
 327 \r
 328             // Check for additional information about the current annotation\r
 329             if (x.search(seq))\r
 330             {\r
 331               description = x.stringMatched(1);\r
 332               seq = x.stringMatched(2);\r
 333             }\r
 334             // sequence id with from-to fields\r
 335 \r
 336             Hashtable ann;\r
 337             // Get an object with all the annotations for this sequence\r
 338             if (seqAnn.containsKey(acc))\r
 339             {\r
 340               //logger.debug("Found annotations for " + acc);\r
 341               ann = (Hashtable) seqAnn.get(acc);\r
 342             }\r
 343             else\r
 344             {\r
 345               //logger.debug("Creating new annotations holder for " + acc);\r
 346               ann = new Hashtable();\r
 347               seqAnn.put(acc, ann);\r
 348             }\r
 349 \r
 350             Hashtable features;\r
 351             // Get an object with all the content for an annotation\r
 352             if (ann.containsKey("features"))\r
 353             {\r
 354               //logger.debug("Found features for " + acc);\r
 355               features = (Hashtable) ann.get("features");\r
 356             }\r
 357             else\r
 358             {\r
 359               //logger.debug("Creating new features holder for " + acc);\r
 360               features = new Hashtable();\r
 361               ann.put("features", features);\r
 362             }\r
 363 \r
 364             Hashtable content;\r
 365             if (features.containsKey(this.id2type(type)))\r
 366             {\r
 367               //logger.debug("Found content for " + this.id2type(type));\r
 368               content = (Hashtable) features.get(this.id2type(type));\r
 369             }\r
 370             else\r
 371             {\r
 372               //logger.debug("Creating new content holder for " + this.id2type(type));\r
 373               content = new Hashtable();\r
 374               features.put(this.id2type(type), content);\r
 375             }\r
 376             String ns = (String) content.get(description);\r
 377             if (ns == null)\r
 378             {\r
 379               ns = "";\r
 380             }\r
 381             ns += seq;\r
 382             content.put(description, seq);\r
 383           }\r
 384           else\r
 385           {\r
 386             throw new IOException("Error parsing " + line);\r
 387           }\r
 388         }\r
 389         else\r
 390         {\r
 391           throw new IOException("Unknown annotation detected: " + annType + " " +\r
 392                                 annContent);\r
 393         }\r
 394       }\r
 395     }\r
 396   }\r
 397 \r
 398   public static String print(SequenceI[] s)\r
 399   {\r
 400     return "not yet implemented";\r
 401   }\r
 402 \r
 403   public String print()\r
 404   {\r
 405     return print(getSeqsAsArray());\r
 406   }\r
 407 \r
 408   private String id2type(String id)\r
 409   {\r
 410     // GR ids\r
 411     if (id.equals("SS"))\r
 412     {\r
 413       return "secondary structure";\r
 414     }\r
 415     else if (id.equals("SA"))\r
 416     {\r
 417       return "surface accessibility";\r
 418     }\r
 419     else if (id.equals("TM"))\r
 420     {\r
 421       return "transmembrane";\r
 422     }\r
 423     else if (id.equals("PP"))\r
 424     {\r
 425       return "posterior probability";\r
 426     }\r
 427     else if (id.equals("LI"))\r
 428     {\r
 429       return "ligand binding";\r
 430     }\r
 431     else if (id.equals("AS"))\r
 432     {\r
 433       return "active site";\r
 434     }\r
 435     else if (id.equals("IN"))\r
 436     {\r
 437       return "intron";\r
 438     }\r
 439     else if (id.equals("IR"))\r
 440     {\r
 441       return "interacting residue";\r
 442     }\r
 443     // GS ids\r
 444     else if (id.equals("AC"))\r
 445     {\r
 446       return "accession";\r
 447     }\r
 448     else if (id.equals("OS"))\r
 449     {\r
 450       return "organism";\r
 451     }\r
 452     else if (id.equals("CL"))\r
 453     {\r
 454       return "class";\r
 455     }\r
 456     else if (id.equals("DE"))\r
 457     {\r
 458       return "description";\r
 459     }\r
 460     else if (id.equals("DR"))\r
 461     {\r
 462       return "reference";\r
 463     }\r
 464     else if (id.equals("LO"))\r
 465     {\r
 466       return "look";\r
 467     }\r
 468     else\r
 469     {\r
 470       return null;\r
 471     }\r
 472   }\r
 473 }\r