src/jalview/io/StockholmFile.java

   1 /*\r
   2  * Jalview - A Sequence Alignment Editor and Viewer\r
   3  * Copyright (C) 2007 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle\r
   4  *\r
   5  * This program is free software; you can redistribute it and/or\r
   6  * modify it under the terms of the GNU General Public License\r
   7  * as published by the Free Software Foundation; either version 2\r
   8  * of the License, or (at your option) any later version.\r
   9  *\r
  10  * This program is distributed in the hope that it will be useful,\r
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of\r
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\r
  13  * GNU General Public License for more details.\r
  14  *\r
  15  * You should have received a copy of the GNU General Public License\r
  16  * along with this program; if not, write to the Free Software\r
  17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA\r
  18  */\r
  19 /*\r
  20  * This extension was written by Benjamin Schuster-Boeckler at sanger.ac.uk\r
  21  */\r
  22 package jalview.io;\r
  23 \r
  24 import java.io.*;\r
  25 import java.util.*;\r
  26 \r
  27 import com.stevesoft.pat.*;\r
  28 import jalview.datamodel.*;\r
  29 \r
  30 //import org.apache.log4j.*;\r
  31 \r
  32 /**\r
  33  * This class is supposed to parse a Stockholm format file into Jalview\r
  34  * @author bsb at sanger.ac.uk\r
  35  * @version 0.3\r
  36  */\r
  37 public class StockholmFile\r
  38     extends AlignFile\r
  39 {\r
  40   //static Logger logger = Logger.getLogger("jalview.io.StockholmFile");\r
  41 \r
  42   public StockholmFile()\r
  43   {\r
  44   }\r
  45 \r
  46   public StockholmFile(String inFile, String type)\r
  47       throws IOException\r
  48   {\r
  49     super(inFile, type);\r
  50   }\r
  51 \r
  52   public void initData()\r
  53   {\r
  54     super.initData();\r
  55   }\r
  56 \r
  57   /**\r
  58    * Parse a file in Stockholm format into Jalview's data model. The file has\r
  59    * to be passed at construction time\r
  60    * @throws IOException If there is an error with the input file\r
  61    */\r
  62   public void parse()\r
  63       throws IOException\r
  64   {\r
  65     StringBuffer treeString=new StringBuffer();\r
  66     String treeName = null;\r
  67     // --------------- Variable Definitions -------------------\r
  68     String line;\r
  69     String version;\r
  70     //  String id;\r
  71      Hashtable seqAnn = new Hashtable(); // Sequence related annotations\r
  72     Hashtable seqs = new Hashtable();\r
  73     Regex p, r, rend, s, x;\r
  74 \r
  75     // ------------------ Parsing File ----------------------\r
  76     // First, we have to check that this file has STOCKHOLM format, i.e. the first line must match\r
  77     r = new Regex("# STOCKHOLM ([\\d\\.]+)");\r
  78     if (!r.search(nextLine()))\r
  79     {\r
  80       throw new IOException("This file is not in valid STOCKHOLM format: First line does not contain '# STOCKHOLM'");\r
  81     }\r
  82     else\r
  83     {\r
  84       version = r.stringMatched(1);\r
  85       //logger.debug("Stockholm version: " + version);\r
  86     }\r
  87 \r
  88 //      We define some Regexes here that will be used regularily later\r
  89     rend = new Regex("\\/\\/"); // Find the end of an alignment\r
  90     p = new Regex("(\\S+)\\/(\\d+)\\-(\\d+)"); // split sequence id in id/from/to\r
  91     s = new Regex("(\\S+)\\s+(\\w{2})\\s+(.*)"); // Parses annotation subtype\r
  92     r = new Regex("#=(G[FSRC]?)\\s+(.*)"); // Finds any annotation line\r
  93     x = new Regex("(\\S+)\\s+(\\S+)"); //split id from sequence\r
  94 \r
  95     rend.optimize();\r
  96     p.optimize();\r
  97     s.optimize();\r
  98     r.optimize();\r
  99     x.optimize();\r
 100 \r
 101     while ( (line = nextLine()) != null)\r
 102     {\r
 103       if (line.length() == 0)\r
 104       {\r
 105         continue;\r
 106       }\r
 107       if (rend.search(line))\r
 108       {\r
 109 //              End of the alignment, pass stuff back\r
 110 \r
 111         this.noSeqs = seqs.size();\r
 112         //logger.debug("Number of sequences: " + this.noSeqs);\r
 113         Enumeration accs = seqs.keys();\r
 114         while (accs.hasMoreElements())\r
 115         {\r
 116           String acc = (String) accs.nextElement();\r
 117           //logger.debug("Processing sequence " + acc);\r
 118           String seq = (String) seqs.get(acc);\r
 119           if (maxLength < seq.length())\r
 120           {\r
 121             maxLength = seq.length();\r
 122           }\r
 123           int start = 1;\r
 124           int end = -1;\r
 125           String sid = acc;\r
 126           // Retrieve hash of annotations for this accession\r
 127           Hashtable accAnnotations = null;\r
 128           \r
 129           if (seqAnn!=null && seqAnn.containsKey(acc))\r
 130           {\r
 131             accAnnotations = (Hashtable) seqAnn.get(acc);\r
 132           }\r
 133           \r
 134           // Split accession in id and from/to\r
 135           if (p.search(acc))\r
 136           {\r
 137             sid = p.stringMatched(1);\r
 138             start = Integer.parseInt(p.stringMatched(2));\r
 139             end = Integer.parseInt(p.stringMatched(3));\r
 140           }\r
 141           //logger.debug(sid + ", " + start + ", " + end);\r
 142 \r
 143           Sequence seqO = new Sequence(sid, seq, start, end);\r
 144           // Add Description (if any)\r
 145           if (accAnnotations!=null && accAnnotations.containsKey("DE"))\r
 146           {\r
 147             String desc = (String) accAnnotations.get("DE");\r
 148             seqO.setDescription((desc==null)?"" : desc);\r
 149           }\r
 150           // Add DB References (if any)\r
 151           if (accAnnotations!=null && accAnnotations.containsKey("DR"))\r
 152           {\r
 153             String dbr = (String) accAnnotations.get("DR");\r
 154             if (dbr!=null && dbr.indexOf(";")>-1)\r
 155             {\r
 156               String src = dbr.substring(0, dbr.indexOf(";"));\r
 157               String acn = dbr.substring(dbr.indexOf(";")+1);\r
 158               DBRefEntry dbref = new DBRefEntry(jalview.util.DBRefUtils.getCanonicalName(src), acn, "");\r
 159               seqO.addDBRef(dbref);\r
 160             }\r
 161           }\r
 162           Hashtable features = null;\r
 163           // We need to adjust the positions of all features to account for gaps\r
 164           try\r
 165           {\r
 166             features = (Hashtable) accAnnotations.get(\r
 167                 "features");\r
 168           }\r
 169           catch (java.lang.NullPointerException e)\r
 170           {\r
 171             //loggerwarn("Getting Features for " + acc + ": " + e.getMessage());\r
 172             //continue;\r
 173           }\r
 174           // if we have features\r
 175           if (features != null)\r
 176           {\r
 177             Enumeration i = features.keys();\r
 178             while (i.hasMoreElements())\r
 179             {\r
 180               // TODO: parse out secondary structure annotation as annotation row\r
 181               // TODO: parse out scores as annotation row\r
 182               // TODO: map coding region to core jalview feature types\r
 183               String type = i.nextElement().toString();\r
 184               Hashtable content = (Hashtable) features.get(type);\r
 185 \r
 186               Enumeration j = content.keys();\r
 187               while (j.hasMoreElements())\r
 188               {\r
 189                 String desc = j.nextElement().toString();\r
 190                 String ns = content.get(desc).toString();\r
 191                 char[] byChar = ns.toCharArray();\r
 192                 for (int k = 0; k < byChar.length; k++)\r
 193                 {\r
 194                   char c = byChar[k];\r
 195                   if (! (c == ' ' || c == '_' ||\r
 196                          c == '-'))\r
 197                   {\r
 198                     int new_pos = seqO.findPosition(k);\r
 199                     SequenceFeature feat =\r
 200                         new SequenceFeature(type,\r
 201                                             desc, new_pos, new_pos, 0f, null);\r
 202 \r
 203                     seqO.addSequenceFeature(feat);\r
 204                   }\r
 205                 }\r
 206               }\r
 207 \r
 208             }\r
 209 \r
 210           }\r
 211           //logger.debug("Adding seq " + acc + " from "  + start + " to " + end + ": " + seq);\r
 212           this.seqs.addElement(seqO);\r
 213         }\r
 214       }\r
 215       else if (!r.search(line))\r
 216       {\r
 217         //System.err.println("Found sequence line: " + line);\r
 218 \r
 219         // Split sequence in sequence and accession parts\r
 220         if (!x.search(line))\r
 221         {\r
 222           //logger.error("Could not parse sequence line: " + line);\r
 223           throw new IOException("Could not parse sequence line: " + line);\r
 224         }\r
 225         String ns = (String) seqs.get(x.stringMatched(1));\r
 226         if (ns == null)\r
 227         {\r
 228           ns = "";\r
 229         }\r
 230         ns += x.stringMatched(2);\r
 231 \r
 232         seqs.put(x.stringMatched(1), ns);\r
 233       }\r
 234       else\r
 235       {\r
 236         String annType = r.stringMatched(1);\r
 237         String annContent = r.stringMatched(2);\r
 238 \r
 239         //System.err.println("type:" + annType + " content: " + annContent);\r
 240 \r
 241         if (annType.equals("GF"))\r
 242         {\r
 243           /* Generic per-File annotation, free text\r
 244            * Magic features:\r
 245            * #=GF NH <tree in New Hampshire eXtended format>\r
 246            * #=GF TN <Unique identifier for the next tree>\r
 247            * Pfam descriptions:\r
 248               7. DESCRIPTION OF FIELDS\r
 249 \r
 250                  Compulsory fields:\r
 251                  ------------------\r
 252 \r
 253                  AC   Accession number:           Accession number in form PFxxxxx.version or PBxxxxxx.\r
 254                  ID   Identification:             One word name for family.\r
 255                  DE   Definition:                 Short description of family.\r
 256                  AU   Author:                     Authors of the entry.\r
 257                  SE   Source of seed:             The source suggesting the seed members belong to one family.\r
 258            GA   Gathering method:           Search threshold to build the full alignment.\r
 259                  TC   Trusted Cutoff:             Lowest sequence score and domain score of match in the full alignment.\r
 260                  NC   Noise Cutoff:               Highest sequence score and domain score of match not in full alignment.\r
 261                  TP   Type:                       Type of family -- presently Family, Domain, Motif or Repeat.\r
 262            SQ   Sequence:                   Number of sequences in alignment.\r
 263                  AM   Alignment Method        The order ls and fs hits are aligned to the model to build the full align.\r
 264                  //                               End of alignment.\r
 265 \r
 266                  Optional fields:\r
 267                  ----------------\r
 268 \r
 269            DC   Database Comment:           Comment about database reference.\r
 270            DR   Database Reference:         Reference to external database.\r
 271            RC   Reference Comment:          Comment about literature reference.\r
 272                  RN   Reference Number:           Reference Number.\r
 273            RM   Reference Medline:          Eight digit medline UI number.\r
 274                  RT   Reference Title:            Reference Title.\r
 275                  RA   Reference Author:           Reference Author\r
 276                  RL   Reference Location:         Journal location.\r
 277            PI   Previous identifier:        Record of all previous ID lines.\r
 278                  KW   Keywords:                   Keywords.\r
 279                  CC   Comment:                    Comments.\r
 280                  NE   Pfam accession:         Indicates a nested domain.\r
 281                  NL   Location:                   Location of nested domains - sequence ID, start and end of insert.\r
 282 \r
 283                  Obsolete fields:\r
 284                  -----------\r
 285            AL   Alignment method of seed:   The method used to align the seed members.\r
 286            */\r
 287           // Let's save the annotations, maybe we'll be able to do something with them later...\r
 288           Regex an = new Regex("(\\w+)\\s*(.*)");\r
 289           if (an.search(annContent))\r
 290           {\r
 291             if (an.stringMatched(1).equals("NH"))\r
 292             {\r
 293               treeString.append(an.stringMatched(2));\r
 294             } else \r
 295             if (an.stringMatched(1).equals("TN")) {\r
 296               if (treeString.length()>0)\r
 297               {\r
 298                 if (treeName==null)\r
 299                 {\r
 300                   treeName = "Tree "+(getTreeCount()+1);\r
 301                 }\r
 302                 addNewickTree(treeName, treeString.toString());\r
 303               }\r
 304               treeName = an.stringMatched(2);\r
 305               treeString = new StringBuffer();\r
 306             }\r
 307             setAlignmentProperty(an.stringMatched(1), an.stringMatched(2));\r
 308           }\r
 309         }\r
 310         else if (annType.equals("GS"))\r
 311         {\r
 312           // Generic per-Sequence annotation, free text\r
 313           /* Pfam uses these features:\r
 314               Feature                    Description\r
 315               ---------------------      -----------\r
 316               AC <accession>             ACcession number\r
 317               DE <freetext>              DEscription\r
 318               DR <db>; <accession>;      Database Reference\r
 319               OS <organism>              OrganiSm (species)\r
 320               OC <clade>                 Organism Classification (clade, etc.)\r
 321               LO <look>                  Look (Color, etc.)\r
 322            */\r
 323           if (s.search(annContent))\r
 324           {\r
 325             String acc = s.stringMatched(1);\r
 326             String type = s.stringMatched(2);\r
 327             String content = s.stringMatched(3);\r
 328             // TODO: store DR in a vector.\r
 329             // TODO: store AC according to generic file db annotation.\r
 330             Hashtable ann;\r
 331             if (seqAnn.containsKey(acc))\r
 332             {\r
 333               ann = (Hashtable) seqAnn.get(acc);\r
 334             }\r
 335             else\r
 336             {\r
 337               ann = new Hashtable();\r
 338             }\r
 339             ann.put(type, content);\r
 340             seqAnn.put(acc, ann);\r
 341           }\r
 342           else\r
 343           {\r
 344             throw new IOException("Error parsing " + line);\r
 345           }\r
 346         }\r
 347         else if (annType.equals("GC"))\r
 348         {\r
 349           // Generic per-Column annotation, exactly 1 char per column\r
 350           // always need a label.\r
 351           if (x.search(annContent))\r
 352           {\r
 353             // parse out and create alignment annotation directly.\r
 354             AlignmentAnnotation annotation = parseAnnotationRow(x.stringMatched(1), x.stringMatched(2));\r
 355             annotations.addElement(annotation);\r
 356           }\r
 357         }\r
 358         else if (annType.equals("GR"))\r
 359         {\r
 360           // Generic per-Sequence AND per-Column markup, exactly 1 char per column\r
 361           /*\r
 362               Feature   Description            Markup letters\r
 363               -------   -----------            --------------\r
 364               SS        Secondary Structure    [HGIEBTSCX]\r
 365               SA        Surface Accessibility  [0-9X]\r
 366                             (0=0%-10%; ...; 9=90%-100%)\r
 367               TM        TransMembrane          [Mio]\r
 368               PP        Posterior Probability  [0-9*]\r
 369                             (0=0.00-0.05; 1=0.05-0.15; *=0.95-1.00)\r
 370               LI        LIgand binding         [*]\r
 371               AS        Active Site            [*]\r
 372               IN        INtron (in or after)   [0-2]\r
 373            */\r
 374           if (s.search(annContent))\r
 375           {\r
 376             String acc = s.stringMatched(1);\r
 377             String type = s.stringMatched(2);\r
 378             String seq = s.stringMatched(3);\r
 379             String description = new String();\r
 380 \r
 381             // Check for additional information about the current annotation\r
 382             if (x.search(seq))\r
 383             {\r
 384               description = x.stringMatched(1);\r
 385               seq = x.stringMatched(2);\r
 386             }\r
 387             // sequence id with from-to fields\r
 388 \r
 389             Hashtable ann;\r
 390             // Get an object with all the annotations for this sequence\r
 391             if (seqAnn.containsKey(acc))\r
 392             {\r
 393               //logger.debug("Found annotations for " + acc);\r
 394               ann = (Hashtable) seqAnn.get(acc);\r
 395             }\r
 396             else\r
 397             {\r
 398               //logger.debug("Creating new annotations holder for " + acc);\r
 399               ann = new Hashtable();\r
 400               seqAnn.put(acc, ann);\r
 401             }\r
 402 \r
 403             Hashtable features;\r
 404             // Get an object with all the content for an annotation\r
 405             if (ann.containsKey("features"))\r
 406             {\r
 407               //logger.debug("Found features for " + acc);\r
 408               features = (Hashtable) ann.get("features");\r
 409             }\r
 410             else\r
 411             {\r
 412               //logger.debug("Creating new features holder for " + acc);\r
 413               features = new Hashtable();\r
 414               ann.put("features", features);\r
 415             }\r
 416 \r
 417             Hashtable content;\r
 418             if (features.containsKey(this.id2type(type)))\r
 419             {\r
 420               //logger.debug("Found content for " + this.id2type(type));\r
 421               content = (Hashtable) features.get(this.id2type(type));\r
 422             }\r
 423             else\r
 424             {\r
 425               //logger.debug("Creating new content holder for " + this.id2type(type));\r
 426               content = new Hashtable();\r
 427               features.put(this.id2type(type), content);\r
 428             }\r
 429             String ns = (String) content.get(description);\r
 430             if (ns == null)\r
 431             {\r
 432               ns = "";\r
 433             }\r
 434             ns += seq;\r
 435             content.put(description, seq);\r
 436           }\r
 437           else\r
 438           {\r
 439             throw new IOException("Error parsing " + line);\r
 440           }\r
 441         }\r
 442         else\r
 443         {\r
 444           throw new IOException("Unknown annotation detected: " + annType + " " +\r
 445                                 annContent);\r
 446         }\r
 447       }\r
 448     }\r
 449     if (treeString.length()>0)\r
 450     {\r
 451       if (treeName==null)\r
 452       {\r
 453         treeName = "Tree "+(1+getTreeCount());\r
 454       }\r
 455       addNewickTree(treeName, treeString.toString());\r
 456     }\r
 457   }\r
 458 \r
 459   private AlignmentAnnotation parseAnnotationRow(String label, String annots)\r
 460   {\r
 461     String type = (label.indexOf("_cons")==label.length()-5) ? label.substring(0, label.length()-5)\r
 462             : label;\r
 463     boolean ss = false;\r
 464     type = id2type(type);\r
 465     if (type.equals("secondary structure"))\r
 466     {\r
 467       ss=true;\r
 468     }\r
 469     // decide on secondary structure or not.\r
 470     Annotation[] els = new Annotation[annots.length()];\r
 471     for (int i = 0; i<annots.length(); i++)\r
 472     {\r
 473       String pos = annots.substring(i,i+1);\r
 474       Annotation ann;\r
 475       ann = new Annotation(pos, "", ' ', Float.NaN);\r
 476       if (ss)\r
 477       {\r
 478         ann.secondaryStructure = jalview.schemes.ResidueProperties.getDssp3state(pos).charAt(0);\r
 479         if (ann.secondaryStructure == pos.charAt(0) || pos.charAt(0)=='C')\r
 480         {\r
 481           ann.displayCharacter = "";\r
 482         } else {\r
 483           ann.displayCharacter += " ";\r
 484         }\r
 485       }\r
 486       \r
 487       els[i] = ann;\r
 488     }\r
 489     return new AlignmentAnnotation(label, label, els);\r
 490   }\r
 491   \r
 492   public static String print(SequenceI[] s)\r
 493   {\r
 494     return "not yet implemented";\r
 495   }\r
 496 \r
 497   public String print()\r
 498   {\r
 499     return print(getSeqsAsArray());\r
 500   }\r
 501 \r
 502   private String id2type(String id)\r
 503   {\r
 504     // GR ids\r
 505     if (id.equals("SS"))\r
 506     {\r
 507       return "secondary structure";\r
 508     }\r
 509     else if (id.equals("SA"))\r
 510     {\r
 511       return "surface accessibility";\r
 512     }\r
 513     else if (id.equals("TM"))\r
 514     {\r
 515       return "transmembrane";\r
 516     }\r
 517     else if (id.equals("PP"))\r
 518     {\r
 519       return "posterior probability";\r
 520     }\r
 521     else if (id.equals("LI"))\r
 522     {\r
 523       return "ligand binding";\r
 524     }\r
 525     else if (id.equals("AS"))\r
 526     {\r
 527       return "active site";\r
 528     }\r
 529     else if (id.equals("IN"))\r
 530     {\r
 531       return "intron";\r
 532     }\r
 533     else if (id.equals("IR"))\r
 534     {\r
 535       return "interacting residue";\r
 536     }\r
 537     // GS ids\r
 538     else if (id.equals("AC"))\r
 539     {\r
 540       return "accession";\r
 541     }\r
 542     else if (id.equals("OS"))\r
 543     {\r
 544       return "organism";\r
 545     }\r
 546     else if (id.equals("CL"))\r
 547     {\r
 548       return "class";\r
 549     }\r
 550     else if (id.equals("DE"))\r
 551     {\r
 552       return "description";\r
 553     }\r
 554     else if (id.equals("DR"))\r
 555     {\r
 556       return "reference";\r
 557     }\r
 558     else if (id.equals("LO"))\r
 559     {\r
 560       return "look";\r
 561     }\r
 562     else\r
 563     {\r
 564       return null;\r
 565     }\r
 566   }\r
 567 }\r