src/jalview/io/StockholmFile.java

   1 /*\r
   2  * Jalview - A Sequence Alignment Editor and Viewer\r
   3  * Copyright (C) 2007 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle\r
   4  *\r
   5  * This program is free software; you can redistribute it and/or\r
   6  * modify it under the terms of the GNU General Public License\r
   7  * as published by the Free Software Foundation; either version 2\r
   8  * of the License, or (at your option) any later version.\r
   9  *\r
  10  * This program is distributed in the hope that it will be useful,\r
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of\r
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\r
  13  * GNU General Public License for more details.\r
  14  *\r
  15  * You should have received a copy of the GNU General Public License\r
  16  * along with this program; if not, write to the Free Software\r
  17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA\r
  18  */\r
  19 /*\r
  20  * This extension was written by Benjamin Schuster-Boeckler at sanger.ac.uk\r
  21  */\r
  22 package jalview.io;\r
  23 \r
  24 import java.io.*;\r
  25 import java.util.*;\r
  26 \r
  27 import com.stevesoft.pat.*;\r
  28 import jalview.datamodel.*;\r
  29 \r
  30 // import org.apache.log4j.*;\r
  31 \r
  32 /**\r
  33  * This class is supposed to parse a Stockholm format file into Jalview\r
  34  * There are TODOs in this class: we do not know what the database source and\r
  35  * version is for the file when parsing the #GS= AC tag which associates accessions\r
  36  * with sequences. \r
  37  * Database references are also not parsed correctly: a separate reference string\r
  38  * parser must be added to parse the database reference form into Jalview's local\r
  39  * representation.\r
  40  * @author bsb at sanger.ac.uk\r
  41  * @version 0.3 + jalview mods\r
  42  * \r
  43  */\r
  44 public class StockholmFile extends AlignFile\r
  45 {\r
  46   // static Logger logger = Logger.getLogger("jalview.io.StockholmFile");\r
  47 \r
  48   public StockholmFile()\r
  49   {\r
  50   }\r
  51 \r
  52   public StockholmFile(String inFile, String type) throws IOException\r
  53   {\r
  54     super(inFile, type);\r
  55   }\r
  56 \r
  57   public void initData()\r
  58   {\r
  59     super.initData();\r
  60   }\r
  61 \r
  62   /**\r
  63    * Parse a file in Stockholm format into Jalview's data model. The file has to\r
  64    * be passed at construction time\r
  65    * \r
  66    * @throws IOException\r
  67    *           If there is an error with the input file\r
  68    */\r
  69   public void parse() throws IOException\r
  70   {\r
  71     StringBuffer treeString = new StringBuffer();\r
  72     String treeName = null;\r
  73     // --------------- Variable Definitions -------------------\r
  74     String line;\r
  75     String version;\r
  76     // String id;\r
  77     Hashtable seqAnn = new Hashtable(); // Sequence related annotations\r
  78     Hashtable seqs = new Hashtable();\r
  79     Regex p, r, rend, s, x;\r
  80 \r
  81     // ------------------ Parsing File ----------------------\r
  82     // First, we have to check that this file has STOCKHOLM format, i.e. the\r
  83     // first line must match\r
  84     r = new Regex("# STOCKHOLM ([\\d\\.]+)");\r
  85     if (!r.search(nextLine()))\r
  86     {\r
  87       throw new IOException(\r
  88               "This file is not in valid STOCKHOLM format: First line does not contain '# STOCKHOLM'");\r
  89     }\r
  90     else\r
  91     {\r
  92       version = r.stringMatched(1);\r
  93       // logger.debug("Stockholm version: " + version);\r
  94     }\r
  95 \r
  96     // We define some Regexes here that will be used regularily later\r
  97     rend = new Regex("\\/\\/"); // Find the end of an alignment\r
  98     p = new Regex("(\\S+)\\/(\\d+)\\-(\\d+)"); // split sequence id in\r
  99                                                 // id/from/to\r
 100     s = new Regex("(\\S+)\\s+(\\w{2})\\s+(.*)"); // Parses annotation subtype\r
 101     r = new Regex("#=(G[FSRC]?)\\s+(.*)"); // Finds any annotation line\r
 102     x = new Regex("(\\S+)\\s+(\\S+)"); // split id from sequence\r
 103 \r
 104     rend.optimize();\r
 105     p.optimize();\r
 106     s.optimize();\r
 107     r.optimize();\r
 108     x.optimize();\r
 109 \r
 110     while ((line = nextLine()) != null)\r
 111     {\r
 112       if (line.length() == 0)\r
 113       {\r
 114         continue;\r
 115       }\r
 116       if (rend.search(line))\r
 117       {\r
 118         // End of the alignment, pass stuff back\r
 119 \r
 120         this.noSeqs = seqs.size();\r
 121         // logger.debug("Number of sequences: " + this.noSeqs);\r
 122         Enumeration accs = seqs.keys();\r
 123         while (accs.hasMoreElements())\r
 124         {\r
 125           String acc = (String) accs.nextElement();\r
 126           // logger.debug("Processing sequence " + acc);\r
 127           String seq = (String) seqs.get(acc);\r
 128           if (maxLength < seq.length())\r
 129           {\r
 130             maxLength = seq.length();\r
 131           }\r
 132           int start = 1;\r
 133           int end = -1;\r
 134           String sid = acc;\r
 135           // Retrieve hash of annotations for this accession\r
 136           Hashtable accAnnotations = null;\r
 137 \r
 138           if (seqAnn != null && seqAnn.containsKey(acc))\r
 139           {\r
 140             accAnnotations = (Hashtable) seqAnn.get(acc);\r
 141           }\r
 142 \r
 143           // Split accession in id and from/to\r
 144           if (p.search(acc))\r
 145           {\r
 146             sid = p.stringMatched(1);\r
 147             start = Integer.parseInt(p.stringMatched(2));\r
 148             end = Integer.parseInt(p.stringMatched(3));\r
 149           }\r
 150           // logger.debug(sid + ", " + start + ", " + end);\r
 151 \r
 152           Sequence seqO = new Sequence(sid, seq, start, end);\r
 153           // Add Description (if any)\r
 154           if (accAnnotations != null && accAnnotations.containsKey("DE"))\r
 155           {\r
 156             String desc = (String) accAnnotations.get("DE");\r
 157             seqO.setDescription((desc == null) ? "" : desc);\r
 158           }\r
 159           // Add DB References (if any)\r
 160           if (accAnnotations != null && accAnnotations.containsKey("DR"))\r
 161           {\r
 162             String dbr = (String) accAnnotations.get("DR");\r
 163             if (dbr != null && dbr.indexOf(";") > -1)\r
 164             {\r
 165               String src = dbr.substring(0, dbr.indexOf(";"));\r
 166               String acn = dbr.substring(dbr.indexOf(";") + 1);\r
 167               jalview.util.DBRefUtils.parseToDbRef(seqO, src, "0", acn);\r
 168               //seqO.addDBRef(dbref);\r
 169             }\r
 170           }\r
 171           Hashtable features = null;\r
 172           // We need to adjust the positions of all features to account for gaps\r
 173           try\r
 174           {\r
 175             features = (Hashtable) accAnnotations.get("features");\r
 176           } catch (java.lang.NullPointerException e)\r
 177           {\r
 178             // loggerwarn("Getting Features for " + acc + ": " +\r
 179             // e.getMessage());\r
 180             // continue;\r
 181           }\r
 182           // if we have features\r
 183           if (features != null)\r
 184           {\r
 185             Enumeration i = features.keys();\r
 186             while (i.hasMoreElements())\r
 187             {\r
 188               // TODO: parse out secondary structure annotation as annotation\r
 189               // row\r
 190               // TODO: parse out scores as annotation row\r
 191               // TODO: map coding region to core jalview feature types\r
 192               String type = i.nextElement().toString();\r
 193               Hashtable content = (Hashtable) features.get(type);\r
 194 \r
 195               Enumeration j = content.keys();\r
 196               while (j.hasMoreElements())\r
 197               {\r
 198                 String desc = j.nextElement().toString();\r
 199                 String ns = content.get(desc).toString();\r
 200                 char[] byChar = ns.toCharArray();\r
 201                 for (int k = 0; k < byChar.length; k++)\r
 202                 {\r
 203                   char c = byChar[k];\r
 204                   if (!(c == ' ' || c == '_' || c == '-'))\r
 205                   {\r
 206                     int new_pos = seqO.findPosition(k);\r
 207                     SequenceFeature feat = new SequenceFeature(type, desc,\r
 208                             new_pos, new_pos, 0f, null);\r
 209 \r
 210                     seqO.addSequenceFeature(feat);\r
 211                   }\r
 212                 }\r
 213               }\r
 214 \r
 215             }\r
 216 \r
 217           }\r
 218           // logger.debug("Adding seq " + acc + " from " + start + " to " + end\r
 219           // + ": " + seq);\r
 220           this.seqs.addElement(seqO);\r
 221         }\r
 222       }\r
 223       else if (!r.search(line))\r
 224       {\r
 225         // System.err.println("Found sequence line: " + line);\r
 226 \r
 227         // Split sequence in sequence and accession parts\r
 228         if (!x.search(line))\r
 229         {\r
 230           // logger.error("Could not parse sequence line: " + line);\r
 231           throw new IOException("Could not parse sequence line: " + line);\r
 232         }\r
 233         String ns = (String) seqs.get(x.stringMatched(1));\r
 234         if (ns == null)\r
 235         {\r
 236           ns = "";\r
 237         }\r
 238         ns += x.stringMatched(2);\r
 239 \r
 240         seqs.put(x.stringMatched(1), ns);\r
 241       }\r
 242       else\r
 243       {\r
 244         String annType = r.stringMatched(1);\r
 245         String annContent = r.stringMatched(2);\r
 246 \r
 247         // System.err.println("type:" + annType + " content: " + annContent);\r
 248 \r
 249         if (annType.equals("GF"))\r
 250         {\r
 251           /*\r
 252            * Generic per-File annotation, free text Magic features: #=GF NH\r
 253            * <tree in New Hampshire eXtended format> #=GF TN <Unique identifier\r
 254            * for the next tree> Pfam descriptions: 7. DESCRIPTION OF FIELDS\r
 255            * \r
 256            * Compulsory fields: ------------------\r
 257            * \r
 258            * AC Accession number: Accession number in form PFxxxxx.version or\r
 259            * PBxxxxxx. ID Identification: One word name for family. DE\r
 260            * Definition: Short description of family. AU Author: Authors of the\r
 261            * entry. SE Source of seed: The source suggesting the seed members\r
 262            * belong to one family. GA Gathering method: Search threshold to\r
 263            * build the full alignment. TC Trusted Cutoff: Lowest sequence score\r
 264            * and domain score of match in the full alignment. NC Noise Cutoff:\r
 265            * Highest sequence score and domain score of match not in full\r
 266            * alignment. TP Type: Type of family -- presently Family, Domain,\r
 267            * Motif or Repeat. SQ Sequence: Number of sequences in alignment. AM\r
 268            * Alignment Method The order ls and fs hits are aligned to the model\r
 269            * to build the full align. // End of alignment.\r
 270            * \r
 271            * Optional fields: ----------------\r
 272            * \r
 273            * DC Database Comment: Comment about database reference. DR Database\r
 274            * Reference: Reference to external database. RC Reference Comment:\r
 275            * Comment about literature reference. RN Reference Number: Reference\r
 276            * Number. RM Reference Medline: Eight digit medline UI number. RT\r
 277            * Reference Title: Reference Title. RA Reference Author: Reference\r
 278            * Author RL Reference Location: Journal location. PI Previous\r
 279            * identifier: Record of all previous ID lines. KW Keywords: Keywords.\r
 280            * CC Comment: Comments. NE Pfam accession: Indicates a nested domain.\r
 281            * NL Location: Location of nested domains - sequence ID, start and\r
 282            * end of insert.\r
 283            * \r
 284            * Obsolete fields: ----------- AL Alignment method of seed: The\r
 285            * method used to align the seed members.\r
 286            */\r
 287           // Let's save the annotations, maybe we'll be able to do something\r
 288           // with them later...\r
 289           Regex an = new Regex("(\\w+)\\s*(.*)");\r
 290           if (an.search(annContent))\r
 291           {\r
 292             if (an.stringMatched(1).equals("NH"))\r
 293             {\r
 294               treeString.append(an.stringMatched(2));\r
 295             }\r
 296             else if (an.stringMatched(1).equals("TN"))\r
 297             {\r
 298               if (treeString.length() > 0)\r
 299               {\r
 300                 if (treeName == null)\r
 301                 {\r
 302                   treeName = "Tree " + (getTreeCount() + 1);\r
 303                 }\r
 304                 addNewickTree(treeName, treeString.toString());\r
 305               }\r
 306               treeName = an.stringMatched(2);\r
 307               treeString = new StringBuffer();\r
 308             }\r
 309             setAlignmentProperty(an.stringMatched(1), an.stringMatched(2));\r
 310           }\r
 311         }\r
 312         else if (annType.equals("GS"))\r
 313         {\r
 314           // Generic per-Sequence annotation, free text\r
 315           /*\r
 316            * Pfam uses these features: Feature Description ---------------------\r
 317            * ----------- AC <accession> ACcession number DE <freetext>\r
 318            * DEscription DR <db>; <accession>; Database Reference OS <organism>\r
 319            * OrganiSm (species) OC <clade> Organism Classification (clade, etc.)\r
 320            * LO <look> Look (Color, etc.)\r
 321            */\r
 322           if (s.search(annContent))\r
 323           {\r
 324             String acc = s.stringMatched(1);\r
 325             String type = s.stringMatched(2);\r
 326             String content = s.stringMatched(3);\r
 327             // TODO: store DR in a vector.\r
 328             // TODO: store AC according to generic file db annotation.\r
 329             Hashtable ann;\r
 330             if (seqAnn.containsKey(acc))\r
 331             {\r
 332               ann = (Hashtable) seqAnn.get(acc);\r
 333             }\r
 334             else\r
 335             {\r
 336               ann = new Hashtable();\r
 337             }\r
 338             ann.put(type, content);\r
 339             seqAnn.put(acc, ann);\r
 340           }\r
 341           else\r
 342           {\r
 343             throw new IOException("Error parsing " + line);\r
 344           }\r
 345         }\r
 346         else if (annType.equals("GC"))\r
 347         {\r
 348           // Generic per-Column annotation, exactly 1 char per column\r
 349           // always need a label.\r
 350           if (x.search(annContent))\r
 351           {\r
 352             // parse out and create alignment annotation directly.\r
 353             parseAnnotationRow(annotations, x.stringMatched(1), x\r
 354                     .stringMatched(2));\r
 355           }\r
 356         }\r
 357         else if (annType.equals("GR"))\r
 358         {\r
 359           // Generic per-Sequence AND per-Column markup, exactly 1 char per\r
 360           // column\r
 361           /*\r
 362            * Feature Description Markup letters ------- -----------\r
 363            * -------------- SS Secondary Structure [HGIEBTSCX] SA Surface\r
 364            * Accessibility [0-9X] (0=0%-10%; ...; 9=90%-100%) TM TransMembrane\r
 365            * [Mio] PP Posterior Probability [0-9*] (0=0.00-0.05; 1=0.05-0.15;\r
 366            * *=0.95-1.00) LI LIgand binding [*] AS Active Site [*] IN INtron (in\r
 367            * or after) [0-2]\r
 368            */\r
 369           if (s.search(annContent))\r
 370           {\r
 371             String acc = s.stringMatched(1);\r
 372             String type = s.stringMatched(2);\r
 373             String seq = s.stringMatched(3);\r
 374             String description = new String();\r
 375 \r
 376             // Check for additional information about the current annotation\r
 377             if (x.search(seq))\r
 378             {\r
 379               description = x.stringMatched(1);\r
 380               seq = x.stringMatched(2);\r
 381             }\r
 382             // sequence id with from-to fields\r
 383 \r
 384             Hashtable ann;\r
 385             // Get an object with all the annotations for this sequence\r
 386             if (seqAnn.containsKey(acc))\r
 387             {\r
 388               // logger.debug("Found annotations for " + acc);\r
 389               ann = (Hashtable) seqAnn.get(acc);\r
 390             }\r
 391             else\r
 392             {\r
 393               // logger.debug("Creating new annotations holder for " + acc);\r
 394               ann = new Hashtable();\r
 395               seqAnn.put(acc, ann);\r
 396             }\r
 397 \r
 398             Hashtable features;\r
 399             // Get an object with all the content for an annotation\r
 400             if (ann.containsKey("features"))\r
 401             {\r
 402               // logger.debug("Found features for " + acc);\r
 403               features = (Hashtable) ann.get("features");\r
 404             }\r
 405             else\r
 406             {\r
 407               // logger.debug("Creating new features holder for " + acc);\r
 408               features = new Hashtable();\r
 409               ann.put("features", features);\r
 410             }\r
 411 \r
 412             Hashtable content;\r
 413             if (features.containsKey(this.id2type(type)))\r
 414             {\r
 415               // logger.debug("Found content for " + this.id2type(type));\r
 416               content = (Hashtable) features.get(this.id2type(type));\r
 417             }\r
 418             else\r
 419             {\r
 420               // logger.debug("Creating new content holder for " +\r
 421               // this.id2type(type));\r
 422               content = new Hashtable();\r
 423               features.put(this.id2type(type), content);\r
 424             }\r
 425             String ns = (String) content.get(description);\r
 426             if (ns == null)\r
 427             {\r
 428               ns = "";\r
 429             }\r
 430             ns += seq;\r
 431             content.put(description, seq);\r
 432           }\r
 433           else\r
 434           {\r
 435             throw new IOException("Error parsing " + line);\r
 436           }\r
 437         }\r
 438         else\r
 439         {\r
 440           throw new IOException("Unknown annotation detected: " + annType\r
 441                   + " " + annContent);\r
 442         }\r
 443       }\r
 444     }\r
 445     if (treeString.length() > 0)\r
 446     {\r
 447       if (treeName == null)\r
 448       {\r
 449         treeName = "Tree " + (1 + getTreeCount());\r
 450       }\r
 451       addNewickTree(treeName, treeString.toString());\r
 452     }\r
 453   }\r
 454 \r
 455   private AlignmentAnnotation parseAnnotationRow(Vector annotation,\r
 456           String label, String annots)\r
 457   {\r
 458     String type = (label.indexOf("_cons") == label.length() - 5) ? label\r
 459             .substring(0, label.length() - 5) : label;\r
 460     boolean ss = false;\r
 461     type = id2type(type);\r
 462     if (type.equals("secondary structure"))\r
 463     {\r
 464       ss = true;\r
 465     }\r
 466     // decide on secondary structure or not.\r
 467     Annotation[] els = new Annotation[annots.length()];\r
 468     for (int i = 0; i < annots.length(); i++)\r
 469     {\r
 470       String pos = annots.substring(i, i + 1);\r
 471       Annotation ann;\r
 472       ann = new Annotation(pos, "", ' ', Float.NaN);\r
 473       if (ss)\r
 474       {\r
 475         ann.secondaryStructure = jalview.schemes.ResidueProperties\r
 476                 .getDssp3state(pos).charAt(0);\r
 477         if (ann.secondaryStructure == pos.charAt(0) || pos.charAt(0) == 'C')\r
 478         {\r
 479           ann.displayCharacter = "";\r
 480         }\r
 481         else\r
 482         {\r
 483           ann.displayCharacter += " ";\r
 484         }\r
 485       }\r
 486 \r
 487       els[i] = ann;\r
 488     }\r
 489     AlignmentAnnotation annot = null;\r
 490     Enumeration e = annotation.elements();\r
 491     while (e.hasMoreElements())\r
 492     {\r
 493       annot = (AlignmentAnnotation) e.nextElement();\r
 494       if (annot.label.equals(type))\r
 495         break;\r
 496       annot = null;\r
 497     }\r
 498     if (annot == null)\r
 499     {\r
 500       annot = new AlignmentAnnotation(type, type, els);\r
 501       annotation.addElement(annot);\r
 502     }\r
 503     else\r
 504     {\r
 505       Annotation[] anns = new Annotation[annot.annotations.length\r
 506               + els.length];\r
 507       System.arraycopy(annot.annotations, 0, anns, 0,\r
 508               annot.annotations.length);\r
 509       System.arraycopy(els, 0, anns, annot.annotations.length, els.length);\r
 510       annot.annotations = anns;\r
 511     }\r
 512     return annot;\r
 513   }\r
 514 \r
 515   public static String print(SequenceI[] s)\r
 516   {\r
 517     return "not yet implemented";\r
 518   }\r
 519 \r
 520   public String print()\r
 521   {\r
 522     return print(getSeqsAsArray());\r
 523   }\r
 524 \r
 525   private static Hashtable typeIds = null;\r
 526   static\r
 527   {\r
 528     if (typeIds == null)\r
 529     {\r
 530       typeIds = new Hashtable();\r
 531       typeIds.put("SS", "secondary structure");\r
 532       typeIds.put("SA", "surface accessibility");\r
 533       typeIds.put("TM", "transmembrane");\r
 534       typeIds.put("PP", "posterior probability");\r
 535       typeIds.put("LI", "ligand binding");\r
 536       typeIds.put("AS", "active site");\r
 537       typeIds.put("IN", "intron");\r
 538       typeIds.put("IR", "interacting residue");\r
 539       typeIds.put("AC", "accession");\r
 540       typeIds.put("OS", "organism");\r
 541       typeIds.put("CL", "class");\r
 542       typeIds.put("DE", "description");\r
 543       typeIds.put("DR", "reference");\r
 544       typeIds.put("LO", "look");\r
 545       typeIds.put("RF", "reference positions");\r
 546 \r
 547     }\r
 548   }\r
 549 \r
 550   private String id2type(String id)\r
 551   {\r
 552     if (typeIds.containsKey(id))\r
 553     {\r
 554       return (String) typeIds.get(id);\r
 555     }\r
 556     System.err.println("Warning : Unknown Stockholm annotation type code "\r
 557             + id);\r
 558     return id;\r
 559   }\r
 560 }\r