pdb
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / hmmscan_summary.rb
index 9c5e3a9..db82d4a 100644 (file)
@@ -1,17 +1,20 @@
 #
-# = lib/evo/apps/hmmscan_parser.rb - HmmscanParser class
+# = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
 #
-# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# Copyright::  Copyright (C) 2012 Christian M. Zmasek
 # License::    GNU Lesser General Public License (LGPL)
 #
 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
 #
-# last modified: 11/24/2009
+# last modified: 121003
+
+require 'set'
 
 require 'lib/evo/util/constants'
 require 'lib/evo/util/util'
 require 'lib/evo/util/command_line_arguments'
 require 'lib/evo/io/parser/hmmscan_parser'
+require 'lib/evo/io/web/uniprotkb'
 
 module Evoruby
 
@@ -19,8 +22,8 @@ module Evoruby
 
     PRG_NAME       = "hsp"
     PRG_VERSION    = "2.000"
-    PRG_DESC       = "hmmscan parser"
-    PRG_DATE       = "2012.10.19"
+    PRG_DESC       = "hmmscan summary"
+    PRG_DATE       = "2012.10.23"
     COPYRIGHT      = "2012 Christian M Zmasek"
     CONTACT        = "phylosoft@gmail.com"
     WWW            = "www.phylosoft.org"
@@ -31,6 +34,7 @@ module Evoruby
     HMM_FOR_PROTEIN_OUTPUT        = "m"
     IGNORE_DUF_OPTION             = "i"
     PARSE_OUT_DESCRIPITION_OPTION = "a"
+    UNIPROT                       = "u"
     HELP_OPTION_1                 = "help"
     HELP_OPTION_2                 = "h"
 
@@ -44,6 +48,8 @@ module Evoruby
 
     def run
 
+
+
       Util.print_program_information( PRG_NAME,
         PRG_VERSION,
         PRG_DESC,
@@ -77,6 +83,7 @@ module Evoruby
       allowed_opts.push( IGNORE_DUF_OPTION )
       allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
       allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
+      allowed_opts.push( UNIPROT )
 
       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
       if ( disallowed.length > 0 )
@@ -130,6 +137,15 @@ module Evoruby
         end
       end
 
+      uniprot = ""
+      if ( cla.is_option_set?( UNIPROT ) )
+        begin
+          uniprot = cla.get_option_value( UNIPROT )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+
       ignore_dufs = false
       if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
         ignore_dufs = true
@@ -163,14 +179,17 @@ module Evoruby
       else
         puts( "column delimiter     : " + column_delimiter )
       end
-      if ( fs_e_value_threshold >= 0.0 )
+      if fs_e_value_threshold >= 0.0
         puts( "E-value threshold   : " + fs_e_value_threshold.to_s )
       else
         puts( "E-value threshold   : no threshold" )
       end
-      if ( !hmm_for_protein_output.empty? )
+      if !hmm_for_protein_output.empty?
         puts( "HMM for proteins    : " + hmm_for_protein_output )
       end
+      if !uniprot.empty?
+        puts( "Uniprot             : " + uniprot )
+      end
       puts()
 
       begin
@@ -181,8 +200,9 @@ module Evoruby
           ignore_dufs,
           parse_descriptions,
           fs_e_value_threshold,
-          hmm_for_protein_output )
-      rescue ArgumentError, IOError => e
+          hmm_for_protein_output,
+          uniprot )
+      rescue IOError => e
         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
       end
       domain_counts = get_domain_counts()
@@ -209,10 +229,17 @@ module Evoruby
         ignore_dufs,
         get_descriptions,
         fs_e_value_threshold,
-        hmm_for_protein_output )
+        hmm_for_protein_output,
+        uniprot )
+
+
+
       Util.check_file_for_readability( inpath )
       Util.check_file_for_writability( outpath )
 
+      hmmscan_parser = HmmscanParser.new( inpath )
+      results = hmmscan_parser.parse
+
       outfile = File.open( outpath, "a" )
 
       query     = ""
@@ -224,11 +251,11 @@ module Evoruby
 
       hmmscan_results_per_protein = []
 
-      hmmscan_parser = HmmscanParser.new( inpath )
+
 
       prev_query = ""
 
-      hmmscan_parser.parse.each do | r |
+      results.each do | r |
         model     = r.model
         query     = r.query
         i_e_value = r.i_e_value
@@ -260,7 +287,8 @@ module Evoruby
               process_hmmscan_results_per_protein( hmmscan_results_per_protein,
                 fs_e_value_threshold,
                 hmm_for_protein_output,
-                i_e_value_threshold )
+                i_e_value_threshold,
+                true )
             end
             hmmscan_results_per_protein.clear
           end
@@ -275,19 +303,28 @@ module Evoruby
           end
         end
       end
-      if !hmm_for_protein_output.empty?
-        if !hmmscan_results_per_protein.empty?
-          process_hmmscan_results_per_protein( hmmscan_results_per_protein,
-            fs_e_value_threshold,
-            hmm_for_protein_output,
-            i_e_value_threshold )
-        end
+      if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty?
+        process_hmmscan_results_per_protein( hmmscan_results_per_protein,
+          fs_e_value_threshold,
+          hmm_for_protein_output,
+          i_e_value_threshold,
+          true )
       end
+
       outfile.flush()
       outfile.close()
 
     end # def parse
 
+    # Reduces a pipe-delimited sequence identifier to its trailing field.
+    #
+    # If +id+ contains "sp|<x>|<y>" or "tr|<x>|<y>" (where <x> and <y> are
+    # runs of non-whitespace characters), <y> is returned; otherwise +id+
+    # is returned unchanged.
+    #
+    # NOTE(review): "sp"/"tr" presumably are the Swiss-Prot/TrEMBL prefixes
+    # of UniProt FASTA headers (e.g. "sp|P12345|NAME_HUMAN") - confirm.
+    #
+    # id:: identifier string to process
+    #
+    # Returns the second capture group on a match, else the original +id+.
+    def process_id( id )
+      # The pattern is unanchored and the middle \S+ is greedy, so if the
+      # identifier holds more than two pipes without whitespace, the text
+      # after the last pipe is what gets captured.
+      if  id =~ /(sp|tr)\|\S+\|(\S+)/
+        id = $2
+      end
+      id
+    end
+
+
+
     def count_model( model )
       if ( @domain_counts.has_key?( model ) )
         count = @domain_counts[ model ].to_i
@@ -301,7 +338,8 @@ module Evoruby
     def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
         fs_e_value_threshold,
         hmm_for_protein_output,
-        i_e_value_threshold )
+        i_e_value_threshold,
+        uniprotkb )
 
       dc = 0
       # filter according to i-Evalue threshold
@@ -347,7 +385,34 @@ module Evoruby
         s << r.model + " "
       end
       s << "\t"
+      e = UniprotKB::get_entry_by_id( process_id( own.query ) )
+      # if e != nil && e.de != nil
+      #   e.de.each do |i|
+      #
+      #   end
+      # else
+      #   s << "-"
+      # end
+      s << "\t"
+      if e != nil && e.dr != nil
+        e.dr.each do | dr |
+          if dr != nil
+            if dr =~ /PDB;\s+([A-Z0-9]{4});/
+              s << $1
 
+            end
+          end
+        end
+      else
+        s << "-"
+      end
+      s << "\t"
+
+
+
+
+
+      s << "\t"
       overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
 
       s << overview   + "\t"