X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fruby%2Fevoruby%2Flib%2Fevo%2Ftool%2Fhmmscan_summary.rb;h=bb32da53ffc2b38470d5347f45c2ea97ca9d5935;hb=2f17bcaa7c1b364b5dd8ba7109dea5e5c0575b1e;hp=bb697b08179437433b90132229287f79854e3e32;hpb=4832d54fed791f182d012d406e4bcebefef0547c;p=jalview.git

diff --git a/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb
index bb697b0..bb32da5 100644
--- a/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb
+++ b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb
@@ -1,170 +1,61 @@
 #
-# = lib/evo/apps/hmmscan_parser.rb - HmmscanParser class
+# = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
 #
-# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# Copyright::  Copyright (C) 2012 Christian M. Zmasek
 # License::    GNU Lesser General Public License (LGPL)
 #
 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
 #
-# last modified: 11/24/2009
+
+require 'set'
 
 require 'lib/evo/util/constants'
 require 'lib/evo/util/util'
 require 'lib/evo/util/command_line_arguments'
 require 'lib/evo/io/parser/hmmscan_parser'
+require 'lib/evo/io/web/uniprotkb'
 
 module Evoruby
 
   class HmmscanSummary
 
     PRG_NAME       = "hsp"
-    PRG_VERSION    = "2.000"
-    PRG_DESC       = "hmmscan parser"
-    PRG_DATE       = "2012.10.19"
-    COPYRIGHT      = "2012 Christian M Zmasek"
-    CONTACT        = "phylosoft@gmail.com"
-    WWW            = "www.phylosoft.org"
+    PRG_VERSION    = "2.001"
+    PRG_DESC       = "hmmscan summary"
+    PRG_DATE       = "2013.10.23"
+    COPYRIGHT      = "2013 Christian M Zmasek"
+    CONTACT        = "phyloxml@gmail.com"
+    WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
 
     DELIMITER_OPTION              = "d"
-    I_E_VALUE_THRESHOLD_OPTION    = "e"
+    SPECIES_OPTION                = "s"
+    I_E_VALUE_THRESHOLD_OPTION    = "ie"
     FS_E_VALUE_THRESHOLD_OPTION   = "pe"
     HMM_FOR_PROTEIN_OUTPUT        = "m"
     IGNORE_DUF_OPTION             = "i"
     PARSE_OUT_DESCRIPITION_OPTION = "a"
+    UNIPROT                       = "u"
     HELP_OPTION_1                 = "help"
     HELP_OPTION_2                 = "h"
 
+    # When true, hits to any model listed in AVOID_HHMS are left out of the
+    # per-protein result list that parse collects for the protein summary.
+    USE_AVOID_HMMS = true
+    # Model names skipped for the protein summary while USE_AVOID_HMMS is true.
+    # Frozen so that the shared list cannot be modified by accident at run time.
+    AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ].freeze
+    # Largest gap (env_from of a domain - env_to of the previous one - 1) for
+    # which make_overview joins two target domains with "~" instead of "----".
+    LIMIT_FOR_CLOSE_DOMAINS = 20
+
     def initialize
       @domain_counts = Hash.new
     end
 
-    # raises ArgumentError, IOError
-    def parse( inpath,
-        outpath,
-        column_delimiter,
-        i_e_value_threshold,
-        ignore_dufs,
-        get_descriptions,
-        fs_e_value_threshold,
-        hmm_for_protein_output )
-      Util.check_file_for_readability( inpath )
-      Util.check_file_for_writability( outpath )
-
-      outfile = File.open( outpath, "a" )
-
-      query     = ""
-      desc      = ""
-      model     = ""
-      env_from  = ""
-      env_to    = ""
-      i_e_value = ""
-
-      hmmscan_results_per_protein = []
-
-      hmmscan_parser = HmmscanParser.new( inpath )
-
-      prev_query = ""
-
-      hmmscan_parser.parse.each do | r |
-        model     = r.model
-        query     = r.query
-        i_e_value = r.i_e_value
-        env_from  = r.env_from
-        env_to    = r.env_to
-
-        if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
-           ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
-          count_model( model )
-          outfile.print( query +
-             column_delimiter )
-          if ( get_descriptions )
-            outfile.print( desc +
-               column_delimiter )
-          end
-          outfile.print( model +
-             column_delimiter +
-             env_from.to_s +
-             column_delimiter +
-             env_to.to_s +
-             column_delimiter +
-             i_e_value.to_s )
-          outfile.print( Constants::LINE_DELIMITER )
-        end
-
-        if  !prev_query.empty? && prev_query != query
-          if !hmmscan_results_per_protein.empty?
-            process_hmmscan_results_per_protein( hmmscan_results_per_protein,
-              fs_e_value_threshold,
-              hmm_for_protein_output  )
-          end
-          hmmscan_results_per_protein.clear
-        end
-        prev_query = query
-        hmmscan_results_per_protein << r
-      end
-      if !hmmscan_results_per_protein.empty?
-        process_hmmscan_results_per_protein( hmmscan_results_per_protein,
-          fs_e_value_threshold,
-          hmm_for_protein_output  )
-      end
-      outfile.flush()
-      outfile.close()
-
-    end # def parse
-
-    def count_model( model )
-      if ( @domain_counts.has_key?( model ) )
-        count = @domain_counts[ model ].to_i
-        count += 1
-        @domain_counts[ model ] = count
-      else
-        @domain_counts[ model ] = 1
-      end
-    end
-
-    def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
-        fs_e_value_threshold,
-        hmm_for_protein_output )
-
-      fs_e_value = -1
-      hmmscan_results_per_protein.each do | r |
-        if r.model ==  hmm_for_protein_output
-          fs_e_value = r.fs_e_value
-          if fs_e_value > fs_e_value_threshold
-            return
-          end
-        end
-      end
+    def run
 
-
-      first = hmmscan_results_per_protein[ 0 ]
-      s = ""
-      s << first.query + "\t"
-      s << fs_e_value.to_s + "\t"
-      s << first.qlen.to_s + "\t"
-      # s << first.fs_e_value.to_s + "\t"
-      # s << first.out_of.to_s + "\t"
-      hmmscan_results_per_protein.each do | r |
-        s <<  r.model + "|"
-      end
-      puts s
-    end
-
-
-    def get_domain_counts()
-      return @domain_counts
-    end
-
-    def run()
-
-      Util.print_program_information( PRG_NAME,
-        PRG_VERSION,
-        PRG_DESC,
-        PRG_DATE,
-        COPYRIGHT,
-        CONTACT,
-        WWW,
-        STDOUT )
+   #   Util.print_program_information( PRG_NAME,
+   #     PRG_VERSION,
+   #     PRG_DESC,
+   #     PRG_DATE,
+   #     COPYRIGHT,
+   #     CONTACT,
+   #     WWW,
+   #     STDOUT )
 
       begin
         cla = CommandLineArguments.new( ARGV )
@@ -190,6 +81,8 @@ module Evoruby
       allowed_opts.push( IGNORE_DUF_OPTION )
       allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
       allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
+      allowed_opts.push( UNIPROT )
+      allowed_opts.push( SPECIES_OPTION )
 
       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
       if ( disallowed.length > 0 )
@@ -222,6 +115,8 @@ module Evoruby
         end
       end
 
+
+
       fs_e_value_threshold = -1.0
       if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
         begin
@@ -234,7 +129,6 @@ module Evoruby
         end
       end
 
-
       hmm_for_protein_output = ""
       if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
         begin
@@ -244,6 +138,23 @@ module Evoruby
         end
       end
 
+      uniprot = ""
+      if ( cla.is_option_set?( UNIPROT ) )
+        begin
+          uniprot = cla.get_option_value( UNIPROT )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+
+      species = "HUMAN"
+      if ( cla.is_option_set?( SPECIES_OPTION ) )
+        begin
+          species = cla.get_option_value( SPECIES_OPTION )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
 
       ignore_dufs = false
       if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
@@ -255,38 +166,42 @@ module Evoruby
         parse_descriptions = true
       end
 
-      puts()
-      puts( "hmmpfam outputfile  : " + inpath )
-      puts( "outputfile          : " + outpath )
-      if ( i_e_value_threshold >= 0.0 )
-        puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
-      else
-        puts( "i-E-value threshold : no threshold" )
-      end
-      if ( parse_descriptions )
-        puts( "parse descriptions  : true" )
-      else
-        puts( "parse descriptions  : false" )
-      end
-      if ( ignore_dufs )
-        puts( "ignore DUFs         : true" )
-      else
-        puts( "ignore DUFs         : false" )
-      end
-      if ( column_delimiter == "\t" )
-        puts( "column delimiter    : TAB" )
-      else
-        puts( "column delimiter     : " + column_delimiter )
-      end
-      if ( fs_e_value_threshold >= 0.0 )
-        puts( "E-value threshold   : " + fs_e_value_threshold.to_s )
-      else
-        puts( "E-value threshold   : no threshold" )
-      end
-      if ( !hmm_for_protein_output.empty? )
-        puts( "HMM for proteins    : " + hmm_for_protein_output )
-      end
-      puts()
+#      puts()
+#      puts( "hmmpfam outputfile  : " + inpath )
+#      puts( "outputfile          : " + outpath )
+#      puts( "species             : " + species )
+#      if ( i_e_value_threshold >= 0.0 )
+#        puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
+#      else
+#        puts( "i-E-value threshold : no threshold" )
+#      end
+#      if ( parse_descriptions )
+#        puts( "parse descriptions  : true" )
+#      else
+#        puts( "parse descriptions  : false" )
+#      end
+#      if ( ignore_dufs )
+#        puts( "ignore DUFs         : true" )
+#      else
+#        puts( "ignore DUFs         : false" )
+#      end
+#      if ( column_delimiter == "\t" )
+#        puts( "column delimiter    : TAB" )
+#      else
+#        puts( "column delimiter     : " + column_delimiter )
+#      end
+#      if fs_e_value_threshold >= 0.0
+#        puts( "E-value threshold   : " + fs_e_value_threshold.to_s )
+#      else
+#        puts( "E-value threshold   : no threshold" )
+#      end
+#      if !hmm_for_protein_output.empty?
+#        puts( "HMM for proteins    : " + hmm_for_protein_output )
+#      end
+#      if !uniprot.empty?
+#        puts( "Uniprot             : " + uniprot )
+#      end
+#      puts()
 
       begin
         parse( inpath,
@@ -296,23 +211,293 @@ module Evoruby
           ignore_dufs,
           parse_descriptions,
           fs_e_value_threshold,
-          hmm_for_protein_output )
-      rescue ArgumentError, IOError => e
+          hmm_for_protein_output,
+          uniprot,
+          species )
+      rescue IOError => e
         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
       end
       domain_counts = get_domain_counts()
 
+#      puts
+#      puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
+#      puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
+#      puts
+#      puts( Util.draw_histogram( domain_counts, "#" ) )
+#      puts
+#      Util.print_message( PRG_NAME, 'OK' )
+#      puts 
+
+    end # def run
+
+    private
 
-      puts
-      puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
-      puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
-      puts
-      puts( Util.draw_histogram( domain_counts, "#" ) )
-      puts
-      Util.print_message( PRG_NAME, 'OK' )
-      puts
+    # Reads the hmmscan results in +inpath+ and appends one line per accepted
+    # domain hit to +outpath+: query, optional description column, model,
+    # env_from, env_to and i-E-value, separated by +column_delimiter+.
+    #
+    # A hit is accepted if its i-E-value is <= +i_e_value_threshold+ (a negative
+    # threshold disables this check) and, when +ignore_dufs+ is set, its model
+    # name does not start with "DUF" followed by digits. Every accepted hit is
+    # also tallied via count_model.
+    #
+    # If +hmm_for_protein_output+ is not empty, consecutive hits sharing the
+    # same query are additionally collected and handed, one protein at a time,
+    # to process_hmmscan_results_per_protein.
+    #
+    # raises ArgumentError, IOError (as noted by the original author; presumably
+    # from the Util file checks and the parser -- not verifiable from this file)
+    def parse( inpath,
+        outpath,
+        column_delimiter,
+        i_e_value_threshold,
+        ignore_dufs,
+        get_descriptions,
+        fs_e_value_threshold,
+        hmm_for_protein_output,
+        uniprot,
+        species )
+
+      Util.check_file_for_readability( inpath )
+      Util.check_file_for_writability( outpath )
+
+      results = HmmscanParser.new( inpath ).parse
+
+      # Nothing in this method fills in a description, so the optional
+      # description column is always written as an empty field.
+      desc = ""
+
+      hmmscan_results_per_protein = []
+      prev_query = ""
+
+      # Block form of File.open: the file is flushed and closed even if an
+      # exception is raised while the results are being written.
+      File.open( outpath, "a" ) do | outfile |
+        results.each do | r |
+          model     = r.model
+          query     = r.query
+          i_e_value = r.i_e_value
+
+          if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
+             ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
+            count_model( model )
+            fields = [ query ]
+            fields << desc if get_descriptions
+            fields.push( model, r.env_from.to_s, r.env_to.to_s, i_e_value.to_s )
+            outfile.print( fields.join( column_delimiter ) + Constants::LINE_DELIMITER )
+          end
+
+          # The per-protein summary is only produced when a target HMM is given.
+          next if hmm_for_protein_output.empty?
+
+          # A change of query name marks the start of the next protein:
+          # summarize what was collected for the previous one and start over.
+          if !prev_query.empty? && prev_query != query
+            if !hmmscan_results_per_protein.empty?
+              process_hmmscan_results_per_protein( hmmscan_results_per_protein,
+                fs_e_value_threshold,
+                hmm_for_protein_output,
+                i_e_value_threshold,
+                uniprot,
+                species )
+            end
+            hmmscan_results_per_protein.clear
+          end
+          prev_query = query
+
+          unless USE_AVOID_HMMS && AVOID_HHMS.include?( r.model )
+            hmmscan_results_per_protein << r
+          end
+        end
+
+        # Summarize the last protein of the input.
+        if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty?
+          process_hmmscan_results_per_protein( hmmscan_results_per_protein,
+            fs_e_value_threshold,
+            hmm_for_protein_output,
+            i_e_value_threshold,
+            uniprot,
+            species )
+        end
+      end
+    end # def parse
+
+    # Reduces an identifier of the form "sp|<accession>|<name>" or
+    # "tr|<accession>|<name>" to its last, whitespace-free field.
+    # Any other identifier is returned unchanged.
+    def process_id( id )
+      hit = /(sp|tr)\|\S+\|(\S+)/.match( id )
+      hit ? hit[ 2 ] : id
+    end
+
+    # Adds one to the tally kept for +model+ in @domain_counts; a model that
+    # has not been seen before starts at 1.
+    def count_model( model )
+      seen_so_far = @domain_counts.has_key?( model ) ? @domain_counts[ model ].to_i : 0
+      @domain_counts[ model ] = seen_so_far + 1
+    end
+
+    # Prints to standard output a one-line, tab-separated summary for a single
+    # protein, provided it carries at least one acceptable hit to
+    # +hmm_for_protein_output+ (the "target" model).
+    #
+    # Nothing is printed if
+    # * a target hit has a full-sequence E-value above +fs_e_value_threshold+
+    #   (only checked when that threshold is > 0.0), or
+    # * no target hit passes the i-E-value filter.
+    #
+    # Hits are kept if their i-E-value is <= +i_e_value_threshold+; a threshold
+    # <= 0 keeps all hits.
+    #
+    # Columns: query, +species+, full-sequence E-value, query length, number of
+    # kept target domains, number of all kept domains, kept model names,
+    # overview (make_overview), linker lengths (calc_linkers) and an
+    # architecture sketch listing every kept domain, in env_from order, as
+    # model[from-to|ie=...|ce=...].
+    #
+    # +uniprotkb+ is only tested for emptiness: the UniProtKB lookup itself is
+    # commented out.
+    def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
+        fs_e_value_threshold,
+        hmm_for_protein_output,
+        i_e_value_threshold,
+        uniprotkb,
+        species )
+
+      dc = 0 # number of kept hits to the target model
+      hmmscan_results_per_protein_filtered = []
+
+      hmmscan_results_per_protein.each do | r |
+        is_target = ( r.model == hmm_for_protein_output )
+        # abort if fs E-value too high
+        if is_target && fs_e_value_threshold > 0.0 && r.fs_e_value > fs_e_value_threshold
+          return
+        end
+        # filter according to i-E-value threshold
+        if i_e_value_threshold <= 0 || r.i_e_value <= i_e_value_threshold
+          hmmscan_results_per_protein_filtered << r
+          dc += 1 if is_target
+        end
+      end
+
+      # passed on protein E-value, failed in per domain E-values
+      return if dc == 0
+
+      hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
+
+      # Last target hit in sequence order; dc > 0 guarantees that there is one.
+      own = hmmscan_results_per_protein_filtered.reverse_each.find { | r | r.model == hmm_for_protein_output }
+
+      s = ""
+      s << own.query + "\t"
+      s << species + "\t"
+      s << own.fs_e_value.to_s + "\t"
+      s << own.qlen.to_s + "\t"
+      s << dc.to_s + "\t"
+      s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
+      hmmscan_results_per_protein_filtered.each do | r |
+        s << r.model + " "
+      end
+      s << "\t"
+
+      if !uniprotkb.empty?
+        # UniProtKB annotation is currently disabled:
+        #e = UniprotKB::get_entry_by_id( process_id( own.query ) )
+
+        #if e != nil
+        #  s << uniprot_annotation( e )
+        #end
+      end
+
+      s << make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) + "\t"
+
+      s << calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) + "\t"
+
+      prev_r = nil
+      hmmscan_results_per_protein_filtered.each do | r |
+        if prev_r != nil
+          # gap between the end of the previous domain and the start of this one
+          s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
+        else
+          # stretch in front of the first domain; never marked with "~"
+          s << make_interdomain_sequence( r.env_from, false )
+        end
+        s << r.model
+        s << "["
+        s << r.env_from.to_s << "-" << r.env_to.to_s
+        s << "|ie=" << r.i_e_value.to_s
+        s << "|ce=" << r.c_e_value.to_s
+        s << "]"
+        prev_r = r
+      end
+      # Tail behind the last domain, measured from where that domain ends
+      # (env_to). Measuring from env_from would count the domain's own length
+      # as part of the tail.
+      s << make_interdomain_sequence( own.qlen - prev_r.env_to, false )
+      puts s
+    end
+
+    # Builds the annotation text for the entry +e+ from e.get_pdb_ids:
+    # every id followed by ", " (also after the last one), or "-" if the
+    # entry has no ids.
+    def uniprot_annotation( e )
+      pdb_ids = e.get_pdb_ids
+      return "-" if pdb_ids.empty?
+      pdb_ids.each_with_object( "" ) { | pdb, listing | listing << pdb << ", " }
+    end
+
+    # Lists the linker lengths between successive hits to
+    # +hmm_for_protein_output+, in the order given: for each pair of
+    # neighbouring target hits, env_from of the second - env_to of the
+    # first - 1, each followed by a single space. Hits to other models are
+    # skipped. Returns "" if there are fewer than two target hits.
+    def calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )
+      own_domains = hmmscan_results_per_protein_filtered.select do | r |
+        r.model == hmm_for_protein_output
+      end
+      own_domains.each_cons( 2 ).each_with_object( "" ) do | ( left, right ), linkers |
+        linkers << ( right.env_from - left.env_to - 1 ).to_s + " "
+      end
+    end
+
+    # Returns the hash filled in by count_model (model => number of counted
+    # hits). The hash itself is returned, not a copy.
+    def get_domain_counts
+      @domain_counts
+    end
+
+    # Draws a compact overview of the hits to +hmm_for_protein_output+, in the
+    # order given: the model name once per target hit, neighbours joined by
+    # "~" if the gap between them (env_from - previous env_to - 1) is at most
+    # LIMIT_FOR_CLOSE_DOMAINS and by "----" otherwise. Hits to other models
+    # are skipped. Returns "" if there is no target hit.
+    def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
+      own_domains = hmmscan_results_per_protein_filtered.select do | r |
+        r.model == hmm_for_protein_output
+      end
+      overview = ""
+      return overview if own_domains.empty?
+
+      overview << hmm_for_protein_output
+      own_domains.each_cons( 2 ) do | left, right |
+        gap = right.env_from - left.env_to - 1
+        connector = gap <= LIMIT_FOR_CLOSE_DOMAINS ? "~" : "----"
+        overview << connector << hmm_for_protein_output
+      end
+      overview
+    end
+
+    # Draws a stretch of +d+ residues at a scale of one "-" per 20 residues
+    # (integer division):
+    # * 10 or more units: the fixed abbreviation "----//----"
+    # * 1 to 9 units:     that many "-"
+    # * less than 1 unit: "~" if +mark_short+ is true, otherwise ""
+    def make_interdomain_sequence( d, mark_short = true )
+      units = d / 20
+      return "----//----" if units >= 10
+      return "-" * units if units >= 1
+      mark_short ? "~" : ""
+    end
 
-    end # def run()
 
     def print_help()
       puts( "Usage:" )
@@ -325,9 +510,10 @@ module Evoruby
       puts( "           -" + IGNORE_DUF_OPTION  + ": ignore DUFs" )
       puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + ": E-value threshold for full protein sequences, only for protein summary" )
       puts( "           -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
+      puts( "           -" + SPECIES_OPTION + ": species for protein summary" )
       puts()
     end
 
-  end # class HmmscanParser
+  end # class
 
 end # module Evoruby
\ No newline at end of file