inprogress
authorcmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Tue, 19 Mar 2013 03:01:10 +0000 (03:01 +0000)
committercmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Tue, 19 Mar 2013 03:01:10 +0000 (03:01 +0000)
forester/ruby/evoruby/exe/hsa.rb [new file with mode: 0755]
forester/ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb [new file with mode: 0644]

diff --git a/forester/ruby/evoruby/exe/hsa.rb b/forester/ruby/evoruby/exe/hsa.rb
new file mode 100755 (executable)
index 0000000..a79da24
--- /dev/null
@@ -0,0 +1,20 @@
+#!/usr/local/bin/ruby -w
+#
+# = exe/hsp
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: hsp.rb,v 1.1 2009/11/25 05:42:04 cmzmasek Exp $
+#
+# last modified: 11/24/2009
+
+require 'lib/evo/tool/hmmscan_analysis'
+
+module Evoruby
+
+    hsa = HmmscanAnalysis.new()
+
+    hsa.run()
+
+end  # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb b/forester/ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb
new file mode 100644 (file)
index 0000000..08d74c1
--- /dev/null
@@ -0,0 +1,481 @@
+#
+# = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
+#
+# Copyright::  Copyright (C) 2012 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
+#
+
+require 'set'
+
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/util/command_line_arguments'
+require 'lib/evo/io/parser/hmmscan_parser'
+require 'lib/evo/msa/msa'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/io/msa_io'
+require 'lib/evo/io/parser/fasta_parser'
+require 'lib/evo/io/writer/fasta_writer'
+
+module Evoruby
+
+  class HmmscanAnalysis
+
+    PRG_NAME       = "hsp"
+    PRG_VERSION    = "2.001"
+    PRG_DESC       = "hmmscan summary"
+    PRG_DATE       = "2013.10.23"
+    COPYRIGHT      = "2013 Christian M Zmasek"
+    CONTACT        = "phyloxml@gmail.com"
+    WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
+
+    DELIMITER_OPTION              = "d"
+    SPECIES_OPTION                = "s"
+    I_E_VALUE_THRESHOLD_OPTION    = "ie"
+    FS_E_VALUE_THRESHOLD_OPTION   = "pe"
+    HMM_FOR_PROTEIN_OUTPUT        = "m"
+    IGNORE_DUF_OPTION             = "i"
+    PARSE_OUT_DESCRIPITION_OPTION = "a"
+    HELP_OPTION_1                 = "help"
+    HELP_OPTION_2                 = "h"
+
+    USE_AVOID_HMMS = false
+    AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
+    LIMIT_FOR_CLOSE_DOMAINS = 20
+
+    def initialize
+      @domain_counts = Hash.new
+    end
+
+    def run
+
+      #   Util.print_program_information( PRG_NAME,
+      #     PRG_VERSION,
+      #     PRG_DESC,
+      #     PRG_DATE,
+      #     COPYRIGHT,
+      #     CONTACT,
+      #     WWW,
+      #     STDOUT )
+
+      begin
+        cla = CommandLineArguments.new( ARGV )
+      rescue ArgumentError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+      end
+
+      if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+           cla.is_option_set?( HELP_OPTION_2 ) )
+        print_help
+        exit( 0 )
+      end
+
+      if ( cla.get_number_of_files != 1 &&  cla.get_number_of_files != 2 )
+        print_help
+        exit( -1 )
+      end
+
+      allowed_opts = Array.new
+      allowed_opts.push( DELIMITER_OPTION )
+      allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
+      allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
+      allowed_opts.push( IGNORE_DUF_OPTION )
+      allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
+      allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
+      allowed_opts.push( SPECIES_OPTION )
+
+      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+      if ( disallowed.length > 0 )
+        Util.fatal_error( PRG_NAME,
+          "unknown option(s): " + disallowed,
+          STDOUT )
+      end
+
+      inpath = cla.get_file_name( 0 )
+      seq_file_path = nil
+
+      if ( cla.get_number_of_files == 2 )
+        seq_file_path = cla.get_file_name( 1 )
+      end
+
+      msa = nil
+      if seq_file_path != nil
+        msa = read_fasta_file(seq_file_path )
+      end
+
+      column_delimiter = "\t"
+      if ( cla.is_option_set?( DELIMITER_OPTION ) )
+        begin
+          column_delimiter = cla.get_option_value( DELIMITER_OPTION )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+
+      i_e_value_threshold = -1
+      if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
+        begin
+          i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+        if ( i_e_value_threshold < 0.0 )
+          Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
+        end
+      end
+
+      fs_e_value_threshold = -1
+      if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
+        begin
+          fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+        if ( fs_e_value_threshold < 0.0 )
+          Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
+        end
+      end
+
+      hmm_for_protein_outputs = []
+      if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
+        begin
+          hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
+          hmm_for_protein_outputs = hmm_for_protein_output.split( "~" );
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+
+      species = "HUMAN"
+      if ( cla.is_option_set?( SPECIES_OPTION ) )
+        begin
+          species = cla.get_option_value( SPECIES_OPTION )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+
+      ignore_dufs = false
+      if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
+        ignore_dufs = true
+      end
+
+      parse_descriptions = false
+      if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
+        parse_descriptions = true
+      end
+
+
+      begin
+        parse( inpath,
+          column_delimiter,
+          i_e_value_threshold,
+          ignore_dufs,
+          parse_descriptions,
+          fs_e_value_threshold,
+          hmm_for_protein_outputs,
+          species,
+          msa        )
+      rescue IOError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+      end
+
+    end # def run
+
+    private
+
+    def read_fasta_file( input )
+      f = MsaFactory.new()
+      msa = nil
+      begin
+        msa = f.create_msa_from_file( input, FastaParser.new() )
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+      msa
+    end
+
+
+    # raises ArgumentError, IOError
+    def parse( inpath,
+        column_delimiter,
+        i_e_value_threshold,
+        ignore_dufs,
+        get_descriptions,
+        fs_e_value_threshold,
+        hmm_for_protein_outputs,
+        species,
+        msa )
+
+      Util.check_file_for_readability( inpath )
+
+      hmmscan_parser = HmmscanParser.new( inpath )
+      results = hmmscan_parser.parse
+
+
+      query     = ""
+      desc      = ""
+      model     = ""
+      env_from  = ""
+      env_to    = ""
+      i_e_value = ""
+
+      hmmscan_results_per_protein = []
+
+      prev_query = ""
+
+      results.each do | r |
+        model     = r.model
+        query     = r.query
+        i_e_value = r.i_e_value
+        env_from  = r.env_from
+        env_to    = r.env_to
+
+
+        if !prev_query.empty? && prev_query != query
+          if !hmmscan_results_per_protein.empty?
+            process_hmmscan_results_per_protein( hmmscan_results_per_protein,
+              fs_e_value_threshold,
+              hmm_for_protein_outputs,
+              i_e_value_threshold,
+              species,
+              msa            )
+          end
+          hmmscan_results_per_protein.clear
+        end
+        prev_query = query
+
+        if USE_AVOID_HMMS
+          if !AVOID_HHMS.include? r.model
+            hmmscan_results_per_protein << r
+          end
+        else
+          hmmscan_results_per_protein << r
+        end
+
+      end
+
+      if !hmm_for_protein_outputs.empty? && !hmmscan_results_per_protein.empty?
+        process_hmmscan_results_per_protein( hmmscan_results_per_protein,
+          fs_e_value_threshold,
+          hmm_for_protein_outputs,
+          i_e_value_threshold,
+          species,
+          msa        )
+      end
+
+
+    end # def parse
+
+    def process_id( id )
+      if  id =~ /(sp|tr)\|\S+\|(\S+)/
+        id = $2
+      end
+      id
+    end
+
+
+
+    def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
+        fs_e_value_threshold,
+        target_hmms,
+        i_e_value_threshold,
+        species,
+        msa )
+
+      raise StandardError, "target hmms is empty" if target_hmms.length < 1
+      raise StandardError, "results is empty" if hmmscan_results_per_protein.length < 1
+
+      # filter according to i-Evalue threshold
+      # abort if fs Evalue too high
+
+
+      if fs_e_value_threshold >= 0.0
+        hmmscan_results_per_protein.each do | r |
+          target_hmms.each do | hmm |
+            if r.model == hmm && r.fs_e_value > fs_e_value_threshold
+              return
+            end
+          end
+        end
+      end
+
+      #  dcs = []
+      hmmscan_results_per_protein_filtered = []
+      matched = Set.new
+
+      hmmscan_results_per_protein.each do | r |
+        if i_e_value_threshold < 0 || r.i_e_value <= i_e_value_threshold
+          hmmscan_results_per_protein_filtered << r
+          target_hmms.each do | hmm |
+            if r.model == hmm
+
+              matched << hmm
+              break
+            end
+          end
+
+        end
+      end
+
+      if matched.length < target_hmms.length
+        return
+      end
+      if  hmmscan_results_per_protein_filtered.length < 1
+        return
+      end
+
+      hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
+
+      owns = []
+      target_hmms.each do | hmm |
+        hmmscan_results_per_protein_filtered.each do | r |
+          if r.model == hmm
+            owns << r
+            break
+          end
+        end
+      end
+
+      s = ""
+      query = nil
+      owns.each do | own |
+        s << own.query + "\t"
+        query = own.query
+      end
+      s << species + "\t"
+      owns.each do | own |
+        s << own.fs_e_value.to_s + "\t"
+      end
+      owns.each do | own |
+        s << own.qlen.to_s + "\t" #TODO !
+      end
+
+      #   dcs.each do | dc |
+      #     s << dc.to_s + "\t"
+      #   end
+      s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
+      hmmscan_results_per_protein_filtered.each do | r |
+        s << r.model + " "
+      end
+      s << "\t"
+
+
+
+      # overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
+
+      #   s << overview  + "\t"
+
+      #  s << calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )  + "\t"
+
+      prev_r = nil
+      hmmscan_results_per_protein_filtered.each do | r |
+
+        if  prev_r != nil
+          s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
+
+          if ( target_hmms.length == 2 && prev_r.model == target_hmms[ 0 ] && r.model == target_hmms[ 1 ] )
+            puts "xxx"
+            linker( prev_r.env_to, r.env_from, query, msa )
+          end
+
+
+        else
+          s << make_interdomain_sequence( r.env_from, false )
+
+        end
+        s << r.model
+        s << "["
+        s << r.env_from.to_s << "-" << r.env_to.to_s
+        s << " " << r.i_e_value.to_s
+        s << "]"
+        prev_r = r
+      end
+      # s << make_interdomain_sequence( own.qlen - prev_r.env_from, false )
+      puts s
+    end
+
+
+    def linker( first, last , query , msa)
+      puts first.to_s + "-" + last.to_s
+      if ( last - first >= 1 )
+        seq = msa.get_by_name( query, true, false )
+        linker = seq.get_subsequence( first -1 , last - 1 )
+        puts linker.get_sequence_as_string
+      end
+
+    end
+
+
+    def calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )
+      linkers = ""
+      prev_r = nil
+      hmmscan_results_per_protein_filtered.each do | r |
+        if r.model == hmm_for_protein_output
+          if  prev_r != nil
+            linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
+          end
+          prev_r = r
+        end
+      end
+      linkers
+    end
+
+
+
+    def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
+      overview = ""
+      prev_r = nil
+      hmmscan_results_per_protein_filtered.each do | r |
+        if r.model == hmm_for_protein_output
+          if prev_r == nil
+            overview << hmm_for_protein_output
+          else
+            if  ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
+              overview << "~" << hmm_for_protein_output
+            else
+              overview << "----" << hmm_for_protein_output
+            end
+          end
+          prev_r = r
+        end
+      end
+      overview
+    end
+
+    def make_interdomain_sequence( d, mark_short = true )
+      s = ""
+      d /= 20
+      if d >= 10
+        s << "----//----"
+      elsif d >= 1
+        d.times do
+          s << "-"
+        end
+      elsif mark_short
+        s << "~"
+      end
+      s
+    end
+
+
+    def print_help()
+      puts( "Usage:" )
+      puts()
+      puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
+      puts()
+      puts( "  options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
+      puts( "           -" + I_E_VALUE_THRESHOLD_OPTION  + ": i-E-value threshold, default is no threshold" )
+      puts( "           -" + PARSE_OUT_DESCRIPITION_OPTION  + ": parse query description (in addition to query name)" )
+      puts( "           -" + IGNORE_DUF_OPTION  + ": ignore DUFs" )
+      puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + ": E-value threshold for full protein sequences, only for protein summary" )
+      puts( "           -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
+      puts( "           -" + SPECIES_OPTION + ": species for protein summary" )
+      puts()
+    end
+
+  end # class
+
+end # module Evoruby
\ No newline at end of file