From: cmzmasek@gmail.com Date: Fri, 19 Oct 2012 06:00:12 +0000 (+0000) Subject: in progress X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=4832d54fed791f182d012d406e4bcebefef0547c;p=jalview.git in progress --- diff --git a/forester/ruby/evoruby/exe/hsp.rb b/forester/ruby/evoruby/exe/hsp.rb index 493cc2d..7b09d13 100755 --- a/forester/ruby/evoruby/exe/hsp.rb +++ b/forester/ruby/evoruby/exe/hsp.rb @@ -9,11 +9,11 @@ # # last modified: 11/24/2009 -require 'lib/evo/tool/hmmscan_parser' +require 'lib/evo/tool/hmmscan_summary' module Evoruby - hsp = HmmscanParser.new() + hsp = HmmscanSummary.new() hsp.run() diff --git a/forester/ruby/evoruby/exe/test.rb b/forester/ruby/evoruby/exe/test.rb index cc9c466..24c3185 100755 --- a/forester/ruby/evoruby/exe/test.rb +++ b/forester/ruby/evoruby/exe/test.rb @@ -26,7 +26,7 @@ require 'lib/evo/io/parser/fasta_parser' require 'lib/evo/io/parser/ncbi_tseq_parser' require 'lib/evo/io/parser/hmmsearch_domain_extractor' require 'lib/evo/tool/domain_sequence_extractor' -require 'lib/evo/tool/hmmscan_parser' +require 'lib/evo/tool/hmmscan_summary' require 'lib/evo/tool/domains_to_forester' require 'lib/evo/io/parser/general_msa_parser' require 'lib/evo/io/parser/basic_table_parser' @@ -823,7 +823,7 @@ module Evoruby def test_hmmscan_parser() begin - h = Evoruby::HmmscanParser.new() + h = Evoruby::HmmscanSummary.new() rescue Exception => e puts() puts( e.to_s ) diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb index 2c76dd6..30962da 100644 --- a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb @@ -12,6 +12,8 @@ require 'lib/evo/msa/msa_factory' require 'lib/evo/io/msa_io' require 'lib/evo/io/writer/fasta_writer' require 'lib/evo/io/parser/fasta_parser' +require 'lib/evo/io/parser/hmmscan_parser' + module Evoruby @@ -84,96 +86,89 @@ module Evoruby max_domain_copy_number_per_protein = -1 max_domain_copy_number_sequence = "" - hmmscan_datas = Array.new + hmmscan_datas = [] - File.open( hmmscan_output ) do | file | - while line = file.gets - if !is_ignorable?( line ) && line =~ /^\S+\s+/ + hmmscan_parser = HmmscanParser.new( hmmscan_output ) + results = hmmscan_parser.parse - # tn acc tlen query acc qlen Evalue score bias # of c-E i-E score bias hf ht af at ef et acc desc - # 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 - line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/ + results.each do | r | + if domain_id != r.model + next + end - if domain_id != $1 - next - end + sequence = r.query + number = r.number + out_of = r.out_of + env_from = r.env_from + env_to = r.env_to + i_e_value = r.i_e_value + + if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) && + ( ( length_threshold <= 0 ) || ( env_to - env_from + 1 ) >= length_threshold.to_f ) ) + hmmscan_datas << HmmsearchData.new( sequence, number, out_of, env_from, env_to, i_e_value ) + if ( number > max_domain_copy_number_per_protein ) + max_domain_copy_number_sequence = sequence + max_domain_copy_number_per_protein = number + end + else # failed + print( domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" ) + log << domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" + if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) ) + print( " iE=" + i_e_value.to_s ) + log << " iE=" + i_e_value.to_s + end + if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f ) + le = env_to - env_from + 1 + print( " l=" + le.to_s ) + log << " l=" + le.to_s + end + print( ld ) + log << ld + domain_fail_counter += 1 + end - sequence = $4 - number = $10.to_i - out_of = $11.to_i - env_from = $20.to_i - env_to = $21.to_i - i_e_value = $13.to_f - - if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) && - ( ( length_threshold <= 0 ) || ( env_to - env_from + 1 ) >= length_threshold.to_f ) ) - hmmscan_datas << HmmsearchData.new( sequence, number, out_of, env_from, env_to, i_e_value ) - if ( number > max_domain_copy_number_per_protein ) - max_domain_copy_number_sequence = sequence - max_domain_copy_number_per_protein = number - end - else # failed - print( domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" ) - log << domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" - if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) ) - print( " iE=" + i_e_value.to_s ) - log << " iE=" + i_e_value.to_s - end - if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f ) - le = env_to - env_from + 1 - print( " l=" + le.to_s ) - log << " l=" + le.to_s - end - print( ld ) - log << ld - domain_fail_counter += 1 - end + if number > out_of + error_msg = "number > out_of ! (this should not have happened)" + raise StandardError, error_msg + end - if number > out_of - error_msg = "number > out_of ! (this should not have happened)" + if number == out_of + if !hmmscan_datas.empty? + process_hmmscan_datas( hmmscan_datas, + in_msa, + add_position, + add_domain_number, + add_species, + out_msa, + out_msa_singles, + out_msa_pairs, + out_msa_isolated, + min_linker, + out_msa_single_domains_protein_seqs, + out_msa_close_pairs_protein_seqs, + out_msa_close_pairs_only_protein_seqs, + out_msa_isolated_protein_seqs, + out_msa_isolated_only_protein_seqs, + out_msa_isolated_and_close_pair_protein_seqs ) + domain_pass_counter += hmmscan_datas.length + if passed_seqs.find_by_name_start( sequence, true ).length < 1 + add_sequence( sequence, in_msa, passed_seqs ) + else + error_msg = "this should not have happened" raise StandardError, error_msg end - - if number == out_of - if !hmmscan_datas.empty? - process_hmmscan_datas( hmmscan_datas, - in_msa, - add_position, - add_domain_number, - add_species, - out_msa, - out_msa_singles, - out_msa_pairs, - out_msa_isolated, - min_linker, - out_msa_single_domains_protein_seqs, - out_msa_close_pairs_protein_seqs, - out_msa_close_pairs_only_protein_seqs, - out_msa_isolated_protein_seqs, - out_msa_isolated_only_protein_seqs, - out_msa_isolated_and_close_pair_protein_seqs ) - domain_pass_counter += hmmscan_datas.length - if passed_seqs.find_by_name_start( sequence, true ).length < 1 - add_sequence( sequence, in_msa, passed_seqs ) - else - error_msg = "this should not have happened" - raise StandardError, error_msg - end - else - if failed_seqs.find_by_name_start( sequence, true ).length < 1 - add_sequence( sequence, in_msa, failed_seqs ) - proteins_with_failing_domains += 1 - else - error_msg = "this should not have happened" - raise StandardError, error_msg - end - end - hmmscan_datas.clear + else + if failed_seqs.find_by_name_start( sequence, true ).length < 1 + add_sequence( sequence, in_msa, failed_seqs ) + proteins_with_failing_domains += 1 + else + error_msg = "this should not have happened" + raise StandardError, error_msg end - end - end # while line = file.gets - end # File.open( hmmsearch_output ) do | file | + hmmscan_datas.clear + end + end if domain_pass_counter < 1 error_msg = "no domain sequences were extracted" diff --git a/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb new file mode 100644 index 0000000..bb697b0 --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb @@ -0,0 +1,333 @@ +# +# = lib/evo/apps/hmmscan_parser.rb - HmmscanParser class +# +# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $ +# +# last modified: 11/24/2009 + +require 'lib/evo/util/constants' +require 'lib/evo/util/util' +require 'lib/evo/util/command_line_arguments' +require 'lib/evo/io/parser/hmmscan_parser' + +module Evoruby + + class HmmscanSummary + + PRG_NAME = "hsp" + PRG_VERSION = "2.000" + PRG_DESC = "hmmscan parser" + PRG_DATE = "2012.10.19" + COPYRIGHT = "2012 Christian M Zmasek" + CONTACT = "phylosoft@gmail.com" + WWW = "www.phylosoft.org" + + DELIMITER_OPTION = "d" + I_E_VALUE_THRESHOLD_OPTION = "e" + FS_E_VALUE_THRESHOLD_OPTION = "pe" + HMM_FOR_PROTEIN_OUTPUT = "m" + IGNORE_DUF_OPTION = "i" + PARSE_OUT_DESCRIPITION_OPTION = "a" + HELP_OPTION_1 = "help" + HELP_OPTION_2 = "h" + + def initialize + @domain_counts = Hash.new + end + + # raises ArgumentError, IOError + def parse( inpath, + outpath, + column_delimiter, + i_e_value_threshold, + ignore_dufs, + get_descriptions, + fs_e_value_threshold, + hmm_for_protein_output ) + Util.check_file_for_readability( inpath ) + Util.check_file_for_writability( outpath ) + + outfile = File.open( outpath, "a" ) + + query = "" + desc = "" + model = "" + env_from = "" + env_to = "" + i_e_value = "" + + hmmscan_results_per_protein = [] + + hmmscan_parser = HmmscanParser.new( inpath ) + + prev_query = "" + + hmmscan_parser.parse.each do | r | + model = r.model + query = r.query + i_e_value = r.i_e_value + env_from = r.env_from + env_to = r.env_to + + if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) && + ( !ignore_dufs || ( model !~ /^DUF\d+/ ) ) + count_model( model ) + outfile.print( query + + column_delimiter ) + if ( get_descriptions ) + outfile.print( desc + + column_delimiter ) + end + outfile.print( model + + column_delimiter + + env_from.to_s + + column_delimiter + + env_to.to_s + + column_delimiter + + i_e_value.to_s ) + outfile.print( Constants::LINE_DELIMITER ) + end + + if !prev_query.empty? && prev_query != query + if !hmmscan_results_per_protein.empty? + process_hmmscan_results_per_protein( hmmscan_results_per_protein, + fs_e_value_threshold, + hmm_for_protein_output ) + end + hmmscan_results_per_protein.clear + end + prev_query = query + hmmscan_results_per_protein << r + end + if !hmmscan_results_per_protein.empty? + process_hmmscan_results_per_protein( hmmscan_results_per_protein, + fs_e_value_threshold, + hmm_for_protein_output ) + end + outfile.flush() + outfile.close() + + end # def parse + + def count_model( model ) + if ( @domain_counts.has_key?( model ) ) + count = @domain_counts[ model ].to_i + count += 1 + @domain_counts[ model ] = count + else + @domain_counts[ model ] = 1 + end + end + + def process_hmmscan_results_per_protein( hmmscan_results_per_protein, + fs_e_value_threshold, + hmm_for_protein_output ) + + fs_e_value = -1 + hmmscan_results_per_protein.each do | r | + if r.model == hmm_for_protein_output + fs_e_value = r.fs_e_value + if fs_e_value > fs_e_value_threshold + return + end + end + end + + + first = hmmscan_results_per_protein[ 0 ] + s = "" + s << first.query + "\t" + s << fs_e_value.to_s + "\t" + s << first.qlen.to_s + "\t" + # s << first.fs_e_value.to_s + "\t" + # s << first.out_of.to_s + "\t" + hmmscan_results_per_protein.each do | r | + s << r.model + "|" + end + puts s + end + + + def get_domain_counts() + return @domain_counts + end + + def run() + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC, + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + + if ( cla.is_option_set?( HELP_OPTION_1 ) || + cla.is_option_set?( HELP_OPTION_2 ) ) + print_help + exit( 0 ) + end + + if ( cla.get_number_of_files != 2 ) + print_help + exit( -1 ) + end + + allowed_opts = Array.new + allowed_opts.push( DELIMITER_OPTION ) + allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION ) + allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION ) + allowed_opts.push( IGNORE_DUF_OPTION ) + allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION ) + allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT ) + + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, + "unknown option(s): " + disallowed, + STDOUT ) + end + + inpath = cla.get_file_name( 0 ) + outpath = cla.get_file_name( 1 ) + + column_delimiter = "\t" + if ( cla.is_option_set?( DELIMITER_OPTION ) ) + begin + column_delimiter = cla.get_option_value( DELIMITER_OPTION ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + + i_e_value_threshold = -1.0 + if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) ) + begin + i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + if ( i_e_value_threshold < 0.0 ) + Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT ) + end + end + + fs_e_value_threshold = -1.0 + if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) ) + begin + fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + if ( fs_e_value_threshold < 0.0 ) + Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT ) + end + end + + + hmm_for_protein_output = "" + if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) ) + begin + hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + + + ignore_dufs = false + if ( cla.is_option_set?( IGNORE_DUF_OPTION ) ) + ignore_dufs = true + end + + parse_descriptions = false + if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) ) + parse_descriptions = true + end + + puts() + puts( "hmmpfam outputfile : " + inpath ) + puts( "outputfile : " + outpath ) + if ( i_e_value_threshold >= 0.0 ) + puts( "i-E-value threshold : " + i_e_value_threshold.to_s ) + else + puts( "i-E-value threshold : no threshold" ) + end + if ( parse_descriptions ) + puts( "parse descriptions : true" ) + else + puts( "parse descriptions : false" ) + end + if ( ignore_dufs ) + puts( "ignore DUFs : true" ) + else + puts( "ignore DUFs : false" ) + end + if ( column_delimiter == "\t" ) + puts( "column delimiter : TAB" ) + else + puts( "column delimiter : " + column_delimiter ) + end + if ( fs_e_value_threshold >= 0.0 ) + puts( "E-value threshold : " + fs_e_value_threshold.to_s ) + else + puts( "E-value threshold : no threshold" ) + end + if ( !hmm_for_protein_output.empty? ) + puts( "HMM for proteins : " + hmm_for_protein_output ) + end + puts() + + begin + parse( inpath, + outpath, + column_delimiter, + i_e_value_threshold, + ignore_dufs, + parse_descriptions, + fs_e_value_threshold, + hmm_for_protein_output ) + rescue ArgumentError, IOError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + domain_counts = get_domain_counts() + + + puts + puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" ) + puts( "(number of different domains: " + domain_counts.length.to_s + ")" ) + puts + puts( Util.draw_histogram( domain_counts, "#" ) ) + puts + Util.print_message( PRG_NAME, 'OK' ) + puts + + end # def run() + + def print_help() + puts( "Usage:" ) + puts() + puts( " " + PRG_NAME + ".rb [options] " ) + puts() + puts( " options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" ) + puts( " -" + I_E_VALUE_THRESHOLD_OPTION + ": i-E-value threshold, default is no threshold" ) + puts( " -" + PARSE_OUT_DESCRIPITION_OPTION + ": parse query description (in addition to query name)" ) + puts( " -" + IGNORE_DUF_OPTION + ": ignore DUFs" ) + puts( " -" + FS_E_VALUE_THRESHOLD_OPTION + ": E-value threshold for full protein sequences, only for protein summary" ) + puts( " -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" ) + puts() + end + + end # class HmmscanParser + +end # module Evoruby \ No newline at end of file