From 8b26b15cc4d90b491aa7b7441058675179a6d0d2 Mon Sep 17 00:00:00 2001 From: cmzmasek Date: Wed, 1 Mar 2017 15:37:00 -0800 Subject: [PATCH] in progress... --- .../io/parser/hmmscan_multi_domain_extractor.rb | 123 +++++++----- .../lib/evo/tool/multi_domain_seq_extractor.rb | 200 +++++--------------- 2 files changed, 118 insertions(+), 205 deletions(-) diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb index 7699c88..c35e15c 100644 --- a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb @@ -15,30 +15,36 @@ require 'lib/evo/io/parser/hmmscan_parser' module Evoruby class HmmscanMultiDomainExtractor + + OUTPUT_ID = 'MDSX' + + PASSING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_passing_full_length_seqs.fasta" + FAILING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_failing_full_length_seqs.fasta" + TARGET_DA_SUFFIX = "_#{OUTPUT_ID}_target_da.fasta" + CONCAT_TARGET_DOM_SUFFIX = "_#{OUTPUT_ID}_concat_target_dom.fasta" def initialize @passed = Hash.new @non_passsing_domain_architectures = Hash.new end # raises ArgumentError, IOError, StandardError - def parse( domain_id, + def parse( target_da, hmmscan_output, fasta_sequence_file, - outfile, - passed_seqs_outfile, - failed_seqs_outfile, - e_value_threshold, - length_threshold, - add_position, - add_domain_number, - add_species, - log ) + outfile_base, + log_str ) + + passing_fl_seqs_outfile = outfile_base + PASSING_FL_SEQS_SUFFIX + failing_fl_seqs_outfile = outfile_base + FAILING_FL_SEQS_SUFFIX + target_da_outfile = outfile_base + TARGET_DA_SUFFIX + concat_target_dom_outfile = outfile_base + CONCAT_TARGET_DOM_SUFFIX Util.check_file_for_readability( hmmscan_output ) Util.check_file_for_readability( fasta_sequence_file ) - Util.check_file_for_writability( outfile + ".fasta" ) - Util.check_file_for_writability( passed_seqs_outfile ) - Util.check_file_for_writability( failed_seqs_outfile ) + Util.check_file_for_writability( passing_fl_seqs_outfile ) + Util.check_file_for_writability( failing_fl_seqs_outfile ) + Util.check_file_for_writability( target_da_outfile ) + Util.check_file_for_writability( concat_target_dom_outfile ) in_msa = nil factory = MsaFactory.new() @@ -51,9 +57,9 @@ module Evoruby out_msa = Msa.new - failed_seqs = Msa.new - passed_seqs = Msa.new - passed_multi_seqs = Msa.new + failed_seqs_msa = Msa.new + passed_seqs_msa = Msa.new + # passed_multi_seqs_msa = Msa.new ld = Constants::LINE_DELIMITER @@ -83,8 +89,12 @@ module Evoruby #### # Import: if multiple copies of same domain, threshold need to be same! target_domain_ary = Array.new - target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.6, 1 )) - target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.6, 1 )) + + target_domain_ary.push(TargetDomain.new('DNA_pol_B_exo1', 1e-6, -1, 0.6, 0 )) + target_domain_ary.push(TargetDomain.new('DNA_pol_B', 1e-6, -1, 0.6, 1 )) + + # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.6, 0 )) + # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.6, 1 )) # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 )) # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 )) @@ -119,16 +129,21 @@ module Evoruby prev_query_seq_name = nil domains_in_query_seq = Array.new - passing_sequences = Array.new + passing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq. + failing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq. total_sequences = 0 @passed = Hash.new out_domain_msas = Hash.new out_domain_architecture_msa = Msa.new + out_concatenated_domains_msa = Msa.new results.each do |hmmscan_result| if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name ) - if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, target_domain_architecure) + if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_contactenated_domains_msa, target_domain_architecure) passing_sequences.push(domains_in_query_seq) + else + failing_sequences.push(domains_in_query_seq) end + domains_in_query_seq = Array.new total_sequences += 1 end @@ -138,8 +153,10 @@ module Evoruby if prev_query_seq_name != nil total_sequences += 1 - if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, target_domain_architecure) + if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_contactenated_domains_msa, target_domain_architecure) passing_sequences.push(domains_in_query_seq) + else + failing_sequences.push(domains_in_query_seq) end end @@ -149,29 +166,34 @@ module Evoruby end puts "writing target_domain_architecure" - write_msa( out_domain_architecture_msa, "target_domain_architecure" + ".fasta" ) - - passing_sequences.each do | domains | + write_msa( out_domain_architecture_msa, target_da_outfile ) - seq = domains[0].query - # puts seq + "::" + puts "writing contactenated_domains_msa" + write_msa( out_concatenated_domains_msa, concat_target_dom_outfile ) - if passed_seqs.find_by_name_start( seq, true ).length < 1 - add_sequence( seq, in_msa, passed_multi_seqs ) + passing_sequences.each do | domains | + query_name = domains[0].query + if passed_seqs_msa.find_by_name_start( query_name, true ).length < 1 + add_sequence( query_name, in_msa, passed_seqs_msa ) else error_msg = "this should not have happened" raise StandardError, error_msg end + end - domains.each do | domain | - #puts domain.query + ": " + domain.model + failing_sequences.each do | domains | + query_name = domains[0].query + if failed_seqs_msa.find_by_name_start( query_name, true ).length < 1 + add_sequence( query_name, in_msa, failed_seqs_msa ) + else + error_msg = "this should not have happened" + raise StandardError, error_msg end - #puts end puts - puts 'Non passing domain architectures:' + puts 'Non passing domain architectures containing target domain(s):' @non_passsing_domain_architectures = @non_passsing_domain_architectures.sort{|a, b|a<=>b}.to_h @non_passsing_domain_architectures.each do |da, count| puts da + ': ' + count.to_s @@ -179,7 +201,7 @@ module Evoruby puts - puts 'Passing domain counts:' + puts 'Passing target domain counts:' @passed = @passed.sort{|a, b|a<=>b}.to_h @passed.each do |dom, count| puts dom + ': ' + count.to_s @@ -190,20 +212,19 @@ module Evoruby puts "total sequences : " + total_sequences.to_s puts "passing sequences: " + passing_sequences.size.to_s - # write_msa( out_msa, outfile + ".fasta" ) - # write_msa( passed_multi_seqs, passed_seqs_outfile ) - # write_msa( failed_seqs, failed_seqs_outfile ) + write_msa( passed_seqs_msa, passing_fl_seqs_outfile ) + write_msa( failed_seqs_msa, failing_fl_seqs_outfile ) - log << ld - log << "passing target domains : " + domain_pass_counter.to_s + ld - log << "failing target domains : " + domain_fail_counter.to_s + ld - log << "proteins in sequence (fasta) file : " + in_msa.get_number_of_seqs.to_s + ld - log << "proteins in hmmscan outputfile : " + protein_counter.to_s + ld - log << "proteins with passing target domain(s) : " + passed_seqs.get_number_of_seqs.to_s + ld - log << "proteins with no passing target domain : " + proteins_with_failing_domains.to_s + ld - log << "proteins with no target domain : " + domain_not_present_counter.to_s + ld + log_str << ld + log_str << "passing target domains : " + domain_pass_counter.to_s + ld + log_str << "failing target domains : " + domain_fail_counter.to_s + ld + log_str << "proteins in sequence (fasta) file : " + in_msa.get_number_of_seqs.to_s + ld + log_str << "proteins in hmmscan outputfile : " + protein_counter.to_s + ld + log_str << "proteins with passing target domain(s) : " + passed_seqs_msa.get_number_of_seqs.to_s + ld + log_str << "proteins with no passing target domain : " + proteins_with_failing_domains.to_s + ld + log_str << "proteins with no target domain : " + domain_not_present_counter.to_s + ld - log << ld + log_str << ld return domain_pass_counter @@ -221,6 +242,7 @@ module Evoruby in_msa, out_domain_msas, out_domain_architecture_msa, + out_contactenated_domains_msa, target_domain_architecture) domain_names_in_query_seq = Set.new @@ -249,7 +271,7 @@ module Evoruby end if (target_domain.rel_len != nil) && (target_domain.rel_len > 0) length = 1 + domain.env_to - domain.env_from - puts (target_domain.rel_len * domain.tlen) + # puts (target_domain.rel_len * domain.tlen) if length < (target_domain.rel_len * domain.tlen) next end @@ -311,7 +333,6 @@ module Evoruby error_msg = "seq names do not match: this should not have happened" raise StandardError, error_msg end - puts "query: " + query_seq extracted_domain_seq = extract_domain( query_seq, domain_counter[domain_name], @@ -337,8 +358,8 @@ module Evoruby end end - puts "env from: #{min_env_from} - #{max_env_from}" - puts "env to : #{min_env_to} - #{max_env_to}" + #puts "env from: #{min_env_from} - #{max_env_from}" + #puts "env to : #{min_env_to} - #{max_env_to}" extract_domain( query_seq, 1, @@ -349,7 +370,9 @@ module Evoruby out_domain_architecture_msa ) puts passed_domains_counts - puts concatenated_domains + + out_contactenated_domains_msa.add( query_seq, concatenated_domains) + true end diff --git a/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb index 6c74e8a..cf6d40e 100644 --- a/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb @@ -18,17 +18,7 @@ module Evoruby PRG_DATE = "20170220" WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" - E_VALUE_THRESHOLD_DEFAULT = 0.1 - LENGTH_THRESHOLD_DEFAULT = 50 - - E_VALUE_THRESHOLD_OPTION = 'e' - LENGTH_THRESHOLD_OPTION = 'l' - ADD_POSITION_OPTION = 'p' - ADD_DOMAIN_NUMBER_OPTION = 'd' - ADD_SPECIES = 's' - LOG_FILE_SUFFIX = '_multi_domain_seq_extr.log' - PASSED_SEQS_SUFFIX = '_with_passing_domains.fasta' - FAILED_SEQS_SUFFIX = '_with_no_passing_domains.fasta' + LOG_FILE_SUFFIX = '_MDSX.log' HELP_OPTION_1 = 'help' HELP_OPTION_2 = 'h' def run() @@ -57,17 +47,12 @@ module Evoruby exit( 0 ) end - unless ( cla.get_number_of_files == 2 || cla.get_number_of_files == 3 || cla.get_number_of_files == 4 ) + unless ( cla.get_number_of_files == 2 || cla.get_number_of_files == 3 ) print_help exit( -1 ) end allowed_opts = Array.new - allowed_opts.push( E_VALUE_THRESHOLD_OPTION ) - allowed_opts.push( ADD_POSITION_OPTION ) - allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION ) - allowed_opts.push( LENGTH_THRESHOLD_OPTION ) - allowed_opts.push( ADD_SPECIES ) disallowed = cla.validate_allowed_options_as_str( allowed_opts ) if ( disallowed.length > 0 ) @@ -76,140 +61,58 @@ module Evoruby STDOUT ) end - e_value_threshold = E_VALUE_THRESHOLD_DEFAULT - if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) ) - begin - e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - if ( e_value_threshold < 0.0 ) - Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT ) - end - end - - length_threshold = LENGTH_THRESHOLD_DEFAULT - if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) ) - begin - length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - if ( length_threshold < 0) - Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT ) - end - end - domain_id = cla.get_file_name( 0 ) hmmscan_output = cla.get_file_name( 1 ) fasta_sequence_file = "" outfile = "" - if (cla.get_number_of_files == 4) + if (cla.get_number_of_files == 3) fasta_sequence_file = cla.get_file_name( 2 ) - outfile = cla.get_file_name( 3 ) - elsif (cla.get_number_of_files == 2 || cla.get_number_of_files == 3) - if (cla.get_number_of_files == 3) - fasta_sequence_file = cla.get_file_name( 2 ) - else - hmmscan_index = hmmscan_output.index(Constants::HMMSCAN) - if ( hmmscan_index != nil ) - prefix = hmmscan_output[0 .. hmmscan_index-1 ] - suffix = Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX - files = Dir.entries( "." ) - matching_files = Util.get_matching_files( files, prefix, suffix) - if matching_files.length < 1 - Util.fatal_error( PRG_NAME, 'no file matching [' + prefix + - '...' + suffix + '] present in current directory: need to indicate as second argument' ) - end - if matching_files.length > 1 - Util.fatal_error( PRG_NAME, 'more than one file matching [' + - prefix + '...' + suffix + '] present in current directory: need to indicate as second argument' ) - end - fasta_sequence_file = matching_files[ 0 ] - else - Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' ) - end - end + else hmmscan_index = hmmscan_output.index(Constants::HMMSCAN) if ( hmmscan_index != nil ) - outfile = hmmscan_output.sub(Constants::HMMSCAN, "_") + "_" + domain_id - e = e_value_threshold >= 1 ? e_value_threshold.to_i : e_value_threshold.to_s.sub!('.','_') - outfile += "_E" + e.to_s - outfile += "_L" + length_threshold.to_i.to_s + prefix = hmmscan_output[0 .. hmmscan_index-1 ] + suffix = Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX + files = Dir.entries( "." ) + matching_files = Util.get_matching_files( files, prefix, suffix) + if matching_files.length < 1 + Util.fatal_error( PRG_NAME, 'no file matching [' + prefix + + '...' + suffix + '] present in current directory: need to indicate as second argument' ) + end + if matching_files.length > 1 + Util.fatal_error( PRG_NAME, 'more than one file matching [' + + prefix + '...' + suffix + '] present in current directory: need to indicate as second argument' ) + end + fasta_sequence_file = matching_files[ 0 ] else Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' ) end end - - if outfile.downcase.end_with?( ".fasta" ) - outfile = outfile[ 0 .. outfile.length - 7 ] - elsif outfile.downcase.end_with?( ".fsa" ) - outfile = outfile[ 0 .. outfile.length - 5 ] - end - - add_position = false - if ( cla.is_option_set?( ADD_POSITION_OPTION ) ) - add_position = true - end - - add_domain_number = false - if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) ) - add_domain_number = true - end - - add_species = false - if cla.is_option_set?( ADD_SPECIES ) - add_species = true + hmmscan_index = hmmscan_output.index(Constants::HMMSCAN) + if ( hmmscan_index != nil ) + outfile = hmmscan_output.sub(Constants::HMMSCAN, "_") + "_MDSX" + else + Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' ) end log = String.new ld = Constants::LINE_DELIMITER - puts() - puts( "Domain : " + domain_id ) - log << "Domain : " + domain_id + ld - puts( "Hmmscan outputfile : " + hmmscan_output ) - log << "Hmmscan outputfile : " + hmmscan_output + ld - puts( "Fasta sequencefile (complete sequences) : " + fasta_sequence_file ) - log << "Fasta sequencefile (complete sequences) : " + fasta_sequence_file + ld - puts( "Outputfile : " + outfile + ".fasta" ) - log << "Outputfile : " + outfile + ld - puts( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX ) - log << "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX + ld - puts( "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX ) - log << "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX + ld - puts( "Logfile : " + outfile + LOG_FILE_SUFFIX ) - log << "Logfile : " + outfile + LOG_FILE_SUFFIX + ld - if ( e_value_threshold >= 0.0 ) - puts( "iE-value threshold : " + e_value_threshold.to_s ) - log << "iE-value threshold : " + e_value_threshold.to_s + ld - else - puts( "iE-value threshold : no threshold" ) - log << "iE-value threshold : no threshold" + ld - end - if ( length_threshold > 0 ) - puts( "Length threshold (env) : " + length_threshold.to_s ) - log << "Length threshold (env) : " + length_threshold.to_s + ld - else - puts( "Length threshold (env) : no threshold" ) - log << "Length threshold (env) : no threshold" + ld - end - if ( add_position ) - puts( "Add positions (rel to complete seq) to extracted domains : true" ) - log << "Add positions (rel to complete seq) to extracted domains : true" + ld - else - puts( "Add positions (rel to complete seq) to extracted domains : false" ) - log << "Add positions (rel to complete seq) to extracted domains : false" + ld - end - - if ( add_domain_number ) - puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" ) - log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld - else - puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" ) - log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld - end + # puts() + # puts( "Domain : " + domain_id ) + # log << "Domain : " + domain_id + ld + # puts( "Hmmscan outputfile : " + hmmscan_output ) + # log << "Hmmscan outputfile : " + hmmscan_output + ld + # puts( "Fasta sequencefile (complete sequences) : " + fasta_sequence_file ) + # log << "Fasta sequencefile (complete sequences) : " + fasta_sequence_file + ld + # puts( "Outputfile : " + outfile + ".fasta" ) + # log << "Outputfile : " + outfile + ld + # puts( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX ) + # log << "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX + ld + # puts( "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX ) + # log << "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX + ld + # puts( "Logfile : " + outfile + LOG_FILE_SUFFIX ) + # log << "Logfile : " + outfile + LOG_FILE_SUFFIX + ld puts log << ld @@ -221,13 +124,6 @@ module Evoruby hmmscan_output, fasta_sequence_file, outfile, - outfile + PASSED_SEQS_SUFFIX, - outfile + FAILED_SEQS_SUFFIX, - e_value_threshold, - length_threshold, - add_position, - add_domain_number, - add_species, log ) rescue ArgumentError, IOError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) @@ -240,10 +136,10 @@ module Evoruby puts Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" ) - Util.print_message( PRG_NAME, "wrote: " + outfile + ".fasta") - Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX ) - Util.print_message( PRG_NAME, "wrote: " + outfile + PASSED_SEQS_SUFFIX ) - Util.print_message( PRG_NAME, "wrote: " + outfile + FAILED_SEQS_SUFFIX ) + # Util.print_message( PRG_NAME, "wrote: " + outfile + ".fasta") + # Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX ) + # Util.print_message( PRG_NAME, "wrote: " + outfile + PASSED_SEQS_SUFFIX ) + # Util.print_message( PRG_NAME, "wrote: " + outfile + FAILED_SEQS_SUFFIX ) begin f = File.open( outfile + LOG_FILE_SUFFIX, 'a' ) @@ -263,21 +159,15 @@ module Evoruby puts() puts( "Usage:" ) puts() - puts( " " + PRG_NAME + ".rb [options] [file containing complete sequences in fasta format] [outputfile]" ) + puts( " " + PRG_NAME + ".rb [file containing complete sequences in fasta format]" ) puts() - puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "= : iE-value threshold for target domain, default is " + E_VALUE_THRESHOLD_DEFAULT.to_s ) - puts( " -" + LENGTH_THRESHOLD_OPTION + "= : length threshold target domain (env), default is " + LENGTH_THRESHOLD_DEFAULT.to_s ) - puts( " -" + ADD_DOMAIN_NUMBER_OPTION + " : to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" ) - puts( " -" + ADD_POSITION_OPTION + " : to add positions (rel to complete seq) to extracted domains" ) - puts( " -" + ADD_SPECIES + " : to add species [in brackets]" ) + puts( " options: -" ) puts() puts( "Examples:" ) puts - puts( " " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 P53_ni.fasta P53_#{Constants::PFAM_V_FOR_EX}_10_Pkinase_E1_0e-06_L50" ) + puts( " " + PRG_NAME + ".rb " ) puts - puts( " " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 P53_ni.fasta" ) - puts - puts( " " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10" ) + puts() end -- 1.7.10.2