From 5614453599baa9428212c67d9a452aefb76a9370 Mon Sep 17 00:00:00 2001 From: cmzmasek Date: Mon, 6 Mar 2017 22:47:36 -0800 Subject: [PATCH] in progress... --- .../io/parser/hmmscan_multi_domain_extractor.rb | 320 +++++++++++++------- .../lib/evo/tool/multi_domain_seq_extractor.rb | 9 +- 2 files changed, 221 insertions(+), 108 deletions(-) diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb index c35e15c..33e48a4 100644 --- a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb @@ -16,15 +16,21 @@ require 'lib/evo/io/parser/hmmscan_parser' module Evoruby class HmmscanMultiDomainExtractor - OUTPUT_ID = 'MDSX' - - PASSING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_passing_full_length_seqs.fasta" - FAILING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_failing_full_length_seqs.fasta" - TARGET_DA_SUFFIX = "_#{OUTPUT_ID}_target_da.fasta" - CONCAT_TARGET_DOM_SUFFIX = "_#{OUTPUT_ID}_concat_target_dom.fasta" + OUTPUT_ID = 'mdsx' + DOMAIN_DELIMITER = ' -- ' + + PASSING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_passing_full_length_seqs.fasta" + FAILING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_failing_full_length_seqs.fasta" + TARGET_DA_SUFFIX = "_#{OUTPUT_ID}_target_da.fasta" + CONCAT_TARGET_DOM_SUFFIX = "_#{OUTPUT_ID}_concat_target_dom.fasta" + TARGET_DOM_OUTPUT_MIDPART = "_#{OUTPUT_ID}_target_dom_" def initialize - @passed = Hash.new - @non_passsing_domain_architectures = Hash.new + @passing_domains_data = nil + @failing_domains_data = nil + @failing_domain_architectures = nil + @passsing_domain_architectures = nil + @failing_proteins_bc_not_all_target_doms_present = nil + @failing_proteins_bc_missing_cutoffs = nil end # raises ArgumentError, IOError, StandardError @@ -55,51 +61,29 @@ module Evoruby raise IOError, error_msg end - out_msa = Msa.new - failed_seqs_msa = Msa.new passed_seqs_msa = Msa.new - # passed_multi_seqs_msa = Msa.new ld = Constants::LINE_DELIMITER - domain_pass_counter = 0 - domain_fail_counter = 0 - passing_domains_per_protein = 0 - proteins_with_failing_domains = 0 - domain_not_present_counter = 0 - protein_counter = 1 - max_domain_copy_number_per_protein = -1 - max_domain_copy_number_sequence = "" - passing_target_length_sum = 0 - overall_target_length_sum = 0 - overall_target_length_min = 10000000 - overall_target_length_max = -1 - passing_target_length_min = 10000000 - passing_target_length_max = -1 - - overall_target_ie_min = 10000000 - overall_target_ie_max = -1 - passing_target_ie_min = 10000000 - passing_target_ie_max = -1 - hmmscan_parser = HmmscanParser.new( hmmscan_output ) results = hmmscan_parser.parse #### - # Import: if multiple copies of same domain, threshold need to be same! + # Import: if multiple copies of same domain, thresholds need to be same! target_domain_ary = Array.new - target_domain_ary.push(TargetDomain.new('DNA_pol_B_exo1', 1e-6, -1, 0.6, 0 )) - target_domain_ary.push(TargetDomain.new('DNA_pol_B', 1e-6, -1, 0.6, 1 )) + #target_domain_ary.push(TargetDomain.new('DNA_pol_B_exo1', 1e-6, -1, 0.6, 0 )) + #target_domain_ary.push(TargetDomain.new('DNA_pol_B', 1e-6, -1, 0.6, 1 )) - # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.6, 0 )) - # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.6, 1 )) + #target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.9, 0 )) + #target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.9, 1 )) # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 )) # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 )) - #target_domain_ary.push(TargetDomain.new('BH4', 0.1, -1, 0.5, 0 )) - #target_domain_ary.push(TargetDomain.new('Bcl-2', 0.1, -1, 0.5, 1 )) + target_domain_ary.push(TargetDomain.new('BH4', 1, -1, 0.2, 0 )) + target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 1 )) + target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 2 )) # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 )) # target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 )) @@ -114,7 +98,7 @@ module Evoruby target_domain_ary.each do |target_domain| target_domains[target_domain.name] = target_domain if target_domain_architecure.length > 0 - target_domain_architecure += ">" + target_domain_architecure += DOMAIN_DELIMITER end target_domain_architecure += target_domain.name end @@ -132,18 +116,23 @@ module Evoruby passing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq. failing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq. total_sequences = 0 - @passed = Hash.new + + @failing_domain_architectures = Hash.new + @passsing_domain_architectures = Hash.new + @passing_domains_data = Hash.new + @failing_domains_data = Hash.new + @failing_proteins_bc_not_all_target_doms_present = 0 + @failing_proteins_bc_missing_cutoffs = 0 out_domain_msas = Hash.new out_domain_architecture_msa = Msa.new out_concatenated_domains_msa = Msa.new results.each do |hmmscan_result| if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name ) - if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_contactenated_domains_msa, target_domain_architecure) + if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure) passing_sequences.push(domains_in_query_seq) else failing_sequences.push(domains_in_query_seq) end - domains_in_query_seq = Array.new total_sequences += 1 end @@ -153,23 +142,88 @@ module Evoruby if prev_query_seq_name != nil total_sequences += 1 - if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_contactenated_domains_msa, target_domain_architecure) + if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure) passing_sequences.push(domains_in_query_seq) else failing_sequences.push(domains_in_query_seq) end end + puts + + puts 'Failing domain architectures containing one or more target domain(s):' + @failing_domain_architectures = @failing_domain_architectures .sort{|a, b|a<=>b}.to_h + failing_da_sum = 0 + @failing_domain_architectures .each do |da, count| + failing_da_sum += count + puts count.to_s.rjust(4) + ': ' + da + end + + puts + + puts 'Passing domain architectures containing target domain(s):' + @passsing_domain_architectures = @passsing_domain_architectures.sort{|a, b|a<=>b}.to_h + passing_da_sum = 0 + @passsing_domain_architectures.each do |da, count| + passing_da_sum += count + puts count.to_s.rjust(4) + ': ' + da + end + + puts 'Passing domain(s):' + @passing_domains_data = @passing_domains_data.sort{|a, b|a<=>b}.to_h + @passing_domains_data.each do |n, d| + puts d.to_str + end + + puts 'Failing domain(s):' + @failing_domains_data = @failing_domains_data.sort{|a, b|a<=>b}.to_h + @failing_domains_data.each do |n, d| + puts d.to_str + end + + unless total_sequences == (passing_sequences.size + failing_sequences.size) + error_msg = "this should not have happened: total seqs not equal to passing plus failing seqs" + raise StandardError, error_msg + end + + unless failing_sequences.size == (@failing_proteins_bc_not_all_target_doms_present + @failing_proteins_bc_missing_cutoffs) + error_msg = "this should not have happened: failing seqs sums not consistent" + raise StandardError, error_msg + end + + unless @failing_proteins_bc_missing_cutoffs == failing_da_sum + error_msg = "this should not have happened: failing seqs not equal to failing da sum" + raise StandardError, error_msg + end + + unless passing_sequences.size == passing_da_sum + error_msg = "this should not have happened: passing seqs not equal to passing da sum" + raise StandardError, error_msg + end + + puts + + puts "Protein sequences in sequence (fasta) file: " + in_msa.get_number_of_seqs.to_s.rjust(5) + puts "Protein sequences in hmmscan output file : " + total_sequences.to_s.rjust(5) + puts " Passing protein sequences : " + passing_sequences.size.to_s.rjust(5) + puts " Failing protein sequences : " + failing_sequences.size.to_s.rjust(5) + puts " Not all target domain present : " + @failing_proteins_bc_not_all_target_doms_present.to_s.rjust(5) + puts " Target domain(s) failing cutoffs : " + @failing_proteins_bc_missing_cutoffs.to_s.rjust(5) + + puts + out_domain_msas.keys.sort.each do |domain_name| - puts "writing #{domain_name}" - write_msa( out_domain_msas[domain_name], domain_name + ".fasta" ) + file_name = outfile_base + TARGET_DOM_OUTPUT_MIDPART + domain_name + '.fasta' + + write_msa( out_domain_msas[domain_name], file_name ) + puts "wrote #{domain_name}" end - puts "writing target_domain_architecure" write_msa( out_domain_architecture_msa, target_da_outfile ) + puts "wrote target_domain_architecure" - puts "writing contactenated_domains_msa" write_msa( out_concatenated_domains_msa, concat_target_dom_outfile ) + puts "wrote concatenated_domains_msa" passing_sequences.each do | domains | query_name = domains[0].query @@ -191,43 +245,17 @@ module Evoruby end end - puts - - puts 'Non passing domain architectures containing target domain(s):' - @non_passsing_domain_architectures = @non_passsing_domain_architectures.sort{|a, b|a<=>b}.to_h - @non_passsing_domain_architectures.each do |da, count| - puts da + ': ' + count.to_s - end - - puts - - puts 'Passing target domain counts:' - @passed = @passed.sort{|a, b|a<=>b}.to_h - @passed.each do |dom, count| - puts dom + ': ' + count.to_s - end - - puts - - puts "total sequences : " + total_sequences.to_s - puts "passing sequences: " + passing_sequences.size.to_s - write_msa( passed_seqs_msa, passing_fl_seqs_outfile ) + puts "wrote ..." write_msa( failed_seqs_msa, failing_fl_seqs_outfile ) + puts "wrote ..." log_str << ld - log_str << "passing target domains : " + domain_pass_counter.to_s + ld - log_str << "failing target domains : " + domain_fail_counter.to_s + ld + log_str << "proteins in sequence (fasta) file : " + in_msa.get_number_of_seqs.to_s + ld - log_str << "proteins in hmmscan outputfile : " + protein_counter.to_s + ld - log_str << "proteins with passing target domain(s) : " + passed_seqs_msa.get_number_of_seqs.to_s + ld - log_str << "proteins with no passing target domain : " + proteins_with_failing_domains.to_s + ld - log_str << "proteins with no target domain : " + domain_not_present_counter.to_s + ld log_str << ld - return domain_pass_counter - end # parse private @@ -249,7 +277,7 @@ module Evoruby domains_in_query_seq.each do |domain| domain_names_in_query_seq.add(domain.model) end - if (target_domain_names.subset?(domain_names_in_query_seq)) + if target_domain_names.subset?(domain_names_in_query_seq) passed_domains = Array.new passed_domains_counts = Hash.new @@ -260,19 +288,21 @@ module Evoruby if (target_domain.i_e_value != nil) && (target_domain.i_e_value >= 0) if domain.i_e_value > target_domain.i_e_value + addToFailingDomainData(domain) next end end if (target_domain.abs_len != nil) && (target_domain.abs_len > 0) length = 1 + domain.env_to - domain.env_from if length < target_domain.abs_len + addToFailingDomainData(domain) next end end if (target_domain.rel_len != nil) && (target_domain.rel_len > 0) length = 1 + domain.env_to - domain.env_from - # puts (target_domain.rel_len * domain.tlen) if length < (target_domain.rel_len * domain.tlen) + addToFailingDomainData(domain) next end end @@ -285,28 +315,25 @@ module Evoruby passed_domains_counts[domain.model] = 1 end - if (@passed.key?(domain.model)) - @passed[domain.model] = @passed[domain.model] + 1 - else - @passed[domain.model] = 1 - end + addToPassingDomainData(domain) + end # if target_domains.has_key?(domain.model) end # domains_in_query_seq.each do |domain| else + @failing_proteins_bc_not_all_target_doms_present += 1 return false - end + end # target_domain_names.subset?(domain_names_in_query_seq) passed_domains.sort! { |a,b| a.ali_from <=> b.ali_from } # Compare architectures if !compareArchitectures(target_domain_architecture, passed_domains) + @failing_proteins_bc_missing_cutoffs += 1 return false end domain_counter = Hash.new min_env_from = 10000000 - max_env_from = 0 - min_env_to = 10000000 max_env_to = 0 query_seq = nil @@ -347,20 +374,11 @@ module Evoruby if domain.env_from < min_env_from min_env_from = domain.env_from end - if domain.env_from > max_env_from - max_env_from = domain.env_from - end - if domain.env_to < min_env_to - min_env_to = domain.env_to - end if domain.env_to > max_env_to max_env_to = domain.env_to end end - #puts "env from: #{min_env_from} - #{max_env_from}" - #puts "env to : #{min_env_to} - #{max_env_to}" - extract_domain( query_seq, 1, 1, @@ -369,19 +387,31 @@ module Evoruby in_msa, out_domain_architecture_msa ) - puts passed_domains_counts - out_contactenated_domains_msa.add( query_seq, concatenated_domains) true end + def addToPassingDomainData(domain) + unless ( @passing_domains_data.key?(domain.model)) + @passing_domains_data[domain.model] = DomainData.new(domain.model) + end + @passing_domains_data[domain.model].add( domain.model, (1 + domain.env_to - domain.env_from), domain.i_e_value) + end + + def addToFailingDomainData(domain) + unless ( @failing_domains_data.key?(domain.model)) + @failing_domains_data[domain.model] = DomainData.new(domain.model) + end + @failing_domains_data[domain.model].add( domain.model, (1 + domain.env_to - domain.env_from), domain.i_e_value) + end + # passed_domains needs to be sorted! def compareArchitectures(target_domain_architecture, passed_domains, strict = false) - domain_architecture = "" + domain_architecture = '' passed_domains.each do |domain| if domain_architecture.length > 0 - domain_architecture += ">" + domain_architecture += DOMAIN_DELIMITER end domain_architecture += domain.model end @@ -391,11 +421,17 @@ module Evoruby else pass = (domain_architecture.index(target_domain_architecture) != nil) end - if !pass - if (@non_passsing_domain_architectures.key?(domain_architecture)) - @non_passsing_domain_architectures[domain_architecture] = @non_passsing_domain_architectures[domain_architecture] + 1 + if pass + if (@passsing_domain_architectures.key?(domain_architecture)) + @passsing_domain_architectures[domain_architecture] = @passsing_domain_architectures[domain_architecture] + 1 + else + @passsing_domain_architectures[domain_architecture] = 1 + end + else + if ( @failing_domain_architectures .key?(domain_architecture)) + @failing_domain_architectures [domain_architecture] = @failing_domain_architectures [domain_architecture] + 1 else - @non_passsing_domain_architectures[domain_architecture] = 1 + @failing_domain_architectures [domain_architecture] = 1 end end return pass @@ -471,6 +507,84 @@ module Evoruby end # class HmmscanMultiDomainExtractor + class DomainData + def initialize( name ) + if (name == nil) || name.size < 1 + error_msg = "domain name nil or empty" + raise StandardError, error_msg + end + @name = name + @count = 0 + @i_e_value_min = 10000000.0 + @i_e_value_max = -1.0 + @i_e_value_sum = 0.0 + @len_min = 10000000 + @len_max = -1 + @len_sum = 0.0 + end + + def add( name, length, i_e_value) + if name != @name + error_msg = "domain names do not match" + raise StandardError, error_msg + end + + if length < 0 + error_msg = "length cannot me negative" + raise StandardError, error_msg + end + if i_e_value < 0 + error_msg = "iE-value cannot me negative" + raise StandardError, error_msg + end + @count += 1 + @i_e_value_sum += i_e_value + @len_sum += length + if i_e_value > @i_e_value_max + @i_e_value_max = i_e_value + end + if i_e_value < @i_e_value_min + @i_e_value_min = i_e_value + end + if length > @len_max + @len_max = length + end + if length < @len_min + @len_min = length + end + end + + def avg_length + if @count == 0 + return 0 + end + @len_sum / @count + end + + def avg_i_e_value + if @count == 0 + return 0 + end + @i_e_value_sum / @count + end + + def to_str + s = '' + s << @name.rjust(24) + ': ' + s << @count.to_s.rjust(4) + ' ' + s << avg_length.round(1).to_s.rjust(6) + ' ' + s << @len_min.to_s.rjust(4) + ' -' + s << @len_max.to_s.rjust(4) + ' ' + s << ("%.2E" % avg_i_e_value).rjust(9) + ' ' + s << ("%.2E" % @i_e_value_min).rjust(9) + ' -' + s << ("%.2E" % @i_e_value_max).rjust(9) + ' ' + s + end + + attr_reader :name, :count, :i_e_value_min, :i_e_value_max, :length_min, :length_max + + end + class TargetDomain def initialize( name, i_e_value, abs_len, rel_len, position ) if (name == nil) || name.size < 1 diff --git a/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb index cf6d40e..c1a35ab 100644 --- a/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb @@ -66,7 +66,7 @@ module Evoruby fasta_sequence_file = "" outfile = "" - if (cla.get_number_of_files == 3) + if cla.get_number_of_files == 3 fasta_sequence_file = cla.get_file_name( 2 ) else hmmscan_index = hmmscan_output.index(Constants::HMMSCAN) @@ -89,8 +89,8 @@ module Evoruby end end hmmscan_index = hmmscan_output.index(Constants::HMMSCAN) - if ( hmmscan_index != nil ) - outfile = hmmscan_output.sub(Constants::HMMSCAN, "_") + "_MDSX" + if hmmscan_index != nil + outfile = hmmscan_output.sub(Constants::HMMSCAN, '_') else Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' ) end @@ -131,11 +131,10 @@ module Evoruby rescue Exception => e puts e.backtrace Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT ) - end puts - Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" ) + # Util.print_message( PRG_NAME, "wrote: " + outfile + ".fasta") # Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX ) # Util.print_message( PRG_NAME, "wrote: " + outfile + PASSED_SEQS_SUFFIX ) -- 1.7.10.2