From 112213800fbe25848a6a5eaa40ee88a68da7b5dc Mon Sep 17 00:00:00 2001 From: cmzmasek Date: Wed, 8 Mar 2017 12:03:02 -0800 Subject: [PATCH] in progress --- .../io/parser/hmmscan_multi_domain_extractor.rb | 53 ++++++++++++++++++-- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb index c8cbc8d..4f4ac4f 100644 --- a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb @@ -20,6 +20,9 @@ module Evoruby OUTPUT_ID = 'mdsx' DOMAIN_DELIMITER = ' -- ' + IE_CUTOFF_FOR_DA_OVERVIEW = 1E-6 + REL_LEN_CUTOFF_FOR_DA_OVERVIEW = 0.5 + PASSING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_passing_full_length_seqs.fasta" FAILING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_failing_full_length_seqs.fasta" TARGET_DA_SUFFIX = "_#{OUTPUT_ID}_target_da.fasta" @@ -33,6 +36,7 @@ module Evoruby @passsing_domain_architectures = nil @failing_proteins_bc_not_all_target_doms_present = nil @failing_proteins_bc_missing_cutoffs = nil + @encountered_domain_architectures = nil @log = nil end @@ -99,14 +103,14 @@ module Evoruby target_domains = Hash.new - target_domain_architecure = "" + target_domain_architecure = '' target_domain_ary.each do |target_domain| target_domains[target_domain.name] = target_domain if target_domain_architecure.length > 0 - target_domain_architecure += DOMAIN_DELIMITER + target_domain_architecure << DOMAIN_DELIMITER end - target_domain_architecure += target_domain.name + target_domain_architecure << target_domain.name end target_domain_architecure.freeze @@ -132,6 +136,7 @@ module Evoruby @passsing_domain_architectures = Hash.new @passing_domains_data = Hash.new @failing_domains_data = Hash.new + @encountered_domain_architectures= Hash.new @failing_proteins_bc_not_all_target_doms_present = 0 @failing_proteins_bc_missing_cutoffs = 0 out_domain_msas = Hash.new @@ -166,6 +171,20 @@ module Evoruby end log + log 'Domain architecture overview using default iE-cutoff ' + + IE_CUTOFF_FOR_DA_OVERVIEW.to_s + ' and relative length cutoff ' + REL_LEN_CUTOFF_FOR_DA_OVERVIEW.to_s + ':' + + @encountered_domain_architectures = @encountered_domain_architectures.sort_by {|k, v| v}.reverse.to_h + counter = 1; + @encountered_domain_architectures.each do |k, v| + log counter.to_s.rjust(2) + ') ' + v.to_s.rjust(5) + ': ' + k + counter += 1 + if counter > 80 + break + end + end + + log log 'Passing domain architectures containing target domain(s):' @passsing_domain_architectures = @passsing_domain_architectures.sort{|a, b|a<=>b}.to_h passing_da_sum = 0 @@ -273,6 +292,32 @@ module Evoruby private + # domains: Array of HmmscanResult + def collect(domains, ie_cutoff, rel_len_cutoff) + passed = Array.new + domains.each do |domain| + length = 1 + domain.env_to - domain.env_from + if (domain.i_e_value <= ie_cutoff) && (length >= (rel_len_cutoff * domain.tlen)) + passed.push domain + end + end + passed.sort! { |a,b| a.ali_from <=> b.ali_from } + s = '' + passed.each do |domain| + if s.length > 0 + s << DOMAIN_DELIMITER + end + s << domain.model + end + if s.length > 0 + if @encountered_domain_architectures.has_key? s + @encountered_domain_architectures[s] = 1 + @encountered_domain_architectures[s] + else + @encountered_domain_architectures[s] = 1 + end + end + end + # domains_in_query_seq: Array of HmmscanResult # target_domain_names: Set of String # target_domains: Hash String->TargetDomain @@ -286,6 +331,8 @@ module Evoruby out_contactenated_domains_msa, target_domain_architecture) + collect(domains_in_query_seq, IE_CUTOFF_FOR_DA_OVERVIEW, REL_LEN_CUTOFF_FOR_DA_OVERVIEW) + domain_names_in_query_seq = Set.new domains_in_query_seq.each do |domain| domain_names_in_query_seq.add(domain.model) -- 1.7.10.2