From 2baffb4b606ab667b2ac861c613d96c3cfca9156 Mon Sep 17 00:00:00 2001 From: cmzmasek Date: Wed, 8 Mar 2017 17:09:59 -0800 Subject: [PATCH] v 1.001 --- .../io/parser/hmmscan_multi_domain_extractor.rb | 65 ++++++++------------ .../lib/evo/tool/multi_domain_seq_extractor.rb | 14 +++-- 2 files changed, 36 insertions(+), 43 deletions(-) diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb index 4f4ac4f..fe62a42 100644 --- a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb @@ -4,7 +4,11 @@ # Copyright:: Copyright (C) 2017 Christian M. Zmasek # License:: GNU Lesser General Public License (LGPL) # -# Last modified: 2017/02/20 +# Last modified: 2017/03/08 + +#### +# Import: if multiple copies of same domain, thresholds need to be same! +#### require 'lib/evo/util/constants' require 'lib/evo/msa/msa_factory' @@ -78,29 +82,8 @@ module Evoruby hmmscan_parser = HmmscanParser.new( hmmscan_output ) results = hmmscan_parser.parse - #### - # Import: if multiple copies of same domain, thresholds need to be same! - - #target_domain_ary.push(TargetDomain.new('DNA_pol_B_exo1', 1e-6, -1, 0.6, 0 )) - #target_domain_ary.push(TargetDomain.new('DNA_pol_B', 1e-6, -1, 0.6, 1 )) - - #target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.9, 0 )) - #target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.9, 1 )) - # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 )) - # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 )) - target_domain_ary = parse_da target_da - # target_domain_ary.push(TargetDomain.new('BH4', 1, -1, 0.2, 0 )) - # target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 1 )) - # target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 2 )) - # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 )) - - # target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 )) - # target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 1 )) - - #target_domain_ary.push(TargetDomain.new('Chordopox_A33R', 1000, -1, 0.1 )) - target_domains = Hash.new target_domain_architecure = '' @@ -179,13 +162,13 @@ module Evoruby @encountered_domain_architectures.each do |k, v| log counter.to_s.rjust(2) + ') ' + v.to_s.rjust(5) + ': ' + k counter += 1 - if counter > 80 + if counter > 40 break end end log - log 'Passing domain architectures containing target domain(s):' + log 'Passing domain arrangements of target domain(s):' @passsing_domain_architectures = @passsing_domain_architectures.sort{|a, b|a<=>b}.to_h passing_da_sum = 0 @passsing_domain_architectures.each do |da, count| @@ -193,8 +176,8 @@ module Evoruby log count.to_s.rjust(4) + ': ' + da end log - log 'Failing domain architectures containing one or more target domain(s):' - @failing_domain_architectures = @failing_domain_architectures .sort{|a, b|a<=>b}.to_h + log 'Failing domain arrangements of target domain(s):' + @failing_domain_architectures = @failing_domain_architectures.sort{|a, b|a<=>b}.to_h failing_da_sum = 0 @failing_domain_architectures .each do |da, count| failing_da_sum += count @@ -223,8 +206,8 @@ module Evoruby raise StandardError, error_msg end - unless @failing_proteins_bc_missing_cutoffs == failing_da_sum - error_msg = "this should not have happened: failing seqs not equal to failing da sum" + unless @failing_proteins_bc_missing_cutoffs >= failing_da_sum + error_msg = "this should not have happened: failing seqs larger than failing da sum" raise StandardError, error_msg end @@ -337,29 +320,30 @@ module Evoruby domains_in_query_seq.each do |domain| domain_names_in_query_seq.add(domain.model) end - if target_domain_names.subset?(domain_names_in_query_seq) - passed_domains = Array.new - passed_domains_counts = Hash.new + passed_domains = Array.new + passed_domains_counts = Hash.new + + if (domain_names_in_query_seq.length > 0) && (target_domain_names.subset?(domain_names_in_query_seq)) domains_in_query_seq.each do |domain| if target_domains.has_key?(domain.model) target_domain = target_domains[domain.model] - if (target_domain.i_e_value != nil) && (target_domain.i_e_value >= 0) + if target_domain.i_e_value >= 0 if domain.i_e_value > target_domain.i_e_value addToFailingDomainData(domain) next end end - if (target_domain.abs_len != nil) && (target_domain.abs_len > 0) + if target_domain.abs_len > 0 length = 1 + domain.env_to - domain.env_from if length < target_domain.abs_len addToFailingDomainData(domain) next end end - if (target_domain.rel_len != nil) && (target_domain.rel_len > 0) + if target_domain.rel_len > 0 length = 1 + domain.env_to - domain.env_from if length < (target_domain.rel_len * domain.tlen) addToFailingDomainData(domain) @@ -384,9 +368,14 @@ module Evoruby return false end # target_domain_names.subset?(domain_names_in_query_seq) + if passed_domains.length < 1 + @failing_proteins_bc_missing_cutoffs += 1 + return false + end + passed_domains.sort! { |a,b| a.ali_from <=> b.ali_from } # Compare architectures - if !compareArchitectures(target_domain_architecture, passed_domains) + if !compareArchitectures(target_domain_architecture, passed_domains, false) @failing_proteins_bc_missing_cutoffs += 1 return false end @@ -488,10 +477,10 @@ module Evoruby @passsing_domain_architectures[domain_architecture] = 1 end else - if ( @failing_domain_architectures .key?(domain_architecture)) - @failing_domain_architectures [domain_architecture] = @failing_domain_architectures [domain_architecture] + 1 + if ( @failing_domain_architectures.key?(domain_architecture)) + @failing_domain_architectures[domain_architecture] = @failing_domain_architectures[domain_architecture] + 1 else - @failing_domain_architectures [domain_architecture] = 1 + @failing_domain_architectures[domain_architecture] = 1 end end return pass diff --git a/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb index 762556b..506743a 100644 --- a/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb @@ -13,9 +13,9 @@ module Evoruby class MultiDomainSeqExtractor PRG_NAME = "mdsx" - PRG_VERSION = "1.000" + PRG_VERSION = "1.001" PRG_DESC = "Extraction of multi domain sequences from hmmscan output" - PRG_DATE = "20170307" + PRG_DATE = "2017/03/08" WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" HELP_OPTION_1 = 'help' @@ -127,13 +127,17 @@ module Evoruby puts puts "Usage:" puts - puts " " + PRG_NAME + ".rb [file containing complete sequences in fasta format]" + puts " " + PRG_NAME + ".rb [file containing complete sequences in fasta format]" puts - puts " options: -" + puts "Format for target dom architecture:" + puts + puts " domainA=iE-cutoff=abs-len-cutoff=rel-len-cutoff--domainB=..." puts puts "Examples:" puts - puts " " + PRG_NAME + ".rb " + puts " " + PRG_NAME + ".rb BH4=1e-6=0=0.5--Bcl-2=1e-6=0=0.5 Bcl2_hmmscan_#{Constants::PFAM_V_FOR_EX}_10" + puts + puts " " + PRG_NAME + ".rb BH4=0.1=20=0--Bcl-2=0.1=50=0 Bcl2_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 Bcl2_ni.fasta" puts puts end -- 1.7.10.2