# Copyright:: Copyright (C) 2017 Christian M. Zmasek
# License:: GNU Lesser General Public License (LGPL)
#
-# Last modified: 2017/02/20
+# Last modified: 2017/03/08
+
+####
+# Import: if multiple copies of same domain, thresholds need to be same!
+####
require 'lib/evo/util/constants'
require 'lib/evo/msa/msa_factory'
OUTPUT_ID = 'mdsx'
DOMAIN_DELIMITER = ' -- '
+ IE_CUTOFF_FOR_DA_OVERVIEW = 1E-6
+ REL_LEN_CUTOFF_FOR_DA_OVERVIEW = 0.5
+
PASSING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_passing_full_length_seqs.fasta"
FAILING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_failing_full_length_seqs.fasta"
TARGET_DA_SUFFIX = "_#{OUTPUT_ID}_target_da.fasta"
@passsing_domain_architectures = nil
@failing_proteins_bc_not_all_target_doms_present = nil
@failing_proteins_bc_missing_cutoffs = nil
+ @encountered_domain_architectures = nil
@log = nil
end
hmmscan_parser = HmmscanParser.new( hmmscan_output )
results = hmmscan_parser.parse
- ####
- # Import: if multiple copies of same domain, thresholds need to be same!
-
- #target_domain_ary.push(TargetDomain.new('DNA_pol_B_exo1', 1e-6, -1, 0.6, 0 ))
- #target_domain_ary.push(TargetDomain.new('DNA_pol_B', 1e-6, -1, 0.6, 1 ))
-
- #target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.9, 0 ))
- #target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.9, 1 ))
- # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 ))
- # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 ))
-
target_domain_ary = parse_da target_da
- # target_domain_ary.push(TargetDomain.new('BH4', 1, -1, 0.2, 0 ))
- # target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 1 ))
- # target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 2 ))
- # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 ))
-
- # target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 ))
- # target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 1 ))
-
- #target_domain_ary.push(TargetDomain.new('Chordopox_A33R', 1000, -1, 0.1 ))
-
target_domains = Hash.new
- target_domain_architecure = ""
+ target_domain_architecure = ''
target_domain_ary.each do |target_domain|
target_domains[target_domain.name] = target_domain
if target_domain_architecure.length > 0
- target_domain_architecure += DOMAIN_DELIMITER
+ target_domain_architecure << DOMAIN_DELIMITER
end
- target_domain_architecure += target_domain.name
+ target_domain_architecure << target_domain.name
end
target_domain_architecure.freeze
@passsing_domain_architectures = Hash.new
@passing_domains_data = Hash.new
@failing_domains_data = Hash.new
+ @encountered_domain_architectures= Hash.new
@failing_proteins_bc_not_all_target_doms_present = 0
@failing_proteins_bc_missing_cutoffs = 0
out_domain_msas = Hash.new
end
log
- log 'Passing domain architectures containing target domain(s):'
+ log 'Domain architecture overview using default iE-cutoff ' +
+ IE_CUTOFF_FOR_DA_OVERVIEW.to_s + ' and relative length cutoff ' + REL_LEN_CUTOFF_FOR_DA_OVERVIEW.to_s + ':'
+
+ @encountered_domain_architectures = @encountered_domain_architectures.sort_by {|k, v| v}.reverse.to_h
+ counter = 1;
+ @encountered_domain_architectures.each do |k, v|
+ log counter.to_s.rjust(2) + ') ' + v.to_s.rjust(5) + ': ' + k
+ counter += 1
+ if counter > 40
+ break
+ end
+ end
+
+ log
+ log 'Passing domain arrangements of target domain(s):'
@passsing_domain_architectures = @passsing_domain_architectures.sort{|a, b|a<=>b}.to_h
passing_da_sum = 0
@passsing_domain_architectures.each do |da, count|
log count.to_s.rjust(4) + ': ' + da
end
log
- log 'Failing domain architectures containing one or more target domain(s):'
- @failing_domain_architectures = @failing_domain_architectures .sort{|a, b|a<=>b}.to_h
+ log 'Failing domain arrangements of target domain(s):'
+ @failing_domain_architectures = @failing_domain_architectures.sort{|a, b|a<=>b}.to_h
failing_da_sum = 0
@failing_domain_architectures .each do |da, count|
failing_da_sum += count
log d.to_str
end
log
- log'Failing target domain(s):'
+ log 'Failing target domain(s) (in proteins sequences with target domain architecture):'
@failing_domains_data = @failing_domains_data.sort{|a, b|a<=>b}.to_h
@failing_domains_data.each do |n, d|
log d.to_str
raise StandardError, error_msg
end
- unless @failing_proteins_bc_missing_cutoffs == failing_da_sum
- error_msg = "this should not have happened: failing seqs not equal to failing da sum"
+ unless @failing_proteins_bc_missing_cutoffs >= failing_da_sum
+ error_msg = "this should not have happened: failing seqs larger than failing da sum"
raise StandardError, error_msg
end
private
+ # domains: Array of HmmscanResult
+ def collect(domains, ie_cutoff, rel_len_cutoff)
+ passed = Array.new
+ domains.each do |domain|
+ length = 1 + domain.env_to - domain.env_from
+ if (domain.i_e_value <= ie_cutoff) && (length >= (rel_len_cutoff * domain.tlen))
+ passed.push domain
+ end
+ end
+ passed.sort! { |a,b| a.ali_from <=> b.ali_from }
+ s = ''
+ passed.each do |domain|
+ if s.length > 0
+ s << DOMAIN_DELIMITER
+ end
+ s << domain.model
+ end
+ if s.length > 0
+ if @encountered_domain_architectures.has_key? s
+ @encountered_domain_architectures[s] = 1 + @encountered_domain_architectures[s]
+ else
+ @encountered_domain_architectures[s] = 1
+ end
+ end
+ end
+
# domains_in_query_seq: Array of HmmscanResult
# target_domain_names: Set of String
# target_domains: Hash String->TargetDomain
out_contactenated_domains_msa,
target_domain_architecture)
+ collect(domains_in_query_seq, IE_CUTOFF_FOR_DA_OVERVIEW, REL_LEN_CUTOFF_FOR_DA_OVERVIEW)
+
domain_names_in_query_seq = Set.new
domains_in_query_seq.each do |domain|
domain_names_in_query_seq.add(domain.model)
end
- if target_domain_names.subset?(domain_names_in_query_seq)
- passed_domains = Array.new
- passed_domains_counts = Hash.new
+ passed_domains = Array.new
+ passed_domains_counts = Hash.new
+
+ if (domain_names_in_query_seq.length > 0) && (target_domain_names.subset?(domain_names_in_query_seq))
domains_in_query_seq.each do |domain|
if target_domains.has_key?(domain.model)
target_domain = target_domains[domain.model]
- if (target_domain.i_e_value != nil) && (target_domain.i_e_value >= 0)
+ if target_domain.i_e_value >= 0
if domain.i_e_value > target_domain.i_e_value
addToFailingDomainData(domain)
next
end
end
- if (target_domain.abs_len != nil) && (target_domain.abs_len > 0)
+ if target_domain.abs_len > 0
length = 1 + domain.env_to - domain.env_from
if length < target_domain.abs_len
addToFailingDomainData(domain)
next
end
end
- if (target_domain.rel_len != nil) && (target_domain.rel_len > 0)
+ if target_domain.rel_len > 0
length = 1 + domain.env_to - domain.env_from
if length < (target_domain.rel_len * domain.tlen)
addToFailingDomainData(domain)
return false
end # target_domain_names.subset?(domain_names_in_query_seq)
+ if passed_domains.length < 1
+ @failing_proteins_bc_missing_cutoffs += 1
+ return false
+ end
+
passed_domains.sort! { |a,b| a.ali_from <=> b.ali_from }
# Compare architectures
- if !compareArchitectures(target_domain_architecture, passed_domains)
+ if !compareArchitectures(target_domain_architecture, passed_domains, false)
@failing_proteins_bc_missing_cutoffs += 1
return false
end
@passsing_domain_architectures[domain_architecture] = 1
end
else
- if ( @failing_domain_architectures .key?(domain_architecture))
- @failing_domain_architectures [domain_architecture] = @failing_domain_architectures [domain_architecture] + 1
+ if ( @failing_domain_architectures.key?(domain_architecture))
+ @failing_domain_architectures[domain_architecture] = @failing_domain_architectures[domain_architecture] + 1
else
- @failing_domain_architectures [domain_architecture] = 1
+ @failing_domain_architectures[domain_architecture] = 1
end
end
return pass
target_das = target_da_str.split '--'
target_das.each do |x|
inds = x.split '='
- unless inds.size == 4
+ unless inds.size == 1 || inds.size == 4
raise IOError, 'domain architecture is ill formatted: ' + x
end
+
target_domain_name = inds[0]
- ie_cutoff = Float(inds[1])
- abs_len_cutoff = Integer(inds[2])
- rel_len_cutoff = Float(inds[3])
+ ie_cutoff = inds.size == 4 ? Float(inds[1]) : IE_CUTOFF_FOR_DA_OVERVIEW
+ abs_len_cutoff = inds.size == 4 ? Integer(inds[2]) : 0
+ rel_len_cutoff = inds.size == 4 ? Float(inds[3]) : REL_LEN_CUTOFF_FOR_DA_OVERVIEW
if target_domain_hash.has_key? target_domain_name
target_domain_ary.push target_domain_hash[target_domain_name]
else