# Copyright:: Copyright (C) 2017 Christian M. Zmasek
# License:: GNU Lesser General Public License (LGPL)
#
-# Last modified: 2017/02/20
+# Last modified: 2017/03/08
+
+####
+# Import: if multiple copies of same domain, thresholds need to be same!
+####
require 'lib/evo/util/constants'
require 'lib/evo/msa/msa_factory'
module Evoruby
class HmmscanMultiDomainExtractor
- OUTPUT_ID = 'MDSX'
+ TESTING = false
+ OUTPUT_ID = 'mdsx'
+ DOMAIN_DELIMITER = ' -- '
+
+ IE_CUTOFF_FOR_DA_OVERVIEW = 1E-6
+ REL_LEN_CUTOFF_FOR_DA_OVERVIEW = 0.5
- PASSING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_passing_full_length_seqs.fasta"
- FAILING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_failing_full_length_seqs.fasta"
- TARGET_DA_SUFFIX = "_#{OUTPUT_ID}_target_da.fasta"
- CONCAT_TARGET_DOM_SUFFIX = "_#{OUTPUT_ID}_concat_target_dom.fasta"
+ PASSING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_passing_full_length_seqs.fasta"
+ FAILING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_failing_full_length_seqs.fasta"
+ TARGET_DA_SUFFIX = "_#{OUTPUT_ID}_target_da.fasta"
+ CONCAT_TARGET_DOM_SUFFIX = "_#{OUTPUT_ID}_concat_target_doms.fasta"
+ TARGET_DOM_OUTPUT_MIDPART = "_#{OUTPUT_ID}_target_dom_"
+ LOG_FILE_SUFFIX = "_#{OUTPUT_ID}.log"
def initialize
- @passed = Hash.new
- @non_passsing_domain_architectures = Hash.new
+ @passing_domains_data = nil
+ @failing_domains_data = nil
+ @failing_domain_architectures = nil
+ @passsing_domain_architectures = nil
+ @failing_proteins_bc_not_all_target_doms_present = nil
+ @failing_proteins_bc_missing_cutoffs = nil
+ @encountered_domain_architectures = nil
+ @log = nil
end
# raises ArgumentError, IOError, StandardError
failing_fl_seqs_outfile = outfile_base + FAILING_FL_SEQS_SUFFIX
target_da_outfile = outfile_base + TARGET_DA_SUFFIX
concat_target_dom_outfile = outfile_base + CONCAT_TARGET_DOM_SUFFIX
+ logfile = outfile_base + LOG_FILE_SUFFIX
Util.check_file_for_readability( hmmscan_output )
Util.check_file_for_readability( fasta_sequence_file )
Util.check_file_for_writability( failing_fl_seqs_outfile )
Util.check_file_for_writability( target_da_outfile )
Util.check_file_for_writability( concat_target_dom_outfile )
+ Util.check_file_for_writability( logfile )
+
+ @log = log_str
in_msa = nil
factory = MsaFactory.new()
raise IOError, error_msg
end
- out_msa = Msa.new
-
failed_seqs_msa = Msa.new
passed_seqs_msa = Msa.new
- # passed_multi_seqs_msa = Msa.new
-
- ld = Constants::LINE_DELIMITER
-
- domain_pass_counter = 0
- domain_fail_counter = 0
- passing_domains_per_protein = 0
- proteins_with_failing_domains = 0
- domain_not_present_counter = 0
- protein_counter = 1
- max_domain_copy_number_per_protein = -1
- max_domain_copy_number_sequence = ""
- passing_target_length_sum = 0
- overall_target_length_sum = 0
- overall_target_length_min = 10000000
- overall_target_length_max = -1
- passing_target_length_min = 10000000
- passing_target_length_max = -1
-
- overall_target_ie_min = 10000000
- overall_target_ie_max = -1
- passing_target_ie_min = 10000000
- passing_target_ie_max = -1
hmmscan_parser = HmmscanParser.new( hmmscan_output )
results = hmmscan_parser.parse
- ####
- # Import: if multiple copies of same domain, threshold need to be same!
- target_domain_ary = Array.new
-
- target_domain_ary.push(TargetDomain.new('DNA_pol_B_exo1', 1e-6, -1, 0.6, 0 ))
- target_domain_ary.push(TargetDomain.new('DNA_pol_B', 1e-6, -1, 0.6, 1 ))
-
- # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.6, 0 ))
- # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.6, 1 ))
- # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 ))
- # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 ))
-
- #target_domain_ary.push(TargetDomain.new('BH4', 0.1, -1, 0.5, 0 ))
- #target_domain_ary.push(TargetDomain.new('Bcl-2', 0.1, -1, 0.5, 1 ))
- # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 ))
-
- # target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 ))
- # target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 1 ))
-
- #target_domain_ary.push(TargetDomain.new('Chordopox_A33R', 1000, -1, 0.1 ))
+ target_domain_ary = parse_da target_da
target_domains = Hash.new
- target_domain_architecure = ""
+ target_domain_architecure = ''
target_domain_ary.each do |target_domain|
target_domains[target_domain.name] = target_domain
if target_domain_architecure.length > 0
- target_domain_architecure += ">"
+ target_domain_architecure << DOMAIN_DELIMITER
end
- target_domain_architecure += target_domain.name
+ target_domain_architecure << target_domain.name
end
target_domain_architecure.freeze
- puts 'Target domain architecture: ' + target_domain_architecure
+ log 'Hmmscan outputfile : ' + hmmscan_output
+ log 'Full length fasta sequence file: ' + fasta_sequence_file
+ log 'Target domain architecture : ' + target_domain_architecure
+ target_domain_ary.each do |x|
+ log x.to_str
+ end
target_domain_names = Set.new
passing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq.
failing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq.
total_sequences = 0
- @passed = Hash.new
+
+ @failing_domain_architectures = Hash.new
+ @passsing_domain_architectures = Hash.new
+ @passing_domains_data = Hash.new
+ @failing_domains_data = Hash.new
+ @encountered_domain_architectures= Hash.new
+ @failing_proteins_bc_not_all_target_doms_present = 0
+ @failing_proteins_bc_missing_cutoffs = 0
out_domain_msas = Hash.new
out_domain_architecture_msa = Msa.new
out_concatenated_domains_msa = Msa.new
results.each do |hmmscan_result|
if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name )
- if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_contactenated_domains_msa, target_domain_architecure)
+ if compare(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
passing_sequences.push(domains_in_query_seq)
else
failing_sequences.push(domains_in_query_seq)
end
-
domains_in_query_seq = Array.new
total_sequences += 1
end
end # each result
if prev_query_seq_name != nil
- total_sequences += 1
- if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_contactenated_domains_msa, target_domain_architecure)
+ if compare(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
passing_sequences.push(domains_in_query_seq)
else
failing_sequences.push(domains_in_query_seq)
end
+ total_sequences += 1
end
+ if in_msa.get_number_of_seqs < total_sequences
+ error_msg = "hmmscan output contains more protein sequences than fasta sequence file"
+ raise IOError, error_msg
+ end
+
+ log
+ log 'Domain architecture overview using default iE-cutoff ' +
+ IE_CUTOFF_FOR_DA_OVERVIEW.to_s + ' and relative length cutoff ' + REL_LEN_CUTOFF_FOR_DA_OVERVIEW.to_s + ':'
+
+ @encountered_domain_architectures = @encountered_domain_architectures.sort_by {|k, v| v}.reverse.to_h
+ counter = 1;
+ @encountered_domain_architectures.each do |k, v|
+ log counter.to_s.rjust(2) + ') ' + v.to_s.rjust(5) + ': ' + k
+ counter += 1
+ if counter > 40
+ break
+ end
+ end
+
+ log
+ log 'Passing domain arrangements of target domain(s):'
+ @passsing_domain_architectures = @passsing_domain_architectures.sort{|a, b|a<=>b}.to_h
+ passing_da_sum = 0
+ @passsing_domain_architectures.each do |da, count|
+ passing_da_sum += count
+ log count.to_s.rjust(4) + ': ' + da
+ end
+ log
+ log 'Failing domain arrangements of target domain(s):'
+ @failing_domain_architectures = @failing_domain_architectures.sort{|a, b|a<=>b}.to_h
+ failing_da_sum = 0
+ @failing_domain_architectures .each do |da, count|
+ failing_da_sum += count
+ log count.to_s.rjust(4) + ': ' + da
+ end
+ log
+ log 'Passing target domain(s):'
+ @passing_domains_data = @passing_domains_data.sort{|a, b|a<=>b}.to_h
+ @passing_domains_data.each do |n, d|
+ log d.to_str
+ end
+ log
+ log 'Failing target domain(s) (in proteins sequences with target domain architecture):'
+ @failing_domains_data = @failing_domains_data.sort{|a, b|a<=>b}.to_h
+ @failing_domains_data.each do |n, d|
+ log d.to_str
+ end
+
+ unless total_sequences == (passing_sequences.size + failing_sequences.size)
+ error_msg = "this should not have happened: total seqs not equal to passing plus failing seqs"
+ raise StandardError, error_msg
+ end
+
+ unless failing_sequences.size == (@failing_proteins_bc_not_all_target_doms_present + @failing_proteins_bc_missing_cutoffs)
+ error_msg = "this should not have happened: failing seqs sums not consistent"
+ raise StandardError, error_msg
+ end
+
+ unless @failing_proteins_bc_missing_cutoffs >= failing_da_sum
+ error_msg = "this should not have happened: failing seqs larger than failing da sum"
+ raise StandardError, error_msg
+ end
+
+ unless passing_sequences.size == passing_da_sum
+ error_msg = "this should not have happened: passing seqs not equal to passing da sum"
+ raise StandardError, error_msg
+ end
+
+ log
+ log "Protein sequences in sequence (fasta) file: " + in_msa.get_number_of_seqs.to_s.rjust(5)
+ log "Protein sequences in hmmscan output file : " + total_sequences.to_s.rjust(5)
+ log " Passing protein sequences : " + passing_sequences.size.to_s.rjust(5)
+ log " Failing protein sequences : " + failing_sequences.size.to_s.rjust(5)
+ log " Not all target domain present : " + @failing_proteins_bc_not_all_target_doms_present.to_s.rjust(5)
+ log " Target domain(s) failing cutoffs : " + @failing_proteins_bc_missing_cutoffs.to_s.rjust(5)
+ log
+
out_domain_msas.keys.sort.each do |domain_name|
- puts "writing #{domain_name}"
- write_msa( out_domain_msas[domain_name], domain_name + ".fasta" )
+ file_name = outfile_base + TARGET_DOM_OUTPUT_MIDPART + domain_name + '.fasta'
+ write_msa out_domain_msas[domain_name], file_name
+ log "Wrote passing target domain sequence for " + domain_name.ljust(16) + ': ' + file_name
end
- puts "writing target_domain_architecure"
- write_msa( out_domain_architecture_msa, target_da_outfile )
+ write_msa out_domain_architecture_msa, target_da_outfile
+ log 'Wrote target domain architecture : ' + target_da_outfile
- puts "writing contactenated_domains_msa"
- write_msa( out_concatenated_domains_msa, concat_target_dom_outfile )
+ write_msa out_concatenated_domains_msa, concat_target_dom_outfile
+ log 'Wrote concatenated target domain(s) : ' + concat_target_dom_outfile
passing_sequences.each do | domains |
query_name = domains[0].query
- if passed_seqs_msa.find_by_name_start( query_name, true ).length < 1
+ if (!TESTING) || (passed_seqs_msa.find_by_name_start( query_name, true ).length < 1)
add_sequence( query_name, in_msa, passed_seqs_msa )
else
- error_msg = "this should not have happened"
+ error_msg = 'this should not have happened'
raise StandardError, error_msg
end
end
failing_sequences.each do | domains |
query_name = domains[0].query
- if failed_seqs_msa.find_by_name_start( query_name, true ).length < 1
+ if (!TESTING) || (failed_seqs_msa.find_by_name_start( query_name, true ).length < 1)
add_sequence( query_name, in_msa, failed_seqs_msa )
else
- error_msg = "this should not have happened"
+ error_msg = 'this should not have happened'
raise StandardError, error_msg
end
end
- puts
-
- puts 'Non passing domain architectures containing target domain(s):'
- @non_passsing_domain_architectures = @non_passsing_domain_architectures.sort{|a, b|a<=>b}.to_h
- @non_passsing_domain_architectures.each do |da, count|
- puts da + ': ' + count.to_s
- end
-
- puts
+ write_msa passed_seqs_msa, passing_fl_seqs_outfile
+ log 'Wrote passing full length protein sequences : ' + passing_fl_seqs_outfile
+ write_msa failed_seqs_msa, failing_fl_seqs_outfile
+ log 'Wrote failing full length protein sequences : ' + failing_fl_seqs_outfile
- puts 'Passing target domain counts:'
- @passed = @passed.sort{|a, b|a<=>b}.to_h
- @passed.each do |dom, count|
- puts dom + ': ' + count.to_s
+ begin
+ f = File.open( logfile, 'w' )
+ f.print( @log )
+ f.close
+ rescue Exception => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s )
end
-
- puts
-
- puts "total sequences : " + total_sequences.to_s
- puts "passing sequences: " + passing_sequences.size.to_s
-
- write_msa( passed_seqs_msa, passing_fl_seqs_outfile )
- write_msa( failed_seqs_msa, failing_fl_seqs_outfile )
-
- log_str << ld
- log_str << "passing target domains : " + domain_pass_counter.to_s + ld
- log_str << "failing target domains : " + domain_fail_counter.to_s + ld
- log_str << "proteins in sequence (fasta) file : " + in_msa.get_number_of_seqs.to_s + ld
- log_str << "proteins in hmmscan outputfile : " + protein_counter.to_s + ld
- log_str << "proteins with passing target domain(s) : " + passed_seqs_msa.get_number_of_seqs.to_s + ld
- log_str << "proteins with no passing target domain : " + proteins_with_failing_domains.to_s + ld
- log_str << "proteins with no target domain : " + domain_not_present_counter.to_s + ld
-
- log_str << ld
-
- return domain_pass_counter
+ log 'Wrote log file : ' + logfile
end # parse
private
+ # domains: Array of HmmscanResult
+ def collect(domains, ie_cutoff, rel_len_cutoff)
+ passed = Array.new
+ domains.each do |domain|
+ length = 1 + domain.env_to - domain.env_from
+ if (domain.i_e_value <= ie_cutoff) && (length >= (rel_len_cutoff * domain.tlen))
+ passed.push domain
+ end
+ end
+ passed.sort! { |a,b| a.ali_from <=> b.ali_from }
+ s = ''
+ passed.each do |domain|
+ if s.length > 0
+ s << DOMAIN_DELIMITER
+ end
+ s << domain.model
+ end
+ if s.length > 0
+ if @encountered_domain_architectures.has_key? s
+ @encountered_domain_architectures[s] = 1 + @encountered_domain_architectures[s]
+ else
+ @encountered_domain_architectures[s] = 1
+ end
+ end
+ end
+
# domains_in_query_seq: Array of HmmscanResult
# target_domain_names: Set of String
# target_domains: Hash String->TargetDomain
# target_domain_architecture: String
- def checkit2(domains_in_query_seq,
+ def compare(domains_in_query_seq,
target_domain_names,
target_domains,
in_msa,
out_contactenated_domains_msa,
target_domain_architecture)
+ collect(domains_in_query_seq, IE_CUTOFF_FOR_DA_OVERVIEW, REL_LEN_CUTOFF_FOR_DA_OVERVIEW)
+
domain_names_in_query_seq = Set.new
domains_in_query_seq.each do |domain|
domain_names_in_query_seq.add(domain.model)
end
- if (target_domain_names.subset?(domain_names_in_query_seq))
- passed_domains = Array.new
- passed_domains_counts = Hash.new
+ passed_domains = Array.new
+ passed_domains_counts = Hash.new
+
+ if (domain_names_in_query_seq.length > 0) && (target_domain_names.subset?(domain_names_in_query_seq))
domains_in_query_seq.each do |domain|
if target_domains.has_key?(domain.model)
target_domain = target_domains[domain.model]
- if (target_domain.i_e_value != nil) && (target_domain.i_e_value >= 0)
+ if target_domain.i_e_value >= 0
if domain.i_e_value > target_domain.i_e_value
+ addToFailingDomainData(domain)
next
end
end
- if (target_domain.abs_len != nil) && (target_domain.abs_len > 0)
+ if target_domain.abs_len > 0
length = 1 + domain.env_to - domain.env_from
if length < target_domain.abs_len
+ addToFailingDomainData(domain)
next
end
end
- if (target_domain.rel_len != nil) && (target_domain.rel_len > 0)
+ if target_domain.rel_len > 0
length = 1 + domain.env_to - domain.env_from
- # puts (target_domain.rel_len * domain.tlen)
if length < (target_domain.rel_len * domain.tlen)
+ addToFailingDomainData(domain)
next
end
end
passed_domains_counts[domain.model] = 1
end
- if (@passed.key?(domain.model))
- @passed[domain.model] = @passed[domain.model] + 1
- else
- @passed[domain.model] = 1
- end
+ addToPassingDomainData(domain)
+
end # if target_domains.has_key?(domain.model)
end # domains_in_query_seq.each do |domain|
else
+ @failing_proteins_bc_not_all_target_doms_present += 1
+ return false
+ end # target_domain_names.subset?(domain_names_in_query_seq)
+
+ if passed_domains.length < 1
+ @failing_proteins_bc_missing_cutoffs += 1
return false
end
passed_domains.sort! { |a,b| a.ali_from <=> b.ali_from }
# Compare architectures
- if !compareArchitectures(target_domain_architecture, passed_domains)
+ if !compareArchitectures(target_domain_architecture, passed_domains, false)
+ @failing_proteins_bc_missing_cutoffs += 1
return false
end
domain_counter = Hash.new
min_env_from = 10000000
- max_env_from = 0
- min_env_to = 10000000
max_env_to = 0
query_seq = nil
if domain.env_from < min_env_from
min_env_from = domain.env_from
end
- if domain.env_from > max_env_from
- max_env_from = domain.env_from
- end
- if domain.env_to < min_env_to
- min_env_to = domain.env_to
- end
if domain.env_to > max_env_to
max_env_to = domain.env_to
end
end
- #puts "env from: #{min_env_from} - #{max_env_from}"
- #puts "env to : #{min_env_to} - #{max_env_to}"
-
extract_domain( query_seq,
1,
1,
in_msa,
out_domain_architecture_msa )
- puts passed_domains_counts
-
out_contactenated_domains_msa.add( query_seq, concatenated_domains)
true
end
+ def addToPassingDomainData(domain)
+ unless ( @passing_domains_data.key?(domain.model))
+ @passing_domains_data[domain.model] = DomainData.new(domain.model)
+ end
+ @passing_domains_data[domain.model].add( domain.model, (1 + domain.env_to - domain.env_from), domain.i_e_value)
+ end
+
+ def addToFailingDomainData(domain)
+ unless ( @failing_domains_data.key?(domain.model))
+ @failing_domains_data[domain.model] = DomainData.new(domain.model)
+ end
+ @failing_domains_data[domain.model].add( domain.model, (1 + domain.env_to - domain.env_from), domain.i_e_value)
+ end
+
# passed_domains needs to be sorted!
def compareArchitectures(target_domain_architecture, passed_domains, strict = false)
- domain_architecture = ""
+ domain_architecture = ''
passed_domains.each do |domain|
if domain_architecture.length > 0
- domain_architecture += ">"
+ domain_architecture += DOMAIN_DELIMITER
end
domain_architecture += domain.model
end
else
pass = (domain_architecture.index(target_domain_architecture) != nil)
end
- if !pass
- if (@non_passsing_domain_architectures.key?(domain_architecture))
- @non_passsing_domain_architectures[domain_architecture] = @non_passsing_domain_architectures[domain_architecture] + 1
+ if pass
+ if (@passsing_domain_architectures.key?(domain_architecture))
+ @passsing_domain_architectures[domain_architecture] = @passsing_domain_architectures[domain_architecture] + 1
+ else
+ @passsing_domain_architectures[domain_architecture] = 1
+ end
+ else
+ if ( @failing_domain_architectures.key?(domain_architecture))
+ @failing_domain_architectures[domain_architecture] = @failing_domain_architectures[domain_architecture] + 1
else
- @non_passsing_domain_architectures[domain_architecture] = 1
+ @failing_domain_architectures[domain_architecture] = 1
end
end
return pass
w = FastaWriter.new()
w.set_line_width( 60 )
w.clean( true )
- File.delete(filename) if File.exist?(filename) #TODO remove me
+ File.delete(filename) if File.exist?(filename)
begin
io.write_to_file( msa, filename, w )
rescue Exception
seq.get_sequence_as_string
end
+ def parse_da( target_da_str )
+ target_domain_hash = Hash.new
+ target_domain_ary = Array.new
+ target_das = target_da_str.split '--'
+ target_das.each do |x|
+ inds = x.split '='
+ unless inds.size == 1 || inds.size == 4
+ raise IOError, 'domain architecture is ill formatted: ' + x
+ end
+
+ target_domain_name = inds[0]
+ ie_cutoff = inds.size == 4 ? Float(inds[1]) : IE_CUTOFF_FOR_DA_OVERVIEW
+ abs_len_cutoff = inds.size == 4 ? Integer(inds[2]) : 0
+ rel_len_cutoff = inds.size == 4 ? Float(inds[3]) : REL_LEN_CUTOFF_FOR_DA_OVERVIEW
+ if target_domain_hash.has_key? target_domain_name
+ target_domain_ary.push target_domain_hash[target_domain_name]
+ else
+ td = TargetDomain.new(target_domain_name, ie_cutoff, abs_len_cutoff, rel_len_cutoff)
+ target_domain_hash[target_domain_name] = td
+ target_domain_ary.push td
+ end
+ end
+ target_domain_ary
+ end
+
+ def log(str = '')
+ puts str
+ @log << str << Constants::LINE_DELIMITER
+ end
+
end # class HmmscanMultiDomainExtractor
+ class DomainData
+ def initialize( name )
+ if (name == nil) || name.size < 1
+ error_msg = "domain name nil or empty"
+ raise IOError, error_msg
+ end
+ @name = name
+ @count = 0
+ @i_e_value_min = 10000000.0
+ @i_e_value_max = -1.0
+ @i_e_value_sum = 0.0
+ @len_min = 10000000
+ @len_max = -1
+ @len_sum = 0.0
+ end
+
+ def add( name, length, i_e_value)
+ if name != @name
+ error_msg = "domain names do not match"
+ raise IOError, error_msg
+ end
+
+ if length < 0
+ error_msg = "length cannot me negative"
+ raise IOError, error_msg
+ end
+ if i_e_value < 0
+ error_msg = "iE-value cannot me negative"
+ raise IOError, error_msg
+ end
+ @count += 1
+ @i_e_value_sum += i_e_value
+ @len_sum += length
+ if i_e_value > @i_e_value_max
+ @i_e_value_max = i_e_value
+ end
+ if i_e_value < @i_e_value_min
+ @i_e_value_min = i_e_value
+ end
+ if length > @len_max
+ @len_max = length
+ end
+ if length < @len_min
+ @len_min = length
+ end
+ end
+
+ def avg_length
+ if @count == 0
+ return 0
+ end
+ @len_sum / @count
+ end
+
+ def avg_i_e_value
+ if @count == 0
+ return 0
+ end
+ @i_e_value_sum / @count
+ end
+
+ def to_str
+ s = ''
+ s << @name.rjust(16) + ': '
+ s << @count.to_s.rjust(4) + ' '
+ s << avg_length.round(1).to_s.rjust(6) + ' '
+ s << @len_min.to_s.rjust(4) + ' -'
+ s << @len_max.to_s.rjust(4) + ' '
+ s << ("%.2E" % avg_i_e_value).rjust(9) + ' '
+ s << ("%.2E" % @i_e_value_min).rjust(9) + ' -'
+ s << ("%.2E" % @i_e_value_max).rjust(9) + ' '
+ s
+ end
+
+ attr_reader :name, :count, :i_e_value_min, :i_e_value_max, :length_min, :length_max
+
+ end
+
class TargetDomain
- def initialize( name, i_e_value, abs_len, rel_len, position )
+ def initialize(name, i_e_value, abs_len, rel_len)
if (name == nil) || name.size < 1
error_msg = "target domain name nil or empty"
- raise StandardError, error_msg
+ raise IOError, error_msg
end
if rel_len > 1
- error_msg = "target domain relative length is greater than 1"
- raise StandardError, error_msg
+ error_msg = name + ": target domain relative length is greater than 1"
+ raise IOError, error_msg
+ end
+ if (abs_len <= 0) && (rel_len <= 0)
+ error_msg = name + ": need to have either absolute length or relative length cutoff"
+ raise IOError, error_msg
+ end
+ if (abs_len > 0) && (rel_len > 0)
+ error_msg = name + ": cannot have both absolute length and relative length cutoff"
+ raise IOError, error_msg
end
@name = name
@i_e_value = i_e_value
@abs_len = abs_len
@rel_len = rel_len
- @position = position
end
- attr_reader :name, :i_e_value, :abs_len, :rel_len, :position
+
+ def to_str
+ s = @name.rjust(16) + ':'
+ s << ' iE-cutoff: ' + ("%.2E" % @i_e_value).rjust(9)
+ if @abs_len > 0
+ s << ', abs len-cutoff: ' + @abs_len.to_s.rjust(4)
+ end
+ if @rel_len > 0
+ s << ', rel len-cutoff: ' + @rel_len.to_s.rjust(4)
+ end
+ s
+ end
+ attr_reader :name, :i_e_value, :abs_len, :rel_len
end
end # module Evoruby