module Evoruby
class HmmscanMultiDomainExtractor
def initialize
+ @passed = Hash.new
+ @non_passsing_domain_architectures = Hash.new
end
# raises ArgumentError, IOError, StandardError
failed_seqs = Msa.new
passed_seqs = Msa.new
+ passed_multi_seqs = Msa.new
ld = Constants::LINE_DELIMITER
results = hmmscan_parser.parse
####
+ # Import: if multiple copies of same domain, threshold need to be same!
+ target_domain_ary = Array.new
+ target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.6, 1 ))
+ target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.6, 1 ))
+ # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 ))
+ # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 ))
- pc0 = PassCondition.new('Bcl-2', 0.01, -1, 0.5 )
+ #target_domain_ary.push(TargetDomain.new('BH4', 0.1, -1, 0.5, 0 ))
+ #target_domain_ary.push(TargetDomain.new('Bcl-2', 0.1, -1, 0.5, 1 ))
+ # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 ))
- pcs = Hash.new
- pcs["Bcl-2"] = pc0
+ # target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 ))
+ # target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 1 ))
- prev_query = nil
- domains_in_query_ary = Array.new
- queries_ary = Array.new
+ #target_domain_ary.push(TargetDomain.new('Chordopox_A33R', 1000, -1, 0.1 ))
+
+ target_domains = Hash.new
+
+ target_domain_architecure = ""
+
+ target_domain_ary.each do |target_domain|
+ target_domains[target_domain.name] = target_domain
+ if target_domain_architecure.length > 0
+ target_domain_architecure += ">"
+ end
+ target_domain_architecure += target_domain.name
+ end
+
+ target_domain_architecure.freeze
+
+ puts target_domain_architecure
+
+ target_domain_names = Set.new
+
+ target_domains.each_key {|key| target_domain_names.add( key ) }
+
+ prev_query_seq_name = nil
+ domains_in_query_seq = Array.new
+ passing_sequences = Array.new
+ total_sequences = 0
+ @passed = Hash.new
+ out_domain_msas = Hash.new
+ out_domain_architecture_msa = Msa.new
results.each do |hmmscan_result|
- if ( prev_query != nil ) && ( hmmscan_result.query != prev_query )
- ###--
- pass = true
- domains_in_query_ary.each do |domain_in_query|
- if pcs.has_key?(domain_in_query.model)
- pc = pcs[domain_in_query.model]
- # @abs_len = abs_len
- # @percent_len = rel_len
- if (pc.i_e_value != nil) && (pc.i_e_value >= 0)
- if domain_in_query.i_e_value > pc.i_e_value
- pass = false
- #break
- end
- end
- if (pc.abs_len != nil) && (pc.abs_len > 0)
- length = 1 + domain_in_query.env_to - domain_in_query.env_from
- if length < pc.abs_len
- pass = false
- #break
- end
- end
- if (pc.rel_len != nil) && (pc.rel_len > 0)
- length = 1 + domain_in_query.env_to - domain_in_query.env_from
- if length < (pc.rel_len * domain_in_query.tlen)
- pass = false
- #break
- end
- end
- end
+ if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name )
+ if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, target_domain_architecure)
+ passing_sequences.push(domains_in_query_seq)
end
- if pass == true
- queries_ary.push(domains_in_query_ary)
- end
- domains_in_query_ary = Array.new
- ###--
+ domains_in_query_seq = Array.new
+ total_sequences += 1
+ end
+ prev_query_seq_name = hmmscan_result.query
+ domains_in_query_seq.push(hmmscan_result)
+ end # each result
+
+ if prev_query_seq_name != nil
+ total_sequences += 1
+ if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, target_domain_architecure)
+ passing_sequences.push(domains_in_query_seq)
end
- prev_query = hmmscan_result.query
- domains_in_query_ary.push(hmmscan_result)
end
- if prev_query != nil
- queries_ary.push(domains_in_query_ary)
+
+ out_domain_msas.keys.sort.each do |domain_name|
+ puts "writing #{domain_name}"
+ write_msa( out_domain_msas[domain_name], domain_name + ".fasta" )
end
- puts queries_ary[0][0].model
- puts queries_ary[0][0].i_e_value
- puts queries_ary[0][1].model
- puts queries_ary[0][2].model
- puts queries_ary[1][0].model
- puts queries_ary[1][0].i_e_value
- puts queries_ary[1][1].model
- puts queries_ary[2][2].model
- queries_ary.each do | query_ary |
- query_ary.each do | domain |
- # puts domain.model
+
+ puts "writing target_domain_architecure"
+ write_msa( out_domain_architecture_msa, "target_domain_architecure" + ".fasta" )
+
+ passing_sequences.each do | domains |
+
+ seq = domains[0].query
+ # puts seq + "::"
+ ################
+ if passed_seqs.find_by_name_start( seq, true ).length < 1
+ add_sequence( seq, in_msa, passed_multi_seqs )
+ else
+ error_msg = "this should not have happened"
+ raise StandardError, error_msg
+ end
+ ################
+
+ domains.each do | domain |
+ #puts domain.query + ": " + domain.model
end
#puts
end
- ####
+ puts @non_passsing_domain_architectures
- prev_query = nil
- saw_target = false
+ puts @passed
+ puts "total sequences : " + total_sequences.to_s
+ puts "passing sequences: " + passing_sequences.size.to_s
- results.each do | r |
+ # write_msa( out_msa, outfile + ".fasta" )
+ # write_msa( passed_multi_seqs, passed_seqs_outfile )
+ # write_msa( failed_seqs, failed_seqs_outfile )
- if ( prev_query != nil ) && ( r.query != prev_query )
- protein_counter += 1
- passing_domains_per_protein = 0
- if !saw_target
- log << domain_not_present_counter.to_s + ": " + prev_query.to_s + " lacks target domain" + ld
- domain_not_present_counter += 1
- end
- saw_target = false
- end
+ log << ld
+ log << "passing target domains : " + domain_pass_counter.to_s + ld
+ log << "failing target domains : " + domain_fail_counter.to_s + ld
+ log << "proteins in sequence (fasta) file : " + in_msa.get_number_of_seqs.to_s + ld
+ log << "proteins in hmmscan outputfile : " + protein_counter.to_s + ld
+ log << "proteins with passing target domain(s) : " + passed_seqs.get_number_of_seqs.to_s + ld
+ log << "proteins with no passing target domain : " + proteins_with_failing_domains.to_s + ld
+ log << "proteins with no target domain : " + domain_not_present_counter.to_s + ld
- prev_query = r.query
+ log << ld
- if domain_id != r.model
- next
- end
+ return domain_pass_counter
- saw_target = true
+ end # parse
- # target = r.model
- sequence = r.query
- # sequence_len = r.qlen
- number = r.number
- out_of = r.out_of
- env_from = r.env_from
- env_to = r.env_to
- i_e_value = r.i_e_value
- prev_query = r.query
+ private
- length = 1 + env_to - env_from
+ # domains_in_query_seq: Array of HmmscanResult
+ # target_domain_names: Set of String
+ # target_domains: Hash String->TargetDomain
+ # target_domain_architecture: String
+ def checkit2(domains_in_query_seq,
+ target_domain_names,
+ target_domains,
+ in_msa,
+ out_domain_msas,
+ out_domain_architecture_msa,
+ target_domain_architecture = nil)
- overall_target_length_sum += length
- if length > overall_target_length_max
- overall_target_length_max = length
- end
- if length < overall_target_length_min
- overall_target_length_min = length
- end
+ domain_names_in_query_seq = Set.new
+ domains_in_query_seq.each do |domain|
+ domain_names_in_query_seq.add(domain.model)
+ end
+ if (target_domain_names.subset?(domain_names_in_query_seq))
- if i_e_value > overall_target_ie_max
- overall_target_ie_max = i_e_value
- end
- if i_e_value < overall_target_ie_min
- overall_target_ie_min = i_e_value
- end
+ passed_domains = Array.new
+ passed_domains_counts = Hash.new
- if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
- ( ( length_threshold <= 0 ) || ( length >= length_threshold.to_f ) ) )
- hmmscan_datas << HmmscanData.new( sequence, number, out_of, env_from, env_to, i_e_value )
- passing_target_length_sum += length
- passing_domains_per_protein += 1
- if length > passing_target_length_max
- passing_target_length_max = length
- end
- if length < passing_target_length_min
- passing_target_length_min = length
- end
- if i_e_value > passing_target_ie_max
- passing_target_ie_max = i_e_value
- end
- if i_e_value < passing_target_ie_min
- passing_target_ie_min = i_e_value
- end
- if ( passing_domains_per_protein > max_domain_copy_number_per_protein )
- max_domain_copy_number_sequence = sequence
- max_domain_copy_number_per_protein = passing_domains_per_protein
- end
- else # no pass
- log << domain_fail_counter.to_s + ": " + sequence.to_s + " fails threshold(s)"
- if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) )
- log << " iE=" + i_e_value.to_s
- end
- if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f )
- le = env_to - env_from + 1
- log << " l=" + le.to_s
- end
- log << ld
- domain_fail_counter += 1
- end
+ domains_in_query_seq.each do |domain|
+ if target_domains.has_key?(domain.model)
+ target_domain = target_domains[domain.model]
+ if (target_domain.i_e_value != nil) && (target_domain.i_e_value >= 0)
+ if domain.i_e_value > target_domain.i_e_value
+ return false
+ end
+ end
+ if (target_domain.abs_len != nil) && (target_domain.abs_len > 0)
+ length = 1 + domain.env_to - domain.env_from
+ if length < target_domain.abs_len
+ return false
+ end
+ end
+ if (target_domain.rel_len != nil) && (target_domain.rel_len > 0)
+ length = 1 + domain.env_to - domain.env_from
+ if length < (target_domain.rel_len * domain.tlen)
+ return false
+ end
+ end
- if number > out_of
- error_msg = "number > out_of (this should not have happened)"
- raise StandardError, error_msg
- end
+ # For domain architecture comparison
+ if target_domain_architecture != nil
+ passed_domains.push(domain)
+ end
- if number == out_of
- if !hmmscan_datas.empty?
- process_hmmscan_datas( hmmscan_datas,
- in_msa,
- add_position,
- add_domain_number,
- add_species,
- out_msa )
- domain_pass_counter += hmmscan_datas.length
- if passed_seqs.find_by_name_start( sequence, true ).length < 1
- add_sequence( sequence, in_msa, passed_seqs )
+ if (passed_domains_counts.key?(domain.model))
+ passed_domains_counts[domain.model] = passed_domains_counts[domain.model] + 1
else
- error_msg = "this should not have happened"
- raise StandardError, error_msg
+ passed_domains_counts[domain.model] = 1
end
- else # no pass
- if failed_seqs.find_by_name_start( sequence, true ).length < 1
- add_sequence( sequence, in_msa, failed_seqs )
- proteins_with_failing_domains += 1
+
+ if (@passed.key?(domain.model))
+ @passed[domain.model] = @passed[domain.model] + 1
else
- error_msg = "this should not have happened"
- raise StandardError, error_msg
+ @passed[domain.model] = 1
end
end
- hmmscan_datas.clear
end
- end # results.each do | r |
-
- if (prev_query != nil) && (!saw_target)
- log << domain_not_present_counter.to_s + ": " + prev_query.to_s + " lacks target domain" + ld
- domain_not_present_counter += 1
+ else
+ return false
end
-
- if domain_pass_counter < 1
- error_msg = "no domain sequences were extracted"
- raise IOError, error_msg
+ passed_domains.sort! { |a,b| a.ali_from <=> b.ali_from }
+ # Compare architectures
+ if target_domain_architecture != nil
+ if !compareArchitectures(target_domain_architecture, passed_domains)
+ return false
+ end
end
- if ( domain_not_present_counter + passed_seqs.get_number_of_seqs + proteins_with_failing_domains ) != protein_counter
- error_msg = "not present + passing + not passing != proteins in sequence (fasta) file (this should not have happened)"
- raise StandardError, error_msg
- end
+ domain_counter = Hash.new
- puts
- log << ld
+ min_env_from = 10000000
+ max_env_from = 0
+ min_env_to = 10000000
+ max_env_to = 0
- log << ld
- avg_pass = ( passing_target_length_sum / domain_pass_counter )
- puts( "Passing target domain lengths: average: " + avg_pass.to_s )
- log << "Passing target domain lengths: average: " + avg_pass.to_s
- log << ld
- puts( "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + " - " + passing_target_length_max.to_s)
- log << "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + " - " + passing_target_length_max.to_s
- log << ld
- puts( "Passing target domain iE: min-max: " + passing_target_ie_min.to_s + " - " + passing_target_ie_max.to_s)
- log << "Passing target domain iE: min-max: " + passing_target_ie_min.to_s + " - " + passing_target_ie_max.to_s
- log << ld
- puts( "Passing target domains: sum: " + domain_pass_counter.to_s )
- log << "Passing target domains: sum: " + domain_pass_counter.to_s
- log << ld
- log << ld
- puts
- sum = domain_pass_counter + domain_fail_counter
- avg_all = overall_target_length_sum / sum
- puts( "All target domain lengths: average: " + avg_all.to_s )
- log << "All target domain lengths: average: " + avg_all.to_s
- log << ld
- puts( "All target domain lengths: min-max: " + overall_target_length_min.to_s + " - " + overall_target_length_max.to_s)
- log << "All target domain lengths: min-max: " + overall_target_length_min.to_s + " - " + overall_target_length_max.to_s
- log << ld
- puts( "All target target domain iE: min-max: " + overall_target_ie_min.to_s + " - " + overall_target_ie_max.to_s)
- log << "All target target domain iE: min-max: " + overall_target_ie_min.to_s + " - " + overall_target_ie_max.to_s
- log << ld
- puts( "All target domains: sum: " + sum.to_s )
- log << "All target domains: sum: " + sum.to_s
+ query_seq = nil
- puts
- puts( "Proteins with passing target domain(s): " + passed_seqs.get_number_of_seqs.to_s )
- puts( "Proteins with no passing target domain: " + proteins_with_failing_domains.to_s )
- puts( "Proteins with no target domain : " + domain_not_present_counter.to_s )
+ passed_domains.each do |domain|
+ domain_name = domain.model
- log << ld
- log << ld
- puts
- puts( "Max target domain copy number per protein: " + max_domain_copy_number_per_protein.to_s )
- log << "Max target domain copy number per protein: " + max_domain_copy_number_per_protein.to_s
- log << ld
+ unless out_domain_msas.has_key? domain_name
+ out_domain_msas[ domain_name ] = Msa.new
+ end
- if ( max_domain_copy_number_per_protein > 1 )
- puts( "First target protein with this copy number: " + max_domain_copy_number_sequence )
- log << "First target protein with this copy number: " + max_domain_copy_number_sequence
- log << ld
- end
+ if domain_counter.key?(domain_name)
+ domain_counter[domain_name] = domain_counter[domain_name] + 1
+ else
+ domain_counter[domain_name] = 1
+ end
- write_msa( out_msa, outfile + ".fasta" )
- write_msa( passed_seqs, passed_seqs_outfile )
- write_msa( failed_seqs, failed_seqs_outfile )
+ if query_seq == nil
+ query_seq = domain.query
+ query_seq.freeze
+ elsif query_seq != domain.query
+ error_msg = "seq names do not match: this should not have happened"
+ raise StandardError, error_msg
+ end
- log << ld
- log << "passing target domains : " + domain_pass_counter.to_s + ld
- log << "failing target domains : " + domain_fail_counter.to_s + ld
- log << "proteins in sequence (fasta) file : " + in_msa.get_number_of_seqs.to_s + ld
- log << "proteins in hmmscan outputfile : " + protein_counter.to_s + ld
- log << "proteins with passing target domain(s) : " + passed_seqs.get_number_of_seqs.to_s + ld
- log << "proteins with no passing target domain : " + proteins_with_failing_domains.to_s + ld
- log << "proteins with no target domain : " + domain_not_present_counter.to_s + ld
+ extract_domain( query_seq,
+ domain_counter[domain_name],
+ passed_domains_counts[domain_name],
+ domain.env_from,
+ domain.env_to,
+ in_msa,
+ out_domain_msas[ domain_name ] )
- log << ld
+ if domain.env_from < min_env_from
+ min_env_from = domain.env_from
+ end
+ if domain.env_from > max_env_from
+ max_env_from = domain.env_from
+ end
+ if domain.env_to < min_env_to
+ min_env_to = domain.env_to
+ end
+ if domain.env_to > max_env_to
+ max_env_to = domain.env_to
+ end
+ end
- return domain_pass_counter
+ puts "env from: #{min_env_from} - #{max_env_from}"
+ puts "env to : #{min_env_to} - #{max_env_to}"
- end # parse
+ extract_domain( query_seq,
+ 1,
+ 1,
+ min_env_from,
+ max_env_to,
+ in_msa,
+ out_domain_architecture_msa )
- private
+ puts passed_domains_counts
+ true
+ end
+
+ # passed_domains needs to be sorted!
+ def compareArchitectures(target_domain_architecture, passed_domains, strict = false)
+ domain_architecture = ""
+ passed_domains.each do |domain|
+ if domain_architecture.length > 0
+ domain_architecture += ">"
+ end
+ domain_architecture += domain.model
+ end
+ pass = false
+ if strict
+ pass = (target_domain_architecture == domain_architecture)
+ else
+ pass = (domain_architecture.index(target_domain_architecture) != nil)
+ end
+ if !pass
+ if (@non_passsing_domain_architectures.key?(domain_architecture))
+ @non_passsing_domain_architectures[domain_architecture] = @non_passsing_domain_architectures[domain_architecture] + 1
+ else
+ @non_passsing_domain_architectures[domain_architecture] = 1
+ end
+ end
+ return pass
+ end
def write_msa( msa, filename )
io = MsaIO.new()
w = FastaWriter.new()
w.set_line_width( 60 )
w.clean( true )
+ File.delete(filename) if File.exist?(filename) #TODO remove me
begin
io.write_to_file( msa, filename, w )
rescue Exception
add_to_msa.add_sequence( seq )
end
- def process_hmmscan_datas( hmmscan_datas,
- in_msa,
- add_position,
- add_domain_number,
- add_species,
- out_msa )
-
- actual_out_of = hmmscan_datas.size
-
- seq_name = ""
- prev_seq_name = nil
-
- hmmscan_datas.each_with_index do |hmmscan_data, index|
- if hmmscan_data.number < ( index + 1 )
- error_msg = "hmmscan_data.number < ( index + 1 ) (this should not have happened)"
- raise StandardError, error_msg
- end
-
- seq_name = hmmscan_data.seq_name
-
- extract_domain( seq_name,
- index + 1,
- actual_out_of,
- hmmscan_data.env_from,
- hmmscan_data.env_to,
- in_msa,
- out_msa,
- add_position,
- add_domain_number,
- add_species )
-
- if prev_seq_name && prev_seq_name != seq_name
- error_msg = "this should not have happened"
- raise StandardError, error_msg
- end
- prev_seq_name = seq_name
- end # each
- end # def process_hmmscan_data
-
- def extract_domain( sequence,
+ def extract_domain( seq_name,
number,
out_of,
seq_from,
seq_to,
in_msa,
- out_msa,
- add_position,
- add_domain_number,
- add_species )
- if number.is_a?( Fixnum ) && ( number < 1 || out_of < 1 || number > out_of )
+ out_msa)
+ if number < 1 || out_of < 1 || number > out_of
error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s
raise StandardError, error_msg
end
error_msg = "impossible: seq-from=" + seq_from.to_s + ", seq-to=" + seq_to.to_s
raise StandardError, error_msg
end
- seqs = in_msa.find_by_name_start( sequence, true )
+ seqs = in_msa.find_by_name_start(seq_name, true)
if seqs.length < 1
- error_msg = "sequence \"" + sequence + "\" not found in sequence file"
+ error_msg = "sequence \"" + seq_name + "\" not found in sequence file"
raise IOError, error_msg
end
if seqs.length > 1
- error_msg = "sequence \"" + sequence + "\" not unique in sequence file"
+ error_msg = "sequence \"" + seq_name + "\" not unique in sequence file"
raise IOError, error_msg
end
# hmmscan is 1 based, whereas sequences are 0 bases in this package.
seq.set_name( orig_name.split[ 0 ] )
- if add_position
- seq.set_name( seq.get_name + "_" + seq_from.to_s + "-" + seq_to.to_s )
- end
-
- if out_of != 1 && add_domain_number
+ if out_of != 1
seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
end
- if add_species
- a = orig_name.rindex "["
- b = orig_name.rindex "]"
- unless a && b
- error_msg = "species not found in " + orig_name
- raise StandardError, error_msg
- end
- species = orig_name[ a .. b ]
- seq.set_name( seq.get_name + " " + species )
- end
out_msa.add_sequence( seq )
end
end # class HmmscanDomainExtractor
- class HmmscanData
- def initialize( seq_name, number, out_of, env_from, env_to, i_e_value )
- @seq_name = seq_name
- @number = number
- @out_of = out_of
- @env_from = env_from
- @env_to = env_to
- @i_e_value = i_e_value
- end
- attr_reader :seq_name, :number, :out_of, :env_from, :env_to, :i_e_value
- end
-
- class PassCondition
- def initialize( hmm, i_e_value, abs_len, rel_len )
- @hmm = hmm
+ class TargetDomain
+ def initialize( name, i_e_value, abs_len, rel_len, position )
+ @name = name
@i_e_value = i_e_value
@abs_len = abs_len
@percent_len = rel_len
+ @position = position
end
- attr_reader :hmm, :i_e_value, :abs_len, :rel_len
+ attr_reader :name, :i_e_value, :abs_len, :rel_len, :position
end
end # module Evoruby