2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanMultiDomainExtractor class
4 # Copyright:: Copyright (C) 2017 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # Last modified: 2017/02/20
9 require 'lib/evo/util/constants'
10 require 'lib/evo/msa/msa_factory'
11 require 'lib/evo/io/msa_io'
12 require 'lib/evo/io/writer/fasta_writer'
13 require 'lib/evo/io/parser/fasta_parser'
14 require 'lib/evo/io/parser/hmmscan_parser'
17 class HmmscanMultiDomainExtractor
20 DOMAIN_DELIMITER = ' -- '
22 PASSING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_passing_full_length_seqs.fasta"
23 FAILING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_failing_full_length_seqs.fasta"
24 TARGET_DA_SUFFIX = "_#{OUTPUT_ID}_target_da.fasta"
25 CONCAT_TARGET_DOM_SUFFIX = "_#{OUTPUT_ID}_concat_target_dom.fasta"
26 TARGET_DOM_OUTPUT_MIDPART = "_#{OUTPUT_ID}_target_dom_"
28 @passing_domains_data = nil
29 @failing_domains_data = nil
30 @failing_domain_architectures = nil
31 @passsing_domain_architectures = nil
32 @failing_proteins_bc_not_all_target_doms_present = nil
33 @failing_proteins_bc_missing_cutoffs = nil
36 # raises ArgumentError, IOError, StandardError
43 passing_fl_seqs_outfile = outfile_base + PASSING_FL_SEQS_SUFFIX
44 failing_fl_seqs_outfile = outfile_base + FAILING_FL_SEQS_SUFFIX
45 target_da_outfile = outfile_base + TARGET_DA_SUFFIX
46 concat_target_dom_outfile = outfile_base + CONCAT_TARGET_DOM_SUFFIX
48 Util.check_file_for_readability( hmmscan_output )
49 Util.check_file_for_readability( fasta_sequence_file )
50 Util.check_file_for_writability( passing_fl_seqs_outfile )
51 Util.check_file_for_writability( failing_fl_seqs_outfile )
52 Util.check_file_for_writability( target_da_outfile )
53 Util.check_file_for_writability( concat_target_dom_outfile )
56 factory = MsaFactory.new()
57 in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
59 if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
60 error_msg = "could not find fasta sequences in " + fasta_sequence_file
61 raise IOError, error_msg
64 failed_seqs_msa = Msa.new
65 passed_seqs_msa = Msa.new
67 ld = Constants::LINE_DELIMITER
69 hmmscan_parser = HmmscanParser.new( hmmscan_output )
70 results = hmmscan_parser.parse
73 # Import: if multiple copies of same domain, thresholds need to be same!
74 target_domain_ary = Array.new
76 #target_domain_ary.push(TargetDomain.new('DNA_pol_B_exo1', 1e-6, -1, 0.6, 0 ))
77 #target_domain_ary.push(TargetDomain.new('DNA_pol_B', 1e-6, -1, 0.6, 1 ))
79 #target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.9, 0 ))
80 #target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.9, 1 ))
81 # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 ))
82 # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 ))
84 target_domain_ary.push(TargetDomain.new('BH4', 1, -1, 0.2, 0 ))
85 target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 1 ))
86 target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 2 ))
87 # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 ))
89 # target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 ))
90 # target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 1 ))
92 #target_domain_ary.push(TargetDomain.new('Chordopox_A33R', 1000, -1, 0.1 ))
94 target_domains = Hash.new
96 target_domain_architecure = ""
98 target_domain_ary.each do |target_domain|
99 target_domains[target_domain.name] = target_domain
100 if target_domain_architecure.length > 0
101 target_domain_architecure += DOMAIN_DELIMITER
103 target_domain_architecure += target_domain.name
106 target_domain_architecure.freeze
108 puts 'Target domain architecture: ' + target_domain_architecure
110 target_domain_names = Set.new
112 target_domains.each_key {|key| target_domain_names.add( key ) }
114 prev_query_seq_name = nil
115 domains_in_query_seq = Array.new
116 passing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq.
117 failing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq.
120 @failing_domain_architectures = Hash.new
121 @passsing_domain_architectures = Hash.new
122 @passing_domains_data = Hash.new
123 @failing_domains_data = Hash.new
124 @failing_proteins_bc_not_all_target_doms_present = 0
125 @failing_proteins_bc_missing_cutoffs = 0
126 out_domain_msas = Hash.new
127 out_domain_architecture_msa = Msa.new
128 out_concatenated_domains_msa = Msa.new
129 results.each do |hmmscan_result|
130 if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name )
131 if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
132 passing_sequences.push(domains_in_query_seq)
134 failing_sequences.push(domains_in_query_seq)
136 domains_in_query_seq = Array.new
139 prev_query_seq_name = hmmscan_result.query
140 domains_in_query_seq.push(hmmscan_result)
143 if prev_query_seq_name != nil
145 if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
146 passing_sequences.push(domains_in_query_seq)
148 failing_sequences.push(domains_in_query_seq)
154 puts 'Failing domain architectures containing one or more target domain(s):'
155 @failing_domain_architectures = @failing_domain_architectures .sort{|a, b|a<=>b}.to_h
157 @failing_domain_architectures .each do |da, count|
158 failing_da_sum += count
159 puts count.to_s.rjust(4) + ': ' + da
164 puts 'Passing domain architectures containing target domain(s):'
165 @passsing_domain_architectures = @passsing_domain_architectures.sort{|a, b|a<=>b}.to_h
167 @passsing_domain_architectures.each do |da, count|
168 passing_da_sum += count
169 puts count.to_s.rjust(4) + ': ' + da
172 puts 'Passing domain(s):'
173 @passing_domains_data = @passing_domains_data.sort{|a, b|a<=>b}.to_h
174 @passing_domains_data.each do |n, d|
178 puts 'Failing domain(s):'
179 @failing_domains_data = @failing_domains_data.sort{|a, b|a<=>b}.to_h
180 @failing_domains_data.each do |n, d|
184 unless total_sequences == (passing_sequences.size + failing_sequences.size)
185 error_msg = "this should not have happened: total seqs not equal to passing plus failing seqs"
186 raise StandardError, error_msg
189 unless failing_sequences.size == (@failing_proteins_bc_not_all_target_doms_present + @failing_proteins_bc_missing_cutoffs)
190 error_msg = "this should not have happened: failing seqs sums not consistent"
191 raise StandardError, error_msg
194 unless @failing_proteins_bc_missing_cutoffs == failing_da_sum
195 error_msg = "this should not have happened: failing seqs not equal to failing da sum"
196 raise StandardError, error_msg
199 unless passing_sequences.size == passing_da_sum
200 error_msg = "this should not have happened: passing seqs not equal to passing da sum"
201 raise StandardError, error_msg
206 puts "Protein sequences in sequence (fasta) file: " + in_msa.get_number_of_seqs.to_s.rjust(5)
207 puts "Protein sequences in hmmscan output file : " + total_sequences.to_s.rjust(5)
208 puts " Passing protein sequences : " + passing_sequences.size.to_s.rjust(5)
209 puts " Failing protein sequences : " + failing_sequences.size.to_s.rjust(5)
210 puts " Not all target domain present : " + @failing_proteins_bc_not_all_target_doms_present.to_s.rjust(5)
211 puts " Target domain(s) failing cutoffs : " + @failing_proteins_bc_missing_cutoffs.to_s.rjust(5)
215 out_domain_msas.keys.sort.each do |domain_name|
216 file_name = outfile_base + TARGET_DOM_OUTPUT_MIDPART + domain_name + '.fasta'
218 write_msa( out_domain_msas[domain_name], file_name )
219 puts "wrote #{domain_name}"
222 write_msa( out_domain_architecture_msa, target_da_outfile )
223 puts "wrote target_domain_architecure"
225 write_msa( out_concatenated_domains_msa, concat_target_dom_outfile )
226 puts "wrote concatenated_domains_msa"
228 passing_sequences.each do | domains |
229 query_name = domains[0].query
230 if passed_seqs_msa.find_by_name_start( query_name, true ).length < 1
231 add_sequence( query_name, in_msa, passed_seqs_msa )
233 error_msg = "this should not have happened"
234 raise StandardError, error_msg
238 failing_sequences.each do | domains |
239 query_name = domains[0].query
240 if failed_seqs_msa.find_by_name_start( query_name, true ).length < 1
241 add_sequence( query_name, in_msa, failed_seqs_msa )
243 error_msg = "this should not have happened"
244 raise StandardError, error_msg
248 write_msa( passed_seqs_msa, passing_fl_seqs_outfile )
250 write_msa( failed_seqs_msa, failing_fl_seqs_outfile )
255 log_str << "proteins in sequence (fasta) file : " + in_msa.get_number_of_seqs.to_s + ld
263 # domains_in_query_seq: Array of HmmscanResult
264 # target_domain_names: Set of String
265 # target_domains: Hash String->TargetDomain
266 # target_domain_architecture: String
267 def checkit2(domains_in_query_seq,
272 out_domain_architecture_msa,
273 out_contactenated_domains_msa,
274 target_domain_architecture)
276 domain_names_in_query_seq = Set.new
277 domains_in_query_seq.each do |domain|
278 domain_names_in_query_seq.add(domain.model)
280 if target_domain_names.subset?(domain_names_in_query_seq)
282 passed_domains = Array.new
283 passed_domains_counts = Hash.new
285 domains_in_query_seq.each do |domain|
286 if target_domains.has_key?(domain.model)
287 target_domain = target_domains[domain.model]
289 if (target_domain.i_e_value != nil) && (target_domain.i_e_value >= 0)
290 if domain.i_e_value > target_domain.i_e_value
291 addToFailingDomainData(domain)
295 if (target_domain.abs_len != nil) && (target_domain.abs_len > 0)
296 length = 1 + domain.env_to - domain.env_from
297 if length < target_domain.abs_len
298 addToFailingDomainData(domain)
302 if (target_domain.rel_len != nil) && (target_domain.rel_len > 0)
303 length = 1 + domain.env_to - domain.env_from
304 if length < (target_domain.rel_len * domain.tlen)
305 addToFailingDomainData(domain)
310 passed_domains.push(domain)
312 if (passed_domains_counts.key?(domain.model))
313 passed_domains_counts[domain.model] = passed_domains_counts[domain.model] + 1
315 passed_domains_counts[domain.model] = 1
318 addToPassingDomainData(domain)
320 end # if target_domains.has_key?(domain.model)
321 end # domains_in_query_seq.each do |domain|
323 @failing_proteins_bc_not_all_target_doms_present += 1
325 end # target_domain_names.subset?(domain_names_in_query_seq)
327 passed_domains.sort! { |a,b| a.ali_from <=> b.ali_from }
328 # Compare architectures
329 if !compareArchitectures(target_domain_architecture, passed_domains)
330 @failing_proteins_bc_missing_cutoffs += 1
334 domain_counter = Hash.new
336 min_env_from = 10000000
341 concatenated_domains = ''
343 passed_domains.each do |domain|
344 domain_name = domain.model
346 unless out_domain_msas.has_key? domain_name
347 out_domain_msas[ domain_name ] = Msa.new
350 if domain_counter.key?(domain_name)
351 domain_counter[domain_name] = domain_counter[domain_name] + 1
353 domain_counter[domain_name] = 1
357 query_seq = domain.query
359 elsif query_seq != domain.query
360 error_msg = "seq names do not match: this should not have happened"
361 raise StandardError, error_msg
364 extracted_domain_seq = extract_domain( query_seq,
365 domain_counter[domain_name],
366 passed_domains_counts[domain_name],
370 out_domain_msas[ domain_name ] )
372 concatenated_domains += extracted_domain_seq
374 if domain.env_from < min_env_from
375 min_env_from = domain.env_from
377 if domain.env_to > max_env_to
378 max_env_to = domain.env_to
382 extract_domain( query_seq,
388 out_domain_architecture_msa )
390 out_contactenated_domains_msa.add( query_seq, concatenated_domains)
395 def addToPassingDomainData(domain)
396 unless ( @passing_domains_data.key?(domain.model))
397 @passing_domains_data[domain.model] = DomainData.new(domain.model)
399 @passing_domains_data[domain.model].add( domain.model, (1 + domain.env_to - domain.env_from), domain.i_e_value)
402 def addToFailingDomainData(domain)
403 unless ( @failing_domains_data.key?(domain.model))
404 @failing_domains_data[domain.model] = DomainData.new(domain.model)
406 @failing_domains_data[domain.model].add( domain.model, (1 + domain.env_to - domain.env_from), domain.i_e_value)
409 # passed_domains needs to be sorted!
410 def compareArchitectures(target_domain_architecture, passed_domains, strict = false)
411 domain_architecture = ''
412 passed_domains.each do |domain|
413 if domain_architecture.length > 0
414 domain_architecture += DOMAIN_DELIMITER
416 domain_architecture += domain.model
420 pass = (target_domain_architecture == domain_architecture)
422 pass = (domain_architecture.index(target_domain_architecture) != nil)
425 if (@passsing_domain_architectures.key?(domain_architecture))
426 @passsing_domain_architectures[domain_architecture] = @passsing_domain_architectures[domain_architecture] + 1
428 @passsing_domain_architectures[domain_architecture] = 1
431 if ( @failing_domain_architectures .key?(domain_architecture))
432 @failing_domain_architectures [domain_architecture] = @failing_domain_architectures [domain_architecture] + 1
434 @failing_domain_architectures [domain_architecture] = 1
440 def write_msa( msa, filename )
442 w = FastaWriter.new()
443 w.set_line_width( 60 )
445 File.delete(filename) if File.exist?(filename) #TODO remove me
447 io.write_to_file( msa, filename, w )
449 error_msg = "could not write to \"" + filename + "\""
450 raise IOError, error_msg
454 def add_sequence( sequence_name, in_msa, add_to_msa )
455 seqs = in_msa.find_by_name_start( sequence_name, true )
456 if ( seqs.length < 1 )
457 error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
458 raise StandardError, error_msg
460 if ( seqs.length > 1 )
461 error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
462 raise StandardError, error_msg
464 seq = in_msa.get_sequence( seqs[ 0 ] )
465 add_to_msa.add_sequence( seq )
468 def extract_domain( seq_name,
475 if number < 1 || out_of < 1 || number > out_of
476 error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s
477 raise StandardError, error_msg
479 if seq_from < 1 || seq_to < 1 || seq_from >= seq_to
480 error_msg = "impossible: seq-from=" + seq_from.to_s + ", seq-to=" + seq_to.to_s
481 raise StandardError, error_msg
483 seqs = in_msa.find_by_name_start(seq_name, true)
485 error_msg = "sequence \"" + seq_name + "\" not found in sequence file"
486 raise IOError, error_msg
489 error_msg = "sequence \"" + seq_name + "\" not unique in sequence file"
490 raise IOError, error_msg
492 # hmmscan is 1 based, whereas sequences are 0 bases in this package.
493 seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
495 orig_name = seq.get_name
497 seq.set_name( orig_name.split[ 0 ] )
500 seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
503 out_msa.add_sequence( seq )
505 seq.get_sequence_as_string
508 end # class HmmscanMultiDomainExtractor
511 def initialize( name )
512 if (name == nil) || name.size < 1
513 error_msg = "domain name nil or empty"
514 raise StandardError, error_msg
518 @i_e_value_min = 10000000.0
519 @i_e_value_max = -1.0
526 def add( name, length, i_e_value)
528 error_msg = "domain names do not match"
529 raise StandardError, error_msg
533 error_msg = "length cannot me negative"
534 raise StandardError, error_msg
537 error_msg = "iE-value cannot me negative"
538 raise StandardError, error_msg
541 @i_e_value_sum += i_e_value
543 if i_e_value > @i_e_value_max
544 @i_e_value_max = i_e_value
546 if i_e_value < @i_e_value_min
547 @i_e_value_min = i_e_value
568 @i_e_value_sum / @count
573 s << @name.rjust(24) + ': '
574 s << @count.to_s.rjust(4) + ' '
575 s << avg_length.round(1).to_s.rjust(6) + ' '
576 s << @len_min.to_s.rjust(4) + ' -'
577 s << @len_max.to_s.rjust(4) + ' '
578 s << ("%.2E" % avg_i_e_value).rjust(9) + ' '
579 s << ("%.2E" % @i_e_value_min).rjust(9) + ' -'
580 s << ("%.2E" % @i_e_value_max).rjust(9) + ' '
584 attr_reader :name, :count, :i_e_value_min, :i_e_value_max, :length_min, :length_max
589 def initialize( name, i_e_value, abs_len, rel_len, position )
590 if (name == nil) || name.size < 1
591 error_msg = "target domain name nil or empty"
592 raise StandardError, error_msg
595 error_msg = "target domain relative length is greater than 1"
596 raise StandardError, error_msg
599 @i_e_value = i_e_value
604 attr_reader :name, :i_e_value, :abs_len, :rel_len, :position