2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanMultiDomainExtractor class
4 # Copyright:: Copyright (C) 2017 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # Last modified: 2017/03/08
10 # Import: if multiple copies of same domain, thresholds need to be same!
13 require 'lib/evo/util/constants'
14 require 'lib/evo/msa/msa_factory'
15 require 'lib/evo/io/msa_io'
16 require 'lib/evo/io/writer/fasta_writer'
17 require 'lib/evo/io/parser/fasta_parser'
18 require 'lib/evo/io/parser/hmmscan_parser'
21 class HmmscanMultiDomainExtractor
25 DOMAIN_DELIMITER = ' -- '
27 IE_CUTOFF_FOR_DA_OVERVIEW = 1E-6
28 REL_LEN_CUTOFF_FOR_DA_OVERVIEW = 0.5
30 PASSING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_passing_full_length_seqs.fasta"
31 FAILING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_failing_full_length_seqs.fasta"
32 TARGET_DA_SUFFIX = "_#{OUTPUT_ID}_target_da.fasta"
33 CONCAT_TARGET_DOM_SUFFIX = "_#{OUTPUT_ID}_concat_target_doms.fasta"
34 TARGET_DOM_OUTPUT_MIDPART = "_#{OUTPUT_ID}_target_dom_"
35 LOG_FILE_SUFFIX = "_#{OUTPUT_ID}.log"
37 @passing_domains_data = nil
38 @failing_domains_data = nil
39 @failing_domain_architectures = nil
40 @passsing_domain_architectures = nil
41 @failing_proteins_bc_not_all_target_doms_present = nil
42 @failing_proteins_bc_missing_cutoffs = nil
43 @encountered_domain_architectures = nil
47 # raises ArgumentError, IOError, StandardError
54 passing_fl_seqs_outfile = outfile_base + PASSING_FL_SEQS_SUFFIX
55 failing_fl_seqs_outfile = outfile_base + FAILING_FL_SEQS_SUFFIX
56 target_da_outfile = outfile_base + TARGET_DA_SUFFIX
57 concat_target_dom_outfile = outfile_base + CONCAT_TARGET_DOM_SUFFIX
58 logfile = outfile_base + LOG_FILE_SUFFIX
60 Util.check_file_for_readability( hmmscan_output )
61 Util.check_file_for_readability( fasta_sequence_file )
62 Util.check_file_for_writability( passing_fl_seqs_outfile )
63 Util.check_file_for_writability( failing_fl_seqs_outfile )
64 Util.check_file_for_writability( target_da_outfile )
65 Util.check_file_for_writability( concat_target_dom_outfile )
66 Util.check_file_for_writability( logfile )
71 factory = MsaFactory.new()
72 in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
74 if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
75 error_msg = "could not find fasta sequences in " + fasta_sequence_file
76 raise IOError, error_msg
79 failed_seqs_msa = Msa.new
80 passed_seqs_msa = Msa.new
82 hmmscan_parser = HmmscanParser.new( hmmscan_output )
83 results = hmmscan_parser.parse
85 target_domain_ary = parse_da target_da
87 target_domains = Hash.new
89 target_domain_architecure = ''
91 target_domain_ary.each do |target_domain|
92 target_domains[target_domain.name] = target_domain
93 if target_domain_architecure.length > 0
94 target_domain_architecure << DOMAIN_DELIMITER
96 target_domain_architecure << target_domain.name
99 target_domain_architecure.freeze
101 log 'Hmmscan outputfile : ' + hmmscan_output
102 log 'Full length fasta sequence file: ' + fasta_sequence_file
103 log 'Target domain architecture : ' + target_domain_architecure
104 target_domain_ary.each do |x|
108 target_domain_names = Set.new
110 target_domains.each_key {|key| target_domain_names.add( key ) }
112 prev_query_seq_name = nil
113 domains_in_query_seq = Array.new
114 passing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq.
115 failing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq.
118 @failing_domain_architectures = Hash.new
119 @passsing_domain_architectures = Hash.new
120 @passing_domains_data = Hash.new
121 @failing_domains_data = Hash.new
122 @encountered_domain_architectures= Hash.new
123 @failing_proteins_bc_not_all_target_doms_present = 0
124 @failing_proteins_bc_missing_cutoffs = 0
125 out_domain_msas = Hash.new
126 out_domain_architecture_msa = Msa.new
127 out_concatenated_domains_msa = Msa.new
128 results.each do |hmmscan_result|
129 if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name )
130 if compare(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
131 passing_sequences.push(domains_in_query_seq)
133 failing_sequences.push(domains_in_query_seq)
135 domains_in_query_seq = Array.new
138 prev_query_seq_name = hmmscan_result.query
139 domains_in_query_seq.push(hmmscan_result)
142 if prev_query_seq_name != nil
143 if compare(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
144 passing_sequences.push(domains_in_query_seq)
146 failing_sequences.push(domains_in_query_seq)
151 if in_msa.get_number_of_seqs < total_sequences
152 error_msg = "hmmscan output contains more protein sequences than fasta sequence file"
153 raise IOError, error_msg
157 log 'Domain architecture overview using default iE-cutoff ' +
158 IE_CUTOFF_FOR_DA_OVERVIEW.to_s + ' and relative length cutoff ' + REL_LEN_CUTOFF_FOR_DA_OVERVIEW.to_s + ':'
160 @encountered_domain_architectures = @encountered_domain_architectures.sort_by {|k, v| v}.reverse.to_h
162 @encountered_domain_architectures.each do |k, v|
163 log counter.to_s.rjust(2) + ') ' + v.to_s.rjust(5) + ': ' + k
171 log 'Passing domain arrangements of target domain(s):'
172 @passsing_domain_architectures = @passsing_domain_architectures.sort{|a, b|a<=>b}.to_h
174 @passsing_domain_architectures.each do |da, count|
175 passing_da_sum += count
176 log count.to_s.rjust(4) + ': ' + da
179 log 'Failing domain arrangements of target domain(s):'
180 @failing_domain_architectures = @failing_domain_architectures.sort{|a, b|a<=>b}.to_h
182 @failing_domain_architectures .each do |da, count|
183 failing_da_sum += count
184 log count.to_s.rjust(4) + ': ' + da
187 log 'Passing target domain(s):'
188 @passing_domains_data = @passing_domains_data.sort{|a, b|a<=>b}.to_h
189 @passing_domains_data.each do |n, d|
193 log 'Failing target domain(s) (in proteins sequences with target domain architecture):'
194 @failing_domains_data = @failing_domains_data.sort{|a, b|a<=>b}.to_h
195 @failing_domains_data.each do |n, d|
199 unless total_sequences == (passing_sequences.size + failing_sequences.size)
200 error_msg = "this should not have happened: total seqs not equal to passing plus failing seqs"
201 raise StandardError, error_msg
204 unless failing_sequences.size == (@failing_proteins_bc_not_all_target_doms_present + @failing_proteins_bc_missing_cutoffs)
205 error_msg = "this should not have happened: failing seqs sums not consistent"
206 raise StandardError, error_msg
209 unless @failing_proteins_bc_missing_cutoffs >= failing_da_sum
210 error_msg = "this should not have happened: failing seqs larger than failing da sum"
211 raise StandardError, error_msg
214 unless passing_sequences.size == passing_da_sum
215 error_msg = "this should not have happened: passing seqs not equal to passing da sum"
216 raise StandardError, error_msg
220 log "Protein sequences in sequence (fasta) file: " + in_msa.get_number_of_seqs.to_s.rjust(5)
221 log "Protein sequences in hmmscan output file : " + total_sequences.to_s.rjust(5)
222 log " Passing protein sequences : " + passing_sequences.size.to_s.rjust(5)
223 log " Failing protein sequences : " + failing_sequences.size.to_s.rjust(5)
224 log " Not all target domain present : " + @failing_proteins_bc_not_all_target_doms_present.to_s.rjust(5)
225 log " Target domain(s) failing cutoffs : " + @failing_proteins_bc_missing_cutoffs.to_s.rjust(5)
228 out_domain_msas.keys.sort.each do |domain_name|
229 file_name = outfile_base + TARGET_DOM_OUTPUT_MIDPART + domain_name + '.fasta'
230 write_msa out_domain_msas[domain_name], file_name
231 log "Wrote passing target domain sequence for " + domain_name.ljust(16) + ': ' + file_name
234 write_msa out_domain_architecture_msa, target_da_outfile
235 log 'Wrote target domain architecture : ' + target_da_outfile
237 write_msa out_concatenated_domains_msa, concat_target_dom_outfile
238 log 'Wrote concatenated target domain(s) : ' + concat_target_dom_outfile
240 passing_sequences.each do | domains |
241 query_name = domains[0].query
242 if (!TESTING) || (passed_seqs_msa.find_by_name_start( query_name, true ).length < 1)
243 add_sequence( query_name, in_msa, passed_seqs_msa )
245 error_msg = 'this should not have happened'
246 raise StandardError, error_msg
250 failing_sequences.each do | domains |
251 query_name = domains[0].query
252 if (!TESTING) || (failed_seqs_msa.find_by_name_start( query_name, true ).length < 1)
253 add_sequence( query_name, in_msa, failed_seqs_msa )
255 error_msg = 'this should not have happened'
256 raise StandardError, error_msg
260 write_msa passed_seqs_msa, passing_fl_seqs_outfile
261 log 'Wrote passing full length protein sequences : ' + passing_fl_seqs_outfile
262 write_msa failed_seqs_msa, failing_fl_seqs_outfile
263 log 'Wrote failing full length protein sequences : ' + failing_fl_seqs_outfile
266 f = File.open( logfile, 'w' )
269 rescue Exception => e
270 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
272 log 'Wrote log file : ' + logfile
278 # domains: Array of HmmscanResult
279 def collect(domains, ie_cutoff, rel_len_cutoff)
281 domains.each do |domain|
282 length = 1 + domain.env_to - domain.env_from
283 if (domain.i_e_value <= ie_cutoff) && (length >= (rel_len_cutoff * domain.tlen))
287 passed.sort! { |a,b| a.ali_from <=> b.ali_from }
289 passed.each do |domain|
291 s << DOMAIN_DELIMITER
296 if @encountered_domain_architectures.has_key? s
297 @encountered_domain_architectures[s] = 1 + @encountered_domain_architectures[s]
299 @encountered_domain_architectures[s] = 1
304 # domains_in_query_seq: Array of HmmscanResult
305 # target_domain_names: Set of String
306 # target_domains: Hash String->TargetDomain
307 # target_domain_architecture: String
308 def compare(domains_in_query_seq,
313 out_domain_architecture_msa,
314 out_contactenated_domains_msa,
315 target_domain_architecture)
317 collect(domains_in_query_seq, IE_CUTOFF_FOR_DA_OVERVIEW, REL_LEN_CUTOFF_FOR_DA_OVERVIEW)
319 domain_names_in_query_seq = Set.new
320 domains_in_query_seq.each do |domain|
321 domain_names_in_query_seq.add(domain.model)
324 passed_domains = Array.new
325 passed_domains_counts = Hash.new
327 if (domain_names_in_query_seq.length > 0) && (target_domain_names.subset?(domain_names_in_query_seq))
329 domains_in_query_seq.each do |domain|
330 if target_domains.has_key?(domain.model)
331 target_domain = target_domains[domain.model]
333 if target_domain.i_e_value >= 0
334 if domain.i_e_value > target_domain.i_e_value
335 addToFailingDomainData(domain)
339 if target_domain.abs_len > 0
340 length = 1 + domain.env_to - domain.env_from
341 if length < target_domain.abs_len
342 addToFailingDomainData(domain)
346 if target_domain.rel_len > 0
347 length = 1 + domain.env_to - domain.env_from
348 if length < (target_domain.rel_len * domain.tlen)
349 addToFailingDomainData(domain)
354 passed_domains.push(domain)
356 if (passed_domains_counts.key?(domain.model))
357 passed_domains_counts[domain.model] = passed_domains_counts[domain.model] + 1
359 passed_domains_counts[domain.model] = 1
362 addToPassingDomainData(domain)
364 end # if target_domains.has_key?(domain.model)
365 end # domains_in_query_seq.each do |domain|
367 @failing_proteins_bc_not_all_target_doms_present += 1
369 end # target_domain_names.subset?(domain_names_in_query_seq)
371 if passed_domains.length < 1
372 @failing_proteins_bc_missing_cutoffs += 1
376 passed_domains.sort! { |a,b| a.ali_from <=> b.ali_from }
377 # Compare architectures
378 if !compareArchitectures(target_domain_architecture, passed_domains, false)
379 @failing_proteins_bc_missing_cutoffs += 1
383 domain_counter = Hash.new
385 min_env_from = 10000000
390 concatenated_domains = ''
392 passed_domains.each do |domain|
393 domain_name = domain.model
395 unless out_domain_msas.has_key? domain_name
396 out_domain_msas[ domain_name ] = Msa.new
399 if domain_counter.key?(domain_name)
400 domain_counter[domain_name] = domain_counter[domain_name] + 1
402 domain_counter[domain_name] = 1
406 query_seq = domain.query
408 elsif query_seq != domain.query
409 error_msg = "seq names do not match: this should not have happened"
410 raise StandardError, error_msg
413 extracted_domain_seq = extract_domain( query_seq,
414 domain_counter[domain_name],
415 passed_domains_counts[domain_name],
419 out_domain_msas[ domain_name ] )
421 concatenated_domains += extracted_domain_seq
423 if domain.env_from < min_env_from
424 min_env_from = domain.env_from
426 if domain.env_to > max_env_to
427 max_env_to = domain.env_to
431 extract_domain( query_seq,
437 out_domain_architecture_msa )
439 out_contactenated_domains_msa.add( query_seq, concatenated_domains)
444 def addToPassingDomainData(domain)
445 unless ( @passing_domains_data.key?(domain.model))
446 @passing_domains_data[domain.model] = DomainData.new(domain.model)
448 @passing_domains_data[domain.model].add( domain.model, (1 + domain.env_to - domain.env_from), domain.i_e_value)
451 def addToFailingDomainData(domain)
452 unless ( @failing_domains_data.key?(domain.model))
453 @failing_domains_data[domain.model] = DomainData.new(domain.model)
455 @failing_domains_data[domain.model].add( domain.model, (1 + domain.env_to - domain.env_from), domain.i_e_value)
458 # passed_domains needs to be sorted!
459 def compareArchitectures(target_domain_architecture, passed_domains, strict = false)
460 domain_architecture = ''
461 passed_domains.each do |domain|
462 if domain_architecture.length > 0
463 domain_architecture += DOMAIN_DELIMITER
465 domain_architecture += domain.model
469 pass = (target_domain_architecture == domain_architecture)
471 pass = (domain_architecture.index(target_domain_architecture) != nil)
474 if (@passsing_domain_architectures.key?(domain_architecture))
475 @passsing_domain_architectures[domain_architecture] = @passsing_domain_architectures[domain_architecture] + 1
477 @passsing_domain_architectures[domain_architecture] = 1
480 if ( @failing_domain_architectures.key?(domain_architecture))
481 @failing_domain_architectures[domain_architecture] = @failing_domain_architectures[domain_architecture] + 1
483 @failing_domain_architectures[domain_architecture] = 1
489 def write_msa( msa, filename )
491 w = FastaWriter.new()
492 w.set_line_width( 60 )
494 File.delete(filename) if File.exist?(filename)
496 io.write_to_file( msa, filename, w )
498 error_msg = "could not write to \"" + filename + "\""
499 raise IOError, error_msg
503 def add_sequence( sequence_name, in_msa, add_to_msa )
504 seqs = in_msa.find_by_name_start( sequence_name, true )
505 if ( seqs.length < 1 )
506 error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
507 raise StandardError, error_msg
509 if ( seqs.length > 1 )
510 error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
511 raise StandardError, error_msg
513 seq = in_msa.get_sequence( seqs[ 0 ] )
514 add_to_msa.add_sequence( seq )
517 def extract_domain( seq_name,
524 if number < 1 || out_of < 1 || number > out_of
525 error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s
526 raise StandardError, error_msg
528 if seq_from < 1 || seq_to < 1 || seq_from >= seq_to
529 error_msg = "impossible: seq-from=" + seq_from.to_s + ", seq-to=" + seq_to.to_s
530 raise StandardError, error_msg
532 seqs = in_msa.find_by_name_start(seq_name, true)
534 error_msg = "sequence \"" + seq_name + "\" not found in sequence file"
535 raise IOError, error_msg
538 error_msg = "sequence \"" + seq_name + "\" not unique in sequence file"
539 raise IOError, error_msg
541 # hmmscan is 1 based, whereas sequences are 0 bases in this package.
542 seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
544 orig_name = seq.get_name
546 seq.set_name( orig_name.split[ 0 ] )
549 seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
552 out_msa.add_sequence( seq )
554 seq.get_sequence_as_string
557 def parse_da( target_da_str )
558 target_domain_hash = Hash.new
559 target_domain_ary = Array.new
560 target_das = target_da_str.split '--'
561 target_das.each do |x|
563 unless inds.size == 1 || inds.size == 4
564 raise IOError, 'domain architecture is ill formatted: ' + x
567 target_domain_name = inds[0]
568 ie_cutoff = inds.size == 4 ? Float(inds[1]) : IE_CUTOFF_FOR_DA_OVERVIEW
569 abs_len_cutoff = inds.size == 4 ? Integer(inds[2]) : 0
570 rel_len_cutoff = inds.size == 4 ? Float(inds[3]) : REL_LEN_CUTOFF_FOR_DA_OVERVIEW
571 if target_domain_hash.has_key? target_domain_name
572 target_domain_ary.push target_domain_hash[target_domain_name]
574 td = TargetDomain.new(target_domain_name, ie_cutoff, abs_len_cutoff, rel_len_cutoff)
575 target_domain_hash[target_domain_name] = td
576 target_domain_ary.push td
584 @log << str << Constants::LINE_DELIMITER
587 end # class HmmscanMultiDomainExtractor
590 def initialize( name )
591 if (name == nil) || name.size < 1
592 error_msg = "domain name nil or empty"
593 raise IOError, error_msg
597 @i_e_value_min = 10000000.0
598 @i_e_value_max = -1.0
605 def add( name, length, i_e_value)
607 error_msg = "domain names do not match"
608 raise IOError, error_msg
612 error_msg = "length cannot me negative"
613 raise IOError, error_msg
616 error_msg = "iE-value cannot me negative"
617 raise IOError, error_msg
620 @i_e_value_sum += i_e_value
622 if i_e_value > @i_e_value_max
623 @i_e_value_max = i_e_value
625 if i_e_value < @i_e_value_min
626 @i_e_value_min = i_e_value
647 @i_e_value_sum / @count
652 s << @name.rjust(16) + ': '
653 s << @count.to_s.rjust(4) + ' '
654 s << avg_length.round(1).to_s.rjust(6) + ' '
655 s << @len_min.to_s.rjust(4) + ' -'
656 s << @len_max.to_s.rjust(4) + ' '
657 s << ("%.2E" % avg_i_e_value).rjust(9) + ' '
658 s << ("%.2E" % @i_e_value_min).rjust(9) + ' -'
659 s << ("%.2E" % @i_e_value_max).rjust(9) + ' '
663 attr_reader :name, :count, :i_e_value_min, :i_e_value_max, :length_min, :length_max
668 def initialize(name, i_e_value, abs_len, rel_len)
669 if (name == nil) || name.size < 1
670 error_msg = "target domain name nil or empty"
671 raise IOError, error_msg
674 error_msg = name + ": target domain relative length is greater than 1"
675 raise IOError, error_msg
677 if (abs_len <= 0) && (rel_len <= 0)
678 error_msg = name + ": need to have either absolute length or relative length cutoff"
679 raise IOError, error_msg
681 if (abs_len > 0) && (rel_len > 0)
682 error_msg = name + ": cannot have both absolute length and relative length cutoff"
683 raise IOError, error_msg
686 @i_e_value = i_e_value
692 s = @name.rjust(16) + ':'
693 s << ' iE-cutoff: ' + ("%.2E" % @i_e_value).rjust(9)
695 s << ', abs len-cutoff: ' + @abs_len.to_s.rjust(4)
698 s << ', rel len-cutoff: ' + @rel_len.to_s.rjust(4)
702 attr_reader :name, :i_e_value, :abs_len, :rel_len