2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanMultiDomainExtractor class
4 # Copyright:: Copyright (C) 2017 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # Last modified: 2017/02/20
9 require 'lib/evo/util/constants'
10 require 'lib/evo/msa/msa_factory'
11 require 'lib/evo/io/msa_io'
12 require 'lib/evo/io/writer/fasta_writer'
13 require 'lib/evo/io/parser/fasta_parser'
14 require 'lib/evo/io/parser/hmmscan_parser'
17 class HmmscanMultiDomainExtractor
21 DOMAIN_DELIMITER = ' -- '
23 PASSING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_passing_full_length_seqs.fasta"
24 FAILING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_failing_full_length_seqs.fasta"
25 TARGET_DA_SUFFIX = "_#{OUTPUT_ID}_target_da.fasta"
26 CONCAT_TARGET_DOM_SUFFIX = "_#{OUTPUT_ID}_concat_target_doms.fasta"
27 TARGET_DOM_OUTPUT_MIDPART = "_#{OUTPUT_ID}_target_dom_"
28 LOG_FILE_SUFFIX = "_#{OUTPUT_ID}.log"
30 @passing_domains_data = nil
31 @failing_domains_data = nil
32 @failing_domain_architectures = nil
33 @passsing_domain_architectures = nil
34 @failing_proteins_bc_not_all_target_doms_present = nil
35 @failing_proteins_bc_missing_cutoffs = nil
39 # raises ArgumentError, IOError, StandardError
46 passing_fl_seqs_outfile = outfile_base + PASSING_FL_SEQS_SUFFIX
47 failing_fl_seqs_outfile = outfile_base + FAILING_FL_SEQS_SUFFIX
48 target_da_outfile = outfile_base + TARGET_DA_SUFFIX
49 concat_target_dom_outfile = outfile_base + CONCAT_TARGET_DOM_SUFFIX
50 logfile = outfile_base + LOG_FILE_SUFFIX
52 Util.check_file_for_readability( hmmscan_output )
53 Util.check_file_for_readability( fasta_sequence_file )
54 Util.check_file_for_writability( passing_fl_seqs_outfile )
55 Util.check_file_for_writability( failing_fl_seqs_outfile )
56 Util.check_file_for_writability( target_da_outfile )
57 Util.check_file_for_writability( concat_target_dom_outfile )
58 Util.check_file_for_writability( logfile )
63 factory = MsaFactory.new()
64 in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
66 if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
67 error_msg = "could not find fasta sequences in " + fasta_sequence_file
68 raise IOError, error_msg
71 failed_seqs_msa = Msa.new
72 passed_seqs_msa = Msa.new
74 hmmscan_parser = HmmscanParser.new( hmmscan_output )
75 results = hmmscan_parser.parse
78 # Import: if multiple copies of same domain, thresholds need to be same!
80 #target_domain_ary.push(TargetDomain.new('DNA_pol_B_exo1', 1e-6, -1, 0.6, 0 ))
81 #target_domain_ary.push(TargetDomain.new('DNA_pol_B', 1e-6, -1, 0.6, 1 ))
83 #target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.9, 0 ))
84 #target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.9, 1 ))
85 # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 ))
86 # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 ))
88 target_domain_ary = parse_da target_da
90 # target_domain_ary.push(TargetDomain.new('BH4', 1, -1, 0.2, 0 ))
91 # target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 1 ))
92 # target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 2 ))
93 # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 ))
95 # target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 ))
96 # target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 1 ))
98 #target_domain_ary.push(TargetDomain.new('Chordopox_A33R', 1000, -1, 0.1 ))
100 target_domains = Hash.new
102 target_domain_architecure = ""
104 target_domain_ary.each do |target_domain|
105 target_domains[target_domain.name] = target_domain
106 if target_domain_architecure.length > 0
107 target_domain_architecure += DOMAIN_DELIMITER
109 target_domain_architecure += target_domain.name
112 target_domain_architecure.freeze
114 log 'Hmmscan outputfile : ' + hmmscan_output
115 log 'Full length fasta sequence file: ' + fasta_sequence_file
116 log 'Target domain architecture : ' + target_domain_architecure
117 target_domain_ary.each do |x|
121 target_domain_names = Set.new
123 target_domains.each_key {|key| target_domain_names.add( key ) }
125 prev_query_seq_name = nil
126 domains_in_query_seq = Array.new
127 passing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq.
128 failing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq.
131 @failing_domain_architectures = Hash.new
132 @passsing_domain_architectures = Hash.new
133 @passing_domains_data = Hash.new
134 @failing_domains_data = Hash.new
135 @failing_proteins_bc_not_all_target_doms_present = 0
136 @failing_proteins_bc_missing_cutoffs = 0
137 out_domain_msas = Hash.new
138 out_domain_architecture_msa = Msa.new
139 out_concatenated_domains_msa = Msa.new
140 results.each do |hmmscan_result|
141 if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name )
142 if compare(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
143 passing_sequences.push(domains_in_query_seq)
145 failing_sequences.push(domains_in_query_seq)
147 domains_in_query_seq = Array.new
150 prev_query_seq_name = hmmscan_result.query
151 domains_in_query_seq.push(hmmscan_result)
154 if prev_query_seq_name != nil
155 if compare(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
156 passing_sequences.push(domains_in_query_seq)
158 failing_sequences.push(domains_in_query_seq)
163 if in_msa.get_number_of_seqs < total_sequences
164 error_msg = "hmmscan output contains more protein sequences than fasta sequence file"
165 raise IOError, error_msg
169 log 'Passing domain architectures containing target domain(s):'
170 @passsing_domain_architectures = @passsing_domain_architectures.sort{|a, b|a<=>b}.to_h
172 @passsing_domain_architectures.each do |da, count|
173 passing_da_sum += count
174 log count.to_s.rjust(4) + ': ' + da
177 log 'Failing domain architectures containing one or more target domain(s):'
178 @failing_domain_architectures = @failing_domain_architectures .sort{|a, b|a<=>b}.to_h
180 @failing_domain_architectures .each do |da, count|
181 failing_da_sum += count
182 log count.to_s.rjust(4) + ': ' + da
185 log 'Passing target domain(s):'
186 @passing_domains_data = @passing_domains_data.sort{|a, b|a<=>b}.to_h
187 @passing_domains_data.each do |n, d|
191 log'Failing target domain(s):'
192 @failing_domains_data = @failing_domains_data.sort{|a, b|a<=>b}.to_h
193 @failing_domains_data.each do |n, d|
197 unless total_sequences == (passing_sequences.size + failing_sequences.size)
198 error_msg = "this should not have happened: total seqs not equal to passing plus failing seqs"
199 raise StandardError, error_msg
202 unless failing_sequences.size == (@failing_proteins_bc_not_all_target_doms_present + @failing_proteins_bc_missing_cutoffs)
203 error_msg = "this should not have happened: failing seqs sums not consistent"
204 raise StandardError, error_msg
207 unless @failing_proteins_bc_missing_cutoffs == failing_da_sum
208 error_msg = "this should not have happened: failing seqs not equal to failing da sum"
209 raise StandardError, error_msg
212 unless passing_sequences.size == passing_da_sum
213 error_msg = "this should not have happened: passing seqs not equal to passing da sum"
214 raise StandardError, error_msg
218 log "Protein sequences in sequence (fasta) file: " + in_msa.get_number_of_seqs.to_s.rjust(5)
219 log "Protein sequences in hmmscan output file : " + total_sequences.to_s.rjust(5)
220 log " Passing protein sequences : " + passing_sequences.size.to_s.rjust(5)
221 log " Failing protein sequences : " + failing_sequences.size.to_s.rjust(5)
222 log " Not all target domain present : " + @failing_proteins_bc_not_all_target_doms_present.to_s.rjust(5)
223 log " Target domain(s) failing cutoffs : " + @failing_proteins_bc_missing_cutoffs.to_s.rjust(5)
226 out_domain_msas.keys.sort.each do |domain_name|
227 file_name = outfile_base + TARGET_DOM_OUTPUT_MIDPART + domain_name + '.fasta'
228 write_msa out_domain_msas[domain_name], file_name
229 log "Wrote passing target domain sequence for " + domain_name.ljust(16) + ': ' + file_name
232 write_msa out_domain_architecture_msa, target_da_outfile
233 log 'Wrote target domain architecture : ' + target_da_outfile
235 write_msa out_concatenated_domains_msa, concat_target_dom_outfile
236 log 'Wrote concatenated target domain(s) : ' + concat_target_dom_outfile
238 passing_sequences.each do | domains |
239 query_name = domains[0].query
240 if (!TESTING) || (passed_seqs_msa.find_by_name_start( query_name, true ).length < 1)
241 add_sequence( query_name, in_msa, passed_seqs_msa )
243 error_msg = 'this should not have happened'
244 raise StandardError, error_msg
248 failing_sequences.each do | domains |
249 query_name = domains[0].query
250 if (!TESTING) || (failed_seqs_msa.find_by_name_start( query_name, true ).length < 1)
251 add_sequence( query_name, in_msa, failed_seqs_msa )
253 error_msg = 'this should not have happened'
254 raise StandardError, error_msg
258 write_msa passed_seqs_msa, passing_fl_seqs_outfile
259 log 'Wrote passing full length protein sequences : ' + passing_fl_seqs_outfile
260 write_msa failed_seqs_msa, failing_fl_seqs_outfile
261 log 'Wrote failing full length protein sequences : ' + failing_fl_seqs_outfile
264 f = File.open( logfile, 'w' )
267 rescue Exception => e
268 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
270 log 'Wrote log file : ' + logfile
276 # domains_in_query_seq: Array of HmmscanResult
277 # target_domain_names: Set of String
278 # target_domains: Hash String->TargetDomain
279 # target_domain_architecture: String
280 def compare(domains_in_query_seq,
285 out_domain_architecture_msa,
286 out_contactenated_domains_msa,
287 target_domain_architecture)
289 domain_names_in_query_seq = Set.new
290 domains_in_query_seq.each do |domain|
291 domain_names_in_query_seq.add(domain.model)
293 if target_domain_names.subset?(domain_names_in_query_seq)
295 passed_domains = Array.new
296 passed_domains_counts = Hash.new
298 domains_in_query_seq.each do |domain|
299 if target_domains.has_key?(domain.model)
300 target_domain = target_domains[domain.model]
302 if (target_domain.i_e_value != nil) && (target_domain.i_e_value >= 0)
303 if domain.i_e_value > target_domain.i_e_value
304 addToFailingDomainData(domain)
308 if (target_domain.abs_len != nil) && (target_domain.abs_len > 0)
309 length = 1 + domain.env_to - domain.env_from
310 if length < target_domain.abs_len
311 addToFailingDomainData(domain)
315 if (target_domain.rel_len != nil) && (target_domain.rel_len > 0)
316 length = 1 + domain.env_to - domain.env_from
317 if length < (target_domain.rel_len * domain.tlen)
318 addToFailingDomainData(domain)
323 passed_domains.push(domain)
325 if (passed_domains_counts.key?(domain.model))
326 passed_domains_counts[domain.model] = passed_domains_counts[domain.model] + 1
328 passed_domains_counts[domain.model] = 1
331 addToPassingDomainData(domain)
333 end # if target_domains.has_key?(domain.model)
334 end # domains_in_query_seq.each do |domain|
336 @failing_proteins_bc_not_all_target_doms_present += 1
338 end # target_domain_names.subset?(domain_names_in_query_seq)
340 passed_domains.sort! { |a,b| a.ali_from <=> b.ali_from }
341 # Compare architectures
342 if !compareArchitectures(target_domain_architecture, passed_domains)
343 @failing_proteins_bc_missing_cutoffs += 1
347 domain_counter = Hash.new
349 min_env_from = 10000000
354 concatenated_domains = ''
356 passed_domains.each do |domain|
357 domain_name = domain.model
359 unless out_domain_msas.has_key? domain_name
360 out_domain_msas[ domain_name ] = Msa.new
363 if domain_counter.key?(domain_name)
364 domain_counter[domain_name] = domain_counter[domain_name] + 1
366 domain_counter[domain_name] = 1
370 query_seq = domain.query
372 elsif query_seq != domain.query
373 error_msg = "seq names do not match: this should not have happened"
374 raise StandardError, error_msg
377 extracted_domain_seq = extract_domain( query_seq,
378 domain_counter[domain_name],
379 passed_domains_counts[domain_name],
383 out_domain_msas[ domain_name ] )
385 concatenated_domains += extracted_domain_seq
387 if domain.env_from < min_env_from
388 min_env_from = domain.env_from
390 if domain.env_to > max_env_to
391 max_env_to = domain.env_to
395 extract_domain( query_seq,
401 out_domain_architecture_msa )
403 out_contactenated_domains_msa.add( query_seq, concatenated_domains)
408 def addToPassingDomainData(domain)
409 unless ( @passing_domains_data.key?(domain.model))
410 @passing_domains_data[domain.model] = DomainData.new(domain.model)
412 @passing_domains_data[domain.model].add( domain.model, (1 + domain.env_to - domain.env_from), domain.i_e_value)
415 def addToFailingDomainData(domain)
416 unless ( @failing_domains_data.key?(domain.model))
417 @failing_domains_data[domain.model] = DomainData.new(domain.model)
419 @failing_domains_data[domain.model].add( domain.model, (1 + domain.env_to - domain.env_from), domain.i_e_value)
422 # passed_domains needs to be sorted!
423 def compareArchitectures(target_domain_architecture, passed_domains, strict = false)
424 domain_architecture = ''
425 passed_domains.each do |domain|
426 if domain_architecture.length > 0
427 domain_architecture += DOMAIN_DELIMITER
429 domain_architecture += domain.model
433 pass = (target_domain_architecture == domain_architecture)
435 pass = (domain_architecture.index(target_domain_architecture) != nil)
438 if (@passsing_domain_architectures.key?(domain_architecture))
439 @passsing_domain_architectures[domain_architecture] = @passsing_domain_architectures[domain_architecture] + 1
441 @passsing_domain_architectures[domain_architecture] = 1
444 if ( @failing_domain_architectures .key?(domain_architecture))
445 @failing_domain_architectures [domain_architecture] = @failing_domain_architectures [domain_architecture] + 1
447 @failing_domain_architectures [domain_architecture] = 1
453 def write_msa( msa, filename )
455 w = FastaWriter.new()
456 w.set_line_width( 60 )
458 File.delete(filename) if File.exist?(filename)
460 io.write_to_file( msa, filename, w )
462 error_msg = "could not write to \"" + filename + "\""
463 raise IOError, error_msg
467 def add_sequence( sequence_name, in_msa, add_to_msa )
468 seqs = in_msa.find_by_name_start( sequence_name, true )
469 if ( seqs.length < 1 )
470 error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
471 raise StandardError, error_msg
473 if ( seqs.length > 1 )
474 error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
475 raise StandardError, error_msg
477 seq = in_msa.get_sequence( seqs[ 0 ] )
478 add_to_msa.add_sequence( seq )
481 def extract_domain( seq_name,
488 if number < 1 || out_of < 1 || number > out_of
489 error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s
490 raise StandardError, error_msg
492 if seq_from < 1 || seq_to < 1 || seq_from >= seq_to
493 error_msg = "impossible: seq-from=" + seq_from.to_s + ", seq-to=" + seq_to.to_s
494 raise StandardError, error_msg
496 seqs = in_msa.find_by_name_start(seq_name, true)
498 error_msg = "sequence \"" + seq_name + "\" not found in sequence file"
499 raise IOError, error_msg
502 error_msg = "sequence \"" + seq_name + "\" not unique in sequence file"
503 raise IOError, error_msg
505 # hmmscan is 1 based, whereas sequences are 0 bases in this package.
506 seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
508 orig_name = seq.get_name
510 seq.set_name( orig_name.split[ 0 ] )
513 seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
516 out_msa.add_sequence( seq )
518 seq.get_sequence_as_string
521 def parse_da( target_da_str )
522 target_domain_hash = Hash.new
523 target_domain_ary = Array.new
524 target_das = target_da_str.split '--'
525 target_das.each do |x|
527 unless inds.size == 4
528 raise IOError, 'domain architecture is ill formatted: ' + x
530 target_domain_name = inds[0]
531 ie_cutoff = Float(inds[1])
532 abs_len_cutoff = Integer(inds[2])
533 rel_len_cutoff = Float(inds[3])
534 if target_domain_hash.has_key? target_domain_name
535 target_domain_ary.push target_domain_hash[target_domain_name]
537 td = TargetDomain.new(target_domain_name, ie_cutoff, abs_len_cutoff, rel_len_cutoff)
538 target_domain_hash[target_domain_name] = td
539 target_domain_ary.push td
547 @log << str << Constants::LINE_DELIMITER
550 end # class HmmscanMultiDomainExtractor
553 def initialize( name )
554 if (name == nil) || name.size < 1
555 error_msg = "domain name nil or empty"
556 raise IOError, error_msg
560 @i_e_value_min = 10000000.0
561 @i_e_value_max = -1.0
568 def add( name, length, i_e_value)
570 error_msg = "domain names do not match"
571 raise IOError, error_msg
575 error_msg = "length cannot me negative"
576 raise IOError, error_msg
579 error_msg = "iE-value cannot me negative"
580 raise IOError, error_msg
583 @i_e_value_sum += i_e_value
585 if i_e_value > @i_e_value_max
586 @i_e_value_max = i_e_value
588 if i_e_value < @i_e_value_min
589 @i_e_value_min = i_e_value
610 @i_e_value_sum / @count
615 s << @name.rjust(16) + ': '
616 s << @count.to_s.rjust(4) + ' '
617 s << avg_length.round(1).to_s.rjust(6) + ' '
618 s << @len_min.to_s.rjust(4) + ' -'
619 s << @len_max.to_s.rjust(4) + ' '
620 s << ("%.2E" % avg_i_e_value).rjust(9) + ' '
621 s << ("%.2E" % @i_e_value_min).rjust(9) + ' -'
622 s << ("%.2E" % @i_e_value_max).rjust(9) + ' '
626 attr_reader :name, :count, :i_e_value_min, :i_e_value_max, :length_min, :length_max
631 def initialize(name, i_e_value, abs_len, rel_len)
632 if (name == nil) || name.size < 1
633 error_msg = "target domain name nil or empty"
634 raise IOError, error_msg
637 error_msg = name + ": target domain relative length is greater than 1"
638 raise IOError, error_msg
640 if (abs_len <= 0) && (rel_len <= 0)
641 error_msg = name + ": need to have either absolute length or relative length cutoff"
642 raise IOError, error_msg
644 if (abs_len > 0) && (rel_len > 0)
645 error_msg = name + ": cannot have both absolute length and relative length cutoff"
646 raise IOError, error_msg
649 @i_e_value = i_e_value
655 s = @name.rjust(16) + ':'
656 s << ' iE-cutoff: ' + ("%.2E" % @i_e_value).rjust(9)
658 s << ', abs len-cutoff: ' + @abs_len.to_s.rjust(4)
661 s << ', rel len-cutoff: ' + @rel_len.to_s.rjust(4)
665 attr_reader :name, :i_e_value, :abs_len, :rel_len