2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanMultiDomainExtractor class
4 # Copyright:: Copyright (C) 2017 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # Last modified: 2017/02/20
9 require 'lib/evo/util/constants'
10 require 'lib/evo/msa/msa_factory'
11 require 'lib/evo/io/msa_io'
12 require 'lib/evo/io/writer/fasta_writer'
13 require 'lib/evo/io/parser/fasta_parser'
14 require 'lib/evo/io/parser/hmmscan_parser'
17 class HmmscanMultiDomainExtractor
21 PASSING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_passing_full_length_seqs.fasta"
22 FAILING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_failing_full_length_seqs.fasta"
23 TARGET_DA_SUFFIX = "_#{OUTPUT_ID}_target_da.fasta"
24 CONCAT_TARGET_DOM_SUFFIX = "_#{OUTPUT_ID}_concat_target_dom.fasta"
27 @non_passsing_domain_architectures = Hash.new
30 # raises ArgumentError, IOError, StandardError
37 passing_fl_seqs_outfile = outfile_base + PASSING_FL_SEQS_SUFFIX
38 failing_fl_seqs_outfile = outfile_base + FAILING_FL_SEQS_SUFFIX
39 target_da_outfile = outfile_base + TARGET_DA_SUFFIX
40 concat_target_dom_outfile = outfile_base + CONCAT_TARGET_DOM_SUFFIX
42 Util.check_file_for_readability( hmmscan_output )
43 Util.check_file_for_readability( fasta_sequence_file )
44 Util.check_file_for_writability( passing_fl_seqs_outfile )
45 Util.check_file_for_writability( failing_fl_seqs_outfile )
46 Util.check_file_for_writability( target_da_outfile )
47 Util.check_file_for_writability( concat_target_dom_outfile )
50 factory = MsaFactory.new()
51 in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
53 if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
54 error_msg = "could not find fasta sequences in " + fasta_sequence_file
55 raise IOError, error_msg
60 failed_seqs_msa = Msa.new
61 passed_seqs_msa = Msa.new
62 # passed_multi_seqs_msa = Msa.new
64 ld = Constants::LINE_DELIMITER
66 domain_pass_counter = 0
67 domain_fail_counter = 0
68 passing_domains_per_protein = 0
69 proteins_with_failing_domains = 0
70 domain_not_present_counter = 0
72 max_domain_copy_number_per_protein = -1
73 max_domain_copy_number_sequence = ""
74 passing_target_length_sum = 0
75 overall_target_length_sum = 0
76 overall_target_length_min = 10000000
77 overall_target_length_max = -1
78 passing_target_length_min = 10000000
79 passing_target_length_max = -1
81 overall_target_ie_min = 10000000
82 overall_target_ie_max = -1
83 passing_target_ie_min = 10000000
84 passing_target_ie_max = -1
86 hmmscan_parser = HmmscanParser.new( hmmscan_output )
87 results = hmmscan_parser.parse
90 # Import: if multiple copies of same domain, threshold need to be same!
91 target_domain_ary = Array.new
93 target_domain_ary.push(TargetDomain.new('DNA_pol_B_exo1', 1e-6, -1, 0.6, 0 ))
94 target_domain_ary.push(TargetDomain.new('DNA_pol_B', 1e-6, -1, 0.6, 1 ))
96 # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.6, 0 ))
97 # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.6, 1 ))
98 # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 ))
99 # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 ))
101 #target_domain_ary.push(TargetDomain.new('BH4', 0.1, -1, 0.5, 0 ))
102 #target_domain_ary.push(TargetDomain.new('Bcl-2', 0.1, -1, 0.5, 1 ))
103 # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 ))
105 # target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 ))
106 # target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 1 ))
108 #target_domain_ary.push(TargetDomain.new('Chordopox_A33R', 1000, -1, 0.1 ))
110 target_domains = Hash.new
112 target_domain_architecure = ""
114 target_domain_ary.each do |target_domain|
115 target_domains[target_domain.name] = target_domain
116 if target_domain_architecure.length > 0
117 target_domain_architecure += ">"
119 target_domain_architecure += target_domain.name
122 target_domain_architecure.freeze
124 puts 'Target domain architecture: ' + target_domain_architecure
126 target_domain_names = Set.new
128 target_domains.each_key {|key| target_domain_names.add( key ) }
130 prev_query_seq_name = nil
131 domains_in_query_seq = Array.new
132 passing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq.
133 failing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq.
136 out_domain_msas = Hash.new
137 out_domain_architecture_msa = Msa.new
138 out_concatenated_domains_msa = Msa.new
139 results.each do |hmmscan_result|
140 if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name )
141 if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_contactenated_domains_msa, target_domain_architecure)
142 passing_sequences.push(domains_in_query_seq)
144 failing_sequences.push(domains_in_query_seq)
147 domains_in_query_seq = Array.new
150 prev_query_seq_name = hmmscan_result.query
151 domains_in_query_seq.push(hmmscan_result)
154 if prev_query_seq_name != nil
156 if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_contactenated_domains_msa, target_domain_architecure)
157 passing_sequences.push(domains_in_query_seq)
159 failing_sequences.push(domains_in_query_seq)
163 out_domain_msas.keys.sort.each do |domain_name|
164 puts "writing #{domain_name}"
165 write_msa( out_domain_msas[domain_name], domain_name + ".fasta" )
168 puts "writing target_domain_architecure"
169 write_msa( out_domain_architecture_msa, target_da_outfile )
171 puts "writing contactenated_domains_msa"
172 write_msa( out_concatenated_domains_msa, concat_target_dom_outfile )
174 passing_sequences.each do | domains |
175 query_name = domains[0].query
176 if passed_seqs_msa.find_by_name_start( query_name, true ).length < 1
177 add_sequence( query_name, in_msa, passed_seqs_msa )
179 error_msg = "this should not have happened"
180 raise StandardError, error_msg
184 failing_sequences.each do | domains |
185 query_name = domains[0].query
186 if failed_seqs_msa.find_by_name_start( query_name, true ).length < 1
187 add_sequence( query_name, in_msa, failed_seqs_msa )
189 error_msg = "this should not have happened"
190 raise StandardError, error_msg
196 puts 'Non passing domain architectures containing target domain(s):'
197 @non_passsing_domain_architectures = @non_passsing_domain_architectures.sort{|a, b|a<=>b}.to_h
198 @non_passsing_domain_architectures.each do |da, count|
199 puts da + ': ' + count.to_s
204 puts 'Passing target domain counts:'
205 @passed = @passed.sort{|a, b|a<=>b}.to_h
206 @passed.each do |dom, count|
207 puts dom + ': ' + count.to_s
212 puts "total sequences : " + total_sequences.to_s
213 puts "passing sequences: " + passing_sequences.size.to_s
215 write_msa( passed_seqs_msa, passing_fl_seqs_outfile )
216 write_msa( failed_seqs_msa, failing_fl_seqs_outfile )
219 log_str << "passing target domains : " + domain_pass_counter.to_s + ld
220 log_str << "failing target domains : " + domain_fail_counter.to_s + ld
221 log_str << "proteins in sequence (fasta) file : " + in_msa.get_number_of_seqs.to_s + ld
222 log_str << "proteins in hmmscan outputfile : " + protein_counter.to_s + ld
223 log_str << "proteins with passing target domain(s) : " + passed_seqs_msa.get_number_of_seqs.to_s + ld
224 log_str << "proteins with no passing target domain : " + proteins_with_failing_domains.to_s + ld
225 log_str << "proteins with no target domain : " + domain_not_present_counter.to_s + ld
229 return domain_pass_counter
235 # domains_in_query_seq: Array of HmmscanResult
236 # target_domain_names: Set of String
237 # target_domains: Hash String->TargetDomain
238 # target_domain_architecture: String
239 def checkit2(domains_in_query_seq,
244 out_domain_architecture_msa,
245 out_contactenated_domains_msa,
246 target_domain_architecture)
248 domain_names_in_query_seq = Set.new
249 domains_in_query_seq.each do |domain|
250 domain_names_in_query_seq.add(domain.model)
252 if (target_domain_names.subset?(domain_names_in_query_seq))
254 passed_domains = Array.new
255 passed_domains_counts = Hash.new
257 domains_in_query_seq.each do |domain|
258 if target_domains.has_key?(domain.model)
259 target_domain = target_domains[domain.model]
261 if (target_domain.i_e_value != nil) && (target_domain.i_e_value >= 0)
262 if domain.i_e_value > target_domain.i_e_value
266 if (target_domain.abs_len != nil) && (target_domain.abs_len > 0)
267 length = 1 + domain.env_to - domain.env_from
268 if length < target_domain.abs_len
272 if (target_domain.rel_len != nil) && (target_domain.rel_len > 0)
273 length = 1 + domain.env_to - domain.env_from
274 # puts (target_domain.rel_len * domain.tlen)
275 if length < (target_domain.rel_len * domain.tlen)
280 passed_domains.push(domain)
282 if (passed_domains_counts.key?(domain.model))
283 passed_domains_counts[domain.model] = passed_domains_counts[domain.model] + 1
285 passed_domains_counts[domain.model] = 1
288 if (@passed.key?(domain.model))
289 @passed[domain.model] = @passed[domain.model] + 1
291 @passed[domain.model] = 1
293 end # if target_domains.has_key?(domain.model)
294 end # domains_in_query_seq.each do |domain|
299 passed_domains.sort! { |a,b| a.ali_from <=> b.ali_from }
300 # Compare architectures
301 if !compareArchitectures(target_domain_architecture, passed_domains)
305 domain_counter = Hash.new
307 min_env_from = 10000000
309 min_env_to = 10000000
314 concatenated_domains = ''
316 passed_domains.each do |domain|
317 domain_name = domain.model
319 unless out_domain_msas.has_key? domain_name
320 out_domain_msas[ domain_name ] = Msa.new
323 if domain_counter.key?(domain_name)
324 domain_counter[domain_name] = domain_counter[domain_name] + 1
326 domain_counter[domain_name] = 1
330 query_seq = domain.query
332 elsif query_seq != domain.query
333 error_msg = "seq names do not match: this should not have happened"
334 raise StandardError, error_msg
337 extracted_domain_seq = extract_domain( query_seq,
338 domain_counter[domain_name],
339 passed_domains_counts[domain_name],
343 out_domain_msas[ domain_name ] )
345 concatenated_domains += extracted_domain_seq
347 if domain.env_from < min_env_from
348 min_env_from = domain.env_from
350 if domain.env_from > max_env_from
351 max_env_from = domain.env_from
353 if domain.env_to < min_env_to
354 min_env_to = domain.env_to
356 if domain.env_to > max_env_to
357 max_env_to = domain.env_to
361 #puts "env from: #{min_env_from} - #{max_env_from}"
362 #puts "env to : #{min_env_to} - #{max_env_to}"
364 extract_domain( query_seq,
370 out_domain_architecture_msa )
372 puts passed_domains_counts
374 out_contactenated_domains_msa.add( query_seq, concatenated_domains)
379 # passed_domains needs to be sorted!
380 def compareArchitectures(target_domain_architecture, passed_domains, strict = false)
381 domain_architecture = ""
382 passed_domains.each do |domain|
383 if domain_architecture.length > 0
384 domain_architecture += ">"
386 domain_architecture += domain.model
390 pass = (target_domain_architecture == domain_architecture)
392 pass = (domain_architecture.index(target_domain_architecture) != nil)
395 if (@non_passsing_domain_architectures.key?(domain_architecture))
396 @non_passsing_domain_architectures[domain_architecture] = @non_passsing_domain_architectures[domain_architecture] + 1
398 @non_passsing_domain_architectures[domain_architecture] = 1
404 def write_msa( msa, filename )
406 w = FastaWriter.new()
407 w.set_line_width( 60 )
409 File.delete(filename) if File.exist?(filename) #TODO remove me
411 io.write_to_file( msa, filename, w )
413 error_msg = "could not write to \"" + filename + "\""
414 raise IOError, error_msg
418 def add_sequence( sequence_name, in_msa, add_to_msa )
419 seqs = in_msa.find_by_name_start( sequence_name, true )
420 if ( seqs.length < 1 )
421 error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
422 raise StandardError, error_msg
424 if ( seqs.length > 1 )
425 error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
426 raise StandardError, error_msg
428 seq = in_msa.get_sequence( seqs[ 0 ] )
429 add_to_msa.add_sequence( seq )
432 def extract_domain( seq_name,
439 if number < 1 || out_of < 1 || number > out_of
440 error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s
441 raise StandardError, error_msg
443 if seq_from < 1 || seq_to < 1 || seq_from >= seq_to
444 error_msg = "impossible: seq-from=" + seq_from.to_s + ", seq-to=" + seq_to.to_s
445 raise StandardError, error_msg
447 seqs = in_msa.find_by_name_start(seq_name, true)
449 error_msg = "sequence \"" + seq_name + "\" not found in sequence file"
450 raise IOError, error_msg
453 error_msg = "sequence \"" + seq_name + "\" not unique in sequence file"
454 raise IOError, error_msg
456 # hmmscan is 1 based, whereas sequences are 0 bases in this package.
457 seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
459 orig_name = seq.get_name
461 seq.set_name( orig_name.split[ 0 ] )
464 seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
467 out_msa.add_sequence( seq )
469 seq.get_sequence_as_string
472 end # class HmmscanMultiDomainExtractor
475 def initialize( name, i_e_value, abs_len, rel_len, position )
476 if (name == nil) || name.size < 1
477 error_msg = "target domain name nil or empty"
478 raise StandardError, error_msg
481 error_msg = "target domain relative length is greater than 1"
482 raise StandardError, error_msg
485 @i_e_value = i_e_value
490 attr_reader :name, :i_e_value, :abs_len, :rel_len, :position