2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanMultiDomainExtractor class
4 # Copyright:: Copyright (C) 2017 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # Last modified: 2017/02/20
9 require 'lib/evo/util/constants'
10 require 'lib/evo/msa/msa_factory'
11 require 'lib/evo/io/msa_io'
12 require 'lib/evo/io/writer/fasta_writer'
13 require 'lib/evo/io/parser/fasta_parser'
14 require 'lib/evo/io/parser/hmmscan_parser'
17 class HmmscanMultiDomainExtractor
20 @non_passsing_domain_architectures = Hash.new
23 # raises ArgumentError, IOError, StandardError
37 Util.check_file_for_readability( hmmscan_output )
38 Util.check_file_for_readability( fasta_sequence_file )
39 Util.check_file_for_writability( outfile + ".fasta" )
40 Util.check_file_for_writability( passed_seqs_outfile )
41 Util.check_file_for_writability( failed_seqs_outfile )
44 factory = MsaFactory.new()
45 in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
47 if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
48 error_msg = "could not find fasta sequences in " + fasta_sequence_file
49 raise IOError, error_msg
56 passed_multi_seqs = Msa.new
58 ld = Constants::LINE_DELIMITER
60 domain_pass_counter = 0
61 domain_fail_counter = 0
62 passing_domains_per_protein = 0
63 proteins_with_failing_domains = 0
64 domain_not_present_counter = 0
66 max_domain_copy_number_per_protein = -1
67 max_domain_copy_number_sequence = ""
68 passing_target_length_sum = 0
69 overall_target_length_sum = 0
70 overall_target_length_min = 10000000
71 overall_target_length_max = -1
72 passing_target_length_min = 10000000
73 passing_target_length_max = -1
75 overall_target_ie_min = 10000000
76 overall_target_ie_max = -1
77 passing_target_ie_min = 10000000
78 passing_target_ie_max = -1
80 hmmscan_parser = HmmscanParser.new( hmmscan_output )
81 results = hmmscan_parser.parse
84 # Import: if multiple copies of same domain, threshold need to be same!
85 target_domain_ary = Array.new
86 target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.6, 1 ))
87 target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.6, 1 ))
88 # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 ))
89 # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 ))
91 #target_domain_ary.push(TargetDomain.new('BH4', 0.1, -1, 0.5, 0 ))
92 #target_domain_ary.push(TargetDomain.new('Bcl-2', 0.1, -1, 0.5, 1 ))
93 # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 ))
95 # target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 ))
96 # target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 1 ))
98 #target_domain_ary.push(TargetDomain.new('Chordopox_A33R', 1000, -1, 0.1 ))
100 target_domains = Hash.new
102 target_domain_architecure = ""
104 target_domain_ary.each do |target_domain|
105 target_domains[target_domain.name] = target_domain
106 if target_domain_architecure.length > 0
107 target_domain_architecure += ">"
109 target_domain_architecure += target_domain.name
112 target_domain_architecure.freeze
114 puts 'Target domain architecture: ' + target_domain_architecure
116 target_domain_names = Set.new
118 target_domains.each_key {|key| target_domain_names.add( key ) }
120 prev_query_seq_name = nil
121 domains_in_query_seq = Array.new
122 passing_sequences = Array.new
125 out_domain_msas = Hash.new
126 out_domain_architecture_msa = Msa.new
127 results.each do |hmmscan_result|
128 if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name )
129 if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, target_domain_architecure)
130 passing_sequences.push(domains_in_query_seq)
132 domains_in_query_seq = Array.new
135 prev_query_seq_name = hmmscan_result.query
136 domains_in_query_seq.push(hmmscan_result)
139 if prev_query_seq_name != nil
141 if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, target_domain_architecure)
142 passing_sequences.push(domains_in_query_seq)
146 out_domain_msas.keys.sort.each do |domain_name|
147 puts "writing #{domain_name}"
148 write_msa( out_domain_msas[domain_name], domain_name + ".fasta" )
151 puts "writing target_domain_architecure"
152 write_msa( out_domain_architecture_msa, "target_domain_architecure" + ".fasta" )
154 passing_sequences.each do | domains |
156 seq = domains[0].query
159 if passed_seqs.find_by_name_start( seq, true ).length < 1
160 add_sequence( seq, in_msa, passed_multi_seqs )
162 error_msg = "this should not have happened"
163 raise StandardError, error_msg
166 domains.each do | domain |
167 #puts domain.query + ": " + domain.model
174 puts 'Non passing domain architectures:'
175 @non_passsing_domain_architectures = @non_passsing_domain_architectures.sort{|a, b|a<=>b}.to_h
176 @non_passsing_domain_architectures.each do |da, count|
177 puts da + ': ' + count.to_s
182 puts 'Passing domain counts:'
183 @passed = @passed.sort{|a, b|a<=>b}.to_h
184 @passed.each do |dom, count|
185 puts dom + ': ' + count.to_s
190 puts "total sequences : " + total_sequences.to_s
191 puts "passing sequences: " + passing_sequences.size.to_s
193 # write_msa( out_msa, outfile + ".fasta" )
194 # write_msa( passed_multi_seqs, passed_seqs_outfile )
195 # write_msa( failed_seqs, failed_seqs_outfile )
198 log << "passing target domains : " + domain_pass_counter.to_s + ld
199 log << "failing target domains : " + domain_fail_counter.to_s + ld
200 log << "proteins in sequence (fasta) file : " + in_msa.get_number_of_seqs.to_s + ld
201 log << "proteins in hmmscan outputfile : " + protein_counter.to_s + ld
202 log << "proteins with passing target domain(s) : " + passed_seqs.get_number_of_seqs.to_s + ld
203 log << "proteins with no passing target domain : " + proteins_with_failing_domains.to_s + ld
204 log << "proteins with no target domain : " + domain_not_present_counter.to_s + ld
208 return domain_pass_counter
214 # domains_in_query_seq: Array of HmmscanResult
215 # target_domain_names: Set of String
216 # target_domains: Hash String->TargetDomain
217 # target_domain_architecture: String
218 def checkit2(domains_in_query_seq,
223 out_domain_architecture_msa,
224 target_domain_architecture)
226 domain_names_in_query_seq = Set.new
227 domains_in_query_seq.each do |domain|
228 domain_names_in_query_seq.add(domain.model)
230 if (target_domain_names.subset?(domain_names_in_query_seq))
232 passed_domains = Array.new
233 passed_domains_counts = Hash.new
235 domains_in_query_seq.each do |domain|
236 if target_domains.has_key?(domain.model)
237 target_domain = target_domains[domain.model]
239 if (target_domain.i_e_value != nil) && (target_domain.i_e_value >= 0)
240 if domain.i_e_value > target_domain.i_e_value
244 if (target_domain.abs_len != nil) && (target_domain.abs_len > 0)
245 length = 1 + domain.env_to - domain.env_from
246 if length < target_domain.abs_len
250 if (target_domain.rel_len != nil) && (target_domain.rel_len > 0)
251 length = 1 + domain.env_to - domain.env_from
252 puts (target_domain.rel_len * domain.tlen)
253 if length < (target_domain.rel_len * domain.tlen)
258 passed_domains.push(domain)
260 if (passed_domains_counts.key?(domain.model))
261 passed_domains_counts[domain.model] = passed_domains_counts[domain.model] + 1
263 passed_domains_counts[domain.model] = 1
266 if (@passed.key?(domain.model))
267 @passed[domain.model] = @passed[domain.model] + 1
269 @passed[domain.model] = 1
271 end # if target_domains.has_key?(domain.model)
272 end # domains_in_query_seq.each do |domain|
277 passed_domains.sort! { |a,b| a.ali_from <=> b.ali_from }
278 # Compare architectures
279 if !compareArchitectures(target_domain_architecture, passed_domains)
283 domain_counter = Hash.new
285 min_env_from = 10000000
287 min_env_to = 10000000
292 concatenated_domains = ''
294 passed_domains.each do |domain|
295 domain_name = domain.model
297 unless out_domain_msas.has_key? domain_name
298 out_domain_msas[ domain_name ] = Msa.new
301 if domain_counter.key?(domain_name)
302 domain_counter[domain_name] = domain_counter[domain_name] + 1
304 domain_counter[domain_name] = 1
308 query_seq = domain.query
310 elsif query_seq != domain.query
311 error_msg = "seq names do not match: this should not have happened"
312 raise StandardError, error_msg
314 puts "query: " + query_seq
316 extracted_domain_seq = extract_domain( query_seq,
317 domain_counter[domain_name],
318 passed_domains_counts[domain_name],
322 out_domain_msas[ domain_name ] )
324 concatenated_domains += extracted_domain_seq
326 if domain.env_from < min_env_from
327 min_env_from = domain.env_from
329 if domain.env_from > max_env_from
330 max_env_from = domain.env_from
332 if domain.env_to < min_env_to
333 min_env_to = domain.env_to
335 if domain.env_to > max_env_to
336 max_env_to = domain.env_to
340 puts "env from: #{min_env_from} - #{max_env_from}"
341 puts "env to : #{min_env_to} - #{max_env_to}"
343 extract_domain( query_seq,
349 out_domain_architecture_msa )
351 puts passed_domains_counts
352 puts concatenated_domains
356 # passed_domains needs to be sorted!
357 def compareArchitectures(target_domain_architecture, passed_domains, strict = false)
358 domain_architecture = ""
359 passed_domains.each do |domain|
360 if domain_architecture.length > 0
361 domain_architecture += ">"
363 domain_architecture += domain.model
367 pass = (target_domain_architecture == domain_architecture)
369 pass = (domain_architecture.index(target_domain_architecture) != nil)
372 if (@non_passsing_domain_architectures.key?(domain_architecture))
373 @non_passsing_domain_architectures[domain_architecture] = @non_passsing_domain_architectures[domain_architecture] + 1
375 @non_passsing_domain_architectures[domain_architecture] = 1
381 def write_msa( msa, filename )
383 w = FastaWriter.new()
384 w.set_line_width( 60 )
386 File.delete(filename) if File.exist?(filename) #TODO remove me
388 io.write_to_file( msa, filename, w )
390 error_msg = "could not write to \"" + filename + "\""
391 raise IOError, error_msg
395 def add_sequence( sequence_name, in_msa, add_to_msa )
396 seqs = in_msa.find_by_name_start( sequence_name, true )
397 if ( seqs.length < 1 )
398 error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
399 raise StandardError, error_msg
401 if ( seqs.length > 1 )
402 error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
403 raise StandardError, error_msg
405 seq = in_msa.get_sequence( seqs[ 0 ] )
406 add_to_msa.add_sequence( seq )
409 def extract_domain( seq_name,
416 if number < 1 || out_of < 1 || number > out_of
417 error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s
418 raise StandardError, error_msg
420 if seq_from < 1 || seq_to < 1 || seq_from >= seq_to
421 error_msg = "impossible: seq-from=" + seq_from.to_s + ", seq-to=" + seq_to.to_s
422 raise StandardError, error_msg
424 seqs = in_msa.find_by_name_start(seq_name, true)
426 error_msg = "sequence \"" + seq_name + "\" not found in sequence file"
427 raise IOError, error_msg
430 error_msg = "sequence \"" + seq_name + "\" not unique in sequence file"
431 raise IOError, error_msg
433 # hmmscan is 1 based, whereas sequences are 0 bases in this package.
434 seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
436 orig_name = seq.get_name
438 seq.set_name( orig_name.split[ 0 ] )
441 seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
444 out_msa.add_sequence( seq )
446 seq.get_sequence_as_string
449 end # class HmmscanMultiDomainExtractor
452 def initialize( name, i_e_value, abs_len, rel_len, position )
453 if (name == nil) || name.size < 1
454 error_msg = "target domain name nil or empty"
455 raise StandardError, error_msg
458 error_msg = "target domain relative length is greater than 1"
459 raise StandardError, error_msg
462 @i_e_value = i_e_value
467 attr_reader :name, :i_e_value, :abs_len, :rel_len, :position