2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanMultiDomainExtractor class
4 # Copyright:: Copyright (C) 2017 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # Last modified: 2017/02/20
9 require 'lib/evo/util/constants'
10 require 'lib/evo/msa/msa_factory'
11 require 'lib/evo/io/msa_io'
12 require 'lib/evo/io/writer/fasta_writer'
13 require 'lib/evo/io/parser/fasta_parser'
14 require 'lib/evo/io/parser/hmmscan_parser'
17 class HmmscanMultiDomainExtractor
20 @non_passsing_domain_architectures = Hash.new
23 # raises ArgumentError, IOError, StandardError
37 Util.check_file_for_readability( hmmscan_output )
38 Util.check_file_for_readability( fasta_sequence_file )
39 Util.check_file_for_writability( outfile + ".fasta" )
40 Util.check_file_for_writability( passed_seqs_outfile )
41 Util.check_file_for_writability( failed_seqs_outfile )
44 factory = MsaFactory.new()
45 in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
47 if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
48 error_msg = "could not find fasta sequences in " + fasta_sequence_file
49 raise IOError, error_msg
56 passed_multi_seqs = Msa.new
58 ld = Constants::LINE_DELIMITER
60 domain_pass_counter = 0
61 domain_fail_counter = 0
62 passing_domains_per_protein = 0
63 proteins_with_failing_domains = 0
64 domain_not_present_counter = 0
66 max_domain_copy_number_per_protein = -1
67 max_domain_copy_number_sequence = ""
68 passing_target_length_sum = 0
69 overall_target_length_sum = 0
70 overall_target_length_min = 10000000
71 overall_target_length_max = -1
72 passing_target_length_min = 10000000
73 passing_target_length_max = -1
75 overall_target_ie_min = 10000000
76 overall_target_ie_max = -1
77 passing_target_ie_min = 10000000
78 passing_target_ie_max = -1
82 hmmscan_parser = HmmscanParser.new( hmmscan_output )
83 results = hmmscan_parser.parse
86 # Import: if multiple copies of same domain, threshold need to be same!
87 target_domain_ary = Array.new
88 target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.6, 1 ))
89 target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.6, 1 ))
90 # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 ))
91 # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 ))
93 #target_domain_ary.push(TargetDomain.new('BH4', 0.1, -1, 0.5, 0 ))
94 #target_domain_ary.push(TargetDomain.new('Bcl-2', 0.1, -1, 0.5, 1 ))
95 # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 ))
97 # target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 ))
98 # target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 1 ))
100 #target_domain_ary.push(TargetDomain.new('Chordopox_A33R', 1000, -1, 0.1 ))
102 target_domains = Hash.new
104 target_domain_architecure = ""
106 target_domain_ary.each do |target_domain|
107 target_domains[target_domain.name] = target_domain
108 if target_domain_architecure.length > 0
109 target_domain_architecure += ">"
111 target_domain_architecure += target_domain.name
114 target_domain_architecure.freeze
116 puts target_domain_architecure
118 target_domain_names = Set.new
120 target_domains.each_key {|key| target_domain_names.add( key ) }
122 prev_query_seq_name = nil
123 domains_in_query_seq = Array.new
124 passing_sequences = Array.new
127 out_domain_msas = Hash.new
128 out_domain_architecture_msa = Msa.new
129 results.each do |hmmscan_result|
130 if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name )
131 if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, target_domain_architecure)
132 passing_sequences.push(domains_in_query_seq)
134 domains_in_query_seq = Array.new
137 prev_query_seq_name = hmmscan_result.query
138 domains_in_query_seq.push(hmmscan_result)
141 if prev_query_seq_name != nil
143 if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, target_domain_architecure)
144 passing_sequences.push(domains_in_query_seq)
148 out_domain_msas.keys.sort.each do |domain_name|
149 puts "writing #{domain_name}"
150 write_msa( out_domain_msas[domain_name], domain_name + ".fasta" )
153 puts "writing target_domain_architecure"
154 write_msa( out_domain_architecture_msa, "target_domain_architecure" + ".fasta" )
156 passing_sequences.each do | domains |
158 seq = domains[0].query
161 if passed_seqs.find_by_name_start( seq, true ).length < 1
162 add_sequence( seq, in_msa, passed_multi_seqs )
164 error_msg = "this should not have happened"
165 raise StandardError, error_msg
169 domains.each do | domain |
170 #puts domain.query + ": " + domain.model
175 puts @non_passsing_domain_architectures
178 puts "total sequences : " + total_sequences.to_s
179 puts "passing sequences: " + passing_sequences.size.to_s
181 # write_msa( out_msa, outfile + ".fasta" )
182 # write_msa( passed_multi_seqs, passed_seqs_outfile )
183 # write_msa( failed_seqs, failed_seqs_outfile )
186 log << "passing target domains : " + domain_pass_counter.to_s + ld
187 log << "failing target domains : " + domain_fail_counter.to_s + ld
188 log << "proteins in sequence (fasta) file : " + in_msa.get_number_of_seqs.to_s + ld
189 log << "proteins in hmmscan outputfile : " + protein_counter.to_s + ld
190 log << "proteins with passing target domain(s) : " + passed_seqs.get_number_of_seqs.to_s + ld
191 log << "proteins with no passing target domain : " + proteins_with_failing_domains.to_s + ld
192 log << "proteins with no target domain : " + domain_not_present_counter.to_s + ld
196 return domain_pass_counter
202 # domains_in_query_seq: Array of HmmscanResult
203 # target_domain_names: Set of String
204 # target_domains: Hash String->TargetDomain
205 # target_domain_architecture: String
206 def checkit2(domains_in_query_seq,
211 out_domain_architecture_msa,
212 target_domain_architecture = nil)
214 domain_names_in_query_seq = Set.new
215 domains_in_query_seq.each do |domain|
216 domain_names_in_query_seq.add(domain.model)
218 if (target_domain_names.subset?(domain_names_in_query_seq))
220 passed_domains = Array.new
221 passed_domains_counts = Hash.new
223 domains_in_query_seq.each do |domain|
224 if target_domains.has_key?(domain.model)
225 target_domain = target_domains[domain.model]
226 if (target_domain.i_e_value != nil) && (target_domain.i_e_value >= 0)
227 if domain.i_e_value > target_domain.i_e_value
231 if (target_domain.abs_len != nil) && (target_domain.abs_len > 0)
232 length = 1 + domain.env_to - domain.env_from
233 if length < target_domain.abs_len
237 if (target_domain.rel_len != nil) && (target_domain.rel_len > 0)
238 length = 1 + domain.env_to - domain.env_from
239 if length < (target_domain.rel_len * domain.tlen)
244 # For domain architecture comparison
245 if target_domain_architecture != nil
246 passed_domains.push(domain)
249 if (passed_domains_counts.key?(domain.model))
250 passed_domains_counts[domain.model] = passed_domains_counts[domain.model] + 1
252 passed_domains_counts[domain.model] = 1
255 if (@passed.key?(domain.model))
256 @passed[domain.model] = @passed[domain.model] + 1
258 @passed[domain.model] = 1
266 passed_domains.sort! { |a,b| a.ali_from <=> b.ali_from }
267 # Compare architectures
268 if target_domain_architecture != nil
269 if !compareArchitectures(target_domain_architecture, passed_domains)
274 domain_counter = Hash.new
276 min_env_from = 10000000
278 min_env_to = 10000000
283 passed_domains.each do |domain|
284 domain_name = domain.model
286 unless out_domain_msas.has_key? domain_name
287 out_domain_msas[ domain_name ] = Msa.new
290 if domain_counter.key?(domain_name)
291 domain_counter[domain_name] = domain_counter[domain_name] + 1
293 domain_counter[domain_name] = 1
297 query_seq = domain.query
299 elsif query_seq != domain.query
300 error_msg = "seq names do not match: this should not have happened"
301 raise StandardError, error_msg
304 extract_domain( query_seq,
305 domain_counter[domain_name],
306 passed_domains_counts[domain_name],
310 out_domain_msas[ domain_name ] )
312 if domain.env_from < min_env_from
313 min_env_from = domain.env_from
315 if domain.env_from > max_env_from
316 max_env_from = domain.env_from
318 if domain.env_to < min_env_to
319 min_env_to = domain.env_to
321 if domain.env_to > max_env_to
322 max_env_to = domain.env_to
326 puts "env from: #{min_env_from} - #{max_env_from}"
327 puts "env to : #{min_env_to} - #{max_env_to}"
329 extract_domain( query_seq,
335 out_domain_architecture_msa )
337 puts passed_domains_counts
341 # passed_domains needs to be sorted!
342 def compareArchitectures(target_domain_architecture, passed_domains, strict = false)
343 domain_architecture = ""
344 passed_domains.each do |domain|
345 if domain_architecture.length > 0
346 domain_architecture += ">"
348 domain_architecture += domain.model
352 pass = (target_domain_architecture == domain_architecture)
354 pass = (domain_architecture.index(target_domain_architecture) != nil)
357 if (@non_passsing_domain_architectures.key?(domain_architecture))
358 @non_passsing_domain_architectures[domain_architecture] = @non_passsing_domain_architectures[domain_architecture] + 1
360 @non_passsing_domain_architectures[domain_architecture] = 1
366 def write_msa( msa, filename )
368 w = FastaWriter.new()
369 w.set_line_width( 60 )
371 File.delete(filename) if File.exist?(filename) #TODO remove me
373 io.write_to_file( msa, filename, w )
375 error_msg = "could not write to \"" + filename + "\""
376 raise IOError, error_msg
380 def add_sequence( sequence_name, in_msa, add_to_msa )
381 seqs = in_msa.find_by_name_start( sequence_name, true )
382 if ( seqs.length < 1 )
383 error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
384 raise StandardError, error_msg
386 if ( seqs.length > 1 )
387 error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
388 raise StandardError, error_msg
390 seq = in_msa.get_sequence( seqs[ 0 ] )
391 add_to_msa.add_sequence( seq )
394 def extract_domain( seq_name,
401 if number < 1 || out_of < 1 || number > out_of
402 error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s
403 raise StandardError, error_msg
405 if seq_from < 1 || seq_to < 1 || seq_from >= seq_to
406 error_msg = "impossible: seq-from=" + seq_from.to_s + ", seq-to=" + seq_to.to_s
407 raise StandardError, error_msg
409 seqs = in_msa.find_by_name_start(seq_name, true)
411 error_msg = "sequence \"" + seq_name + "\" not found in sequence file"
412 raise IOError, error_msg
415 error_msg = "sequence \"" + seq_name + "\" not unique in sequence file"
416 raise IOError, error_msg
418 # hmmscan is 1 based, whereas sequences are 0 bases in this package.
419 seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
421 orig_name = seq.get_name
423 seq.set_name( orig_name.split[ 0 ] )
426 seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
429 out_msa.add_sequence( seq )
432 def is_ignorable?( line )
433 return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
436 end # class HmmscanDomainExtractor
439 def initialize( name, i_e_value, abs_len, rel_len, position )
441 @i_e_value = i_e_value
443 @percent_len = rel_len
446 attr_reader :name, :i_e_value, :abs_len, :rel_len, :position