in progress...
[jalview.git] / forester / ruby / evoruby / lib / evo / io / parser / hmmscan_multi_domain_extractor.rb
1 #
2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanMultiDomainExtractor class
3 #
4 # Copyright::    Copyright (C) 2017 Christian M. Zmasek
5 # License::      GNU Lesser General Public License (LGPL)
6 #
7 # Last modified: 2017/02/20
8
9 require 'lib/evo/util/constants'
10 require 'lib/evo/msa/msa_factory'
11 require 'lib/evo/io/msa_io'
12 require 'lib/evo/io/writer/fasta_writer'
13 require 'lib/evo/io/parser/fasta_parser'
14 require 'lib/evo/io/parser/hmmscan_parser'
15
16 module Evoruby
17   class HmmscanMultiDomainExtractor
18
19     OUTPUT_ID = 'MDSX'
20
21     PASSING_FL_SEQS_SUFFIX   = "_#{OUTPUT_ID}_passing_full_length_seqs.fasta"
22     FAILING_FL_SEQS_SUFFIX   = "_#{OUTPUT_ID}_failing_full_length_seqs.fasta"
23     TARGET_DA_SUFFIX         = "_#{OUTPUT_ID}_target_da.fasta"
24     CONCAT_TARGET_DOM_SUFFIX = "_#{OUTPUT_ID}_concat_target_dom.fasta"
25     def initialize
26       @passed = Hash.new
27       @non_passsing_domain_architectures = Hash.new
28     end
29
30     # raises ArgumentError, IOError, StandardError
31     def parse( target_da,
32       hmmscan_output,
33       fasta_sequence_file,
34       outfile_base,
35       log_str )
36
37       passing_fl_seqs_outfile   = outfile_base + PASSING_FL_SEQS_SUFFIX
38       failing_fl_seqs_outfile   = outfile_base + FAILING_FL_SEQS_SUFFIX
39       target_da_outfile         = outfile_base + TARGET_DA_SUFFIX
40       concat_target_dom_outfile = outfile_base + CONCAT_TARGET_DOM_SUFFIX
41
42       Util.check_file_for_readability( hmmscan_output )
43       Util.check_file_for_readability( fasta_sequence_file )
44       Util.check_file_for_writability( passing_fl_seqs_outfile )
45       Util.check_file_for_writability( failing_fl_seqs_outfile )
46       Util.check_file_for_writability( target_da_outfile )
47       Util.check_file_for_writability( concat_target_dom_outfile )
48
49       in_msa = nil
50       factory = MsaFactory.new()
51       in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
52
53       if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
54         error_msg = "could not find fasta sequences in " + fasta_sequence_file
55         raise IOError, error_msg
56       end
57
58       out_msa = Msa.new
59
60       failed_seqs_msa = Msa.new
61       passed_seqs_msa = Msa.new
62       # passed_multi_seqs_msa = Msa.new
63
64       ld = Constants::LINE_DELIMITER
65
66       domain_pass_counter                = 0
67       domain_fail_counter                = 0
68       passing_domains_per_protein        = 0
69       proteins_with_failing_domains      = 0
70       domain_not_present_counter         = 0
71       protein_counter                    = 1
72       max_domain_copy_number_per_protein = -1
73       max_domain_copy_number_sequence    = ""
74       passing_target_length_sum          = 0
75       overall_target_length_sum          = 0
76       overall_target_length_min          = 10000000
77       overall_target_length_max          = -1
78       passing_target_length_min          = 10000000
79       passing_target_length_max          = -1
80
81       overall_target_ie_min          = 10000000
82       overall_target_ie_max          = -1
83       passing_target_ie_min          = 10000000
84       passing_target_ie_max          = -1
85
86       hmmscan_parser = HmmscanParser.new( hmmscan_output )
87       results = hmmscan_parser.parse
88
89       ####
90       # Import: if multiple copies of same domain, threshold need to be same!
91       target_domain_ary = Array.new
92
93       target_domain_ary.push(TargetDomain.new('DNA_pol_B_exo1', 1e-6, -1, 0.6, 0 ))
94       target_domain_ary.push(TargetDomain.new('DNA_pol_B', 1e-6, -1, 0.6, 1 ))
95
96       # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.6, 0 ))
97       # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.6, 1 ))
98       # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 ))
99       # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 ))
100
101       #target_domain_ary.push(TargetDomain.new('BH4', 0.1, -1, 0.5, 0 ))
102       #target_domain_ary.push(TargetDomain.new('Bcl-2', 0.1, -1, 0.5, 1 ))
103       # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 ))
104
105       #  target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 ))
106       #  target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 1 ))
107
108       #target_domain_ary.push(TargetDomain.new('Chordopox_A33R', 1000, -1, 0.1 ))
109
110       target_domains = Hash.new
111
112       target_domain_architecure = ""
113
114       target_domain_ary.each do |target_domain|
115         target_domains[target_domain.name] = target_domain
116         if target_domain_architecure.length > 0
117           target_domain_architecure += ">"
118         end
119         target_domain_architecure += target_domain.name
120       end
121
122       target_domain_architecure.freeze
123
124       puts 'Target domain architecture: ' + target_domain_architecure
125
126       target_domain_names = Set.new
127
128       target_domains.each_key {|key| target_domain_names.add( key ) }
129
130       prev_query_seq_name = nil
131       domains_in_query_seq = Array.new
132       passing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq.
133       failing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq.
134       total_sequences = 0
135       @passed = Hash.new
136       out_domain_msas = Hash.new
137       out_domain_architecture_msa = Msa.new
138       out_concatenated_domains_msa = Msa.new
139       results.each do |hmmscan_result|
140         if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name )
141           if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_contactenated_domains_msa, target_domain_architecure)
142             passing_sequences.push(domains_in_query_seq)
143           else
144             failing_sequences.push(domains_in_query_seq)
145           end
146
147           domains_in_query_seq = Array.new
148           total_sequences += 1
149         end
150         prev_query_seq_name = hmmscan_result.query
151         domains_in_query_seq.push(hmmscan_result)
152       end # each result
153
154       if prev_query_seq_name != nil
155         total_sequences += 1
156         if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_contactenated_domains_msa, target_domain_architecure)
157           passing_sequences.push(domains_in_query_seq)
158         else
159           failing_sequences.push(domains_in_query_seq)
160         end
161       end
162
163       out_domain_msas.keys.sort.each do |domain_name|
164         puts "writing #{domain_name}"
165         write_msa( out_domain_msas[domain_name], domain_name + ".fasta" )
166       end
167
168       puts "writing target_domain_architecure"
169       write_msa( out_domain_architecture_msa, target_da_outfile )
170
171       puts "writing contactenated_domains_msa"
172       write_msa( out_concatenated_domains_msa, concat_target_dom_outfile )
173
174       passing_sequences.each do | domains |
175         query_name = domains[0].query
176         if passed_seqs_msa.find_by_name_start( query_name, true ).length < 1
177           add_sequence( query_name, in_msa, passed_seqs_msa )
178         else
179           error_msg = "this should not have happened"
180           raise StandardError, error_msg
181         end
182       end
183
184       failing_sequences.each do | domains |
185         query_name = domains[0].query
186         if failed_seqs_msa.find_by_name_start( query_name, true ).length < 1
187           add_sequence( query_name, in_msa, failed_seqs_msa )
188         else
189           error_msg = "this should not have happened"
190           raise StandardError, error_msg
191         end
192       end
193
194       puts
195
196       puts 'Non passing domain architectures containing target domain(s):'
197       @non_passsing_domain_architectures = @non_passsing_domain_architectures.sort{|a, b|a<=>b}.to_h
198       @non_passsing_domain_architectures.each do |da, count|
199         puts da + ': ' + count.to_s
200       end
201
202       puts
203
204       puts 'Passing target domain counts:'
205       @passed = @passed.sort{|a, b|a<=>b}.to_h
206       @passed.each do |dom, count|
207         puts dom + ': ' + count.to_s
208       end
209
210       puts
211
212       puts "total sequences  : " + total_sequences.to_s
213       puts "passing sequences: " + passing_sequences.size.to_s
214
215       write_msa( passed_seqs_msa, passing_fl_seqs_outfile )
216       write_msa( failed_seqs_msa, failing_fl_seqs_outfile )
217
218       log_str << ld
219       log_str << "passing target domains                       : " + domain_pass_counter.to_s + ld
220       log_str << "failing target domains                       : " + domain_fail_counter.to_s + ld
221       log_str << "proteins in sequence (fasta) file            : " + in_msa.get_number_of_seqs.to_s + ld
222       log_str << "proteins in hmmscan outputfile               : " + protein_counter.to_s + ld
223       log_str << "proteins with passing target domain(s)       : " + passed_seqs_msa.get_number_of_seqs.to_s + ld
224       log_str << "proteins with no passing target domain       : " + proteins_with_failing_domains.to_s + ld
225       log_str << "proteins with no target domain               : " + domain_not_present_counter.to_s + ld
226
227       log_str << ld
228
229       return domain_pass_counter
230
231     end # parse
232
233     private
234
235     # domains_in_query_seq: Array of HmmscanResult
236     # target_domain_names: Set of String
237     # target_domains: Hash String->TargetDomain
238     # target_domain_architecture: String
239     def checkit2(domains_in_query_seq,
240       target_domain_names,
241       target_domains,
242       in_msa,
243       out_domain_msas,
244       out_domain_architecture_msa,
245       out_contactenated_domains_msa,
246       target_domain_architecture)
247
248       domain_names_in_query_seq = Set.new
249       domains_in_query_seq.each do |domain|
250         domain_names_in_query_seq.add(domain.model)
251       end
252       if (target_domain_names.subset?(domain_names_in_query_seq))
253
254         passed_domains = Array.new
255         passed_domains_counts = Hash.new
256
257         domains_in_query_seq.each do |domain|
258           if target_domains.has_key?(domain.model)
259             target_domain = target_domains[domain.model]
260
261             if (target_domain.i_e_value != nil) && (target_domain.i_e_value >= 0)
262               if domain.i_e_value > target_domain.i_e_value
263                 next
264               end
265             end
266             if (target_domain.abs_len != nil) && (target_domain.abs_len > 0)
267               length = 1 + domain.env_to - domain.env_from
268               if length < target_domain.abs_len
269                 next
270               end
271             end
272             if (target_domain.rel_len != nil) && (target_domain.rel_len > 0)
273               length = 1 + domain.env_to - domain.env_from
274               #  puts (target_domain.rel_len * domain.tlen)
275               if length < (target_domain.rel_len * domain.tlen)
276                 next
277               end
278             end
279
280             passed_domains.push(domain)
281
282             if (passed_domains_counts.key?(domain.model))
283               passed_domains_counts[domain.model] = passed_domains_counts[domain.model] + 1
284             else
285               passed_domains_counts[domain.model] = 1
286             end
287
288             if (@passed.key?(domain.model))
289               @passed[domain.model] = @passed[domain.model] + 1
290             else
291               @passed[domain.model] = 1
292             end
293           end # if target_domains.has_key?(domain.model)
294         end # domains_in_query_seq.each do |domain|
295       else
296         return false
297       end
298
299       passed_domains.sort! { |a,b| a.ali_from <=> b.ali_from }
300       # Compare architectures
301       if !compareArchitectures(target_domain_architecture, passed_domains)
302         return false
303       end
304
305       domain_counter = Hash.new
306
307       min_env_from = 10000000
308       max_env_from = 0
309       min_env_to = 10000000
310       max_env_to = 0
311
312       query_seq = nil
313
314       concatenated_domains = ''
315
316       passed_domains.each do |domain|
317         domain_name = domain.model
318
319         unless out_domain_msas.has_key? domain_name
320           out_domain_msas[ domain_name ] = Msa.new
321         end
322
323         if domain_counter.key?(domain_name)
324           domain_counter[domain_name] = domain_counter[domain_name] + 1
325         else
326           domain_counter[domain_name] = 1
327         end
328
329         if query_seq == nil
330           query_seq = domain.query
331           query_seq.freeze
332         elsif query_seq != domain.query
333           error_msg = "seq names do not match: this should not have happened"
334           raise StandardError, error_msg
335         end
336
337         extracted_domain_seq = extract_domain( query_seq,
338         domain_counter[domain_name],
339         passed_domains_counts[domain_name],
340         domain.env_from,
341         domain.env_to,
342         in_msa,
343         out_domain_msas[ domain_name ] )
344
345         concatenated_domains += extracted_domain_seq
346
347         if domain.env_from < min_env_from
348           min_env_from = domain.env_from
349         end
350         if domain.env_from > max_env_from
351           max_env_from = domain.env_from
352         end
353         if domain.env_to < min_env_to
354           min_env_to = domain.env_to
355         end
356         if domain.env_to > max_env_to
357           max_env_to = domain.env_to
358         end
359       end
360
361       #puts "env from: #{min_env_from} - #{max_env_from}"
362       #puts "env to  : #{min_env_to} - #{max_env_to}"
363
364       extract_domain( query_seq,
365       1,
366       1,
367       min_env_from,
368       max_env_to,
369       in_msa,
370       out_domain_architecture_msa )
371
372       puts passed_domains_counts
373
374       out_contactenated_domains_msa.add( query_seq, concatenated_domains)
375
376       true
377     end
378
379     # passed_domains needs to be sorted!
380     def compareArchitectures(target_domain_architecture, passed_domains, strict = false)
381       domain_architecture = ""
382       passed_domains.each do |domain|
383         if domain_architecture.length > 0
384           domain_architecture += ">"
385         end
386         domain_architecture += domain.model
387       end
388       pass = false
389       if strict
390         pass = (target_domain_architecture == domain_architecture)
391       else
392         pass = (domain_architecture.index(target_domain_architecture) != nil)
393       end
394       if !pass
395         if (@non_passsing_domain_architectures.key?(domain_architecture))
396           @non_passsing_domain_architectures[domain_architecture] = @non_passsing_domain_architectures[domain_architecture] + 1
397         else
398           @non_passsing_domain_architectures[domain_architecture] = 1
399         end
400       end
401       return pass
402     end
403
404     def write_msa( msa, filename )
405       io = MsaIO.new()
406       w = FastaWriter.new()
407       w.set_line_width( 60 )
408       w.clean( true )
409       File.delete(filename) if File.exist?(filename) #TODO remove me
410       begin
411         io.write_to_file( msa, filename, w )
412       rescue Exception
413         error_msg = "could not write to \"" + filename + "\""
414         raise IOError, error_msg
415       end
416     end
417
418     def add_sequence( sequence_name, in_msa, add_to_msa )
419       seqs = in_msa.find_by_name_start( sequence_name, true )
420       if ( seqs.length < 1 )
421         error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
422         raise StandardError, error_msg
423       end
424       if ( seqs.length > 1 )
425         error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
426         raise StandardError, error_msg
427       end
428       seq = in_msa.get_sequence( seqs[ 0 ] )
429       add_to_msa.add_sequence( seq )
430     end
431
432     def extract_domain( seq_name,
433       number,
434       out_of,
435       seq_from,
436       seq_to,
437       in_msa,
438       out_msa)
439       if number < 1 || out_of < 1 || number > out_of
440         error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s
441         raise StandardError, error_msg
442       end
443       if seq_from < 1 || seq_to < 1 || seq_from >= seq_to
444         error_msg = "impossible: seq-from=" + seq_from.to_s + ", seq-to=" + seq_to.to_s
445         raise StandardError, error_msg
446       end
447       seqs = in_msa.find_by_name_start(seq_name, true)
448       if seqs.length < 1
449         error_msg = "sequence \"" + seq_name + "\" not found in sequence file"
450         raise IOError, error_msg
451       end
452       if seqs.length > 1
453         error_msg = "sequence \"" + seq_name + "\" not unique in sequence file"
454         raise IOError, error_msg
455       end
456       # hmmscan is 1 based, whereas sequences are 0 bases in this package.
457       seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
458
459       orig_name = seq.get_name
460
461       seq.set_name( orig_name.split[ 0 ] )
462
463       if out_of != 1
464         seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
465       end
466
467       out_msa.add_sequence( seq )
468
469       seq.get_sequence_as_string
470     end
471
472   end # class HmmscanMultiDomainExtractor
473
474   class TargetDomain
475     def initialize( name, i_e_value, abs_len, rel_len, position )
476       if (name == nil) || name.size < 1
477         error_msg = "target domain name nil or empty"
478         raise StandardError, error_msg
479       end
480       if rel_len > 1
481         error_msg = "target domain relative length is greater than 1"
482         raise StandardError, error_msg
483       end
484       @name = name
485       @i_e_value = i_e_value
486       @abs_len = abs_len
487       @rel_len = rel_len
488       @position = position
489     end
490     attr_reader :name, :i_e_value, :abs_len, :rel_len, :position
491   end
492
493 end # module Evoruby
494