in progress...
[jalview.git] / forester / ruby / evoruby / lib / evo / io / parser / hmmscan_multi_domain_extractor.rb
1 #
2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanMultiDomainExtractor class
3 #
4 # Copyright::    Copyright (C) 2017 Christian M. Zmasek
5 # License::      GNU Lesser General Public License (LGPL)
6 #
7 # Last modified: 2017/02/20
8
9 require 'lib/evo/util/constants'
10 require 'lib/evo/msa/msa_factory'
11 require 'lib/evo/io/msa_io'
12 require 'lib/evo/io/writer/fasta_writer'
13 require 'lib/evo/io/parser/fasta_parser'
14 require 'lib/evo/io/parser/hmmscan_parser'
15
16 module Evoruby
17   class HmmscanMultiDomainExtractor
18     def initialize
19       @passed = Hash.new
20       @non_passsing_domain_architectures = Hash.new
21     end
22
23     # raises ArgumentError, IOError, StandardError
24     def parse( domain_id,
25       hmmscan_output,
26       fasta_sequence_file,
27       outfile,
28       passed_seqs_outfile,
29       failed_seqs_outfile,
30       e_value_threshold,
31       length_threshold,
32       add_position,
33       add_domain_number,
34       add_species,
35       log )
36
37       Util.check_file_for_readability( hmmscan_output )
38       Util.check_file_for_readability( fasta_sequence_file )
39       Util.check_file_for_writability( outfile + ".fasta" )
40       Util.check_file_for_writability( passed_seqs_outfile )
41       Util.check_file_for_writability( failed_seqs_outfile )
42
43       in_msa = nil
44       factory = MsaFactory.new()
45       in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
46
47       if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
48         error_msg = "could not find fasta sequences in " + fasta_sequence_file
49         raise IOError, error_msg
50       end
51
52       out_msa = Msa.new
53
54       failed_seqs = Msa.new
55       passed_seqs = Msa.new
56       passed_multi_seqs = Msa.new
57
58       ld = Constants::LINE_DELIMITER
59
60       domain_pass_counter                = 0
61       domain_fail_counter                = 0
62       passing_domains_per_protein        = 0
63       proteins_with_failing_domains      = 0
64       domain_not_present_counter         = 0
65       protein_counter                    = 1
66       max_domain_copy_number_per_protein = -1
67       max_domain_copy_number_sequence    = ""
68       passing_target_length_sum          = 0
69       overall_target_length_sum          = 0
70       overall_target_length_min          = 10000000
71       overall_target_length_max          = -1
72       passing_target_length_min          = 10000000
73       passing_target_length_max          = -1
74
75       overall_target_ie_min          = 10000000
76       overall_target_ie_max          = -1
77       passing_target_ie_min          = 10000000
78       passing_target_ie_max          = -1
79
80       hmmscan_parser = HmmscanParser.new( hmmscan_output )
81       results = hmmscan_parser.parse
82
83       ####
84       # Import: if multiple copies of same domain, threshold need to be same!
85       target_domain_ary = Array.new
86       target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.6, 1 ))
87       target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.6, 1 ))
88       # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 ))
89       # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 ))
90
91       #target_domain_ary.push(TargetDomain.new('BH4', 0.1, -1, 0.5, 0 ))
92       #target_domain_ary.push(TargetDomain.new('Bcl-2', 0.1, -1, 0.5, 1 ))
93       # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 ))
94
95       #  target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 ))
96       #  target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 1 ))
97
98       #target_domain_ary.push(TargetDomain.new('Chordopox_A33R', 1000, -1, 0.1 ))
99
100       target_domains = Hash.new
101
102       target_domain_architecure = ""
103
104       target_domain_ary.each do |target_domain|
105         target_domains[target_domain.name] = target_domain
106         if target_domain_architecure.length > 0
107           target_domain_architecure += ">"
108         end
109         target_domain_architecure += target_domain.name
110       end
111
112       target_domain_architecure.freeze
113
114       puts 'Target domain architecture: ' + target_domain_architecure
115
116       target_domain_names = Set.new
117
118       target_domains.each_key {|key| target_domain_names.add( key ) }
119
120       prev_query_seq_name = nil
121       domains_in_query_seq = Array.new
122       passing_sequences = Array.new
123       total_sequences = 0
124       @passed = Hash.new
125       out_domain_msas = Hash.new
126       out_domain_architecture_msa = Msa.new
127       results.each do |hmmscan_result|
128         if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name )
129           if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, target_domain_architecure)
130             passing_sequences.push(domains_in_query_seq)
131           end
132           domains_in_query_seq = Array.new
133           total_sequences += 1
134         end
135         prev_query_seq_name = hmmscan_result.query
136         domains_in_query_seq.push(hmmscan_result)
137       end # each result
138
139       if prev_query_seq_name != nil
140         total_sequences += 1
141         if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, target_domain_architecure)
142           passing_sequences.push(domains_in_query_seq)
143         end
144       end
145
146       out_domain_msas.keys.sort.each do |domain_name|
147         puts "writing #{domain_name}"
148         write_msa( out_domain_msas[domain_name], domain_name + ".fasta" )
149       end
150
151       puts "writing target_domain_architecure"
152       write_msa( out_domain_architecture_msa, "target_domain_architecure" + ".fasta" )
153
154       passing_sequences.each do | domains |
155
156         seq = domains[0].query
157         # puts seq + "::"
158
159         if passed_seqs.find_by_name_start( seq, true ).length < 1
160           add_sequence( seq, in_msa, passed_multi_seqs )
161         else
162           error_msg = "this should not have happened"
163           raise StandardError, error_msg
164         end
165
166         domains.each do | domain |
167           #puts domain.query + ": " + domain.model
168         end
169         #puts
170       end
171
172       puts
173
174       puts 'Non passing domain architectures:'
175       @non_passsing_domain_architectures = @non_passsing_domain_architectures.sort{|a, b|a<=>b}.to_h
176       @non_passsing_domain_architectures.each do |da, count|
177         puts da + ': ' + count.to_s
178       end
179
180       puts
181
182       puts 'Passing domain counts:'
183       @passed = @passed.sort{|a, b|a<=>b}.to_h
184       @passed.each do |dom, count|
185         puts dom + ': ' + count.to_s
186       end
187
188       puts
189
190       puts "total sequences  : " + total_sequences.to_s
191       puts "passing sequences: " + passing_sequences.size.to_s
192
193       # write_msa( out_msa, outfile + ".fasta"  )
194       # write_msa( passed_multi_seqs, passed_seqs_outfile )
195       # write_msa( failed_seqs, failed_seqs_outfile )
196
197       log << ld
198       log << "passing target domains                       : " + domain_pass_counter.to_s + ld
199       log << "failing target domains                       : " + domain_fail_counter.to_s + ld
200       log << "proteins in sequence (fasta) file            : " + in_msa.get_number_of_seqs.to_s + ld
201       log << "proteins in hmmscan outputfile               : " + protein_counter.to_s + ld
202       log << "proteins with passing target domain(s)       : " + passed_seqs.get_number_of_seqs.to_s + ld
203       log << "proteins with no passing target domain       : " + proteins_with_failing_domains.to_s + ld
204       log << "proteins with no target domain               : " + domain_not_present_counter.to_s + ld
205
206       log << ld
207
208       return domain_pass_counter
209
210     end # parse
211
212     private
213
214     # domains_in_query_seq: Array of HmmscanResult
215     # target_domain_names: Set of String
216     # target_domains: Hash String->TargetDomain
217     # target_domain_architecture: String
218     def checkit2(domains_in_query_seq,
219       target_domain_names,
220       target_domains,
221       in_msa,
222       out_domain_msas,
223       out_domain_architecture_msa,
224       target_domain_architecture)
225
226       domain_names_in_query_seq = Set.new
227       domains_in_query_seq.each do |domain|
228         domain_names_in_query_seq.add(domain.model)
229       end
230       if (target_domain_names.subset?(domain_names_in_query_seq))
231
232         passed_domains = Array.new
233         passed_domains_counts = Hash.new
234
235         domains_in_query_seq.each do |domain|
236           if target_domains.has_key?(domain.model)
237             target_domain = target_domains[domain.model]
238
239             if (target_domain.i_e_value != nil) && (target_domain.i_e_value >= 0)
240               if domain.i_e_value > target_domain.i_e_value
241                 next
242               end
243             end
244             if (target_domain.abs_len != nil) && (target_domain.abs_len > 0)
245               length = 1 + domain.env_to - domain.env_from
246               if length < target_domain.abs_len
247                 next
248               end
249             end
250             if (target_domain.rel_len != nil) && (target_domain.rel_len > 0)
251               length = 1 + domain.env_to - domain.env_from
252               puts (target_domain.rel_len * domain.tlen)
253               if length < (target_domain.rel_len * domain.tlen)
254                 next
255               end
256             end
257
258             passed_domains.push(domain)
259
260             if (passed_domains_counts.key?(domain.model))
261               passed_domains_counts[domain.model] = passed_domains_counts[domain.model] + 1
262             else
263               passed_domains_counts[domain.model] = 1
264             end
265
266             if (@passed.key?(domain.model))
267               @passed[domain.model] = @passed[domain.model] + 1
268             else
269               @passed[domain.model] = 1
270             end
271           end # if target_domains.has_key?(domain.model)
272         end # domains_in_query_seq.each do |domain|
273       else
274         return false
275       end
276
277       passed_domains.sort! { |a,b| a.ali_from <=> b.ali_from }
278       # Compare architectures
279       if !compareArchitectures(target_domain_architecture, passed_domains)
280         return false
281       end
282
283       domain_counter = Hash.new
284
285       min_env_from = 10000000
286       max_env_from = 0
287       min_env_to = 10000000
288       max_env_to = 0
289
290       query_seq = nil
291
292       concatenated_domains = ''
293
294       passed_domains.each do |domain|
295         domain_name = domain.model
296
297         unless out_domain_msas.has_key? domain_name
298           out_domain_msas[ domain_name ] = Msa.new
299         end
300
301         if domain_counter.key?(domain_name)
302           domain_counter[domain_name] = domain_counter[domain_name] + 1
303         else
304           domain_counter[domain_name] = 1
305         end
306
307         if query_seq == nil
308           query_seq = domain.query
309           query_seq.freeze
310         elsif query_seq != domain.query
311           error_msg = "seq names do not match: this should not have happened"
312           raise StandardError, error_msg
313         end
314         puts "query: " + query_seq
315
316         extracted_domain_seq = extract_domain( query_seq,
317         domain_counter[domain_name],
318         passed_domains_counts[domain_name],
319         domain.env_from,
320         domain.env_to,
321         in_msa,
322         out_domain_msas[ domain_name ] )
323
324         concatenated_domains += extracted_domain_seq
325
326         if domain.env_from < min_env_from
327           min_env_from = domain.env_from
328         end
329         if domain.env_from > max_env_from
330           max_env_from = domain.env_from
331         end
332         if domain.env_to < min_env_to
333           min_env_to = domain.env_to
334         end
335         if domain.env_to > max_env_to
336           max_env_to = domain.env_to
337         end
338       end
339
340       puts "env from: #{min_env_from} - #{max_env_from}"
341       puts "env to  : #{min_env_to} - #{max_env_to}"
342
343       extract_domain( query_seq,
344       1,
345       1,
346       min_env_from,
347       max_env_to,
348       in_msa,
349       out_domain_architecture_msa )
350
351       puts passed_domains_counts
352       puts concatenated_domains
353       true
354     end
355
356     # passed_domains needs to be sorted!
357     def compareArchitectures(target_domain_architecture, passed_domains, strict = false)
358       domain_architecture = ""
359       passed_domains.each do |domain|
360         if domain_architecture.length > 0
361           domain_architecture += ">"
362         end
363         domain_architecture += domain.model
364       end
365       pass = false
366       if strict
367         pass = (target_domain_architecture == domain_architecture)
368       else
369         pass = (domain_architecture.index(target_domain_architecture) != nil)
370       end
371       if !pass
372         if (@non_passsing_domain_architectures.key?(domain_architecture))
373           @non_passsing_domain_architectures[domain_architecture] = @non_passsing_domain_architectures[domain_architecture] + 1
374         else
375           @non_passsing_domain_architectures[domain_architecture] = 1
376         end
377       end
378       return pass
379     end
380
381     def write_msa( msa, filename )
382       io = MsaIO.new()
383       w = FastaWriter.new()
384       w.set_line_width( 60 )
385       w.clean( true )
386       File.delete(filename) if File.exist?(filename) #TODO remove me
387       begin
388         io.write_to_file( msa, filename, w )
389       rescue Exception
390         error_msg = "could not write to \"" + filename + "\""
391         raise IOError, error_msg
392       end
393     end
394
395     def add_sequence( sequence_name, in_msa, add_to_msa )
396       seqs = in_msa.find_by_name_start( sequence_name, true )
397       if ( seqs.length < 1 )
398         error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
399         raise StandardError, error_msg
400       end
401       if ( seqs.length > 1 )
402         error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
403         raise StandardError, error_msg
404       end
405       seq = in_msa.get_sequence( seqs[ 0 ] )
406       add_to_msa.add_sequence( seq )
407     end
408
409     def extract_domain( seq_name,
410       number,
411       out_of,
412       seq_from,
413       seq_to,
414       in_msa,
415       out_msa)
416       if number < 1 || out_of < 1 || number > out_of
417         error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s
418         raise StandardError, error_msg
419       end
420       if seq_from < 1 || seq_to < 1 || seq_from >= seq_to
421         error_msg = "impossible: seq-from=" + seq_from.to_s + ", seq-to=" + seq_to.to_s
422         raise StandardError, error_msg
423       end
424       seqs = in_msa.find_by_name_start(seq_name, true)
425       if seqs.length < 1
426         error_msg = "sequence \"" + seq_name + "\" not found in sequence file"
427         raise IOError, error_msg
428       end
429       if seqs.length > 1
430         error_msg = "sequence \"" + seq_name + "\" not unique in sequence file"
431         raise IOError, error_msg
432       end
433       # hmmscan is 1 based, whereas sequences are 0 bases in this package.
434       seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
435
436       orig_name = seq.get_name
437
438       seq.set_name( orig_name.split[ 0 ] )
439
440       if out_of != 1
441         seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
442       end
443
444       out_msa.add_sequence( seq )
445
446       seq.get_sequence_as_string
447     end
448
449   end # class HmmscanMultiDomainExtractor
450
451   class TargetDomain
452     def initialize( name, i_e_value, abs_len, rel_len, position )
453       if (name == nil) || name.size < 1
454         error_msg = "target domain name nil or empty"
455         raise StandardError, error_msg
456       end
457       if rel_len > 1
458         error_msg = "target domain relative length is greater than 1"
459         raise StandardError, error_msg
460       end
461       @name = name
462       @i_e_value = i_e_value
463       @abs_len = abs_len
464       @rel_len = rel_len
465       @position = position
466     end
467     attr_reader :name, :i_e_value, :abs_len, :rel_len, :position
468   end
469
470 end # module Evoruby
471