in progress...
[jalview.git] / forester / ruby / evoruby / lib / evo / io / parser / hmmscan_multi_domain_extractor.rb
1 #
2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanMultiDomainExtractor class
3 #
4 # Copyright::    Copyright (C) 2017 Christian M. Zmasek
5 # License::      GNU Lesser General Public License (LGPL)
6 #
7 # Last modified: 2017/02/20
8
9 require 'lib/evo/util/constants'
10 require 'lib/evo/msa/msa_factory'
11 require 'lib/evo/io/msa_io'
12 require 'lib/evo/io/writer/fasta_writer'
13 require 'lib/evo/io/parser/fasta_parser'
14 require 'lib/evo/io/parser/hmmscan_parser'
15
16 module Evoruby
17   class HmmscanMultiDomainExtractor
18
19     OUTPUT_ID = 'mdsx'
20     DOMAIN_DELIMITER = ' -- '
21
22     PASSING_FL_SEQS_SUFFIX    = "_#{OUTPUT_ID}_passing_full_length_seqs.fasta"
23     FAILING_FL_SEQS_SUFFIX    = "_#{OUTPUT_ID}_failing_full_length_seqs.fasta"
24     TARGET_DA_SUFFIX          = "_#{OUTPUT_ID}_target_da.fasta"
25     CONCAT_TARGET_DOM_SUFFIX  = "_#{OUTPUT_ID}_concat_target_dom.fasta"
26     TARGET_DOM_OUTPUT_MIDPART = "_#{OUTPUT_ID}_target_dom_"
27     def initialize
28       @passing_domains_data = nil
29       @failing_domains_data = nil
30       @failing_domain_architectures = nil
31       @passsing_domain_architectures = nil
32       @failing_proteins_bc_not_all_target_doms_present = nil
33       @failing_proteins_bc_missing_cutoffs = nil
34     end
35
36     # raises ArgumentError, IOError, StandardError
37     def parse( target_da,
38       hmmscan_output,
39       fasta_sequence_file,
40       outfile_base,
41       log_str )
42
43       passing_fl_seqs_outfile   = outfile_base + PASSING_FL_SEQS_SUFFIX
44       failing_fl_seqs_outfile   = outfile_base + FAILING_FL_SEQS_SUFFIX
45       target_da_outfile         = outfile_base + TARGET_DA_SUFFIX
46       concat_target_dom_outfile = outfile_base + CONCAT_TARGET_DOM_SUFFIX
47
48       Util.check_file_for_readability( hmmscan_output )
49       Util.check_file_for_readability( fasta_sequence_file )
50       Util.check_file_for_writability( passing_fl_seqs_outfile )
51       Util.check_file_for_writability( failing_fl_seqs_outfile )
52       Util.check_file_for_writability( target_da_outfile )
53       Util.check_file_for_writability( concat_target_dom_outfile )
54
55       in_msa = nil
56       factory = MsaFactory.new()
57       in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
58
59       if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
60         error_msg = "could not find fasta sequences in " + fasta_sequence_file
61         raise IOError, error_msg
62       end
63
64       failed_seqs_msa = Msa.new
65       passed_seqs_msa = Msa.new
66
67       ld = Constants::LINE_DELIMITER
68
69       hmmscan_parser = HmmscanParser.new( hmmscan_output )
70       results = hmmscan_parser.parse
71
72       ####
73       # Import: if multiple copies of same domain, thresholds need to be same!
74       target_domain_ary = Array.new
75
76       #target_domain_ary.push(TargetDomain.new('DNA_pol_B_exo1', 1e-6, -1, 0.6, 0 ))
77       #target_domain_ary.push(TargetDomain.new('DNA_pol_B', 1e-6, -1, 0.6, 1 ))
78
79       #target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.9, 0 ))
80       #target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.9, 1 ))
81       # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 ))
82       # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 ))
83
84       target_domain_ary.push(TargetDomain.new('BH4', 1, -1, 0.2, 0 ))
85       target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 1 ))
86       target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 2 ))
87       # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 ))
88
89       #  target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 ))
90       #  target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 1 ))
91
92       #target_domain_ary.push(TargetDomain.new('Chordopox_A33R', 1000, -1, 0.1 ))
93
94       target_domains = Hash.new
95
96       target_domain_architecure = ""
97
98       target_domain_ary.each do |target_domain|
99         target_domains[target_domain.name] = target_domain
100         if target_domain_architecure.length > 0
101           target_domain_architecure += DOMAIN_DELIMITER
102         end
103         target_domain_architecure += target_domain.name
104       end
105
106       target_domain_architecure.freeze
107
108       puts 'Target domain architecture: ' + target_domain_architecure
109
110       target_domain_names = Set.new
111
112       target_domains.each_key {|key| target_domain_names.add( key ) }
113
114       prev_query_seq_name = nil
115       domains_in_query_seq = Array.new
116       passing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq.
117       failing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq.
118       total_sequences = 0
119
120       @failing_domain_architectures = Hash.new
121       @passsing_domain_architectures = Hash.new
122       @passing_domains_data = Hash.new
123       @failing_domains_data = Hash.new
124       @failing_proteins_bc_not_all_target_doms_present = 0
125       @failing_proteins_bc_missing_cutoffs = 0
126       out_domain_msas = Hash.new
127       out_domain_architecture_msa = Msa.new
128       out_concatenated_domains_msa = Msa.new
129       results.each do |hmmscan_result|
130         if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name )
131           if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
132             passing_sequences.push(domains_in_query_seq)
133           else
134             failing_sequences.push(domains_in_query_seq)
135           end
136           domains_in_query_seq = Array.new
137           total_sequences += 1
138         end
139         prev_query_seq_name = hmmscan_result.query
140         domains_in_query_seq.push(hmmscan_result)
141       end # each result
142
143       if prev_query_seq_name != nil
144         total_sequences += 1
145         if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
146           passing_sequences.push(domains_in_query_seq)
147         else
148           failing_sequences.push(domains_in_query_seq)
149         end
150       end
151
152       puts
153
154       puts 'Failing domain architectures containing one or more target domain(s):'
155       @failing_domain_architectures = @failing_domain_architectures .sort{|a, b|a<=>b}.to_h
156       failing_da_sum = 0
157       @failing_domain_architectures .each do |da, count|
158         failing_da_sum += count
159         puts count.to_s.rjust(4) + ': ' + da
160       end
161
162       puts
163
164       puts 'Passing domain architectures containing target domain(s):'
165       @passsing_domain_architectures = @passsing_domain_architectures.sort{|a, b|a<=>b}.to_h
166       passing_da_sum = 0
167       @passsing_domain_architectures.each do |da, count|
168         passing_da_sum += count
169         puts count.to_s.rjust(4) + ': ' + da
170       end
171
172       puts 'Passing domain(s):'
173       @passing_domains_data = @passing_domains_data.sort{|a, b|a<=>b}.to_h
174       @passing_domains_data.each do |n, d|
175         puts d.to_str
176       end
177
178       puts 'Failing domain(s):'
179       @failing_domains_data = @failing_domains_data.sort{|a, b|a<=>b}.to_h
180       @failing_domains_data.each do |n, d|
181         puts d.to_str
182       end
183
184       unless total_sequences == (passing_sequences.size + failing_sequences.size)
185         error_msg = "this should not have happened: total seqs not equal to passing plus failing seqs"
186         raise StandardError, error_msg
187       end
188
189       unless failing_sequences.size == (@failing_proteins_bc_not_all_target_doms_present + @failing_proteins_bc_missing_cutoffs)
190         error_msg = "this should not have happened: failing seqs sums not consistent"
191         raise StandardError, error_msg
192       end
193
194       unless @failing_proteins_bc_missing_cutoffs == failing_da_sum
195         error_msg = "this should not have happened: failing seqs not equal to failing da sum"
196         raise StandardError, error_msg
197       end
198
199       unless passing_sequences.size == passing_da_sum
200         error_msg = "this should not have happened: passing seqs not equal to passing da sum"
201         raise StandardError, error_msg
202       end
203
204       puts
205
206       puts "Protein sequences in sequence (fasta) file: " + in_msa.get_number_of_seqs.to_s.rjust(5)
207       puts "Protein sequences in hmmscan output file  : " + total_sequences.to_s.rjust(5)
208       puts "  Passing protein sequences               : " + passing_sequences.size.to_s.rjust(5)
209       puts "  Failing protein sequences               : " + failing_sequences.size.to_s.rjust(5)
210       puts "    Not all target domain present         : " + @failing_proteins_bc_not_all_target_doms_present.to_s.rjust(5)
211       puts "    Target domain(s) failing cutoffs      : " + @failing_proteins_bc_missing_cutoffs.to_s.rjust(5)
212
213       puts
214
215       out_domain_msas.keys.sort.each do |domain_name|
216         file_name = outfile_base + TARGET_DOM_OUTPUT_MIDPART + domain_name + '.fasta'
217
218         write_msa( out_domain_msas[domain_name], file_name )
219         puts "wrote #{domain_name}"
220       end
221
222       write_msa( out_domain_architecture_msa, target_da_outfile )
223       puts "wrote target_domain_architecure"
224
225       write_msa( out_concatenated_domains_msa, concat_target_dom_outfile )
226       puts "wrote concatenated_domains_msa"
227
228       passing_sequences.each do | domains |
229         query_name = domains[0].query
230         if passed_seqs_msa.find_by_name_start( query_name, true ).length < 1
231           add_sequence( query_name, in_msa, passed_seqs_msa )
232         else
233           error_msg = "this should not have happened"
234           raise StandardError, error_msg
235         end
236       end
237
238       failing_sequences.each do | domains |
239         query_name = domains[0].query
240         if failed_seqs_msa.find_by_name_start( query_name, true ).length < 1
241           add_sequence( query_name, in_msa, failed_seqs_msa )
242         else
243           error_msg = "this should not have happened"
244           raise StandardError, error_msg
245         end
246       end
247
248       write_msa( passed_seqs_msa, passing_fl_seqs_outfile )
249       puts "wrote ..."
250       write_msa( failed_seqs_msa, failing_fl_seqs_outfile )
251       puts "wrote ..."
252
253       log_str << ld
254
255       log_str << "proteins in sequence (fasta) file            : " + in_msa.get_number_of_seqs.to_s + ld
256
257       log_str << ld
258
259     end # parse
260
261     private
262
263     # domains_in_query_seq: Array of HmmscanResult
264     # target_domain_names: Set of String
265     # target_domains: Hash String->TargetDomain
266     # target_domain_architecture: String
267     def checkit2(domains_in_query_seq,
268       target_domain_names,
269       target_domains,
270       in_msa,
271       out_domain_msas,
272       out_domain_architecture_msa,
273       out_contactenated_domains_msa,
274       target_domain_architecture)
275
276       domain_names_in_query_seq = Set.new
277       domains_in_query_seq.each do |domain|
278         domain_names_in_query_seq.add(domain.model)
279       end
280       if target_domain_names.subset?(domain_names_in_query_seq)
281
282         passed_domains = Array.new
283         passed_domains_counts = Hash.new
284
285         domains_in_query_seq.each do |domain|
286           if target_domains.has_key?(domain.model)
287             target_domain = target_domains[domain.model]
288
289             if (target_domain.i_e_value != nil) && (target_domain.i_e_value >= 0)
290               if domain.i_e_value > target_domain.i_e_value
291                 addToFailingDomainData(domain)
292                 next
293               end
294             end
295             if (target_domain.abs_len != nil) && (target_domain.abs_len > 0)
296               length = 1 + domain.env_to - domain.env_from
297               if length < target_domain.abs_len
298                 addToFailingDomainData(domain)
299                 next
300               end
301             end
302             if (target_domain.rel_len != nil) && (target_domain.rel_len > 0)
303               length = 1 + domain.env_to - domain.env_from
304               if length < (target_domain.rel_len * domain.tlen)
305                 addToFailingDomainData(domain)
306                 next
307               end
308             end
309
310             passed_domains.push(domain)
311
312             if (passed_domains_counts.key?(domain.model))
313               passed_domains_counts[domain.model] = passed_domains_counts[domain.model] + 1
314             else
315               passed_domains_counts[domain.model] = 1
316             end
317
318             addToPassingDomainData(domain)
319
320           end # if target_domains.has_key?(domain.model)
321         end # domains_in_query_seq.each do |domain|
322       else
323         @failing_proteins_bc_not_all_target_doms_present += 1
324         return false
325       end # target_domain_names.subset?(domain_names_in_query_seq)
326
327       passed_domains.sort! { |a,b| a.ali_from <=> b.ali_from }
328       # Compare architectures
329       if !compareArchitectures(target_domain_architecture, passed_domains)
330         @failing_proteins_bc_missing_cutoffs += 1
331         return false
332       end
333
334       domain_counter = Hash.new
335
336       min_env_from = 10000000
337       max_env_to = 0
338
339       query_seq = nil
340
341       concatenated_domains = ''
342
343       passed_domains.each do |domain|
344         domain_name = domain.model
345
346         unless out_domain_msas.has_key? domain_name
347           out_domain_msas[ domain_name ] = Msa.new
348         end
349
350         if domain_counter.key?(domain_name)
351           domain_counter[domain_name] = domain_counter[domain_name] + 1
352         else
353           domain_counter[domain_name] = 1
354         end
355
356         if query_seq == nil
357           query_seq = domain.query
358           query_seq.freeze
359         elsif query_seq != domain.query
360           error_msg = "seq names do not match: this should not have happened"
361           raise StandardError, error_msg
362         end
363
364         extracted_domain_seq = extract_domain( query_seq,
365         domain_counter[domain_name],
366         passed_domains_counts[domain_name],
367         domain.env_from,
368         domain.env_to,
369         in_msa,
370         out_domain_msas[ domain_name ] )
371
372         concatenated_domains += extracted_domain_seq
373
374         if domain.env_from < min_env_from
375           min_env_from = domain.env_from
376         end
377         if domain.env_to > max_env_to
378           max_env_to = domain.env_to
379         end
380       end
381
382       extract_domain( query_seq,
383       1,
384       1,
385       min_env_from,
386       max_env_to,
387       in_msa,
388       out_domain_architecture_msa )
389
390       out_contactenated_domains_msa.add( query_seq, concatenated_domains)
391
392       true
393     end
394
395     def addToPassingDomainData(domain)
396       unless ( @passing_domains_data.key?(domain.model))
397         @passing_domains_data[domain.model] = DomainData.new(domain.model)
398       end
399       @passing_domains_data[domain.model].add( domain.model, (1 + domain.env_to - domain.env_from), domain.i_e_value)
400     end
401
402     def addToFailingDomainData(domain)
403       unless ( @failing_domains_data.key?(domain.model))
404         @failing_domains_data[domain.model] = DomainData.new(domain.model)
405       end
406       @failing_domains_data[domain.model].add( domain.model, (1 + domain.env_to - domain.env_from), domain.i_e_value)
407     end
408
409     # passed_domains needs to be sorted!
410     def compareArchitectures(target_domain_architecture, passed_domains, strict = false)
411       domain_architecture = ''
412       passed_domains.each do |domain|
413         if domain_architecture.length > 0
414           domain_architecture += DOMAIN_DELIMITER
415         end
416         domain_architecture += domain.model
417       end
418       pass = false
419       if strict
420         pass = (target_domain_architecture == domain_architecture)
421       else
422         pass = (domain_architecture.index(target_domain_architecture) != nil)
423       end
424       if pass
425         if (@passsing_domain_architectures.key?(domain_architecture))
426           @passsing_domain_architectures[domain_architecture] = @passsing_domain_architectures[domain_architecture] + 1
427         else
428           @passsing_domain_architectures[domain_architecture] = 1
429         end
430       else
431         if ( @failing_domain_architectures .key?(domain_architecture))
432           @failing_domain_architectures [domain_architecture] =  @failing_domain_architectures [domain_architecture] + 1
433         else
434           @failing_domain_architectures [domain_architecture] = 1
435         end
436       end
437       return pass
438     end
439
440     def write_msa( msa, filename )
441       io = MsaIO.new()
442       w = FastaWriter.new()
443       w.set_line_width( 60 )
444       w.clean( true )
445       File.delete(filename) if File.exist?(filename) #TODO remove me
446       begin
447         io.write_to_file( msa, filename, w )
448       rescue Exception
449         error_msg = "could not write to \"" + filename + "\""
450         raise IOError, error_msg
451       end
452     end
453
454     def add_sequence( sequence_name, in_msa, add_to_msa )
455       seqs = in_msa.find_by_name_start( sequence_name, true )
456       if ( seqs.length < 1 )
457         error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
458         raise StandardError, error_msg
459       end
460       if ( seqs.length > 1 )
461         error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
462         raise StandardError, error_msg
463       end
464       seq = in_msa.get_sequence( seqs[ 0 ] )
465       add_to_msa.add_sequence( seq )
466     end
467
468     def extract_domain( seq_name,
469       number,
470       out_of,
471       seq_from,
472       seq_to,
473       in_msa,
474       out_msa)
475       if number < 1 || out_of < 1 || number > out_of
476         error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s
477         raise StandardError, error_msg
478       end
479       if seq_from < 1 || seq_to < 1 || seq_from >= seq_to
480         error_msg = "impossible: seq-from=" + seq_from.to_s + ", seq-to=" + seq_to.to_s
481         raise StandardError, error_msg
482       end
483       seqs = in_msa.find_by_name_start(seq_name, true)
484       if seqs.length < 1
485         error_msg = "sequence \"" + seq_name + "\" not found in sequence file"
486         raise IOError, error_msg
487       end
488       if seqs.length > 1
489         error_msg = "sequence \"" + seq_name + "\" not unique in sequence file"
490         raise IOError, error_msg
491       end
492       # hmmscan is 1 based, whereas sequences are 0 bases in this package.
493       seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
494
495       orig_name = seq.get_name
496
497       seq.set_name( orig_name.split[ 0 ] )
498
499       if out_of != 1
500         seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
501       end
502
503       out_msa.add_sequence( seq )
504
505       seq.get_sequence_as_string
506     end
507
508   end # class HmmscanMultiDomainExtractor
509
510   class DomainData
511     def initialize( name )
512       if (name == nil) || name.size < 1
513         error_msg = "domain name nil or empty"
514         raise StandardError, error_msg
515       end
516       @name = name
517       @count = 0
518       @i_e_value_min = 10000000.0
519       @i_e_value_max = -1.0
520       @i_e_value_sum = 0.0
521       @len_min = 10000000
522       @len_max = -1
523       @len_sum = 0.0
524     end
525
526     def add( name, length, i_e_value)
527       if name != @name
528         error_msg = "domain names do not match"
529         raise StandardError, error_msg
530       end
531
532       if length < 0
533         error_msg = "length cannot me negative"
534         raise StandardError, error_msg
535       end
536       if i_e_value < 0
537         error_msg = "iE-value cannot me negative"
538         raise StandardError, error_msg
539       end
540       @count += 1
541       @i_e_value_sum += i_e_value
542       @len_sum += length
543       if i_e_value > @i_e_value_max
544         @i_e_value_max = i_e_value
545       end
546       if i_e_value < @i_e_value_min
547         @i_e_value_min = i_e_value
548       end
549       if length > @len_max
550         @len_max = length
551       end
552       if length < @len_min
553         @len_min = length
554       end
555     end
556
557     def avg_length
558       if @count == 0
559         return 0
560       end
561       @len_sum / @count
562     end
563
564     def avg_i_e_value
565       if @count == 0
566         return 0
567       end
568       @i_e_value_sum / @count
569     end
570
571     def to_str
572       s = ''
573       s << @name.rjust(24) + ': '
574       s << @count.to_s.rjust(4) + '  '
575       s << avg_length.round(1).to_s.rjust(6) + ' '
576       s << @len_min.to_s.rjust(4) + ' -'
577       s << @len_max.to_s.rjust(4) + '  '
578       s << ("%.2E" % avg_i_e_value).rjust(9) + ' '
579       s << ("%.2E" % @i_e_value_min).rjust(9) + ' -'
580       s << ("%.2E" % @i_e_value_max).rjust(9) + ' '
581       s
582     end
583
584     attr_reader :name, :count, :i_e_value_min, :i_e_value_max, :length_min, :length_max
585
586   end
587
588   class TargetDomain
589     def initialize( name, i_e_value, abs_len, rel_len, position )
590       if (name == nil) || name.size < 1
591         error_msg = "target domain name nil or empty"
592         raise StandardError, error_msg
593       end
594       if rel_len > 1
595         error_msg = "target domain relative length is greater than 1"
596         raise StandardError, error_msg
597       end
598       @name = name
599       @i_e_value = i_e_value
600       @abs_len = abs_len
601       @rel_len = rel_len
602       @position = position
603     end
604     attr_reader :name, :i_e_value, :abs_len, :rel_len, :position
605   end
606
607 end # module Evoruby
608