in progress...
[jalview.git] / forester / ruby / evoruby / lib / evo / io / parser / hmmscan_multi_domain_extractor.rb
1 #
2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanMultiDomainExtractor class
3 #
4 # Copyright::    Copyright (C) 2017 Christian M. Zmasek
5 # License::      GNU Lesser General Public License (LGPL)
6 #
7 # Last modified: 2017/02/20
8
9 require 'lib/evo/util/constants'
10 require 'lib/evo/msa/msa_factory'
11 require 'lib/evo/io/msa_io'
12 require 'lib/evo/io/writer/fasta_writer'
13 require 'lib/evo/io/parser/fasta_parser'
14 require 'lib/evo/io/parser/hmmscan_parser'
15
16 module Evoruby
17   class HmmscanMultiDomainExtractor
18     def initialize
19       @passed = Hash.new
20       @non_passsing_domain_architectures = Hash.new
21     end
22
23     # raises ArgumentError, IOError, StandardError
24     def parse( domain_id,
25       hmmscan_output,
26       fasta_sequence_file,
27       outfile,
28       passed_seqs_outfile,
29       failed_seqs_outfile,
30       e_value_threshold,
31       length_threshold,
32       add_position,
33       add_domain_number,
34       add_species,
35       log )
36
37       Util.check_file_for_readability( hmmscan_output )
38       Util.check_file_for_readability( fasta_sequence_file )
39       Util.check_file_for_writability( outfile + ".fasta" )
40       Util.check_file_for_writability( passed_seqs_outfile )
41       Util.check_file_for_writability( failed_seqs_outfile )
42
43       in_msa = nil
44       factory = MsaFactory.new()
45       in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
46
47       if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
48         error_msg = "could not find fasta sequences in " + fasta_sequence_file
49         raise IOError, error_msg
50       end
51
52       out_msa = Msa.new
53
54       failed_seqs = Msa.new
55       passed_seqs = Msa.new
56       passed_multi_seqs = Msa.new
57
58       ld = Constants::LINE_DELIMITER
59
60       domain_pass_counter                = 0
61       domain_fail_counter                = 0
62       passing_domains_per_protein        = 0
63       proteins_with_failing_domains      = 0
64       domain_not_present_counter         = 0
65       protein_counter                    = 1
66       max_domain_copy_number_per_protein = -1
67       max_domain_copy_number_sequence    = ""
68       passing_target_length_sum          = 0
69       overall_target_length_sum          = 0
70       overall_target_length_min          = 10000000
71       overall_target_length_max          = -1
72       passing_target_length_min          = 10000000
73       passing_target_length_max          = -1
74
75       overall_target_ie_min          = 10000000
76       overall_target_ie_max          = -1
77       passing_target_ie_min          = 10000000
78       passing_target_ie_max          = -1
79
80       hmmscan_datas = []
81
82       hmmscan_parser = HmmscanParser.new( hmmscan_output )
83       results = hmmscan_parser.parse
84
85       ####
86       # Import: if multiple copies of same domain, threshold need to be same!
87       target_domain_ary = Array.new
88       target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 ))
89       target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 ))
90       target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 ))
91       target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 ))
92
93       #target_domain_ary.push(TargetDomain.new('BH4', 0.1, -1, 0.5, 0 ))
94       #target_domain_ary.push(TargetDomain.new('Bcl-2', 0.1, -1, 0.5, 1 ))
95       # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 ))
96
97       #  target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 ))
98       #  target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 1 ))
99
100       #target_domain_ary.push(TargetDomain.new('Chordopox_A33R', 1000, -1, 0.1 ))
101
102       target_domains = Hash.new
103
104       target_domain_architecure = ""
105
106       target_domain_ary.each do |target_domain|
107         target_domains[target_domain.name] = target_domain
108         if target_domain_architecure.length > 0
109           target_domain_architecure += ">"
110         end
111         target_domain_architecure += target_domain.name
112       end
113
114       target_domain_architecure.freeze
115
116       puts target_domain_architecure
117
118       target_domain_names = Set.new
119
120       target_domains.each_key {|key| target_domain_names.add( key ) }
121
122       prev_query_seq_name = nil
123       domains_in_query_seq = Array.new
124       passing_sequences = Array.new
125       total_sequences = 0
126       @passed = Hash.new
127       results.each do |hmmscan_result|
128         if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name )
129           if checkit2(domains_in_query_seq, target_domain_names, target_domains, target_domain_architecure)
130             passing_sequences.push(domains_in_query_seq)
131           end
132           domains_in_query_seq = Array.new
133           total_sequences += 1
134         end
135         prev_query_seq_name = hmmscan_result.query
136         domains_in_query_seq.push(hmmscan_result)
137       end # each result
138
139       if prev_query_seq_name != nil
140         total_sequences += 1
141         if checkit2(domains_in_query_seq, target_domain_names, target_domains, target_domain_architecure)
142           passing_sequences.push(domains_in_query_seq)
143         end
144       end
145
146       passing_sequences.each do | domains |
147
148         seq = domains[0].query
149         # puts seq + "::"
150         ################
151         if passed_seqs.find_by_name_start( seq, true ).length < 1
152           add_sequence( seq, in_msa, passed_multi_seqs )
153         else
154           error_msg = "this should not have happened"
155           raise StandardError, error_msg
156         end
157         ################
158
159         domains.each do | domain |
160           #puts domain.query + ": " + domain.model
161         end
162         #puts
163       end
164
165       puts @non_passsing_domain_architectures
166
167       puts @passed
168       puts "total sequences  : " + total_sequences.to_s
169       puts "passing sequences: " + passing_sequences.size.to_s
170
171       ####
172
173       prev_query = nil
174       saw_target = false
175
176       results.each do | r |
177
178         if ( prev_query != nil ) && ( r.query != prev_query )
179           protein_counter += 1
180           passing_domains_per_protein = 0
181           if !saw_target
182             log << domain_not_present_counter.to_s + ": " + prev_query.to_s + " lacks target domain" + ld
183             domain_not_present_counter += 1
184           end
185           saw_target = false
186         end
187
188         prev_query = r.query
189
190         if domain_id != r.model
191           next
192         end
193
194         saw_target = true
195
196         sequence  = r.query
197         number    = r.number
198         out_of    = r.out_of
199         env_from  = r.env_from
200         env_to    = r.env_to
201         i_e_value = r.i_e_value
202         prev_query = r.query
203
204         length = 1 + env_to - env_from
205
206         overall_target_length_sum += length
207         if length > overall_target_length_max
208           overall_target_length_max = length
209         end
210         if length < overall_target_length_min
211           overall_target_length_min = length
212         end
213
214         if i_e_value > overall_target_ie_max
215           overall_target_ie_max = i_e_value
216         end
217         if i_e_value < overall_target_ie_min
218           overall_target_ie_min = i_e_value
219         end
220
221         if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
222         ( ( length_threshold <= 0 ) || ( length >= length_threshold.to_f ) ) )
223           hmmscan_datas << HmmscanData.new( sequence, number, out_of, env_from, env_to, i_e_value )
224           passing_target_length_sum += length
225           passing_domains_per_protein += 1
226           if length > passing_target_length_max
227             passing_target_length_max = length
228           end
229           if length < passing_target_length_min
230             passing_target_length_min = length
231           end
232           if i_e_value > passing_target_ie_max
233             passing_target_ie_max = i_e_value
234           end
235           if i_e_value < passing_target_ie_min
236             passing_target_ie_min = i_e_value
237           end
238           if ( passing_domains_per_protein > max_domain_copy_number_per_protein )
239             max_domain_copy_number_sequence    = sequence
240             max_domain_copy_number_per_protein = passing_domains_per_protein
241           end
242         else # no pass
243           log << domain_fail_counter.to_s + ": " + sequence.to_s + " fails threshold(s)"
244           if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) )
245             log << " iE=" + i_e_value.to_s
246           end
247           if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f )
248             le = env_to - env_from + 1
249             log << " l=" + le.to_s
250           end
251           log << ld
252           domain_fail_counter += 1
253         end
254
255         if number > out_of
256           error_msg = "number > out_of (this should not have happened)"
257           raise StandardError, error_msg
258         end
259
260         if number == out_of
261           if !hmmscan_datas.empty?
262             process_hmmscan_datas( hmmscan_datas,
263             in_msa,
264             add_domain_number,
265             out_msa )
266             domain_pass_counter += hmmscan_datas.length
267             if passed_seqs.find_by_name_start( sequence, true ).length < 1
268               add_sequence( sequence, in_msa, passed_seqs )
269             else
270               error_msg = "this should not have happened"
271               raise StandardError, error_msg
272             end
273           else # no pass
274             if failed_seqs.find_by_name_start( sequence, true ).length < 1
275               add_sequence( sequence, in_msa, failed_seqs )
276               proteins_with_failing_domains += 1
277             else
278               error_msg = "this should not have happened"
279               raise StandardError, error_msg
280             end
281           end
282           hmmscan_datas.clear
283         end
284
285       end # results.each do | r |
286
287       if (prev_query != nil) && (!saw_target)
288         log << domain_not_present_counter.to_s + ": " + prev_query.to_s + " lacks target domain" + ld
289         domain_not_present_counter += 1
290       end
291
292       if domain_pass_counter < 1
293         error_msg = "no domain sequences were extracted"
294         raise IOError, error_msg
295       end
296
297       if ( domain_not_present_counter + passed_seqs.get_number_of_seqs + proteins_with_failing_domains ) != protein_counter
298         error_msg = "not present + passing + not passing != proteins in sequence (fasta) file (this should not have happened)"
299         raise StandardError, error_msg
300       end
301
302       puts
303       log << ld
304
305       log << ld
306       avg_pass = ( passing_target_length_sum / domain_pass_counter )
307       puts( "Passing target domain lengths: average: " + avg_pass.to_s  )
308       log << "Passing target domain lengths: average: " + avg_pass.to_s
309       log << ld
310       puts( "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + " - "  + passing_target_length_max.to_s)
311       log << "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + " - "  + passing_target_length_max.to_s
312       log << ld
313       puts( "Passing target domain iE:      min-max: " + passing_target_ie_min.to_s + " - "  + passing_target_ie_max.to_s)
314       log << "Passing target domain iE:      min-max: " + passing_target_ie_min.to_s + " - "  + passing_target_ie_max.to_s
315       log << ld
316       puts( "Passing target domains:            sum: " + domain_pass_counter.to_s  )
317       log << "Passing target domains:            sum: " + domain_pass_counter.to_s
318       log << ld
319       log << ld
320       puts
321       sum = domain_pass_counter + domain_fail_counter
322       avg_all = overall_target_length_sum / sum
323       puts( "All target domain lengths:     average: " + avg_all.to_s  )
324       log << "All target domain lengths:     average: " + avg_all.to_s
325       log << ld
326       puts( "All target domain lengths:     min-max: " + overall_target_length_min.to_s + " - "  + overall_target_length_max.to_s)
327       log << "All target domain lengths:     min-max: " + overall_target_length_min.to_s + " - "  + overall_target_length_max.to_s
328       log << ld
329       puts( "All target target domain iE:   min-max: " + overall_target_ie_min.to_s + " - "  + overall_target_ie_max.to_s)
330       log << "All target target domain iE:   min-max: " + overall_target_ie_min.to_s + " - "  + overall_target_ie_max.to_s
331       log << ld
332       puts( "All target domains:                sum: " + sum.to_s  )
333       log << "All target domains:                sum: " + sum.to_s
334
335       puts
336       puts( "Proteins with passing target domain(s): " + passed_multi_seqs.get_number_of_seqs.to_s )
337       #puts( "Proteins with passing target domain(s): " + passed_seqs.get_number_of_seqs.to_s )
338       puts( "Proteins with no passing target domain: " + proteins_with_failing_domains.to_s )
339       puts( "Proteins with no target domain        : " + domain_not_present_counter.to_s )
340
341       log << ld
342       log << ld
343       puts
344       puts( "Max target domain copy number per protein: " + max_domain_copy_number_per_protein.to_s )
345       log << "Max target domain copy number per protein: " + max_domain_copy_number_per_protein.to_s
346       log << ld
347
348       if ( max_domain_copy_number_per_protein > 1 )
349         puts( "First target protein with this copy number: " + max_domain_copy_number_sequence )
350         log << "First target protein with this copy number: " + max_domain_copy_number_sequence
351         log << ld
352       end
353
354       write_msa( out_msa, outfile + ".fasta"  )
355       write_msa( passed_multi_seqs, passed_seqs_outfile )
356       write_msa( failed_seqs, failed_seqs_outfile )
357
358       log << ld
359       log << "passing target domains                       : " + domain_pass_counter.to_s + ld
360       log << "failing target domains                       : " + domain_fail_counter.to_s + ld
361       log << "proteins in sequence (fasta) file            : " + in_msa.get_number_of_seqs.to_s + ld
362       log << "proteins in hmmscan outputfile               : " + protein_counter.to_s + ld
363       log << "proteins with passing target domain(s)       : " + passed_seqs.get_number_of_seqs.to_s + ld
364       log << "proteins with no passing target domain       : " + proteins_with_failing_domains.to_s + ld
365       log << "proteins with no target domain               : " + domain_not_present_counter.to_s + ld
366
367       log << ld
368
369       return domain_pass_counter
370
371     end # parse
372
373     private
374
375     def checkit2(domains_in_query_seq, target_domain_names, target_domains, target_domain_architecture = nil)
376       domain_names_in_query_seq = Set.new
377       domains_in_query_seq.each do |domain|
378         domain_names_in_query_seq.add(domain.model)
379       end
380       if (target_domain_names.subset?(domain_names_in_query_seq))
381
382         passed_domains = Array.new
383
384         domains_in_query_seq.each do |domain|
385           if target_domains.has_key?(domain.model)
386             target_domain = target_domains[domain.model]
387             if (target_domain.i_e_value != nil) && (target_domain.i_e_value >= 0)
388               if domain.i_e_value > target_domain.i_e_value
389                 return false
390               end
391             end
392             if (target_domain.abs_len != nil) && (target_domain.abs_len > 0)
393               length = 1 + domain.env_to - domain.env_from
394               if length < target_domain.abs_len
395                 return false
396               end
397             end
398             if (target_domain.rel_len != nil) && (target_domain.rel_len > 0)
399               length = 1 + domain.env_to - domain.env_from
400               if length < (target_domain.rel_len * domain.tlen)
401                 return false
402               end
403             end
404
405             # For domain architecture comparison
406             if target_domain_architecture != nil
407               passed_domains.push(domain)
408             end
409
410             if (@passed.key?(domain.model))
411               @passed[domain.model] =  @passed[domain.model] + 1
412             else
413               @passed[domain.model] = 1
414             end
415           end
416         end
417
418       else
419         return false
420       end
421       # Compare architectures
422       if target_domain_architecture != nil
423         if !compareArchitectures(target_domain_architecture, passed_domains)
424           return false
425         end
426       end
427       true
428     end
429
430     def compareArchitectures(target_domain_architecture, passed_domains)
431       passed_domains.sort! { |a,b| a.ali_from <=> b.ali_from }
432       domain_architecture = ""
433       passed_domains.each do |domain|
434         if domain_architecture.length > 0
435           domain_architecture += ">"
436         end
437         domain_architecture += domain.model
438       end
439       pass = (target_domain_architecture == domain_architecture)
440       if !pass
441         if (@non_passsing_domain_architectures.key?(domain_architecture))
442           @non_passsing_domain_architectures[domain_architecture] = @non_passsing_domain_architectures[domain_architecture] + 1
443         else
444           @non_passsing_domain_architectures[domain_architecture] = 1
445         end
446       end
447       return pass
448     end
449
450     def write_msa( msa, filename )
451       io = MsaIO.new()
452       w = FastaWriter.new()
453       w.set_line_width( 60 )
454       w.clean( true )
455       begin
456         io.write_to_file( msa, filename, w )
457       rescue Exception
458         error_msg = "could not write to \"" + filename + "\""
459         raise IOError, error_msg
460       end
461     end
462
463     def add_sequence( sequence_name, in_msa, add_to_msa )
464       seqs = in_msa.find_by_name_start( sequence_name, true )
465       if ( seqs.length < 1 )
466         error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
467         raise StandardError, error_msg
468       end
469       if ( seqs.length > 1 )
470         error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
471         raise StandardError, error_msg
472       end
473       seq = in_msa.get_sequence( seqs[ 0 ] )
474       add_to_msa.add_sequence( seq )
475     end
476
477     def process_hmmscan_datas( hmmscan_datas,
478       in_msa,
479       add_domain_number,
480       out_msa )
481
482       actual_out_of = hmmscan_datas.size
483
484       seq_name = ""
485       prev_seq_name = nil
486
487       hmmscan_datas.each_with_index do |hmmscan_data, index|
488         if hmmscan_data.number < ( index + 1 )
489           error_msg = "hmmscan_data.number < ( index + 1 ) (this should not have happened)"
490           raise StandardError, error_msg
491         end
492
493         seq_name = hmmscan_data.seq_name
494
495         extract_domain( seq_name,
496         index + 1,
497         actual_out_of,
498         hmmscan_data.env_from,
499         hmmscan_data.env_to,
500         in_msa,
501         out_msa,
502         add_domain_number )
503
504         if prev_seq_name && prev_seq_name != seq_name
505           error_msg = "this should not have happened"
506           raise StandardError, error_msg
507         end
508         prev_seq_name = seq_name
509       end # each
510     end # def process_hmmscan_data
511
512     def extract_domain( sequence,
513       number,
514       out_of,
515       seq_from,
516       seq_to,
517       in_msa,
518       out_msa,
519       add_domain_number)
520       if number.is_a?( Fixnum ) && ( number < 1 || out_of < 1 || number > out_of )
521         error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s
522         raise StandardError, error_msg
523       end
524       if seq_from < 1 || seq_to < 1 || seq_from >= seq_to
525         error_msg = "impossible: seq-from=" + seq_from.to_s + ", seq-to=" + seq_to.to_s
526         raise StandardError, error_msg
527       end
528       seqs = in_msa.find_by_name_start( sequence, true )
529       if seqs.length < 1
530         error_msg = "sequence \"" + sequence + "\" not found in sequence file"
531         raise IOError, error_msg
532       end
533       if seqs.length > 1
534         error_msg = "sequence \"" + sequence + "\" not unique in sequence file"
535         raise IOError, error_msg
536       end
537       # hmmscan is 1 based, whereas sequences are 0 bases in this package.
538       seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
539
540       orig_name = seq.get_name
541
542       seq.set_name( orig_name.split[ 0 ] )
543
544       if out_of != 1 && add_domain_number
545         seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
546       end
547
548       out_msa.add_sequence( seq )
549     end
550
551     def is_ignorable?( line )
552       return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
553     end
554
555   end # class HmmscanDomainExtractor
556
557   class HmmscanData
558     def initialize( seq_name, number, out_of, env_from, env_to, i_e_value )
559       @seq_name = seq_name
560       @number = number
561       @out_of = out_of
562       @env_from = env_from
563       @env_to = env_to
564       @i_e_value = i_e_value
565     end
566     attr_reader :seq_name, :number, :out_of, :env_from, :env_to, :i_e_value
567   end
568
569   class TargetDomain
570     def initialize( name, i_e_value, abs_len, rel_len, position )
571       @name = name
572       @i_e_value = i_e_value
573       @abs_len = abs_len
574       @percent_len = rel_len
575       @position = position
576     end
577     attr_reader :name, :i_e_value, :abs_len, :rel_len, :position
578   end
579
580 end # module Evoruby
581