in progress...
[jalview.git] / forester / ruby / evoruby / lib / evo / io / parser / hmmscan_multi_domain_extractor.rb
1 #
2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanMultiDomainExtractor class
3 #
4 # Copyright::    Copyright (C) 2017 Christian M. Zmasek
5 # License::      GNU Lesser General Public License (LGPL)
6 #
7 # Last modified: 2017/02/20
8
9 require 'lib/evo/util/constants'
10 require 'lib/evo/msa/msa_factory'
11 require 'lib/evo/io/msa_io'
12 require 'lib/evo/io/writer/fasta_writer'
13 require 'lib/evo/io/parser/fasta_parser'
14 require 'lib/evo/io/parser/hmmscan_parser'
15
16 module Evoruby
17   class HmmscanMultiDomainExtractor
18     def initialize
19       @passed = Hash.new
20     end
21
22     # raises ArgumentError, IOError, StandardError
23     def parse( domain_id,
24       hmmscan_output,
25       fasta_sequence_file,
26       outfile,
27       passed_seqs_outfile,
28       failed_seqs_outfile,
29       e_value_threshold,
30       length_threshold,
31       add_position,
32       add_domain_number,
33       add_species,
34       log )
35
36       Util.check_file_for_readability( hmmscan_output )
37       Util.check_file_for_readability( fasta_sequence_file )
38       Util.check_file_for_writability( outfile + ".fasta" )
39       Util.check_file_for_writability( passed_seqs_outfile )
40       Util.check_file_for_writability( failed_seqs_outfile )
41
42       in_msa = nil
43       factory = MsaFactory.new()
44       in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
45
46       if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
47         error_msg = "could not find fasta sequences in " + fasta_sequence_file
48         raise IOError, error_msg
49       end
50
51       out_msa = Msa.new
52
53       failed_seqs = Msa.new
54       passed_seqs = Msa.new
55
56       ld = Constants::LINE_DELIMITER
57
58       domain_pass_counter                = 0
59       domain_fail_counter                = 0
60       passing_domains_per_protein        = 0
61       proteins_with_failing_domains      = 0
62       domain_not_present_counter         = 0
63       protein_counter                    = 1
64       max_domain_copy_number_per_protein = -1
65       max_domain_copy_number_sequence    = ""
66       passing_target_length_sum          = 0
67       overall_target_length_sum          = 0
68       overall_target_length_min          = 10000000
69       overall_target_length_max          = -1
70       passing_target_length_min          = 10000000
71       passing_target_length_max          = -1
72
73       overall_target_ie_min          = 10000000
74       overall_target_ie_max          = -1
75       passing_target_ie_min          = 10000000
76       passing_target_ie_max          = -1
77
78       hmmscan_datas = []
79
80       hmmscan_parser = HmmscanParser.new( hmmscan_output )
81       results = hmmscan_parser.parse
82
83       ####
84
85       target_domain_ary = Array.new
86
87       target_domain_ary.push(TargetDomain.new('Bcl-2', 0.01, -1, 0.5 ))
88       target_domain_ary.push(TargetDomain.new('BH4', 0.01, -1, 0.5 ))
89       target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5 ))
90       target_domain_ary.push(TargetDomain.new('Chordopox_A33R', 1000, -1, 0.1 ))
91
92       target_domains = Hash.new
93
94       target_domain_ary.each do |target_domain|
95         target_domains[target_domain.name] = target_domain
96       end
97
98       target_domain_names = Set.new
99
100       target_domains.each_key {|key| target_domain_names.add( key ) }
101
102       prev_query_seq_name = nil
103       domains_in_query_seq = Array.new
104       passing_sequences = Array.new
105       total_sequences = 0
106       @passed = Hash.new
107       results.each do |hmmscan_result|
108         if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name )
109           if checkit2(domains_in_query_seq, target_domain_names, target_domains)
110             passing_sequences.push(domains_in_query_seq)
111           end
112           domains_in_query_seq = Array.new
113           total_sequences += 1
114         end
115         prev_query_seq_name = hmmscan_result.query
116         domains_in_query_seq.push(hmmscan_result)
117       end # each result
118
119       if prev_query_seq_name != nil
120         total_sequences += 1
121         if checkit2(domains_in_query_seq, target_domain_names, target_domains)
122           passing_sequences.push(domains_in_query_seq)
123         end
124       end
125
126       passing_sequences.each do | domains |
127         domains.each do | domain |
128           puts domain.query + ": " + domain.model
129         end
130         puts
131       end
132
133       puts @passed
134       puts "total sequences  : " + total_sequences.to_s
135       puts "passing sequences: " + passing_sequences.size.to_s
136
137       ####
138
139       prev_query = nil
140       saw_target = false
141
142       results.each do | r |
143
144         if ( prev_query != nil ) && ( r.query != prev_query )
145           protein_counter += 1
146           passing_domains_per_protein = 0
147           if !saw_target
148             log << domain_not_present_counter.to_s + ": " + prev_query.to_s + " lacks target domain" + ld
149             domain_not_present_counter += 1
150           end
151           saw_target = false
152         end
153
154         prev_query = r.query
155
156         if domain_id != r.model
157           next
158         end
159
160         saw_target = true
161
162         sequence  = r.query
163         # sequence_len = r.qlen
164         number    = r.number
165         out_of    = r.out_of
166         env_from  = r.env_from
167         env_to    = r.env_to
168         i_e_value = r.i_e_value
169         prev_query = r.query
170
171         length = 1 + env_to - env_from
172
173         overall_target_length_sum += length
174         if length > overall_target_length_max
175           overall_target_length_max = length
176         end
177         if length < overall_target_length_min
178           overall_target_length_min = length
179         end
180
181         if i_e_value > overall_target_ie_max
182           overall_target_ie_max = i_e_value
183         end
184         if i_e_value < overall_target_ie_min
185           overall_target_ie_min = i_e_value
186         end
187
188         if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
189         ( ( length_threshold <= 0 ) || ( length >= length_threshold.to_f ) ) )
190           hmmscan_datas << HmmscanData.new( sequence, number, out_of, env_from, env_to, i_e_value )
191           passing_target_length_sum += length
192           passing_domains_per_protein += 1
193           if length > passing_target_length_max
194             passing_target_length_max = length
195           end
196           if length < passing_target_length_min
197             passing_target_length_min = length
198           end
199           if i_e_value > passing_target_ie_max
200             passing_target_ie_max = i_e_value
201           end
202           if i_e_value < passing_target_ie_min
203             passing_target_ie_min = i_e_value
204           end
205           if ( passing_domains_per_protein > max_domain_copy_number_per_protein )
206             max_domain_copy_number_sequence    = sequence
207             max_domain_copy_number_per_protein = passing_domains_per_protein
208           end
209         else # no pass
210           log << domain_fail_counter.to_s + ": " + sequence.to_s + " fails threshold(s)"
211           if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) )
212             log << " iE=" + i_e_value.to_s
213           end
214           if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f )
215             le = env_to - env_from + 1
216             log << " l=" + le.to_s
217           end
218           log << ld
219           domain_fail_counter += 1
220         end
221
222         if number > out_of
223           error_msg = "number > out_of (this should not have happened)"
224           raise StandardError, error_msg
225         end
226
227         if number == out_of
228           if !hmmscan_datas.empty?
229             process_hmmscan_datas( hmmscan_datas,
230             in_msa,
231             add_position,
232             add_domain_number,
233             add_species,
234             out_msa )
235             domain_pass_counter += hmmscan_datas.length
236             if passed_seqs.find_by_name_start( sequence, true ).length < 1
237               add_sequence( sequence, in_msa, passed_seqs )
238             else
239               error_msg = "this should not have happened"
240               raise StandardError, error_msg
241             end
242           else # no pass
243             if failed_seqs.find_by_name_start( sequence, true ).length < 1
244               add_sequence( sequence, in_msa, failed_seqs )
245               proteins_with_failing_domains += 1
246             else
247               error_msg = "this should not have happened"
248               raise StandardError, error_msg
249             end
250           end
251           hmmscan_datas.clear
252         end
253
254       end # results.each do | r |
255
256       if (prev_query != nil) && (!saw_target)
257         log << domain_not_present_counter.to_s + ": " + prev_query.to_s + " lacks target domain" + ld
258         domain_not_present_counter += 1
259       end
260
261       if domain_pass_counter < 1
262         error_msg = "no domain sequences were extracted"
263         raise IOError, error_msg
264       end
265
266       if ( domain_not_present_counter + passed_seqs.get_number_of_seqs + proteins_with_failing_domains ) != protein_counter
267         error_msg = "not present + passing + not passing != proteins in sequence (fasta) file (this should not have happened)"
268         raise StandardError, error_msg
269       end
270
271       puts
272       log << ld
273
274       log << ld
275       avg_pass = ( passing_target_length_sum / domain_pass_counter )
276       puts( "Passing target domain lengths: average: " + avg_pass.to_s  )
277       log << "Passing target domain lengths: average: " + avg_pass.to_s
278       log << ld
279       puts( "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + " - "  + passing_target_length_max.to_s)
280       log << "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + " - "  + passing_target_length_max.to_s
281       log << ld
282       puts( "Passing target domain iE:      min-max: " + passing_target_ie_min.to_s + " - "  + passing_target_ie_max.to_s)
283       log << "Passing target domain iE:      min-max: " + passing_target_ie_min.to_s + " - "  + passing_target_ie_max.to_s
284       log << ld
285       puts( "Passing target domains:            sum: " + domain_pass_counter.to_s  )
286       log << "Passing target domains:            sum: " + domain_pass_counter.to_s
287       log << ld
288       log << ld
289       puts
290       sum = domain_pass_counter + domain_fail_counter
291       avg_all = overall_target_length_sum / sum
292       puts( "All target domain lengths:     average: " + avg_all.to_s  )
293       log << "All target domain lengths:     average: " + avg_all.to_s
294       log << ld
295       puts( "All target domain lengths:     min-max: " + overall_target_length_min.to_s + " - "  + overall_target_length_max.to_s)
296       log << "All target domain lengths:     min-max: " + overall_target_length_min.to_s + " - "  + overall_target_length_max.to_s
297       log << ld
298       puts( "All target target domain iE:   min-max: " + overall_target_ie_min.to_s + " - "  + overall_target_ie_max.to_s)
299       log << "All target target domain iE:   min-max: " + overall_target_ie_min.to_s + " - "  + overall_target_ie_max.to_s
300       log << ld
301       puts( "All target domains:                sum: " + sum.to_s  )
302       log << "All target domains:                sum: " + sum.to_s
303
304       puts
305       puts( "Proteins with passing target domain(s): " + passed_seqs.get_number_of_seqs.to_s )
306       puts( "Proteins with no passing target domain: " + proteins_with_failing_domains.to_s )
307       puts( "Proteins with no target domain        : " + domain_not_present_counter.to_s )
308
309       log << ld
310       log << ld
311       puts
312       puts( "Max target domain copy number per protein: " + max_domain_copy_number_per_protein.to_s )
313       log << "Max target domain copy number per protein: " + max_domain_copy_number_per_protein.to_s
314       log << ld
315
316       if ( max_domain_copy_number_per_protein > 1 )
317         puts( "First target protein with this copy number: " + max_domain_copy_number_sequence )
318         log << "First target protein with this copy number: " + max_domain_copy_number_sequence
319         log << ld
320       end
321
322       write_msa( out_msa, outfile + ".fasta"  )
323       write_msa( passed_seqs, passed_seqs_outfile )
324       write_msa( failed_seqs, failed_seqs_outfile )
325
326       log << ld
327       log << "passing target domains                       : " + domain_pass_counter.to_s + ld
328       log << "failing target domains                       : " + domain_fail_counter.to_s + ld
329       log << "proteins in sequence (fasta) file            : " + in_msa.get_number_of_seqs.to_s + ld
330       log << "proteins in hmmscan outputfile               : " + protein_counter.to_s + ld
331       log << "proteins with passing target domain(s)       : " + passed_seqs.get_number_of_seqs.to_s + ld
332       log << "proteins with no passing target domain       : " + proteins_with_failing_domains.to_s + ld
333       log << "proteins with no target domain               : " + domain_not_present_counter.to_s + ld
334
335       log << ld
336
337       return domain_pass_counter
338
339     end # parse
340
341     private
342
343     def checkit2(domains_in_query_seq, target_domain_names, target_domains)
344       domain_names_in_query_seq = Set.new
345       domains_in_query_seq.each do |domain|
346         domain_names_in_query_seq.add(domain.model)
347       end
348       if (target_domain_names.subset?(domain_names_in_query_seq))
349
350         domains_in_query_seq.each do |domain|
351           if target_domains.has_key?(domain.model)
352             target_domain = target_domains[domain.model]
353             if (target_domain.i_e_value != nil) && (target_domain.i_e_value >= 0)
354               if domain.i_e_value > target_domain.i_e_value
355                 return false
356               end
357             end
358             if (target_domain.abs_len != nil) && (target_domain.abs_len > 0)
359               length = 1 + domain.env_to - domain.env_from
360               if length < target_domain.abs_len
361                 return false
362               end
363             end
364             if (target_domain.rel_len != nil) && (target_domain.rel_len > 0)
365               length = 1 + domain.env_to - domain.env_from
366               if length < (target_domain.rel_len * domain.tlen)
367                 return false
368               end
369             end
370             #puts domain.model + " passed"
371             if ( @passed.key?(domain.model) )
372               @passed[domain.model] =  @passed[domain.model] + 1
373             else
374               @passed[domain.model] = 1
375             end
376           end
377         end
378
379       else
380         return false
381       end
382       true
383     end
384
385     def write_msa( msa, filename )
386       io = MsaIO.new()
387       w = FastaWriter.new()
388       w.set_line_width( 60 )
389       w.clean( true )
390       begin
391         io.write_to_file( msa, filename, w )
392       rescue Exception
393         error_msg = "could not write to \"" + filename + "\""
394         raise IOError, error_msg
395       end
396     end
397
398     def add_sequence( sequence_name, in_msa, add_to_msa )
399       seqs = in_msa.find_by_name_start( sequence_name, true )
400       if ( seqs.length < 1 )
401         error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
402         raise StandardError, error_msg
403       end
404       if ( seqs.length > 1 )
405         error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
406         raise StandardError, error_msg
407       end
408       seq = in_msa.get_sequence( seqs[ 0 ] )
409       add_to_msa.add_sequence( seq )
410     end
411
412     def process_hmmscan_datas( hmmscan_datas,
413       in_msa,
414       add_position,
415       add_domain_number,
416       add_species,
417       out_msa )
418
419       actual_out_of = hmmscan_datas.size
420
421       seq_name = ""
422       prev_seq_name = nil
423
424       hmmscan_datas.each_with_index do |hmmscan_data, index|
425         if hmmscan_data.number < ( index + 1 )
426           error_msg = "hmmscan_data.number < ( index + 1 ) (this should not have happened)"
427           raise StandardError, error_msg
428         end
429
430         seq_name = hmmscan_data.seq_name
431
432         extract_domain( seq_name,
433         index + 1,
434         actual_out_of,
435         hmmscan_data.env_from,
436         hmmscan_data.env_to,
437         in_msa,
438         out_msa,
439         add_position,
440         add_domain_number,
441         add_species )
442
443         if prev_seq_name && prev_seq_name != seq_name
444           error_msg = "this should not have happened"
445           raise StandardError, error_msg
446         end
447         prev_seq_name = seq_name
448       end # each
449     end # def process_hmmscan_data
450
451     def extract_domain( sequence,
452       number,
453       out_of,
454       seq_from,
455       seq_to,
456       in_msa,
457       out_msa,
458       add_position,
459       add_domain_number,
460       add_species )
461       if number.is_a?( Fixnum ) && ( number < 1 || out_of < 1 || number > out_of )
462         error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s
463         raise StandardError, error_msg
464       end
465       if seq_from < 1 || seq_to < 1 || seq_from >= seq_to
466         error_msg = "impossible: seq-from=" + seq_from.to_s + ", seq-to=" + seq_to.to_s
467         raise StandardError, error_msg
468       end
469       seqs = in_msa.find_by_name_start( sequence, true )
470       if seqs.length < 1
471         error_msg = "sequence \"" + sequence + "\" not found in sequence file"
472         raise IOError, error_msg
473       end
474       if seqs.length > 1
475         error_msg = "sequence \"" + sequence + "\" not unique in sequence file"
476         raise IOError, error_msg
477       end
478       # hmmscan is 1 based, whereas sequences are 0 bases in this package.
479       seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
480
481       orig_name = seq.get_name
482
483       seq.set_name( orig_name.split[ 0 ] )
484
485       if add_position
486         seq.set_name( seq.get_name + "_" + seq_from.to_s + "-" + seq_to.to_s )
487       end
488
489       if out_of != 1 && add_domain_number
490         seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
491       end
492
493       if add_species
494         a = orig_name.rindex "["
495         b = orig_name.rindex "]"
496         unless a && b
497           error_msg = "species not found in " + orig_name
498           raise StandardError, error_msg
499         end
500         species = orig_name[ a .. b ]
501         seq.set_name( seq.get_name + " " + species )
502       end
503       out_msa.add_sequence( seq )
504     end
505
506     def is_ignorable?( line )
507       return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
508     end
509
510   end # class HmmscanDomainExtractor
511
512   class HmmscanData
513     def initialize( seq_name, number, out_of, env_from, env_to, i_e_value )
514       @seq_name = seq_name
515       @number = number
516       @out_of = out_of
517       @env_from = env_from
518       @env_to = env_to
519       @i_e_value = i_e_value
520     end
521     attr_reader :seq_name, :number, :out_of, :env_from, :env_to, :i_e_value
522   end
523
524   class TargetDomain
525     def initialize( name, i_e_value, abs_len, rel_len )
526       @name = name
527       @i_e_value = i_e_value
528       @abs_len = abs_len
529       @percent_len = rel_len
530     end
531     attr_reader :name, :i_e_value, :abs_len, :rel_len
532   end
533
534 end # module Evoruby
535