in progress... multi-domain extractor started
[jalview.git] / forester / ruby / evoruby / lib / evo / io / parser / hmmscan_multi_domain_extractor.rb
1 #
2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanMultiDomainExtractor class
3 #
4 # Copyright::    Copyright (C) 2017 Christian M. Zmasek
5 # License::      GNU Lesser General Public License (LGPL)
6 #
7 # Last modified: 2017/02/20
8
9 require 'lib/evo/util/constants'
10 require 'lib/evo/msa/msa_factory'
11 require 'lib/evo/io/msa_io'
12 require 'lib/evo/io/writer/fasta_writer'
13 require 'lib/evo/io/parser/fasta_parser'
14 require 'lib/evo/io/parser/hmmscan_parser'
15
16 module Evoruby
17   class HmmscanMultiDomainExtractor
18     def initialize
19     end
20
21     # raises ArgumentError, IOError, StandardError
22     def parse( domain_id,
23       hmmscan_output,
24       fasta_sequence_file,
25       outfile,
26       passed_seqs_outfile,
27       failed_seqs_outfile,
28       e_value_threshold,
29       length_threshold,
30       add_position,
31       add_domain_number,
32       add_species,
33       log )
34
35       Util.check_file_for_readability( hmmscan_output )
36       Util.check_file_for_readability( fasta_sequence_file )
37       Util.check_file_for_writability( outfile + ".fasta" )
38       Util.check_file_for_writability( passed_seqs_outfile )
39       Util.check_file_for_writability( failed_seqs_outfile )
40
41       in_msa = nil
42       factory = MsaFactory.new()
43       in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
44
45       if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
46         error_msg = "could not find fasta sequences in " + fasta_sequence_file
47         raise IOError, error_msg
48       end
49
50       out_msa = Msa.new
51
52       failed_seqs = Msa.new
53       passed_seqs = Msa.new
54
55       ld = Constants::LINE_DELIMITER
56
57       domain_pass_counter                = 0
58       domain_fail_counter                = 0
59       passing_domains_per_protein        = 0
60       proteins_with_failing_domains      = 0
61       domain_not_present_counter         = 0
62       protein_counter                    = 1
63       max_domain_copy_number_per_protein = -1
64       max_domain_copy_number_sequence    = ""
65       passing_target_length_sum          = 0
66       overall_target_length_sum          = 0
67       overall_target_length_min          = 10000000
68       overall_target_length_max          = -1
69       passing_target_length_min          = 10000000
70       passing_target_length_max          = -1
71
72       overall_target_ie_min          = 10000000
73       overall_target_ie_max          = -1
74       passing_target_ie_min          = 10000000
75       passing_target_ie_max          = -1
76
77       hmmscan_datas = []
78
79       hmmscan_parser = HmmscanParser.new( hmmscan_output )
80       results = hmmscan_parser.parse
81
82       ####
83
84       pc0 = PassCondition.new('Bcl-2', 0.01, -1, 0.5 )
85
86       pcs = Hash.new
87       pcs["Bcl-2"] = pc0
88
89       prev_query = nil
90       domains_in_query_ary = Array.new
91       queries_ary = Array.new
92       results.each do |hmmscan_result|
93         if ( prev_query != nil ) && ( hmmscan_result.query != prev_query )
94           ###--
95           pass = true
96           domains_in_query_ary.each do |domain_in_query|
97             if pcs.has_key?(domain_in_query.model)
98               pc = pcs[domain_in_query.model]
99               #  @abs_len = abs_len
100               #  @percent_len = rel_len
101               if (pc.i_e_value != nil) && (pc.i_e_value >= 0)
102                 if domain_in_query.i_e_value > pc.i_e_value
103                   pass = false
104                   #break
105                 end
106               end
107               if (pc.abs_len != nil) && (pc.abs_len > 0)
108                 length = 1 + domain_in_query.env_to - domain_in_query.env_from
109                 if length < pc.abs_len
110                   pass = false
111                   #break
112                 end
113               end
114               if (pc.rel_len != nil) && (pc.rel_len > 0)
115                 length = 1 + domain_in_query.env_to - domain_in_query.env_from
116                 if length < (pc.rel_len * domain_in_query.tlen)
117                   pass = false
118                   #break
119                 end
120               end
121             end
122           end
123           if pass == true
124             queries_ary.push(domains_in_query_ary)
125           end
126           domains_in_query_ary = Array.new
127           ###--
128         end
129         prev_query = hmmscan_result.query
130         domains_in_query_ary.push(hmmscan_result)
131       end
132       if prev_query != nil
133         queries_ary.push(domains_in_query_ary)
134       end
135       puts  queries_ary[0][0].model
136       puts  queries_ary[0][0].i_e_value
137       puts  queries_ary[0][1].model
138       puts  queries_ary[0][2].model
139       puts  queries_ary[1][0].model
140       puts  queries_ary[1][0].i_e_value
141       puts  queries_ary[1][1].model
142       puts  queries_ary[2][2].model
143       queries_ary.each do | query_ary |
144         query_ary.each do | domain |
145           # puts domain.model
146         end
147         #puts
148       end
149
150       ####
151
152       prev_query = nil
153       saw_target = false
154
155       results.each do | r |
156
157         if ( prev_query != nil ) && ( r.query != prev_query )
158           protein_counter += 1
159           passing_domains_per_protein = 0
160           if !saw_target
161             log << domain_not_present_counter.to_s + ": " + prev_query.to_s + " lacks target domain" + ld
162             domain_not_present_counter += 1
163           end
164           saw_target = false
165         end
166
167         prev_query = r.query
168
169         if domain_id != r.model
170           next
171         end
172
173         saw_target = true
174
175         #   target    = r.model
176         sequence  = r.query
177         # sequence_len = r.qlen
178         number    = r.number
179         out_of    = r.out_of
180         env_from  = r.env_from
181         env_to    = r.env_to
182         i_e_value = r.i_e_value
183         prev_query = r.query
184
185         length = 1 + env_to - env_from
186
187         overall_target_length_sum += length
188         if length > overall_target_length_max
189           overall_target_length_max = length
190         end
191         if length < overall_target_length_min
192           overall_target_length_min = length
193         end
194
195         if i_e_value > overall_target_ie_max
196           overall_target_ie_max = i_e_value
197         end
198         if i_e_value < overall_target_ie_min
199           overall_target_ie_min = i_e_value
200         end
201
202         if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
203         ( ( length_threshold <= 0 ) || ( length >= length_threshold.to_f ) ) )
204           hmmscan_datas << HmmscanData.new( sequence, number, out_of, env_from, env_to, i_e_value )
205           passing_target_length_sum += length
206           passing_domains_per_protein += 1
207           if length > passing_target_length_max
208             passing_target_length_max = length
209           end
210           if length < passing_target_length_min
211             passing_target_length_min = length
212           end
213           if i_e_value > passing_target_ie_max
214             passing_target_ie_max = i_e_value
215           end
216           if i_e_value < passing_target_ie_min
217             passing_target_ie_min = i_e_value
218           end
219           if ( passing_domains_per_protein > max_domain_copy_number_per_protein )
220             max_domain_copy_number_sequence    = sequence
221             max_domain_copy_number_per_protein = passing_domains_per_protein
222           end
223         else # no pass
224           log << domain_fail_counter.to_s + ": " + sequence.to_s + " fails threshold(s)"
225           if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) )
226             log << " iE=" + i_e_value.to_s
227           end
228           if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f )
229             le = env_to - env_from + 1
230             log << " l=" + le.to_s
231           end
232           log << ld
233           domain_fail_counter += 1
234         end
235
236         if number > out_of
237           error_msg = "number > out_of (this should not have happened)"
238           raise StandardError, error_msg
239         end
240
241         if number == out_of
242           if !hmmscan_datas.empty?
243             process_hmmscan_datas( hmmscan_datas,
244             in_msa,
245             add_position,
246             add_domain_number,
247             add_species,
248             out_msa )
249             domain_pass_counter += hmmscan_datas.length
250             if passed_seqs.find_by_name_start( sequence, true ).length < 1
251               add_sequence( sequence, in_msa, passed_seqs )
252             else
253               error_msg = "this should not have happened"
254               raise StandardError, error_msg
255             end
256           else # no pass
257             if failed_seqs.find_by_name_start( sequence, true ).length < 1
258               add_sequence( sequence, in_msa, failed_seqs )
259               proteins_with_failing_domains += 1
260             else
261               error_msg = "this should not have happened"
262               raise StandardError, error_msg
263             end
264           end
265           hmmscan_datas.clear
266         end
267
268       end # results.each do | r |
269
270       if (prev_query != nil) && (!saw_target)
271         log << domain_not_present_counter.to_s + ": " + prev_query.to_s + " lacks target domain" + ld
272         domain_not_present_counter += 1
273       end
274
275       if domain_pass_counter < 1
276         error_msg = "no domain sequences were extracted"
277         raise IOError, error_msg
278       end
279
280       if ( domain_not_present_counter + passed_seqs.get_number_of_seqs + proteins_with_failing_domains ) != protein_counter
281         error_msg = "not present + passing + not passing != proteins in sequence (fasta) file (this should not have happened)"
282         raise StandardError, error_msg
283       end
284
285       puts
286       log << ld
287
288       log << ld
289       avg_pass = ( passing_target_length_sum / domain_pass_counter )
290       puts( "Passing target domain lengths: average: " + avg_pass.to_s  )
291       log << "Passing target domain lengths: average: " + avg_pass.to_s
292       log << ld
293       puts( "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + " - "  + passing_target_length_max.to_s)
294       log << "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + " - "  + passing_target_length_max.to_s
295       log << ld
296       puts( "Passing target domain iE:      min-max: " + passing_target_ie_min.to_s + " - "  + passing_target_ie_max.to_s)
297       log << "Passing target domain iE:      min-max: " + passing_target_ie_min.to_s + " - "  + passing_target_ie_max.to_s
298       log << ld
299       puts( "Passing target domains:            sum: " + domain_pass_counter.to_s  )
300       log << "Passing target domains:            sum: " + domain_pass_counter.to_s
301       log << ld
302       log << ld
303       puts
304       sum = domain_pass_counter + domain_fail_counter
305       avg_all = overall_target_length_sum / sum
306       puts( "All target domain lengths:     average: " + avg_all.to_s  )
307       log << "All target domain lengths:     average: " + avg_all.to_s
308       log << ld
309       puts( "All target domain lengths:     min-max: " + overall_target_length_min.to_s + " - "  + overall_target_length_max.to_s)
310       log << "All target domain lengths:     min-max: " + overall_target_length_min.to_s + " - "  + overall_target_length_max.to_s
311       log << ld
312       puts( "All target target domain iE:   min-max: " + overall_target_ie_min.to_s + " - "  + overall_target_ie_max.to_s)
313       log << "All target target domain iE:   min-max: " + overall_target_ie_min.to_s + " - "  + overall_target_ie_max.to_s
314       log << ld
315       puts( "All target domains:                sum: " + sum.to_s  )
316       log << "All target domains:                sum: " + sum.to_s
317
318       puts
319       puts( "Proteins with passing target domain(s): " + passed_seqs.get_number_of_seqs.to_s )
320       puts( "Proteins with no passing target domain: " + proteins_with_failing_domains.to_s )
321       puts( "Proteins with no target domain        : " + domain_not_present_counter.to_s )
322
323       log << ld
324       log << ld
325       puts
326       puts( "Max target domain copy number per protein: " + max_domain_copy_number_per_protein.to_s )
327       log << "Max target domain copy number per protein: " + max_domain_copy_number_per_protein.to_s
328       log << ld
329
330       if ( max_domain_copy_number_per_protein > 1 )
331         puts( "First target protein with this copy number: " + max_domain_copy_number_sequence )
332         log << "First target protein with this copy number: " + max_domain_copy_number_sequence
333         log << ld
334       end
335
336       write_msa( out_msa, outfile + ".fasta"  )
337       write_msa( passed_seqs, passed_seqs_outfile )
338       write_msa( failed_seqs, failed_seqs_outfile )
339
340       log << ld
341       log << "passing target domains                       : " + domain_pass_counter.to_s + ld
342       log << "failing target domains                       : " + domain_fail_counter.to_s + ld
343       log << "proteins in sequence (fasta) file            : " + in_msa.get_number_of_seqs.to_s + ld
344       log << "proteins in hmmscan outputfile               : " + protein_counter.to_s + ld
345       log << "proteins with passing target domain(s)       : " + passed_seqs.get_number_of_seqs.to_s + ld
346       log << "proteins with no passing target domain       : " + proteins_with_failing_domains.to_s + ld
347       log << "proteins with no target domain               : " + domain_not_present_counter.to_s + ld
348
349       log << ld
350
351       return domain_pass_counter
352
353     end # parse
354
355     private
356
357     def write_msa( msa, filename )
358       io = MsaIO.new()
359       w = FastaWriter.new()
360       w.set_line_width( 60 )
361       w.clean( true )
362       begin
363         io.write_to_file( msa, filename, w )
364       rescue Exception
365         error_msg = "could not write to \"" + filename + "\""
366         raise IOError, error_msg
367       end
368     end
369
370     def add_sequence( sequence_name, in_msa, add_to_msa )
371       seqs = in_msa.find_by_name_start( sequence_name, true )
372       if ( seqs.length < 1 )
373         error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
374         raise StandardError, error_msg
375       end
376       if ( seqs.length > 1 )
377         error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
378         raise StandardError, error_msg
379       end
380       seq = in_msa.get_sequence( seqs[ 0 ] )
381       add_to_msa.add_sequence( seq )
382     end
383
384     def process_hmmscan_datas( hmmscan_datas,
385       in_msa,
386       add_position,
387       add_domain_number,
388       add_species,
389       out_msa )
390
391       actual_out_of = hmmscan_datas.size
392
393       seq_name = ""
394       prev_seq_name = nil
395
396       hmmscan_datas.each_with_index do |hmmscan_data, index|
397         if hmmscan_data.number < ( index + 1 )
398           error_msg = "hmmscan_data.number < ( index + 1 ) (this should not have happened)"
399           raise StandardError, error_msg
400         end
401
402         seq_name = hmmscan_data.seq_name
403
404         extract_domain( seq_name,
405         index + 1,
406         actual_out_of,
407         hmmscan_data.env_from,
408         hmmscan_data.env_to,
409         in_msa,
410         out_msa,
411         add_position,
412         add_domain_number,
413         add_species )
414
415         if prev_seq_name && prev_seq_name != seq_name
416           error_msg = "this should not have happened"
417           raise StandardError, error_msg
418         end
419         prev_seq_name = seq_name
420       end # each
421     end # def process_hmmscan_data
422
423     def extract_domain( sequence,
424       number,
425       out_of,
426       seq_from,
427       seq_to,
428       in_msa,
429       out_msa,
430       add_position,
431       add_domain_number,
432       add_species )
433       if number.is_a?( Fixnum ) && ( number < 1 || out_of < 1 || number > out_of )
434         error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s
435         raise StandardError, error_msg
436       end
437       if seq_from < 1 || seq_to < 1 || seq_from >= seq_to
438         error_msg = "impossible: seq-from=" + seq_from.to_s + ", seq-to=" + seq_to.to_s
439         raise StandardError, error_msg
440       end
441       seqs = in_msa.find_by_name_start( sequence, true )
442       if seqs.length < 1
443         error_msg = "sequence \"" + sequence + "\" not found in sequence file"
444         raise IOError, error_msg
445       end
446       if seqs.length > 1
447         error_msg = "sequence \"" + sequence + "\" not unique in sequence file"
448         raise IOError, error_msg
449       end
450       # hmmscan is 1 based, whereas sequences are 0 bases in this package.
451       seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
452
453       orig_name = seq.get_name
454
455       seq.set_name( orig_name.split[ 0 ] )
456
457       if add_position
458         seq.set_name( seq.get_name + "_" + seq_from.to_s + "-" + seq_to.to_s )
459       end
460
461       if out_of != 1 && add_domain_number
462         seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
463       end
464
465       if add_species
466         a = orig_name.rindex "["
467         b = orig_name.rindex "]"
468         unless a && b
469           error_msg = "species not found in " + orig_name
470           raise StandardError, error_msg
471         end
472         species = orig_name[ a .. b ]
473         seq.set_name( seq.get_name + " " + species )
474       end
475       out_msa.add_sequence( seq )
476     end
477
478     def is_ignorable?( line )
479       return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
480     end
481
482   end # class HmmscanDomainExtractor
483
484   class HmmscanData
485     def initialize( seq_name, number, out_of, env_from, env_to, i_e_value )
486       @seq_name = seq_name
487       @number = number
488       @out_of = out_of
489       @env_from = env_from
490       @env_to = env_to
491       @i_e_value = i_e_value
492     end
493     attr_reader :seq_name, :number, :out_of, :env_from, :env_to, :i_e_value
494   end
495
496   class PassCondition
497     def initialize( hmm, i_e_value, abs_len, rel_len )
498       @hmm = hmm
499       @i_e_value = i_e_value
500       @abs_len = abs_len
501       @percent_len = rel_len
502     end
503     attr_reader :hmm, :i_e_value, :abs_len, :rel_len
504   end
505
506 end # module Evoruby
507