in progress...
[jalview.git] / forester / ruby / evoruby / lib / evo / io / parser / hmmscan_multi_domain_extractor.rb
1 #
2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanMultiDomainExtractor class
3 #
4 # Copyright::    Copyright (C) 2017 Christian M. Zmasek
5 # License::      GNU Lesser General Public License (LGPL)
6 #
7 # Last modified: 2017/02/20
8
9 require 'lib/evo/util/constants'
10 require 'lib/evo/msa/msa_factory'
11 require 'lib/evo/io/msa_io'
12 require 'lib/evo/io/writer/fasta_writer'
13 require 'lib/evo/io/parser/fasta_parser'
14 require 'lib/evo/io/parser/hmmscan_parser'
15
16 module Evoruby
17   class HmmscanMultiDomainExtractor
18     def initialize
19       @passed = Hash.new
20       @non_passsing_domain_architectures = Hash.new
21     end
22
23     # raises ArgumentError, IOError, StandardError
24     def parse( domain_id,
25       hmmscan_output,
26       fasta_sequence_file,
27       outfile,
28       passed_seqs_outfile,
29       failed_seqs_outfile,
30       e_value_threshold,
31       length_threshold,
32       add_position,
33       add_domain_number,
34       add_species,
35       log )
36
37       Util.check_file_for_readability( hmmscan_output )
38       Util.check_file_for_readability( fasta_sequence_file )
39       Util.check_file_for_writability( outfile + ".fasta" )
40       Util.check_file_for_writability( passed_seqs_outfile )
41       Util.check_file_for_writability( failed_seqs_outfile )
42
43       in_msa = nil
44       factory = MsaFactory.new()
45       in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
46
47       if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
48         error_msg = "could not find fasta sequences in " + fasta_sequence_file
49         raise IOError, error_msg
50       end
51
52       out_msa = Msa.new
53
54       failed_seqs = Msa.new
55       passed_seqs = Msa.new
56       passed_multi_seqs = Msa.new
57
58       ld = Constants::LINE_DELIMITER
59
60       domain_pass_counter                = 0
61       domain_fail_counter                = 0
62       passing_domains_per_protein        = 0
63       proteins_with_failing_domains      = 0
64       domain_not_present_counter         = 0
65       protein_counter                    = 1
66       max_domain_copy_number_per_protein = -1
67       max_domain_copy_number_sequence    = ""
68       passing_target_length_sum          = 0
69       overall_target_length_sum          = 0
70       overall_target_length_min          = 10000000
71       overall_target_length_max          = -1
72       passing_target_length_min          = 10000000
73       passing_target_length_max          = -1
74
75       overall_target_ie_min          = 10000000
76       overall_target_ie_max          = -1
77       passing_target_ie_min          = 10000000
78       passing_target_ie_max          = -1
79
80       hmmscan_datas = []
81
82       hmmscan_parser = HmmscanParser.new( hmmscan_output )
83       results = hmmscan_parser.parse
84
85       ####
86       # Import: if multiple copies of same domain, threshold need to be same!
87       target_domain_ary = Array.new
88       target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.6, 1 ))
89       target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.6, 1 ))
90       # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 ))
91       # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 ))
92
93       #target_domain_ary.push(TargetDomain.new('BH4', 0.1, -1, 0.5, 0 ))
94       #target_domain_ary.push(TargetDomain.new('Bcl-2', 0.1, -1, 0.5, 1 ))
95       # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 ))
96
97       #  target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 ))
98       #  target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 1 ))
99
100       #target_domain_ary.push(TargetDomain.new('Chordopox_A33R', 1000, -1, 0.1 ))
101
102       target_domains = Hash.new
103
104       target_domain_architecure = ""
105
106       target_domain_ary.each do |target_domain|
107         target_domains[target_domain.name] = target_domain
108         if target_domain_architecure.length > 0
109           target_domain_architecure += ">"
110         end
111         target_domain_architecure += target_domain.name
112       end
113
114       target_domain_architecure.freeze
115
116       puts target_domain_architecure
117
118       target_domain_names = Set.new
119
120       target_domains.each_key {|key| target_domain_names.add( key ) }
121
122       prev_query_seq_name = nil
123       domains_in_query_seq = Array.new
124       passing_sequences = Array.new
125       total_sequences = 0
126       @passed = Hash.new
127       out_domain_msas = Hash.new
128       out_domain_architecture_msa = Msa.new
129       results.each do |hmmscan_result|
130         if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name )
131           if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, target_domain_architecure)
132             passing_sequences.push(domains_in_query_seq)
133           end
134           domains_in_query_seq = Array.new
135           total_sequences += 1
136         end
137         prev_query_seq_name = hmmscan_result.query
138         domains_in_query_seq.push(hmmscan_result)
139       end # each result
140
141       if prev_query_seq_name != nil
142         total_sequences += 1
143         if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, target_domain_architecure)
144           passing_sequences.push(domains_in_query_seq)
145         end
146       end
147
148       out_domain_msas.keys.sort.each do |domain_name|
149         puts "writing #{domain_name}"
150         write_msa( out_domain_msas[domain_name], domain_name + ".fasta" )
151       end
152
153       puts "writing target_domain_architecure"
154       write_msa( out_domain_architecture_msa, "target_domain_architecure" + ".fasta" )
155
156       passing_sequences.each do | domains |
157
158         seq = domains[0].query
159         # puts seq + "::"
160         ################
161         if passed_seqs.find_by_name_start( seq, true ).length < 1
162           add_sequence( seq, in_msa, passed_multi_seqs )
163         else
164           error_msg = "this should not have happened"
165           raise StandardError, error_msg
166         end
167         ################
168
169         domains.each do | domain |
170           #puts domain.query + ": " + domain.model
171         end
172         #puts
173       end
174
175       puts @non_passsing_domain_architectures
176
177       puts @passed
178       puts "total sequences  : " + total_sequences.to_s
179       puts "passing sequences: " + passing_sequences.size.to_s
180
181       # write_msa( out_msa, outfile + ".fasta"  )
182       # write_msa( passed_multi_seqs, passed_seqs_outfile )
183       # write_msa( failed_seqs, failed_seqs_outfile )
184
185       log << ld
186       log << "passing target domains                       : " + domain_pass_counter.to_s + ld
187       log << "failing target domains                       : " + domain_fail_counter.to_s + ld
188       log << "proteins in sequence (fasta) file            : " + in_msa.get_number_of_seqs.to_s + ld
189       log << "proteins in hmmscan outputfile               : " + protein_counter.to_s + ld
190       log << "proteins with passing target domain(s)       : " + passed_seqs.get_number_of_seqs.to_s + ld
191       log << "proteins with no passing target domain       : " + proteins_with_failing_domains.to_s + ld
192       log << "proteins with no target domain               : " + domain_not_present_counter.to_s + ld
193
194       log << ld
195
196       return domain_pass_counter
197
198     end # parse
199
200     private
201
202     # domains_in_query_seq: Array of HmmscanResult
203     # target_domain_names: Set of String
204     # target_domains: Hash String->TargetDomain
205     # target_domain_architecture: String
206     def checkit2(domains_in_query_seq,
207       target_domain_names,
208       target_domains,
209       in_msa,
210       out_domain_msas,
211       out_domain_architecture_msa,
212       target_domain_architecture = nil)
213
214       domain_names_in_query_seq = Set.new
215       domains_in_query_seq.each do |domain|
216         domain_names_in_query_seq.add(domain.model)
217       end
218       if (target_domain_names.subset?(domain_names_in_query_seq))
219
220         passed_domains = Array.new
221         passed_domains_counts = Hash.new
222
223         domains_in_query_seq.each do |domain|
224           if target_domains.has_key?(domain.model)
225             target_domain = target_domains[domain.model]
226             if (target_domain.i_e_value != nil) && (target_domain.i_e_value >= 0)
227               if domain.i_e_value > target_domain.i_e_value
228                 return false
229               end
230             end
231             if (target_domain.abs_len != nil) && (target_domain.abs_len > 0)
232               length = 1 + domain.env_to - domain.env_from
233               if length < target_domain.abs_len
234                 return false
235               end
236             end
237             if (target_domain.rel_len != nil) && (target_domain.rel_len > 0)
238               length = 1 + domain.env_to - domain.env_from
239               if length < (target_domain.rel_len * domain.tlen)
240                 return false
241               end
242             end
243
244             # For domain architecture comparison
245             if target_domain_architecture != nil
246               passed_domains.push(domain)
247             end
248
249             if (passed_domains_counts.key?(domain.model))
250               passed_domains_counts[domain.model] = passed_domains_counts[domain.model] + 1
251             else
252               passed_domains_counts[domain.model] = 1
253             end
254
255             if (@passed.key?(domain.model))
256               @passed[domain.model] = @passed[domain.model] + 1
257             else
258               @passed[domain.model] = 1
259             end
260           end
261         end
262
263       else
264         return false
265       end
266       passed_domains.sort! { |a,b| a.ali_from <=> b.ali_from }
267       # Compare architectures
268       if target_domain_architecture != nil
269         if !compareArchitectures(target_domain_architecture, passed_domains)
270           return false
271         end
272       end
273
274       domain_counter = Hash.new
275
276       min_env_from = 10000000
277       max_env_from = 0
278       min_env_to = 10000000
279       max_env_to = 0
280
281       query_seq = nil
282
283       concatenated_domains = ''
284
285       passed_domains.each do |domain|
286         domain_name = domain.model
287
288         unless out_domain_msas.has_key? domain_name
289           out_domain_msas[ domain_name ] = Msa.new
290         end
291
292         if domain_counter.key?(domain_name)
293           domain_counter[domain_name] = domain_counter[domain_name] + 1
294         else
295           domain_counter[domain_name] = 1
296         end
297
298         if query_seq == nil
299           query_seq = domain.query
300           query_seq.freeze
301         elsif query_seq != domain.query
302           error_msg = "seq names do not match: this should not have happened"
303           raise StandardError, error_msg
304         end
305
306         extracted_domain_seq = extract_domain( query_seq,
307         domain_counter[domain_name],
308         passed_domains_counts[domain_name],
309         domain.env_from,
310         domain.env_to,
311         in_msa,
312         out_domain_msas[ domain_name ] )
313
314         concatenated_domains += extracted_domain_seq
315
316         if domain.env_from < min_env_from
317           min_env_from = domain.env_from
318         end
319         if domain.env_from > max_env_from
320           max_env_from = domain.env_from
321         end
322         if domain.env_to < min_env_to
323           min_env_to = domain.env_to
324         end
325         if domain.env_to > max_env_to
326           max_env_to = domain.env_to
327         end
328       end
329
330       puts "env from: #{min_env_from} - #{max_env_from}"
331       puts "env to  : #{min_env_to} - #{max_env_to}"
332
333       extract_domain( query_seq,
334       1,
335       1,
336       min_env_from,
337       max_env_to,
338       in_msa,
339       out_domain_architecture_msa )
340
341       puts passed_domains_counts
342       puts concatenated_domains
343       true
344     end
345
346     # passed_domains needs to be sorted!
347     def compareArchitectures(target_domain_architecture, passed_domains, strict = false)
348       domain_architecture = ""
349       passed_domains.each do |domain|
350         if domain_architecture.length > 0
351           domain_architecture += ">"
352         end
353         domain_architecture += domain.model
354       end
355       pass = false
356       if strict
357         pass = (target_domain_architecture == domain_architecture)
358       else
359         pass = (domain_architecture.index(target_domain_architecture) != nil)
360       end
361       if !pass
362         if (@non_passsing_domain_architectures.key?(domain_architecture))
363           @non_passsing_domain_architectures[domain_architecture] = @non_passsing_domain_architectures[domain_architecture] + 1
364         else
365           @non_passsing_domain_architectures[domain_architecture] = 1
366         end
367       end
368       return pass
369     end
370
371     def write_msa( msa, filename )
372       io = MsaIO.new()
373       w = FastaWriter.new()
374       w.set_line_width( 60 )
375       w.clean( true )
376       File.delete(filename) if File.exist?(filename) #TODO remove me
377       begin
378         io.write_to_file( msa, filename, w )
379       rescue Exception
380         error_msg = "could not write to \"" + filename + "\""
381         raise IOError, error_msg
382       end
383     end
384
385     def add_sequence( sequence_name, in_msa, add_to_msa )
386       seqs = in_msa.find_by_name_start( sequence_name, true )
387       if ( seqs.length < 1 )
388         error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
389         raise StandardError, error_msg
390       end
391       if ( seqs.length > 1 )
392         error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
393         raise StandardError, error_msg
394       end
395       seq = in_msa.get_sequence( seqs[ 0 ] )
396       add_to_msa.add_sequence( seq )
397     end
398
399     def extract_domain( seq_name,
400       number,
401       out_of,
402       seq_from,
403       seq_to,
404       in_msa,
405       out_msa)
406       if number < 1 || out_of < 1 || number > out_of
407         error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s
408         raise StandardError, error_msg
409       end
410       if seq_from < 1 || seq_to < 1 || seq_from >= seq_to
411         error_msg = "impossible: seq-from=" + seq_from.to_s + ", seq-to=" + seq_to.to_s
412         raise StandardError, error_msg
413       end
414       seqs = in_msa.find_by_name_start(seq_name, true)
415       if seqs.length < 1
416         error_msg = "sequence \"" + seq_name + "\" not found in sequence file"
417         raise IOError, error_msg
418       end
419       if seqs.length > 1
420         error_msg = "sequence \"" + seq_name + "\" not unique in sequence file"
421         raise IOError, error_msg
422       end
423       # hmmscan is 1 based, whereas sequences are 0 bases in this package.
424       seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
425
426       orig_name = seq.get_name
427
428       seq.set_name( orig_name.split[ 0 ] )
429
430       if out_of != 1
431         seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
432       end
433
434       out_msa.add_sequence( seq )
435
436       seq.get_sequence_as_string
437     end
438
439     def is_ignorable?( line )
440       return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
441     end
442
443   end # class HmmscanDomainExtractor
444
445   class TargetDomain
446     def initialize( name, i_e_value, abs_len, rel_len, position )
447       @name = name
448       @i_e_value = i_e_value
449       @abs_len = abs_len
450       @percent_len = rel_len
451       @position = position
452     end
453     attr_reader :name, :i_e_value, :abs_len, :rel_len, :position
454   end
455
456 end # module Evoruby
457