56bf65f75dfe570e570c76797aaa046201d43434
[jalview.git] / forester / ruby / evoruby / lib / evo / io / parser / hmmscan_multi_domain_extractor.rb
1 #
2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanMultiDomainExtractor class
3 #
4 # Copyright::    Copyright (C) 2017 Christian M. Zmasek
5 # License::      GNU Lesser General Public License (LGPL)
6 #
7 # Last modified: 2017/02/20
8
9 require 'lib/evo/util/constants'
10 require 'lib/evo/msa/msa_factory'
11 require 'lib/evo/io/msa_io'
12 require 'lib/evo/io/writer/fasta_writer'
13 require 'lib/evo/io/parser/fasta_parser'
14 require 'lib/evo/io/parser/hmmscan_parser'
15
16 module Evoruby
17   class HmmscanMultiDomainExtractor
18     def initialize
19       @passed = Hash.new
20       @non_passsing_domain_architectures = Hash.new
21     end
22
23     # raises ArgumentError, IOError, StandardError
24     def parse( domain_id,
25       hmmscan_output,
26       fasta_sequence_file,
27       outfile,
28       passed_seqs_outfile,
29       failed_seqs_outfile,
30       e_value_threshold,
31       length_threshold,
32       add_position,
33       add_domain_number,
34       add_species,
35       log )
36
37       Util.check_file_for_readability( hmmscan_output )
38       Util.check_file_for_readability( fasta_sequence_file )
39       Util.check_file_for_writability( outfile + ".fasta" )
40       Util.check_file_for_writability( passed_seqs_outfile )
41       Util.check_file_for_writability( failed_seqs_outfile )
42
43       in_msa = nil
44       factory = MsaFactory.new()
45       in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
46
47       if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
48         error_msg = "could not find fasta sequences in " + fasta_sequence_file
49         raise IOError, error_msg
50       end
51
52       out_msa = Msa.new
53
54       failed_seqs = Msa.new
55       passed_seqs = Msa.new
56       passed_multi_seqs = Msa.new
57
58       ld = Constants::LINE_DELIMITER
59
60       domain_pass_counter                = 0
61       domain_fail_counter                = 0
62       passing_domains_per_protein        = 0
63       proteins_with_failing_domains      = 0
64       domain_not_present_counter         = 0
65       protein_counter                    = 1
66       max_domain_copy_number_per_protein = -1
67       max_domain_copy_number_sequence    = ""
68       passing_target_length_sum          = 0
69       overall_target_length_sum          = 0
70       overall_target_length_min          = 10000000
71       overall_target_length_max          = -1
72       passing_target_length_min          = 10000000
73       passing_target_length_max          = -1
74
75       overall_target_ie_min          = 10000000
76       overall_target_ie_max          = -1
77       passing_target_ie_min          = 10000000
78       passing_target_ie_max          = -1
79
80       hmmscan_datas = []
81
82       hmmscan_parser = HmmscanParser.new( hmmscan_output )
83       results = hmmscan_parser.parse
84
85       ####
86       # Import: if multiple copies of same domain, threshold need to be same!
87       target_domain_ary = Array.new
88       target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.6, 1 ))
89       target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.6, 1 ))
90       # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 ))
91       # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 ))
92
93       #target_domain_ary.push(TargetDomain.new('BH4', 0.1, -1, 0.5, 0 ))
94       #target_domain_ary.push(TargetDomain.new('Bcl-2', 0.1, -1, 0.5, 1 ))
95       # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 ))
96
97       #  target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 ))
98       #  target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 1 ))
99
100       #target_domain_ary.push(TargetDomain.new('Chordopox_A33R', 1000, -1, 0.1 ))
101
102       target_domains = Hash.new
103
104       target_domain_architecure = ""
105
106       target_domain_ary.each do |target_domain|
107         target_domains[target_domain.name] = target_domain
108         if target_domain_architecure.length > 0
109           target_domain_architecure += ">"
110         end
111         target_domain_architecure += target_domain.name
112       end
113
114       target_domain_architecure.freeze
115
116       puts target_domain_architecure
117
118       target_domain_names = Set.new
119
120       target_domains.each_key {|key| target_domain_names.add( key ) }
121
122       prev_query_seq_name = nil
123       domains_in_query_seq = Array.new
124       passing_sequences = Array.new
125       total_sequences = 0
126       @passed = Hash.new
127       out_domain_msas = Hash.new
128       out_domain_architecture_msa = Msa.new
129       results.each do |hmmscan_result|
130         if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name )
131           if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, target_domain_architecure)
132             passing_sequences.push(domains_in_query_seq)
133           end
134           domains_in_query_seq = Array.new
135           total_sequences += 1
136         end
137         prev_query_seq_name = hmmscan_result.query
138         domains_in_query_seq.push(hmmscan_result)
139       end # each result
140
141       if prev_query_seq_name != nil
142         total_sequences += 1
143         if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, target_domain_architecure)
144           passing_sequences.push(domains_in_query_seq)
145         end
146       end
147
148       out_domain_msas.keys.sort.each do |domain_name|
149         puts "writing #{domain_name}"
150         write_msa( out_domain_msas[domain_name], domain_name + ".fasta" )
151       end
152
153       puts "writing target_domain_architecure"
154       write_msa( out_domain_architecture_msa, "target_domain_architecure" + ".fasta" )
155
156       passing_sequences.each do | domains |
157
158         seq = domains[0].query
159         # puts seq + "::"
160         ################
161         if passed_seqs.find_by_name_start( seq, true ).length < 1
162           add_sequence( seq, in_msa, passed_multi_seqs )
163         else
164           error_msg = "this should not have happened"
165           raise StandardError, error_msg
166         end
167         ################
168
169         domains.each do | domain |
170           #puts domain.query + ": " + domain.model
171         end
172         #puts
173       end
174
175       puts @non_passsing_domain_architectures
176
177       puts @passed
178       puts "total sequences  : " + total_sequences.to_s
179       puts "passing sequences: " + passing_sequences.size.to_s
180
181       # write_msa( out_msa, outfile + ".fasta"  )
182       # write_msa( passed_multi_seqs, passed_seqs_outfile )
183       # write_msa( failed_seqs, failed_seqs_outfile )
184
185       log << ld
186       log << "passing target domains                       : " + domain_pass_counter.to_s + ld
187       log << "failing target domains                       : " + domain_fail_counter.to_s + ld
188       log << "proteins in sequence (fasta) file            : " + in_msa.get_number_of_seqs.to_s + ld
189       log << "proteins in hmmscan outputfile               : " + protein_counter.to_s + ld
190       log << "proteins with passing target domain(s)       : " + passed_seqs.get_number_of_seqs.to_s + ld
191       log << "proteins with no passing target domain       : " + proteins_with_failing_domains.to_s + ld
192       log << "proteins with no target domain               : " + domain_not_present_counter.to_s + ld
193
194       log << ld
195
196       return domain_pass_counter
197
198     end # parse
199
200     private
201
202     # domains_in_query_seq: Array of HmmscanResult
203     # target_domain_names: Set of String
204     # target_domains: Hash String->TargetDomain
205     # target_domain_architecture: String
206     def checkit2(domains_in_query_seq,
207       target_domain_names,
208       target_domains,
209       in_msa,
210       out_domain_msas,
211       out_domain_architecture_msa,
212       target_domain_architecture = nil)
213
214       domain_names_in_query_seq = Set.new
215       domains_in_query_seq.each do |domain|
216         domain_names_in_query_seq.add(domain.model)
217       end
218       if (target_domain_names.subset?(domain_names_in_query_seq))
219
220         passed_domains = Array.new
221         passed_domains_counts = Hash.new
222
223         domains_in_query_seq.each do |domain|
224           if target_domains.has_key?(domain.model)
225             target_domain = target_domains[domain.model]
226             if (target_domain.i_e_value != nil) && (target_domain.i_e_value >= 0)
227               if domain.i_e_value > target_domain.i_e_value
228                 return false
229               end
230             end
231             if (target_domain.abs_len != nil) && (target_domain.abs_len > 0)
232               length = 1 + domain.env_to - domain.env_from
233               if length < target_domain.abs_len
234                 return false
235               end
236             end
237             if (target_domain.rel_len != nil) && (target_domain.rel_len > 0)
238               length = 1 + domain.env_to - domain.env_from
239               if length < (target_domain.rel_len * domain.tlen)
240                 return false
241               end
242             end
243
244             # For domain architecture comparison
245             if target_domain_architecture != nil
246               passed_domains.push(domain)
247             end
248
249             if (passed_domains_counts.key?(domain.model))
250               passed_domains_counts[domain.model] = passed_domains_counts[domain.model] + 1
251             else
252               passed_domains_counts[domain.model] = 1
253             end
254
255             if (@passed.key?(domain.model))
256               @passed[domain.model] = @passed[domain.model] + 1
257             else
258               @passed[domain.model] = 1
259             end
260           end
261         end
262
263       else
264         return false
265       end
266       passed_domains.sort! { |a,b| a.ali_from <=> b.ali_from }
267       # Compare architectures
268       if target_domain_architecture != nil
269         if !compareArchitectures(target_domain_architecture, passed_domains)
270           return false
271         end
272       end
273
274       domain_counter = Hash.new
275
276       min_env_from = 10000000
277       max_env_from = 0
278       min_env_to = 10000000
279       max_env_to = 0
280
281       query_seq = nil
282
283       passed_domains.each do |domain|
284         domain_name = domain.model
285
286         unless out_domain_msas.has_key? domain_name
287           out_domain_msas[ domain_name ] = Msa.new
288         end
289
290         if domain_counter.key?(domain_name)
291           domain_counter[domain_name] = domain_counter[domain_name] + 1
292         else
293           domain_counter[domain_name] = 1
294         end
295
296         if query_seq == nil
297           query_seq = domain.query
298           query_seq.freeze
299         elsif query_seq != domain.query
300           error_msg = "seq names do not match: this should not have happened"
301           raise StandardError, error_msg
302         end
303
304         extract_domain( query_seq,
305         domain_counter[domain_name],
306         passed_domains_counts[domain_name],
307         domain.env_from,
308         domain.env_to,
309         in_msa,
310         out_domain_msas[ domain_name ] )
311
312         if domain.env_from < min_env_from
313           min_env_from = domain.env_from
314         end
315         if domain.env_from > max_env_from
316           max_env_from = domain.env_from
317         end
318         if domain.env_to < min_env_to
319           min_env_to = domain.env_to
320         end
321         if domain.env_to > max_env_to
322           max_env_to = domain.env_to
323         end
324       end
325
326       puts "env from: #{min_env_from} - #{max_env_from}"
327       puts "env to  : #{min_env_to} - #{max_env_to}"
328
329       extract_domain( query_seq,
330       1,
331       1,
332       min_env_from,
333       max_env_to,
334       in_msa,
335       out_domain_architecture_msa )
336
337       puts passed_domains_counts
338       true
339     end
340
341     # passed_domains needs to be sorted!
342     def compareArchitectures(target_domain_architecture, passed_domains, strict = false)
343       domain_architecture = ""
344       passed_domains.each do |domain|
345         if domain_architecture.length > 0
346           domain_architecture += ">"
347         end
348         domain_architecture += domain.model
349       end
350       pass = false
351       if strict
352         pass = (target_domain_architecture == domain_architecture)
353       else
354         pass = (domain_architecture.index(target_domain_architecture) != nil)
355       end
356       if !pass
357         if (@non_passsing_domain_architectures.key?(domain_architecture))
358           @non_passsing_domain_architectures[domain_architecture] = @non_passsing_domain_architectures[domain_architecture] + 1
359         else
360           @non_passsing_domain_architectures[domain_architecture] = 1
361         end
362       end
363       return pass
364     end
365
366     def write_msa( msa, filename )
367       io = MsaIO.new()
368       w = FastaWriter.new()
369       w.set_line_width( 60 )
370       w.clean( true )
371       File.delete(filename) if File.exist?(filename) #TODO remove me
372       begin
373         io.write_to_file( msa, filename, w )
374       rescue Exception
375         error_msg = "could not write to \"" + filename + "\""
376         raise IOError, error_msg
377       end
378     end
379
380     def add_sequence( sequence_name, in_msa, add_to_msa )
381       seqs = in_msa.find_by_name_start( sequence_name, true )
382       if ( seqs.length < 1 )
383         error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
384         raise StandardError, error_msg
385       end
386       if ( seqs.length > 1 )
387         error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
388         raise StandardError, error_msg
389       end
390       seq = in_msa.get_sequence( seqs[ 0 ] )
391       add_to_msa.add_sequence( seq )
392     end
393
394     def extract_domain( seq_name,
395       number,
396       out_of,
397       seq_from,
398       seq_to,
399       in_msa,
400       out_msa)
401       if number < 1 || out_of < 1 || number > out_of
402         error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s
403         raise StandardError, error_msg
404       end
405       if seq_from < 1 || seq_to < 1 || seq_from >= seq_to
406         error_msg = "impossible: seq-from=" + seq_from.to_s + ", seq-to=" + seq_to.to_s
407         raise StandardError, error_msg
408       end
409       seqs = in_msa.find_by_name_start(seq_name, true)
410       if seqs.length < 1
411         error_msg = "sequence \"" + seq_name + "\" not found in sequence file"
412         raise IOError, error_msg
413       end
414       if seqs.length > 1
415         error_msg = "sequence \"" + seq_name + "\" not unique in sequence file"
416         raise IOError, error_msg
417       end
418       # hmmscan is 1 based, whereas sequences are 0 bases in this package.
419       seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
420
421       orig_name = seq.get_name
422
423       seq.set_name( orig_name.split[ 0 ] )
424
425       if out_of != 1
426         seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
427       end
428
429       out_msa.add_sequence( seq )
430     end
431
432     def is_ignorable?( line )
433       return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
434     end
435
436   end # class HmmscanDomainExtractor
437
438   class TargetDomain
439     def initialize( name, i_e_value, abs_len, rel_len, position )
440       @name = name
441       @i_e_value = i_e_value
442       @abs_len = abs_len
443       @percent_len = rel_len
444       @position = position
445     end
446     attr_reader :name, :i_e_value, :abs_len, :rel_len, :position
447   end
448
449 end # module Evoruby
450