in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / io / parser / hmmscan_domain_extractor.rb
1 #
2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanDomainExtractor class
3 #
4 # Copyright::  Copyright (C) 2012 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id:  $
8
9
10 require 'lib/evo/util/constants'
11 require 'lib/evo/msa/msa_factory'
12 require 'lib/evo/io/msa_io'
13 require 'lib/evo/io/writer/fasta_writer'
14 require 'lib/evo/io/parser/fasta_parser'
15
16
17 module Evoruby
18
19   class HmmscanDomainExtractor
20
21     TRIM_BY = 2
22
23     def initialize
24     end
25
26
27     def process_hmmscan_datas( hmmscan_datas,
28         in_msa,
29         add_position,
30         add_domain_number,
31         trim_name ,
32         add_species,
33         out_msa )
34
35       actual_out_of = hmmscan_datas.size
36
37       domain_pass_counter = 0;
38
39       hmmscan_datas.each_with_index do |hmmscan_data, index|
40         if hmmscan_data.number < ( index + 1 )
41           error_msg = "hmmscan_data.number < ( index + 1 ) (this should not have happened)"
42           raise StandardError, error_msg
43         end
44
45         extract_domain( hmmscan_data.seq_name,
46           index + 1,
47           actual_out_of,
48           hmmscan_data.env_from,
49           hmmscan_data.env_to,
50           in_msa,
51           out_msa,
52           add_position,
53           add_domain_number,
54           trim_name ,
55           add_species )
56         domain_pass_counter += 1
57
58         #   if passed_seqs.find_by_name_start( hmmscan_data.seq_name, true ).length < 1
59         #     add_sequence( hmmscan_data.seq_name, in_msa, passed_seqs )
60         #     proteins_with_passing_domains += 1
61         #   end
62
63 =begin
64         if min_linker
65           if out_of == 1
66             extract_domain( sequence,
67               number,
68               out_of,
69               env_from,
70               env_to,
71               in_msa,
72               out_msa_singlets,
73               false,
74               true,
75               false,
76               false,
77               trim_name,
78               add_species )
79             singlets_counter += 1
80           elsif prev_sequence
81             if sequence != prev_sequence
82               prev_is_pair = false
83             else
84               if ( env_from - prev_env_to ) <= min_linker
85                 extract_domain( sequence,
86                   prev_number.to_s + "+" + number.to_s,
87                   out_of,
88                   prev_env_from,
89                   env_to,
90                   in_msa,
91                   out_msa_pairs,
92                   false,
93                   true,
94                   false,
95                   false,
96                   trim_name,
97                   add_species )
98                 prev_is_pair = true
99                 close_pairs_counter += 2
100               else
101                 if !prev_is_pair
102                   extract_domain( sequence,
103                     prev_number,
104                     out_of,
105                     prev_env_from,
106                     prev_env_to,
107                     in_msa,
108                     out_msa_distant_partners,
109                     false,
110                     true,
111                     false,
112                     false,
113                     trim_name,
114                     add_species )
115                   distant_pairs_counter += 1
116                 end
117                 if number == out_of
118                   extract_domain( sequence,
119                     number,
120                     out_of,
121                     env_from,
122                     env_to,
123                     in_msa,
124                     out_msa_distant_partners,
125                     false,
126                     true,
127                     false,
128                     false,
129                     trim_name,
130                     add_species )
131                   distant_pairs_counter += 1
132                 end
133                 prev_is_pair = false
134               end
135             end # sequence != prev_sequence else
136           end
137           prev_sequence = sequence
138           prev_number   = number
139           prev_env_from = env_from
140           prev_env_to   = env_to
141           #prev_i_e_value  = i_e_value
142         end # if min_linker
143 =end
144
145       end # each
146       domain_pass_counter
147     end # def process_hmmscan_data
148
149
150     # raises ArgumentError, IOError, StandardError
151     def parse( domain_id,
152         hmmscan_output,
153         fasta_sequence_file,
154         outfile,
155         passed_seqs_outfile,
156         failed_seqs_outfile,
157         e_value_threshold,
158         length_threshold,
159         add_position,
160         add_domain_number,
161         add_domain_number_as_digit,
162         add_domain_number_as_letter,
163         trim_name,
164         add_species,
165         min_linker,
166         log )
167
168       Util.check_file_for_readability( hmmscan_output )
169       Util.check_file_for_readability( fasta_sequence_file )
170       Util.check_file_for_writability( outfile )
171       Util.check_file_for_writability( passed_seqs_outfile )
172       Util.check_file_for_writability( failed_seqs_outfile )
173
174       in_msa = nil
175       factory = MsaFactory.new()
176       in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
177
178       if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
179         error_msg = "could not find fasta sequences in " + fasta_sequence_file
180         raise IOError, error_msg
181       end
182
183       out_msa = Msa.new
184
185       failed_seqs = Msa.new
186       passed_seqs = Msa.new
187       out_msa_pairs = nil
188       out_msa_distant_partners = nil
189       out_msa_singlets = nil
190       if min_linker
191         out_msa_pairs = Msa.new
192         out_msa_distant_partners = Msa.new
193         out_msa_singlets = Msa.new
194       end
195
196       ld = Constants::LINE_DELIMITER
197
198       domain_pass_counter     = 0
199       domain_fail_counter     = 0
200       singlets_counter        = 0
201       distant_pairs_counter   = 0
202       close_pairs_counter     = 0
203       proteins_with_passing_domains = 0
204       proteins_with_failing_domains = 0
205       max_domain_copy_number_per_protein = -1
206       max_domain_copy_number_sequence    = ""
207
208       prev_sequence = nil
209       prev_number   = nil
210       prev_env_from = nil
211       prev_env_to   = nil
212       # prev_i_e_value  = nil
213       prev_is_pair = false
214
215
216       hmmscan_datas = Array.new
217
218
219       File.open( hmmscan_output ) do | file |
220         while line = file.gets
221           if !is_ignorable?( line ) && line =~ /^\S+\s+/
222
223             #         tn      acc     tlen    query   acc     qlen    Evalue  score   bias    #       of      c-E     i-E     score   bias    hf      ht      af      at      ef      et      acc     desc
224             #         1       2       3       4       5       6       7       8       9       10      11      12      13      14      15      16      17      18      19      20      21      22      23
225             line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/
226
227             if domain_id != $1
228               next
229             end
230
231             sequence = $4
232             number   = $10.to_i
233             out_of   = $11.to_i
234             env_from = $20.to_i
235             env_to   = $21.to_i
236             i_e_value  = $13.to_f
237
238             if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
239                  ( ( length_threshold <= 0 )   || ( env_to - env_from + 1 ) >= length_threshold.to_f ) )
240               hmmscan_datas << HmmsearchData.new( sequence, number, out_of, env_from, env_to, i_e_value )
241               if ( number > max_domain_copy_number_per_protein )
242                 max_domain_copy_number_sequence    = sequence
243                 max_domain_copy_number_per_protein = number
244               end
245             else # failed
246               print( domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" )
247               log << domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)"
248               if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) )
249                 print( " iE=" + i_e_value.to_s )
250                 log << " iE=" + i_e_value.to_s
251               end
252               if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f )
253                 le = env_to - env_from + 1
254                 print( " l=" + le.to_s )
255                 log << " l=" + le.to_s
256               end
257               print( ld )
258               log << ld
259               domain_fail_counter  += 1
260
261               if failed_seqs.find_by_name_start( sequence, true ).length < 1
262                 add_sequence( sequence, in_msa, failed_seqs )
263                 proteins_with_failing_domains += 1
264               end
265             end
266
267             if number > out_of
268               error_msg = "number > out_of ! (this should not have happened)"
269               raise StandardError, error_msg
270             end
271
272             if number == out_of && !hmmscan_datas.empty?
273               domain_pass_counter += process_hmmscan_datas(  hmmscan_datas,
274                 in_msa,
275                 add_position,
276                 add_domain_number,
277                 trim_name ,
278                 add_species,
279                 out_msa        )
280
281               hmmscan_datas.clear
282             end
283
284           end
285         end #  while line = file.gets
286       end # File.open( hmmsearch_output ) do | file |
287
288
289
290       if domain_pass_counter < 1
291         error_msg = "no domain sequences were extracted"
292         raise IOError, error_msg
293       end
294
295       log << ld
296       puts( "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s )
297       log << "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s
298       log << ld
299
300       if ( max_domain_copy_number_per_protein > 1 )
301         puts( "First protein with this copy number: " + max_domain_copy_number_sequence )
302         log << "First protein with this copy number: " + max_domain_copy_number_sequence
303         log << ld
304       end
305
306       write_msa( out_msa, outfile  )
307       write_msa( passed_seqs, passed_seqs_outfile )
308       write_msa( failed_seqs, failed_seqs_outfile )
309
310       if out_msa_pairs
311         write_msa( out_msa_pairs, outfile +"_" + min_linker.to_s )
312       end
313
314       if out_msa_singlets
315         write_msa( out_msa_singlets, outfile +"_singles" )
316       end
317
318       if out_msa_distant_partners
319         write_msa( out_msa_distant_partners, outfile + "_" + min_linker.to_s + "_isolated" );
320       end
321
322
323       log << ld
324       log << "passing domains              : " + domain_pass_counter.to_s + ld
325       if ( min_linker )
326         log << "single domains               : " + singlets_counter.to_s + ld
327         log << "domains in close pairs       : " + close_pairs_counter.to_s + ld
328         log << "isolated domains             : " + distant_pairs_counter.to_s + ld
329       end
330       log << "failing domains              : " + domain_fail_counter.to_s + ld
331       log << "proteins with passing domains: " + proteins_with_passing_domains.to_s + ld
332       log << "proteins with failing domains: " + proteins_with_failing_domains.to_s + ld
333       log << ld
334
335       return domain_pass_counter
336
337     end # parse
338
339
340     private
341
342     def write_msa( msa, filename )
343       io = MsaIO.new()
344       w = FastaWriter.new()
345       w.set_line_width( 60 )
346       w.clean( true )
347       begin
348         io.write_to_file( msa, filename, w )
349       rescue Exception
350         error_msg = "could not write to \"" + filename + "\""
351         raise IOError, error_msg
352       end
353     end
354
355
356     def add_sequence( sequence_name, in_msa, add_to_msa )
357       seqs = in_msa.find_by_name_start( sequence_name, true )
358       if ( seqs.length < 1 )
359         error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
360         raise StandardError, error_msg
361       end
362       if ( seqs.length > 1 )
363         error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
364         raise StandardError, error_msg
365       end
366       seq = in_msa.get_sequence( seqs[ 0 ] )
367       add_to_msa.add_sequence( seq )
368     end
369
370
371     def extract_domain( sequence,
372         number,
373         out_of,
374         seq_from,
375         seq_to,
376         in_msa,
377         out_msa,
378         add_position,
379         add_domain_number,
380         trim_name,
381         add_species )
382       if  number.is_a?( Fixnum ) && ( number < 1 || out_of < 1 || number > out_of )
383         error_msg = "impossible: number=" + number.to_s + ", out of=" + out_of.to_s
384         raise StandardError, error_msg
385       end
386       if  seq_from < 1 || seq_to < 1 || seq_from >= seq_to
387         error_msg = "impossible: seq-f=" + seq_from.to_s + ", seq-t=" + seq_to.to_s
388         raise StandardError, error_msg
389       end
390       seqs = in_msa.find_by_name_start( sequence, true )
391       if seqs.length < 1
392         error_msg = "sequence \"" + sequence + "\" not found in sequence file"
393         raise IOError, error_msg
394       end
395       if seqs.length > 1
396         error_msg = "sequence \"" + sequence + "\" not unique in sequence file"
397         raise IOError, error_msg
398       end
399       # hmmsearch is 1 based, wheres sequences are 0 bases in this package.
400       seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
401
402       orig_name = seq.get_name
403
404       seq.set_name( orig_name.split[ 0 ] )
405
406       if add_position
407         seq.set_name( seq.get_name + "_" + seq_from.to_s + "-" + seq_to.to_s )
408       end
409
410       if trim_name
411         seq.set_name( seq.get_name[ 0, seq.get_name.length - TRIM_BY ] )
412       end
413
414       if out_of != 1 && add_domain_number
415         seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
416       end
417
418       if add_species
419         a = orig_name.rindex "["
420         b = orig_name.rindex "]"
421         unless a && b
422           error_msg = "species not found in " + orig_name
423           raise StandardError, error_msg
424         end
425         species = orig_name[ a .. b ]
426         seq.set_name( seq.get_name + " " + species )
427       end
428       out_msa.add_sequence( seq )
429     end
430
431     def is_ignorable?( line )
432       return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
433     end
434
435   end # class HmmscanDomainExtractor
436
437   class HmmsearchData
438     def initialize( seq_name, number, out_of, env_from, env_to, i_e_value )
439       @seq_name = seq_name
440       @number = number
441       @out_of = out_of
442       @env_from = env_from
443       @env_to = env_to
444       @i_e_value = i_e_value
445     end
446     attr_reader :seq_name, :number, :out_of, :env_from, :env_to, :i_e_value
447   end
448
449 end # module Evoruby
450