bf7cb711a8a352449522534bb5d81e4d2c8ff334
[jalview.git] / forester / ruby / evoruby / lib / evo / io / parser / hmmscan_domain_extractor.rb
1 #
2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanDomainExtractor class
3 #
4 # Copyright::  Copyright (C) 2012 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id:  $
8
9
10 require 'lib/evo/util/constants'
11 require 'lib/evo/msa/msa_factory'
12 require 'lib/evo/io/msa_io'
13 require 'lib/evo/io/writer/fasta_writer'
14 require 'lib/evo/io/parser/fasta_parser'
15
16
17 module Evoruby
18
19   class HmmscanDomainExtractor
20
21     TRIM_BY = 2
22
23     def initialize
24     end
25
26     # raises ArgumentError, IOError, StandardError
27     def parse( domain_id,
28         hmmsearch_output,
29         fasta_sequence_file,
30         outfile,
31         passed_seqs_outfile,
32         failed_seqs_outfile,
33         e_value_threshold,
34         length_threshold,
35         add_position,
36         add_domain_number,
37         add_domain_number_as_digit,
38         add_domain_number_as_letter,
39         trim_name,
40         add_species,
41         min_linker,
42         log )
43
44       Util.check_file_for_readability( hmmsearch_output )
45       Util.check_file_for_readability( fasta_sequence_file )
46       Util.check_file_for_writability( outfile )
47       Util.check_file_for_writability( passed_seqs_outfile )
48       Util.check_file_for_writability( failed_seqs_outfile )
49
50       in_msa = nil
51       factory = MsaFactory.new()
52       in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
53
54       if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
55         error_msg = "could not find fasta sequences in " + fasta_sequence_file
56         raise IOError, error_msg
57       end
58
59       out_msa = Msa.new
60
61       failed_seqs = Msa.new
62       passed_seqs = Msa.new
63       out_msa_pairs = nil
64       if min_linker
65         out_msa_pairs = Msa.new
66       end
67
68       ld = Constants::LINE_DELIMITER
69
70       domain_pass_counter     = 0
71       domain_fail_counter     = 0
72       proteins_with_passing_domains = 0
73       proteins_with_failing_domains = 0
74       max_domain_copy_number_per_protein = -1
75       max_domain_copy_number_sequence    = ''
76       failed_species_counts         = Hash.new
77       passed_species_counts         = Hash.new
78
79       prev_sequence = nil
80       prev_number   = nil
81       prev_env_from = nil
82       prev_env_to   = nil
83       prev_i_e_value  = nil
84
85       File.open( hmmsearch_output ) do | file |
86         while line = file.gets
87           if !is_ignorable?( line ) && line =~ /^\S+\s+/
88
89             #         tn      acc     tlen    query   acc     qlen    Evalue  score   bias    #       of      c-E     i-E     score   bias    hf      ht      af      at      ef      et      acc     desc
90             #         1       2       3       4       5       6       7       8       9       10      11      12      13      14      15      16      17      18      19      20      21      22      23
91             line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/
92
93             target_name = $1
94             if domain_id != target_name
95               next
96             end
97
98
99             sequence = $4
100             number   = $10.to_i
101             out_of   = $11.to_i
102             env_from = $20.to_i
103             env_to   = $21.to_i
104             i_e_value  = $13.to_f
105             if ( number > max_domain_copy_number_per_protein )
106               max_domain_copy_number_sequence    = sequence
107               max_domain_copy_number_per_protein = number
108             end
109             if ( ( ( e_value_threshold.to_f < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
110                  ( ( length_threshold.to_f <= 0 )   || ( env_to - env_from + 1 ) >= length_threshold.to_f )  )
111               extract_domain( sequence,
112                 number,
113                 out_of,
114                 env_from,
115                 env_to,
116                 in_msa,
117                 out_msa,
118                 add_position,
119                 add_domain_number,
120                 add_domain_number_as_digit,
121                 add_domain_number_as_letter,
122                 trim_name ,
123                 add_species )
124               domain_pass_counter += 1
125               count_species( sequence, passed_species_counts )
126               if passed_seqs.find_by_name_start( sequence, true ).length < 1
127                 add_sequence( sequence, in_msa, passed_seqs )
128                 proteins_with_passing_domains += 1
129               end
130
131               if min_linker
132                 if  sequence == prev_sequence  && ( ( ( e_value_threshold.to_f < 0.0 ) || ( prev_i_e_value <= e_value_threshold  ) ) &&
133                      ( ( length_threshold.to_f <= 0 )   || (  ( prev_env_to - prev_env_from + 1 ) >= length_threshold.to_f    ) )  )
134                   diff = env_from - prev_env_to
135                   if diff <= min_linker
136                     extract_domain( sequence,
137                       prev_number.to_s + "/" + number.to_s,
138                       out_of,
139                       prev_env_from,
140                       env_to,
141                       in_msa,
142                       out_msa_pairs,
143                       false,
144                       true,
145                       false,
146                       false,
147                       trim_name ,
148                       add_species )
149                   end
150                 end
151                 prev_sequence = sequence
152                 prev_number   = number
153                 prev_env_from = env_from
154                 prev_env_to   = env_to
155                 prev_i_e_value  = i_e_value
156               end
157
158             else
159               print( domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" )
160               log << domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)"
161               if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) )
162                 print( " iE=" + i_e_value.to_s )
163                 log << " iE=" + i_e_value.to_s
164               end
165               if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f )
166                 le = env_to - env_from + 1
167                 print( " l=" + le.to_s )
168                 log << " l=" + le.to_s
169               end
170               print( Constants::LINE_DELIMITER )
171               log << Constants::LINE_DELIMITER
172               domain_fail_counter  += 1
173               count_species( sequence, failed_species_counts )
174               if failed_seqs.find_by_name_start( sequence, true ).length < 1
175                 add_sequence( sequence, in_msa, failed_seqs )
176                 proteins_with_failing_domains += 1
177               end
178             end
179           end
180         end
181       end
182
183       if domain_pass_counter < 1
184         error_msg = "no domain sequences were extracted"
185         raise StandardError, error_msg
186       end
187
188       log << Constants::LINE_DELIMITER
189       puts( "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s )
190       log << "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s
191       log << Constants::LINE_DELIMITER
192
193       if ( max_domain_copy_number_per_protein > 1 )
194         puts( "First protein with this copy number: " + max_domain_copy_number_sequence )
195         log << "First protein with this copy number: " + max_domain_copy_number_sequence
196         log << Constants::LINE_DELIMITER
197       end
198
199       io = MsaIO.new()
200       w = FastaWriter.new()
201       w.set_line_width( 60 )
202       w.clean( true )
203
204       begin
205         io.write_to_file( out_msa, outfile, w )
206       rescue Exception
207         error_msg = "could not write to \"" + outfile + "\""
208         raise IOError, error_msg
209       end
210
211       if out_msa_pairs
212         begin
213           io.write_to_file( out_msa_pairs, outfile+"_" + min_linker.to_s, w )
214         rescue Exception
215           error_msg = "could not write to \"" + outfile+"_" + min_linker.to_s + "\""
216           raise IOError, error_msg
217         end
218       end
219
220       begin
221         io.write_to_file( passed_seqs, passed_seqs_outfile, w )
222       rescue Exception
223         error_msg = "could not write to \"" + passed_seqs_outfile + "\""
224         raise IOError, error_msg
225       end
226
227       begin
228         io.write_to_file( failed_seqs, failed_seqs_outfile, w )
229       rescue Exception
230         error_msg = "could not write to \"" + failed_seqs_outfile + "\""
231         raise IOError, error_msg
232       end
233
234       log << ld
235       log << "passing domains              : " + domain_pass_counter.to_s + ld
236       log << "failing domains              : " + domain_fail_counter.to_s + ld
237       log << "proteins with passing domains: " + proteins_with_passing_domains.to_s + ld
238       log << "proteins with failing domains: " + proteins_with_failing_domains.to_s + ld
239       log << ld
240       log << 'passing domains counts per species: ' << ld
241       passed_species_counts.each_pair { | species, count | log << "#{species}: #{count}" << ld }
242       log << ld
243       log << 'failing domains counts per species: ' << ld
244       failed_species_counts.each_pair { | species, count | log << "#{species}: #{count}" << ld }
245       log << ld
246       return domain_pass_counter
247
248     end # parse
249
250     private
251
252     def add_sequence( sequence_name, in_msa, add_to_msa )
253       seqs = in_msa.find_by_name_start( sequence_name, true )
254       if ( seqs.length < 1 )
255         error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
256         raise StandardError, error_msg
257       end
258       if ( seqs.length > 1 )
259         error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
260         raise StandardError, error_msg
261       end
262       seq = in_msa.get_sequence( seqs[ 0 ] )
263       add_to_msa.add_sequence( seq )
264     end
265
266     # raises ArgumentError, StandardError
267     def extract_domain( sequence,
268         number,
269         out_of,
270         seq_from,
271         seq_to,
272         in_msa,
273         out_msa,
274         add_position,
275         add_domain_number,
276         add_domain_number_as_digit,
277         add_domain_number_as_letter,
278         trim_name,
279         add_species )
280       if  number.is_a?( Fixnum ) && ( number < 1 || out_of < 1 || number > out_of )
281         error_msg = "impossible: number=" + number.to_s + ", out of=" + out_of.to_s
282         raise ArgumentError, error_msg
283       end
284       if  seq_from < 1 || seq_to < 1 || seq_from >= seq_to
285         error_msg = "impossible: seq-f=" + seq_from.to_s + ", seq-t=" + seq_to.to_s
286         raise ArgumentError, error_msg
287       end
288       seqs = in_msa.find_by_name_start( sequence, true )
289       if seqs.length < 1
290         error_msg = "sequence \"" + sequence + "\" not found in sequence file"
291         raise StandardError, error_msg
292       end
293       if seqs.length > 1
294         error_msg = "sequence \"" + sequence + "\" not unique in sequence file"
295         raise StandardError, error_msg
296       end
297       # hmmsearch is 1 based, wheres sequences are 0 bases in this package.
298       seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
299
300       orig_name = seq.get_name
301
302       seq.set_name( orig_name.split[ 0 ] )
303
304       if add_position
305         seq.set_name( seq.get_name + "_" + seq_from.to_s + "-" + seq_to.to_s )
306       end
307
308       if trim_name
309         seq.set_name( seq.get_name[ 0, seq.get_name.length - TRIM_BY ] )
310       end
311
312       if out_of != 1
313         if add_domain_number_as_digit
314           seq.set_name( seq.get_name + number.to_s )
315         elsif add_domain_number_as_letter
316           if number > 25
317             error_msg = 'too many identical domains per sequence, cannot use letters to distinguish them'
318             raise StandardError, error_msg
319           end
320           seq.set_name( seq.get_name + ( number + 96 ).chr )
321         elsif add_domain_number
322           seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
323         end
324       end
325
326       # if ( seq.get_name.length > 10 )
327       #   error_msg = "sequence name [" + seq.get_name + "] is longer than 10 characters"
328       #   raise StandardError, error_msg
329       # end
330
331       if add_species
332         a = orig_name.rindex "["
333         b = orig_name.rindex "]"
334         unless a && b
335           error_msg = "species not found in " + orig_name
336           raise StandardError, error_msg
337         end
338         species = orig_name[ a .. b ]
339         seq.set_name( seq.get_name + " " + species )
340       end
341
342       out_msa.add_sequence( seq )
343
344     end
345
346     def count_species( sequence, species_counts_map )
347       species = get_species( sequence )
348       if species != nil
349         if !species_counts_map.has_key?( species )
350           species_counts_map[ species ] = 1
351         else
352           species_counts_map[ species ] = species_counts_map[ species ] + 1
353         end
354       end
355     end
356
357     def get_species( sequence_name )
358       if sequence_name =~ /^.+_(.+)$/
359         return $1
360       else
361         return nil
362       end
363     end
364
365     def is_ignorable?( line )
366       return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
367     end
368
369   end # class HmmscanDomainExtractor
370
371 end # module Evoruby
372