in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / io / parser / hmmscan_domain_extractor.rb
1 #
2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanDomainExtractor class
3 #
4 # Copyright::  Copyright (C) 2012 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id:  $
8
9
10 require 'lib/evo/util/constants'
11 require 'lib/evo/msa/msa_factory'
12 require 'lib/evo/io/msa_io'
13 require 'lib/evo/io/writer/fasta_writer'
14 require 'lib/evo/io/parser/fasta_parser'
15
16
17 module Evoruby
18
19   class HmmscanDomainExtractor
20
21     TRIM_BY = 2
22
23     def initialize
24     end
25
26     # raises ArgumentError, IOError, StandardError
27     def parse( domain_id,
28         hmmsearch_output,
29         fasta_sequence_file,
30         outfile,
31         passed_seqs_outfile,
32         failed_seqs_outfile,
33         e_value_threshold,
34         length_threshold,
35         add_position,
36         add_domain_number,
37         add_domain_number_as_digit,
38         add_domain_number_as_letter,
39         trim_name,
40         add_species,
41         min_linker,
42         log )
43
44       Util.check_file_for_readability( hmmsearch_output )
45       Util.check_file_for_readability( fasta_sequence_file )
46       Util.check_file_for_writability( outfile )
47       Util.check_file_for_writability( passed_seqs_outfile )
48       Util.check_file_for_writability( failed_seqs_outfile )
49
50       in_msa = nil
51       factory = MsaFactory.new()
52       in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
53
54       if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
55         error_msg = "could not find fasta sequences in " + fasta_sequence_file
56         raise IOError, error_msg
57       end
58
59       out_msa = Msa.new
60
61       failed_seqs = Msa.new
62       passed_seqs = Msa.new
63       out_msa_pairs = nil
64       out_msa_distance_partners = nil
65       out_msa_singlets = nil
66       if min_linker
67         out_msa_pairs = Msa.new
68         out_msa_distant_partners = Msa.new
69         out_msa_singlets = Msa.new
70       end
71
72       ld = Constants::LINE_DELIMITER
73
74       domain_pass_counter     = 0
75       domain_fail_counter     = 0
76       proteins_with_passing_domains = 0
77       proteins_with_failing_domains = 0
78       max_domain_copy_number_per_protein = -1
79       max_domain_copy_number_sequence    = ""
80
81       prev_sequence = nil
82       prev_number   = nil
83       prev_env_from = nil
84       prev_env_to   = nil
85       prev_i_e_value  = nil
86       prev_is_pair = false
87
88       File.open( hmmsearch_output ) do | file |
89         while line = file.gets
90           if !is_ignorable?( line ) && line =~ /^\S+\s+/
91
92             #         tn      acc     tlen    query   acc     qlen    Evalue  score   bias    #       of      c-E     i-E     score   bias    hf      ht      af      at      ef      et      acc     desc
93             #         1       2       3       4       5       6       7       8       9       10      11      12      13      14      15      16      17      18      19      20      21      22      23
94             line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/
95
96             target_name = $1
97             if domain_id != target_name
98               next
99             end
100
101             sequence = $4
102             number   = $10.to_i
103             out_of   = $11.to_i
104             env_from = $20.to_i
105             env_to   = $21.to_i
106             i_e_value  = $13.to_f
107             if ( number > max_domain_copy_number_per_protein )
108               max_domain_copy_number_sequence    = sequence
109               max_domain_copy_number_per_protein = number
110             end
111             if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
112                  ( ( length_threshold <= 0 )   || ( env_to - env_from + 1 ) >= length_threshold.to_f )  )
113
114               extract_domain( sequence,
115                 number,
116                 out_of,
117                 env_from,
118                 env_to,
119                 in_msa,
120                 out_msa,
121                 add_position,
122                 add_domain_number,
123                 add_domain_number_as_digit,
124                 add_domain_number_as_letter,
125                 trim_name ,
126                 add_species )
127               domain_pass_counter += 1
128
129               if passed_seqs.find_by_name_start( sequence, true ).length < 1
130                 add_sequence( sequence, in_msa, passed_seqs )
131                 proteins_with_passing_domains += 1
132               end
133
134               if min_linker
135                 if ( ( e_value_threshold < 0.0 ) || ( prev_i_e_value <= e_value_threshold  ) ) &&
136                    ( ( length_threshold <= 0 )   || (  ( prev_env_to - prev_env_from + 1 ) >= length_threshold.to_f    ) )
137
138                   if sequence != prev_sequence
139                     prev_is_pair = false
140                   end
141                   
142                   if out_of == 1
143
144                     if sequence == prev_sequence
145                       puts "sequence == prev_sequence && out_of == 1"
146                       exit
147                     end
148                     extract_domain( sequence,
149                       number,
150                       out_of,
151                       env_from,
152                       env_to,
153                       in_msa,
154                       out_msa_singlets,
155                       false,
156                       true,
157                       false,
158                       false,
159                       trim_name ,
160                       add_species )
161
162                   elsif sequence == prev_sequence
163
164                     if  ( env_from - prev_env_to ) <= min_linker  #######
165                       extract_domain( sequence,
166                         prev_number.to_s + "+" + number.to_s,
167                         out_of,
168                         prev_env_from,
169                         env_to,
170                         in_msa,
171                         out_msa_pairs,
172                         false,
173                         true,
174                         false,
175                         false,
176                         trim_name ,
177                         add_species )
178                       prev_is_pair = true
179                     else               #######
180                       if !prev_is_pair
181                         extract_domain( sequence,
182                           prev_number,
183                           out_of,
184                           prev_env_from,
185                           prev_env_to,
186                           in_msa,
187                           out_msa_distant_partners,
188                           false,
189                           true,
190                           false,
191                           false,
192                           trim_name ,
193                           add_species )
194                       end
195                       if number == out_of
196                         extract_domain( sequence,
197                           number,
198                           out_of,
199                           env_from,
200                           env_to,
201                           in_msa,
202                           out_msa_distant_partners,
203                           false,
204                           true,
205                           false,
206                           false,
207                           trim_name ,
208                           add_species )
209                       end
210                       prev_is_pair = false
211                     end                #######
212
213                   end
214                   prev_sequence = sequence
215                   prev_number   = number
216                   prev_env_from = env_from
217                   prev_env_to   = env_to
218                   prev_i_e_value  = i_e_value
219                 end
220
221               else
222                 print( domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" )
223                 log << domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)"
224                 if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) )
225                   print( " iE=" + i_e_value.to_s )
226                   log << " iE=" + i_e_value.to_s
227                 end
228                 if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f )
229                   le = env_to - env_from + 1
230                   print( " l=" + le.to_s )
231                   log << " l=" + le.to_s
232                 end
233                 print( Constants::LINE_DELIMITER )
234                 log << Constants::LINE_DELIMITER
235                 domain_fail_counter  += 1
236
237                 if failed_seqs.find_by_name_start( sequence, true ).length < 1
238                   add_sequence( sequence, in_msa, failed_seqs )
239                   proteins_with_failing_domains += 1
240                 end
241               end
242             end
243           end
244         end
245
246         if domain_pass_counter < 1
247           error_msg = "no domain sequences were extracted"
248           raise StandardError, error_msg
249         end
250
251         log << Constants::LINE_DELIMITER
252         puts( "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s )
253         log << "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s
254         log << Constants::LINE_DELIMITER
255
256         if ( max_domain_copy_number_per_protein > 1 )
257           puts( "First protein with this copy number: " + max_domain_copy_number_sequence )
258           log << "First protein with this copy number: " + max_domain_copy_number_sequence
259           log << Constants::LINE_DELIMITER
260         end
261
262         write_msa( out_msa, outfile  )
263         write_msa( passed_seqs, passed_seqs_outfile )
264         write_msa( failed_seqs, failed_seqs_outfile )
265
266         if out_msa_pairs
267           write_msa( out_msa_pairs, outfile +"_" + min_linker.to_s )
268         end
269
270         if out_msa_singlets
271           write_msa( out_msa_singlets, outfile +"_singles" )
272         end
273
274         if out_msa_distant_partners
275           write_msa( out_msa_distant_partners, outfile +"_singles" )
276         end
277
278
279         log << ld
280         log << "passing domains              : " + domain_pass_counter.to_s + ld
281         log << "failing domains              : " + domain_fail_counter.to_s + ld
282         log << "proteins with passing domains: " + proteins_with_passing_domains.to_s + ld
283         log << "proteins with failing domains: " + proteins_with_failing_domains.to_s + ld
284         log << ld
285
286         return domain_pass_counter
287
288       end # parse
289
290
291       private
292
293       def write_msa( msa, filename )
294         io = MsaIO.new()
295         w = FastaWriter.new()
296         w.set_line_width( 60 )
297         w.clean( true )
298         begin
299           io.write_to_file( msa, filename, w )
300         rescue Exception
301           error_msg = "could not write to \"" + filename + "\""
302           raise IOError, error_msg
303         end
304       end
305
306
307       def add_sequence( sequence_name, in_msa, add_to_msa )
308         seqs = in_msa.find_by_name_start( sequence_name, true )
309         if ( seqs.length < 1 )
310           error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
311           raise StandardError, error_msg
312         end
313         if ( seqs.length > 1 )
314           error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
315           raise StandardError, error_msg
316         end
317         seq = in_msa.get_sequence( seqs[ 0 ] )
318         add_to_msa.add_sequence( seq )
319       end
320
321       # raises ArgumentError, StandardError
322       def extract_domain( sequence,
323           number,
324           out_of,
325           seq_from,
326           seq_to,
327           in_msa,
328           out_msa,
329           add_position,
330           add_domain_number,
331           add_domain_number_as_digit,
332           add_domain_number_as_letter,
333           trim_name,
334           add_species )
335         if  number.is_a?( Fixnum ) && ( number < 1 || out_of < 1 || number > out_of )
336           error_msg = "impossible: number=" + number.to_s + ", out of=" + out_of.to_s
337           raise ArgumentError, error_msg
338         end
339         if  seq_from < 1 || seq_to < 1 || seq_from >= seq_to
340           error_msg = "impossible: seq-f=" + seq_from.to_s + ", seq-t=" + seq_to.to_s
341           raise ArgumentError, error_msg
342         end
343         seqs = in_msa.find_by_name_start( sequence, true )
344         if seqs.length < 1
345           error_msg = "sequence \"" + sequence + "\" not found in sequence file"
346           raise StandardError, error_msg
347         end
348         if seqs.length > 1
349           error_msg = "sequence \"" + sequence + "\" not unique in sequence file"
350           raise StandardError, error_msg
351         end
352         # hmmsearch is 1 based, wheres sequences are 0 bases in this package.
353         seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
354
355         orig_name = seq.get_name
356
357         seq.set_name( orig_name.split[ 0 ] )
358
359         if add_position
360           seq.set_name( seq.get_name + "_" + seq_from.to_s + "-" + seq_to.to_s )
361         end
362
363         if trim_name
364           seq.set_name( seq.get_name[ 0, seq.get_name.length - TRIM_BY ] )
365         end
366
367         if out_of != 1
368           if add_domain_number_as_digit
369             seq.set_name( seq.get_name + number.to_s )
370           elsif add_domain_number_as_letter
371             if number > 25
372               error_msg = 'too many identical domains per sequence, cannot use letters to distinguish them'
373               raise StandardError, error_msg
374             end
375             seq.set_name( seq.get_name + ( number + 96 ).chr )
376           elsif add_domain_number
377             seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
378           end
379         end
380
381         # if ( seq.get_name.length > 10 )
382         #   error_msg = "sequence name [" + seq.get_name + "] is longer than 10 characters"
383         #   raise StandardError, error_msg
384         # end
385
386         if add_species
387           a = orig_name.rindex "["
388           b = orig_name.rindex "]"
389           unless a && b
390             error_msg = "species not found in " + orig_name
391             raise StandardError, error_msg
392           end
393           species = orig_name[ a .. b ]
394           seq.set_name( seq.get_name + " " + species )
395         end
396
397         out_msa.add_sequence( seq )
398
399       end
400
401       def is_ignorable?( line )
402         return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
403       end
404
405     end # class HmmscanDomainExtractor
406
407   end # module Evoruby
408