in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / io / parser / hmmscan_domain_extractor.rb
1 #
2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanDomainExtractor class
3 #
4 # Copyright::  Copyright (C) 2012 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id:  $
8
9
10 require 'lib/evo/util/constants'
11 require 'lib/evo/msa/msa_factory'
12 require 'lib/evo/io/msa_io'
13 require 'lib/evo/io/writer/fasta_writer'
14 require 'lib/evo/io/parser/fasta_parser'
15
16
17 module Evoruby
18
19   class HmmscanDomainExtractor
20
21     TRIM_BY = 2
22
23     def initialize
24     end
25
26     # raises ArgumentError, IOError, StandardError
27     def parse( domain_id,
28         hmmsearch_output,
29         fasta_sequence_file,
30         outfile,
31         passed_seqs_outfile,
32         failed_seqs_outfile,
33         e_value_threshold,
34         length_threshold,
35         add_position,
36         add_domain_number,
37         add_domain_number_as_digit,
38         add_domain_number_as_letter,
39         trim_name,
40         add_species,
41         min_linker,
42         log )
43
44       Util.check_file_for_readability( hmmsearch_output )
45       Util.check_file_for_readability( fasta_sequence_file )
46       Util.check_file_for_writability( outfile )
47       Util.check_file_for_writability( passed_seqs_outfile )
48       Util.check_file_for_writability( failed_seqs_outfile )
49
50       in_msa = nil
51       factory = MsaFactory.new()
52       in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
53
54       if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
55         error_msg = "could not find fasta sequences in " + fasta_sequence_file
56         raise IOError, error_msg
57       end
58
59       out_msa = Msa.new
60
61       failed_seqs = Msa.new
62       passed_seqs = Msa.new
63       out_msa_pairs = nil
64       out_msa_distant_partners = nil
65       out_msa_singlets = nil
66       if min_linker
67         out_msa_pairs = Msa.new
68         out_msa_distant_partners = Msa.new
69         out_msa_singlets = Msa.new
70       end
71
72       ld = Constants::LINE_DELIMITER
73
74       domain_pass_counter     = 0
75       domain_fail_counter     = 0
76       proteins_with_passing_domains = 0
77       proteins_with_failing_domains = 0
78       max_domain_copy_number_per_protein = -1
79       max_domain_copy_number_sequence    = ""
80
81       prev_sequence = nil
82       prev_number   = nil
83       prev_env_from = nil
84       prev_env_to   = nil
85       prev_i_e_value  = nil
86       prev_is_pair = false
87
88       File.open( hmmsearch_output ) do | file |
89         while line = file.gets
90           if !is_ignorable?( line ) && line =~ /^\S+\s+/
91
92             #         tn      acc     tlen    query   acc     qlen    Evalue  score   bias    #       of      c-E     i-E     score   bias    hf      ht      af      at      ef      et      acc     desc
93             #         1       2       3       4       5       6       7       8       9       10      11      12      13      14      15      16      17      18      19      20      21      22      23
94             line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/
95
96             target_name = $1
97             if domain_id != target_name
98               next
99             end
100
101             sequence = $4
102             number   = $10.to_i
103             out_of   = $11.to_i
104             env_from = $20.to_i
105             env_to   = $21.to_i
106             i_e_value  = $13.to_f
107             if ( number > max_domain_copy_number_per_protein )
108               max_domain_copy_number_sequence    = sequence
109               max_domain_copy_number_per_protein = number
110             end
111             if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
112                  ( ( length_threshold <= 0 )   || ( env_to - env_from + 1 ) >= length_threshold.to_f )  )
113
114               extract_domain( sequence,
115                 number,
116                 out_of,
117                 env_from,
118                 env_to,
119                 in_msa,
120                 out_msa,
121                 add_position,
122                 add_domain_number,
123                 add_domain_number_as_digit,
124                 add_domain_number_as_letter,
125                 trim_name ,
126                 add_species )
127               domain_pass_counter += 1
128
129               if passed_seqs.find_by_name_start( sequence, true ).length < 1
130                 add_sequence( sequence, in_msa, passed_seqs )
131                 proteins_with_passing_domains += 1
132               end
133
134               if min_linker
135                 if ( ( e_value_threshold < 0.0 ) || ( prev_i_e_value <= e_value_threshold  ) ) &&
136                    ( ( length_threshold <= 0 )   || (  ( prev_env_to - prev_env_from + 1 ) >= length_threshold.to_f    ) )
137
138                   if prev_sequence && sequence != prev_sequence
139                     prev_is_pair = false
140                   end
141
142                   if out_of == 1
143
144                   #  if prev_sequence && sequence == prev_sequence
145                   #    puts "sequence == prev_sequence && out_of == 1"
146                   #    exit
147                   #  end
148                     extract_domain( sequence,
149                       number,
150                       out_of,
151                       env_from,
152                       env_to,
153                       in_msa,
154                       out_msa_singlets,
155                       false,
156                       true,
157                       false,
158                       false,
159                       trim_name ,
160                       add_species )
161
162                   elsif prev_sequence && sequence == prev_sequence
163
164                     if  ( env_from - prev_env_to ) <= min_linker  #######
165                       extract_domain( sequence,
166                         prev_number.to_s + "+" + number.to_s,
167                         out_of,
168                         prev_env_from,
169                         env_to,
170                         in_msa,
171                         out_msa_pairs,
172                         false,
173                         true,
174                         false,
175                         false,
176                         trim_name ,
177                         add_species )
178                       prev_is_pair = true
179                     else               #######
180                       if !prev_is_pair
181                         extract_domain( sequence,
182                           prev_number,
183                           out_of,
184                           prev_env_from,
185                           prev_env_to,
186                           in_msa,
187                           out_msa_distant_partners,
188                           false,
189                           true,
190                           false,
191                           false,
192                           trim_name ,
193                           add_species )
194                       end
195                       if number == out_of
196                         extract_domain( sequence,
197                           number,
198                           out_of,
199                           env_from,
200                           env_to,
201                           in_msa,
202                           out_msa_distant_partners,
203                           false,
204                           true,
205                           false,
206                           false,
207                           trim_name ,
208                           add_species )
209                       end
210                       prev_is_pair = false
211                     end                #######
212
213                   end
214                   prev_sequence = sequence
215                   prev_number   = number
216                   prev_env_from = env_from
217                   prev_env_to   = env_to
218                   prev_i_e_value  = i_e_value
219                 end
220
221               else
222                 print( domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" )
223                 log << domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)"
224                 if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) )
225                   print( " iE=" + i_e_value.to_s )
226                   log << " iE=" + i_e_value.to_s
227                 end
228                 if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f )
229                   le = env_to - env_from + 1
230                   print( " l=" + le.to_s )
231                   log << " l=" + le.to_s
232                 end
233                 print( Constants::LINE_DELIMITER )
234                 log << Constants::LINE_DELIMITER
235                 domain_fail_counter  += 1
236
237                 if failed_seqs.find_by_name_start( sequence, true ).length < 1
238                   add_sequence( sequence, in_msa, failed_seqs )
239                   proteins_with_failing_domains += 1
240                 end
241               end
242             end
243           end # if !is_ignorable?( line ) && line =~ /^\S+\s+/
244         end #  while line = file.gets
245       end #   File.open( hmmsearch_output ) do | file |
246
247       if domain_pass_counter < 1
248         error_msg = "no domain sequences were extracted"
249         raise StandardError, error_msg
250       end
251
252       log << Constants::LINE_DELIMITER
253       puts( "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s )
254       log << "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s
255       log << Constants::LINE_DELIMITER
256
257       if ( max_domain_copy_number_per_protein > 1 )
258         puts( "First protein with this copy number: " + max_domain_copy_number_sequence )
259         log << "First protein with this copy number: " + max_domain_copy_number_sequence
260         log << Constants::LINE_DELIMITER
261       end
262
263       write_msa( out_msa, outfile  )
264       write_msa( passed_seqs, passed_seqs_outfile )
265       write_msa( failed_seqs, failed_seqs_outfile )
266
267       if out_msa_pairs
268         write_msa( out_msa_pairs, outfile +"_" + min_linker.to_s )
269       end
270
271       if out_msa_singlets
272         write_msa( out_msa_singlets, outfile +"_singles" )
273       end
274
275       if out_msa_distant_partners
276         write_msa( out_msa_distant_partners, outfile +"_dist" )
277       end
278
279
280       log << ld
281       log << "passing domains              : " + domain_pass_counter.to_s + ld
282       log << "failing domains              : " + domain_fail_counter.to_s + ld
283       log << "proteins with passing domains: " + proteins_with_passing_domains.to_s + ld
284       log << "proteins with failing domains: " + proteins_with_failing_domains.to_s + ld
285       log << ld
286
287       return domain_pass_counter
288
289     end # parse
290
291
292     private
293
294     def write_msa( msa, filename )
295       io = MsaIO.new()
296       w = FastaWriter.new()
297       w.set_line_width( 60 )
298       w.clean( true )
299       begin
300         io.write_to_file( msa, filename, w )
301       rescue Exception
302         error_msg = "could not write to \"" + filename + "\""
303         raise IOError, error_msg
304       end
305     end
306
307
308     def add_sequence( sequence_name, in_msa, add_to_msa )
309       seqs = in_msa.find_by_name_start( sequence_name, true )
310       if ( seqs.length < 1 )
311         error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
312         raise StandardError, error_msg
313       end
314       if ( seqs.length > 1 )
315         error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
316         raise StandardError, error_msg
317       end
318       seq = in_msa.get_sequence( seqs[ 0 ] )
319       add_to_msa.add_sequence( seq )
320     end
321
322     # raises ArgumentError, StandardError
323     def extract_domain( sequence,
324         number,
325         out_of,
326         seq_from,
327         seq_to,
328         in_msa,
329         out_msa,
330         add_position,
331         add_domain_number,
332         add_domain_number_as_digit,
333         add_domain_number_as_letter,
334         trim_name,
335         add_species )
336       if  number.is_a?( Fixnum ) && ( number < 1 || out_of < 1 || number > out_of )
337         error_msg = "impossible: number=" + number.to_s + ", out of=" + out_of.to_s
338         raise ArgumentError, error_msg
339       end
340       if  seq_from < 1 || seq_to < 1 || seq_from >= seq_to
341         error_msg = "impossible: seq-f=" + seq_from.to_s + ", seq-t=" + seq_to.to_s
342         raise ArgumentError, error_msg
343       end
344       seqs = in_msa.find_by_name_start( sequence, true )
345       if seqs.length < 1
346         error_msg = "sequence \"" + sequence + "\" not found in sequence file"
347         raise StandardError, error_msg
348       end
349       if seqs.length > 1
350         error_msg = "sequence \"" + sequence + "\" not unique in sequence file"
351         raise StandardError, error_msg
352       end
353       # hmmsearch is 1 based, wheres sequences are 0 bases in this package.
354       seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
355
356       orig_name = seq.get_name
357
358       seq.set_name( orig_name.split[ 0 ] )
359
360       if add_position
361         seq.set_name( seq.get_name + "_" + seq_from.to_s + "-" + seq_to.to_s )
362       end
363
364       if trim_name
365         seq.set_name( seq.get_name[ 0, seq.get_name.length - TRIM_BY ] )
366       end
367
368       if out_of != 1
369         if add_domain_number_as_digit
370           seq.set_name( seq.get_name + number.to_s )
371         elsif add_domain_number_as_letter
372           if number > 25
373             error_msg = 'too many identical domains per sequence, cannot use letters to distinguish them'
374             raise StandardError, error_msg
375           end
376           seq.set_name( seq.get_name + ( number + 96 ).chr )
377         elsif add_domain_number
378           seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
379         end
380       end
381
382       # if ( seq.get_name.length > 10 )
383       #   error_msg = "sequence name [" + seq.get_name + "] is longer than 10 characters"
384       #   raise StandardError, error_msg
385       # end
386
387       if add_species
388         a = orig_name.rindex "["
389         b = orig_name.rindex "]"
390         unless a && b
391           error_msg = "species not found in " + orig_name
392           raise StandardError, error_msg
393         end
394         species = orig_name[ a .. b ]
395         seq.set_name( seq.get_name + " " + species )
396       end
397       out_msa.add_sequence( seq )
398     end
399
400     def is_ignorable?( line )
401       return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
402     end
403
404   end # class HmmscanDomainExtractor
405
406 end # module Evoruby
407