in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / io / parser / hmmscan_domain_extractor_old2.rb
1 #
2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanDomainExtractor class
3 #
4 # Copyright::  Copyright (C) 2012 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id:  $
8
9
10 require 'lib/evo/util/constants'
11 require 'lib/evo/msa/msa_factory'
12 require 'lib/evo/io/msa_io'
13 require 'lib/evo/io/writer/fasta_writer'
14 require 'lib/evo/io/parser/fasta_parser'
15
16
17 module Evoruby
18
19   class HmmscanDomainExtractor
20
21     TRIM_BY = 2
22
23     def initialize
24     end
25
26     # raises ArgumentError, IOError, StandardError
27     def parse( domain_id,
28         hmmsearch_output,
29         fasta_sequence_file,
30         outfile,
31         passed_seqs_outfile,
32         failed_seqs_outfile,
33         e_value_threshold,
34         length_threshold,
35         add_position,
36         add_domain_number,
37         add_domain_number_as_digit,
38         add_domain_number_as_letter,
39         trim_name,
40         add_species,
41         min_linker,
42         log )
43
44       Util.check_file_for_readability( hmmsearch_output )
45       Util.check_file_for_readability( fasta_sequence_file )
46       Util.check_file_for_writability( outfile )
47       Util.check_file_for_writability( passed_seqs_outfile )
48       Util.check_file_for_writability( failed_seqs_outfile )
49
50       in_msa = nil
51       factory = MsaFactory.new()
52       in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
53
54       if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
55         error_msg = "could not find fasta sequences in " + fasta_sequence_file
56         raise IOError, error_msg
57       end
58
59       out_msa = Msa.new
60
61       failed_seqs = Msa.new
62       passed_seqs = Msa.new
63       out_msa_pairs = nil
64       out_msa_distant_partners = nil
65       out_msa_singlets = nil
66       if min_linker
67         out_msa_pairs = Msa.new
68         out_msa_distant_partners = Msa.new
69         out_msa_singlets = Msa.new
70       end
71
72       ld = Constants::LINE_DELIMITER
73
74       domain_pass_counter     = 0
75       domain_fail_counter     = 0
76       singlets_counter        = 0
77       distant_pairs_counter   = 0
78       close_pairs_counter     = 0
79       proteins_with_passing_domains = 0
80       proteins_with_failing_domains = 0
81       max_domain_copy_number_per_protein = -1
82       max_domain_copy_number_sequence    = ""
83
84       prev_sequence = nil
85       prev_number   = nil
86       prev_env_from = nil
87       prev_env_to   = nil
88       # prev_i_e_value  = nil
89       prev_is_pair = false
90
91       File.open( hmmsearch_output ) do | file |
92         while line = file.gets
93           if !is_ignorable?( line ) && line =~ /^\S+\s+/
94
95             #         tn      acc     tlen    query   acc     qlen    Evalue  score   bias    #       of      c-E     i-E     score   bias    hf      ht      af      at      ef      et      acc     desc
96             #         1       2       3       4       5       6       7       8       9       10      11      12      13      14      15      16      17      18      19      20      21      22      23
97             line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/
98
99             target_name = $1
100             if domain_id != target_name
101               next
102             end
103
104             sequence = $4
105             number   = $10.to_i
106             out_of   = $11.to_i
107             env_from = $20.to_i
108             env_to   = $21.to_i
109             i_e_value  = $13.to_f
110             if ( number > max_domain_copy_number_per_protein )
111               max_domain_copy_number_sequence    = sequence
112               max_domain_copy_number_per_protein = number
113             end
114             if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
115                  ( ( length_threshold <= 0 )   || ( env_to - env_from + 1 ) >= length_threshold.to_f ) )
116
117               extract_domain( sequence,
118                 number,
119                 out_of,
120                 env_from,
121                 env_to,
122                 in_msa,
123                 out_msa,
124                 add_position,
125                 add_domain_number,
126                 add_domain_number_as_digit,
127                 add_domain_number_as_letter,
128                 trim_name ,
129                 add_species )
130               domain_pass_counter += 1
131
132               if passed_seqs.find_by_name_start( sequence, true ).length < 1
133                 add_sequence( sequence, in_msa, passed_seqs )
134                 proteins_with_passing_domains += 1
135               end
136
137               if min_linker
138                 if out_of == 1
139                   extract_domain( sequence,
140                     number,
141                     out_of,
142                     env_from,
143                     env_to,
144                     in_msa,
145                     out_msa_singlets,
146                     false,
147                     true,
148                     false,
149                     false,
150                     trim_name,
151                     add_species )
152                   singlets_counter += 1
153                 elsif prev_sequence
154                   if sequence != prev_sequence
155                     prev_is_pair = false
156                   else
157                     if ( env_from - prev_env_to ) <= min_linker
158                       extract_domain( sequence,
159                         prev_number.to_s + "+" + number.to_s,
160                         out_of,
161                         prev_env_from,
162                         env_to,
163                         in_msa,
164                         out_msa_pairs,
165                         false,
166                         true,
167                         false,
168                         false,
169                         trim_name,
170                         add_species )
171                       prev_is_pair = true
172                       close_pairs_counter += 2
173                     else
174                       if !prev_is_pair
175                         extract_domain( sequence,
176                           prev_number,
177                           out_of,
178                           prev_env_from,
179                           prev_env_to,
180                           in_msa,
181                           out_msa_distant_partners,
182                           false,
183                           true,
184                           false,
185                           false,
186                           trim_name,
187                           add_species )
188                         distant_pairs_counter += 1
189                       end
190                       if number == out_of
191                         extract_domain( sequence,
192                           number,
193                           out_of,
194                           env_from,
195                           env_to,
196                           in_msa,
197                           out_msa_distant_partners,
198                           false,
199                           true,
200                           false,
201                           false,
202                           trim_name,
203                           add_species )
204                         distant_pairs_counter += 1
205                       end
206                       prev_is_pair = false
207                     end
208                   end # sequence != prev_sequence else
209                 end
210                 prev_sequence = sequence
211                 prev_number   = number
212                 prev_env_from = env_from
213                 prev_env_to   = env_to
214                 #prev_i_e_value  = i_e_value
215               end # if min_linker
216
217             else
218               print( domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" )
219               log << domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)"
220               if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) )
221                 print( " iE=" + i_e_value.to_s )
222                 log << " iE=" + i_e_value.to_s
223               end
224               if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f )
225                 le = env_to - env_from + 1
226                 print( " l=" + le.to_s )
227                 log << " l=" + le.to_s
228               end
229               print( Constants::LINE_DELIMITER )
230               log << Constants::LINE_DELIMITER
231               domain_fail_counter  += 1
232
233               if failed_seqs.find_by_name_start( sequence, true ).length < 1
234                 add_sequence( sequence, in_msa, failed_seqs )
235                 proteins_with_failing_domains += 1
236               end
237
238             end
239           end # if !is_ignorable?( line ) && line =~ /^\S+\s+/
240         end #  while line = file.gets
241       end #   File.open( hmmsearch_output ) do | file |
242
243       if domain_pass_counter < 1
244         error_msg = "no domain sequences were extracted"
245         raise StandardError, error_msg
246       end
247
248       log << Constants::LINE_DELIMITER
249       puts( "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s )
250       log << "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s
251       log << Constants::LINE_DELIMITER
252
253       if ( max_domain_copy_number_per_protein > 1 )
254         puts( "First protein with this copy number: " + max_domain_copy_number_sequence )
255         log << "First protein with this copy number: " + max_domain_copy_number_sequence
256         log << Constants::LINE_DELIMITER
257       end
258
259       write_msa( out_msa, outfile  )
260       write_msa( passed_seqs, passed_seqs_outfile )
261       write_msa( failed_seqs, failed_seqs_outfile )
262
263       if out_msa_pairs
264         write_msa( out_msa_pairs, outfile +"_" + min_linker.to_s )
265       end
266
267       if out_msa_singlets
268         write_msa( out_msa_singlets, outfile +"_singles" )
269       end
270
271       if out_msa_distant_partners
272         write_msa( out_msa_distant_partners, outfile + "_" + min_linker.to_s + "_isolated" );
273       end
274
275
276       log << ld
277       log << "passing domains              : " + domain_pass_counter.to_s + ld
278       if ( min_linker )
279         log << "single domains               : " + singlets_counter.to_s + ld
280         log << "domains in close pairs       : " + close_pairs_counter.to_s + ld
281         log << "isolated domains             : " + distant_pairs_counter.to_s + ld
282       end
283       log << "failing domains              : " + domain_fail_counter.to_s + ld
284       log << "proteins with passing domains: " + proteins_with_passing_domains.to_s + ld
285       log << "proteins with failing domains: " + proteins_with_failing_domains.to_s + ld
286       log << ld
287
288       return domain_pass_counter
289
290     end # parse
291
292
293     private
294
295     def write_msa( msa, filename )
296       io = MsaIO.new()
297       w = FastaWriter.new()
298       w.set_line_width( 60 )
299       w.clean( true )
300       begin
301         io.write_to_file( msa, filename, w )
302       rescue Exception
303         error_msg = "could not write to \"" + filename + "\""
304         raise IOError, error_msg
305       end
306     end
307
308
309     def add_sequence( sequence_name, in_msa, add_to_msa )
310       seqs = in_msa.find_by_name_start( sequence_name, true )
311       if ( seqs.length < 1 )
312         error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
313         raise StandardError, error_msg
314       end
315       if ( seqs.length > 1 )
316         error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
317         raise StandardError, error_msg
318       end
319       seq = in_msa.get_sequence( seqs[ 0 ] )
320       add_to_msa.add_sequence( seq )
321     end
322
323     # raises ArgumentError, StandardError
324     def extract_domain( sequence,
325         number,
326         out_of,
327         seq_from,
328         seq_to,
329         in_msa,
330         out_msa,
331         add_position,
332         add_domain_number,
333         add_domain_number_as_digit,
334         add_domain_number_as_letter,
335         trim_name,
336         add_species )
337       if  number.is_a?( Fixnum ) && ( number < 1 || out_of < 1 || number > out_of )
338         error_msg = "impossible: number=" + number.to_s + ", out of=" + out_of.to_s
339         raise ArgumentError, error_msg
340       end
341       if  seq_from < 1 || seq_to < 1 || seq_from >= seq_to
342         error_msg = "impossible: seq-f=" + seq_from.to_s + ", seq-t=" + seq_to.to_s
343         raise ArgumentError, error_msg
344       end
345       seqs = in_msa.find_by_name_start( sequence, true )
346       if seqs.length < 1
347         error_msg = "sequence \"" + sequence + "\" not found in sequence file"
348         raise StandardError, error_msg
349       end
350       if seqs.length > 1
351         error_msg = "sequence \"" + sequence + "\" not unique in sequence file"
352         raise StandardError, error_msg
353       end
354       # hmmsearch is 1 based, wheres sequences are 0 bases in this package.
355       seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
356
357       orig_name = seq.get_name
358
359       seq.set_name( orig_name.split[ 0 ] )
360
361       if add_position
362         seq.set_name( seq.get_name + "_" + seq_from.to_s + "-" + seq_to.to_s )
363       end
364
365       if trim_name
366         seq.set_name( seq.get_name[ 0, seq.get_name.length - TRIM_BY ] )
367       end
368
369       if out_of != 1
370         if add_domain_number_as_digit
371           seq.set_name( seq.get_name + number.to_s )
372         elsif add_domain_number_as_letter
373           if number > 25
374             error_msg = 'too many identical domains per sequence, cannot use letters to distinguish them'
375             raise StandardError, error_msg
376           end
377           seq.set_name( seq.get_name + ( number + 96 ).chr )
378         elsif add_domain_number
379           seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
380         end
381       end
382
383       # if ( seq.get_name.length > 10 )
384       #   error_msg = "sequence name [" + seq.get_name + "] is longer than 10 characters"
385       #   raise StandardError, error_msg
386       # end
387
388       if add_species
389         a = orig_name.rindex "["
390         b = orig_name.rindex "]"
391         unless a && b
392           error_msg = "species not found in " + orig_name
393           raise StandardError, error_msg
394         end
395         species = orig_name[ a .. b ]
396         seq.set_name( seq.get_name + " " + species )
397       end
398       out_msa.add_sequence( seq )
399     end
400
401     def is_ignorable?( line )
402       return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
403     end
404
405   end # class HmmscanDomainExtractor
406
407 end # module Evoruby
408