in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / hmmscan_summary.rb
1 #
2 # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
3 #
4 # Copyright::  Copyright (C) 2012 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
8 #
9
10 require 'set'
11
12 require 'lib/evo/util/constants'
13 require 'lib/evo/util/util'
14 require 'lib/evo/util/command_line_arguments'
15 require 'lib/evo/io/parser/hmmscan_parser'
16
17 module Evoruby
18
19   class HmmscanSummary
20
21     PRG_NAME       = "hsp"
22     PRG_VERSION    = "2.002"
23     PRG_DESC       = "hmmscan summary"
24     PRG_DATE       = "130319"
25     COPYRIGHT      = "2013 Christian M Zmasek"
26     CONTACT        = "phyloxml@gmail.com"
27     WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
28
29     DELIMITER_OPTION              = "d"
30     SPECIES_OPTION                = "s"
31     I_E_VALUE_THRESHOLD_OPTION    = "ie"
32     FS_E_VALUE_THRESHOLD_OPTION   = "pe"
33     HMM_FOR_PROTEIN_OUTPUT        = "m"
34     IGNORE_DUF_OPTION             = "i"
35     PARSE_OUT_DESCRIPITION_OPTION = "a"
36     HELP_OPTION_1                 = "help"
37     HELP_OPTION_2                 = "h"
38
39     USE_AVOID_HMMS = true
40     AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
41     LIMIT_FOR_CLOSE_DOMAINS = 20
42
43     def initialize
44       @domain_counts = Hash.new
45     end
46
47     def run
48
49       Util.print_program_information( PRG_NAME,
50         PRG_VERSION,
51         PRG_DESC,
52         PRG_DATE,
53         COPYRIGHT,
54         CONTACT,
55         WWW,
56         STDOUT )
57
58       begin
59         cla = CommandLineArguments.new( ARGV )
60       rescue ArgumentError => e
61         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
62       end
63
64       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
65            cla.is_option_set?( HELP_OPTION_2 ) )
66         print_help
67         exit( 0 )
68       end
69
70       if ( cla.get_number_of_files != 2 )
71         print_help
72         exit( -1 )
73       end
74
75       allowed_opts = Array.new
76       allowed_opts.push( DELIMITER_OPTION )
77       allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
78       allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
79       allowed_opts.push( IGNORE_DUF_OPTION )
80       allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
81       allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
82       allowed_opts.push( SPECIES_OPTION )
83
84       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
85       if ( disallowed.length > 0 )
86         Util.fatal_error( PRG_NAME,
87           "unknown option(s): " + disallowed,
88           STDOUT )
89       end
90
91       inpath = cla.get_file_name( 0 )
92       outpath = cla.get_file_name( 1 )
93
94       column_delimiter = "\t"
95       if ( cla.is_option_set?( DELIMITER_OPTION ) )
96         begin
97           column_delimiter = cla.get_option_value( DELIMITER_OPTION )
98         rescue ArgumentError => e
99           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
100         end
101       end
102
103       i_e_value_threshold = -1.0
104       if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
105         begin
106           i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
107         rescue ArgumentError => e
108           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
109         end
110         if ( i_e_value_threshold < 0.0 )
111           Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
112         end
113       end
114
115       fs_e_value_threshold = -1.0
116       if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
117         begin
118           fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
119         rescue ArgumentError => e
120           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
121         end
122         if ( fs_e_value_threshold < 0.0 )
123           Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
124         end
125       end
126
127       hmm_for_protein_output = ""
128       if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
129         begin
130           hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
131         rescue ArgumentError => e
132           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
133         end
134       end
135
136       species = "HUMAN"
137       if ( cla.is_option_set?( SPECIES_OPTION ) )
138         begin
139           species = cla.get_option_value( SPECIES_OPTION )
140         rescue ArgumentError => e
141           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
142         end
143       end
144
145       ignore_dufs = false
146       if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
147         ignore_dufs = true
148       end
149
150       parse_descriptions = false
151       if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
152         parse_descriptions = true
153       end
154
155       puts()
156       puts( "hmmpfam outputfile  : " + inpath )
157       puts( "outputfile          : " + outpath )
158       puts( "species             : " + species )
159       if ( i_e_value_threshold >= 0.0 )
160         puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
161       else
162         puts( "i-E-value threshold : no threshold" )
163       end
164       if ( parse_descriptions )
165         puts( "parse descriptions  : true" )
166       else
167         puts( "parse descriptions  : false" )
168       end
169       if ( ignore_dufs )
170         puts( "ignore DUFs         : true" )
171       else
172         puts( "ignore DUFs         : false" )
173       end
174       if ( column_delimiter == "\t" )
175         puts( "column delimiter    : TAB" )
176       else
177         puts( "column delimiter     : " + column_delimiter )
178       end
179       if fs_e_value_threshold >= 0.0
180         puts( "E-value threshold   : " + fs_e_value_threshold.to_s )
181       else
182         puts( "E-value threshold   : no threshold" )
183       end
184       if !hmm_for_protein_output.empty?
185         puts( "HMM for proteins    : " + hmm_for_protein_output )
186       end
187       puts()
188
189       begin
190         parse( inpath,
191           outpath,
192           column_delimiter,
193           i_e_value_threshold,
194           ignore_dufs,
195           parse_descriptions,
196           fs_e_value_threshold,
197           hmm_for_protein_output,
198           species )
199       rescue IOError => e
200         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
201       end
202       domain_counts = get_domain_counts()
203
204       puts
205       puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
206       puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
207       puts
208       puts( Util.draw_histogram( domain_counts, "#" ) )
209       puts
210       Util.print_message( PRG_NAME, 'OK' )
211       puts
212
213     end # def run
214
215     private
216
217     # raises ArgumentError, IOError
218     def parse( inpath,
219         outpath,
220         column_delimiter,
221         i_e_value_threshold,
222         ignore_dufs,
223         get_descriptions,
224         fs_e_value_threshold,
225         hmm_for_protein_output,
226         species )
227
228       Util.check_file_for_readability( inpath )
229       Util.check_file_for_writability( outpath )
230
231       hmmscan_parser = HmmscanParser.new( inpath )
232       results = hmmscan_parser.parse
233
234       outfile = File.open( outpath, "a" )
235
236       query     = ""
237       desc      = ""
238       model     = ""
239       env_from  = ""
240       env_to    = ""
241       i_e_value = ""
242
243       hmmscan_results_per_protein = []
244
245       prev_query = ""
246
247       results.each do | r |
248         model     = r.model
249         query     = r.query
250         i_e_value = r.i_e_value
251         env_from  = r.env_from
252         env_to    = r.env_to
253
254         if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
255            ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
256           count_model( model )
257           outfile.print( query +
258              column_delimiter )
259           if ( get_descriptions )
260             outfile.print( desc +
261                column_delimiter )
262           end
263           outfile.print( model +
264              column_delimiter +
265              env_from.to_s +
266              column_delimiter +
267              env_to.to_s +
268              column_delimiter +
269              i_e_value.to_s )
270           outfile.print( Constants::LINE_DELIMITER )
271         end
272
273         if !hmm_for_protein_output.empty?
274           if  !prev_query.empty? && prev_query != query
275             if !hmmscan_results_per_protein.empty?
276               process_hmmscan_results_per_protein( hmmscan_results_per_protein,
277                 fs_e_value_threshold,
278                 hmm_for_protein_output,
279                 i_e_value_threshold,
280                 species )
281             end
282             hmmscan_results_per_protein.clear
283           end
284           prev_query = query
285
286           if USE_AVOID_HMMS
287             if !AVOID_HHMS.include? r.model
288               hmmscan_results_per_protein << r
289             end
290           else
291             hmmscan_results_per_protein << r
292           end
293         end
294       end
295
296       if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty?
297         process_hmmscan_results_per_protein( hmmscan_results_per_protein,
298           fs_e_value_threshold,
299           hmm_for_protein_output,
300           i_e_value_threshold,
301           species )
302       end
303
304       outfile.flush()
305       outfile.close()
306     end # def parse
307
308     def process_id( id )
309       if  id =~ /(sp|tr)\|\S+\|(\S+)/
310         id = $2
311       end
312       id
313     end
314
315     def count_model( model )
316       if ( @domain_counts.has_key?( model ) )
317         count = @domain_counts[ model ].to_i
318         count += 1
319         @domain_counts[ model ] = count
320       else
321         @domain_counts[ model ] = 1
322       end
323     end
324
325     def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
326         fs_e_value_threshold,
327         hmm_for_protein_output,
328         i_e_value_threshold,
329         species )
330
331       dc = 0
332       # filter according to i-Evalue threshold
333       # abort if fs Evalue too high
334       hmmscan_results_per_protein_filtered = []
335
336       hmmscan_results_per_protein.each do | r |
337
338
339         if r.model == hmm_for_protein_output
340           if fs_e_value_threshold > 0.0 && r.fs_e_value > fs_e_value_threshold
341             return
342           end
343         end
344         if i_e_value_threshold <= 0 || r.i_e_value <= i_e_value_threshold
345           hmmscan_results_per_protein_filtered << r
346           if r.model == hmm_for_protein_output
347             dc += 1
348           end
349         end
350       end
351
352       if dc == 0
353         # passed on protein E-value, failed in per domain E-values
354         return
355       end
356
357       hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
358
359       own = nil
360       hmmscan_results_per_protein_filtered.each do | r |
361         if r.model == hmm_for_protein_output
362           own = r
363         end
364       end
365
366       s = ""
367       s << own.query + "\t"
368       s << species + "\t"
369       s << own.fs_e_value.to_s + "\t"
370       s << own.qlen.to_s + "\t"
371       s << dc.to_s + "\t"
372       s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
373       hmmscan_results_per_protein_filtered.each do | r |
374         s << r.model + " "
375       end
376       s << "\t"
377
378       overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
379
380       s << overview  + "\t"
381
382       s << calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )  + "\t"
383
384       prev_r = nil
385       hmmscan_results_per_protein_filtered.each do | r |
386         if  prev_r != nil
387           s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
388         else
389           s << make_interdomain_sequence( r.env_from, false )
390         end
391         s << r.model
392         s << "["
393         s << r.env_from.to_s << "-" << r.env_to.to_s
394         s << "|ie=" << r.i_e_value.to_s
395         s << "|ce=" << r.c_e_value.to_s
396         s << "]"
397         prev_r = r
398       end
399       s << make_interdomain_sequence( own.qlen - prev_r.env_to, false )
400       puts s
401     end
402
403     def calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )
404       linkers = ""
405       prev_r = nil
406       hmmscan_results_per_protein_filtered.each do | r |
407         if r.model == hmm_for_protein_output
408           if  prev_r != nil
409             linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
410           end
411           prev_r = r
412         end
413       end
414       linkers
415     end
416
417     def get_domain_counts()
418       return @domain_counts
419     end
420
421     def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
422       overview = ""
423       prev_r = nil
424       hmmscan_results_per_protein_filtered.each do | r |
425         if r.model == hmm_for_protein_output
426           if prev_r == nil
427             overview << hmm_for_protein_output
428           else
429             if  ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
430               overview << "~" << hmm_for_protein_output
431             else
432               overview << "----" << hmm_for_protein_output
433             end
434           end
435           prev_r = r
436         end
437       end
438       overview
439     end
440
441     def make_interdomain_sequence( d, mark_short = true )
442       s = ""
443       d /= 20
444       if d >= 10
445         s << "----//----"
446       elsif d >= 1
447         d.times do
448           s << "-"
449         end
450       elsif mark_short
451         s << "~"
452       end
453       s
454     end
455
456
457     def print_help()
458       puts( "Usage:" )
459       puts()
460       puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
461       puts()
462       puts( "  options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
463       puts( "           -" + I_E_VALUE_THRESHOLD_OPTION  + ": i-E-value threshold, default is no threshold" )
464       puts( "           -" + PARSE_OUT_DESCRIPITION_OPTION  + ": parse query description (in addition to query name)" )
465       puts( "           -" + IGNORE_DUF_OPTION  + ": ignore DUFs" )
466       puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + ": E-value threshold for full protein sequences, only for protein summary" )
467       puts( "           -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
468       puts( "           -" + SPECIES_OPTION + ": species for protein summary" )
469       puts()
470     end
471
472   end # class
473
474 end # module Evoruby