in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / hmmscan_summary.rb
1 #
# = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
3 #
4 # Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
8 #
9 # last modified: 11/24/2009
10
11 require 'lib/evo/util/constants'
12 require 'lib/evo/util/util'
13 require 'lib/evo/util/command_line_arguments'
14 require 'lib/evo/io/parser/hmmscan_parser'
15
16 module Evoruby
17
18   class HmmscanSummary
19
20     PRG_NAME       = "hsp"
21     PRG_VERSION    = "2.000"
22     PRG_DESC       = "hmmscan parser"
23     PRG_DATE       = "2012.10.19"
24     COPYRIGHT      = "2012 Christian M Zmasek"
25     CONTACT        = "phylosoft@gmail.com"
26     WWW            = "www.phylosoft.org"
27
28     DELIMITER_OPTION              = "d"
29     I_E_VALUE_THRESHOLD_OPTION    = "ie"
30     FS_E_VALUE_THRESHOLD_OPTION   = "pe"
31     HMM_FOR_PROTEIN_OUTPUT        = "m"
32     IGNORE_DUF_OPTION             = "i"
33     PARSE_OUT_DESCRIPITION_OPTION = "a"
34     HELP_OPTION_1                 = "help"
35     HELP_OPTION_2                 = "h"
36
37     USE_AVOID_HMMS = true
38     AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
39     LIMIT_FOR_CLOSE_DOMAINS = 20
40
41     def initialize
42       @domain_counts = Hash.new
43     end
44
45     def run
46
47       Util.print_program_information( PRG_NAME,
48         PRG_VERSION,
49         PRG_DESC,
50         PRG_DATE,
51         COPYRIGHT,
52         CONTACT,
53         WWW,
54         STDOUT )
55
56       begin
57         cla = CommandLineArguments.new( ARGV )
58       rescue ArgumentError => e
59         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
60       end
61
62       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
63            cla.is_option_set?( HELP_OPTION_2 ) )
64         print_help
65         exit( 0 )
66       end
67
68       if ( cla.get_number_of_files != 2 )
69         print_help
70         exit( -1 )
71       end
72
73       allowed_opts = Array.new
74       allowed_opts.push( DELIMITER_OPTION )
75       allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
76       allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
77       allowed_opts.push( IGNORE_DUF_OPTION )
78       allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
79       allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
80
81       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
82       if ( disallowed.length > 0 )
83         Util.fatal_error( PRG_NAME,
84           "unknown option(s): " + disallowed,
85           STDOUT )
86       end
87
88       inpath = cla.get_file_name( 0 )
89       outpath = cla.get_file_name( 1 )
90
91       column_delimiter = "\t"
92       if ( cla.is_option_set?( DELIMITER_OPTION ) )
93         begin
94           column_delimiter = cla.get_option_value( DELIMITER_OPTION )
95         rescue ArgumentError => e
96           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
97         end
98       end
99
100       i_e_value_threshold = -1.0
101       if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
102         begin
103           i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
104         rescue ArgumentError => e
105           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
106         end
107         if ( i_e_value_threshold < 0.0 )
108           Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
109         end
110       end
111
112       fs_e_value_threshold = -1.0
113       if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
114         begin
115           fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
116         rescue ArgumentError => e
117           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
118         end
119         if ( fs_e_value_threshold < 0.0 )
120           Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
121         end
122       end
123
124       hmm_for_protein_output = ""
125       if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
126         begin
127           hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
128         rescue ArgumentError => e
129           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
130         end
131       end
132
133       ignore_dufs = false
134       if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
135         ignore_dufs = true
136       end
137
138       parse_descriptions = false
139       if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
140         parse_descriptions = true
141       end
142
143       puts()
144       puts( "hmmpfam outputfile  : " + inpath )
145       puts( "outputfile          : " + outpath )
146       if ( i_e_value_threshold >= 0.0 )
147         puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
148       else
149         puts( "i-E-value threshold : no threshold" )
150       end
151       if ( parse_descriptions )
152         puts( "parse descriptions  : true" )
153       else
154         puts( "parse descriptions  : false" )
155       end
156       if ( ignore_dufs )
157         puts( "ignore DUFs         : true" )
158       else
159         puts( "ignore DUFs         : false" )
160       end
161       if ( column_delimiter == "\t" )
162         puts( "column delimiter    : TAB" )
163       else
164         puts( "column delimiter     : " + column_delimiter )
165       end
166       if ( fs_e_value_threshold >= 0.0 )
167         puts( "E-value threshold   : " + fs_e_value_threshold.to_s )
168       else
169         puts( "E-value threshold   : no threshold" )
170       end
171       if ( !hmm_for_protein_output.empty? )
172         puts( "HMM for proteins    : " + hmm_for_protein_output )
173       end
174       puts()
175
176       begin
177         parse( inpath,
178           outpath,
179           column_delimiter,
180           i_e_value_threshold,
181           ignore_dufs,
182           parse_descriptions,
183           fs_e_value_threshold,
184           hmm_for_protein_output )
185       rescue ArgumentError, IOError => e
186         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
187       end
188       domain_counts = get_domain_counts()
189
190
191       puts
192       puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
193       puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
194       puts
195       puts( Util.draw_histogram( domain_counts, "#" ) )
196       puts
197       Util.print_message( PRG_NAME, 'OK' )
198       puts
199
200     end # def run
201
202     private
203
204     # raises ArgumentError, IOError
205     def parse( inpath,
206         outpath,
207         column_delimiter,
208         i_e_value_threshold,
209         ignore_dufs,
210         get_descriptions,
211         fs_e_value_threshold,
212         hmm_for_protein_output )
213       Util.check_file_for_readability( inpath )
214       Util.check_file_for_writability( outpath )
215
216       outfile = File.open( outpath, "a" )
217
218       query     = ""
219       desc      = ""
220       model     = ""
221       env_from  = ""
222       env_to    = ""
223       i_e_value = ""
224
225       hmmscan_results_per_protein = []
226
227       hmmscan_parser = HmmscanParser.new( inpath )
228
229       prev_query = ""
230
231       hmmscan_parser.parse.each do | r |
232         model     = r.model
233         query     = r.query
234         i_e_value = r.i_e_value
235         env_from  = r.env_from
236         env_to    = r.env_to
237
238         if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
239            ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
240           count_model( model )
241           outfile.print( query +
242              column_delimiter )
243           if ( get_descriptions )
244             outfile.print( desc +
245                column_delimiter )
246           end
247           outfile.print( model +
248              column_delimiter +
249              env_from.to_s +
250              column_delimiter +
251              env_to.to_s +
252              column_delimiter +
253              i_e_value.to_s )
254           outfile.print( Constants::LINE_DELIMITER )
255         end
256
257         if !hmm_for_protein_output.empty?
258           if  !prev_query.empty? && prev_query != query
259             if !hmmscan_results_per_protein.empty?
260               process_hmmscan_results_per_protein( hmmscan_results_per_protein,
261                 fs_e_value_threshold,
262                 hmm_for_protein_output,
263                 i_e_value_threshold )
264             end
265             hmmscan_results_per_protein.clear
266           end
267           prev_query = query
268
269           if USE_AVOID_HMMS
270             if !AVOID_HHMS.include? r.model
271               hmmscan_results_per_protein << r
272             end
273           else
274             hmmscan_results_per_protein << r
275           end
276         end
277       end
278       if !hmm_for_protein_output.empty?
279         if !hmmscan_results_per_protein.empty?
280           process_hmmscan_results_per_protein( hmmscan_results_per_protein,
281             fs_e_value_threshold,
282             hmm_for_protein_output,
283             i_e_value_threshold )
284         end
285       end
286       outfile.flush()
287       outfile.close()
288
289     end # def parse
290
291     def count_model( model )
292       if ( @domain_counts.has_key?( model ) )
293         count = @domain_counts[ model ].to_i
294         count += 1
295         @domain_counts[ model ] = count
296       else
297         @domain_counts[ model ] = 1
298       end
299     end
300
301     def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
302         fs_e_value_threshold,
303         hmm_for_protein_output,
304         i_e_value_threshold )
305
306       dc = 0
307       # filter according to i-Evalue threshold
308       # abort if fs Evalue too high
309       hmmscan_results_per_protein_filtered = []
310
311       hmmscan_results_per_protein.each do | r |
312         if r.model == hmm_for_protein_output
313           if r.fs_e_value > fs_e_value_threshold
314             return
315           end
316         end
317         if r.i_e_value <= i_e_value_threshold
318           hmmscan_results_per_protein_filtered << r
319           if r.model == hmm_for_protein_output
320             dc += 1
321           end
322         end
323       end
324
325       if dc == 0
326         # passed on protein E-value, failed in per domain E-values
327         return
328       end
329
330       hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
331
332       own = nil
333       hmmscan_results_per_protein_filtered.each do | r |
334         if r.model == hmm_for_protein_output
335           own = r
336         end
337       end
338
339       s = ""
340       s << own.query + "\t"
341       s << "HUMAN" + "\t"
342       s << own.fs_e_value.to_s + "\t"
343       s << own.qlen.to_s + "\t"
344       s << dc.to_s + "\t"
345       s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
346       hmmscan_results_per_protein_filtered.each do | r |
347         s << r.model + " "
348       end
349       s << "\t"
350
351       overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
352
353       s << overview   + "\t"
354
355       s << calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )  + "\t"
356
357       prev_r = nil
358       hmmscan_results_per_protein_filtered.each do | r |
359
360         if  prev_r != nil
361           s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
362         else
363           s << make_interdomain_sequence( r.env_from, false )
364         end
365         s << r.model
366         s << "["
367         s << r.env_from.to_s << "-" << r.env_to.to_s
368         s << "|ie=" << r.i_e_value.to_s
369         s << "|ce=" << r.c_e_value.to_s
370         s << "]"
371         prev_r = r
372       end
373       s << make_interdomain_sequence( own.qlen - prev_r.env_from, false )
374       puts s
375     end
376
377
378     def calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )
379       linkers = ""
380       prev_r = nil
381       hmmscan_results_per_protein_filtered.each do | r |
382         if r.model == hmm_for_protein_output
383           if  prev_r != nil
384             linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
385           end
386           prev_r = r
387         end
388       end
389       linkers
390     end
391
392     def get_domain_counts()
393       return @domain_counts
394     end
395
396     def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
397       overview = ""
398       prev_r = nil
399       hmmscan_results_per_protein_filtered.each do | r |
400         if r.model == hmm_for_protein_output
401           if prev_r == nil
402             overview << hmm_for_protein_output
403           else
404             if  ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
405               overview << "~" << hmm_for_protein_output
406             else
407               overview << "----" << hmm_for_protein_output
408             end
409           end
410           prev_r = r
411         end
412       end
413       overview
414     end
415
416     def make_interdomain_sequence( d, mark_short = true )
417       s = ""
418       d /= 20
419       if d >= 10
420         s << "----//----"
421       elsif d >= 1
422         d.times do
423           s << "-"
424         end
425       elsif mark_short
426         s << "~"
427       end
428       s
429     end
430
431
432
433     def print_help()
434       puts( "Usage:" )
435       puts()
436       puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
437       puts()
438       puts( "  options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
439       puts( "           -" + I_E_VALUE_THRESHOLD_OPTION  + ": i-E-value threshold, default is no threshold" )
440       puts( "           -" + PARSE_OUT_DESCRIPITION_OPTION  + ": parse query description (in addition to query name)" )
441       puts( "           -" + IGNORE_DUF_OPTION  + ": ignore DUFs" )
442       puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + ": E-value threshold for full protein sequences, only for protein summary" )
443       puts( "           -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
444       puts()
445     end
446
447   end # class
448
449 end # module Evoruby