v 1.001
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / hmmscan_summary.rb
1 #
2 # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
3 #
4 # Copyright::    Copyright (C) 2017 Christian M Zmasek
5 # License::      GNU Lesser General Public License (LGPL)
6
7 require 'set'
8 require 'lib/evo/util/constants'
9 require 'lib/evo/util/util'
10 require 'lib/evo/util/command_line_arguments'
11 require 'lib/evo/io/parser/hmmscan_parser'
12
13 module Evoruby
14   class HmmscanSummary
15
16     PRG_NAME       = "hsp"
17     PRG_VERSION    = "2.003"
18     PRG_DESC       = "Summarize hmmscan output tables into simpler tables"
19     PRG_DATE       = "170213"
20     WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
21
22     DELIMITER_OPTION              = "d"
23     SPECIES_OPTION                = "s"
24     I_E_VALUE_THRESHOLD_OPTION    = "ie"
25     FS_E_VALUE_THRESHOLD_OPTION   = "pe"
26     HMM_FOR_PROTEIN_OUTPUT        = "m"
27     IGNORE_DUF_OPTION             = "i"
28     PARSE_OUT_DESCRIPITION_OPTION = "a"
29     HELP_OPTION_1                 = "help"
30     HELP_OPTION_2                 = "h"
31
32     USE_AVOID_HMMS = false
33     AVOID_HHMS = [ "x", "y", "z" ]
34     LIMIT_FOR_CLOSE_DOMAINS = 20 # Used for protein architecture summary
35
36     def initialize
37       @domain_counts = Hash.new
38     end
39
40     def run
41
42       Util.print_program_information( PRG_NAME,
43       PRG_VERSION,
44       PRG_DESC,
45       PRG_DATE,
46       WWW,
47       STDOUT )
48
49       if ( ARGV == nil || ( ARGV.length < 1 )  )
50         print_help
51         exit( -1 )
52       end
53
54       begin
55         cla = CommandLineArguments.new( ARGV )
56       rescue ArgumentError => e
57         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
58       end
59
60       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
61       cla.is_option_set?( HELP_OPTION_2 ) )
62         print_help
63         exit( 0 )
64       end
65
66       allowed_opts = Array.new
67       allowed_opts.push( DELIMITER_OPTION )
68       allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
69       allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
70       allowed_opts.push( IGNORE_DUF_OPTION )
71       allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
72       allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
73       allowed_opts.push( SPECIES_OPTION )
74
75       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
76       if ( disallowed.length > 0 )
77         Util.fatal_error( PRG_NAME,
78         "unknown option(s): " + disallowed,
79         STDOUT )
80       end
81
82       inpath = cla.get_file_name( 0 )
83
84       outpath = ""
85       if ( cla.get_number_of_files == 1 )
86         outpath = inpath + Constants::DOMAIN_TABLE_SUFFIX
87       elsif ( cla.get_number_of_files == 2 )
88         outpath = cla.get_file_name( 1 )
89       else
90         print_help
91         exit( -1 )
92       end
93
94       column_delimiter = "\t"
95       if ( cla.is_option_set?( DELIMITER_OPTION ) )
96         begin
97           column_delimiter = cla.get_option_value( DELIMITER_OPTION )
98         rescue ArgumentError => e
99           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
100         end
101       end
102
103       i_e_value_threshold = -1.0
104       if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
105         begin
106           i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
107         rescue ArgumentError => e
108           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
109         end
110         if ( i_e_value_threshold < 0.0 )
111           Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
112         end
113       end
114
115       fs_e_value_threshold = -1.0
116       if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
117         begin
118           fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
119         rescue ArgumentError => e
120           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
121         end
122         if ( fs_e_value_threshold < 0.0 )
123           Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
124         end
125       end
126
127       hmm_for_protein_output = ""
128       if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
129         begin
130           hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
131         rescue ArgumentError => e
132           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
133         end
134       end
135
136       species = "HUMAN"
137       if ( cla.is_option_set?( SPECIES_OPTION ) )
138         begin
139           species = cla.get_option_value( SPECIES_OPTION )
140         rescue ArgumentError => e
141           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
142         end
143       end
144
145       ignore_dufs = false
146       if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
147         ignore_dufs = true
148       end
149
150       parse_descriptions = false
151       if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
152         parse_descriptions = true
153       end
154
155       puts()
156       puts( "hmmscan outputfile  : " + inpath )
157       puts( "outputfile          : " + outpath )
158
159       if ( i_e_value_threshold >= 0.0 )
160         puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
161       else
162         puts( "i-E-value threshold : no threshold" )
163       end
164       if ( parse_descriptions )
165         puts( "parse descriptions  : true" )
166       else
167         puts( "parse descriptions  : false" )
168       end
169       if ( ignore_dufs )
170         puts( "ignore DUFs         : true" )
171       else
172         puts( "ignore DUFs         : false" )
173       end
174       if ( column_delimiter == "\t" )
175         puts( "column delimiter    : TAB" )
176       else
177         puts( "column delimiter    : " + column_delimiter )
178       end
179       if !hmm_for_protein_output.empty?
180         puts( "HMM for proteins    : " + hmm_for_protein_output )
181         puts( "species             : " + species )
182         if fs_e_value_threshold >= 0.0
183           puts( "E-value threshold   : " + fs_e_value_threshold.to_s )
184         else
185           puts( "E-value threshold   : no threshold" )
186         end
187       end
188       puts()
189
190       begin
191         parse( inpath,
192         outpath,
193         column_delimiter,
194         i_e_value_threshold,
195         ignore_dufs,
196         parse_descriptions,
197         fs_e_value_threshold,
198         hmm_for_protein_output,
199         species )
200       rescue IOError => e
201         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
202       end
203       domain_counts = get_domain_counts()
204
205       puts
206       puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
207       puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
208       puts
209       puts( Util.draw_histogram( domain_counts, "#" ) )
210       puts
211       Util.print_message( PRG_NAME, "wrote: " + outpath )
212       Util.print_message( PRG_NAME, "next step in standard analysis pipeline: d2f.rb")
213       Util.print_message( PRG_NAME, 'OK' )
214       puts
215
216     end # def run
217
218     private
219
220     # raises ArgumentError, IOError
221     def parse( inpath,
222       outpath,
223       column_delimiter,
224       i_e_value_threshold,
225       ignore_dufs,
226       get_descriptions,
227       fs_e_value_threshold,
228       hmm_for_protein_output,
229       species )
230
231       Util.check_file_for_readability( inpath )
232       Util.check_file_for_writability( outpath )
233
234       hmmscan_parser = HmmscanParser.new( inpath )
235       results = hmmscan_parser.parse
236
237       outfile = File.open( outpath, "a" )
238
239       query     = ""
240       desc      = ""
241       model     = ""
242       env_from  = ""
243       env_to    = ""
244       i_e_value = ""
245
246       hmmscan_results_per_protein = []
247
248       prev_query = ""
249
250       results.each do | r |
251         model     = r.model
252         desc      = r.desc
253         query     = r.query
254         i_e_value = r.i_e_value
255         env_from  = r.env_from
256         env_to    = r.env_to
257
258         if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
259         ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
260           count_model( model )
261           outfile.print( query +
262           column_delimiter )
263           if ( get_descriptions )
264             outfile.print( desc +
265             column_delimiter )
266           end
267           outfile.print( model +
268           column_delimiter +
269           env_from.to_s +
270           column_delimiter +
271           env_to.to_s +
272           column_delimiter +
273           i_e_value.to_s )
274           outfile.print( Constants::LINE_DELIMITER )
275         end
276
277         if !hmm_for_protein_output.empty?
278           if  !prev_query.empty? && prev_query != query
279             if !hmmscan_results_per_protein.empty?
280               process_hmmscan_results_per_protein( hmmscan_results_per_protein,
281               fs_e_value_threshold,
282               hmm_for_protein_output,
283               i_e_value_threshold,
284               species )
285             end
286             hmmscan_results_per_protein.clear
287           end
288           prev_query = query
289
290           if USE_AVOID_HMMS
291             if !AVOID_HHMS.include? r.model
292               hmmscan_results_per_protein << r
293             end
294           else
295             hmmscan_results_per_protein << r
296           end
297         end
298       end
299
300       if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty?
301         process_hmmscan_results_per_protein( hmmscan_results_per_protein,
302         fs_e_value_threshold,
303         hmm_for_protein_output,
304         i_e_value_threshold,
305         species )
306       end
307
308       outfile.flush()
309       outfile.close()
310     end # def parse
311
312     def process_id( id )
313       if  id =~ /(sp|tr)\|\S+\|(\S+)/
314         id = $2
315       end
316       id
317     end
318
319     def count_model( model )
320       if ( @domain_counts.has_key?( model ) )
321         count = @domain_counts[ model ].to_i
322         count += 1
323         @domain_counts[ model ] = count
324       else
325         @domain_counts[ model ] = 1
326       end
327     end
328
329     def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
330       fs_e_value_threshold,
331       hmm_for_protein_output,
332       i_e_value_threshold,
333       species )
334
335       dc = 0
336       # filter according to i-Evalue threshold
337       # abort if fs Evalue too high
338       hmmscan_results_per_protein_filtered = []
339
340       hmmscan_results_per_protein.each do | r |
341
342         if r.model == hmm_for_protein_output
343           if fs_e_value_threshold > 0.0 && r.fs_e_value > fs_e_value_threshold
344             return
345           end
346         end
347         if i_e_value_threshold <= 0 || r.i_e_value <= i_e_value_threshold
348           hmmscan_results_per_protein_filtered << r
349           if r.model == hmm_for_protein_output
350             dc += 1
351           end
352         end
353       end
354
355       if dc == 0
356         # passed on protein E-value, failed in per domain E-values
357         return
358       end
359
360       hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
361
362       own = nil
363       hmmscan_results_per_protein_filtered.each do | r |
364         if r.model == hmm_for_protein_output
365           own = r
366         end
367       end
368
369       s = ""
370       s << own.query + "\t"
371       s << species + "\t"
372       s << own.fs_e_value.to_s + "\t"
373       s << own.qlen.to_s + "\t"
374       s << dc.to_s + "\t"
375       s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
376       hmmscan_results_per_protein_filtered.each do | r |
377         s << r.model + " "
378       end
379       s << "\t"
380
381       overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
382
383       s << overview  + "\t"
384
385       s << calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )  + "\t"
386
387       prev_r = nil
388       hmmscan_results_per_protein_filtered.each do | r |
389         if  prev_r != nil
390           s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
391         else
392           s << make_interdomain_sequence( r.env_from, false )
393         end
394         s << r.model
395         s << "["
396         s << r.env_from.to_s << "-" << r.env_to.to_s
397         s << "|ie=" << r.i_e_value.to_s
398         s << "|ce=" << r.c_e_value.to_s
399         s << "]"
400         prev_r = r
401       end
402       s << make_interdomain_sequence( own.qlen - prev_r.env_to, false )
403       puts s
404     end
405
406     def calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )
407       linkers = ""
408       prev_r = nil
409       hmmscan_results_per_protein_filtered.each do | r |
410         if r.model == hmm_for_protein_output
411           if  prev_r != nil
412             linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
413           end
414           prev_r = r
415         end
416       end
417       linkers
418     end
419
420     def get_domain_counts()
421       return @domain_counts
422     end
423
424     def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
425       overview = ""
426       prev_r = nil
427       hmmscan_results_per_protein_filtered.each do | r |
428         if r.model == hmm_for_protein_output
429           if prev_r == nil
430             overview << hmm_for_protein_output
431           else
432             if  ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
433               overview << "~" << hmm_for_protein_output
434             else
435               overview << "----" << hmm_for_protein_output
436             end
437           end
438           prev_r = r
439         end
440       end
441       overview
442     end
443
444     def make_interdomain_sequence( d, mark_short = true )
445       s = ""
446       d /= 20
447       if d >= 10
448         s << "----//----"
449       elsif d >= 1
450         d.times do
451           s << "-"
452         end
453       elsif mark_short
454         s << "~"
455       end
456       s
457     end
458
459     def print_help()
460       puts( "Usage:" )
461       puts()
462       puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> [outputfile]" )
463       puts()
464       puts( "  options: -" + DELIMITER_OPTION + "=<s> : column delimiter for outputfile, default is TAB" )
465       puts( "           -" + I_E_VALUE_THRESHOLD_OPTION  + "=<f>: i-E-value threshold, default is no threshold" )
466       puts( "           -" + PARSE_OUT_DESCRIPITION_OPTION  + "     : parse query description (in addition to query name)" )
467       puts( "           -" + IGNORE_DUF_OPTION  + "     : ignore DUFs" )
468       puts( "           -" + HMM_FOR_PROTEIN_OUTPUT + "=<s> : HMM for protein architectures summary" )
469       puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + "=<f>: E-value threshold for full protein sequences, only for protein architectures summary" )
470       puts( "           -" + SPECIES_OPTION + "=<s> : species for protein architectures summary" )
471       puts()
472       puts( "  [next step in standard analysis pipeline: d2f.rb]")
473       puts()
474       puts( "Examples:" )
475       puts()
476       puts( "  " + "hmmscan --max --domtblout P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 -E 10 Pfam-A.hmm P53_ni.fasta" )
477       puts()
478       puts( "  " + PRG_NAME + ".rb P53_hmmscan_300_10" )
479       puts()
480     end
481
482   end # class
483
484 end # module Evoruby