in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / hmmscan_summary.rb
1 #
2 # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
3 #
4 # Copyright::  Copyright (C) 2012 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
8 #
9 # last modified: 121003
10
11 require 'lib/evo/util/constants'
12 require 'lib/evo/util/util'
13 require 'lib/evo/util/command_line_arguments'
14 require 'lib/evo/io/parser/hmmscan_parser'
15 require 'lib/evo/io/parser/uniprot_parser'
16
17 module Evoruby
18
19   class HmmscanSummary
20
21     PRG_NAME       = "hsp"
22     PRG_VERSION    = "2.000"
23     PRG_DESC       = "hmmscan summary"
24     PRG_DATE       = "2012.10.23"
25     COPYRIGHT      = "2012 Christian M Zmasek"
26     CONTACT        = "phylosoft@gmail.com"
27     WWW            = "www.phylosoft.org"
28
29     DELIMITER_OPTION              = "d"
30     I_E_VALUE_THRESHOLD_OPTION    = "ie"
31     FS_E_VALUE_THRESHOLD_OPTION   = "pe"
32     HMM_FOR_PROTEIN_OUTPUT        = "m"
33     IGNORE_DUF_OPTION             = "i"
34     PARSE_OUT_DESCRIPITION_OPTION = "a"
35     UNIPROT                       = "u"
36     HELP_OPTION_1                 = "help"
37     HELP_OPTION_2                 = "h"
38
39     USE_AVOID_HMMS = true
40     AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
41     LIMIT_FOR_CLOSE_DOMAINS = 20
42
43     def initialize
44       @domain_counts = Hash.new
45     end
46
47     def run
48
49       Util.print_program_information( PRG_NAME,
50         PRG_VERSION,
51         PRG_DESC,
52         PRG_DATE,
53         COPYRIGHT,
54         CONTACT,
55         WWW,
56         STDOUT )
57
58       begin
59         cla = CommandLineArguments.new( ARGV )
60       rescue ArgumentError => e
61         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
62       end
63
64       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
65            cla.is_option_set?( HELP_OPTION_2 ) )
66         print_help
67         exit( 0 )
68       end
69
70       if ( cla.get_number_of_files != 2 )
71         print_help
72         exit( -1 )
73       end
74
75       allowed_opts = Array.new
76       allowed_opts.push( DELIMITER_OPTION )
77       allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
78       allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
79       allowed_opts.push( IGNORE_DUF_OPTION )
80       allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
81       allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
82       allowed_opts.push( UNIPROT )
83
84       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
85       if ( disallowed.length > 0 )
86         Util.fatal_error( PRG_NAME,
87           "unknown option(s): " + disallowed,
88           STDOUT )
89       end
90
91       inpath = cla.get_file_name( 0 )
92       outpath = cla.get_file_name( 1 )
93
94       column_delimiter = "\t"
95       if ( cla.is_option_set?( DELIMITER_OPTION ) )
96         begin
97           column_delimiter = cla.get_option_value( DELIMITER_OPTION )
98         rescue ArgumentError => e
99           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
100         end
101       end
102
103       i_e_value_threshold = -1.0
104       if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
105         begin
106           i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
107         rescue ArgumentError => e
108           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
109         end
110         if ( i_e_value_threshold < 0.0 )
111           Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
112         end
113       end
114
115       fs_e_value_threshold = -1.0
116       if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
117         begin
118           fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
119         rescue ArgumentError => e
120           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
121         end
122         if ( fs_e_value_threshold < 0.0 )
123           Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
124         end
125       end
126
127       hmm_for_protein_output = ""
128       if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
129         begin
130           hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
131         rescue ArgumentError => e
132           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
133         end
134       end
135       
136       uniprot = ""
137        if ( cla.is_option_set?( UNIPROT ) )
138         begin
139            uniprot = cla.get_option_value( UNIPROT )
140         rescue ArgumentError => e
141           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
142         end
143       end
144
145       ignore_dufs = false
146       if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
147         ignore_dufs = true
148       end
149
150       parse_descriptions = false
151       if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
152         parse_descriptions = true
153       end
154
155       puts()
156       puts( "hmmpfam outputfile  : " + inpath )
157       puts( "outputfile          : " + outpath )
158       if ( i_e_value_threshold >= 0.0 )
159         puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
160       else
161         puts( "i-E-value threshold : no threshold" )
162       end
163       if ( parse_descriptions )
164         puts( "parse descriptions  : true" )
165       else
166         puts( "parse descriptions  : false" )
167       end
168       if ( ignore_dufs )
169         puts( "ignore DUFs         : true" )
170       else
171         puts( "ignore DUFs         : false" )
172       end
173       if ( column_delimiter == "\t" )
174         puts( "column delimiter    : TAB" )
175       else
176         puts( "column delimiter     : " + column_delimiter )
177       end
178       if fs_e_value_threshold >= 0.0 
179         puts( "E-value threshold   : " + fs_e_value_threshold.to_s )
180       else
181         puts( "E-value threshold   : no threshold" )
182       end
183       if !hmm_for_protein_output.empty? 
184         puts( "HMM for proteins    : " + hmm_for_protein_output )
185       end
186       if !uniprot.empty? 
187         puts( "Uniprot             : " + uniprot )
188       end
189       puts()
190
191       begin
192         parse( inpath,
193           outpath,
194           column_delimiter,
195           i_e_value_threshold,
196           ignore_dufs,
197           parse_descriptions,
198           fs_e_value_threshold,
199           hmm_for_protein_output,
200           uniprot )
201       rescue ArgumentError, IOError => e
202         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
203       end
204       domain_counts = get_domain_counts()
205
206
207       puts
208       puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
209       puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
210       puts
211       puts( Util.draw_histogram( domain_counts, "#" ) )
212       puts
213       Util.print_message( PRG_NAME, 'OK' )
214       puts
215
216     end # def run
217
218     private
219
220     # raises ArgumentError, IOError
221     def parse( inpath,
222         outpath,
223         column_delimiter,
224         i_e_value_threshold,
225         ignore_dufs,
226         get_descriptions,
227         fs_e_value_threshold,
228         hmm_for_protein_output,
229         uniprot )
230       Util.check_file_for_readability( inpath )
231       Util.check_file_for_writability( outpath )
232
233       hmmscan_parser = HmmscanParser.new( inpath )
234       results = hmmscan_parser.parse
235       
236       uniprot_entries = nil
237       if !uniprot.empty? 
238         uniprot_entries = read_uniprot( results, uniprot  )
239       end
240       
241       outfile = File.open( outpath, "a" )
242
243       query     = ""
244       desc      = ""
245       model     = ""
246       env_from  = ""
247       env_to    = ""
248       i_e_value = ""
249
250       hmmscan_results_per_protein = []
251
252       
253
254       prev_query = ""
255
256       results.each do | r |
257         model     = r.model
258         query     = r.query
259         i_e_value = r.i_e_value
260         env_from  = r.env_from
261         env_to    = r.env_to
262
263         if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
264            ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
265           count_model( model )
266           outfile.print( query +
267              column_delimiter )
268           if ( get_descriptions )
269             outfile.print( desc +
270                column_delimiter )
271           end
272           outfile.print( model +
273              column_delimiter +
274              env_from.to_s +
275              column_delimiter +
276              env_to.to_s +
277              column_delimiter +
278              i_e_value.to_s )
279           outfile.print( Constants::LINE_DELIMITER )
280         end
281
282         if !hmm_for_protein_output.empty?
283           if  !prev_query.empty? && prev_query != query
284             if !hmmscan_results_per_protein.empty?
285               process_hmmscan_results_per_protein( hmmscan_results_per_protein,
286                 fs_e_value_threshold,
287                 hmm_for_protein_output,
288                 i_e_value_threshold )
289             end
290             hmmscan_results_per_protein.clear
291           end
292           prev_query = query
293
294           if USE_AVOID_HMMS
295             if !AVOID_HHMS.include? r.model
296               hmmscan_results_per_protein << r
297             end
298           else
299             hmmscan_results_per_protein << r
300           end
301         end
302       end
303       if !hmm_for_protein_output.empty?
304         if !hmmscan_results_per_protein.empty?
305           process_hmmscan_results_per_protein( hmmscan_results_per_protein,
306             fs_e_value_threshold,
307             hmm_for_protein_output,
308             i_e_value_threshold, 
309              uniprot_entries )
310         end
311       end
312       outfile.flush()
313       outfile.close()
314
315     end # def parse
316
317     
318      def read_uniprot( hmmscan_results, uniprot  )  
319         ids = []
320          hmmscan_results.each do | r |
321            ids << r.query
322          end 
323          uniprot_parser = UniprotParser.new uniprot
324          uniprot_entries = uniprot_parser.parse ids 
325          uniprot_entries
326       end
327     
328     def count_model( model )
329       if ( @domain_counts.has_key?( model ) )
330         count = @domain_counts[ model ].to_i
331         count += 1
332         @domain_counts[ model ] = count
333       else
334         @domain_counts[ model ] = 1
335       end
336     end
337
338     def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
339         fs_e_value_threshold,
340         hmm_for_protein_output,
341         i_e_value_threshold,
342          uniprot_entries      )
343
344       dc = 0
345       # filter according to i-Evalue threshold
346       # abort if fs Evalue too high
347       hmmscan_results_per_protein_filtered = []
348
349       hmmscan_results_per_protein.each do | r |
350         if r.model == hmm_for_protein_output
351           if r.fs_e_value > fs_e_value_threshold
352             return
353           end
354         end
355         if r.i_e_value <= i_e_value_threshold
356           hmmscan_results_per_protein_filtered << r
357           if r.model == hmm_for_protein_output
358             dc += 1
359           end
360         end
361       end
362
363       if dc == 0
364         # passed on protein E-value, failed in per domain E-values
365         return
366       end
367
368       hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
369
370       own = nil
371       hmmscan_results_per_protein_filtered.each do | r |
372         if r.model == hmm_for_protein_output
373           own = r
374         end
375       end
376
377       s = ""
378       s << own.query + "\t"
379       s << "HUMAN" + "\t"
380       s << own.fs_e_value.to_s + "\t"
381       s << own.qlen.to_s + "\t"
382       s << dc.to_s + "\t"
383       s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
384       hmmscan_results_per_protein_filtered.each do | r |
385         s << r.model + " "
386       end
387       s << "\t"
388      s <<  uniprot_entries[  own.query ]
389        s << "\t"
390       overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
391
392       s << overview   + "\t"
393
394       s << calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )  + "\t"
395
396       prev_r = nil
397       hmmscan_results_per_protein_filtered.each do | r |
398
399         if  prev_r != nil
400           s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
401         else
402           s << make_interdomain_sequence( r.env_from, false )
403         end
404         s << r.model
405         s << "["
406         s << r.env_from.to_s << "-" << r.env_to.to_s
407         s << "|ie=" << r.i_e_value.to_s
408         s << "|ce=" << r.c_e_value.to_s
409         s << "]"
410         prev_r = r
411       end
412       s << make_interdomain_sequence( own.qlen - prev_r.env_from, false )
413       puts s
414     end
415
416
417     def calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )
418       linkers = ""
419       prev_r = nil
420       hmmscan_results_per_protein_filtered.each do | r |
421         if r.model == hmm_for_protein_output
422           if  prev_r != nil
423             linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
424           end
425           prev_r = r
426         end
427       end
428       linkers
429     end
430
431     def get_domain_counts()
432       return @domain_counts
433     end
434
435     def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
436       overview = ""
437       prev_r = nil
438       hmmscan_results_per_protein_filtered.each do | r |
439         if r.model == hmm_for_protein_output
440           if prev_r == nil
441             overview << hmm_for_protein_output
442           else
443             if  ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
444               overview << "~" << hmm_for_protein_output
445             else
446               overview << "----" << hmm_for_protein_output
447             end
448           end
449           prev_r = r
450         end
451       end
452       overview
453     end
454
455     def make_interdomain_sequence( d, mark_short = true )
456       s = ""
457       d /= 20
458       if d >= 10
459         s << "----//----"
460       elsif d >= 1
461         d.times do
462           s << "-"
463         end
464       elsif mark_short
465         s << "~"
466       end
467       s
468     end
469
470
471
472     def print_help()
473       puts( "Usage:" )
474       puts()
475       puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
476       puts()
477       puts( "  options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
478       puts( "           -" + I_E_VALUE_THRESHOLD_OPTION  + ": i-E-value threshold, default is no threshold" )
479       puts( "           -" + PARSE_OUT_DESCRIPITION_OPTION  + ": parse query description (in addition to query name)" )
480       puts( "           -" + IGNORE_DUF_OPTION  + ": ignore DUFs" )
481       puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + ": E-value threshold for full protein sequences, only for protein summary" )
482       puts( "           -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
483       puts()
484     end
485
486   end # class
487
488 end # module Evoruby