inprogress
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / hmmscan_summary.rb
1 #
2 # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
3 #
4 # Copyright::  Copyright (C) 2012 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
8 #
9 # last modified: 121003
10
11 require 'set'
12
13 require 'lib/evo/util/constants'
14 require 'lib/evo/util/util'
15 require 'lib/evo/util/command_line_arguments'
16 require 'lib/evo/io/parser/hmmscan_parser'
17 require 'lib/evo/io/web/uniprotkb'
18
19 module Evoruby
20
21   class HmmscanSummary
22
23     PRG_NAME       = "hsp"
24     PRG_VERSION    = "2.000"
25     PRG_DESC       = "hmmscan summary"
26     PRG_DATE       = "2012.10.23"
27     COPYRIGHT      = "2012 Christian M Zmasek"
28     CONTACT        = "phylosoft@gmail.com"
29     WWW            = "www.phylosoft.org"
30
31     DELIMITER_OPTION              = "d"
32     SPECIES_OPTION                = "s"
33     I_E_VALUE_THRESHOLD_OPTION    = "ie"
34     FS_E_VALUE_THRESHOLD_OPTION   = "pe"
35     HMM_FOR_PROTEIN_OUTPUT        = "m"
36     IGNORE_DUF_OPTION             = "i"
37     PARSE_OUT_DESCRIPITION_OPTION = "a"
38     UNIPROT                       = "u"
39     HELP_OPTION_1                 = "help"
40     HELP_OPTION_2                 = "h"
41
42     USE_AVOID_HMMS = true
43     AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
44     LIMIT_FOR_CLOSE_DOMAINS = 20
45
46     def initialize
47       @domain_counts = Hash.new
48     end
49
50     def run
51
52
53
54       Util.print_program_information( PRG_NAME,
55         PRG_VERSION,
56         PRG_DESC,
57         PRG_DATE,
58         COPYRIGHT,
59         CONTACT,
60         WWW,
61         STDOUT )
62
63       begin
64         cla = CommandLineArguments.new( ARGV )
65       rescue ArgumentError => e
66         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
67       end
68
69       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
70             cla.is_option_set?( HELP_OPTION_2 ) )
71         print_help
72         exit( 0 )
73       end
74
75       if ( cla.get_number_of_files != 2 )
76         print_help
77         exit( -1 )
78       end
79
80       allowed_opts = Array.new
81       allowed_opts.push( DELIMITER_OPTION )
82       allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
83       allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
84       allowed_opts.push( IGNORE_DUF_OPTION )
85       allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
86       allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
87       allowed_opts.push( UNIPROT )
88       allowed_opts.push( SPECIES_OPTION )
89
90       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
91       if ( disallowed.length > 0 )
92         Util.fatal_error( PRG_NAME,
93           "unknown option(s): " + disallowed,
94           STDOUT )
95       end
96
97       inpath = cla.get_file_name( 0 )
98       outpath = cla.get_file_name( 1 )
99
100       column_delimiter = "\t"
101       if ( cla.is_option_set?( DELIMITER_OPTION ) )
102         begin
103           column_delimiter = cla.get_option_value( DELIMITER_OPTION )
104         rescue ArgumentError => e
105           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
106         end
107       end
108
109       i_e_value_threshold = -1.0
110       if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
111         begin
112           i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
113         rescue ArgumentError => e
114           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
115         end
116         if ( i_e_value_threshold < 0.0 )
117           Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
118         end
119       end
120
121       
122
123       fs_e_value_threshold = -1.0
124       if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
125         begin
126           fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
127         rescue ArgumentError => e
128           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
129         end
130         if ( fs_e_value_threshold < 0.0 )
131           Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
132         end
133       end
134
135       hmm_for_protein_output = ""
136       if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
137         begin
138           hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
139         rescue ArgumentError => e
140           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
141         end
142       end
143
144       uniprot = ""
145       if ( cla.is_option_set?( UNIPROT ) )
146         begin
147           uniprot = cla.get_option_value( UNIPROT )
148         rescue ArgumentError => e
149           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
150         end
151       end
152
153       species = "HUMAN"
154        if ( cla.is_option_set?( SPECIES_OPTION ) )
155         begin
156           species = cla.get_option_value( SPECIES_OPTION )
157         rescue ArgumentError => e
158           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
159         end
160       end
161
162       ignore_dufs = false
163       if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
164         ignore_dufs = true
165       end
166
167       parse_descriptions = false
168       if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
169         parse_descriptions = true
170       end
171
172       puts()
173       puts( "hmmpfam outputfile  : " + inpath )
174       puts( "outputfile          : " + outpath )
175       puts( "species             : " + species )
176       if ( i_e_value_threshold >= 0.0 )
177         puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
178       else
179         puts( "i-E-value threshold : no threshold" )
180       end
181       if ( parse_descriptions )
182         puts( "parse descriptions  : true" )
183       else
184         puts( "parse descriptions  : false" )
185       end
186       if ( ignore_dufs )
187         puts( "ignore DUFs         : true" )
188       else
189         puts( "ignore DUFs         : false" )
190       end
191       if ( column_delimiter == "\t" )
192         puts( "column delimiter    : TAB" )
193       else
194         puts( "column delimiter     : " + column_delimiter )
195       end
196       if fs_e_value_threshold >= 0.0
197         puts( "E-value threshold   : " + fs_e_value_threshold.to_s )
198       else
199         puts( "E-value threshold   : no threshold" )
200       end
201       if !hmm_for_protein_output.empty?
202         puts( "HMM for proteins    : " + hmm_for_protein_output )
203       end
204       if !uniprot.empty?
205         puts( "Uniprot             : " + uniprot )
206       end
207       puts()
208
209       begin
210         parse( inpath,
211           outpath,
212           column_delimiter,
213           i_e_value_threshold,
214           ignore_dufs,
215           parse_descriptions,
216           fs_e_value_threshold,
217           hmm_for_protein_output,
218           uniprot,
219           species )
220       rescue IOError => e
221         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
222       end
223       domain_counts = get_domain_counts()
224
225
226       puts
227       puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
228       puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
229       puts
230       puts( Util.draw_histogram( domain_counts, "#" ) )
231       puts
232       Util.print_message( PRG_NAME, 'OK' )
233       puts
234
235     end # def run
236
237     private
238
239     # raises ArgumentError, IOError
240     def parse( inpath,
241         outpath,
242         column_delimiter,
243         i_e_value_threshold,
244         ignore_dufs,
245         get_descriptions,
246         fs_e_value_threshold,
247         hmm_for_protein_output,
248         uniprot,
249         species )
250
251
252
253       Util.check_file_for_readability( inpath )
254       Util.check_file_for_writability( outpath )
255
256       hmmscan_parser = HmmscanParser.new( inpath )
257       results = hmmscan_parser.parse
258
259       outfile = File.open( outpath, "a" )
260
261       query     = ""
262       desc      = ""
263       model     = ""
264       env_from  = ""
265       env_to    = ""
266       i_e_value = ""
267
268       hmmscan_results_per_protein = []
269
270
271
272       prev_query = ""
273
274       results.each do | r |
275         model     = r.model
276         query     = r.query
277         i_e_value = r.i_e_value
278         env_from  = r.env_from
279         env_to    = r.env_to
280
281         if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
282             ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
283           count_model( model )
284           outfile.print( query +
285               column_delimiter )
286           if ( get_descriptions )
287             outfile.print( desc +
288                 column_delimiter )
289           end
290           outfile.print( model +
291               column_delimiter +
292               env_from.to_s +
293               column_delimiter +
294               env_to.to_s +
295               column_delimiter +
296               i_e_value.to_s )
297           outfile.print( Constants::LINE_DELIMITER )
298         end
299
300         if !hmm_for_protein_output.empty?
301           if  !prev_query.empty? && prev_query != query
302             if !hmmscan_results_per_protein.empty?
303               process_hmmscan_results_per_protein( hmmscan_results_per_protein,
304                 fs_e_value_threshold,
305                 hmm_for_protein_output,
306                 i_e_value_threshold,
307                 false,
308                 species )
309             end
310             hmmscan_results_per_protein.clear
311           end
312           prev_query = query
313
314           if USE_AVOID_HMMS
315             if !AVOID_HHMS.include? r.model
316               hmmscan_results_per_protein << r
317             end
318           else
319             hmmscan_results_per_protein << r
320           end
321         end
322       end
323       if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty?
324         process_hmmscan_results_per_protein( hmmscan_results_per_protein,
325           fs_e_value_threshold,
326           hmm_for_protein_output,
327           i_e_value_threshold,
328           false,
329           species )
330       end
331
332       outfile.flush()
333       outfile.close()
334
335     end # def parse
336
337     def process_id( id )
338       if  id =~ /(sp|tr)\|\S+\|(\S+)/
339         id = $2
340       end
341       id
342     end
343
344
345
346     def count_model( model )
347       if ( @domain_counts.has_key?( model ) )
348         count = @domain_counts[ model ].to_i
349         count += 1
350         @domain_counts[ model ] = count
351       else
352         @domain_counts[ model ] = 1
353       end
354     end
355
356     def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
357         fs_e_value_threshold,
358         hmm_for_protein_output,
359         i_e_value_threshold,
360         uniprotkb,
361         species      )
362
363       dc = 0
364       # filter according to i-Evalue threshold
365       # abort if fs Evalue too high
366       hmmscan_results_per_protein_filtered = []
367
368       hmmscan_results_per_protein.each do | r |
369         if r.model == hmm_for_protein_output
370           if r.fs_e_value > fs_e_value_threshold
371             return
372           end
373         end
374         if r.i_e_value <= i_e_value_threshold
375           hmmscan_results_per_protein_filtered << r
376           if r.model == hmm_for_protein_output
377             dc += 1
378           end
379         end
380       end
381
382       if dc == 0
383         # passed on protein E-value, failed in per domain E-values
384         return
385       end
386
387       hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
388
389       own = nil
390       hmmscan_results_per_protein_filtered.each do | r |
391         if r.model == hmm_for_protein_output
392           own = r
393         end
394       end
395
396       s = ""
397       s << own.query + "\t"
398       s << species + "\t"
399       s << own.fs_e_value.to_s + "\t"
400       s << own.qlen.to_s + "\t"
401       s << dc.to_s + "\t"
402       s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
403       hmmscan_results_per_protein_filtered.each do | r |
404         s << r.model + " "
405       end
406       s << "\t"
407       #e = UniprotKB::get_entry_by_id( process_id( own.query ) )
408
409       #if e != nil
410       #  s << uniprot_annotation( e )
411       # # s << "\uniprot_annotationt"
412       #end
413
414       overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
415
416       s << overview  + "\t"
417
418       s << calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )  + "\t"
419
420       prev_r = nil
421       hmmscan_results_per_protein_filtered.each do | r |
422
423         if  prev_r != nil
424           s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
425         else
426           s << make_interdomain_sequence( r.env_from, false )
427         end
428         s << r.model
429         s << "["
430         s << r.env_from.to_s << "-" << r.env_to.to_s
431         s << "|ie=" << r.i_e_value.to_s
432         s << "|ce=" << r.c_e_value.to_s
433         s << "]"
434         prev_r = r
435       end
436       s << make_interdomain_sequence( own.qlen - prev_r.env_from, false )
437       puts s
438     end
439
440     def uniprot_annotation( e )
441       s = ""
442       pdb_ids = e.get_pdb_ids
443       if !pdb_ids.empty?
444         pdb_ids.each do | pdb |
445           s << pdb << ", "
446         end
447       else
448         s << "-"
449       end
450       s
451     end
452
453     def calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )
454       linkers = ""
455       prev_r = nil
456       hmmscan_results_per_protein_filtered.each do | r |
457         if r.model == hmm_for_protein_output
458           if  prev_r != nil
459             linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
460           end
461           prev_r = r
462         end
463       end
464       linkers
465     end
466
467     def get_domain_counts()
468       return @domain_counts
469     end
470
471     def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
472       overview = ""
473       prev_r = nil
474       hmmscan_results_per_protein_filtered.each do | r |
475         if r.model == hmm_for_protein_output
476           if prev_r == nil
477             overview << hmm_for_protein_output
478           else
479             if  ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
480               overview << "~" << hmm_for_protein_output
481             else
482               overview << "----" << hmm_for_protein_output
483             end
484           end
485           prev_r = r
486         end
487       end
488       overview
489     end
490
491     def make_interdomain_sequence( d, mark_short = true )
492       s = ""
493       d /= 20
494       if d >= 10
495         s << "----//----"
496       elsif d >= 1
497         d.times do
498           s << "-"
499         end
500       elsif mark_short
501         s << "~"
502       end
503       s
504     end
505
506
507
508     def print_help()
509       puts( "Usage:" )
510       puts()
511       puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
512       puts()
513       puts( "  options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
514       puts( "           -" + I_E_VALUE_THRESHOLD_OPTION  + ": i-E-value threshold, default is no threshold" )
515       puts( "           -" + PARSE_OUT_DESCRIPITION_OPTION  + ": parse query description (in addition to query name)" )
516       puts( "           -" + IGNORE_DUF_OPTION  + ": ignore DUFs" )
517       puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + ": E-value threshold for full protein sequences, only for protein summary" )
518       puts( "           -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
519       puts()
520     end
521
522   end # class
523
524 end # module Evoruby