inprogress
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / hmmscan_summary.rb
1 #
2 # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
3 #
4 # Copyright::  Copyright (C) 2012 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
8 #
9 # last modified: 121003
10
11 require 'set'
12
13 require 'lib/evo/util/constants'
14 require 'lib/evo/util/util'
15 require 'lib/evo/util/command_line_arguments'
16 require 'lib/evo/io/parser/hmmscan_parser'
17 require 'lib/evo/io/web/uniprotkb'
18
19 module Evoruby
20
21   class HmmscanSummary
22
23     PRG_NAME       = "hsp"
24     PRG_VERSION    = "2.000"
25     PRG_DESC       = "hmmscan summary"
26     PRG_DATE       = "2012.10.23"
27     COPYRIGHT      = "2012 Christian M Zmasek"
28     CONTACT        = "phylosoft@gmail.com"
29     WWW            = "www.phylosoft.org"
30
31     DELIMITER_OPTION              = "d"
32     SPECIES_OPTION                = "s"
33     I_E_VALUE_THRESHOLD_OPTION    = "ie"
34     FS_E_VALUE_THRESHOLD_OPTION   = "pe"
35     HMM_FOR_PROTEIN_OUTPUT        = "m"
36     IGNORE_DUF_OPTION             = "i"
37     PARSE_OUT_DESCRIPITION_OPTION = "a"
38     UNIPROT                       = "u"
39     HELP_OPTION_1                 = "help"
40     HELP_OPTION_2                 = "h"
41
42     USE_AVOID_HMMS = true
43     AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
44     LIMIT_FOR_CLOSE_DOMAINS = 20
45
46     def initialize
47       @domain_counts = Hash.new
48     end
49
50     def run
51
52
53
54       Util.print_program_information( PRG_NAME,
55         PRG_VERSION,
56         PRG_DESC,
57         PRG_DATE,
58         COPYRIGHT,
59         CONTACT,
60         WWW,
61         STDOUT )
62
63       begin
64         cla = CommandLineArguments.new( ARGV )
65       rescue ArgumentError => e
66         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
67       end
68
69       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
70             cla.is_option_set?( HELP_OPTION_2 ) )
71         print_help
72         exit( 0 )
73       end
74
75       if ( cla.get_number_of_files != 2 )
76         print_help
77         exit( -1 )
78       end
79
80       allowed_opts = Array.new
81       allowed_opts.push( DELIMITER_OPTION )
82       allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
83       allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
84       allowed_opts.push( IGNORE_DUF_OPTION )
85       allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
86       allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
87       allowed_opts.push( UNIPROT )
88       allowed_opts.push( SPECIES_OPTION )
89
90       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
91       if ( disallowed.length > 0 )
92         Util.fatal_error( PRG_NAME,
93           "unknown option(s): " + disallowed,
94           STDOUT )
95       end
96
97       inpath = cla.get_file_name( 0 )
98       outpath = cla.get_file_name( 1 )
99
100       column_delimiter = "\t"
101       if ( cla.is_option_set?( DELIMITER_OPTION ) )
102         begin
103           column_delimiter = cla.get_option_value( DELIMITER_OPTION )
104         rescue ArgumentError => e
105           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
106         end
107       end
108
109       i_e_value_threshold = -1.0
110       if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
111         begin
112           i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
113         rescue ArgumentError => e
114           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
115         end
116         if ( i_e_value_threshold < 0.0 )
117           Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
118         end
119       end
120
121       
122
123       fs_e_value_threshold = -1.0
124       if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
125         begin
126           fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
127         rescue ArgumentError => e
128           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
129         end
130         if ( fs_e_value_threshold < 0.0 )
131           Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
132         end
133       end
134
135       hmm_for_protein_output = ""
136       if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
137         begin
138           hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
139         rescue ArgumentError => e
140           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
141         end
142       end
143
144       uniprot = ""
145       if ( cla.is_option_set?( UNIPROT ) )
146         begin
147           uniprot = cla.get_option_value( UNIPROT )
148         rescue ArgumentError => e
149           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
150         end
151       end
152
153       species = "HUMAN"
154        if ( cla.is_option_set?( SPECIES_OPTION ) )
155         begin
156           species = cla.get_option_value( SPECIES_OPTION )
157         rescue ArgumentError => e
158           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
159         end
160       end
161
162       ignore_dufs = false
163       if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
164         ignore_dufs = true
165       end
166
167       parse_descriptions = false
168       if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
169         parse_descriptions = true
170       end
171
172       puts()
173       puts( "hmmpfam outputfile  : " + inpath )
174       puts( "outputfile          : " + outpath )
175       puts( "species             : " + species )
176       if ( i_e_value_threshold >= 0.0 )
177         puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
178       else
179         puts( "i-E-value threshold : no threshold" )
180       end
181       if ( parse_descriptions )
182         puts( "parse descriptions  : true" )
183       else
184         puts( "parse descriptions  : false" )
185       end
186       if ( ignore_dufs )
187         puts( "ignore DUFs         : true" )
188       else
189         puts( "ignore DUFs         : false" )
190       end
191       if ( column_delimiter == "\t" )
192         puts( "column delimiter    : TAB" )
193       else
194         puts( "column delimiter     : " + column_delimiter )
195       end
196       if fs_e_value_threshold >= 0.0
197         puts( "E-value threshold   : " + fs_e_value_threshold.to_s )
198       else
199         puts( "E-value threshold   : no threshold" )
200       end
201       if !hmm_for_protein_output.empty?
202         puts( "HMM for proteins    : " + hmm_for_protein_output )
203       end
204       if !uniprot.empty?
205         puts( "Uniprot             : " + uniprot )
206       end
207       puts()
208
209       begin
210         parse( inpath,
211           outpath,
212           column_delimiter,
213           i_e_value_threshold,
214           ignore_dufs,
215           parse_descriptions,
216           fs_e_value_threshold,
217           hmm_for_protein_output,
218           uniprot,
219           species )
220       rescue IOError => e
221         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
222       end
223       domain_counts = get_domain_counts()
224
225
226       puts
227       puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
228       puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
229       puts
230       puts( Util.draw_histogram( domain_counts, "#" ) )
231       puts
232       Util.print_message( PRG_NAME, 'OK' )
233       puts
234
235     end # def run
236
237     private
238
239     # raises ArgumentError, IOError
240     def parse( inpath,
241         outpath,
242         column_delimiter,
243         i_e_value_threshold,
244         ignore_dufs,
245         get_descriptions,
246         fs_e_value_threshold,
247         hmm_for_protein_output,
248         uniprot,
249         species )
250
251
252
253       Util.check_file_for_readability( inpath )
254       Util.check_file_for_writability( outpath )
255
256       hmmscan_parser = HmmscanParser.new( inpath )
257       results = hmmscan_parser.parse
258
259       outfile = File.open( outpath, "a" )
260
261       query     = ""
262       desc      = ""
263       model     = ""
264       env_from  = ""
265       env_to    = ""
266       i_e_value = ""
267
268       hmmscan_results_per_protein = []
269
270
271
272       prev_query = ""
273
274       results.each do | r |
275         model     = r.model
276         query     = r.query
277         i_e_value = r.i_e_value
278         env_from  = r.env_from
279         env_to    = r.env_to
280
281         if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
282             ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
283           count_model( model )
284           outfile.print( query +
285               column_delimiter )
286           if ( get_descriptions )
287             outfile.print( desc +
288                 column_delimiter )
289           end
290           outfile.print( model +
291               column_delimiter +
292               env_from.to_s +
293               column_delimiter +
294               env_to.to_s +
295               column_delimiter +
296               i_e_value.to_s )
297           outfile.print( Constants::LINE_DELIMITER )
298         end
299
300         if !hmm_for_protein_output.empty?
301           if  !prev_query.empty? && prev_query != query
302             if !hmmscan_results_per_protein.empty?
303               process_hmmscan_results_per_protein( hmmscan_results_per_protein,
304                 fs_e_value_threshold,
305                 hmm_for_protein_output,
306                 i_e_value_threshold,
307                 false,
308                 species )
309             end
310             hmmscan_results_per_protein.clear
311           end
312           prev_query = query
313
314           if USE_AVOID_HMMS
315             if !AVOID_HHMS.include? r.model
316               hmmscan_results_per_protein << r
317             end
318           else
319             hmmscan_results_per_protein << r
320           end
321         end
322       end
323       if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty?
324         process_hmmscan_results_per_protein( hmmscan_results_per_protein,
325           fs_e_value_threshold,
326           hmm_for_protein_output,
327           i_e_value_threshold,
328           true )
329       end
330
331       outfile.flush()
332       outfile.close()
333
334     end # def parse
335
336     def process_id( id )
337       if  id =~ /(sp|tr)\|\S+\|(\S+)/
338         id = $2
339       end
340       id
341     end
342
343
344
345     def count_model( model )
346       if ( @domain_counts.has_key?( model ) )
347         count = @domain_counts[ model ].to_i
348         count += 1
349         @domain_counts[ model ] = count
350       else
351         @domain_counts[ model ] = 1
352       end
353     end
354
355     def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
356         fs_e_value_threshold,
357         hmm_for_protein_output,
358         i_e_value_threshold,
359         uniprotkb,
360         species      )
361
362       dc = 0
363       # filter according to i-Evalue threshold
364       # abort if fs Evalue too high
365       hmmscan_results_per_protein_filtered = []
366
367       hmmscan_results_per_protein.each do | r |
368         if r.model == hmm_for_protein_output
369           if r.fs_e_value > fs_e_value_threshold
370             return
371           end
372         end
373         if r.i_e_value <= i_e_value_threshold
374           hmmscan_results_per_protein_filtered << r
375           if r.model == hmm_for_protein_output
376             dc += 1
377           end
378         end
379       end
380
381       if dc == 0
382         # passed on protein E-value, failed in per domain E-values
383         return
384       end
385
386       hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
387
388       own = nil
389       hmmscan_results_per_protein_filtered.each do | r |
390         if r.model == hmm_for_protein_output
391           own = r
392         end
393       end
394
395       s = ""
396       s << own.query + "\t"
397       s << species + "\t"
398       s << own.fs_e_value.to_s + "\t"
399       s << own.qlen.to_s + "\t"
400       s << dc.to_s + "\t"
401       s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
402       hmmscan_results_per_protein_filtered.each do | r |
403         s << r.model + " "
404       end
405       s << "\t"
406       e = UniprotKB::get_entry_by_id( process_id( own.query ) )
407
408       if e != nil
409         s << uniprot_annotation( e )
410        # s << "\uniprot_annotationt"
411       end
412
413       overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
414
415       s << overview  + "\t"
416
417       s << calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )  + "\t"
418
419       prev_r = nil
420       hmmscan_results_per_protein_filtered.each do | r |
421
422         if  prev_r != nil
423           s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
424         else
425           s << make_interdomain_sequence( r.env_from, false )
426         end
427         s << r.model
428         s << "["
429         s << r.env_from.to_s << "-" << r.env_to.to_s
430         s << "|ie=" << r.i_e_value.to_s
431         s << "|ce=" << r.c_e_value.to_s
432         s << "]"
433         prev_r = r
434       end
435       s << make_interdomain_sequence( own.qlen - prev_r.env_from, false )
436       puts s
437     end
438
439     def uniprot_annotation( e )
440       s = ""
441       pdb_ids = e.get_pdb_ids
442       if !pdb_ids.empty?
443         pdb_ids.each do | pdb |
444           s << pdb << ", "
445         end
446       else
447         s << "-"
448       end
449       s
450     end
451
452     def calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )
453       linkers = ""
454       prev_r = nil
455       hmmscan_results_per_protein_filtered.each do | r |
456         if r.model == hmm_for_protein_output
457           if  prev_r != nil
458             linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
459           end
460           prev_r = r
461         end
462       end
463       linkers
464     end
465
466     def get_domain_counts()
467       return @domain_counts
468     end
469
470     def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
471       overview = ""
472       prev_r = nil
473       hmmscan_results_per_protein_filtered.each do | r |
474         if r.model == hmm_for_protein_output
475           if prev_r == nil
476             overview << hmm_for_protein_output
477           else
478             if  ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
479               overview << "~" << hmm_for_protein_output
480             else
481               overview << "----" << hmm_for_protein_output
482             end
483           end
484           prev_r = r
485         end
486       end
487       overview
488     end
489
490     def make_interdomain_sequence( d, mark_short = true )
491       s = ""
492       d /= 20
493       if d >= 10
494         s << "----//----"
495       elsif d >= 1
496         d.times do
497           s << "-"
498         end
499       elsif mark_short
500         s << "~"
501       end
502       s
503     end
504
505
506
507     def print_help()
508       puts( "Usage:" )
509       puts()
510       puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
511       puts()
512       puts( "  options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
513       puts( "           -" + I_E_VALUE_THRESHOLD_OPTION  + ": i-E-value threshold, default is no threshold" )
514       puts( "           -" + PARSE_OUT_DESCRIPITION_OPTION  + ": parse query description (in addition to query name)" )
515       puts( "           -" + IGNORE_DUF_OPTION  + ": ignore DUFs" )
516       puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + ": E-value threshold for full protein sequences, only for protein summary" )
517       puts( "           -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
518       puts()
519     end
520
521   end # class
522
523 end # module Evoruby