in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / hmmscan_summary.rb
1 #
2 # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
3 #
4 # Copyright::  Copyright (C) 2012 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
8 #
9 # last modified: 121003
10
11 require 'set'
12
13 require 'lib/evo/util/constants'
14 require 'lib/evo/util/util'
15 require 'lib/evo/util/command_line_arguments'
16 require 'lib/evo/io/parser/hmmscan_parser'
17 require 'lib/evo/io/parser/uniprot_parser'
18 require 'lib/evo/io/web/uniprotkb'
19
20 module Evoruby
21
22   class HmmscanSummary
23
24     PRG_NAME       = "hsp"
25     PRG_VERSION    = "2.000"
26     PRG_DESC       = "hmmscan summary"
27     PRG_DATE       = "2012.10.23"
28     COPYRIGHT      = "2012 Christian M Zmasek"
29     CONTACT        = "phylosoft@gmail.com"
30     WWW            = "www.phylosoft.org"
31
32     DELIMITER_OPTION              = "d"
33     I_E_VALUE_THRESHOLD_OPTION    = "ie"
34     FS_E_VALUE_THRESHOLD_OPTION   = "pe"
35     HMM_FOR_PROTEIN_OUTPUT        = "m"
36     IGNORE_DUF_OPTION             = "i"
37     PARSE_OUT_DESCRIPITION_OPTION = "a"
38     UNIPROT                       = "u"
39     HELP_OPTION_1                 = "help"
40     HELP_OPTION_2                 = "h"
41
42     USE_AVOID_HMMS = true
43     AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
44     LIMIT_FOR_CLOSE_DOMAINS = 20
45
46     def initialize
47       @domain_counts = Hash.new
48     end
49
50     def run
51
52       ukb = UniprotKB.new
53       ukb.get
54
55       Util.print_program_information( PRG_NAME,
56         PRG_VERSION,
57         PRG_DESC,
58         PRG_DATE,
59         COPYRIGHT,
60         CONTACT,
61         WWW,
62         STDOUT )
63
64       begin
65         cla = CommandLineArguments.new( ARGV )
66       rescue ArgumentError => e
67         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
68       end
69
70       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
71            cla.is_option_set?( HELP_OPTION_2 ) )
72         print_help
73         exit( 0 )
74       end
75
76       if ( cla.get_number_of_files != 2 )
77         print_help
78         exit( -1 )
79       end
80
81       allowed_opts = Array.new
82       allowed_opts.push( DELIMITER_OPTION )
83       allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
84       allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
85       allowed_opts.push( IGNORE_DUF_OPTION )
86       allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
87       allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
88       allowed_opts.push( UNIPROT )
89
90       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
91       if ( disallowed.length > 0 )
92         Util.fatal_error( PRG_NAME,
93           "unknown option(s): " + disallowed,
94           STDOUT )
95       end
96
97       inpath = cla.get_file_name( 0 )
98       outpath = cla.get_file_name( 1 )
99
100       column_delimiter = "\t"
101       if ( cla.is_option_set?( DELIMITER_OPTION ) )
102         begin
103           column_delimiter = cla.get_option_value( DELIMITER_OPTION )
104         rescue ArgumentError => e
105           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
106         end
107       end
108
109       i_e_value_threshold = -1.0
110       if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
111         begin
112           i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
113         rescue ArgumentError => e
114           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
115         end
116         if ( i_e_value_threshold < 0.0 )
117           Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
118         end
119       end
120
121       fs_e_value_threshold = -1.0
122       if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
123         begin
124           fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
125         rescue ArgumentError => e
126           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
127         end
128         if ( fs_e_value_threshold < 0.0 )
129           Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
130         end
131       end
132
133       hmm_for_protein_output = ""
134       if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
135         begin
136           hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
137         rescue ArgumentError => e
138           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
139         end
140       end
141
142       uniprot = ""
143       if ( cla.is_option_set?( UNIPROT ) )
144         begin
145           uniprot = cla.get_option_value( UNIPROT )
146         rescue ArgumentError => e
147           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
148         end
149       end
150
151       ignore_dufs = false
152       if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
153         ignore_dufs = true
154       end
155
156       parse_descriptions = false
157       if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
158         parse_descriptions = true
159       end
160
161       puts()
162       puts( "hmmpfam outputfile  : " + inpath )
163       puts( "outputfile          : " + outpath )
164       if ( i_e_value_threshold >= 0.0 )
165         puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
166       else
167         puts( "i-E-value threshold : no threshold" )
168       end
169       if ( parse_descriptions )
170         puts( "parse descriptions  : true" )
171       else
172         puts( "parse descriptions  : false" )
173       end
174       if ( ignore_dufs )
175         puts( "ignore DUFs         : true" )
176       else
177         puts( "ignore DUFs         : false" )
178       end
179       if ( column_delimiter == "\t" )
180         puts( "column delimiter    : TAB" )
181       else
182         puts( "column delimiter     : " + column_delimiter )
183       end
184       if fs_e_value_threshold >= 0.0
185         puts( "E-value threshold   : " + fs_e_value_threshold.to_s )
186       else
187         puts( "E-value threshold   : no threshold" )
188       end
189       if !hmm_for_protein_output.empty?
190         puts( "HMM for proteins    : " + hmm_for_protein_output )
191       end
192       if !uniprot.empty?
193         puts( "Uniprot             : " + uniprot )
194       end
195       puts()
196
197       begin
198         parse( inpath,
199           outpath,
200           column_delimiter,
201           i_e_value_threshold,
202           ignore_dufs,
203           parse_descriptions,
204           fs_e_value_threshold,
205           hmm_for_protein_output,
206           uniprot )
207       rescue IOError => e
208         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
209       end
210       domain_counts = get_domain_counts()
211
212
213       puts
214       puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
215       puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
216       puts
217       puts( Util.draw_histogram( domain_counts, "#" ) )
218       puts
219       Util.print_message( PRG_NAME, 'OK' )
220       puts
221
222     end # def run
223
224     private
225
226     # raises ArgumentError, IOError
227     def parse( inpath,
228         outpath,
229         column_delimiter,
230         i_e_value_threshold,
231         ignore_dufs,
232         get_descriptions,
233         fs_e_value_threshold,
234         hmm_for_protein_output,
235         uniprot )
236
237
238
239       Util.check_file_for_readability( inpath )
240       Util.check_file_for_writability( outpath )
241
242       hmmscan_parser = HmmscanParser.new( inpath )
243       results = hmmscan_parser.parse
244
245       uniprot_entries = nil
246       if !uniprot.empty? && !hmm_for_protein_output.empty?
247         uniprot_entries = read_uniprot( results, uniprot  )
248       end
249
250       outfile = File.open( outpath, "a" )
251
252       query     = ""
253       desc      = ""
254       model     = ""
255       env_from  = ""
256       env_to    = ""
257       i_e_value = ""
258
259       hmmscan_results_per_protein = []
260
261
262
263       prev_query = ""
264
265       results.each do | r |
266         model     = r.model
267         query     = r.query
268         i_e_value = r.i_e_value
269         env_from  = r.env_from
270         env_to    = r.env_to
271
272         if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
273            ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
274           count_model( model )
275           outfile.print( query +
276              column_delimiter )
277           if ( get_descriptions )
278             outfile.print( desc +
279                column_delimiter )
280           end
281           outfile.print( model +
282              column_delimiter +
283              env_from.to_s +
284              column_delimiter +
285              env_to.to_s +
286              column_delimiter +
287              i_e_value.to_s )
288           outfile.print( Constants::LINE_DELIMITER )
289         end
290
291         if !hmm_for_protein_output.empty?
292           if  !prev_query.empty? && prev_query != query
293             if !hmmscan_results_per_protein.empty?
294               process_hmmscan_results_per_protein( hmmscan_results_per_protein,
295                 fs_e_value_threshold,
296                 hmm_for_protein_output,
297                 i_e_value_threshold,
298                 uniprot_entries )
299             end
300             hmmscan_results_per_protein.clear
301           end
302           prev_query = query
303
304           if USE_AVOID_HMMS
305             if !AVOID_HHMS.include? r.model
306               hmmscan_results_per_protein << r
307             end
308           else
309             hmmscan_results_per_protein << r
310           end
311         end
312       end
313       if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty?
314         process_hmmscan_results_per_protein( hmmscan_results_per_protein,
315           fs_e_value_threshold,
316           hmm_for_protein_output,
317           i_e_value_threshold,
318           uniprot_entries )
319       end
320
321       outfile.flush()
322       outfile.close()
323
324     end # def parse
325
326     def process_id( id )
327       if  id =~ /(sp|tr)\|\S+\|(\S+)/
328         id = $2
329       end
330       id
331     end
332
333     def read_uniprot( hmmscan_results, uniprot  )
334       ids = Set.new
335       hmmscan_results.each do | r |
336
337         ids << process_id( r.query )
338       end
339       uniprot_parser = UniprotParser.new uniprot
340       uniprot_entries = uniprot_parser.parse ids
341       uniprot_entries
342     end
343
344     def count_model( model )
345       if ( @domain_counts.has_key?( model ) )
346         count = @domain_counts[ model ].to_i
347         count += 1
348         @domain_counts[ model ] = count
349       else
350         @domain_counts[ model ] = 1
351       end
352     end
353
354     def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
355         fs_e_value_threshold,
356         hmm_for_protein_output,
357         i_e_value_threshold,
358         uniprot_entries )
359
360       dc = 0
361       # filter according to i-Evalue threshold
362       # abort if fs Evalue too high
363       hmmscan_results_per_protein_filtered = []
364
365       hmmscan_results_per_protein.each do | r |
366         if r.model == hmm_for_protein_output
367           if r.fs_e_value > fs_e_value_threshold
368             return
369           end
370         end
371         if r.i_e_value <= i_e_value_threshold
372           hmmscan_results_per_protein_filtered << r
373           if r.model == hmm_for_protein_output
374             dc += 1
375           end
376         end
377       end
378
379       if dc == 0
380         # passed on protein E-value, failed in per domain E-values
381         return
382       end
383
384       hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
385
386       own = nil
387       hmmscan_results_per_protein_filtered.each do | r |
388         if r.model == hmm_for_protein_output
389           own = r
390         end
391       end
392
393       s = ""
394       s << own.query + "\t"
395       s << "HUMAN" + "\t"
396       s << own.fs_e_value.to_s + "\t"
397       s << own.qlen.to_s + "\t"
398       s << dc.to_s + "\t"
399       s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
400       hmmscan_results_per_protein_filtered.each do | r |
401         s << r.model + " "
402       end
403       s << "\t"
404       e = uniprot_entries[ process_id( own.query ) ]
405       if e != nil && e.de != nil
406         e.de.each { |i| s << i + " " }
407       else
408         s << "-"
409       end
410       s << "\t"
411
412       if e != nil && e.gn != nil
413         e.gn.each { |i| s << i + " " }
414       else
415         s << "-"
416       end
417
418
419
420       s << "\t"
421       overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
422
423       s << overview   + "\t"
424
425       s << calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )  + "\t"
426
427       prev_r = nil
428       hmmscan_results_per_protein_filtered.each do | r |
429
430         if  prev_r != nil
431           s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
432         else
433           s << make_interdomain_sequence( r.env_from, false )
434         end
435         s << r.model
436         s << "["
437         s << r.env_from.to_s << "-" << r.env_to.to_s
438         s << "|ie=" << r.i_e_value.to_s
439         s << "|ce=" << r.c_e_value.to_s
440         s << "]"
441         prev_r = r
442       end
443       s << make_interdomain_sequence( own.qlen - prev_r.env_from, false )
444       puts s
445     end
446
447
448     def calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )
449       linkers = ""
450       prev_r = nil
451       hmmscan_results_per_protein_filtered.each do | r |
452         if r.model == hmm_for_protein_output
453           if  prev_r != nil
454             linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
455           end
456           prev_r = r
457         end
458       end
459       linkers
460     end
461
462     def get_domain_counts()
463       return @domain_counts
464     end
465
466     def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
467       overview = ""
468       prev_r = nil
469       hmmscan_results_per_protein_filtered.each do | r |
470         if r.model == hmm_for_protein_output
471           if prev_r == nil
472             overview << hmm_for_protein_output
473           else
474             if  ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
475               overview << "~" << hmm_for_protein_output
476             else
477               overview << "----" << hmm_for_protein_output
478             end
479           end
480           prev_r = r
481         end
482       end
483       overview
484     end
485
486     def make_interdomain_sequence( d, mark_short = true )
487       s = ""
488       d /= 20
489       if d >= 10
490         s << "----//----"
491       elsif d >= 1
492         d.times do
493           s << "-"
494         end
495       elsif mark_short
496         s << "~"
497       end
498       s
499     end
500
501
502
503     def print_help()
504       puts( "Usage:" )
505       puts()
506       puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
507       puts()
508       puts( "  options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
509       puts( "           -" + I_E_VALUE_THRESHOLD_OPTION  + ": i-E-value threshold, default is no threshold" )
510       puts( "           -" + PARSE_OUT_DESCRIPITION_OPTION  + ": parse query description (in addition to query name)" )
511       puts( "           -" + IGNORE_DUF_OPTION  + ": ignore DUFs" )
512       puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + ": E-value threshold for full protein sequences, only for protein summary" )
513       puts( "           -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
514       puts()
515     end
516
517   end # class
518
519 end # module Evoruby