inprogress
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / hmmscan_summary.rb
1 #
2 # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
3 #
4 # Copyright::  Copyright (C) 2012 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
8 #
9 # last modified: 121003
10
11 require 'set'
12
13 require 'lib/evo/util/constants'
14 require 'lib/evo/util/util'
15 require 'lib/evo/util/command_line_arguments'
16 require 'lib/evo/io/parser/hmmscan_parser'
17 require 'lib/evo/io/web/uniprotkb'
18
19 module Evoruby
20
21   class HmmscanSummary
22
23     PRG_NAME       = "hsp"
24     PRG_VERSION    = "2.000"
25     PRG_DESC       = "hmmscan summary"
26     PRG_DATE       = "2012.10.23"
27     COPYRIGHT      = "2012 Christian M Zmasek"
28     CONTACT        = "phylosoft@gmail.com"
29     WWW            = "www.phylosoft.org"
30
31     DELIMITER_OPTION              = "d"
32     SPECIES_OPTION                = "s"
33     I_E_VALUE_THRESHOLD_OPTION    = "ie"
34     FS_E_VALUE_THRESHOLD_OPTION   = "pe"
35     HMM_FOR_PROTEIN_OUTPUT        = "m"
36     IGNORE_DUF_OPTION             = "i"
37     PARSE_OUT_DESCRIPITION_OPTION = "a"
38     UNIPROT                       = "u"
39     HELP_OPTION_1                 = "help"
40     HELP_OPTION_2                 = "h"
41
42     USE_AVOID_HMMS = true
43     AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
44     LIMIT_FOR_CLOSE_DOMAINS = 20
45
46     def initialize
47       @domain_counts = Hash.new
48     end
49
50     def run
51
52
53
54       Util.print_program_information( PRG_NAME,
55         PRG_VERSION,
56         PRG_DESC,
57         PRG_DATE,
58         COPYRIGHT,
59         CONTACT,
60         WWW,
61         STDOUT )
62
63       begin
64         cla = CommandLineArguments.new( ARGV )
65       rescue ArgumentError => e
66         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
67       end
68
69       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
70             cla.is_option_set?( HELP_OPTION_2 ) )
71         print_help
72         exit( 0 )
73       end
74
75       if ( cla.get_number_of_files != 2 )
76         print_help
77         exit( -1 )
78       end
79
80       allowed_opts = Array.new
81       allowed_opts.push( DELIMITER_OPTION )
82       allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
83       allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
84       allowed_opts.push( IGNORE_DUF_OPTION )
85       allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
86       allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
87       allowed_opts.push( UNIPROT )
88       allowed_opts.push( SPECIES_OPTION )
89
90       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
91       if ( disallowed.length > 0 )
92         Util.fatal_error( PRG_NAME,
93           "unknown option(s): " + disallowed,
94           STDOUT )
95       end
96
97       inpath = cla.get_file_name( 0 )
98       outpath = cla.get_file_name( 1 )
99
100       column_delimiter = "\t"
101       if ( cla.is_option_set?( DELIMITER_OPTION ) )
102         begin
103           column_delimiter = cla.get_option_value( DELIMITER_OPTION )
104         rescue ArgumentError => e
105           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
106         end
107       end
108
109       i_e_value_threshold = -1.0
110       if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
111         begin
112           i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
113         rescue ArgumentError => e
114           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
115         end
116         if ( i_e_value_threshold < 0.0 )
117           Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
118         end
119       end
120
121       
122
123       fs_e_value_threshold = -1.0
124       if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
125         begin
126           fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
127         rescue ArgumentError => e
128           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
129         end
130         if ( fs_e_value_threshold < 0.0 )
131           Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
132         end
133       end
134
135       hmm_for_protein_output = ""
136       if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
137         begin
138           hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
139         rescue ArgumentError => e
140           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
141         end
142       end
143
144       uniprot = ""
145       if ( cla.is_option_set?( UNIPROT ) )
146         begin
147           uniprot = cla.get_option_value( UNIPROT )
148         rescue ArgumentError => e
149           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
150         end
151       end
152
153       species = "HUMAN"
154        if ( cla.is_option_set?( SPECIES_OPTION ) )
155         begin
156           species = cla.get_option_value( SPECIES_OPTION )
157         rescue ArgumentError => e
158           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
159         end
160       end
161
162       ignore_dufs = false
163       if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
164         ignore_dufs = true
165       end
166
167       parse_descriptions = false
168       if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
169         parse_descriptions = true
170       end
171
172       puts()
173       puts( "hmmpfam outputfile  : " + inpath )
174       puts( "outputfile          : " + outpath )
175       puts( "species             : " + species )
176       if ( i_e_value_threshold >= 0.0 )
177         puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
178       else
179         puts( "i-E-value threshold : no threshold" )
180       end
181       if ( parse_descriptions )
182         puts( "parse descriptions  : true" )
183       else
184         puts( "parse descriptions  : false" )
185       end
186       if ( ignore_dufs )
187         puts( "ignore DUFs         : true" )
188       else
189         puts( "ignore DUFs         : false" )
190       end
191       if ( column_delimiter == "\t" )
192         puts( "column delimiter    : TAB" )
193       else
194         puts( "column delimiter     : " + column_delimiter )
195       end
196       if fs_e_value_threshold >= 0.0
197         puts( "E-value threshold   : " + fs_e_value_threshold.to_s )
198       else
199         puts( "E-value threshold   : no threshold" )
200       end
201       if !hmm_for_protein_output.empty?
202         puts( "HMM for proteins    : " + hmm_for_protein_output )
203       end
204       if !uniprot.empty?
205         puts( "Uniprot             : " + uniprot )
206       end
207       puts()
208
209       begin
210         parse( inpath,
211           outpath,
212           column_delimiter,
213           i_e_value_threshold,
214           ignore_dufs,
215           parse_descriptions,
216           fs_e_value_threshold,
217           hmm_for_protein_output,
218           uniprot,
219           species )
220       rescue IOError => e
221         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
222       end
223       domain_counts = get_domain_counts()
224
225
226       puts
227       puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
228       puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
229       puts
230       puts( Util.draw_histogram( domain_counts, "#" ) )
231       puts
232       Util.print_message( PRG_NAME, 'OK' )
233       puts
234
235     end # def run
236
237     private
238
239     # raises ArgumentError, IOError
240     def parse( inpath,
241         outpath,
242         column_delimiter,
243         i_e_value_threshold,
244         ignore_dufs,
245         get_descriptions,
246         fs_e_value_threshold,
247         hmm_for_protein_output,
248         uniprot,
249         species )
250
251
252
253       Util.check_file_for_readability( inpath )
254       Util.check_file_for_writability( outpath )
255
256       hmmscan_parser = HmmscanParser.new( inpath )
257       results = hmmscan_parser.parse
258
259       outfile = File.open( outpath, "a" )
260
261       query     = ""
262       desc      = ""
263       model     = ""
264       env_from  = ""
265       env_to    = ""
266       i_e_value = ""
267
268       hmmscan_results_per_protein = []
269
270
271
272       prev_query = ""
273
274       results.each do | r |
275         model     = r.model
276         query     = r.query
277         i_e_value = r.i_e_value
278         env_from  = r.env_from
279         env_to    = r.env_to
280
281         if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
282             ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
283           count_model( model )
284           outfile.print( query +
285               column_delimiter )
286           if ( get_descriptions )
287             outfile.print( desc +
288                 column_delimiter )
289           end
290           outfile.print( model +
291               column_delimiter +
292               env_from.to_s +
293               column_delimiter +
294               env_to.to_s +
295               column_delimiter +
296               i_e_value.to_s )
297           outfile.print( Constants::LINE_DELIMITER )
298         end
299
300         if !hmm_for_protein_output.empty?
301           if  !prev_query.empty? && prev_query != query
302             if !hmmscan_results_per_protein.empty?
303               process_hmmscan_results_per_protein( hmmscan_results_per_protein,
304                 fs_e_value_threshold,
305                 hmm_for_protein_output,
306                 i_e_value_threshold,
307                 true )
308             end
309             hmmscan_results_per_protein.clear
310           end
311           prev_query = query
312
313           if USE_AVOID_HMMS
314             if !AVOID_HHMS.include? r.model
315               hmmscan_results_per_protein << r
316             end
317           else
318             hmmscan_results_per_protein << r
319           end
320         end
321       end
322       if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty?
323         process_hmmscan_results_per_protein( hmmscan_results_per_protein,
324           fs_e_value_threshold,
325           hmm_for_protein_output,
326           i_e_value_threshold,
327           true )
328       end
329
330       outfile.flush()
331       outfile.close()
332
333     end # def parse
334
335     def process_id( id )
336       if  id =~ /(sp|tr)\|\S+\|(\S+)/
337         id = $2
338       end
339       id
340     end
341
342
343
344     def count_model( model )
345       if ( @domain_counts.has_key?( model ) )
346         count = @domain_counts[ model ].to_i
347         count += 1
348         @domain_counts[ model ] = count
349       else
350         @domain_counts[ model ] = 1
351       end
352     end
353
354     def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
355         fs_e_value_threshold,
356         hmm_for_protein_output,
357         i_e_value_threshold,
358         uniprotkb )
359
360       dc = 0
361       # filter according to i-Evalue threshold
362       # abort if fs Evalue too high
363       hmmscan_results_per_protein_filtered = []
364
365       hmmscan_results_per_protein.each do | r |
366         if r.model == hmm_for_protein_output
367           if r.fs_e_value > fs_e_value_threshold
368             return
369           end
370         end
371         if r.i_e_value <= i_e_value_threshold
372           hmmscan_results_per_protein_filtered << r
373           if r.model == hmm_for_protein_output
374             dc += 1
375           end
376         end
377       end
378
379       if dc == 0
380         # passed on protein E-value, failed in per domain E-values
381         return
382       end
383
384       hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
385
386       own = nil
387       hmmscan_results_per_protein_filtered.each do | r |
388         if r.model == hmm_for_protein_output
389           own = r
390         end
391       end
392
393       s = ""
394       s << own.query + "\t"
395       s << species + "\t"
396       s << own.fs_e_value.to_s + "\t"
397       s << own.qlen.to_s + "\t"
398       s << dc.to_s + "\t"
399       s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
400       hmmscan_results_per_protein_filtered.each do | r |
401         s << r.model + " "
402       end
403       s << "\t"
404       e = UniprotKB::get_entry_by_id( process_id( own.query ) )
405
406       if e != nil
407         s << uniprot_annotation( e )
408        # s << "\uniprot_annotationt"
409       end
410
411       overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
412
413       s << overview  + "\t"
414
415       s << calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )  + "\t"
416
417       prev_r = nil
418       hmmscan_results_per_protein_filtered.each do | r |
419
420         if  prev_r != nil
421           s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
422         else
423           s << make_interdomain_sequence( r.env_from, false )
424         end
425         s << r.model
426         s << "["
427         s << r.env_from.to_s << "-" << r.env_to.to_s
428         s << "|ie=" << r.i_e_value.to_s
429         s << "|ce=" << r.c_e_value.to_s
430         s << "]"
431         prev_r = r
432       end
433       s << make_interdomain_sequence( own.qlen - prev_r.env_from, false )
434       puts s
435     end
436
437     def uniprot_annotation( e )
438       s = ""
439       pdb_ids = e.get_pdb_ids
440       if !pdb_ids.empty?
441         pdb_ids.each do | pdb |
442           s << pdb << ", "
443         end
444       else
445         s << "-"
446       end
447       s
448     end
449
450     def calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )
451       linkers = ""
452       prev_r = nil
453       hmmscan_results_per_protein_filtered.each do | r |
454         if r.model == hmm_for_protein_output
455           if  prev_r != nil
456             linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
457           end
458           prev_r = r
459         end
460       end
461       linkers
462     end
463
464     def get_domain_counts()
465       return @domain_counts
466     end
467
468     def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
469       overview = ""
470       prev_r = nil
471       hmmscan_results_per_protein_filtered.each do | r |
472         if r.model == hmm_for_protein_output
473           if prev_r == nil
474             overview << hmm_for_protein_output
475           else
476             if  ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
477               overview << "~" << hmm_for_protein_output
478             else
479               overview << "----" << hmm_for_protein_output
480             end
481           end
482           prev_r = r
483         end
484       end
485       overview
486     end
487
488     def make_interdomain_sequence( d, mark_short = true )
489       s = ""
490       d /= 20
491       if d >= 10
492         s << "----//----"
493       elsif d >= 1
494         d.times do
495           s << "-"
496         end
497       elsif mark_short
498         s << "~"
499       end
500       s
501     end
502
503
504
505     def print_help()
506       puts( "Usage:" )
507       puts()
508       puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
509       puts()
510       puts( "  options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
511       puts( "           -" + I_E_VALUE_THRESHOLD_OPTION  + ": i-E-value threshold, default is no threshold" )
512       puts( "           -" + PARSE_OUT_DESCRIPITION_OPTION  + ": parse query description (in addition to query name)" )
513       puts( "           -" + IGNORE_DUF_OPTION  + ": ignore DUFs" )
514       puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + ": E-value threshold for full protein sequences, only for protein summary" )
515       puts( "           -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
516       puts()
517     end
518
519   end # class
520
521 end # module Evoruby