inprogress
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / hmmscan_summary.rb
1 #
2 # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
3 #
4 # Copyright::  Copyright (C) 2012 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
8 #
9
10 require 'set'
11
12 require 'lib/evo/util/constants'
13 require 'lib/evo/util/util'
14 require 'lib/evo/util/command_line_arguments'
15 require 'lib/evo/io/parser/hmmscan_parser'
16 require 'lib/evo/io/web/uniprotkb'
17
18 module Evoruby
19
20   class HmmscanSummary
21
22     PRG_NAME       = "hsp"
23     PRG_VERSION    = "2.001"
24     PRG_DESC       = "hmmscan summary"
25     PRG_DATE       = "2013.10.23"
26     COPYRIGHT      = "2013 Christian M Zmasek"
27     CONTACT        = "phyloxml@gmail.com"
28     WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
29
30     DELIMITER_OPTION              = "d"
31     SPECIES_OPTION                = "s"
32     I_E_VALUE_THRESHOLD_OPTION    = "ie"
33     FS_E_VALUE_THRESHOLD_OPTION   = "pe"
34     HMM_FOR_PROTEIN_OUTPUT        = "m"
35     IGNORE_DUF_OPTION             = "i"
36     PARSE_OUT_DESCRIPITION_OPTION = "a"
37     UNIPROT                       = "u"
38     HELP_OPTION_1                 = "help"
39     HELP_OPTION_2                 = "h"
40
41     USE_AVOID_HMMS = true
42     AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
43     LIMIT_FOR_CLOSE_DOMAINS = 20
44
45     def initialize
46       @domain_counts = Hash.new
47     end
48
49     def run
50
51       Util.print_program_information( PRG_NAME,
52         PRG_VERSION,
53         PRG_DESC,
54         PRG_DATE,
55         COPYRIGHT,
56         CONTACT,
57         WWW,
58         STDOUT )
59
60       begin
61         cla = CommandLineArguments.new( ARGV )
62       rescue ArgumentError => e
63         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
64       end
65
66       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
67            cla.is_option_set?( HELP_OPTION_2 ) )
68         print_help
69         exit( 0 )
70       end
71
72       if ( cla.get_number_of_files != 2 )
73         print_help
74         exit( -1 )
75       end
76
77       allowed_opts = Array.new
78       allowed_opts.push( DELIMITER_OPTION )
79       allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
80       allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
81       allowed_opts.push( IGNORE_DUF_OPTION )
82       allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
83       allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
84       allowed_opts.push( UNIPROT )
85       allowed_opts.push( SPECIES_OPTION )
86
87       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
88       if ( disallowed.length > 0 )
89         Util.fatal_error( PRG_NAME,
90           "unknown option(s): " + disallowed,
91           STDOUT )
92       end
93
94       inpath = cla.get_file_name( 0 )
95       outpath = cla.get_file_name( 1 )
96
97       column_delimiter = "\t"
98       if ( cla.is_option_set?( DELIMITER_OPTION ) )
99         begin
100           column_delimiter = cla.get_option_value( DELIMITER_OPTION )
101         rescue ArgumentError => e
102           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
103         end
104       end
105
106       i_e_value_threshold = -1.0
107       if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
108         begin
109           i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
110         rescue ArgumentError => e
111           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
112         end
113         if ( i_e_value_threshold < 0.0 )
114           Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
115         end
116       end
117
118
119
120       fs_e_value_threshold = -1.0
121       if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
122         begin
123           fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
124         rescue ArgumentError => e
125           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
126         end
127         if ( fs_e_value_threshold < 0.0 )
128           Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
129         end
130       end
131
132       hmm_for_protein_output = ""
133       if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
134         begin
135           hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
136         rescue ArgumentError => e
137           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
138         end
139       end
140
141       uniprot = ""
142       if ( cla.is_option_set?( UNIPROT ) )
143         begin
144           uniprot = cla.get_option_value( UNIPROT )
145         rescue ArgumentError => e
146           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
147         end
148       end
149
150       species = "HUMAN"
151       if ( cla.is_option_set?( SPECIES_OPTION ) )
152         begin
153           species = cla.get_option_value( SPECIES_OPTION )
154         rescue ArgumentError => e
155           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
156         end
157       end
158
159       ignore_dufs = false
160       if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
161         ignore_dufs = true
162       end
163
164       parse_descriptions = false
165       if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
166         parse_descriptions = true
167       end
168
169       puts()
170       puts( "hmmpfam outputfile  : " + inpath )
171       puts( "outputfile          : " + outpath )
172       puts( "species             : " + species )
173       if ( i_e_value_threshold >= 0.0 )
174         puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
175       else
176         puts( "i-E-value threshold : no threshold" )
177       end
178       if ( parse_descriptions )
179         puts( "parse descriptions  : true" )
180       else
181         puts( "parse descriptions  : false" )
182       end
183       if ( ignore_dufs )
184         puts( "ignore DUFs         : true" )
185       else
186         puts( "ignore DUFs         : false" )
187       end
188       if ( column_delimiter == "\t" )
189         puts( "column delimiter    : TAB" )
190       else
191         puts( "column delimiter     : " + column_delimiter )
192       end
193       if fs_e_value_threshold >= 0.0
194         puts( "E-value threshold   : " + fs_e_value_threshold.to_s )
195       else
196         puts( "E-value threshold   : no threshold" )
197       end
198       if !hmm_for_protein_output.empty?
199         puts( "HMM for proteins    : " + hmm_for_protein_output )
200       end
201       if !uniprot.empty?
202         puts( "Uniprot             : " + uniprot )
203       end
204       puts()
205
206       begin
207         parse( inpath,
208           outpath,
209           column_delimiter,
210           i_e_value_threshold,
211           ignore_dufs,
212           parse_descriptions,
213           fs_e_value_threshold,
214           hmm_for_protein_output,
215           uniprot,
216           species )
217       rescue IOError => e
218         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
219       end
220       domain_counts = get_domain_counts()
221
222       puts
223       puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
224       puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
225       puts
226       puts( Util.draw_histogram( domain_counts, "#" ) )
227       puts
228       Util.print_message( PRG_NAME, 'OK' )
229       puts
230
231     end # def run
232
233     private
234
235     # raises ArgumentError, IOError
236     def parse( inpath,
237         outpath,
238         column_delimiter,
239         i_e_value_threshold,
240         ignore_dufs,
241         get_descriptions,
242         fs_e_value_threshold,
243         hmm_for_protein_output,
244         uniprot,
245         species )
246
247       Util.check_file_for_readability( inpath )
248       Util.check_file_for_writability( outpath )
249
250       hmmscan_parser = HmmscanParser.new( inpath )
251       results = hmmscan_parser.parse
252
253       outfile = File.open( outpath, "a" )
254
255       query     = ""
256       desc      = ""
257       model     = ""
258       env_from  = ""
259       env_to    = ""
260       i_e_value = ""
261
262       hmmscan_results_per_protein = []
263
264       prev_query = ""
265
266       results.each do | r |
267         model     = r.model
268         query     = r.query
269         i_e_value = r.i_e_value
270         env_from  = r.env_from
271         env_to    = r.env_to
272
273         if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
274            ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
275           count_model( model )
276           outfile.print( query +
277              column_delimiter )
278           if ( get_descriptions )
279             outfile.print( desc +
280                column_delimiter )
281           end
282           outfile.print( model +
283              column_delimiter +
284              env_from.to_s +
285              column_delimiter +
286              env_to.to_s +
287              column_delimiter +
288              i_e_value.to_s )
289           outfile.print( Constants::LINE_DELIMITER )
290         end
291
292         if !hmm_for_protein_output.empty?
293           if  !prev_query.empty? && prev_query != query
294             if !hmmscan_results_per_protein.empty?
295               process_hmmscan_results_per_protein( hmmscan_results_per_protein,
296                 fs_e_value_threshold,
297                 hmm_for_protein_output,
298                 i_e_value_threshold,
299                 false,
300                 species )
301             end
302             hmmscan_results_per_protein.clear
303           end
304           prev_query = query
305
306           if USE_AVOID_HMMS
307             if !AVOID_HHMS.include? r.model
308               hmmscan_results_per_protein << r
309             end
310           else
311             hmmscan_results_per_protein << r
312           end
313         end
314       end
315
316       if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty?
317         process_hmmscan_results_per_protein( hmmscan_results_per_protein,
318           fs_e_value_threshold,
319           hmm_for_protein_output,
320           i_e_value_threshold,
321           uniprot,
322           species )
323       end
324
325       outfile.flush()
326       outfile.close()
327     end # def parse
328
329     def process_id( id )
330       if  id =~ /(sp|tr)\|\S+\|(\S+)/
331         id = $2
332       end
333       id
334     end
335
336     def count_model( model )
337       if ( @domain_counts.has_key?( model ) )
338         count = @domain_counts[ model ].to_i
339         count += 1
340         @domain_counts[ model ] = count
341       else
342         @domain_counts[ model ] = 1
343       end
344     end
345
346     def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
347         fs_e_value_threshold,
348         hmm_for_protein_output,
349         i_e_value_threshold,
350         uniprotkb,
351         species )
352
353       dc = 0
354       # filter according to i-Evalue threshold
355       # abort if fs Evalue too high
356       hmmscan_results_per_protein_filtered = []
357
358       hmmscan_results_per_protein.each do | r |
359
360
361         if r.model == hmm_for_protein_output
362           if i_e_value_threshold > 0.0 && r.fs_e_value > fs_e_value_threshold
363             return
364           end
365         end
366         if i_e_value_threshold <= 0 || r.i_e_value <= i_e_value_threshold
367           hmmscan_results_per_protein_filtered << r
368           if r.model == hmm_for_protein_output
369             dc += 1
370           end
371         end
372       end
373
374       if dc == 0
375         # passed on protein E-value, failed in per domain E-values
376         return
377       end
378
379       hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
380
381       own = nil
382       hmmscan_results_per_protein_filtered.each do | r |
383         if r.model == hmm_for_protein_output
384           own = r
385         end
386       end
387
388       s = ""
389       s << own.query + "\t"
390       s << species + "\t"
391       s << own.fs_e_value.to_s + "\t"
392       s << own.qlen.to_s + "\t"
393       s << dc.to_s + "\t"
394       s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
395       hmmscan_results_per_protein_filtered.each do | r |
396         s << r.model + " "
397       end
398       s << "\t"
399
400       if !uniprotkb.empty?
401         #e = UniprotKB::get_entry_by_id( process_id( own.query ) )
402
403         #if e != nil
404         #  s << uniprot_annotation( e )
405         # # s << "\uniprot_annotationt"
406         #end
407       end
408
409       overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
410
411       s << overview  + "\t"
412
413       s << calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )  + "\t"
414
415       prev_r = nil
416       hmmscan_results_per_protein_filtered.each do | r |
417
418         if  prev_r != nil
419           s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
420         else
421           s << make_interdomain_sequence( r.env_from, false )
422         end
423         s << r.model
424         s << "["
425         s << r.env_from.to_s << "-" << r.env_to.to_s
426         s << "|ie=" << r.i_e_value.to_s
427         s << "|ce=" << r.c_e_value.to_s
428         s << "]"
429         prev_r = r
430       end
431       s << make_interdomain_sequence( own.qlen - prev_r.env_from, false )
432       puts s
433     end
434
435     def uniprot_annotation( e )
436       s = ""
437       pdb_ids = e.get_pdb_ids
438       if !pdb_ids.empty?
439         pdb_ids.each do | pdb |
440           s << pdb << ", "
441         end
442       else
443         s << "-"
444       end
445       s
446     end
447
448     def calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )
449       linkers = ""
450       prev_r = nil
451       hmmscan_results_per_protein_filtered.each do | r |
452         if r.model == hmm_for_protein_output
453           if  prev_r != nil
454             linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
455           end
456           prev_r = r
457         end
458       end
459       linkers
460     end
461
462     def get_domain_counts()
463       return @domain_counts
464     end
465
466     def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
467       overview = ""
468       prev_r = nil
469       hmmscan_results_per_protein_filtered.each do | r |
470         if r.model == hmm_for_protein_output
471           if prev_r == nil
472             overview << hmm_for_protein_output
473           else
474             if  ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
475               overview << "~" << hmm_for_protein_output
476             else
477               overview << "----" << hmm_for_protein_output
478             end
479           end
480           prev_r = r
481         end
482       end
483       overview
484     end
485
486     def make_interdomain_sequence( d, mark_short = true )
487       s = ""
488       d /= 20
489       if d >= 10
490         s << "----//----"
491       elsif d >= 1
492         d.times do
493           s << "-"
494         end
495       elsif mark_short
496         s << "~"
497       end
498       s
499     end
500
501
502     def print_help()
503       puts( "Usage:" )
504       puts()
505       puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
506       puts()
507       puts( "  options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
508       puts( "           -" + I_E_VALUE_THRESHOLD_OPTION  + ": i-E-value threshold, default is no threshold" )
509       puts( "           -" + PARSE_OUT_DESCRIPITION_OPTION  + ": parse query description (in addition to query name)" )
510       puts( "           -" + IGNORE_DUF_OPTION  + ": ignore DUFs" )
511       puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + ": E-value threshold for full protein sequences, only for protein summary" )
512       puts( "           -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
513       puts( "           -" + SPECIES_OPTION + ": species for protein summary" )
514       puts()
515     end
516
517   end # class
518
519 end # module Evoruby