pdb
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / hmmscan_summary.rb
1 #
2 # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
3 #
4 # Copyright::  Copyright (C) 2012 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
8 #
9 # last modified: 121003
10
11 require 'set'
12
13 require 'lib/evo/util/constants'
14 require 'lib/evo/util/util'
15 require 'lib/evo/util/command_line_arguments'
16 require 'lib/evo/io/parser/hmmscan_parser'
17 require 'lib/evo/io/web/uniprotkb'
18
19 module Evoruby
20
21   class HmmscanSummary
22
23     PRG_NAME       = "hsp"
24     PRG_VERSION    = "2.000"
25     PRG_DESC       = "hmmscan summary"
26     PRG_DATE       = "2012.10.23"
27     COPYRIGHT      = "2012 Christian M Zmasek"
28     CONTACT        = "phylosoft@gmail.com"
29     WWW            = "www.phylosoft.org"
30
31     DELIMITER_OPTION              = "d"
32     I_E_VALUE_THRESHOLD_OPTION    = "ie"
33     FS_E_VALUE_THRESHOLD_OPTION   = "pe"
34     HMM_FOR_PROTEIN_OUTPUT        = "m"
35     IGNORE_DUF_OPTION             = "i"
36     PARSE_OUT_DESCRIPITION_OPTION = "a"
37     UNIPROT                       = "u"
38     HELP_OPTION_1                 = "help"
39     HELP_OPTION_2                 = "h"
40
41     USE_AVOID_HMMS = true
42     AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
43     LIMIT_FOR_CLOSE_DOMAINS = 20
44
45     def initialize
46       @domain_counts = Hash.new
47     end
48
49     def run
50
51
52
53       Util.print_program_information( PRG_NAME,
54         PRG_VERSION,
55         PRG_DESC,
56         PRG_DATE,
57         COPYRIGHT,
58         CONTACT,
59         WWW,
60         STDOUT )
61
62       begin
63         cla = CommandLineArguments.new( ARGV )
64       rescue ArgumentError => e
65         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
66       end
67
68       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
69            cla.is_option_set?( HELP_OPTION_2 ) )
70         print_help
71         exit( 0 )
72       end
73
74       if ( cla.get_number_of_files != 2 )
75         print_help
76         exit( -1 )
77       end
78
79       allowed_opts = Array.new
80       allowed_opts.push( DELIMITER_OPTION )
81       allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
82       allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
83       allowed_opts.push( IGNORE_DUF_OPTION )
84       allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
85       allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
86       allowed_opts.push( UNIPROT )
87
88       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
89       if ( disallowed.length > 0 )
90         Util.fatal_error( PRG_NAME,
91           "unknown option(s): " + disallowed,
92           STDOUT )
93       end
94
95       inpath = cla.get_file_name( 0 )
96       outpath = cla.get_file_name( 1 )
97
98       column_delimiter = "\t"
99       if ( cla.is_option_set?( DELIMITER_OPTION ) )
100         begin
101           column_delimiter = cla.get_option_value( DELIMITER_OPTION )
102         rescue ArgumentError => e
103           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
104         end
105       end
106
107       i_e_value_threshold = -1.0
108       if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
109         begin
110           i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
111         rescue ArgumentError => e
112           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
113         end
114         if ( i_e_value_threshold < 0.0 )
115           Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
116         end
117       end
118
119       fs_e_value_threshold = -1.0
120       if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
121         begin
122           fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
123         rescue ArgumentError => e
124           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
125         end
126         if ( fs_e_value_threshold < 0.0 )
127           Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
128         end
129       end
130
131       hmm_for_protein_output = ""
132       if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
133         begin
134           hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
135         rescue ArgumentError => e
136           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
137         end
138       end
139
140       uniprot = ""
141       if ( cla.is_option_set?( UNIPROT ) )
142         begin
143           uniprot = cla.get_option_value( UNIPROT )
144         rescue ArgumentError => e
145           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
146         end
147       end
148
149       ignore_dufs = false
150       if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
151         ignore_dufs = true
152       end
153
154       parse_descriptions = false
155       if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
156         parse_descriptions = true
157       end
158
159       puts()
160       puts( "hmmpfam outputfile  : " + inpath )
161       puts( "outputfile          : " + outpath )
162       if ( i_e_value_threshold >= 0.0 )
163         puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
164       else
165         puts( "i-E-value threshold : no threshold" )
166       end
167       if ( parse_descriptions )
168         puts( "parse descriptions  : true" )
169       else
170         puts( "parse descriptions  : false" )
171       end
172       if ( ignore_dufs )
173         puts( "ignore DUFs         : true" )
174       else
175         puts( "ignore DUFs         : false" )
176       end
177       if ( column_delimiter == "\t" )
178         puts( "column delimiter    : TAB" )
179       else
180         puts( "column delimiter     : " + column_delimiter )
181       end
182       if fs_e_value_threshold >= 0.0
183         puts( "E-value threshold   : " + fs_e_value_threshold.to_s )
184       else
185         puts( "E-value threshold   : no threshold" )
186       end
187       if !hmm_for_protein_output.empty?
188         puts( "HMM for proteins    : " + hmm_for_protein_output )
189       end
190       if !uniprot.empty?
191         puts( "Uniprot             : " + uniprot )
192       end
193       puts()
194
195       begin
196         parse( inpath,
197           outpath,
198           column_delimiter,
199           i_e_value_threshold,
200           ignore_dufs,
201           parse_descriptions,
202           fs_e_value_threshold,
203           hmm_for_protein_output,
204           uniprot )
205       rescue IOError => e
206         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
207       end
208       domain_counts = get_domain_counts()
209
210
211       puts
212       puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
213       puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
214       puts
215       puts( Util.draw_histogram( domain_counts, "#" ) )
216       puts
217       Util.print_message( PRG_NAME, 'OK' )
218       puts
219
220     end # def run
221
222     private
223
224     # raises ArgumentError, IOError
225     def parse( inpath,
226         outpath,
227         column_delimiter,
228         i_e_value_threshold,
229         ignore_dufs,
230         get_descriptions,
231         fs_e_value_threshold,
232         hmm_for_protein_output,
233         uniprot )
234
235
236
237       Util.check_file_for_readability( inpath )
238       Util.check_file_for_writability( outpath )
239
240       hmmscan_parser = HmmscanParser.new( inpath )
241       results = hmmscan_parser.parse
242
243       outfile = File.open( outpath, "a" )
244
245       query     = ""
246       desc      = ""
247       model     = ""
248       env_from  = ""
249       env_to    = ""
250       i_e_value = ""
251
252       hmmscan_results_per_protein = []
253
254
255
256       prev_query = ""
257
258       results.each do | r |
259         model     = r.model
260         query     = r.query
261         i_e_value = r.i_e_value
262         env_from  = r.env_from
263         env_to    = r.env_to
264
265         if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
266            ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
267           count_model( model )
268           outfile.print( query +
269              column_delimiter )
270           if ( get_descriptions )
271             outfile.print( desc +
272                column_delimiter )
273           end
274           outfile.print( model +
275              column_delimiter +
276              env_from.to_s +
277              column_delimiter +
278              env_to.to_s +
279              column_delimiter +
280              i_e_value.to_s )
281           outfile.print( Constants::LINE_DELIMITER )
282         end
283
284         if !hmm_for_protein_output.empty?
285           if  !prev_query.empty? && prev_query != query
286             if !hmmscan_results_per_protein.empty?
287               process_hmmscan_results_per_protein( hmmscan_results_per_protein,
288                 fs_e_value_threshold,
289                 hmm_for_protein_output,
290                 i_e_value_threshold,
291                 true )
292             end
293             hmmscan_results_per_protein.clear
294           end
295           prev_query = query
296
297           if USE_AVOID_HMMS
298             if !AVOID_HHMS.include? r.model
299               hmmscan_results_per_protein << r
300             end
301           else
302             hmmscan_results_per_protein << r
303           end
304         end
305       end
306       if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty?
307         process_hmmscan_results_per_protein( hmmscan_results_per_protein,
308           fs_e_value_threshold,
309           hmm_for_protein_output,
310           i_e_value_threshold,
311           true )
312       end
313
314       outfile.flush()
315       outfile.close()
316
317     end # def parse
318
319     def process_id( id )
320       if  id =~ /(sp|tr)\|\S+\|(\S+)/
321         id = $2
322       end
323       id
324     end
325
326
327
328     def count_model( model )
329       if ( @domain_counts.has_key?( model ) )
330         count = @domain_counts[ model ].to_i
331         count += 1
332         @domain_counts[ model ] = count
333       else
334         @domain_counts[ model ] = 1
335       end
336     end
337
338     def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
339         fs_e_value_threshold,
340         hmm_for_protein_output,
341         i_e_value_threshold,
342         uniprotkb )
343
344       dc = 0
345       # filter according to i-Evalue threshold
346       # abort if fs Evalue too high
347       hmmscan_results_per_protein_filtered = []
348
349       hmmscan_results_per_protein.each do | r |
350         if r.model == hmm_for_protein_output
351           if r.fs_e_value > fs_e_value_threshold
352             return
353           end
354         end
355         if r.i_e_value <= i_e_value_threshold
356           hmmscan_results_per_protein_filtered << r
357           if r.model == hmm_for_protein_output
358             dc += 1
359           end
360         end
361       end
362
363       if dc == 0
364         # passed on protein E-value, failed in per domain E-values
365         return
366       end
367
368       hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
369
370       own = nil
371       hmmscan_results_per_protein_filtered.each do | r |
372         if r.model == hmm_for_protein_output
373           own = r
374         end
375       end
376
377       s = ""
378       s << own.query + "\t"
379       s << "HUMAN" + "\t"
380       s << own.fs_e_value.to_s + "\t"
381       s << own.qlen.to_s + "\t"
382       s << dc.to_s + "\t"
383       s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
384       hmmscan_results_per_protein_filtered.each do | r |
385         s << r.model + " "
386       end
387       s << "\t"
388       e = UniprotKB::get_entry_by_id( process_id( own.query ) )
389       # if e != nil && e.de != nil
390       #   e.de.each do |i|
391       #
392       #   end
393       # else
394       #   s << "-"
395       # end
396       s << "\t"
397       if e != nil && e.dr != nil
398         e.dr.each do | dr |
399           if dr != nil
400             if dr =~ /PDB;\s+([A-Z0-9]{4});/
401               s << $1
402
403             end
404           end
405         end
406       else
407         s << "-"
408       end
409       s << "\t"
410
411
412
413
414
415       s << "\t"
416       overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
417
418       s << overview   + "\t"
419
420       s << calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )  + "\t"
421
422       prev_r = nil
423       hmmscan_results_per_protein_filtered.each do | r |
424
425         if  prev_r != nil
426           s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
427         else
428           s << make_interdomain_sequence( r.env_from, false )
429         end
430         s << r.model
431         s << "["
432         s << r.env_from.to_s << "-" << r.env_to.to_s
433         s << "|ie=" << r.i_e_value.to_s
434         s << "|ce=" << r.c_e_value.to_s
435         s << "]"
436         prev_r = r
437       end
438       s << make_interdomain_sequence( own.qlen - prev_r.env_from, false )
439       puts s
440     end
441
442
443     def calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )
444       linkers = ""
445       prev_r = nil
446       hmmscan_results_per_protein_filtered.each do | r |
447         if r.model == hmm_for_protein_output
448           if  prev_r != nil
449             linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
450           end
451           prev_r = r
452         end
453       end
454       linkers
455     end
456
457     def get_domain_counts()
458       return @domain_counts
459     end
460
461     def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
462       overview = ""
463       prev_r = nil
464       hmmscan_results_per_protein_filtered.each do | r |
465         if r.model == hmm_for_protein_output
466           if prev_r == nil
467             overview << hmm_for_protein_output
468           else
469             if  ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
470               overview << "~" << hmm_for_protein_output
471             else
472               overview << "----" << hmm_for_protein_output
473             end
474           end
475           prev_r = r
476         end
477       end
478       overview
479     end
480
481     def make_interdomain_sequence( d, mark_short = true )
482       s = ""
483       d /= 20
484       if d >= 10
485         s << "----//----"
486       elsif d >= 1
487         d.times do
488           s << "-"
489         end
490       elsif mark_short
491         s << "~"
492       end
493       s
494     end
495
496
497
498     def print_help()
499       puts( "Usage:" )
500       puts()
501       puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
502       puts()
503       puts( "  options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
504       puts( "           -" + I_E_VALUE_THRESHOLD_OPTION  + ": i-E-value threshold, default is no threshold" )
505       puts( "           -" + PARSE_OUT_DESCRIPITION_OPTION  + ": parse query description (in addition to query name)" )
506       puts( "           -" + IGNORE_DUF_OPTION  + ": ignore DUFs" )
507       puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + ": E-value threshold for full protein sequences, only for protein summary" )
508       puts( "           -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
509       puts()
510     end
511
512   end # class
513
514 end # module Evoruby