2 # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
4 # Copyright:: Copyright (C) 2017 Christian M Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
8 require 'lib/evo/util/constants'
9 require 'lib/evo/util/util'
10 require 'lib/evo/util/command_line_arguments'
11 require 'lib/evo/io/parser/hmmscan_parser'
18 PRG_DESC = "Summarize hmmscan output tables into simpler tables"
20 WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
22 DELIMITER_OPTION = "d"
24 I_E_VALUE_THRESHOLD_OPTION = "ie"
25 FS_E_VALUE_THRESHOLD_OPTION = "pe"
26 HMM_FOR_PROTEIN_OUTPUT = "m"
27 IGNORE_DUF_OPTION = "i"
28 PARSE_OUT_DESCRIPITION_OPTION = "a"
29 HELP_OPTION_1 = "help"
32 USE_AVOID_HMMS = false
33 AVOID_HHMS = [ "x", "y", "z" ]
34 LIMIT_FOR_CLOSE_DOMAINS = 20 # Used for protein architecture summary
37 @domain_counts = Hash.new
42 Util.print_program_information( PRG_NAME,
49 if ( ARGV == nil || ( ARGV.length < 1 ) )
55 cla = CommandLineArguments.new( ARGV )
56 rescue ArgumentError => e
57 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
60 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
61 cla.is_option_set?( HELP_OPTION_2 ) )
66 allowed_opts = Array.new
67 allowed_opts.push( DELIMITER_OPTION )
68 allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
69 allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
70 allowed_opts.push( IGNORE_DUF_OPTION )
71 allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
72 allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
73 allowed_opts.push( SPECIES_OPTION )
75 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
76 if ( disallowed.length > 0 )
77 Util.fatal_error( PRG_NAME,
78 "unknown option(s): " + disallowed,
82 inpath = cla.get_file_name( 0 )
85 if ( cla.get_number_of_files == 1 )
86 outpath = inpath + Constants::DOMAIN_TABLE_SUFFIX
87 elsif ( cla.get_number_of_files == 2 )
88 outpath = cla.get_file_name( 1 )
94 column_delimiter = "\t"
95 if ( cla.is_option_set?( DELIMITER_OPTION ) )
97 column_delimiter = cla.get_option_value( DELIMITER_OPTION )
98 rescue ArgumentError => e
99 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
103 i_e_value_threshold = -1.0
104 if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
106 i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
107 rescue ArgumentError => e
108 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
110 if ( i_e_value_threshold < 0.0 )
111 Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
115 fs_e_value_threshold = -1.0
116 if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
118 fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
119 rescue ArgumentError => e
120 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
122 if ( fs_e_value_threshold < 0.0 )
123 Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
127 hmm_for_protein_output = ""
128 if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
130 hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
131 rescue ArgumentError => e
132 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
137 if ( cla.is_option_set?( SPECIES_OPTION ) )
139 species = cla.get_option_value( SPECIES_OPTION )
140 rescue ArgumentError => e
141 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
146 if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
150 parse_descriptions = false
151 if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
152 parse_descriptions = true
156 puts( "hmmscan outputfile : " + inpath )
157 puts( "outputfile : " + outpath )
159 if ( i_e_value_threshold >= 0.0 )
160 puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
162 puts( "i-E-value threshold : no threshold" )
164 if ( parse_descriptions )
165 puts( "parse descriptions : true" )
167 puts( "parse descriptions : false" )
170 puts( "ignore DUFs : true" )
172 puts( "ignore DUFs : false" )
174 if ( column_delimiter == "\t" )
175 puts( "column delimiter : TAB" )
177 puts( "column delimiter : " + column_delimiter )
179 if !hmm_for_protein_output.empty?
180 puts( "HMM for proteins : " + hmm_for_protein_output )
181 puts( "species : " + species )
182 if fs_e_value_threshold >= 0.0
183 puts( "E-value threshold : " + fs_e_value_threshold.to_s )
185 puts( "E-value threshold : no threshold" )
197 fs_e_value_threshold,
198 hmm_for_protein_output,
201 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
203 domain_counts = get_domain_counts()
206 puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
207 puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
209 puts( Util.draw_histogram( domain_counts, "#" ) )
211 Util.print_message( PRG_NAME, "wrote: " + outpath )
212 Util.print_message( PRG_NAME, "next step in standard analysis pipeline: d2f.rb")
213 Util.print_message( PRG_NAME, 'OK' )
220 # raises ArgumentError, IOError
227 fs_e_value_threshold,
228 hmm_for_protein_output,
231 Util.check_file_for_readability( inpath )
232 Util.check_file_for_writability( outpath )
234 hmmscan_parser = HmmscanParser.new( inpath )
235 results = hmmscan_parser.parse
237 outfile = File.open( outpath, "a" )
246 hmmscan_results_per_protein = []
250 results.each do | r |
254 i_e_value = r.i_e_value
255 env_from = r.env_from
258 if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
259 ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
261 outfile.print( query +
263 if ( get_descriptions )
264 outfile.print( desc +
267 outfile.print( model +
274 outfile.print( Constants::LINE_DELIMITER )
277 if !hmm_for_protein_output.empty?
278 if !prev_query.empty? && prev_query != query
279 if !hmmscan_results_per_protein.empty?
280 process_hmmscan_results_per_protein( hmmscan_results_per_protein,
281 fs_e_value_threshold,
282 hmm_for_protein_output,
286 hmmscan_results_per_protein.clear
291 if !AVOID_HHMS.include? r.model
292 hmmscan_results_per_protein << r
295 hmmscan_results_per_protein << r
300 if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty?
301 process_hmmscan_results_per_protein( hmmscan_results_per_protein,
302 fs_e_value_threshold,
303 hmm_for_protein_output,
313 if id =~ /(sp|tr)\|\S+\|(\S+)/
319 def count_model( model )
320 if ( @domain_counts.has_key?( model ) )
321 count = @domain_counts[ model ].to_i
323 @domain_counts[ model ] = count
325 @domain_counts[ model ] = 1
329 def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
330 fs_e_value_threshold,
331 hmm_for_protein_output,
336 # filter according to i-Evalue threshold
337 # abort if fs Evalue too high
338 hmmscan_results_per_protein_filtered = []
340 hmmscan_results_per_protein.each do | r |
342 if r.model == hmm_for_protein_output
343 if fs_e_value_threshold > 0.0 && r.fs_e_value > fs_e_value_threshold
347 if i_e_value_threshold <= 0 || r.i_e_value <= i_e_value_threshold
348 hmmscan_results_per_protein_filtered << r
349 if r.model == hmm_for_protein_output
356 # passed on protein E-value, failed in per domain E-values
360 hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
363 hmmscan_results_per_protein_filtered.each do | r |
364 if r.model == hmm_for_protein_output
370 s << own.query + "\t"
372 s << own.fs_e_value.to_s + "\t"
373 s << own.qlen.to_s + "\t"
375 s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
376 hmmscan_results_per_protein_filtered.each do | r |
381 overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
385 s << calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) + "\t"
388 hmmscan_results_per_protein_filtered.each do | r |
390 s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
392 s << make_interdomain_sequence( r.env_from, false )
396 s << r.env_from.to_s << "-" << r.env_to.to_s
397 s << "|ie=" << r.i_e_value.to_s
398 s << "|ce=" << r.c_e_value.to_s
402 s << make_interdomain_sequence( own.qlen - prev_r.env_to, false )
406 def calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
409 hmmscan_results_per_protein_filtered.each do | r |
410 if r.model == hmm_for_protein_output
412 linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
420 def get_domain_counts()
421 return @domain_counts
424 def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
427 hmmscan_results_per_protein_filtered.each do | r |
428 if r.model == hmm_for_protein_output
430 overview << hmm_for_protein_output
432 if ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
433 overview << "~" << hmm_for_protein_output
435 overview << "----" << hmm_for_protein_output
444 def make_interdomain_sequence( d, mark_short = true )
462 puts( " " + PRG_NAME + ".rb [options] <hmmscan outputfile> [outputfile]" )
464 puts( " options: -" + DELIMITER_OPTION + "=<s> : column delimiter for outputfile, default is TAB" )
465 puts( " -" + I_E_VALUE_THRESHOLD_OPTION + "=<f>: i-E-value threshold, default is no threshold" )
466 puts( " -" + PARSE_OUT_DESCRIPITION_OPTION + " : parse query description (in addition to query name)" )
467 puts( " -" + IGNORE_DUF_OPTION + " : ignore DUFs" )
468 puts( " -" + HMM_FOR_PROTEIN_OUTPUT + "=<s> : HMM for protein architectures summary" )
469 puts( " -" + FS_E_VALUE_THRESHOLD_OPTION + "=<f>: E-value threshold for full protein sequences, only for protein architectures summary" )
470 puts( " -" + SPECIES_OPTION + "=<s> : species for protein architectures summary" )
472 puts( " [next step in standard analysis pipeline: d2f.rb]")
476 puts( " " + "hmmscan --max --domtblout P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 -E 10 Pfam-A.hmm P53_ni.fasta" )
478 puts( " " + PRG_NAME + ".rb P53_hmmscan_300_10" )