2 # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
4 # Copyright:: Copyright (C) 2012 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
9 # last modified: 121003
13 require 'lib/evo/util/constants'
14 require 'lib/evo/util/util'
15 require 'lib/evo/util/command_line_arguments'
16 require 'lib/evo/io/parser/hmmscan_parser'
17 require 'lib/evo/io/web/uniprotkb'
# Program metadata.
# NOTE(review): presumably passed to Util.print_program_information together
# with PRG_NAME; PRG_NAME itself is not defined in the lines visible here.
PRG_DESC = "hmmscan summary"
PRG_DATE = "2012.10.23"
COPYRIGHT = "2012 Christian M Zmasek"
CONTACT = "phylosoft@gmail.com"
WWW = "www.phylosoft.org"

# Command-line option names (used on the command line with a leading "-").
# NOTE(review): HELP_OPTION_2, UNIPROT and SPECIES_OPTION are referenced later
# in the file but are not defined in the lines visible here -- confirm they
# are defined alongside these.
DELIMITER_OPTION = "d"
I_E_VALUE_THRESHOLD_OPTION = "ie"
FS_E_VALUE_THRESHOLD_OPTION = "pe"
HMM_FOR_PROTEIN_OUTPUT = "m"
IGNORE_DUF_OPTION = "i"
# The spelling "DESCRIPITION" is kept because other code refers to this name.
PARSE_OUT_DESCRIPITION_OPTION = "a"
HELP_OPTION_1 = "help"

# Model names checked with include? before a hit is added to the per-protein
# result list. The spelling "HHMS" is kept because other code refers to this
# name. Frozen because the list is only ever read.
AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ].freeze

# Largest gap (env_from of a hit minus env_to of the previous hit, minus 1)
# for which make_overview joins two domains with "~" instead of "----".
LIMIT_FOR_CLOSE_DOMAINS = 20
# Per-model hit tally: keys are model names, values are counts. It is updated
# by count_model and handed out by get_domain_counts.
# NOTE(review): the enclosing definition is not visible in this excerpt --
# presumably the constructor; confirm.
47 @domain_counts = Hash.new
# Main command-line flow (its `def` line and many intermediate lines are not
# visible in this excerpt): prints the program banner, parses and validates
# the command line, echoes the effective settings, runs the parse/summary
# step and finally prints per-domain counts with a histogram.
54 Util.print_program_information( PRG_NAME,
# Parse ARGV; an ArgumentError from the parser is reported via Util.fatal_error.
64 cla = CommandLineArguments.new( ARGV )
65 rescue ArgumentError => e
66 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
# Help requested via either help option (the branch body is not visible here).
69 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
70 cla.is_option_set?( HELP_OPTION_2 ) )
# Exactly two file arguments are expected: input path and output path.
75 if ( cla.get_number_of_files != 2 )
# Whitelist of recognised options; anything outside it is reported below as
# "unknown option(s)".
80 allowed_opts = Array.new
81 allowed_opts.push( DELIMITER_OPTION )
82 allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
83 allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
84 allowed_opts.push( IGNORE_DUF_OPTION )
85 allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
86 allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
87 allowed_opts.push( UNIPROT )
88 allowed_opts.push( SPECIES_OPTION )
90 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
91 if ( disallowed.length > 0 )
92 Util.fatal_error( PRG_NAME,
93 "unknown option(s): " + disallowed,
# First file argument is the input, second the output.
97 inpath = cla.get_file_name( 0 )
98 outpath = cla.get_file_name( 1 )
# Output column delimiter; TAB unless overridden on the command line.
100 column_delimiter = "\t"
101 if ( cla.is_option_set?( DELIMITER_OPTION ) )
103 column_delimiter = cla.get_option_value( DELIMITER_OPTION )
104 rescue ArgumentError => e
105 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
# -1.0 is the "no i-E-value threshold" sentinel (see the >= 0.0 test where the
# settings are echoed); a negative value supplied by the user is rejected.
109 i_e_value_threshold = -1.0
110 if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
112 i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
113 rescue ArgumentError => e
114 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
116 if ( i_e_value_threshold < 0.0 )
117 Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
# Same sentinel convention for the full-sequence E-value threshold.
123 fs_e_value_threshold = -1.0
124 if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
126 fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
127 rescue ArgumentError => e
128 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
130 if ( fs_e_value_threshold < 0.0 )
131 Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
# Empty string means no HMM was selected for the per-protein summary.
135 hmm_for_protein_output = ""
136 if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
138 hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
139 rescue ArgumentError => e
140 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
# Optional uniprot and species values (their defaults are not visible here).
145 if ( cla.is_option_set?( UNIPROT ) )
147 uniprot = cla.get_option_value( UNIPROT )
148 rescue ArgumentError => e
149 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
154 if ( cla.is_option_set?( SPECIES_OPTION ) )
156 species = cla.get_option_value( SPECIES_OPTION )
157 rescue ArgumentError => e
158 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
# Flag options (the body of the ignore-DUF branch is not visible here).
163 if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
167 parse_descriptions = false
168 if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
169 parse_descriptions = true
# Echo the effective settings.
# NOTE(review): the first label says "hmmpfam" although the usage text calls
# the input an hmmscan output file -- confirm whether the label is stale.
173 puts( "hmmpfam outputfile : " + inpath )
174 puts( "outputfile : " + outpath )
175 puts( "species : " + species )
176 if ( i_e_value_threshold >= 0.0 )
177 puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
179 puts( "i-E-value threshold : no threshold" )
181 if ( parse_descriptions )
182 puts( "parse descriptions : true" )
184 puts( "parse descriptions : false" )
187 puts( "ignore DUFs : true" )
189 puts( "ignore DUFs : false" )
191 if ( column_delimiter == "\t" )
192 puts( "column delimiter : TAB" )
194 puts( "column delimiter : " + column_delimiter )
196 if fs_e_value_threshold >= 0.0
197 puts( "E-value threshold : " + fs_e_value_threshold.to_s )
199 puts( "E-value threshold : no threshold" )
201 if !hmm_for_protein_output.empty?
202 puts( "HMM for proteins : " + hmm_for_protein_output )
205 puts( "Uniprot : " + uniprot )
# Arguments of the parse/summary call (its opening line is not visible here);
# a failure is reported via Util.fatal_error.
216 fs_e_value_threshold,
217 hmm_for_protein_output,
221 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
# Report how many distinct domains were counted and draw a histogram of them.
223 domain_counts = get_domain_counts()
227 puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
228 puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
230 puts( Util.draw_histogram( domain_counts, "#" ) )
232 Util.print_message( PRG_NAME, 'OK' )
# Parse/summary routine (its `def` line and several intermediate lines are not
# visible in this excerpt). It reads the hmmscan output at inpath through
# HmmscanParser, prints delimited fields for every hit that passes the filters
# to outpath and, when hmm_for_protein_output is non-empty, collects hits per
# query and hands each group to process_hmmscan_results_per_protein.
239 # raises ArgumentError, IOError
246 fs_e_value_threshold,
247 hmm_for_protein_output,
# Check both paths up front.
253 Util.check_file_for_readability( inpath )
254 Util.check_file_for_writability( outpath )
256 hmmscan_parser = HmmscanParser.new( inpath )
257 results = hmmscan_parser.parse
# Opened in append mode ("a").
# NOTE(review): no close/ensure for this handle is visible in this excerpt --
# confirm it is closed on every path.
259 outfile = File.open( outpath, "a" )
# Hits collected for the query currently being processed.
268 hmmscan_results_per_protein = []
274 results.each do | r |
277 i_e_value = r.i_e_value
278 env_from = r.env_from
# A hit is written when there is no i-E-value threshold (negative sentinel) or
# its i-E-value is within the threshold, and -- if DUFs are ignored -- its
# model name does not start with "DUF" followed by a digit.
281 if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
282 ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
# The query description is printed only when get_descriptions is set.
284 outfile.print( query +
286 if ( get_descriptions )
287 outfile.print( desc +
290 outfile.print( model +
297 outfile.print( Constants::LINE_DELIMITER )
# Per-protein summary: when the query name changes, the hits collected for the
# previous query are processed and the buffer is cleared.
300 if !hmm_for_protein_output.empty?
301 if !prev_query.empty? && prev_query != query
302 if !hmmscan_results_per_protein.empty?
303 process_hmmscan_results_per_protein( hmmscan_results_per_protein,
304 fs_e_value_threshold,
305 hmm_for_protein_output,
310 hmmscan_results_per_protein.clear
# NOTE(review): the conditions selecting between the two appends below are not
# fully visible; the first one skips models listed in AVOID_HHMS.
315 if !AVOID_HHMS.include? r.model
316 hmmscan_results_per_protein << r
319 hmmscan_results_per_protein << r
# Process whatever is still buffered once the loop is over (the last query has
# no following query to trigger the processing above).
323 if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty?
324 process_hmmscan_results_per_protein( hmmscan_results_per_protein,
325 fs_e_value_threshold,
326 hmm_for_protein_output,
# Matches ids of the form "sp|<field>|<field>" or "tr|<field>|<field>";
# capture group 2 is the last field.
# NOTE(review): the enclosing `def` and what is returned are not visible in
# this excerpt -- presumably process_id, whose result is passed to
# UniprotKB::get_entry_by_id; confirm.
337 if id =~ /(sp|tr)\|\S+\|(\S+)/
# Records one more occurrence of +model+ in the @domain_counts tally.
#
# model:: key under which the occurrence is counted (a model/domain name).
#
# A model seen for the first time starts at 1; every later call adds one.
def count_model( model )
  # fetch with a default of 0 covers the "first occurrence" case; to_i is kept
  # so that a previously stored value is treated as an integer.
  @domain_counts[ model ] = @domain_counts.fetch( model, 0 ).to_i + 1
end
# Builds a tab-separated summary (string +s+) for one protein from its hmmscan
# hits, centred on the model named by hmm_for_protein_output.
# NOTE(review): the rest of the parameter list, the initialisation of +s+,
# +own+ and +prev_r+, and what finally happens to +s+ are not visible in this
# excerpt.
355 def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
356 fs_e_value_threshold,
357 hmm_for_protein_output,
363 # filter according to i-Evalue threshold
364 # abort if fs Evalue too high
365 hmmscan_results_per_protein_filtered = []
# First pass: for the target model the full-sequence E-value is compared with
# its threshold (branch body not visible); hits whose i-E-value is within
# i_e_value_threshold are kept.
# NOTE(review): unlike the filter in the parse routine, no "negative means no
# threshold" test is visible here -- confirm how a -1.0 sentinel is handled.
367 hmmscan_results_per_protein.each do | r |
368 if r.model == hmm_for_protein_output
369 if r.fs_e_value > fs_e_value_threshold
373 if r.i_e_value <= i_e_value_threshold
374 hmmscan_results_per_protein_filtered << r
375 if r.model == hmm_for_protein_output
382 # passed on protein E-value, failed in per domain E-values
# Order the surviving hits by their start position (env_from).
386 hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
# NOTE(review): presumably this loop selects +own+, a hit of the target model
# whose query, fs_e_value and qlen are used below -- the body is not visible.
389 hmmscan_results_per_protein_filtered.each do | r |
390 if r.model == hmm_for_protein_output
# Leading columns: query name, full-sequence E-value, query length, number of
# filtered hits.
396 s << own.query + "\t"
398 s << own.fs_e_value.to_s + "\t"
399 s << own.qlen.to_s + "\t"
401 s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
402 hmmscan_results_per_protein_filtered.each do | r |
# Look up the entry for the processed query id and append its annotation.
# NOTE(review): UniprotKB is required from lib/evo/io/web -- presumably a
# network lookup; confirm.
406 e = UniprotKB::get_entry_by_id( process_id( own.query ) )
409 s << uniprot_annotation( e )
410 # s << "\uniprot_annotationt"
413 overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
417 s << calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) + "\t"
# Per-hit detail: a spacer for the gap before each hit, then its env range and
# its i-/c-E-values.
420 hmmscan_results_per_protein_filtered.each do | r |
423 s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
425 s << make_interdomain_sequence( r.env_from, false )
429 s << r.env_from.to_s << "-" << r.env_to.to_s
430 s << "|ie=" << r.i_e_value.to_s
431 s << "|ce=" << r.c_e_value.to_s
# Spacer after the last hit.
# NOTE(review): this subtracts prev_r.env_from, whereas the gap computations
# above use env_to -- confirm whether env_to was intended here.
435 s << make_interdomain_sequence( own.qlen - prev_r.env_from, false )
# Produces the annotation text appended to the per-protein summary for the
# entry +e+, based on the PDB ids reported by e.get_pdb_ids.
# NOTE(review): how the ids are joined and what is emitted when there are none
# is not visible in this excerpt.
439 def uniprot_annotation( e )
441 pdb_ids = e.get_pdb_ids
443 pdb_ids.each do | pdb |
# Lists the gap lengths between consecutive hits of one model.
#
# hmmscan_results_per_protein_filtered:: hits responding to model, env_from
#                                        and env_to, in the order in which
#                                        gaps are to be measured.
# hmm_for_protein_output::               model name; hits of other models are
#                                        ignored.
#
# Returns a String holding, for every pair of consecutive matching hits,
# "env_from of the later hit - env_to of the earlier hit - 1" followed by a
# single space. The result is empty when fewer than two hits match.
def calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
  target_hits = hmmscan_results_per_protein_filtered.select do | r |
    r.model == hmm_for_protein_output
  end
  linkers = ""
  # each_cons( 2 ) yields every (previous, current) pair, so the first
  # matching hit never needs a nil check.
  target_hits.each_cons( 2 ) do | prev_r, r |
    linkers << ( r.env_from - prev_r.env_to - 1 ).to_s << " "
  end
  linkers
end
# Returns the tally kept in @domain_counts (the same object that count_model
# updates, not a copy).
def get_domain_counts
  @domain_counts
end
# Draws a one-line sketch of how the hits of one model are spaced.
#
# hmmscan_results_per_protein_filtered:: hits responding to model, env_from
#                                        and env_to, in drawing order.
# hmm_for_protein_output::               model name; hits of other models are
#                                        ignored.
#
# Returns a String in which the model name appears once per matching hit.
# Consecutive hits are joined by "~" when the gap between them (env_from of
# the later hit - env_to of the earlier hit - 1) is at most
# LIMIT_FOR_CLOSE_DOMAINS, and by "----" otherwise. The result is empty when
# no hit matches.
def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
  overview = ""
  prev_r = nil
  hmmscan_results_per_protein_filtered.each do | r |
    next unless r.model == hmm_for_protein_output
    # The first matching hit has no predecessor and therefore no connector.
    unless prev_r.nil?
      gap = r.env_from - prev_r.env_to - 1
      overview << ( gap <= LIMIT_FOR_CLOSE_DOMAINS ? "~" : "----" )
    end
    overview << hmm_for_protein_output
    prev_r = r
  end
  overview
end
# Renders the spacer placed between (or around) domains in the per-protein
# summary. Callers pass a length +d+ (a gap between two hits, the start of the
# first hit, or the remainder after the last hit) and pass mark_short = false
# for the leading and trailing spacers.
# NOTE(review): the body is not visible in this excerpt, so the exact spacer
# text and the effect of mark_short cannot be stated here.
490 def make_interdomain_sequence( d, mark_short = true )
# Usage text (the enclosing `def` line is not visible in this excerpt): the
# command-line synopsis followed by one line per option.
# NOTE(review): UNIPROT and SPECIES_OPTION are accepted by the option
# whitelist but are not described in the lines visible here -- confirm they
# are documented.
510 puts( " " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
512 puts( " options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
513 puts( " -" + I_E_VALUE_THRESHOLD_OPTION + ": i-E-value threshold, default is no threshold" )
514 puts( " -" + PARSE_OUT_DESCRIPITION_OPTION + ": parse query description (in addition to query name)" )
515 puts( " -" + IGNORE_DUF_OPTION + ": ignore DUFs" )
516 puts( " -" + FS_E_VALUE_THRESHOLD_OPTION + ": E-value threshold for full protein sequences, only for protein summary" )
517 puts( " -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )