2 # = lib/evo/apps/hmmscan_parser.rb - HmmscanParser class
4 # Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
9 # last modified: 11/24/2009
11 require 'lib/evo/util/constants'
12 require 'lib/evo/util/util'
13 require 'lib/evo/util/command_line_arguments'
14 require 'lib/evo/io/parser/hmmscan_parser'
22 PRG_DESC = "hmmscan parser"
23 PRG_DATE = "2012.10.19"
24 COPYRIGHT = "2012 Christian M Zmasek"
25 CONTACT = "phylosoft@gmail.com"
26 WWW = "www.phylosoft.org"
28 DELIMITER_OPTION = "d"
29 I_E_VALUE_THRESHOLD_OPTION = "ie"
30 FS_E_VALUE_THRESHOLD_OPTION = "pe"
31 HMM_FOR_PROTEIN_OUTPUT = "m"
32 IGNORE_DUF_OPTION = "i"
33 PARSE_OUT_DESCRIPITION_OPTION = "a"
34 HELP_OPTION_1 = "help"
38 AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
39 LIMIT_FOR_CLOSE_DOMAINS = 20
42 @domain_counts = Hash.new
47 Util.print_program_information( PRG_NAME,
57 cla = CommandLineArguments.new( ARGV )
58 rescue ArgumentError => e
59 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
62 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
63 cla.is_option_set?( HELP_OPTION_2 ) )
68 if ( cla.get_number_of_files != 2 )
73 allowed_opts = Array.new
74 allowed_opts.push( DELIMITER_OPTION )
75 allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
76 allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
77 allowed_opts.push( IGNORE_DUF_OPTION )
78 allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
79 allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
81 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
82 if ( disallowed.length > 0 )
83 Util.fatal_error( PRG_NAME,
84 "unknown option(s): " + disallowed,
88 inpath = cla.get_file_name( 0 )
89 outpath = cla.get_file_name( 1 )
91 column_delimiter = "\t"
92 if ( cla.is_option_set?( DELIMITER_OPTION ) )
94 column_delimiter = cla.get_option_value( DELIMITER_OPTION )
95 rescue ArgumentError => e
96 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
100 i_e_value_threshold = -1.0
101 if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
103 i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
104 rescue ArgumentError => e
105 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
107 if ( i_e_value_threshold < 0.0 )
108 Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
112 fs_e_value_threshold = -1.0
113 if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
115 fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
116 rescue ArgumentError => e
117 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
119 if ( fs_e_value_threshold < 0.0 )
120 Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
124 hmm_for_protein_output = ""
125 if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
127 hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
128 rescue ArgumentError => e
129 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
134 if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
138 parse_descriptions = false
139 if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
140 parse_descriptions = true
144 puts( "hmmpfam outputfile : " + inpath )
145 puts( "outputfile : " + outpath )
146 if ( i_e_value_threshold >= 0.0 )
147 puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
149 puts( "i-E-value threshold : no threshold" )
151 if ( parse_descriptions )
152 puts( "parse descriptions : true" )
154 puts( "parse descriptions : false" )
157 puts( "ignore DUFs : true" )
159 puts( "ignore DUFs : false" )
161 if ( column_delimiter == "\t" )
162 puts( "column delimiter : TAB" )
164 puts( "column delimiter : " + column_delimiter )
166 if ( fs_e_value_threshold >= 0.0 )
167 puts( "E-value threshold : " + fs_e_value_threshold.to_s )
169 puts( "E-value threshold : no threshold" )
171 if ( !hmm_for_protein_output.empty? )
172 puts( "HMM for proteins : " + hmm_for_protein_output )
183 fs_e_value_threshold,
184 hmm_for_protein_output )
185 rescue ArgumentError, IOError => e
186 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
188 domain_counts = get_domain_counts()
192 puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
193 puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
195 puts( Util.draw_histogram( domain_counts, "#" ) )
197 Util.print_message( PRG_NAME, 'OK' )
204 # raises ArgumentError, IOError
211 fs_e_value_threshold,
212 hmm_for_protein_output )
213 Util.check_file_for_readability( inpath )
214 Util.check_file_for_writability( outpath )
216 outfile = File.open( outpath, "a" )
225 hmmscan_results_per_protein = []
227 hmmscan_parser = HmmscanParser.new( inpath )
231 hmmscan_parser.parse.each do | r |
234 i_e_value = r.i_e_value
235 env_from = r.env_from
238 if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
239 ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
241 outfile.print( query +
243 if ( get_descriptions )
244 outfile.print( desc +
247 outfile.print( model +
254 outfile.print( Constants::LINE_DELIMITER )
257 if !hmm_for_protein_output.empty?
258 if !prev_query.empty? && prev_query != query
259 if !hmmscan_results_per_protein.empty?
260 process_hmmscan_results_per_protein( hmmscan_results_per_protein,
261 fs_e_value_threshold,
262 hmm_for_protein_output,
263 i_e_value_threshold )
265 hmmscan_results_per_protein.clear
270 if !AVOID_HHMS.include? r.model
271 hmmscan_results_per_protein << r
274 hmmscan_results_per_protein << r
278 if !hmm_for_protein_output.empty?
279 if !hmmscan_results_per_protein.empty?
280 process_hmmscan_results_per_protein( hmmscan_results_per_protein,
281 fs_e_value_threshold,
282 hmm_for_protein_output,
283 i_e_value_threshold )
291 def count_model( model )
292 if ( @domain_counts.has_key?( model ) )
293 count = @domain_counts[ model ].to_i
295 @domain_counts[ model ] = count
297 @domain_counts[ model ] = 1
301 def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
302 fs_e_value_threshold,
303 hmm_for_protein_output,
304 i_e_value_threshold )
307 # filter according to the i-E-value threshold
308 # abort if the full-sequence (fs) E-value is too high
309 hmmscan_results_per_protein_filtered = []
311 hmmscan_results_per_protein.each do | r |
312 if r.model == hmm_for_protein_output
313 if r.fs_e_value > fs_e_value_threshold
317 if r.i_e_value <= i_e_value_threshold
318 hmmscan_results_per_protein_filtered << r
319 if r.model == hmm_for_protein_output
326 # passed on protein E-value, failed in per domain E-values
330 hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
333 hmmscan_results_per_protein_filtered.each do | r |
334 if r.model == hmm_for_protein_output
340 s << own.query + "\t"
342 s << own.fs_e_value.to_s + "\t"
343 s << own.qlen.to_s + "\t"
345 s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
346 hmmscan_results_per_protein_filtered.each do | r |
351 overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
355 s << calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) + "\t"
358 hmmscan_results_per_protein_filtered.each do | r |
361 s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
363 s << make_interdomain_sequence( r.env_from, false )
367 s << r.env_from.to_s << "-" << r.env_to.to_s
368 s << "|ie=" << r.i_e_value.to_s
369 s << "|ce=" << r.c_e_value.to_s
373 s << make_interdomain_sequence( own.qlen - prev_r.env_from, false )
378 def calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
381 hmmscan_results_per_protein_filtered.each do | r |
382 if r.model == hmm_for_protein_output
384 linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
392 def get_domain_counts()
393 return @domain_counts
396 def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
399 hmmscan_results_per_protein_filtered.each do | r |
400 if r.model == hmm_for_protein_output
402 overview << hmm_for_protein_output
404 if ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
405 overview << "~" << hmm_for_protein_output
407 overview << "----" << hmm_for_protein_output
416 def make_interdomain_sequence( d, mark_short = true )
436 puts( " " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
438 puts( " options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
439 puts( " -" + I_E_VALUE_THRESHOLD_OPTION + ": i-E-value threshold, default is no threshold" )
440 puts( " -" + PARSE_OUT_DESCRIPITION_OPTION + ": parse query description (in addition to query name)" )
441 puts( " -" + IGNORE_DUF_OPTION + ": ignore DUFs" )
442 puts( " -" + FS_E_VALUE_THRESHOLD_OPTION + ": E-value threshold for full protein sequences, only for protein summary" )
443 puts( " -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )