2 # = lib/evo/apps/hmmscan_parser.rb - HmmscanParser class
4 # Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
9 # last modified: 11/24/2009
11 require 'lib/evo/util/constants'
12 require 'lib/evo/util/util'
13 require 'lib/evo/util/command_line_arguments'
14 require 'lib/evo/io/parser/hmmscan_parser'
22 PRG_DESC = "hmmscan parser"
23 PRG_DATE = "2012.10.19"
24 COPYRIGHT = "2012 Christian M Zmasek"
25 CONTACT = "phylosoft@gmail.com"
26 WWW = "www.phylosoft.org"
28 DELIMITER_OPTION = "d"
29 I_E_VALUE_THRESHOLD_OPTION = "e"
30 FS_E_VALUE_THRESHOLD_OPTION = "pe"
31 HMM_FOR_PROTEIN_OUTPUT = "m"
32 IGNORE_DUF_OPTION = "i"
33 PARSE_OUT_DESCRIPITION_OPTION = "a"
34 HELP_OPTION_1 = "help"
38 @domain_counts = Hash.new
41 # raises ArgumentError, IOError
49 hmm_for_protein_output )
50 Util.check_file_for_readability( inpath )
51 Util.check_file_for_writability( outpath )
53 outfile = File.open( outpath, "a" )
62 hmmscan_results_per_protein = []
64 hmmscan_parser = HmmscanParser.new( inpath )
68 hmmscan_parser.parse.each do | r |
71 i_e_value = r.i_e_value
75 if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
76 ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
78 outfile.print( query +
80 if ( get_descriptions )
84 outfile.print( model +
91 outfile.print( Constants::LINE_DELIMITER )
94 if !prev_query.empty? && prev_query != query
95 if !hmmscan_results_per_protein.empty?
96 process_hmmscan_results_per_protein( hmmscan_results_per_protein,
98 hmm_for_protein_output )
100 hmmscan_results_per_protein.clear
103 hmmscan_results_per_protein << r
105 if !hmmscan_results_per_protein.empty?
106 process_hmmscan_results_per_protein( hmmscan_results_per_protein,
107 fs_e_value_threshold,
108 hmm_for_protein_output )
115 def count_model( model )
116 if ( @domain_counts.has_key?( model ) )
117 count = @domain_counts[ model ].to_i
119 @domain_counts[ model ] = count
121 @domain_counts[ model ] = 1
125 def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
126 fs_e_value_threshold,
127 hmm_for_protein_output )
130 hmmscan_results_per_protein.each do | r |
131 if r.model == hmm_for_protein_output
132 fs_e_value = r.fs_e_value
133 if fs_e_value > fs_e_value_threshold
140 first = hmmscan_results_per_protein[ 0 ]
142 s << first.query + "\t"
143 s << fs_e_value.to_s + "\t"
144 s << first.qlen.to_s + "\t"
145 # s << first.fs_e_value.to_s + "\t"
146 # s << first.out_of.to_s + "\t"
147 hmmscan_results_per_protein.each do | r |
154 def get_domain_counts()
155 return @domain_counts
160 Util.print_program_information( PRG_NAME,
170 cla = CommandLineArguments.new( ARGV )
171 rescue ArgumentError => e
172 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
175 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
176 cla.is_option_set?( HELP_OPTION_2 ) )
181 if ( cla.get_number_of_files != 2 )
186 allowed_opts = Array.new
187 allowed_opts.push( DELIMITER_OPTION )
188 allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
189 allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
190 allowed_opts.push( IGNORE_DUF_OPTION )
191 allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
192 allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
194 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
195 if ( disallowed.length > 0 )
196 Util.fatal_error( PRG_NAME,
197 "unknown option(s): " + disallowed,
201 inpath = cla.get_file_name( 0 )
202 outpath = cla.get_file_name( 1 )
204 column_delimiter = "\t"
205 if ( cla.is_option_set?( DELIMITER_OPTION ) )
207 column_delimiter = cla.get_option_value( DELIMITER_OPTION )
208 rescue ArgumentError => e
209 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
213 i_e_value_threshold = -1.0
214 if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
216 i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
217 rescue ArgumentError => e
218 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
220 if ( i_e_value_threshold < 0.0 )
221 Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
225 fs_e_value_threshold = -1.0
226 if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
228 fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
229 rescue ArgumentError => e
230 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
232 if ( fs_e_value_threshold < 0.0 )
233 Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
238 hmm_for_protein_output = ""
239 if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
241 hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
242 rescue ArgumentError => e
243 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
249 if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
253 parse_descriptions = false
254 if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
255 parse_descriptions = true
259 puts( "hmmpfam outputfile : " + inpath )
260 puts( "outputfile : " + outpath )
261 if ( i_e_value_threshold >= 0.0 )
262 puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
264 puts( "i-E-value threshold : no threshold" )
266 if ( parse_descriptions )
267 puts( "parse descriptions : true" )
269 puts( "parse descriptions : false" )
272 puts( "ignore DUFs : true" )
274 puts( "ignore DUFs : false" )
276 if ( column_delimiter == "\t" )
277 puts( "column delimiter : TAB" )
279 puts( "column delimiter : " + column_delimiter )
281 if ( fs_e_value_threshold >= 0.0 )
282 puts( "E-value threshold : " + fs_e_value_threshold.to_s )
284 puts( "E-value threshold : no threshold" )
286 if ( !hmm_for_protein_output.empty? )
287 puts( "HMM for proteins : " + hmm_for_protein_output )
298 fs_e_value_threshold,
299 hmm_for_protein_output )
300 rescue ArgumentError, IOError => e
301 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
303 domain_counts = get_domain_counts()
307 puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
308 puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
310 puts( Util.draw_histogram( domain_counts, "#" ) )
312 Util.print_message( PRG_NAME, 'OK' )
320 puts( " " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
322 puts( " options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
323 puts( " -" + I_E_VALUE_THRESHOLD_OPTION + ": i-E-value threshold, default is no threshold" )
324 puts( " -" + PARSE_OUT_DESCRIPITION_OPTION + ": parse query description (in addition to query name)" )
325 puts( " -" + IGNORE_DUF_OPTION + ": ignore DUFs" )
326 puts( " -" + FS_E_VALUE_THRESHOLD_OPTION + ": E-value threshold for full protein sequences, only for protein summary" )
327 puts( " -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
331 end # class HmmscanParser