2 # = lib/evo/apps/hmmscan_parser.rb - HmmscanParser class
4 # Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
9 # last modified: 11/24/2009
11 require 'lib/evo/util/constants'
12 require 'lib/evo/util/util'
13 require 'lib/evo/util/command_line_arguments'
# Program metadata strings and command-line option names.
# NOTE(review): PRG_NAME and HELP_OPTION_2 are referenced later in this file
# but no definition is visible in this run of constants; presumably they are
# defined on lines not shown here -- confirm.
21 PRG_DESC = "hmmscan parser"
22 PRG_DATE = "2009.11.24"
23 COPYRIGHT = "2009 Christian M Zmasek"
24 CONTACT = "phylosoft@gmail.com"
25 WWW = "www.phylosoft.org"
# Option names as tested with cla.is_option_set?. The usage text describes them
# as: -d column delimiter, -e E-value threshold, -i ignore DUFs,
# -a parse query description.
27 DELIMITER_OPTION = "d"
28 E_VALUE_THRESHOLD_OPTION = "e"
29 IGNORE_DUF_OPTION = "i"
# NOTE(review): "DESCRIPITION" is a misspelling of "DESCRIPTION". The same
# spelling is used at every reference in this file, so a rename would have to
# change all of them together.
30 PARSE_OUT_DESCRIPITION_OPTION = "a"
31 HELP_OPTION_1 = "help"
# Tally keyed by model name: count_model writes the counts into this hash and
# get_domain_counts returns it.
35 @domain_counts = Hash.new
# Body of the hmmscan table parser (called further down as `parse( inpath, ...`).
# NOTE(review): the `def` line and a number of interior lines (assignments from
# the regexp captures, the remainder of the print calls, closing `end`s, the
# return value) are not visible here; the comments below describe only what the
# visible lines establish.
38 # raises ArgumentError, IOError
45 Util.check_file_for_readability( inpath )
46 Util.check_file_for_writability( outpath )
# Mode "a" appends: anything already in outpath is kept and new rows are
# written after it.
# NOTE(review): no matching close of outfile is visible in the lines shown --
# confirm the handle is closed.
48 outfile = File.open( outpath, "a" )
# NOTE(review): i_e_value starts out as an empty String but is compared with the
# numeric e_value_threshold below; presumably it is reassigned from a capture
# (and converted to a number) on a line not shown -- confirm.
55 i_e_value = String.new
59 nl = Constants::LINE_DELIMITER
# Block form of File.open; the input is read one line at a time with gets.
61 File.open( inpath ) do | file |
62 while line = file.gets
# Skip lines rejected by is_ignorable? and require at least two
# whitespace-separated tokens before attempting the full match.
63 if !HmmscanParser.is_ignorable?( line ) && line =~ /^\S+\s+\S/
65 # tn acc tlen query acc qlen Evalue score bias # of c-E i-E score bias hf ht af at ef et acc desc
66 # 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
# 22 whitespace-separated fields are captured as groups 1-22 (integer fields
# use \d+), and the rest of the line as group 23.
67 line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/
# A row passes when the threshold is negative (the caller's default of -1.0, i.e.
# no threshold) or i_e_value is at most the threshold, AND it is not the case
# that ignore_dufs is set while the model name starts with "DUF" plus a digit.
75 if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
76 ( !ignore_dufs || ( model !~ /^DUF\d+/ ) ) )
78 outfile.print( query +
80 if ( get_descriptions )
84 outfile.print( model +
94 end # while line = file.gets
# Updates the tally in @domain_counts for +model+.
# When the model is already a key, its stored value is read (as an Integer via
# to_i) and written back; the other visible assignment stores 1.
# NOTE(review): the increment of `count`, the `else` separating the two
# assignments and the closing `end`s are not visible in the lines shown;
# presumably they sit on the lines missing between these -- confirm against the
# complete file.
103 def count_model( model )
104 if ( @domain_counts.has_key?( model ) )
105 count = @domain_counts[ model ].to_i
107 @domain_counts[ model ] = count
109 @domain_counts[ model ] = 1
# Returns the @domain_counts hash itself (the same object, not a copy), i.e. the
# model-name => count tally maintained by count_model.
114 def get_domain_counts()
115 return @domain_counts
# Start-up sequence: print the program banner, build the command-line wrapper
# from ARGV, then check for help, file count and unknown options.
# NOTE(review): the enclosing `def`, the `begin`/`end` keywords and the bodies of
# several `if`s are not visible in the lines shown.
120 Util.print_program_information( PRG_NAME,
# An ArgumentError raised while building the wrapper is handed to
# Util.fatal_error together with STDOUT.
130 cla = CommandLineArguments.new( ARGV )
131 rescue ArgumentError => e
132 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
# Either help option triggers this branch (its body is not visible here).
135 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
136 cla.is_option_set?( HELP_OPTION_2 ) )
# The usage text names exactly two files: <hmmscan outputfile> <outputfile>.
# NOTE(review): what happens on a mismatch is not visible here -- presumably a
# usage message and exit.
141 if ( cla.get_number_of_files != 2 )
# Only these four options are accepted; anything else is reported through
# Util.fatal_error as "unknown option(s): ...".
146 allowed_opts = Array.new
147 allowed_opts.push( DELIMITER_OPTION )
148 allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
149 allowed_opts.push( IGNORE_DUF_OPTION )
150 allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
152 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
153 if ( disallowed.length > 0 )
154 Util.fatal_error( PRG_NAME,
155 "unknown option(s): " + disallowed,
# Reads the two file names and the option values from the command line.
# NOTE(review): `begin`/`end` keywords and the body of the ignore-DUFs `if` are
# not visible in the lines shown.
159 inpath = cla.get_file_name( 0 )
160 outpath = cla.get_file_name( 1 )
# The column delimiter defaults to a TAB and is replaced when the delimiter
# option is set.
162 column_delimiter = "\t"
163 if ( cla.is_option_set?( DELIMITER_OPTION ) )
165 column_delimiter = cla.get_option_value( DELIMITER_OPTION )
166 rescue ArgumentError => e
167 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
# -1.0 is the "no threshold" default: the settings echo prints "no threshold"
# for any value that is not >= 0.0, and a negative value supplied by the user
# is rejected below.
171 e_value_threshold = -1.0
172 if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
174 e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
175 rescue ArgumentError => e
176 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
178 if ( e_value_threshold < 0.0 )
179 Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
# NOTE(review): the assignment made when the ignore-DUFs option is set is not
# visible here -- presumably it sets an ignore flag to true; confirm.
184 if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
# Description parsing is off unless the corresponding option is set.
188 parse_descriptions = false
189 if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
190 parse_descriptions = true
194 puts( "hmmpfam outputfile: " + inpath )
195 puts( "outputfile : " + outpath )
196 if ( e_value_threshold >= 0.0 )
197 puts( "E-value threshold : " + e_value_threshold.to_s )
199 puts( "E-value threshold : no threshold" )
201 if ( parse_descriptions )
202 puts( "parse descriptions: true" )
204 puts( "parse descriptions: false" )
207 puts( "ignore DUFs : true" )
209 puts( "ignore DUFs : false" )
211 if ( column_delimiter == "\t" )
212 puts( "column delimiter : TAB" )
214 puts( "column delimiter : " + column_delimiter )
# Runs the parser and prints a summary.
# NOTE(review): the remaining arguments of the parse call and the `begin`/`end`
# keywords are not visible in the lines shown.
# The value returned by parse is reported below as the number of query
# sequences read.
219 queries_count = parse( inpath,
# ArgumentError and IOError from parse are handed to Util.fatal_error together
# with STDOUT.
225 rescue ArgumentError, IOError => e
226 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
# The tally built up in @domain_counts, keyed by model name.
228 domain_counts = get_domain_counts()
231 puts( "read output for a total of " + queries_count.to_s + " query sequences" )
233 puts( "domain counts (considering potential E-value threshold and ignoring of DUFs):" )
# The hash length is the number of distinct model names recorded.
234 puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
236 puts( Util.draw_histogram( domain_counts, "#" ) )
238 Util.print_message( PRG_NAME, 'OK' )
# Usage text: the invocation pattern followed by one line per option.
# NOTE(review): the enclosing `def` is not visible in the lines shown. The help
# options themselves (HELP_OPTION_1 / HELP_OPTION_2) are not listed in the
# visible lines.
246 puts( " " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
248 puts( " options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
249 puts( " -" + E_VALUE_THRESHOLD_OPTION + ": E-value threshold, default is no threshold" )
250 puts( " -" + PARSE_OUT_DESCRIPITION_OPTION + ": parse query description (in addition to query name)" )
251 puts( " -" + IGNORE_DUF_OPTION + ": ignore DUFs" )
# Class-level predicate used by the parser to skip lines.
# Truthy when +line+ contains no ASCII letter, digit or hyphen, or when a '#'
# is found at the start of a line. In the second case the result is the
# Integer match index from =~ (truthy in Ruby) rather than true; otherwise the
# result is true, or nil when neither condition holds.
# NOTE(review): the closing `end` of this method is not visible in the lines
# shown.
259 def HmmscanParser.is_ignorable?( line )
260 return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
263 end # class HmmscanParser