in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / apps / hmmscan_parser.rb
1 #
2 # = lib/evo/apps/hmmscan_parser.rb - HmmscanParser class
3 #
4 # Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
8 #
9 # last modified: 11/24/2009
10
11 require 'lib/evo/util/constants'
12 require 'lib/evo/util/util'
13 require 'lib/evo/util/command_line_arguments'
14
15 module Evoruby
16
17     class HmmscanParser
18
19         PRG_NAME       = "hsp"
20         PRG_VERSION    = "1.0.1"
21         PRG_DESC       = "hmmscan parser"
22         PRG_DATE       = "2009.11.24"
23         COPYRIGHT      = "2009 Christian M Zmasek"
24         CONTACT        = "phylosoft@gmail.com"
25         WWW            = "www.phylosoft.org"
26
27         DELIMITER_OPTION              = "d"
28         E_VALUE_THRESHOLD_OPTION      = "e"
29         IGNORE_DUF_OPTION             = "i"
30         PARSE_OUT_DESCRIPITION_OPTION = "a"
31         HELP_OPTION_1                 = "help"
32         HELP_OPTION_2                 = "h"
33
34         def initialize
35             @domain_counts = Hash.new
36         end
37
38         # raises ArgumentError, IOError
39         def parse( inpath,
40                 outpath,
41                 column_delimiter,
42                 e_value_threshold,
43                 ignore_dufs,
44                 get_descriptions )
45             Util.check_file_for_readability( inpath )
46             Util.check_file_for_writability( outpath )
47
48             outfile = File.open( outpath, "a" )
49
50             query    = String.new
51             desc     = String.new
52             model    = String.new
53             env_from = String.new
54             env_to   = String.new
55             i_e_value  = String.new
56
57             queries_count = 0
58
59             nl = Constants::LINE_DELIMITER
60
61             File.open( inpath ) do | file |
62                 while line = file.gets
63                     if !HmmscanParser.is_ignorable?( line ) && line =~ /^\S+\s+\S/
64
65                         #         tn      acc     tlen    query   acc     qlen    Evalue  score   bias    #       of      c-E     i-E     score   bias    hf      ht      af      at      ef      et      acc     desc
66                         #         1       2       3       4       5       6       7       8       9       10      11      12      13      14      15      16      17      18      19      20      21      22      23
67                         line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/
68
69                         model     = $1
70                         query     = $4
71                         i_e_value = $13.to_f
72                         env_from  = $20.to_i
73                         env_to    = $21.to_i
74
75                         if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
76                                  ( !ignore_dufs || ( model !~ /^DUF\d+/ ) ) )
77                             count_model( model )
78                             outfile.print( query +
79                                  column_delimiter )
80                             if ( get_descriptions )
81                                 outfile.print( desc +
82                                      column_delimiter )
83                             end
84                             outfile.print( model +
85                                  column_delimiter +
86                                  env_from.to_s +
87                                  column_delimiter +
88                                  env_to.to_s +
89                                  column_delimiter +
90                                  i_e_value.to_s )
91                             outfile.print( nl )
92                         end
93                     end
94                 end # while line = file.gets
95             end
96             outfile.flush()
97             outfile.close()
98
99             return queries_count
100
101         end # def parse
102
103         def count_model( model )
104             if ( @domain_counts.has_key?( model ) )
105                 count = @domain_counts[ model ].to_i
106                 count += 1
107                 @domain_counts[ model ] = count
108             else
109                 @domain_counts[ model ] = 1
110             end
111         end
112
113
114         def get_domain_counts()
115             return @domain_counts
116         end
117
118         def run()
119
120             Util.print_program_information( PRG_NAME,
121                 PRG_VERSION,
122                 PRG_DESC,
123                 PRG_DATE,
124                 COPYRIGHT,
125                 CONTACT,
126                 WWW,
127                 STDOUT )
128
129             begin
130                 cla = CommandLineArguments.new( ARGV )
131             rescue ArgumentError => e
132                 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
133             end
134
135             if ( cla.is_option_set?( HELP_OPTION_1 ) ||
136                      cla.is_option_set?( HELP_OPTION_2 ) )
137                 print_help
138                 exit( 0 )
139             end
140
141             if ( cla.get_number_of_files != 2 )
142                 print_help
143                 exit( -1 )
144             end
145
146             allowed_opts = Array.new
147             allowed_opts.push( DELIMITER_OPTION )
148             allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
149             allowed_opts.push( IGNORE_DUF_OPTION )
150             allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
151
152             disallowed = cla.validate_allowed_options_as_str( allowed_opts )
153             if ( disallowed.length > 0 )
154                 Util.fatal_error( PRG_NAME,
155                     "unknown option(s): " + disallowed,
156                     STDOUT )
157             end
158
159             inpath = cla.get_file_name( 0 )
160             outpath = cla.get_file_name( 1 )
161
162             column_delimiter = "\t"
163             if ( cla.is_option_set?( DELIMITER_OPTION ) )
164                 begin
165                     column_delimiter = cla.get_option_value( DELIMITER_OPTION )
166                 rescue ArgumentError => e
167                     Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
168                 end
169             end
170
171             e_value_threshold = -1.0
172             if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
173                 begin
174                     e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
175                 rescue ArgumentError => e
176                     Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
177                 end
178                 if ( e_value_threshold < 0.0 )
179                     Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
180                 end
181             end
182
183             ignore_dufs = false
184             if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
185                 ignore_dufs = true
186             end
187
188             parse_descriptions = false
189             if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
190                 parse_descriptions = true
191             end
192
193             puts()
194             puts( "hmmpfam outputfile: " + inpath )
195             puts( "outputfile        : " + outpath )
196             if ( e_value_threshold >= 0.0 )
197                 puts( "E-value threshold : " + e_value_threshold.to_s )
198             else
199                 puts( "E-value threshold : no threshold" )
200             end
201             if ( parse_descriptions )
202                 puts( "parse descriptions: true" )
203             else
204                 puts( "parse descriptions: false" )
205             end
206             if ( ignore_dufs )
207                 puts( "ignore DUFs       : true" )
208             else
209                 puts( "ignore DUFs       : false" )
210             end
211             if ( column_delimiter == "\t" )
212                 puts( "column delimiter  : TAB" )
213             else
214                 puts( "column delimiter  : " + column_delimiter )
215             end
216             puts()
217
218             begin
219                 queries_count = parse( inpath,
220                     outpath,
221                     column_delimiter,
222                     e_value_threshold,
223                     ignore_dufs,
224                     parse_descriptions )
225             rescue ArgumentError, IOError => e
226                 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
227             end
228             domain_counts = get_domain_counts()
229
230             puts
231             puts( "read output for a total of " + queries_count.to_s + " query sequences" )
232             puts
233             puts( "domain counts (considering potential E-value threshold and ignoring of DUFs):" )
234             puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
235             puts
236             puts( Util.draw_histogram( domain_counts, "#" ) )
237             puts
238             Util.print_message( PRG_NAME, 'OK' )
239             puts
240
241         end # def run()
242
243         def print_help()
244             puts( "Usage:" )
245             puts()
246             puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
247             puts()
248             puts( "  options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
249             puts( "           -" + E_VALUE_THRESHOLD_OPTION  + ": E-value threshold, default is no threshold" )
250             puts( "           -" + PARSE_OUT_DESCRIPITION_OPTION  + ": parse query description (in addition to query name)" )
251             puts( "           -" + IGNORE_DUF_OPTION  + ": ignore DUFs" )
252             puts()
253         end
254
255
256         private
257
258
259         def HmmscanParser.is_ignorable?( line )
260             return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
261         end
262
263     end # class HmmscanParser
264
265 end # module Evoruby