in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / apps / domain_sequence_extractor_old.rb
1 #
2 # = lib/evo/apps/domain_sequence_extractor.rb - DomainSequenceExtractor class
3 #
4 # Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: domain_sequence_extractor.rb,v 1.19 2010/12/13 19:00:11 cmzmasek Exp $
8
9
10 require 'lib/evo/util/constants'
11 require 'lib/evo/util/util'
12 require 'lib/evo/util/command_line_arguments'
13 require 'lib/evo/io/parser/hmmsearch_domain_extractor'
14
15 module Evoruby
16
17     class DomainSequenceExtractor
18
19         PRG_NAME       = "dsx"
20         PRG_VERSION    = "1.1.0"
21         PRG_DESC       = "extraction of domain sequences from hmmsearch output"
22         PRG_DATE       = "2008.01.03"
23         COPYRIGHT      = "2008-2009 Christian M Zmasek"
24         CONTACT        = "phylosoft@gmail.com"
25         WWW            = "www.phylosoft.org"
26
27         E_VALUE_THRESHOLD_OPTION           = 'e'
28         LENGTH_THRESHOLD_OPTION            = 'l'
29         ADD_POSITION_OPTION                = 'p'
30         ADD_DOMAIN_NUMBER_OPTION           = 'd'
31         ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT  = 'dd'
32         ADD_DOMAIN_NUMBER_OPTION_AS_LETTER = 'dl'
33         TRIM_OPTION                        = 't'
34         LOG_FILE_SUFFIX                    = '_domain_seq_extr.log'
35         PASSED_SEQS_SUFFIX                 = '_domain_seq_extr_passed'
36         FAILED_SEQS_SUFFIX                 = '_domain_seq_extr_failed'
37         HELP_OPTION_1                      = 'help'
38         HELP_OPTION_2                      = 'h'
39
40         def run()
41
42             Util.print_program_information( PRG_NAME,
43                 PRG_VERSION,
44                 PRG_DESC ,
45                 PRG_DATE,
46                 COPYRIGHT,
47                 CONTACT,
48                 WWW,
49                 STDOUT )
50
51             ld = Constants::LINE_DELIMITER
52
53             begin
54                 cla = CommandLineArguments.new( ARGV )
55             rescue ArgumentError
56                 Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT )
57             end
58
59             if ( cla.is_option_set?( HELP_OPTION_1 ) ||
60                      cla.is_option_set?( HELP_OPTION_2 ) )
61                 print_help
62                 exit( 0 )
63             end
64
65             if ( cla.get_number_of_files != 3 )
66                 print_help
67                 exit( -1 )
68             end
69
70             allowed_opts = Array.new
71             allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
72             allowed_opts.push( ADD_POSITION_OPTION )
73             allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION )
74             allowed_opts.push( LENGTH_THRESHOLD_OPTION )
75             allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT )
76             allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER )
77             allowed_opts.push( TRIM_OPTION )
78
79             disallowed = cla.validate_allowed_options_as_str( allowed_opts )
80             if ( disallowed.length > 0 )
81                 Util.fatal_error( PRG_NAME,
82                     "unknown option(s): " + disallowed,
83                     STDOUT )
84             end
85
86             hmmsearch_output    = cla.get_file_name( 0 )
87             fasta_sequence_file = cla.get_file_name( 1 )
88             outfile             = cla.get_file_name( 2 )
89
90             add_position = false
91             if ( cla.is_option_set?( ADD_POSITION_OPTION ) )
92                 add_position = true
93             end
94
95             trim = false
96             if ( cla.is_option_set?( TRIM_OPTION ) )
97                 trim = true
98             end
99
100             add_domain_number           = false
101             add_domain_number_as_letter = false
102             add_domain_number_as_digit  = false
103
104             if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) )
105                 add_domain_number = true
106             end
107             if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER ) )
108                 add_domain_number_as_letter = true
109             end
110             if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT ) )
111                 add_domain_number_as_digit = true
112             end
113
114             if ( add_domain_number_as_letter && add_domain_number_as_digit )
115                 puts( "attempt to add domain number as letter and digit at the same time" )
116                 print_help
117                 exit( -1 )
118             end
119
120             e_value_threshold = -1.0
121             if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
122                 begin
123                     e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
124                 rescue ArgumentError => e
125                     Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
126                 end
127                 if ( e_value_threshold < 0.0 )
128                     Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
129                 end
130             end
131
132             length_threshold = -1
133             if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) )
134                 begin
135                     length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION )
136                 rescue ArgumentError => e
137                     Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
138                 end
139                 if ( length_threshold < 0)
140                     Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT )
141                 end
142             end
143
144             log = String.new
145
146             puts()
147             puts( "Hmmsearch outputfile                   : " + hmmsearch_output )
148             log << "Hmmsearch outputfile                   : " + hmmsearch_output + ld
149             puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file )
150             log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld
151             puts( "Outputfile                             : " + outfile )
152             log << "Outputfile                             : " + outfile + ld
153             puts( "Passed sequences outfile (fasta)       : " + outfile + PASSED_SEQS_SUFFIX )
154             log << "Passed sequences outfile (fasta)       : " + outfile + PASSED_SEQS_SUFFIX + ld
155             puts( "Failed sequences outfile (fasta)       : " + outfile + FAILED_SEQS_SUFFIX )
156             log << "Failed sequences outfile (fasta)       : " + outfile + FAILED_SEQS_SUFFIX + ld
157             puts( "Logfile                                : " + outfile + LOG_FILE_SUFFIX )
158             log <<  "Logfile                                : " + outfile + LOG_FILE_SUFFIX + ld
159             if ( e_value_threshold >= 0.0 )
160                 puts( "E-value threshold : " + e_value_threshold.to_s )
161                 log << "E-value threshold : " + e_value_threshold.to_s + ld
162             else
163                 puts( "E-value threshold : no threshold" )
164                 log << "E-value threshold : no threshold" + ld
165             end
166             if ( length_threshold > 0 )
167                 puts( "Length threshold  : " + length_threshold.to_s )
168                 log << "Length threshold  : " + length_threshold.to_s + ld
169             else
170                 puts( "Length threshold  : no threshold" )
171                 log << "Length threshold  : no threshold" + ld
172             end
173
174             if ( trim )
175                 puts( "Trim last 2 chars : true" )
176                 log << "Trim last 2 chars : true" + ld
177             else
178                 puts( "Trim names        : false" )
179                 log << "Trim names        : false" + ld
180             end
181
182
183             if ( add_position )
184                 puts( "Add positions (rel to complete seq) to extracted domains: true" )
185                 log << "Add positions (rel to complete seq) to extracted domains: true" + ld
186             else
187                 puts( "Add positions (rel to complete seq) to extracted domains: false" )
188                 log << "Add positions (rel to complete seq) to extracted domains: false" + ld
189             end
190
191             if ( add_domain_number || add_domain_number_as_digit || add_domain_number_as_letter )
192                 puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" )
193                 log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld
194             else
195                 puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" )
196                 log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld
197             end
198
199             puts
200
201             domain_count = 0
202             begin
203                 parser = HmmsearchDomainExtractor.new()
204                 domain_count = parser.parse( hmmsearch_output,
205                     fasta_sequence_file,
206                     outfile,
207                     outfile + PASSED_SEQS_SUFFIX,
208                     outfile + FAILED_SEQS_SUFFIX,
209                     e_value_threshold,
210                     length_threshold,
211                     add_position,
212                     add_domain_number,
213                     add_domain_number_as_digit,
214                     add_domain_number_as_letter,
215                     trim,
216                     log )
217             rescue ArgumentError, IOError, StandardError => e
218                 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
219             rescue Exception => e
220                 Util.fatal_error( PRG_NAME, "unexpected exception!: " + e.to_s, STDOUT )
221             end
222
223             puts
224             Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" )
225             Util.print_message( PRG_NAME, "wrote;               " + outfile )
226             Util.print_message( PRG_NAME, "wrote:               " + outfile + LOG_FILE_SUFFIX )
227             Util.print_message( PRG_NAME, "(wrote:              " + outfile + PASSED_SEQS_SUFFIX + ")" )
228             Util.print_message( PRG_NAME, "(wrote:              " + outfile + FAILED_SEQS_SUFFIX + ")" )
229
230             begin
231                 f = File.open( outfile + LOG_FILE_SUFFIX, 'a' )
232                 f.print( log )
233                 f.close
234             rescue Exception => e
235                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
236             end
237
238             puts
239             Util.print_message( PRG_NAME, "OK" )
240             puts
241
242         end
243
244         def print_help()
245             puts()
246             puts( "Usage:" )
247             puts()
248             puts( "  " + PRG_NAME + ".rb [options] <hmmsearch outputfile> <file containing complete sequences in fasta format> <outputfile>" )
249             puts()
250             puts( "  options: -" + E_VALUE_THRESHOLD_OPTION  + "=<f>: E-value threshold, default is no threshold" )
251             puts( "           -" + LENGTH_THRESHOLD_OPTION   + "=<i>: length threshold, default is no threshold" )
252             puts( "           -" + ADD_POSITION_OPTION  + ": to add positions (rel to complete seq) to extracted domains" )
253             puts( "           -" + ADD_DOMAIN_NUMBER_OPTION  + ": to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" )
254             puts( "           -" + ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT  + ": to add numbers to extracted domains as digit (example \"domain2\")" )
255             puts( "           -" + ADD_DOMAIN_NUMBER_OPTION_AS_LETTER  + ": to add numbers to extracted domains as letter (example \"domaina\")" )
256             puts( "           -" + TRIM_OPTION  + ": to remove the last 2 characters from sequence names" )
257             puts()
258         end
259
260     end # class DomainSequenceExtractor
261
262 end # module Evoruby