2 # = lib/evo/apps/domain_sequence_extractor.rb - DomainSequenceExtractor class
4 # Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: domain_sequence_extractor.rb,v 1.19 2010/12/13 19:00:11 cmzmasek Exp $
10 require 'lib/evo/util/constants'
11 require 'lib/evo/util/util'
12 require 'lib/evo/util/command_line_arguments'
13 require 'lib/evo/io/parser/hmmsearch_domain_extractor'
17 class DomainSequenceExtractor
21 PRG_DESC = "extraction of domain sequences from hmmsearch output"
22 PRG_DATE = "2008.01.03"
23 COPYRIGHT = "2008-2009 Christian M Zmasek"
24 CONTACT = "phylosoft@gmail.com"
25 WWW = "www.phylosoft.org"
27 E_VALUE_THRESHOLD_OPTION = 'e'
28 LENGTH_THRESHOLD_OPTION = 'l'
29 ADD_POSITION_OPTION = 'p'
30 ADD_DOMAIN_NUMBER_OPTION = 'd'
31 ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT = 'dd'
32 ADD_DOMAIN_NUMBER_OPTION_AS_LETTER = 'dl'
34 LOG_FILE_SUFFIX = '_domain_seq_extr.log'
35 PASSED_SEQS_SUFFIX = '_domain_seq_extr_passed'
36 FAILED_SEQS_SUFFIX = '_domain_seq_extr_failed'
37 HELP_OPTION_1 = 'help'
42 Util.print_program_information( PRG_NAME,
51 ld = Constants::LINE_DELIMITER
54 cla = CommandLineArguments.new( ARGV )
56 Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT )
59 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
60 cla.is_option_set?( HELP_OPTION_2 ) )
65 if ( cla.get_number_of_files != 3 )
70 allowed_opts = Array.new
71 allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
72 allowed_opts.push( ADD_POSITION_OPTION )
73 allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION )
74 allowed_opts.push( LENGTH_THRESHOLD_OPTION )
75 allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT )
76 allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER )
77 allowed_opts.push( TRIM_OPTION )
79 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
80 if ( disallowed.length > 0 )
81 Util.fatal_error( PRG_NAME,
82 "unknown option(s): " + disallowed,
86 hmmsearch_output = cla.get_file_name( 0 )
87 fasta_sequence_file = cla.get_file_name( 1 )
88 outfile = cla.get_file_name( 2 )
91 if ( cla.is_option_set?( ADD_POSITION_OPTION ) )
96 if ( cla.is_option_set?( TRIM_OPTION ) )
100 add_domain_number = false
101 add_domain_number_as_letter = false
102 add_domain_number_as_digit = false
104 if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) )
105 add_domain_number = true
107 if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER ) )
108 add_domain_number_as_letter = true
110 if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT ) )
111 add_domain_number_as_digit = true
114 if ( add_domain_number_as_letter && add_domain_number_as_digit )
115 puts( "attempt to add domain number as letter and digit at the same time" )
120 e_value_threshold = -1.0
121 if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
123 e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
124 rescue ArgumentError => e
125 Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
127 if ( e_value_threshold < 0.0 )
128 Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
132 length_threshold = -1
133 if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) )
135 length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION )
136 rescue ArgumentError => e
137 Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
139 if ( length_threshold < 0)
140 Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT )
147 puts( "Hmmsearch outputfile : " + hmmsearch_output )
148 log << "Hmmsearch outputfile : " + hmmsearch_output + ld
149 puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file )
150 log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld
151 puts( "Outputfile : " + outfile )
152 log << "Outputfile : " + outfile + ld
153 puts( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX )
154 log << "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX + ld
155 puts( "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX )
156 log << "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX + ld
157 puts( "Logfile : " + outfile + LOG_FILE_SUFFIX )
158 log << "Logfile : " + outfile + LOG_FILE_SUFFIX + ld
159 if ( e_value_threshold >= 0.0 )
160 puts( "E-value threshold : " + e_value_threshold.to_s )
161 log << "E-value threshold : " + e_value_threshold.to_s + ld
163 puts( "E-value threshold : no threshold" )
164 log << "E-value threshold : no threshold" + ld
166 if ( length_threshold > 0 )
167 puts( "Length threshold : " + length_threshold.to_s )
168 log << "Length threshold : " + length_threshold.to_s + ld
170 puts( "Length threshold : no threshold" )
171 log << "Length threshold : no threshold" + ld
175 puts( "Trim last 2 chars : true" )
176 log << "Trim last 2 chars : true" + ld
178 puts( "Trim names : false" )
179 log << "Trim names : false" + ld
184 puts( "Add positions (rel to complete seq) to extracted domains: true" )
185 log << "Add positions (rel to complete seq) to extracted domains: true" + ld
187 puts( "Add positions (rel to complete seq) to extracted domains: false" )
188 log << "Add positions (rel to complete seq) to extracted domains: false" + ld
191 if ( add_domain_number || add_domain_number_as_digit || add_domain_number_as_letter )
192 puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" )
193 log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld
195 puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" )
196 log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld
203 parser = HmmsearchDomainExtractor.new()
204 domain_count = parser.parse( hmmsearch_output,
207 outfile + PASSED_SEQS_SUFFIX,
208 outfile + FAILED_SEQS_SUFFIX,
213 add_domain_number_as_digit,
214 add_domain_number_as_letter,
217 rescue ArgumentError, IOError, StandardError => e
218 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
219 rescue Exception => e
220 Util.fatal_error( PRG_NAME, "unexpected exception!: " + e.to_s, STDOUT )
224 Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" )
225 Util.print_message( PRG_NAME, "wrote; " + outfile )
226 Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX )
227 Util.print_message( PRG_NAME, "(wrote: " + outfile + PASSED_SEQS_SUFFIX + ")" )
228 Util.print_message( PRG_NAME, "(wrote: " + outfile + FAILED_SEQS_SUFFIX + ")" )
231 f = File.open( outfile + LOG_FILE_SUFFIX, 'a' )
234 rescue Exception => e
235 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
239 Util.print_message( PRG_NAME, "OK" )
248 puts( " " + PRG_NAME + ".rb [options] <hmmsearch outputfile> <file containing complete sequences in fasta format> <outputfile>" )
250 puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=<f>: E-value threshold, default is no threshold" )
251 puts( " -" + LENGTH_THRESHOLD_OPTION + "=<i>: length threshold, default is no threshold" )
252 puts( " -" + ADD_POSITION_OPTION + ": to add positions (rel to complete seq) to extracted domains" )
253 puts( " -" + ADD_DOMAIN_NUMBER_OPTION + ": to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" )
254 puts( " -" + ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT + ": to add numbers to extracted domains as digit (example \"domain2\")" )
255 puts( " -" + ADD_DOMAIN_NUMBER_OPTION_AS_LETTER + ": to add numbers to extracted domains as letter (example \"domaina\")" )
256 puts( " -" + TRIM_OPTION + ": to remove the last 2 characters from sequence names" )
260 end # class DomainSequenceExtractor