2 # = lib/evo/apps/taxonomy_processor - TaxonomyProcessor class
4 # Copyright:: Copyright (C) 2017 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 require 'lib/evo/util/constants'
8 require 'lib/evo/util/util'
9 require 'lib/evo/util/command_line_arguments'
10 require 'lib/evo/io/parser/hmmscan_domain_extractor'
13 class DomainSequenceExtractor
17 PRG_DESC = "extraction of domain sequences from hmmscan output"
19 WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
21 E_VALUE_THRESHOLD_OPTION = 'e'
22 LENGTH_THRESHOLD_OPTION = 'l'
23 ADD_POSITION_OPTION = 'p'
24 ADD_DOMAIN_NUMBER_OPTION = 'd'
27 LOG_FILE_SUFFIX = '_domain_seq_extr.log'
28 PASSED_SEQS_SUFFIX = '_with_passing_domains.fasta'
29 FAILED_SEQS_SUFFIX = '_with_no_passing_domains.fasta'
30 HELP_OPTION_1 = 'help'
34 Util.print_program_information( PRG_NAME,
41 ld = Constants::LINE_DELIMITER
44 cla = CommandLineArguments.new( ARGV )
46 Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT )
49 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
50 cla.is_option_set?( HELP_OPTION_2 ) )
55 if ( cla.get_number_of_files != 4 )
60 allowed_opts = Array.new
61 allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
62 allowed_opts.push( ADD_POSITION_OPTION )
63 allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION )
64 allowed_opts.push( LENGTH_THRESHOLD_OPTION )
65 allowed_opts.push( ADD_SPECIES )
66 allowed_opts.push( MIN_LINKER_OPT )
68 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
69 if ( disallowed.length > 0 )
70 Util.fatal_error( PRG_NAME,
71 "unknown option(s): " + disallowed,
75 domain_id = cla.get_file_name( 0 )
76 hmmsearch_output = cla.get_file_name( 1 )
77 fasta_sequence_file = cla.get_file_name( 2 )
78 outfile = cla.get_file_name( 3 )
80 if outfile.downcase.end_with?( ".fasta" )
81 outfile = outfile[ 0 .. outfile.length - 7 ]
82 elsif outfile.downcase.end_with?( ".fsa" )
83 outfile = outfile[ 0 .. outfile.length - 5 ]
87 if ( cla.is_option_set?( ADD_POSITION_OPTION ) )
91 add_domain_number = false
92 if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) )
93 add_domain_number = true
97 if cla.is_option_set? ADD_SPECIES
101 e_value_threshold = -1.0
102 if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
104 e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
105 rescue ArgumentError => e
106 Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
108 if ( e_value_threshold < 0.0 )
109 Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
113 length_threshold = -1
114 if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) )
116 length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION )
117 rescue ArgumentError => e
118 Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
120 if ( length_threshold < 0)
121 Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT )
126 if ( cla.is_option_set?( MIN_LINKER_OPT ) )
128 min_linker = cla.get_option_value_as_int( MIN_LINKER_OPT )
129 rescue ArgumentError => e
130 Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
132 if ( !min_linker || min_linker > 100 || min_linker < -100 )
133 Forester::Util.fatal_error( PRG_NAME, "unexpected value for min linker " + min_linker.to_s, STDOUT )
140 puts( "Domain : " + domain_id )
141 log << "Domain : " + domain_id + ld
142 puts( "Hmmscan outputfile : " + hmmsearch_output )
143 log << "Hmmscan outputfile : " + hmmsearch_output + ld
144 puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file )
145 log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld
146 puts( "Outputfile : " + outfile + ".fasta" )
147 log << "Outputfile : " + outfile + ld
148 puts( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX )
149 log << "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX + ld
150 puts( "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX )
151 log << "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX + ld
152 puts( "Logfile : " + outfile + LOG_FILE_SUFFIX )
153 log << "Logfile : " + outfile + LOG_FILE_SUFFIX + ld
154 if ( e_value_threshold >= 0.0 )
155 puts( "E-value threshold : " + e_value_threshold.to_s )
156 log << "E-value threshold : " + e_value_threshold.to_s + ld
158 puts( "E-value threshold : no threshold" )
159 log << "E-value threshold : no threshold" + ld
161 if ( length_threshold > 0 )
162 puts( "Length threshold : " + length_threshold.to_s )
163 log << "Length threshold : " + length_threshold.to_s + ld
165 puts( "Length threshold : no threshold" )
166 log << "Length threshold : no threshold" + ld
169 puts( "Min linker : " + min_linker.to_s )
170 log << "Min linker : " + min_linker.to_s + ld
175 puts( "Add positions (rel to complete seq) to extracted domains: true" )
176 log << "Add positions (rel to complete seq) to extracted domains: true" + ld
178 puts( "Add positions (rel to complete seq) to extracted domains: false" )
179 log << "Add positions (rel to complete seq) to extracted domains: false" + ld
182 if ( add_domain_number )
183 puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" )
184 log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld
186 puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" )
187 log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld
194 parser = HmmscanDomainExtractor.new()
195 domain_count = parser.parse( domain_id,
199 outfile + PASSED_SEQS_SUFFIX,
200 outfile + FAILED_SEQS_SUFFIX,
208 rescue ArgumentError, IOError => e
209 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
211 rescue Exception => e
213 Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT )
218 Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" )
219 Util.print_message( PRG_NAME, "wrote; " + outfile + ".fasta")
220 Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX )
221 Util.print_message( PRG_NAME, "wrote: " + outfile + PASSED_SEQS_SUFFIX )
222 Util.print_message( PRG_NAME, "wrote: " + outfile + FAILED_SEQS_SUFFIX )
225 f = File.open( outfile + LOG_FILE_SUFFIX, 'a' )
228 rescue Exception => e
229 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
233 Util.print_message( PRG_NAME, "OK" )
242 puts( " " + PRG_NAME + ".rb [options] <domain> <hmmscan outputfile> <file containing complete sequences in fasta format> <outputfile>" )
244 puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=<f>: iE-value threshold, default is no threshold" )
245 puts( " -" + LENGTH_THRESHOLD_OPTION + "=<i>: length threshold, default is no threshold" )
246 puts( " -" + ADD_POSITION_OPTION + ": to add positions (rel to complete seq) to extracted domains" )
247 puts( " -" + ADD_DOMAIN_NUMBER_OPTION + ": to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" )
248 puts( " -" + ADD_SPECIES + ": to add species [in brackets]" )
249 puts( " -" + MIN_LINKER_OPT + "=<i>: to extract pairs of same domains with a distance inbetween shorter than a given value" )
253 end # class DomainSequenceExtractor