2 # = lib/evo/tool/domain_sequence_extractor - DomainSequenceExtractor class
4 # Copyright:: Copyright (C) 2017 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 require 'lib/evo/util/constants'
8 require 'lib/evo/util/util'
9 require 'lib/evo/util/command_line_arguments'
10 require 'lib/evo/io/parser/hmmscan_domain_extractor'
13 class DomainSequenceExtractor
17 PRG_DESC = "Extraction of domain sequences from hmmscan output"
19 WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
21 E_VALUE_THRESHOLD_DEFAULT = 0.1
22 LENGTH_THRESHOLD_DEFAULT = 50
24 E_VALUE_THRESHOLD_OPTION = 'e'
25 LENGTH_THRESHOLD_OPTION = 'l'
26 ADD_POSITION_OPTION = 'p'
27 ADD_DOMAIN_NUMBER_OPTION = 'd'
29 LOG_FILE_SUFFIX = '_domain_seq_extr.log'
30 PASSED_SEQS_SUFFIX = '_with_passing_domains.fasta'
31 FAILED_SEQS_SUFFIX = '_with_no_passing_domains.fasta'
32 HELP_OPTION_1 = 'help'
36 Util.print_program_information( PRG_NAME,
43 if ( ARGV == nil || ( ARGV.length < 1 ) )
49 cla = CommandLineArguments.new( ARGV )
51 Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT )
54 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
55 cla.is_option_set?( HELP_OPTION_2 ) )
60 unless ( cla.get_number_of_files == 2 || cla.get_number_of_files == 3 || cla.get_number_of_files == 4 )
65 allowed_opts = Array.new
66 allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
67 allowed_opts.push( ADD_POSITION_OPTION )
68 allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION )
69 allowed_opts.push( LENGTH_THRESHOLD_OPTION )
70 allowed_opts.push( ADD_SPECIES )
72 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
73 if ( disallowed.length > 0 )
74 Util.fatal_error( PRG_NAME,
75 "unknown option(s): " + disallowed,
79 e_value_threshold = E_VALUE_THRESHOLD_DEFAULT
80 if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
82 e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
83 rescue ArgumentError => e
84 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
86 if ( e_value_threshold < 0.0 )
87 Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
91 length_threshold = LENGTH_THRESHOLD_DEFAULT
92 if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) )
94 length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION )
95 rescue ArgumentError => e
96 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
98 if ( length_threshold < 0)
99 Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT )
103 domain_id = cla.get_file_name( 0 )
104 hmmscan_output = cla.get_file_name( 1 )
105 fasta_sequence_file = ""
108 if (cla.get_number_of_files == 4)
109 fasta_sequence_file = cla.get_file_name( 2 )
110 outfile = cla.get_file_name( 3 )
111 elsif (cla.get_number_of_files == 2 || cla.get_number_of_files == 3)
112 if (cla.get_number_of_files == 3)
113 fasta_sequence_file = cla.get_file_name( 2 )
115 hmmscan_index = hmmscan_output.index(Constants::HMMSCAN)
116 if ( hmmscan_index != nil )
117 prefix = hmmscan_output[0 .. hmmscan_index-1 ]
118 suffix = Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX
119 files = Dir.entries( "." )
120 matching_files = Util.get_matching_files( files, prefix, suffix)
121 if matching_files.length < 1
122 Util.fatal_error( PRG_NAME, 'no file matching [' + prefix +
123 '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
125 if matching_files.length > 1
126 Util.fatal_error( PRG_NAME, 'more than one file matching [' +
127 prefix + '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
129 fasta_sequence_file = matching_files[ 0 ]
131 Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' )
134 hmmscan_index = hmmscan_output.index(Constants::HMMSCAN)
135 if ( hmmscan_index != nil )
136 outfile = hmmscan_output.sub(Constants::HMMSCAN, "_") + "_" + domain_id
137 e = e_value_threshold >= 1 ? e_value_threshold.to_i : e_value_threshold.to_s.sub!('.','_')
138 outfile += "_E" + e.to_s
139 outfile += "_L" + length_threshold.to_i.to_s
141 Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' )
145 if outfile.downcase.end_with?( ".fasta" )
146 outfile = outfile[ 0 .. outfile.length - 7 ]
147 elsif outfile.downcase.end_with?( ".fsa" )
148 outfile = outfile[ 0 .. outfile.length - 5 ]
152 if ( cla.is_option_set?( ADD_POSITION_OPTION ) )
156 add_domain_number = false
157 if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) )
158 add_domain_number = true
162 if cla.is_option_set?( ADD_SPECIES )
167 ld = Constants::LINE_DELIMITER
170 puts( "Domain : " + domain_id )
171 log << "Domain : " + domain_id + ld
172 puts( "Hmmscan outputfile : " + hmmscan_output )
173 log << "Hmmscan outputfile : " + hmmscan_output + ld
174 puts( "Fasta sequencefile (complete sequences) : " + fasta_sequence_file )
175 log << "Fasta sequencefile (complete sequences) : " + fasta_sequence_file + ld
176 puts( "Outputfile : " + outfile + ".fasta" )
177 log << "Outputfile : " + outfile + ld
178 puts( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX )
179 log << "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX + ld
180 puts( "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX )
181 log << "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX + ld
182 puts( "Logfile : " + outfile + LOG_FILE_SUFFIX )
183 log << "Logfile : " + outfile + LOG_FILE_SUFFIX + ld
184 if ( e_value_threshold >= 0.0 )
185 puts( "iE-value threshold : " + e_value_threshold.to_s )
186 log << "iE-value threshold : " + e_value_threshold.to_s + ld
188 puts( "iE-value threshold : no threshold" )
189 log << "iE-value threshold : no threshold" + ld
191 if ( length_threshold > 0 )
192 puts( "Length threshold (env) : " + length_threshold.to_s )
193 log << "Length threshold (env) : " + length_threshold.to_s + ld
195 puts( "Length threshold (env) : no threshold" )
196 log << "Length threshold (env) : no threshold" + ld
199 puts( "Add positions (rel to complete seq) to extracted domains : true" )
200 log << "Add positions (rel to complete seq) to extracted domains : true" + ld
202 puts( "Add positions (rel to complete seq) to extracted domains : false" )
203 log << "Add positions (rel to complete seq) to extracted domains : false" + ld
206 if ( add_domain_number )
207 puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" )
208 log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld
210 puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" )
211 log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld
219 parser = HmmscanDomainExtractor.new()
220 domain_count = parser.parse( domain_id,
224 outfile + PASSED_SEQS_SUFFIX,
225 outfile + FAILED_SEQS_SUFFIX,
233 rescue ArgumentError, IOError => e
234 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
236 rescue Exception => e
238 Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT )
243 Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" )
244 Util.print_message( PRG_NAME, "wrote: " + outfile + ".fasta")
245 Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX )
246 Util.print_message( PRG_NAME, "wrote: " + outfile + PASSED_SEQS_SUFFIX )
247 Util.print_message( PRG_NAME, "wrote: " + outfile + FAILED_SEQS_SUFFIX )
250 f = File.open( outfile + LOG_FILE_SUFFIX, 'a' )
253 rescue Exception => e
254 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
258 Util.print_message( PRG_NAME, "OK" )
267 puts( " " + PRG_NAME + ".rb [options] <target domain> <hmmscan outputfile> [file containing complete sequences in fasta format] [outputfile]" )
269 puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=<f> : iE-value threshold for target domain, default is " + E_VALUE_THRESHOLD_DEFAULT.to_s )
270 puts( " -" + LENGTH_THRESHOLD_OPTION + "=<i> : length threshold target domain (env), default is " + LENGTH_THRESHOLD_DEFAULT.to_s )
271 puts( " -" + ADD_DOMAIN_NUMBER_OPTION + " : to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" )
272 puts( " -" + ADD_POSITION_OPTION + " : to add positions (rel to complete seq) to extracted domains" )
273 puts( " -" + ADD_SPECIES + " : to add species [in brackets]" )
277 puts( " " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 P53_ni.fasta P53_#{Constants::PFAM_V_FOR_EX}_10_Pkinase_E1_0e-06_L50" )
279 puts( " " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 P53_ni.fasta" )
281 puts( " " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10" )
285 end # class DomainSequenceExtractor