1 #!/usr/local/bin/ruby -w
3 # = lib/evo/apps/phylogenies_decorator
5 # Copyright:: Copyright (C) 2017 Christian M. Zmasek
6 # License:: GNU Lesser General Public License (LGPL)
8 # decoration of phylogenies with sequence/species names and domain architectures
10 # Environment variable FORESTER_HOME needs to point to the appropriate
11 # directory (e.g. setenv FORESTER_HOME $HOME/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester/)
13 require 'lib/evo/util/constants'
14 require 'lib/evo/util/util'
15 require 'lib/evo/util/command_line_arguments'
20 class PhylogeniesDecorator
# Option strings handed to the forester decorator command line: the first is
# used for the '-f=n' invocation (the one that takes the ids map file), the
# second for the '-f=d' invocation (the one that takes the domains map file).
22 DECORATOR_OPTIONS_SEQ_NAMES = '-p -t -mp -or'
23 DECORATOR_OPTIONS_DOMAINS = '-p -t'
# Names of the intermediate files passed between decorator invocations; they
# are relative names, so they live in the current working directory.
26 TMP_FILE_1 = '___PD1___'
27 TMP_FILE_2 = '___PD2___'
# Log file name; the program aborts when a file of this name already exists.
28 LOG_FILE = '00_phylogenies_decorator.log'
# Installation directories taken from the environment (nil when unset; this
# is checked before they are used).
29 FORESTER_HOME = ENV[Constants::FORESTER_HOME_ENV_VARIABLE]
30 JAVA_HOME = ENV[Constants::JAVA_HOME_ENV_VARIABLE]
# Program identification used in messages, the log and the help text.
32 PRG_NAME = "phylogenies_decorator"
34 PRG_DESC = "decoration of phylogenies with sequence/species names and domain architectures"
36 WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
# Command line option names.
# NOTE(review): HELP_OPTION_2, NO_SEQS_OPTION, VERBOSE_OPTION, PRG_VERSION and
# PRG_DATE are referenced elsewhere in this class but their definitions are
# not visible in this section -- confirm they are defined.
38 HELP_OPTION_1 = "help"
# 'nd': do not add domain architecture information (see the help text).
40 NO_DOMAINS_OPTION = 'nd'
# 'tc': extract bracketed taxonomic codes, e.g. [NEMVE] (see the help text).
43 EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION = 'tc'
# Line delimiter appended to every log entry.
45 NL = Constants::LINE_DELIMITER
# Main driver: checks the environment and the command line, then runs the
# forester decorator on every matching phylogeny file of the current directory
# and finally writes the collected log to LOG_FILE.
# NOTE(review): the enclosing `def` line and a number of begin/else/end lines
# are not visible in this section; comments below describe only what the
# visible lines establish.
48 Util.print_program_information( PRG_NAME,
# Fewer than two command line arguments is treated as a special case (the
# handling itself is not visible here).
55 if ( ARGV == nil || ARGV.length < 2 )
# Both environment variables must be set ...
60 if FORESTER_HOME == nil || FORESTER_HOME.length < 1
61 Util.fatal_error( PRG_NAME, "apparently environment variable #{Constants::FORESTER_HOME_ENV_VARIABLE} has not been set" )
63 if JAVA_HOME == nil || JAVA_HOME.length < 1
64 Util.fatal_error( PRG_NAME, "apparently environment variable #{Constants::JAVA_HOME_ENV_VARIABLE} has not been set" )
# ... and must name existing paths.
67 if !File.exist?( FORESTER_HOME )
68 Util.fatal_error( PRG_NAME, '[' + FORESTER_HOME + '] does not exist' )
70 if !File.exist?( JAVA_HOME )
71 Util.fatal_error( PRG_NAME, '[' + JAVA_HOME + '] does not exist' )
# Base command line for the forester decorator application; options and file
# arguments are appended per invocation further down.
# NOTE(review): commands are assembled by plain string concatenation, so paths
# containing spaces or shell metacharacters would not survive -- confirm that
# inputs are constrained.
74 decorator = JAVA_HOME + '/bin/java -cp ' + FORESTER_HOME + '/java/forester.jar org.forester.application.decorator'
# Parse the command line; an ArgumentError from the parser is fatal.
77 cla = CommandLineArguments.new( ARGV )
78 rescue ArgumentError => e
79 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
# Either help option triggers the help path (its body is not visible here).
82 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
83 cla.is_option_set?( HELP_OPTION_2 ) )
# True unless exactly 2 or 3 file arguments were given.
88 if ( cla.get_number_of_files != 2 && cla.get_number_of_files != 3 )
# Any option outside this list is reported as unknown and is fatal.
93 allowed_opts = Array.new
94 allowed_opts.push(NO_DOMAINS_OPTION)
95 allowed_opts.push(NO_SEQS_OPTION)
96 allowed_opts.push(EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION)
97 allowed_opts.push(VERBOSE_OPTION)
99 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
100 if ( disallowed.length > 0 )
101 Util.fatal_error( PRG_NAME,
102 "unknown option(s): " + disallowed,
# Translate the options into boolean flags (some assignments are not visible).
107 if cla.is_option_set?(NO_DOMAINS_OPTION)
111 no_seqs_files = false
112 if cla.is_option_set?(NO_SEQS_OPTION)
116 extr_bracketed_tc = false
117 if cla.is_option_set?(EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION)
118 extr_bracketed_tc = true
122 if cla.is_option_set?(VERBOSE_OPTION)
# Never overwrite an existing log file.
126 if File.exist? LOG_FILE
127 Util.fatal_error( PRG_NAME, 'logfile [' + LOG_FILE + '] already exists' )
# The first two file arguments are the input and output file name suffixes.
130 in_suffix = cla.get_file_name( 0 )
131 out_suffix = cla.get_file_name( 1 )
# Mapping files directory: the third file argument when present; the
# Dir.getwd assignment is presumably the default branch (the help text names
# the current directory as default; the `else` line is not visible here).
133 mapping_files_dir = nil
135 if cla.get_number_of_files == 3
136 mapping_files_dir = cla.get_file_name( 2 )
138 mapping_files_dir = Dir.getwd
140 unless File.exist? mapping_files_dir
141 Util.fatal_error( PRG_NAME, 'mapping files directory [' + mapping_files_dir + '] does not exist' )
143 unless File.directory? mapping_files_dir
144 Util.fatal_error( PRG_NAME, '[' + mapping_files_dir + '] is not a directory' )
# Dir.entries always lists '.' and '..', so two or fewer entries means empty.
146 if Dir.entries(mapping_files_dir).length <= 2
147 Util.fatal_error( PRG_NAME, 'mapping files directory [' + mapping_files_dir + '] is empty' )
150 mapping_files_dir = Util.canonical_path( mapping_files_dir.to_s )
# Log header with the effective settings. `log` and `now` are set up on lines
# not visible in this section.
155 log << "Program : " + PRG_NAME + NL
156 log << "Version : " + PRG_VERSION + NL
157 log << "Program date : " + PRG_DATE + NL
158 log << "Input/Output dir : " + Dir.getwd + NL
159 log << "Mappings file dir : " + mapping_files_dir + NL
160 log << "Input suffix : " + in_suffix + NL
161 log << "Output suffix : " + out_suffix + NL
162 log << "No domains data : " + no_domains.to_s + NL
163 log << "No mol seq data : " + no_seqs_files.to_s + NL
164 log << "Extract tax codes : " + extr_bracketed_tc.to_s + NL
165 log << "Date/time: " + now.to_s + NL + NL
# The same settings are reported through Util.print_message.
167 Util.print_message( PRG_NAME, 'Input/Output dir : ' + Dir.getwd )
168 Util.print_message( PRG_NAME, 'Mappings file dir: ' + mapping_files_dir )
169 Util.print_message( PRG_NAME, 'Input suffix : ' + in_suffix )
170 Util.print_message( PRG_NAME, 'Output suffix : ' + out_suffix )
171 Util.print_message( PRG_NAME, 'No domains data : ' + no_domains.to_s )
172 Util.print_message( PRG_NAME, 'No mol seq data : ' + no_seqs_files.to_s )
173 Util.print_message( PRG_NAME, 'Extract tax codes: ' + extr_bracketed_tc.to_s )
# Remove temporary files left over in the current directory.
175 if ( File.exist?( TMP_FILE_1 ) )
176 File.delete( TMP_FILE_1 )
178 if ( File.exist?( TMP_FILE_2 ) )
179 File.delete( TMP_FILE_2 )
# Candidate input files are the entries of the current directory.
182 files = Dir.entries( "." )
# An entry is processed when it is not a directory, does not start with '.'
# or '00', does not end with the output suffix and does end with the input
# suffix.
# NOTE(review): both suffixes are interpolated into the regexes unescaped, so
# a '.' in a suffix such as '.xml' matches any character -- consider
# Regexp.escape.
186 files.each { | phylogeny_file |
187 if ( !File.directory?( phylogeny_file ) &&
188 phylogeny_file !~ /^\./ &&
189 phylogeny_file !~ /^00/ &&
190 phylogeny_file !~ /#{out_suffix}$/ &&
191 phylogeny_file =~ /#{in_suffix}$/ )
# An unreadable input file is fatal.
193 Util.check_file_for_readability( phylogeny_file )
195 Util.fatal_error( PRG_NAME, 'can not read from: ' + phylogeny_file + ': '+ $!.to_s )
# Output name: the trailing input suffix is replaced by the output suffix.
200 outfile = phylogeny_file.sub( /#{in_suffix}$/, out_suffix )
# The first '_ni_' in the name is replaced by '_' (the surrounding lines are
# not visible, so this step may be conditional).
203 outfile = outfile.sub( /_ni_/, '_' )
# Existing output files are reported as skipped rather than overwritten.
206 if File.exist?( outfile )
207 msg = counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile +
208 ' : already exists, skipping'
209 Util.print_message( PRG_NAME, msg )
# `counter` is maintained on lines not visible in this section.
217 Util.print_message( PRG_NAME, counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile )
218 log << counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile + NL
# The phylogeny id used to look up mapping files is the file name itself.
220 phylogeny_id = phylogeny_file
221 if phylogeny_id == nil || phylogeny_id.size < 1
222 Util.fatal_error( PRG_NAME, 'could not get id from ' + phylogeny_file.to_s )
225 Util.print_message( PRG_NAME, "Id: " + phylogeny_id )
227 log << "Id: " + phylogeny_id + NL
229 ids_mapfile_path = nil
230 domains_mapfile_path = nil
# Locate the ids map file in the mapping files directory; it must be readable.
233 ids_mapfile_name = get_file( mapping_files_dir, phylogeny_id, Constants::ID_MAP_FILE_SUFFIX )
234 ids_mapfile_path = Util.canonical_path(mapping_files_dir, ids_mapfile_name)
237 Util.check_file_for_readability( ids_mapfile_path)
239 Util.fatal_error( PRG_NAME, "failed to read from [#{ids_mapfile_path}]: " + $!.to_s )
242 Util.print_message( PRG_NAME, "Ids mapfile: " + ids_mapfile_path )
244 log << "Ids mapfile: " + ids_mapfile_path + NL
# Locate the sequence file the same way (presumably only when sequence data
# is wanted; the guarding condition is not visible here).
247 seqs_file_name = get_file( mapping_files_dir, phylogeny_id, Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX )
248 seqs_file_path = Util.canonical_path(mapping_files_dir, seqs_file_name)
250 Util.check_file_for_readability( seqs_file_path )
252 Util.fatal_error( PRG_NAME, "failed to read from [#{seqs_file_path}]: " + $!.to_s )
255 Util.print_message( PRG_NAME, "Seq file: " + seqs_file_path )
257 log << "Seq file: " + seqs_file_path + NL
# Locate the domains map file the same way (presumably only when domain data
# is wanted; the guarding condition is not visible here).
261 domains_mapfile_name = get_file( mapping_files_dir , phylogeny_id, Constants::DOMAINS_TO_FORESTER_OUTFILE_SUFFIX )
262 domains_mapfile_path = Util.canonical_path(mapping_files_dir, domains_mapfile_name)
264 Util.check_file_for_readability( domains_mapfile_path )
266 Util.fatal_error( PRG_NAME, "failed to read from [#{domains_mapfile_path}]: " + $!.to_s )
269 Util.print_message( PRG_NAME, "Domains file: " + domains_mapfile_path )
271 log << "Domains file: " + domains_mapfile_path + NL
# First stage produces TMP_FILE_1: either a plain copy of the phylogeny or the
# result of a '-f=m' decorator run with the sequence file (presumably
# alternative branches; the branching lines are not visible here).
# NOTE(review): FileUtils is used but no require 'fileutils' is visible among
# this file's requires -- confirm it is loaded elsewhere.
277 FileUtils.cp(phylogeny_file, TMP_FILE_1)
280 ' -t -p -f=m ' + phylogeny_file + ' ' +
281 seqs_file_path + ' ' + TMP_FILE_1
286 execute_cmd( cmd, log )
288 Util.fatal_error( PRG_NAME, 'error: ' + $!.to_s )
# Domains stage: TMP_FILE_1 plus the domains map file -> TMP_FILE_2.
293 cmd = decorator + ' ' + DECORATOR_OPTIONS_DOMAINS + ' ' +
294 '-f=d ' + TMP_FILE_1 + ' ' +
295 domains_mapfile_path + ' ' + TMP_FILE_2
300 execute_cmd( cmd, log )
302 Util.fatal_error( PRG_NAME, 'error: ' + $!.to_s )
# Final '-f=n' stage writes the output file from the ids map file, reading
# either TMP_FILE_1 or TMP_FILE_2 (presumably depending on whether the domains
# stage ran); the temporary files used are deleted afterwards.
306 opts = DECORATOR_OPTIONS_SEQ_NAMES
312 cmd = decorator + ' ' + opts + ' -f=n ' + TMP_FILE_1 + ' ' +
313 ids_mapfile_path + ' ' + outfile
318 execute_cmd( cmd, log )
320 Util.fatal_error( PRG_NAME, 'error: ' + $!.to_s )
322 File.delete( TMP_FILE_1 )
324 cmd = decorator + ' ' + opts + ' -f=n ' + TMP_FILE_2 + ' ' +
325 ids_mapfile_path + ' ' + outfile
330 execute_cmd( cmd, log )
332 Util.fatal_error( PRG_NAME, 'error: ' + $!.to_s )
334 File.delete( TMP_FILE_1 )
335 File.delete( TMP_FILE_2 )
# LOG_FILE is opened for writing (the block body is not visible here).
339 open( LOG_FILE, 'w' ) do | f |
345 Util.print_message( PRG_NAME, 'OK' )
# Runs +cmd+ through IO.popen and records the invocation and its output.
#
# cmd:: command line string to execute
# log:: receiver of log text; only << is called on it here
#
# Raises StandardError with the message "failed to execute <cmd>".
# NOTE(review): the condition guarding the raise and the block/method
# terminators are not visible in this section.
349 def execute_cmd( cmd, log )
350 log << 'executing ' + cmd + NL
351 IO.popen( cmd , 'r+' ) do | pipe |
# Everything readable from the pipe is appended to the log in one piece,
# followed by two line delimiters.
353 log << pipe.read + NL + NL
356 raise StandardError, "failed to execute " + cmd
# Finds the file belonging to +phylogeny_id+ by delegating to
# Util.get_matching_file with the arguments passed through unchanged.
#
# files_in_dir::   handed to Util.get_matching_file as is; callers in this
#                  class pass the mapping files directory
# phylogeny_id::   id to match; callers in this class pass the phylogeny
#                  file name
# suffix_pattern:: suffix constant selecting the kind of mapping file
#
# A failure is reported through Util.fatal_error with the current exception
# text ($!) -- presumably from a rescue clause whose line is not visible in
# this section.
361 def get_file( files_in_dir, phylogeny_id, suffix_pattern )
363 Util.get_matching_file( files_in_dir, phylogeny_id, suffix_pattern )
365 Util.fatal_error( PRG_NAME, 'error: ' + $!.to_s )
# Usage text written to standard output with puts.
# NOTE(review): the enclosing `def` line is not visible in this section.
# Two invocation forms are shown: suffix based (mapping directory optional)
# and directory based.
372 puts " " + PRG_NAME + ".rb [options] <suffix of in-trees to be decorated> <suffix for decorated out-trees> [mapping files directory, default: current dir]"
374 puts " " + PRG_NAME + ".rb [options] <input directory> <output directory> <mapping files directory>"
# Mapping files, named by the suffix constants used for the lookups.
376 puts " required file (in mapping files directory): " + "name mappings : #{Constants::ID_MAP_FILE_SUFFIX}"
377 puts " optional files (in mapping files directory): " + "sequences : #{Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX}"
378 puts " " + "domain architectures: #{Constants::DOMAINS_TO_FORESTER_OUTFILE_SUFFIX}"
# Supported options, one line each.
380 puts " options: -" + NO_DOMAINS_OPTION + ": to not add domain architecture information (#{Constants::DOMAINS_TO_FORESTER_OUTFILE_SUFFIX} file)"
381 puts " -" + NO_SEQS_OPTION + ": to not add molecular sequence information (#{Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX} file)"
382 puts " -" + EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION + ": to extract bracketed taxonomic codes, e.g. [NEMVE]"
383 puts " -" + VERBOSE_OPTION + " : verbose"
# Example invocations.
385 puts "Examples: " + PRG_NAME + ".rb .xml _d.xml"
386 puts " " + PRG_NAME + ".rb -#{NO_DOMAINS_OPTION} -#{NO_SEQS_OPTION} .xml _d.xml"
387 puts " " + PRG_NAME + ".rb -#{NO_DOMAINS_OPTION} -#{NO_SEQS_OPTION} .xml _d.xml mappings_dir"
389 puts " " + PRG_NAME + ".rb in_trees_dir out_dir mappings_dir"
392 end # class PhylogeniesDecorator