1 #!/usr/local/bin/ruby -w
3 # = lib/evo/apps/phylogenies_decorator
5 # Copyright:: Copyright (C) 2017 Christian M. Zmasek
6 # License:: GNU Lesser General Public License (LGPL)
8 # decoration of phylogenies with sequence/species names and domain architectures
10 # Environment variable FORESTER_HOME needs to point to the appropriate
11 # directory (e.g. setenv FORESTER_HOME $HOME/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester/)
13 require 'lib/evo/util/constants'
14 require 'lib/evo/util/util'
15 require 'lib/evo/util/command_line_arguments'
# Command-line driver that decorates phylogeny files found in the current
# directory by shelling out to the forester `decorator` Java application
# (the command line is assembled from JAVA_HOME and FORESTER_HOME below).
20 class PhylogeniesDecorator
# Base decorator options for the name-mapping pass (the one run with "-f=n").
22 DECORATOR_OPTIONS_SEQ_NAMES = '-p -t -mp -or'
# Decorator options for the domain-architecture pass (the one run with "-f=d").
23 DECORATOR_OPTIONS_DOMAINS = '-p -t'
# NOTE(review): not referenced by any line visible in this file excerpt.
24 DOMAINS_MAPFILE_SUFFIX = '.dff'
# Scratch files created in the working directory between decorator passes;
# stale copies are deleted before processing starts and after each tree.
27 TMP_FILE_1 = '___PD1___'
28 TMP_FILE_2 = '___PD2___'
# Log file written in the working directory; the program refuses to run when it
# already exists. Its "00" prefix means the input filter (`!~ /^00/`) skips it.
29 LOG_FILE = '00_phylogenies_decorator.log'
# Installation roots read from the environment; either may be nil here and is
# validated (set, non-empty, existing on disk) before use.
30 FORESTER_HOME = ENV[Constants::FORESTER_HOME_ENV_VARIABLE]
31 JAVA_HOME = ENV[Constants::JAVA_HOME_ENV_VARIABLE]
# Program metadata; PRG_NAME is passed as first argument to every Util message call.
33 PRG_NAME = "phylogenies_decorator"
35 PRG_DESC = "decoration of phylogenies with sequence/species names and domain architectures"
37 WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
# Names of command-line options queried through CommandLineArguments#is_option_set?.
# NOTE(review): HELP_OPTION_2, NO_SEQS_OPTION, VERBOSE_OPTION, PRG_VERSION and
# PRG_DATE are used further down but are not defined on any visible line --
# presumably they sit on lines missing from this excerpt; confirm.
39 HELP_OPTION_1 = "help"
41 NO_DOMAINS_OPTION = 'nd'
44 EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION = 'tc'
# Line delimiter appended to every log entry.
46 NL = Constants::LINE_DELIMITER
# Interior of the class's main routine: validates the environment and the
# command line, then decorates every matching phylogeny file in the current
# directory by running the forester decorator on it.
# NOTE(review): the `def` line opening this routine and many structural lines
# (`begin`, `else`, `end`, several assignments) are absent from this excerpt;
# the comments below describe only statements that are actually visible.
49 Util.print_program_information( PRG_NAME,
# Guard for a missing or too-short argument list (the guarded action is not visible).
56 if ( ARGV == nil || ARGV.length < 2 )
# Both environment-derived paths must be set, non-empty and present on disk;
# each failure is reported through Util.fatal_error (presumably terminating -- confirm).
61 if FORESTER_HOME == nil || FORESTER_HOME.length < 1
62 Util.fatal_error( PRG_NAME, "apparently environment variable #{Constants::FORESTER_HOME_ENV_VARIABLE} has not been set" )
64 if JAVA_HOME == nil || JAVA_HOME.length < 1
65 Util.fatal_error( PRG_NAME, "apparently environment variable #{Constants::JAVA_HOME_ENV_VARIABLE} has not been set" )
68 if !File.exist?( FORESTER_HOME )
69 Util.fatal_error( PRG_NAME, '[' + FORESTER_HOME + '] does not exist' )
71 if !File.exist?( JAVA_HOME )
72 Util.fatal_error( PRG_NAME, '[' + JAVA_HOME + '] does not exist' )
# Command prefix that starts the decorator class from forester.jar with the JVM
# under JAVA_HOME. NOTE(review): the paths are concatenated unquoted and the
# resulting string is later run via IO.popen, so a path containing spaces or
# shell metacharacters would break (or alter) the command.
75 decorator = JAVA_HOME + '/bin/java -cp ' + FORESTER_HOME + '/java/forester.jar org.forester.application.decorator'
# Parse the command line; a parsing ArgumentError is reported as a fatal error.
78 cla = CommandLineArguments.new( ARGV )
79 rescue ArgumentError => e
80 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
# Help requested through either help option (the action taken is not visible).
83 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
84 cla.is_option_set?( HELP_OPTION_2 ) )
# Exactly two non-option arguments are expected (input suffix and output suffix).
89 if ( cla.get_number_of_files != 2 )
# Whitelist of accepted options; anything else is reported as unknown.
94 allowed_opts = Array.new
95 allowed_opts.push(NO_DOMAINS_OPTION)
96 allowed_opts.push(NO_SEQS_OPTION)
97 allowed_opts.push(EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION)
98 allowed_opts.push(VERBOSE_OPTION)
100 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
101 if ( disallowed.length > 0 )
102 Util.fatal_error( PRG_NAME,
103 "unknown option(s): " + disallowed,
# Translate options into boolean flags. NOTE(review): the assignments made for
# the no-domains, no-seqs and verbose options are on lines not visible here.
108 if cla.is_option_set?(NO_DOMAINS_OPTION)
112 no_seqs_files = false
113 if cla.is_option_set?(NO_SEQS_OPTION)
117 extr_bracketed_tc = false
118 if cla.is_option_set?(EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION)
119 extr_bracketed_tc = true
123 if cla.is_option_set?(VERBOSE_OPTION)
# Never overwrite the log of a previous run.
127 if File.exist?( LOG_FILE )
128 Util.fatal_error( PRG_NAME, 'logfile [' + LOG_FILE + '] already exists' )
# The two positional arguments are file-name suffixes, not file names.
131 in_suffix = cla.get_file_name( 0 )
132 out_suffix = cla.get_file_name( 1 )
# Log header. NOTE(review): `log`, `now` and `no_domains` are assigned on lines
# not visible in this excerpt.
137 log << "Program : " + PRG_NAME + NL
138 log << "Version : " + PRG_VERSION + NL
139 log << "Program date : " + PRG_DATE + NL
140 log << "No domains data : " + no_domains.to_s + NL
141 log << "No mol seq data : " + no_seqs_files.to_s + NL
142 log << "Extract tax codes : " + extr_bracketed_tc.to_s + NL
143 log << "Date/time: " + now.to_s + NL
144 log << "Directory: " + Dir.getwd + NL + NL
146 Util.print_message( PRG_NAME, 'input suffix : ' + in_suffix )
147 Util.print_message( PRG_NAME, 'output suffix : ' + out_suffix )
149 log << 'input suffix : ' + in_suffix + NL
150 log << 'output suffix : ' + out_suffix + NL
# Remove scratch files left behind by an earlier run.
152 if ( File.exist?( TMP_FILE_1 ) )
153 File.delete( TMP_FILE_1 )
155 if ( File.exist?( TMP_FILE_2 ) )
156 File.delete( TMP_FILE_2 )
# Walk every entry of the current directory.
159 files = Dir.entries( "." )
# An entry is processed when it is not a directory, does not start with "." or
# "00", does not already end in the output suffix, and ends in the input suffix.
# NOTE(review): both suffixes are interpolated into regexes unescaped, so the
# "." of a suffix such as ".xml" matches any character; Regexp.escape would make
# the match literal.
163 files.each { | phylogeny_file |
164 if ( !File.directory?( phylogeny_file ) &&
165 phylogeny_file !~ /^\./ &&
166 phylogeny_file !~ /^00/ &&
167 phylogeny_file !~ /#{out_suffix}$/ &&
168 phylogeny_file =~ /#{in_suffix}$/ )
170 Util.check_file_for_readability( phylogeny_file )
# NOTE(review): `$!` is an exception object, not a String; on Ruby >= 1.9
# `String + $!` looks like it raises TypeError inside this error handler.
# `$!.to_s` (or interpolation) is needed here and in every `+ $!` below.
172 Util.fatal_error( PRG_NAME, 'can not read from: ' + phylogeny_file + ': '+ $! )
# Output name: the trailing input suffix is swapped for the output suffix ...
177 outfile = phylogeny_file.sub( /#{in_suffix}$/, out_suffix )
# ... and the first "_ni_" in the name is collapsed to "_" (whether this step is
# conditional is not visible here).
180 outfile = outfile.sub( /_ni_/, '_' )
# Existing outputs are reported and skipped rather than overwritten.
# NOTE(review): `counter` is assigned on a line not visible in this excerpt.
183 if File.exist?( outfile )
184 msg = counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile +
185 ' : already exists, skipping'
186 Util.print_message( PRG_NAME, msg )
194 Util.print_message( PRG_NAME, counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile )
195 log << counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile + NL
# The id used to locate companion files; get_id currently returns the file name
# unchanged.
197 phylogeny_id = get_id( phylogeny_file )
198 if phylogeny_id == nil || phylogeny_id.size < 1
199 Util.fatal_error( PRG_NAME, 'could not get id from ' + phylogeny_file.to_s )
202 Util.print_message( PRG_NAME, "Id: " + phylogeny_id )
204 log << "Id: " + phylogeny_id + NL
# Locate and check the companion files (id map, sequences, domain architectures).
206 ids_mapfile_name = nil
207 domains_mapfile_name = nil
210 ids_mapfile_name = get_file( ".", phylogeny_id, Constants::ID_MAP_FILE_SUFFIX )
213 Util.check_file_for_readability( ids_mapfile_name )
# NOTE(review): this message (and the two analogous ones below) is single-quoted,
# so `#{...}` is printed literally instead of the file name; it needs double quotes.
215 Util.fatal_error( PRG_NAME, 'failed to read from [#{ids_mapfile_name}]: ' + $! )
218 Util.print_message( PRG_NAME, "Ids mapfile: " + ids_mapfile_name )
220 log << "Ids mapfile: " + ids_mapfile_name + NL
223 seqs_file_name = get_file( ".", phylogeny_id, Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX )
225 Util.check_file_for_readability( seqs_file_name )
227 Util.fatal_error( PRG_NAME, 'failed to read from [#{seqs_file_name }]: ' + $! )
230 Util.print_message( PRG_NAME, "Seq file: " + seqs_file_name )
232 log << "Seq file: " + seqs_file_name + NL
236 domains_mapfile_name = get_file( ".", phylogeny_id, Constants::DOMAINS_TO_FORESTER_OUTFILE_SUFFIX )
238 Util.check_file_for_readability( domains_mapfile_name )
240 Util.fatal_error( PRG_NAME, 'failed to read from [#{domains_mapfile_name}]: ' + $! )
243 Util.print_message( PRG_NAME, "Domains file: " + domains_mapfile_name )
245 log << "Domains file: " + domains_mapfile_name + NL
# The input tree is copied to the first scratch file (presumably the branch taken
# when no sequence data is added -- the condition is not visible).
# NOTE(review): no `require 'fileutils'` appears among the visible requires.
249 FileUtils.cp(phylogeny_file, TMP_FILE_1)
# Tail of the command for the "-f=m" pass: input tree, sequence file, and
# TMP_FILE_1 as the last argument (presumably the output -- confirm).
252 ' -t -p -f=m ' + phylogeny_file + ' ' +
253 seqs_file_name + ' ' + TMP_FILE_1
258 execute_cmd( cmd, log )
260 Util.fatal_error( PRG_NAME, 'error: ' + $! )
# Domain-architecture pass ("-f=d"): TMP_FILE_1 plus the domains map, with
# TMP_FILE_2 as the last argument.
265 cmd = decorator + ' ' + DECORATOR_OPTIONS_DOMAINS + ' ' +
266 '-f=d ' + TMP_FILE_1 + ' ' +
267 domains_mapfile_name + ' ' + TMP_FILE_2
272 execute_cmd( cmd, log )
274 Util.fatal_error( PRG_NAME, 'error: ' + $! )
# Name-mapping pass ("-f=n"); lines between this assignment and the command are
# not visible, so `opts` may be extended there.
278 opts = DECORATOR_OPTIONS_SEQ_NAMES
# Variant that maps names starting from TMP_FILE_1 and then removes only that
# scratch file (presumably the no-domains branch -- confirm).
284 cmd = decorator + ' ' + opts + ' -f=n ' + TMP_FILE_1 + ' ' +
285 ids_mapfile_name + ' ' + outfile
290 execute_cmd( cmd, log )
292 Util.fatal_error( PRG_NAME, 'error: ' + $! )
294 File.delete( TMP_FILE_1 )
# Variant that maps names starting from TMP_FILE_2 and then removes both scratch files.
296 cmd = decorator + ' ' + opts + ' -f=n ' + TMP_FILE_2 + ' ' +
297 ids_mapfile_name + ' ' + outfile
302 execute_cmd( cmd, log )
304 Util.fatal_error( PRG_NAME, 'error: ' + $! )
306 File.delete( TMP_FILE_1 )
307 File.delete( TMP_FILE_2 )
# Open the log file for writing (the block body is not visible; presumably it
# writes the accumulated `log`).
311 open( LOG_FILE, 'w' ) do | f |
317 Util.print_message( PRG_NAME, 'OK' )
# Runs +cmd+ as a child process and records the call and its output in +log+.
#
# cmd:: complete command line as a String (handed to IO.popen)
# log:: any object responding to <<; receives "executing <cmd>" and then
#       everything the child wrote to its standard output, each followed by NL
#
# Returns the value of the IO.popen block, i.e. the result of the last +log <<+.
def execute_cmd( cmd, log )
  log << "executing #{cmd}#{NL}"
  IO.popen( cmd, 'r+' ) do |pipe|
    # Nothing is ever written to the child. Closing the write end delivers EOF
    # on its stdin, so a child that reads stdin cannot block this read forever.
    pipe.close_write
    log << "#{pipe.read}#{NL}#{NL}"
  end
end
# Derives the identifier used to look up the companion files of a phylogeny.
#
# The identifier is simply the file name, returned unchanged. A prefix-based
# scheme (first /^(.+?_DA)_/, otherwise /^(.+?)_/) existed here only as
# commented-out code and is not active.
#
# phylogeny_file_name:: name of the phylogeny file
#
# Returns +phylogeny_file_name+ itself (same object; +nil+ stays +nil+).
def get_id( phylogeny_file_name )
  phylogeny_file_name
end
# Looks up the companion file for +phylogeny_id+ by delegating to
# Util.get_matching_file with the arguments passed through unchanged.
#
# files_in_dir::   forwarded as first argument; despite the name, the callers in
#                  this class pass the directory path "."
# phylogeny_id::   identifier the file has to match
# suffix_pattern:: suffix the file has to match
#
# Returns whatever Util.get_matching_file returns.
#
# NOTE(review): an older inline version (previously kept here as commented-out
# code) called Util.get_matching_files and reported a fatal error unless exactly
# one file matched; confirm that Util.get_matching_file gives the same guarantee.
def get_file( files_in_dir, phylogeny_id, suffix_pattern )
  Util.get_matching_file( files_in_dir, phylogeny_id, suffix_pattern )
end
# Picks the single sequence file that belongs to +phylogeny_id+.
#
# files_in_dir:: enumerable of entry names, tested relative to the current directory
# phylogeny_id:: identifier; matched literally as the start of the file name
#
# A candidate is a non-directory entry named either "<id>__<something>" ending
# in a digit, or "<id>_<anything>.fasta". Util.fatal_error is called when no
# candidate or more than one candidate exists.
#
# The id is passed through Regexp.escape before it is placed in the patterns, so
# characters such as "." in an id match only themselves.
#
# NOTE(review): the excerpt this method was rebuilt from shows neither its final
# statement nor two lines inside the filter condition. Returning the single
# match is the evident contract -- confirm against the full file.
def get_seq_file( files_in_dir, phylogeny_id )
  escaped_id = Regexp.escape( phylogeny_id.to_s )
  numbered_pattern = /^#{escaped_id}__.+\d$/
  fasta_pattern = /^#{escaped_id}_.*\.fasta$/

  matching_files = files_in_dir.select do |file|
    !File.directory?( file ) &&
      ( file =~ numbered_pattern || file =~ fasta_pattern )
  end

  if matching_files.empty?
    Util.fatal_error( PRG_NAME,
      "no seq file matching [#{phylogeny_id}_] present in current directory" )
  end
  if matching_files.length > 1
    Util.fatal_error( PRG_NAME,
      "more than one seq file matching [#{phylogeny_id}_] present in current directory" )
  end
  matching_files.first
end
# Usage text written to standard output: synopsis, the companion files expected
# in the working directory, the supported options, and two example invocations.
# NOTE(review): the `def` line of the routine these statements belong to is not
# visible in this excerpt, and NO_SEQS_OPTION / VERBOSE_OPTION are not defined on
# any visible line.
# Synopsis: the two positional arguments are suffixes of input and output trees.
381 puts " " + PRG_NAME + ".rb <suffix of in-trees to be decorated> <suffix for decorated out-trees> "
# Companion files looked up per tree.
383 puts " required files (in this dir): " + "name mappings : .nim"
384 puts " " + "sequences : _ni.fasta"
385 puts " " + "domain architectures: .dff"
# Options.
387 puts " options: -" + NO_DOMAINS_OPTION + ": to not add domain architecture information (.dff file)"
388 puts " -" + NO_SEQS_OPTION + ": to not add molecular sequence information (_ni.fasta file)"
389 puts " -" + EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION + ": to extract bracketed taxonomic codes, e.g. [NEMVE]"
390 puts " -" + VERBOSE_OPTION + ": verbose"
# Example invocations.
392 puts "Examples: " + PRG_NAME + ".rb .xml _d.xml"
393 puts " " + PRG_NAME + ".rb -#{NO_DOMAINS_OPTION} -#{NO_SEQS_OPTION} .xml _d.xml"
396 end # class PhylogeniesDecorator