1 #!/usr/local/bin/ruby -w
3 # = lib/evo/apps/phylogenies_decorator
5 # Copyright:: Copyright (C) 2017 Christian M. Zmasek
6 # License:: GNU Lesser General Public License (LGPL)
8 # Last modified: 2017/02/09
10 # decoration of phylogenies with sequence/species names and domain architectures
12 # Environment variable FORESTER_HOME needs to point to the appropriate
13 # directory (e.g. setenv FORESTER_HOME $HOME/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester/)
15 require 'lib/evo/util/constants'
16 require 'lib/evo/util/util'
17 require 'lib/evo/util/command_line_arguments'
21 class PhylogeniesDecorator
# Configuration constants of the decorator script.
#
# NOTE(review): PRG_VERSION, PRG_DATE and HELP_OPTION_2 are referenced by the
# code of this class, but no definition is visible in this part of the file --
# confirm that they are defined.

# Earlier variants of the sequence-name option string, kept for reference.
23 #DECORATOR_OPTIONS_SEQ_NAMES = '-r=1 -mdn'
24 #DECORATOR_OPTIONS_SEQ_NAMES = '-p -t -sn'
25 #DECORATOR_OPTIONS_SEQ_NAMES = '-p -t -tc -mp -or'
# Options placed on the decorator command line for the name-mapping ('-f=n') pass.
26 DECORATOR_OPTIONS_SEQ_NAMES = '-p -t -mp -or'
27 # -mdn is a hidden expert option to rename e.g. "6_ORYLA3" to "6_[3]_ORYLA"
28 #DECORATOR_OPTIONS_SEQ_NAMES = '-sn -r=1'
29 #DECORATOR_OPTIONS_DOMAINS = '-r=1'
# Options placed on the decorator command line for the domain ('-f=d') pass.
30 DECORATOR_OPTIONS_DOMAINS = '-p -t'
# Suffixes handed to get_file to locate the id map file and the domain map
# file that belong to a phylogeny.
31 IDS_MAPFILE_SUFFIX = '.nim'
32 DOMAINS_MAPFILE_SUFFIX = '.dff'
# When not nil, this file is used as the id map file for every phylogeny
# instead of looking one up per phylogeny id.
35 FIXED_NIM_FILE = nil #'all.nim' #TODO this should be a command line option
# Names of the intermediate files passed between decorator invocations; stale
# copies are deleted before processing starts.
36 TMP_FILE_1 = '___PD1___'
37 TMP_FILE_2 = '___PD2___'
# Name of the log file; the program refuses to start if it already exists.
38 LOG_FILE = '00_phylogenies_decorator.log'
# Installation directories, read from the environment when the class is loaded.
39 FORESTER_HOME = ENV[Constants::FORESTER_HOME_ENV_VARIABLE]
40 JAVA_HOME = ENV[Constants::JAVA_HOME_ENV_VARIABLE]
# Program identification used in messages and in the log.
42 PRG_NAME = "phylogenies_decorator"
44 PRG_DESC = "decoration of phylogenies with sequence/species names and domain architectures"
46 WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
# Command line option names.
48 HELP_OPTION_1 = "help"
50 NO_DOMAINS_OPTION = 'nd'
# Line delimiter appended to log entries.
52 NL = Constants::LINE_DELIMITER
# Body of the main processing routine (its method header and a number of
# control-flow lines are not visible in this part of the file).
#
# Visible steps: print the program banner, validate the environment and the
# command line, then, for every file in the current directory whose name ends
# in the input suffix, locate the matching id map, sequence and domain map
# files and invoke the Java decorator on them, collecting a log that is
# written to LOG_FILE at the end.
#
# NOTE(review): 'log', 'now' and 'counter' are used below but their
# initialisation is not visible here -- confirm against the full file.
55 Util.print_program_information( PRG_NAME,
# Two arguments (input suffix and output suffix) are the minimum.
62 if ( ARGV == nil || ARGV.length < 2 )
# Both environment variables must be set and must name existing paths.
67 if FORESTER_HOME == nil || FORESTER_HOME.length < 1
68 Util.fatal_error( PRG_NAME, "apparently environment variable #{Constants::FORESTER_HOME_ENV_VARIABLE} has not been set" )
70 if JAVA_HOME == nil || JAVA_HOME.length < 1
71 Util.fatal_error( PRG_NAME, "apparently environment variable #{Constants::JAVA_HOME_ENV_VARIABLE} has not been set" )
74 if !File.exist?( FORESTER_HOME )
75 Util.fatal_error( PRG_NAME, '[' + FORESTER_HOME + '] does not exist' )
77 if !File.exist?( JAVA_HOME )
78 Util.fatal_error( PRG_NAME, '[' + JAVA_HOME + '] does not exist' )
# Command prefix shared by every decorator invocation below.
81 decorator = JAVA_HOME + '/bin/java -cp ' + FORESTER_HOME + '/java/forester.jar org.forester.application.decorator'
# Parse the command line; a malformed command line is fatal.
84 cla = CommandLineArguments.new( ARGV )
85 rescue ArgumentError => e
86 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
89 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
90 cla.is_option_set?( HELP_OPTION_2 ) )
# Exactly two non-option arguments are expected.
95 if ( cla.get_number_of_files != 2 )
# The only option accepted is NO_DOMAINS_OPTION; anything else is reported.
100 allowed_opts = Array.new
101 allowed_opts.push(NO_DOMAINS_OPTION)
103 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
104 if ( disallowed.length > 0 )
105 Util.fatal_error( PRG_NAME,
106 "unknown option(s): " + disallowed,
111 if cla.is_option_set?(NO_DOMAINS_OPTION)
# Never overwrite the log of a previous run.
115 if File.exist?( LOG_FILE )
116 Util.fatal_error( PRG_NAME, 'logfile [' + LOG_FILE + '] already exists' )
# First argument: suffix of the input trees; second: suffix for the output trees.
119 in_suffix = cla.get_file_name( 0 )
120 out_suffix = cla.get_file_name( 1 )
# Log header: program identification, configuration, time and directory.
125 log << "Program : " + PRG_NAME + NL
126 log << "Version : " + PRG_VERSION + NL
127 log << "Program date : " + PRG_DATE + NL
128 log << "Options for seq names: " + DECORATOR_OPTIONS_SEQ_NAMES + NL
129 log << "Options for domains : " + DECORATOR_OPTIONS_DOMAINS + NL
130 log << "FORESTER_HOME : " + FORESTER_HOME + NL
131 log << "JAVA_HOME : " + JAVA_HOME + NL + NL
132 log << "Date/time: " + now.to_s + NL
133 log << "Directory: " + Dir.getwd + NL + NL
135 Util.print_message( PRG_NAME, 'input suffix : ' + in_suffix )
136 Util.print_message( PRG_NAME, 'output suffix : ' + out_suffix )
138 log << 'input suffix : ' + in_suffix + NL
139 log << 'output suffix : ' + out_suffix + NL
# Remove intermediate files left behind by an earlier run.
141 if ( File.exist?( TMP_FILE_1 ) )
142 File.delete( TMP_FILE_1 )
144 if ( File.exist?( TMP_FILE_2 ) )
145 File.delete( TMP_FILE_2 )
# Candidate files are the entries of the current directory.
148 files = Dir.entries( "." )
# Process plain files that end in the input suffix, skipping dot files, files
# whose name starts with "00" and files that already end in the output suffix.
# NOTE(review): both suffixes are interpolated into the regexes unescaped, so
# a "." in a suffix matches any character -- confirm that this is intended.
152 files.each { | phylogeny_file |
153 if ( !File.directory?( phylogeny_file ) &&
154 phylogeny_file !~ /^\./ &&
155 phylogeny_file !~ /^00/ &&
156 phylogeny_file !~ /#{out_suffix}$/ &&
157 phylogeny_file =~ /#{in_suffix}$/ )
159 Util.check_file_for_readability( phylogeny_file )
# NOTE(review): here and in the other messages below that end in "+ $!", a
# String is concatenated with the exception object itself; on Ruby 1.9 and
# later this presumably raises a TypeError and needs $!.to_s -- confirm.
161 Util.fatal_error( PRG_NAME, 'can not read from: ' + phylogeny_file + ': '+ $! )
# Output name: the input suffix replaced by the output suffix, and the first
# "_ni_" in the name replaced by "_".
166 outfile = phylogeny_file.sub( /#{in_suffix}$/, out_suffix )
169 outfile = outfile.sub( /_ni_/, '_' )
# Existing output files are reported and not regenerated.
172 if File.exist?( outfile )
173 msg = counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile +
174 ' : already exists, skipping'
175 Util.print_message( PRG_NAME, msg )
180 Util.print_message( PRG_NAME, counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile )
181 log << counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile + NL
# The id derived from the file name selects the companion files.
183 phylogeny_id = get_id( phylogeny_file )
184 if phylogeny_id == nil || phylogeny_id.size < 1
185 Util.fatal_error( PRG_NAME, 'could not get id from ' + phylogeny_file.to_s )
188 Util.print_message( PRG_NAME, "Id: " + phylogeny_id )
189 log << "Id: " + phylogeny_id + NL
191 ids_mapfile_name = nil
192 domains_mapfile_name = nil
# Id map file: looked up per phylogeny unless a fixed file is configured.
195 if ( FIXED_NIM_FILE == nil )
196 ids_mapfile_name = get_file( files, phylogeny_id, IDS_MAPFILE_SUFFIX )
198 ids_mapfile_name = FIXED_NIM_FILE
202 Util.check_file_for_readability( ids_mapfile_name )
# NOTE(review): the three 'failed to read from [...]' messages are
# single-quoted, so the interpolation marker inside the brackets is printed
# literally instead of the file name -- they look like they were meant to be
# double-quoted; confirm.
204 Util.fatal_error( PRG_NAME, 'failed to read from [#{ids_mapfile_name}]: ' + $! )
206 Util.print_message( PRG_NAME, "Ids mapfile: " + ids_mapfile_name )
207 log << "Ids mapfile: " + ids_mapfile_name + NL
# Sequence file belonging to this phylogeny.
209 seqs_file_name = get_seq_file( files, phylogeny_id )
211 Util.check_file_for_readability( seqs_file_name )
213 Util.fatal_error( PRG_NAME, 'failed to read from [#{seqs_file_name }]: ' + $! )
215 Util.print_message( PRG_NAME, "Seq file: " + seqs_file_name )
216 log << "Seq file: " + seqs_file_name + NL
# Domain map file belonging to this phylogeny.
219 domains_mapfile_name = get_file( files, phylogeny_id, DOMAINS_MAPFILE_SUFFIX )
221 Util.check_file_for_readability( domains_mapfile_name )
223 Util.fatal_error( PRG_NAME, 'failed to read from [#{domains_mapfile_name}]: ' + $! )
225 Util.print_message( PRG_NAME, "Domains file: " + domains_mapfile_name )
226 log << "Domains file: " + domains_mapfile_name + NL
# First decorator pass ('-f=m'): phylogeny file + sequence file -> TMP_FILE_1
# (the start of this command expression is not visible here).
230 ' -t -p -f=m ' + phylogeny_file + ' ' +
231 seqs_file_name + ' ' + TMP_FILE_1
234 execute_cmd( cmd, log )
236 Util.fatal_error( PRG_NAME, 'error: ' + $! )
# Domain pass ('-f=d'): TMP_FILE_1 + domain map file -> TMP_FILE_2.
240 cmd = decorator + ' ' + DECORATOR_OPTIONS_DOMAINS + ' ' +
241 '-f=d ' + TMP_FILE_1 + ' ' +
242 domains_mapfile_name + ' ' + TMP_FILE_2
245 execute_cmd( cmd, log )
247 Util.fatal_error( PRG_NAME, 'error: ' + $! )
# Name pass ('-f=n') reading TMP_FILE_1 directly -- presumably the variant
# used when domain decoration is switched off; confirm against the full file.
252 cmd = decorator + ' ' + DECORATOR_OPTIONS_SEQ_NAMES + ' ' +
253 '-f=n ' + TMP_FILE_1 + ' ' +
254 ids_mapfile_name + ' ' + outfile
257 execute_cmd( cmd, log )
259 Util.fatal_error( PRG_NAME, 'error: ' + $! )
261 File.delete( TMP_FILE_1 )
# Name pass ('-f=n') reading TMP_FILE_2 -- presumably the variant used after
# the domain pass; both intermediate files are removed afterwards.
263 cmd = decorator + ' ' + DECORATOR_OPTIONS_SEQ_NAMES + ' ' +
264 '-f=n ' + TMP_FILE_2 + ' ' +
265 ids_mapfile_name + ' ' + outfile
268 execute_cmd( cmd, log )
270 Util.fatal_error( PRG_NAME, 'error: ' + $! )
272 File.delete( TMP_FILE_1 )
273 File.delete( TMP_FILE_2 )
# The log file is opened for writing (the block body is not visible here).
277 open( LOG_FILE, 'w' ) do | f |
281 Util.print_message( PRG_NAME, 'OK' )
# Runs +cmd+ as a child process and records the call in +log+.
#
# cmd:: complete command line to execute (String)
# log:: buffer responding to <<; receives the line "executing <cmd>", then
#       everything the child wrote to its standard output, then two line
#       delimiters
#
# Exceptions raised while starting the child or reading from it are not
# handled here; they propagate to the caller.
def execute_cmd( cmd, log )
  log << 'executing ' + cmd + NL
  IO.popen( cmd, 'r+' ) do | pipe |
    # The pipe is opened read/write but nothing is ever sent to the child;
    # closing the write end first means a child that reads its standard
    # input sees end-of-file instead of waiting forever.
    pipe.close_write
    log << pipe.read + NL + NL
  end
end
# Derives the phylogeny id from the name of a phylogeny file.
#
# phylogeny_file_name:: file name to inspect (String)
#
# Returns the shortest leading part of the name that ends in "_DA" and is
# followed by "_"; when the name has no such part, the text in front of the
# first "_" that is preceded by at least one character. Returns nil when
# neither pattern matches, which the caller treats as a fatal error.
def get_id( phylogeny_file_name )
  if phylogeny_file_name =~ /^(.+?_DA)_/
    Regexp.last_match( 1 )
  elsif phylogeny_file_name =~ /^(.+?)_/
    Regexp.last_match( 1 )
  end
end
# Finds the one companion file of a phylogeny that carries the given suffix.
#
# files_in_dir::   names of the candidate files
# phylogeny_id::   id of the phylogeny (String)
# suffix_pattern:: suffix of the wanted file (String), e.g. IDS_MAPFILE_SUFFIX
#
# The selection itself is delegated to Util.get_matching_files. Zero matches
# and more than one match are both reported through Util.fatal_error.
# Returns the name of the single matching file.
def get_file( files_in_dir, phylogeny_id, suffix_pattern )
  matching_files = Util.get_matching_files( files_in_dir, phylogeny_id, suffix_pattern )
  if matching_files.length < 1
    Util.fatal_error( PRG_NAME, 'no file matching [' + phylogeny_id +
      '...' + suffix_pattern + '] present in current directory' )
  end
  if matching_files.length > 1
    Util.fatal_error( PRG_NAME, 'more than one file matching [' +
      phylogeny_id + '...' + suffix_pattern + '] present in current directory' )
  end
  matching_files[ 0 ]
end
316 def get_seq_file( files_in_dir, phylogeny_id )
317 matching_files = Array.new
319 files_in_dir.each { | file |
321 if ( !File.directory?( file ) &&
324 ( file =~ /^#{phylogeny_id}__.+\d$/ || file =~ /^#{phylogeny_id}_.*\.fasta$/ ) )
325 matching_files << file
329 if matching_files.length < 1
330 Util.fatal_error( PRG_NAME, 'no seq file matching [' +
331 phylogeny_id + '_] present in current directory' )
333 if matching_files.length > 1
334 Util.fatal_error( PRG_NAME, 'more than one seq file matching [' +
335 phylogeny_id + '_] present in current directory' )
# Usage text written to standard output (the header of the enclosing method
# is not visible in this part of the file).
# Synopsis: the two arguments are the suffix of the input trees and the
# suffix to give to the decorated output trees.
343 puts " " + PRG_NAME + ".rb <suffix of in-trees to be decorated> <suffix for decorated out-trees> "
# Companion files that have to be present in the working directory.
345 puts " required files (in this dir): " + "name mappings : .nim"
346 puts " " + "sequences : _ni.fasta"
347 puts " " + "domain architectures: .dff"
# The single supported option, named by NO_DOMAINS_OPTION.
349 puts " options: -" + NO_DOMAINS_OPTION + ": to not add domain architecture information (.dff file)"
351 puts "Example: " + PRG_NAME + ".rb .xml _d.xml"
354 end # class PhylogenyiesDecorator