1 #!/usr/local/bin/ruby -w
3 # = lib/evo/apps/phylogenies_decorator
5 # Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek
6 # License:: GNU Lesser General Public License (LGPL)
8 # decoration of phylogenies with sequence/species names and domain architectures
10 # $Id: phylogenies_decorator.rb,v 1.34 2010/12/13 19:00:11 cmzmasek Exp $
12 # Environment variable FORESTER_HOME needs to point to the appropriate
13 # directory (e.g. setenv FORESTER_HOME $HOME/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester/)
15 require 'lib/evo/util/constants'
16 require 'lib/evo/util/util'
17 require 'lib/evo/util/command_line_arguments'
# Batch driver around the forester "decorator" Java application.
#
# For every regular file in the current directory whose name ends in the input
# suffix (and neither starts with "." or "00" nor already ends in the output
# suffix), #run derives an id from the file name (see #get_id), looks up the
# companion files for that id (ids map file, domains map file, sequence file)
# and chains three decorator invocations through two temporary files:
#
#   decorator -t -p -f=m <phylogeny> <seqs file>        -> TMP_FILE_1
#   decorator <DECORATOR_OPTIONS_DOMAINS>   -f=d TMP_FILE_1 <domains map> -> TMP_FILE_2
#   decorator <DECORATOR_OPTIONS_SEQ_NAMES> -f=n TMP_FILE_2 <ids map>     -> <outfile>
#
# Everything printed to the console is also collected in a log string that is
# written to LOG_FILE once all files have been processed.
#
# Requires the environment variables named by
# Constants::FORESTER_HOME_ENV_VARIABLE and Constants::JAVA_HOME_ENV_VARIABLE.
#
# NOTE(review): the copy of this file that was reviewed had lines missing
# (among them the `def run` / `def print_help` headers, the definitions of
# PRG_VERSION and HELP_OPTION_2, the initialisation of the log / counter and
# most `begin` / `rescue` / `end` lines). Every spot marked "reconstructed"
# below was inferred from how the visible code uses it and must be checked
# against the authoritative source before merging.
class PhylogeniesDecorator

  #DECORATOR_OPTIONS_SEQ_NAMES = '-r=1 -mdn'
  #DECORATOR_OPTIONS_SEQ_NAMES = '-p -t -sn'
  DECORATOR_OPTIONS_SEQ_NAMES = '-p -t -tc -mp -or'
  # -mdn is a hidden expert option to rename e.g. "6_ORYLA3" to "6_[3]_ORYLA"
  #DECORATOR_OPTIONS_SEQ_NAMES = '-sn -r=1'
  #DECORATOR_OPTIONS_DOMAINS = '-r=1'
  DECORATOR_OPTIONS_DOMAINS = '-p -t'
  IDS_MAPFILE_SUFFIX        = '.nim'
  DOMAINS_MAPFILE_SUFFIX    = '_hmmscan_10.dff'

  TMP_FILE_1    = '___PD1___'
  TMP_FILE_2    = '___PD2___'
  LOG_FILE      = '00_phylogenies_decorator.log'
  FORESTER_HOME = ENV[Constants::FORESTER_HOME_ENV_VARIABLE]
  JAVA_HOME     = ENV[Constants::JAVA_HOME_ENV_VARIABLE]

  PRG_NAME = "phylogenies_decorator"
  PRG_DATE = "2013.11.15"
  PRG_DESC = "decoration of phylogenies with sequence/species names and domain architectures"
  # NOTE(review): reconstructed -- PRG_VERSION is used by #run but its
  # definition was not visible; confirm the real version string.
  PRG_VERSION = "1.02"
  COPYRIGHT = "2013 Christian M Zmasek"
  CONTACT   = "phylosoft@gmail.com"
  WWW       = "https://sites.google.com/site/cmzmasek/home/software/forester"

  HELP_OPTION_1 = "help"
  # NOTE(review): reconstructed -- HELP_OPTION_2 is used by #run but its
  # definition was not visible; confirm the real option name.
  HELP_OPTION_2 = "h"

  NL = Constants::LINE_DELIMITER

  # Program entry point: validates arguments and environment, then decorates
  # every matching phylogeny file in the current directory.
  #
  # Reads ARGV (two or three entries: input suffix, output suffix, plus
  # options understood by CommandLineArguments). Errors are reported through
  # Util.fatal_error, which presumably terminates the program -- confirm in Util.
  #
  # NOTE(review): the shell commands are assembled by plain string
  # concatenation, so file names or *_HOME paths containing whitespace or shell
  # metacharacters will break (or alter) the command line. Left unchanged here
  # because #execute_cmd takes a single command string.
  def run
    # NOTE(review): reconstructed argument list -- only the first argument
    # (PRG_NAME) was visible.
    Util.print_program_information( PRG_NAME, PRG_VERSION, PRG_DESC, PRG_DATE,
                                    COPYRIGHT, CONTACT, WWW, STDOUT )

    if ARGV.nil? || ARGV.length > 3 || ARGV.length < 2
      # NOTE(review): reconstructed branch body (help text + non-zero exit).
      print_help
      exit( -1 )
    end

    if FORESTER_HOME.nil? || FORESTER_HOME.empty?
      Util.fatal_error( PRG_NAME, "apparently environment variable #{Constants::FORESTER_HOME_ENV_VARIABLE} has not been set" )
    end
    if JAVA_HOME.nil? || JAVA_HOME.empty?
      Util.fatal_error( PRG_NAME, "apparently environment variable #{Constants::JAVA_HOME_ENV_VARIABLE} has not been set" )
    end

    unless File.exist?( FORESTER_HOME )
      Util.fatal_error( PRG_NAME, '[' + FORESTER_HOME + '] does not exist' )
    end
    unless File.exist?( JAVA_HOME )
      Util.fatal_error( PRG_NAME, '[' + JAVA_HOME + '] does not exist' )
    end

    decorator = JAVA_HOME + '/bin/java -cp ' + FORESTER_HOME + '/java/forester.jar org.forester.application.decorator'

    begin
      cla = CommandLineArguments.new( ARGV )
    rescue ArgumentError => e
      Util.fatal_error( PRG_NAME, "error: " + e.to_s )
    end

    if cla.is_option_set?( HELP_OPTION_1 ) || cla.is_option_set?( HELP_OPTION_2 )
      # NOTE(review): reconstructed branch body (help text + zero exit).
      print_help
      exit( 0 )
    end

    # Refuse to overwrite the log of a previous run.
    if File.exist?( LOG_FILE )
      Util.fatal_error( PRG_NAME, 'logfile [' + LOG_FILE + '] already exists' )
    end

    in_suffix  = cla.get_file_name( 0 )
    out_suffix = cla.get_file_name( 1 )

    # String.new gives a mutable buffer; it is appended to with << throughout.
    log = String.new
    log << "Program : " + PRG_NAME + NL
    log << "Version : " + PRG_VERSION + NL
    log << "Program date : " + PRG_DATE + NL
    log << "Options for seq names: " + DECORATOR_OPTIONS_SEQ_NAMES + NL
    log << "Options for domains : " + DECORATOR_OPTIONS_DOMAINS + NL
    log << "FORESTER_HOME : " + FORESTER_HOME + NL
    log << "JAVA_HOME : " + JAVA_HOME + NL + NL
    # NOTE(review): reconstructed -- the assignment of the timestamp was not
    # visible; Time.now is used so that no additional require is needed.
    log << "Date/time: " + Time.now.to_s + NL
    log << "Directory: " + Dir.getwd + NL + NL

    Util.print_message( PRG_NAME, 'input suffix : ' + in_suffix )
    Util.print_message( PRG_NAME, 'output suffix : ' + out_suffix )

    log << 'input suffix : ' + in_suffix + NL
    log << 'output suffix : ' + out_suffix + NL

    # Remove leftovers of an earlier, aborted run.
    # File.exist? (not the removed File.exists?) keeps this working on Ruby >= 3.2.
    [ TMP_FILE_1, TMP_FILE_2 ].each do |tmp_file|
      File.delete( tmp_file ) if File.exist?( tmp_file )
    end

    files   = Dir.entries( "." )
    counter = 0

    files.each do |phylogeny_file|
      next unless decoratable?( phylogeny_file, in_suffix, out_suffix )

      check_readable( phylogeny_file, 'can not read from: ' + phylogeny_file + ': ' )

      counter += 1

      # Swap the literal input suffix for the output suffix. The suffixes come
      # from the command line and are treated as plain text, so a "." in
      # ".xml" no longer acts as a regex wildcard.
      stem    = phylogeny_file[ 0, phylogeny_file.length - in_suffix.length ]
      outfile = ( stem + out_suffix ).sub( /_ni_/, '_' )

      if File.exist?( outfile )
        msg = counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile +
              ' : already exists, skipping'
        Util.print_message( PRG_NAME, msg )
        # NOTE(review): reconstructed -- logging the skip and moving on to the
        # next file is implied by the message but was not visible.
        log << msg + NL
        next
      end

      Util.print_message( PRG_NAME, counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile )
      log << counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile + NL

      phylogeny_id = get_id( phylogeny_file )
      if phylogeny_id.nil? || phylogeny_id.empty?
        Util.fatal_error( PRG_NAME, 'could not get id from ' + phylogeny_file.to_s )
      end

      Util.print_message( PRG_NAME, "id: " + phylogeny_id )
      log << "id: " + phylogeny_id + NL

      ids_mapfile_name     = get_file( files, phylogeny_id, IDS_MAPFILE_SUFFIX )
      domains_mapfile_name = get_file( files, phylogeny_id, DOMAINS_MAPFILE_SUFFIX )
      seqs_file_name       = get_seq_file( files, phylogeny_id )

      # Double-quoted strings so that the file name is really interpolated.
      check_readable( domains_mapfile_name, "failed to read from [#{domains_mapfile_name}]: " )
      check_readable( ids_mapfile_name, "failed to read from [#{ids_mapfile_name}]: " )
      check_readable( seqs_file_name, "failed to read from [#{seqs_file_name}]: " )

      cmd = decorator +
            ' -t -p -f=m ' + phylogeny_file + ' ' +
            seqs_file_name + ' ' + TMP_FILE_1
      execute_or_die( cmd, log )

      cmd = decorator + ' ' + DECORATOR_OPTIONS_DOMAINS + ' ' +
            '-f=d ' + TMP_FILE_1 + ' ' +
            domains_mapfile_name + ' ' + TMP_FILE_2
      execute_or_die( cmd, log )

      cmd = decorator + ' ' + DECORATOR_OPTIONS_SEQ_NAMES + ' ' +
            '-f=n ' + TMP_FILE_2 + ' ' +
            ids_mapfile_name + ' ' + outfile
      execute_or_die( cmd, log )

      File.delete( TMP_FILE_1 )
      File.delete( TMP_FILE_2 )
    end

    # The log is written only after every file has been processed.
    File.open( LOG_FILE, 'w' ) do |f|
      f.write( log )
    end

    Util.print_message( PRG_NAME, 'OK' )
  end # def run

  # Runs +cmd+ through IO.popen and appends the command line and everything
  # the child wrote to its standard output to +log+ (which is mutated).
  #
  # Raises RuntimeError when the child exits with a non-zero status, so that a
  # failed decorator step is reported where it happens instead of surfacing
  # later as a missing temporary file.
  def execute_cmd( cmd, log )
    log << 'executing ' + cmd + NL
    IO.popen( cmd, 'r+' ) do |pipe|
      # Nothing is sent to the child; closing our write end lets a child that
      # reads stdin see EOF instead of blocking while we wait in pipe.read.
      pipe.close_write
      log << pipe.read + NL + NL
    end
    # IO.popen's block form closes the pipe, which sets $? to the child's status.
    status = $?
    unless status.nil? || status.success?
      raise "command failed (exit status #{status.exitstatus}): #{cmd}"
    end
  end

  # Extracts the phylogeny id from a file name: the shortest non-empty prefix
  # followed by "__", or, failing that, the shortest non-empty prefix followed
  # by "_". Returns nil when the name matches neither form.
  def get_id( phylogeny_file_name )
    phylogeny_file_name[ /^(.+?)__/, 1 ] || phylogeny_file_name[ /^(.+?)_/, 1 ]
  end

  # Returns the single entry of +files_in_dir+ that is not a directory, starts
  # with +phylogeny_id+ and ends with +suffix_pattern+.
  #
  # +phylogeny_id+ is matched literally. +suffix_pattern+ is inserted into the
  # regular expression as-is (as its name suggests), so regex metacharacters in
  # it keep their meaning -- e.g. the "." in IDS_MAPFILE_SUFFIX matches any
  # character.
  #
  # Calls Util.fatal_error when no entry or more than one entry matches.
  def get_file( files_in_dir, phylogeny_id, suffix_pattern )
    pattern = /\A#{Regexp.escape( phylogeny_id )}.*#{suffix_pattern}\z/

    # NOTE(review): the "." / "00" exclusions are reconstructed (two lines of
    # this condition were not visible); they mirror the filter used in #run.
    matching_files = files_in_dir.select do |file|
      !File.directory?( file ) &&
        !file.start_with?( '.' ) &&
        !file.start_with?( '00' ) &&
        file =~ pattern
    end

    pick_single( matching_files, 'file matching [' + phylogeny_id + '...' + suffix_pattern + ']' )
  end

  # Returns the single entry of +files_in_dir+ that is not a directory and is
  # named either "<id>__<anything>" ending in a digit or "<id>_<anything>.fasta".
  # +phylogeny_id+ is matched literally.
  #
  # Calls Util.fatal_error when no entry or more than one entry matches.
  def get_seq_file( files_in_dir, phylogeny_id )
    id = Regexp.escape( phylogeny_id )
    ends_in_digit = /\A#{id}__.+\d\z/
    fasta         = /\A#{id}_.*\.fasta\z/

    # NOTE(review): the "." / "00" exclusions are reconstructed (two lines of
    # this condition were not visible); they mirror the filter used in #run.
    matching_files = files_in_dir.select do |file|
      !File.directory?( file ) &&
        !file.start_with?( '.' ) &&
        !file.start_with?( '00' ) &&
        ( file =~ ends_in_digit || file =~ fasta )
    end

    pick_single( matching_files, 'seq file matching [' + phylogeny_id + '_]' )
  end

  # Prints the command line synopsis to standard output.
  #
  # NOTE(review): only the synopsis line itself was visible; the "Usage:"
  # heading and the blank lines are reconstructed.
  def print_help
    puts( "Usage:" )
    puts
    puts( " " + PRG_NAME + ".rb <suffix of intrees to be decorated> <suffix for decorated outtrees> " )
    puts
  end

  private

  # True when +file_name+ is a regular directory entry that should be
  # decorated: not a directory, not starting with "." or "00", ending in
  # +in_suffix+ and not already ending in +out_suffix+ (both literal).
  def decoratable?( file_name, in_suffix, out_suffix )
    !File.directory?( file_name ) &&
      !file_name.start_with?( '.' ) &&
      !file_name.start_with?( '00' ) &&
      !file_name.end_with?( out_suffix ) &&
      file_name.end_with?( in_suffix )
  end

  # Delegates to Util.check_file_for_readability and turns any error it raises
  # into a fatal error whose text is +message_prefix+ plus the error message.
  # (The exception is converted with to_s: String + Exception raises TypeError.)
  #
  # NOTE(review): the original rescue clause was not visible; StandardError is
  # used as the catch-all for ordinary errors.
  def check_readable( file_name, message_prefix )
    Util.check_file_for_readability( file_name )
  rescue StandardError => e
    Util.fatal_error( PRG_NAME, message_prefix + e.to_s )
  end

  # Runs #execute_cmd and reports any error it raises as a fatal error.
  #
  # NOTE(review): the original rescue clause was not visible; StandardError is
  # used as the catch-all for ordinary errors.
  def execute_or_die( cmd, log )
    execute_cmd( cmd, log )
  rescue StandardError => e
    Util.fatal_error( PRG_NAME, 'error: ' + e.to_s )
  end

  # Returns the only element of +matching_files+; reports a fatal error built
  # from +description+ when there is none or more than one.
  def pick_single( matching_files, description )
    if matching_files.empty?
      Util.fatal_error( PRG_NAME, 'no ' + description + ' present in current directory' )
    end
    if matching_files.length > 1
      Util.fatal_error( PRG_NAME, 'more than one ' + description + ' present in current directory' )
    end
    matching_files.first
  end

end # class PhylogeniesDecorator