2 # = lib/evo/apps/phylogeny_factory - PhylogenyFactory class
4 # Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: phylogeny_factory.rb,v 1.32 2010/12/13 19:00:11 cmzmasek Exp $
9 require 'lib/evo/util/constants'
10 require 'lib/evo/util/util'
11 require 'lib/evo/util/command_line_arguments'
18 class PhylogenyFactory
20 PRG_NAME = "phylogeny_factory"
22 PRG_DESC = "automated phylogeny reconstruction using queing system"
24 COPYRIGHT = "2013 Christian M Zmasek"
25 CONTACT = "phylosoft@gmail.com"
26 WWW = "www.phylosoft.org"
28 USE_JOB_SUBMISSION_SYSTEM_OPTION = 's'
30 LOG_FILE = '00_phylogeny_factory.log'
31 TEMPLATE_FILE = '00_phylogeny_factory.template'
32 PBS_O_WORKDIR = '$PBS_O_WORKDIR/'
33 MIN_LENGTH_DEFAULT = 40
34 PFAM_HHMS = "/home/czmasek/DATA/PFAM/PFAM270X/PFAM_A_HMMs/"
35 WALLTIME = '100:00:00'
38 TMP_CMD_FILE_SUFFIX = '_QSUB'
42 PHYLO_OPT = 'PHYLO_OPT'
49 NL = Constants::LINE_DELIMITER
53 Util.print_program_information( PRG_NAME,
63 cla = CommandLineArguments.new( ARGV )
64 rescue ArgumentError => e
65 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
68 allowed_opts = Array.new
69 allowed_opts.push( USE_JOB_SUBMISSION_SYSTEM_OPTION )
70 allowed_opts.push( BS_OPTION )
72 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
73 if ( disallowed.length > 0 )
74 Util.fatal_error( PRG_NAME,
75 "unknown option(s): " + disallowed,
79 if File.exists?( LOG_FILE )
80 puts( '[' + PRG_NAME + '] > log file [' + LOG_FILE + '] already exists' )
84 if !File.exists?( TEMPLATE_FILE )
85 puts( '[' + PRG_NAME + '] > template file [' + TEMPLATE_FILE + '] not found' )
89 use_job_submission_system = false
90 if cla.is_option_set?( USE_JOB_SUBMISSION_SYSTEM_OPTION )
91 use_job_submission_system = true
95 if cla.is_option_set?( BS_OPTION )
96 bootstraps = cla.get_option_value_as_int( BS_OPTION )
99 puts( '[' + PRG_NAME + '] > negative bootstrap value' )
109 log << "Program : " + PRG_NAME + NL
110 log << "Version : " + PRG_VERSION + NL
111 log << "Program date: " + PRG_DATE + NL + NL
112 log << "Bootstraps : " + bootstraps.to_s + NL
113 log << "Date/time : " + now.to_s + NL
114 log << "Directory : " + Dir.getwd + NL + NL
116 puts( '[' + PRG_NAME + '] > reading ' + TEMPLATE_FILE )
118 paths = Hash.new # path placeholder -> full path
119 min_lengths = Hash.new # alignment id -> minimal length
120 options = Hash.new # option placeholder -> option
124 log << "////////////////////////////////////////////////////////////////// #{NL}"
125 log << "Template file [" + TEMPLATE_FILE + "]:#{NL}"
129 open( TEMPLATE_FILE ).each { | line |
132 elsif ( line =~ /^\$\s*(\S+)\s*=\s*(\S+)/ )
134 puts( '[' + PRG_NAME + '] > paths : ' + $1 + ' => ' + $2 )
136 elsif ( line =~ /^%\s*#{RSL}\s*(\S+)\s*=\s*(\S+)/ )
137 min_lengths[ $1 ] = $2
138 puts( '[' + PRG_NAME + '] > min lengths: ' + $1 + ' => ' + $2 )
140 elsif ( line =~ /^%\s*(\S+)\s*=\s*(\S+)/ )
144 value = update_phylo_pl_options( value, bootstraps )
146 options[ key ] = value
147 puts( '[' + PRG_NAME + '] > options : ' + key + ' => ' + value )
149 elsif ( line =~ /^>\s*(.+)/ )
150 command = command + $1 + ";#{NL}"
152 elsif ( line =~ /^-/ )
153 commands << prepare( command, paths )
157 log << "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ #{NL}#{NL}"
159 files = Dir.entries( "." )
161 files.each { | file |
162 if ( !File.directory?( file ) &&
164 file !~ /#{TEMPLATE_FILE}/ &&
169 aln_name = file.to_str
170 id = get_id( aln_name )
171 if id != nil && id.length > 0
173 puts( '[' + PRG_NAME + '] > file [id] : ' + aln_name + ' [' + id + ']' )
176 puts( '[' + PRG_NAME + '] > file [id] : ' + aln_name + ' [WARNING: could not get id!]' )
178 commands.each do | cmd |
179 cmd = subst_hmm( cmd, id )
180 cmd = subst_min_length( cmd, id, min_lengths )
181 cmd = subst_options( cmd, options )
182 if use_job_submission_system
183 cmd = subst_aln_name( cmd, PBS_O_WORKDIR + aln_name )
185 cmd = subst_aln_name( cmd, aln_name )
191 puts( '[' + PRG_NAME + '] > WARNING : command still contains placeholder: ' + problem )
192 log << "WARNING: command still contains placeholder: " + cmd + NL
194 tmp_cmd_file = file.to_str[ 0..4 ] + TMP_CMD_FILE_SUFFIX
195 if ( File.exists?( tmp_cmd_file ) )
196 File.delete( tmp_cmd_file )
198 if use_job_submission_system
199 open( tmp_cmd_file, 'w' ) do |f|
206 if use_job_submission_system
207 IO.popen( 'qsub -q ' + QUEUE + ' -l walltime=' + WALLTIME + ' ' + tmp_cmd_file , 'r+' ) do | pipe |
211 spawn( 'nohup ' + cmd + ' &', STDERR => "/dev/null" )
215 if ( File.exists?( tmp_cmd_file ) )
216 File.delete( tmp_cmd_file )
223 open( LOG_FILE, 'w' ) do | f |
228 puts( '[' + PRG_NAME + '] > OK' )
233 def prepare( command, paths )
234 paths.each_pair{ | name, full |
235 command = command.gsub( name, full )
240 def subst_options( command, options )
241 opt_placeholders = command.scan( /%\[\S+\]%/ )
242 opt_placeholders.each { | opt_placeholder |
243 opt_placeholder = opt_placeholder.gsub( OPTION_OPEN , '' )
244 opt_placeholder = opt_placeholder.gsub( OPTION_CLOSE, '' )
245 opt_value = options[ opt_placeholder ]
246 if ( opt_value != nil && opt_value.size > 0 )
247 command = command.gsub( OPTION_OPEN + opt_placeholder + OPTION_CLOSE, opt_value )
253 def subst_aln_name( command, aln_name )
254 command = command.gsub( '$', aln_name )
258 def subst_hmm( command, id )
259 if id != nil && id.length > 0
260 hmm = PFAM_HHMS + id + ".hmm"
261 command = command.gsub( OPTION_OPEN + HMM + OPTION_CLOSE, hmm )
266 def update_phylo_pl_options( phylo_pl_options, bootstraps )
267 #opts = phylo_pl_options
269 puts "opts: " + phylo_pl_options
270 unless phylo_pl_options =~ /B\d/
271 phylo_pl_options = 'B' + bootstraps.to_s + phylo_pl_options
273 phylo_pl_options = '-' + phylo_pl_options
275 puts "new opts: " + phylo_pl_options
279 def subst_min_length( command, id, min_lengths )
281 if id != nil && id.length > 0
282 min_length = min_lengths[ id ]
284 if min_length != nil && min_length.size > 0
285 command = command.gsub( OPTION_OPEN + RSL + OPTION_CLOSE, min_length )
287 command = command.gsub( OPTION_OPEN + RSL + OPTION_CLOSE, MIN_LENGTH_DEFAULT.to_s )
292 def get_id( aln_name )
293 if aln_name =~ /_{2}(.+)_{2}/
299 end # class PhylogenyFactory
306 # Name convention if alignment specific parameters
308 # the substring between the first two double underscores is a
309 # unique identifier and needs to match the identifiers
310 # in '% <parameter-type> <unique-id>=<value>' statements
312 # alignment name : 'x__bcl2__e1'
313 # parameter statments: '% RSL bcl2=60'
314 $ PROBCONS=/home/czmasek/SOFTWARE/PROBCONS/probcons_v1_12/probcons
315 $ DIALIGN_TX=/home/czmasek/SOFTWARE/DIALIGNTX/DIALIGN-TX_1.0.2/source/dialign-tx
316 $ DIALIGN_CONF=/home/czmasek/SOFTWARE/DIALIGNTX/DIALIGN-TX_1.0.2/conf
317 $ MAFFT=/home/czmasek/SOFTWARE/MAFFT/mafft-7.017-without-extensions/scripts/mafft
318 $ MUSCLE=/home/czmasek/SOFTWARE/MUSCLE/muscle3.8.31/src/muscle
319 $ CLUSTALO=/home/czmasek/SOFTWARE/CLUSTAL_OMEGA/clustal-omega-1.1.0/src/clustalo
320 $ KALIGN=/home/czmasek/SOFTWARE/KALIGN/kalign203/kalign
321 $ HMMALIGN=/home/czmasek/SOFTWARE/HMMER/hmmer-3.0/src/hmmalign
322 $ MSA_PRO=/home/czmasek/SOFTWARE/FORESTER/DEV/forester/forester/ruby/evoruby/exe/msa_pro.rb
323 $ PHYLO_PL=/home/czmasek/SOFTWARE/FORESTER/DEV/forester/forester/archive/perl/phylo_pl.pl
326 % RSL Hormone_recep=60
328 % RSL Y_phosphatase=100
329 % RSL Y_phosphatase2=75
330 % RSL Y_phosphatase3=50
331 % RSL Y_phosphatase3C=40
333 % PHYLO_OPT=B100q@1r4j2IGS21X
335 % TMP_DIR = /home/czmasek/tmp/
338 > KALIGN $ > $_kalign
339 > MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_kalign $_kalign_05_%[RSL]%.aln
340 > PHYLO_PL %[PHYLO_OPT]% $_kalign_05_%[RSL]%.aln $_kalign_05_%[RSL]% %[TMP_DIR]%
343 > KALIGN $ > $_kalign_
344 > MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_kalign_ $_kalign_09_%[RSL]%.aln
345 > PHYLO_PL %[PHYLO_OPT]% $_kalign_09_%[RSL]%.aln $_kalign_09_%[RSL]% %[TMP_DIR]%
349 > HMMALIGN --amino --trim --outformat Pfam -o $_hmmalign %[HMM]% $ > /dev/null
350 > MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_hmmalign $_hmmalign_05_%[RSL]%.aln
351 > PHYLO_PL %[PHYLO_OPT]% $_hmmalign_05_%[RSL]%.aln $_hmmalign_05_%[RSL]% %[TMP_DIR]%
354 > HMMALIGN --amino --trim --outformat Pfam -o $_hmmalign_ %[HMM]% $ > /dev/null
355 > MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_hmmalign_ $_hmmalign_09_%[RSL]%.aln
356 > PHYLO_PL %[PHYLO_OPT]% $_hmmalign_09_%[RSL]%.aln $_hmmalign_09_%[RSL]% %[TMP_DIR]%
360 > MAFFT --maxiterate 1000 --localpair $ > $_mafft
361 > MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_mafft $_mafft_05_%[RSL]%.aln
362 > PHYLO_PL %[PHYLO_OPT]% $_mafft_05_%[RSL]%.aln $_mafft_05_%[RSL]% %[TMP_DIR]%
365 > MAFFT --maxiterate 1000 --localpair $ > $_mafft_
366 > MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_mafft_ $_mafft_09_%[RSL]%.aln
367 > PHYLO_PL %[PHYLO_OPT]% $_mafft_09_%[RSL]%.aln $_mafft_09_%[RSL]% %[TMP_DIR]%
371 > MUSCLE -maxiters 1000 -maxtrees 100 -in $ -out $_muscle
372 > MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_muscle $_muscle_05_%[RSL]%.aln
373 > PHYLO_PL %[PHYLO_OPT]% $_muscle_05_%[RSL]%.aln $_muscle_05_%[RSL]% %[TMP_DIR]%
376 > MUSCLE -maxiters 1000 -maxtrees 100 -in $ -out $_muscle_
377 > MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_muscle_ $_muscle_09_%[RSL]%.aln
378 > PHYLO_PL %[PHYLO_OPT]% $_muscle_09_%[RSL]%.aln $_muscle_09_%[RSL]% %[TMP_DIR]%
382 > CLUSTALO --full --full-iter --iter=5 -i $ -o $_clustalo
383 > MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_clustalo $_clustalo_05_%[RSL]%.aln
384 > PHYLO_PL %[PHYLO_OPT]% $_clustalo_05_%[RSL]%.aln $_clustalo_05_%[RSL]% %[TMP_DIR]%
387 > CLUSTALO --full --full-iter --iter=5 -i $ -o $_clustalo_
388 > MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_clustalo_ $_clustalo_09_%[RSL]%.aln
389 > PHYLO_PL %[PHYLO_OPT]% $_clustalo_09_%[RSL]%.aln $_clustalo_09_%[RSL]% %[TMP_DIR]%
393 > PROBCONS $ > $_probcons
394 > MSA_PRO -o=p -n=10 -d -rem_red -rr=0.5 -c -rsl=%[RSL]% $_probcons $_probcons_05_%[RSL]%.aln
395 > PHYLO_PL %[PHYLO_OPT]% $_probcons_05_%[RSL]%.aln $_probcons_05_%[RSL]% %[TMP_DIR]%
398 > PROBCONS $ > $_probcons_
399 > MSA_PRO -o=p -n=10 -d -rem_red -rr=0.9 -c -rsl=%[RSL]% $_probcons_ $_probcons_09_%[RSL]%.aln
400 > PHYLO_PL %[PHYLO_OPT]% $_probcons_09_%[RSL]%.aln $_probcons_09_%[RSL]% %[TMP_DIR]%
404 > DIALIGN_TX DIALIGN_CONF $ $_dialigntx
405 > MSA_PRO -o=p -n=10 -d -rem_red -rr=0.5 -c -rsl=%[RSL]% $_dialigntx $_dialigntx_05_%[RSL]%.aln
406 > PHYLO_PL %[PHYLO_OPT]% $_dialigntx_05_%[RSL]%.aln $_dialigntx_05_%[RSL]% %[TMP_DIR]%
409 > DIALIGN_TX DIALIGN_CONF $ $_dialigntx_
410 > MSA_PRO -o=p -n=10 -d -rem_red -rr=0.9 -c -rsl=%[RSL]% $_dialigntx_ $_dialigntx_09_%[RSL]%.aln
411 > PHYLO_PL %[PHYLO_OPT]% $_dialigntx_09_%[RSL]%.aln $_dialigntx_09_%[RSL]% %[TMP_DIR]%