class PhylogenyFactory
PRG_NAME = "phylogeny_factory"
- PRG_DATE = "130402"
+ PRG_DATE = "1301111"
PRG_DESC = "automated phylogeny reconstruction using queing system"
- PRG_VERSION = "1.002"
+ PRG_VERSION = "1.100"
COPYRIGHT = "2013 Christian M Zmasek"
CONTACT = "phylosoft@gmail.com"
WWW = "www.phylosoft.org"
USE_JOB_SUBMISSION_SYSTEM_OPTION = 's'
+ BS_OPTION = 'b'
LOG_FILE = '00_phylogeny_factory.log'
TEMPLATE_FILE = '00_phylogeny_factory.template'
PBS_O_WORKDIR = '$PBS_O_WORKDIR/'
- MIN_LENGTH_DEFAULT = 50
- PFAM_HHMS = "/home/czmasek/DATA/PFAM/PFAM260X/PFAM_A_HMMs/"
+ MIN_LENGTH_DEFAULT = 40
+ PFAM_HHMS = "/home/czmasek/DATA/PFAM/PFAM270X/PFAM_A_HMMs/"
WALLTIME = '100:00:00'
QUEUE = 'default'
RSL = 'RSL'
HMM = 'HMM'
+ PHYLO_OPT = 'PHYLO_OPT'
OPTION_OPEN = '%['
OPTION_CLOSE = ']%'
allowed_opts = Array.new
allowed_opts.push( USE_JOB_SUBMISSION_SYSTEM_OPTION )
+ allowed_opts.push( BS_OPTION )
disallowed = cla.validate_allowed_options_as_str( allowed_opts )
if ( disallowed.length > 0 )
use_job_submission_system = true
end
+ bootstraps = 1
+ if cla.is_option_set?( BS_OPTION )
+ bootstraps = cla.get_option_value_as_int( BS_OPTION )
+ end
+ if bootstraps < 0
+ puts( '[' + PRG_NAME + '] > negative bootstrap value' )
+ exit( -1 )
+ end
+ if bootstraps == 0
+ bootstraps = 1
+ end
+
log = String.new
now = DateTime.now
log << "Program : " + PRG_NAME + NL
log << "Version : " + PRG_VERSION + NL
log << "Program date: " + PRG_DATE + NL + NL
+ log << "Bootstraps : " + bootstraps.to_s + NL
log << "Date/time : " + now.to_s + NL
log << "Directory : " + Dir.getwd + NL + NL
paths = Hash.new # path placeholder -> full path
min_lengths = Hash.new # alignment id -> minimal length
options = Hash.new # option placeholder -> option
- ids = Set.new
commands = Array.new
puts( '[' + PRG_NAME + '] > min lengths: ' + $1 + ' => ' + $2 )
elsif ( line =~ /^%\s*(\S+)\s*=\s*(\S+)/ )
- options[ $1 ] = $2
- puts( '[' + PRG_NAME + '] > options : ' + $1 + ' => ' + $2 )
+ key = $1
+ value = $2
+ if key == PHYLO_OPT
+ value = update_phylo_pl_options( value, bootstraps )
+ end
+ options[ key ] = value
+ puts( '[' + PRG_NAME + '] > options : ' + key + ' => ' + value )
elsif ( line =~ /^>\s*(.+)/ )
command = command + $1 + ";#{NL}"
file !~ /^00/ )
aln_name = file.to_str
id = get_id( aln_name )
- if !ids.include?( id )
- ids.add( id )
+ if id != nil && id.length > 0
+ puts
+ puts( '[' + PRG_NAME + '] > file [id] : ' + aln_name + ' [' + id + ']' )
+ else
+ puts
+ puts( '[' + PRG_NAME + '] > file [id] : ' + aln_name + ' [WARNING: could not get id!]' )
end
- puts( '[' + PRG_NAME + '] > file [id] : ' + aln_name + ' [' + id + ']' )
commands.each do | cmd |
- id = get_id( aln_name )
cmd = subst_hmm( cmd, id )
cmd = subst_min_length( cmd, id, min_lengths )
cmd = subst_options( cmd, options )
if ( cmd =~ /%/ )
cmd =~ /(%.*?%)/
problem = $1
- puts( '[' + PRG_NAME + '] > WARNING : [' + id + '] command still contains placeholder: ' + problem )
+ puts( '[' + PRG_NAME + '] > WARNING : command still contains placeholder: ' + problem )
log << "WARNING: command still contains placeholder: " + cmd + NL
else
tmp_cmd_file = file.to_str[ 0..4 ] + TMP_CMD_FILE_SUFFIX
end
def subst_hmm( command, id )
- if id != nil && id.length > 1
+ if id != nil && id.length > 0
hmm = PFAM_HHMS + id + ".hmm"
command = command.gsub( OPTION_OPEN + HMM + OPTION_CLOSE, hmm )
end
command
end
+ def update_phylo_pl_options( phylo_pl_options, bootstraps )
+ #opts = phylo_pl_options
+ puts
+ puts "opts: " + phylo_pl_options
+ unless phylo_pl_options =~ /B\d/
+ phylo_pl_options = 'B' + bootstraps.to_s + phylo_pl_options
+ end
+ phylo_pl_options = '-' + phylo_pl_options
+ puts
+ puts "new opts: " + phylo_pl_options
+ phylo_pl_options
+ end
+
def subst_min_length( command, id, min_lengths )
min_length = nil
- if id != nil && id.length > 1
+ if id != nil && id.length > 0
min_length = min_lengths[ id ]
end
if min_length != nil && min_length.size > 0
end
def get_id( aln_name )
- id = aln_name[ 0, aln_name.index( "__" ) ]
- if id == nil || id.length < 1
- puts( '[' + PRG_NAME + '] > WARNING: could not get id from [' + aln_name + ']' )
+ if aln_name =~ /_{2}(.+)_{2}/
+ return $1
end
- id
+ nil
end
end # class PhylogenyFactory
end # module Evoruby
+
+
+=begin
+
+# Name convention if alignment specific parameters
+# are to be used:
+# the substring between the first two double underscores is a
+# unique identifier and needs to match the identifiers
+# in '% <parameter-type> <unique-id>=<value>' statements
+# Example:
+# alignment name : 'x__bcl2__e1'
+# parameter statments: '% RSL bcl2=60'
+$ PROBCONS=/home/czmasek/SOFTWARE/PROBCONS/probcons_v1_12/probcons
+$ DIALIGN_TX=/home/czmasek/SOFTWARE/DIALIGNTX/DIALIGN-TX_1.0.2/source/dialign-tx
+$ DIALIGN_CONF=/home/czmasek/SOFTWARE/DIALIGNTX/DIALIGN-TX_1.0.2/conf
+$ MAFFT=/home/czmasek/SOFTWARE/MAFFT/mafft-7.017-without-extensions/scripts/mafft
+$ MUSCLE=/home/czmasek/SOFTWARE/MUSCLE/muscle3.8.31/src/muscle
+$ CLUSTALO=/home/czmasek/SOFTWARE/CLUSTAL_OMEGA/clustal-omega-1.1.0/src/clustalo
+$ KALIGN=/home/czmasek/SOFTWARE/KALIGN/kalign203/kalign
+$ HMMALIGN=/home/czmasek/SOFTWARE/HMMER/hmmer-3.0/src/hmmalign
+$ MSA_PRO=/home/czmasek/SOFTWARE/FORESTER/DEV/forester/forester/ruby/evoruby/exe/msa_pro.rb
+$ PHYLO_PL=/home/czmasek/SOFTWARE/FORESTER/DEV/forester/forester/archive/perl/phylo_pl.pl
+
+
+% RSL Hormone_recep=60
+%
+% RSL Y_phosphatase=100
+% RSL Y_phosphatase2=75
+% RSL Y_phosphatase3=50
+% RSL Y_phosphatase3C=40
+
+% PHYLO_OPT=B100q@1r4j2IGS21X
+
+% TMP_DIR = /home/czmasek/tmp/
+
+
+> KALIGN $ > $_kalign
+> MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_kalign $_kalign_05_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_kalign_05_%[RSL]%.aln $_kalign_05_%[RSL]% %[TMP_DIR]%
+-
+
+> KALIGN $ > $_kalign_
+> MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_kalign_ $_kalign_09_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_kalign_09_%[RSL]%.aln $_kalign_09_%[RSL]% %[TMP_DIR]%
+-
+
+
+> HMMALIGN --amino --trim --outformat Pfam -o $_hmmalign %[HMM]% $ > /dev/null
+> MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_hmmalign $_hmmalign_05_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_hmmalign_05_%[RSL]%.aln $_hmmalign_05_%[RSL]% %[TMP_DIR]%
+-
+
+> HMMALIGN --amino --trim --outformat Pfam -o $_hmmalign_ %[HMM]% $ > /dev/null
+> MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_hmmalign_ $_hmmalign_09_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_hmmalign_09_%[RSL]%.aln $_hmmalign_09_%[RSL]% %[TMP_DIR]%
+-
+
+
+> MAFFT --maxiterate 1000 --localpair $ > $_mafft
+> MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_mafft $_mafft_05_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_mafft_05_%[RSL]%.aln $_mafft_05_%[RSL]% %[TMP_DIR]%
+-
+
+> MAFFT --maxiterate 1000 --localpair $ > $_mafft_
+> MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_mafft_ $_mafft_09_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_mafft_09_%[RSL]%.aln $_mafft_09_%[RSL]% %[TMP_DIR]%
+-
+
+
+> MUSCLE -maxiters 1000 -maxtrees 100 -in $ -out $_muscle
+> MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_muscle $_muscle_05_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_muscle_05_%[RSL]%.aln $_muscle_05_%[RSL]% %[TMP_DIR]%
+-
+
+> MUSCLE -maxiters 1000 -maxtrees 100 -in $ -out $_muscle_
+> MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_muscle_ $_muscle_09_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_muscle_09_%[RSL]%.aln $_muscle_09_%[RSL]% %[TMP_DIR]%
+-
+
+
+> CLUSTALO --full --full-iter --iter=5 -i $ -o $_clustalo
+> MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_clustalo $_clustalo_05_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_clustalo_05_%[RSL]%.aln $_clustalo_05_%[RSL]% %[TMP_DIR]%
+-
+
+> CLUSTALO --full --full-iter --iter=5 -i $ -o $_clustalo_
+> MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_clustalo_ $_clustalo_09_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_clustalo_09_%[RSL]%.aln $_clustalo_09_%[RSL]% %[TMP_DIR]%
+-
+
+
+> PROBCONS $ > $_probcons
+> MSA_PRO -o=p -n=10 -d -rem_red -rr=0.5 -c -rsl=%[RSL]% $_probcons $_probcons_05_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_probcons_05_%[RSL]%.aln $_probcons_05_%[RSL]% %[TMP_DIR]%
+-
+
+> PROBCONS $ > $_probcons_
+> MSA_PRO -o=p -n=10 -d -rem_red -rr=0.9 -c -rsl=%[RSL]% $_probcons_ $_probcons_09_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_probcons_09_%[RSL]%.aln $_probcons_09_%[RSL]% %[TMP_DIR]%
+-
+
+
+> DIALIGN_TX DIALIGN_CONF $ $_dialigntx
+> MSA_PRO -o=p -n=10 -d -rem_red -rr=0.5 -c -rsl=%[RSL]% $_dialigntx $_dialigntx_05_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_dialigntx_05_%[RSL]%.aln $_dialigntx_05_%[RSL]% %[TMP_DIR]%
+-
+
+> DIALIGN_TX DIALIGN_CONF $ $_dialigntx_
+> MSA_PRO -o=p -n=10 -d -rem_red -rr=0.9 -c -rsl=%[RSL]% $_dialigntx_ $_dialigntx_09_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_dialigntx_09_%[RSL]%.aln $_dialigntx_09_%[RSL]% %[TMP_DIR]%
+-
+
+=end
+