#
#
+require 'fileutils'
module Evoruby
class RunPhyloPipeline
# NOTE(review): this is a diff fragment. The "-" lines are the removed
# hard-coded absolute paths; the "+" lines rebuild the same paths from HOME
# and FORESTER_RUBY and add LAUNCH_ANALYSIS, PF and TEMPLATE_FILE.
- PFAM = "/home/czmasek/DATA/PFAM/PFAM270X/"
- HMMSCAN = "/home/czmasek/SOFTWARE/HMMER/hmmer-3.0/src/hmmscan"
- HSP = "/home/czmasek/SOFTWARE/FORESTER/DEV/forester/forester/ruby/evoruby/exe/hsp.rb"
- D2F = "/home/czmasek/SOFTWARE/FORESTER/DEV/forester/forester/ruby/evoruby/exe/d2f.rb"
- DSX = "/home/czmasek/SOFTWARE/FORESTER/DEV/forester/forester/ruby/evoruby/exe/dsx.rb"
- TAP = "/home/czmasek/SOFTWARE/FORESTER/DEV/forester/forester/ruby/evoruby/exe/tap.rb"
# Switch for step "f. analysis" in run: while false, the phylogeny_factory
# commands are never launched (input and template files are still copied).
+ LAUNCH_ANALYSIS = false
# Hard-coded base directory; every path below is derived from it.
# The trailing slash matters because names are appended without a separator.
+ HOME = "/home/czmasek/"
# Directory containing the helper scripts (hsp, d2f, dsx, tap, phylogeny_factory).
+ FORESTER_RUBY = "#{HOME}SOFTWARE/FORESTER/DEV/forester/forester/ruby/evoruby/exe/"
# Directory prefix of the Pfam data; run appends "Pfam-A.hmm" to it.
+ PFAM = "#{HOME}DATA/PFAM/PFAM270X/"
# Path of the hmmscan executable used in step b.
+ HMMSCAN = "#{HOME}SOFTWARE/HMMER/hmmer-3.0/src/hmmscan"
# Helper scripts invoked by run: TAP in step a, HSP in step c, D2F in step d,
# DSX in step e and PF in step f.
+ HSP = "#{FORESTER_RUBY}hsp.rb"
+ D2F = "#{FORESTER_RUBY}d2f.rb"
+ DSX = "#{FORESTER_RUBY}dsx.rb"
+ TAP = "#{FORESTER_RUBY}tap.rb"
+ PF = "#{FORESTER_RUBY}phylogeny_factory.rb"
# Template file name, looked up relative to the current working directory and
# copied into the msa directories by run when it exists.
+ TEMPLATE_FILE = '00_phylogeny_factory.template'
# Runs the pipeline for every ".fasta" file found in the current working
# directory. Per input file the steps are:
#   a. identifier normalization (TAP)
#   b. hmmscan
#   c. hmmscan output to simple domain table (HSP)
#   d. domain table to forester format (D2F)
#   e. dsx (DSX)
#   f. optional analysis (PF), only when LAUNCH_ANALYSIS is true
# All per-input outputs are placed in a directory named after the input
# (suffix stripped). Steps b-e are skipped when their output file already
# exists; step a is skipped when that directory already exists.
#
# Command-line arguments read here: ARGV[0] (min length) and ARGV[1]
# (negative E-value exponent for domain extraction).
# NOTE(review): e_for_hmmscan and hmmscan_option are used below but assigned
# in lines not shown in this diff fragment; presumably they come from the two
# optional arguments named in the usage message - confirm in the full file.
# NOTE(review): error and run_command are helpers defined outside this
# fragment; whether error terminates the process cannot be seen from here.
def run
# The "-" line is the old argument-count check (4 to 6); the "+" line now
# accepts 2 to 4 arguments.
- unless ARGV.length >= 4 && ARGV.length <= 6
+ unless ARGV.length >= 2 && ARGV.length <= 4
error "arguments are: <min-length> " +
"<neg E-value exponent for domain extraction> [E-value for hmmscan, default is 10] [hmmscan option, default is --nobias, --max for no heuristics]"
end
-
-
# to_i does not validate: a non-numeric argument silently becomes 0.
length = ARGV[ 0 ].to_i
e_value_exp = ARGV[ 1 ].to_i
# Non-directory entries of the current directory whose name ends in ".fasta"
# (case-insensitive).
input_files = Dir.entries(".").select { |f| !File.directory?( f ) && f.downcase.end_with?( ".fasta" ) }
# Sorted so the processing order is the alphabetical order of the names.
+ input_files.sort!
+
puts "Input files:"
input_files.each do | input |
puts input
end
puts
# 1-based progress counter, printed as "counter/total" for each input.
+ counter = 1
input_files.each do | input |
+ puts counter.to_s + "/" + input_files.size.to_s + " " + input + ": "
+
+ counter += 1
+
hmm_name = ""
# id_norm records whether step a ran in this invocation; orig_input keeps the
# original file name because input is reassigned to the normalized file.
+ id_norm = false
+ orig_input = input
# hmm_name is the input name with the "_ni.fasta" (9 characters) or ".fasta"
# (6 characters) suffix removed. A "_ni.fasta" input is used as is, without
# running step a.
if input.downcase.end_with?( "_ni.fasta" )
hmm_name = input[ 0 .. input.length - 10 ]
elsif input.downcase.end_with?( ".fasta" )
hmm_name = input[ 0 .. input.length - 7 ]
- puts
- puts "0. identifier normalization:"
- cmd = "#{TAP} #{input}"
- run_command( cmd )
- puts
# The existence of a file or directory named hmm_name is the marker for
# "normalization was already done in an earlier run".
+ unless File.exist? hmm_name
+ id_norm = true
+ puts
+ puts "a. identifier normalization:"
# TAP writes the normalized fasta and the .nim file into the current
# directory; they are moved into the hmm_name directory after step e.
+ cmd = "#{TAP} #{input} #{hmm_name}_ni.fasta #{hmm_name}.nim"
+ run_command( cmd )
+ input = hmm_name + "_ni.fasta"
+ else
+ input = hmm_name + "/" + hmm_name + "_ni.fasta"
# NOTE(review): the hmm_name directory is created right after this if/else,
# but the normalized files are only moved into it after step e. If a run
# stops in between, the next run takes this branch and reports this error.
+ unless File.exist? input
+ error "expected to already exist: " + input
+ end
+ puts "a. identifier normalization already done: " + input
+ end
else
error "illegal name: " + input
end
# Output directory for this input, created on first use.
+ unless File.exist? hmm_name
+ Dir.mkdir( hmm_name )
+ end
+
puts
- puts "1. hmmscan:"
- cmd = "#{HMMSCAN} #{hmmscan_option} --domtblout #{hmm_name}_hmmscan_#{e_for_hmmscan.to_s} -E #{e_for_hmmscan.to_s} #{PFAM}Pfam-A.hmm #{input}"
- run_command( cmd )
# Step b: hmmscan of the input against Pfam-A.hmm, skipped when the
# --domtblout file already exists.
+ hmmscan_output = hmm_name + "/" + hmm_name + "_hmmscan_" + e_for_hmmscan.to_s
+ unless File.exist? hmmscan_output
+ puts "b. hmmscan:"
+ cmd = "#{HMMSCAN} #{hmmscan_option} --domtblout #{hmmscan_output} -E #{e_for_hmmscan.to_s} #{PFAM}Pfam-A.hmm #{input}"
+ run_command( cmd )
+ else
+ puts "b. hmmscan output already exists: " + hmmscan_output
+ end
puts
- puts "2. hmmscan to simple domain table:"
- cmd = "#{HSP} #{hmm_name}_hmmscan_#{e_for_hmmscan.to_s} #{hmm_name}_hmmscan_#{e_for_hmmscan.to_s}_domain_table"
- run_command( cmd )
+
# Step c: HSP is given the hmmscan output and the domain table path.
+ hsp_output = hmm_name + "/" + hmm_name + "_hmmscan_#{e_for_hmmscan.to_s}_domain_table"
+ unless File.exist? hsp_output
+ puts "c. hmmscan to simple domain table:"
+ cmd = "#{HSP} #{hmmscan_output} #{hsp_output}"
+ run_command( cmd )
+ else
+ puts "c. hmmscan to simple domain table output already exists: " + hsp_output
+ end
puts
- puts "3. domain table to forester format:"
- cmd = "#{D2F} -e=10 #{hmm_name}_hmmscan_#{e_for_hmmscan.to_s}_domain_table #{input} #{hmm_name}_hmmscan_#{e_for_hmmscan.to_s}.dff"
- run_command( cmd )
# Step d: D2F is given the domain table, the input and the .dff path.
# NOTE(review): -e=10 is hard-coded here and does not follow e_for_hmmscan,
# although the output file name embeds e_for_hmmscan - confirm this is intended.
+ d2f_output = "#{hmm_name}/#{hmm_name}_hmmscan_#{e_for_hmmscan.to_s}.dff"
+ unless File.exist? d2f_output
+ puts "d. domain table to forester format:"
+ cmd = "#{D2F} -e=10 #{hsp_output} #{input} #{d2f_output}"
+ run_command( cmd )
+ else
+ puts "d. domain table to forester format output already exists: " + d2f_output
+ end
puts
- puts "4. dsx:"
- cmd = "#{DSX} -d -e=1e-#{e_value_exp.to_s} -l=#{length} #{hmm_name} #{hmm_name}_hmmscan_#{e_for_hmmscan.to_s} #{input} #{hmm_name}__#{hmm_name}__ee#{e_value_exp.to_s}_#{length}"
- run_command( cmd )
# Step e: DSX is passed the output path without extension, while the skip
# check looks for that path plus ".fasta".
# NOTE(review): presumably DSX appends ".fasta" itself - confirm.
+ dsx_output_base = "#{hmm_name}__#{hmm_name}__ee#{e_value_exp.to_s}_#{length}"
+ dsx_output = hmm_name +"/" + dsx_output_base
+ unless File.exist? dsx_output + ".fasta"
+ puts "e. dsx:"
+ cmd = "#{DSX} -d -e=1e-#{e_value_exp.to_s} -l=#{length} #{hmm_name} #{hmmscan_output} #{input} #{dsx_output}"
+ run_command( cmd )
+ else
+ puts "e. dsx output already exists: " + dsx_output + ".fasta"
+ end
puts
# When step a ran in this invocation, its two output files are moved from the
# current directory into the hmm_name directory and the original input is
# copied there as well (the original stays in place).
+ if id_norm
+ FileUtils.mv "#{hmm_name}_ni.fasta", "#{hmm_name}/#{hmm_name}_ni.fasta"
+ FileUtils.mv "#{hmm_name}.nim", "#{hmm_name}/#{hmm_name}.nim"
+ FileUtils.cp orig_input, "#{hmm_name}/#{orig_input}"
+ end
+
# Two working directories per parameter set; "msa" is used with -b=1 and
# "msa100" with -b=100 in step f.
+ msa_dir = hmm_name + "/msa_ee#{e_value_exp.to_s}_#{length}"
+ msa_100_dir =hmm_name + "/msa100_ee#{e_value_exp.to_s}_#{length}"
+
+ unless File.exist? msa_dir
+ Dir.mkdir( msa_dir )
+ end
+ unless File.exist? msa_100_dir
+ Dir.mkdir( msa_100_dir )
+ end
+
# run_1 / run_100 become true only when the dsx result had not yet been
# copied into the respective directory.
+ run_1 = false
+ run_100 = false
+
# The copy is stored without the ".fasta" extension; the presence of that
# copy is what marks the directory as already prepared.
+ unless File.exist? "#{msa_dir}/#{dsx_output_base}"
+ run_1 = true
+ FileUtils.cp "#{dsx_output}.fasta", "#{msa_dir}/#{dsx_output_base}"
+ end
+
+ unless File.exist? "#{msa_100_dir}/#{dsx_output_base}"
+ run_100 = true
+ FileUtils.cp "#{dsx_output}.fasta", "#{msa_100_dir}/#{dsx_output_base}"
+ end
+
# NOTE(review): when TEMPLATE_FILE is missing from the current directory,
# neither the template copy nor step f happens, and nothing is reported.
+ if File.exist?( TEMPLATE_FILE )
+ if run_1
+ FileUtils.cp TEMPLATE_FILE, msa_dir
+ end
+ if run_100
+ FileUtils.cp TEMPLATE_FILE, msa_100_dir
+ end
+
+ if LAUNCH_ANALYSIS && ( run_1 || run_100 )
+ puts "f. analysis:"
# PF is run from inside the msa directory. That directory is two levels below
# the starting directory (hmm_name/msa...), hence the "../.." to return.
# NOTE(review): the return chdir is not in an ensure block; if run_command
# raises, the working directory stays changed - confirm run_command's behavior.
+ if run_1
+ Dir.chdir msa_dir
+ run_command "#{PF} -b=1 -s"
+ Dir.chdir "../.."
+ end
+ if run_100
+ Dir.chdir msa_100_dir
+ run_command "#{PF} -b=100 -s"
+ Dir.chdir "../.."
+ end
+ puts
+ end
+ end
+
# The first end closes the per-input each block, the second closes run.
end
end