in progress...
[jalview.git] / forester / ruby / evoruby / exe / run_phylo_pipeline_x.rb
1 #!/usr/local/bin/ruby -w
2 #
3 # = run_phylo_pipeline
4 #
5 # Copyright::  Copyright (C) 2010 Christian M. Zmasek
6 # License::    GNU Lesser General Public License (LGPL)
7 #
8 # $Id Exp $
9 #
10 #
11
12 require 'fileutils'
13
14 module Evoruby
15
16   class RunPhyloPipeline
17
18     LAUNCH_ANALYSIS = false
19     HOME          = "/home/czmasek/"
20     FORESTER_RUBY = "#{HOME}SOFTWARE/FORESTER/DEV/forester/forester/ruby/evoruby/exe/"
21     PFAM          = "#{HOME}DATA/PFAM/PFAM270X/"
22     HMMSCAN       = "#{HOME}SOFTWARE/HMMER/hmmer-3.0/src/hmmscan"
23     HSP           = "#{FORESTER_RUBY}hsp.rb"
24     D2F           = "#{FORESTER_RUBY}d2f.rb"
25     DSX           = "#{FORESTER_RUBY}dsx.rb"
26     TAP           = "#{FORESTER_RUBY}tap.rb"
27     PF            = "#{FORESTER_RUBY}phylogeny_factory.rb"
28     TEMPLATE_FILE = '00_phylogeny_factory.template'
29
30     def run
31       unless ARGV.length >= 2 && ARGV.length <= 4
32         error "arguments are:  <min-length> " +
33          "<neg E-value exponent for domain extraction> [E-value for hmmscan, default is 10] [hmmscan option, default is --nobias, --max for no heuristics]"
34       end
35
36       length      = ARGV[ 0 ].to_i
37       e_value_exp = ARGV[ 1 ].to_i
38
39       e_for_hmmscan = 10
40       hmmscan_option = "--nobias"
41
42       if ARGV.length == 4
43         hmmscan_option = ARGV[ 3 ]
44       end
45       if ARGV.length == 3 || ARGV.length == 4
46         e_for_hmmscan = ARGV[ 2 ].to_i
47       end
48
49       if e_value_exp < 0
50         error "E-value exponent for domain extraction cannot be negative"
51       end
52       if length <= 1
53         error "length cannot be smaller than or equal to 1"
54       end
55       if e_for_hmmscan < 1
56         error "E-value for hmmscan cannot be smaller than 1"
57       end
58
59       input_files = Dir.entries(".").select { |f| !File.directory?( f ) && f.downcase.end_with?( ".fasta" ) }
60
61       input_files.sort!
62
63       puts "Input files:"
64       input_files.each do | input |
65         puts input
66       end
67       puts
68
69       counter = 1
70       input_files.each do | input |
71
72         puts counter.to_s + "/" +  input_files.size.to_s + " " + input + ": "
73
74         counter += 1
75
76         hmm_name = ""
77         id_norm = false
78         orig_input = input
79
80         if input.downcase.end_with?( "_ni.fasta" )
81           hmm_name = input[ 0 .. input.length - 10 ]
82         elsif input.downcase.end_with?( ".fasta" )
83           hmm_name = input[ 0 .. input.length - 7 ]
84           unless File.exist? hmm_name
85             id_norm = true
86             puts
87             puts "a. identifier normalization:"
88             cmd = "#{TAP} #{input} #{hmm_name}_ni.fasta #{hmm_name}.nim"
89             run_command( cmd )
90             input = hmm_name + "_ni.fasta"
91           else
92             input = hmm_name + "/" + hmm_name + "_ni.fasta"
93             unless File.exist? input
94               error "expected to already exist: " + input
95             end
96             puts "a. identifier normalization already done: " + input
97           end
98         else
99           error "illegal name: " + input
100         end
101
102         unless File.exist? hmm_name
103           Dir.mkdir( hmm_name )
104         end
105
106         puts
107         hmmscan_output = hmm_name + "/" + hmm_name + "_hmmscan_" + e_for_hmmscan.to_s
108         unless File.exist? hmmscan_output
109           puts "b. hmmscan:"
110           cmd = "#{HMMSCAN} #{hmmscan_option} --domtblout #{hmmscan_output} -E #{e_for_hmmscan.to_s} #{PFAM}Pfam-A.hmm #{input}"
111           run_command( cmd )
112         else
113           puts "b. hmmscan output already exists: " + hmmscan_output
114         end
115         puts
116
117
118         hsp_output = hmm_name + "/" + hmm_name + "_hmmscan_#{e_for_hmmscan.to_s}_domain_table"
119         unless File.exist? hsp_output
120           puts "c. hmmscan to simple domain table:"
121           cmd = "#{HSP} #{hmmscan_output} #{hsp_output}"
122           run_command( cmd )
123         else
124           puts "c. hmmscan to simple domain table output already exists: " + hsp_output
125         end
126         puts
127
128         d2f_output = "#{hmm_name}/#{hmm_name}_hmmscan_#{e_for_hmmscan.to_s}.dff"
129         unless File.exist? d2f_output
130           puts "d. domain table to forester format:"
131           cmd = "#{D2F} -e=10 #{hsp_output} #{input} #{d2f_output}"
132           run_command( cmd )
133         else
134           puts "d. domain table to forester format output already exists: " + d2f_output
135         end
136         puts
137
138         dsx_output_base = "#{hmm_name}__#{hmm_name}__ee#{e_value_exp.to_s}_#{length}"
139         dsx_output = hmm_name +"/" + dsx_output_base
140         unless File.exist? dsx_output + ".fasta"
141           puts "e. dsx:"
142           cmd = "#{DSX} -d -e=1e-#{e_value_exp.to_s} -l=#{length} #{hmm_name} #{hmmscan_output} #{input} #{dsx_output}"
143           run_command( cmd )
144         else
145           puts "e. dsx output already exists: " + dsx_output + ".fasta"
146         end
147         puts
148
149         if id_norm
150           FileUtils.mv "#{hmm_name}_ni.fasta", "#{hmm_name}/#{hmm_name}_ni.fasta"
151           FileUtils.mv "#{hmm_name}.nim", "#{hmm_name}/#{hmm_name}.nim"
152           FileUtils.cp orig_input, "#{hmm_name}/#{orig_input}"
153         end
154
155         msa_dir = hmm_name + "/msa_ee#{e_value_exp.to_s}_#{length}"
156         msa_100_dir =hmm_name + "/msa100_ee#{e_value_exp.to_s}_#{length}"
157
158         unless File.exist? msa_dir
159           Dir.mkdir( msa_dir )
160         end
161         unless File.exist? msa_100_dir
162           Dir.mkdir( msa_100_dir )
163         end
164
165         run_1 = false
166         run_100 = false
167
168         unless File.exist? "#{msa_dir}/#{dsx_output_base}"
169           run_1 = true
170           FileUtils.cp "#{dsx_output}.fasta", "#{msa_dir}/#{dsx_output_base}"
171         end
172
173         unless File.exist? "#{msa_100_dir}/#{dsx_output_base}"
174           run_100 = true
175           FileUtils.cp "#{dsx_output}.fasta", "#{msa_100_dir}/#{dsx_output_base}"
176         end
177
178         if File.exist?( TEMPLATE_FILE )
179           if run_1
180             FileUtils.cp TEMPLATE_FILE, msa_dir
181           end
182           if run_100
183             FileUtils.cp TEMPLATE_FILE, msa_100_dir
184           end
185
186           if LAUNCH_ANALYSIS && ( run_1 || run_100 )
187             puts "f. analysis:"
188             if run_1
189               Dir.chdir msa_dir
190               run_command "#{PF} -b=1 -s"
191               Dir.chdir "../.."
192             end
193             if run_100
194               Dir.chdir msa_100_dir
195               run_command "#{PF} -b=100 -s"
196               Dir.chdir "../.."
197             end
198             puts
199           end
200         end
201
202       end
203
204     end
205
206     def run_command cmd
207       puts cmd
208       `#{cmd}`
209     end
210
211     def get_base_name n
212       if n.downcase.end_with?( "_ni.fasta" )
213         n[ 0 .. n.length - 10 ]
214       elsif n.downcase.end_with?( ".fasta" )
215         n[ 0 .. n.length - 7 ]
216       else
217         error "illegal name: " + n
218       end
219     end
220
221     def error msg
222       puts
223       puts msg
224       puts
225       exit
226     end
227
228   end
229
230   p = RunPhyloPipeline.new()
231
232   p.run()
233
234 end
235
236
237