inprogress
[jalview.git] / forester / ruby / evoruby / exe / run_phylo_pipeline.rb
index d68c58b..5cd0401 100755 (executable)
 # Copyright::  Copyright (C) 2010 Christian M. Zmasek
 # License::    GNU Lesser General Public License (LGPL)
 #
-# $Id: run_phylo_pipeline.rb,v 1.15 2010/10/09 02:35:42 cmzmasek Exp $
+# $Id Exp $
 #
 #
 
 
-#  hmmscan --nobias --domtblout <BACTH_CHIPI>_hmmscan_240_10 -E 10 /home/czmasek/DATA/PFAM/PFAM240/Pfam-A.hmm <BACTH_CHIPI>.fasta
-
-#  hsp <BACTH_CHIPI>_hmmscan_240_10 <BACTH_CHIPI>_hmmscan_240_10_domain_table
-
-#  d2f -e=10 <BACTH_CHIPI>_hmmscan_240_10_domain_table <BACTH_CHIPI>.fasta <BACTH_CHIPI>_hmmscan_240_10.dff
-
-# hmmsearch --nobias -E 1000 --domtblout <BACTH_CHIPI>.hmmsearch_SusD  <~/DATA/PFAM/PFAM240/PFAM_A_HMMs/SusD.hmm> BACTH_CHIPI.fasta
-
-# dsx -dd -e=<1e-2> -l=<200> <BACTH_CHIPI>.hmmsearch_SusD <BACTH_CHIPI>.fasta BACTH_CHIPI_e2_200
-
-
 module Evoruby
 
   class RunPhyloPipeline
 
     # Command-line driver that chains four external programs (hmmscan, hsp.rb,
     # d2f.rb, dsx.rb) over one FASTA input file; see run for the steps.
     #
     # The constants below are absolute paths under /home/czmasek, so the
     # script only works on a machine where those locations exist.
     # PFAM is a directory prefix: run appends "Pfam-A.hmm" to it directly,
     # which is why it has to keep its trailing slash.
+    PFAM      = "/home/czmasek/DATA/PFAM/PFAM270X/"
+    HMMSCAN  = "/home/czmasek/SOFTWARE/HMMER/hmmer-3.0/src/hmmscan"
+    HSP       = "/home/czmasek/SOFTWARE/FORESTER/DEV/forester/forester/ruby/evoruby/exe/hsp.rb"
+    D2F       = "/home/czmasek/SOFTWARE/FORESTER/DEV/forester/forester/ruby/evoruby/exe/d2f.rb"
+    DSX       = "/home/czmasek/SOFTWARE/FORESTER/DEV/forester/forester/ruby/evoruby/exe/dsx.rb"
+
     # Reads 4 to 6 positional arguments from ARGV, validates them and runs the
     # four pipeline commands in order through run_command.
     #
     #   ARGV[0]  input file (FASTA)
     #   ARGV[1]  HMM name, handed to dsx.rb
     #   ARGV[2]  minimal length, handed to dsx.rb as -l=
     #   ARGV[3]  exponent x of the dsx.rb cutoff -e=1e-x
     #   ARGV[4]  optional E-value for hmmscan (default 20)
     #   ARGV[5]  optional hmmscan option (default "--nobias")
     #
     # Every validation failure goes through error, which prints the message
     # and exits the process.
     # NOTE(review): the arguments are interpolated into shell command lines
     # without quoting, so values containing spaces or shell metacharacters
     # will break (or alter) the commands.
     def run
-      unless ARGV.length == 4
-        puts "arguments are: [inputfile].fasta [hmm-name] [min-length] [neg e-value exponent]"
-        exit
+      unless ARGV.length >= 4 && ARGV.length <= 6
+        error "arguments are: <fasta formatted inputfile> <hmm-name> <min-length> " +
+         "<neg E-value exponent for domain extraction> [E-value for hmmscan, default is 20] [hmmscan option, default is --nobias, --max for no heuristics]"
       end
 
-      hmmscan   = "/home/czmasek/SOFTWARE/HMMER/hmmer-3.0b3/src/hmmscan"
-      hmmsearch = "/home/czmasek/SOFTWARE/HMMER/hmmer-3.0b3/src/hmmsearch"
-      hsp       = "/home/czmasek/SOFTWARE/FORESTER/DEV/forester-atv/ruby/evoruby/exe/hsp.rb"
-      d2f       = "/home/czmasek/SOFTWARE/FORESTER/DEV/forester-atv/ruby/evoruby/exe/d2f.rb"
-      dsx       = "/home/czmasek/SOFTWARE/FORESTER/DEV/forester-atv/ruby/evoruby/exe/dsx.rb"
-
-      base_name   = ARGV[ 0 ]
+      input       = ARGV[ 0 ]
       hmm         = ARGV[ 1 ]
-      length      = ARGV[ 2 ]
-      e_value_exp = ARGV[ 3 ]
-      do_domain_combination_analysis = true
       # String#to_i returns 0 for text that does not start with a number. A
       # malformed min-length is therefore caught by the "length <= 1" check
       # below, but a malformed exponent silently becomes 0 and is accepted.
+      length      = ARGV[ 2 ].to_i
+      e_value_exp = ARGV[ 3 ].to_i
+
       # Defaults for the two optional arguments.
+      e_for_hmmscan = 20
+      hmmscan_option = "--nobias"
 
-      if do_domain_combination_analysis
       # Both optional arguments are positional, so the hmmscan option (6th)
       # can only be given together with the hmmscan E-value (5th).
+      if ARGV.length == 6
+        hmmscan_option = ARGV[ 5 ]
+      end
+      if ARGV.length == 5 || ARGV.length == 6
+        e_for_hmmscan = ARGV[ 4 ].to_i
+      end
 
-        cmd = "#{hmmscan} --nobias --domtblout #{base_name}_hmmscan_240_10 -E 10 /home/czmasek/DATA/PFAM/PFAM240/Pfam-A.hmm #{base_name}.fasta"
-        run_command( cmd )
       # Range checks on the integer arguments; each failure ends the script.
+      if e_value_exp < 0
+        error "E-value exponent for domain extraction cannot be negative"
+      end
+      if length <= 1
+        error "length cannot be smaller than or equal to 1"
+      end
+      if e_for_hmmscan < 1
+        error "E-value for hmmscan cannot be smaller than 1"
+      end
 
-        cmd = "#{hsp} #{base_name}_hmmscan_240_10 #{base_name}_hmmscan_240_10_domain_table"
-        run_command( cmd )
       # All output file names are derived from the input name with its FASTA
       # suffix removed.
+      base_name = get_base_name input
 
-        cmd = "#{d2f} -e=10 #{base_name}_hmmscan_240_10_domain_table #{base_name}.fasta #{base_name}_hmmscan_240_10.dff"
-        run_command( cmd )
       # Step 1: hmmscan of the input against #{PFAM}Pfam-A.hmm; the
       # --domtblout file is named <base_name>_hmmscan_<E-value>.
+      puts
+      puts "1. hmmscan:"
+      cmd = "#{HMMSCAN} #{hmmscan_option} --domtblout #{base_name}_hmmscan_#{e_for_hmmscan.to_s} -E #{e_for_hmmscan.to_s} #{PFAM}Pfam-A.hmm #{input}"
+      run_command( cmd )
+      puts
 
-      end
       # Step 2: hsp.rb reads the step 1 file and writes
       # <base_name>_hmmscan_<E-value>_domain_table.
+      puts "2. hmmscan to simple domain table:"
+      cmd = "#{HSP} #{base_name}_hmmscan_#{e_for_hmmscan.to_s} #{base_name}_hmmscan_#{e_for_hmmscan.to_s}_domain_table"
+      run_command( cmd )
+      puts
 
-      cmd = "#{hmmsearch} --nobias -E 1000 --domtblout #{base_name}.hmmsearch_#{hmm}  ~/DATA/PFAM/PFAM240/PFAM_A_HMMs/#{hmm}.hmm #{base_name}.fasta"
       # Step 3: d2f.rb combines the domain table with the input file into
       # <base_name>_hmmscan_<E-value>.dff.
       # NOTE(review): -e=10 is fixed here and does not follow e_for_hmmscan
       # -- confirm that this is intended.
+      puts "3. domain table to forester format:"
+      cmd = "#{D2F} -e=10 #{base_name}_hmmscan_#{e_for_hmmscan.to_s}_domain_table #{input} #{base_name}_hmmscan_#{e_for_hmmscan.to_s}.dff"
       run_command( cmd )
+      puts
 
-      cmd = "#{dsx} -dd -e=1e-#{e_value_exp.to_s} -l=#{length} #{base_name}.hmmsearch_#{hmm} #{base_name}.fasta #{base_name}_e#{e_value_exp.to_s}_#{length}"
       # Step 4: dsx.rb is given the HMM name, the step 1 hmmscan file and the
       # input file; its last argument is
       # <base_name>__<hmm>__ee<exponent>_<length>.
+      puts "4. dsx:"
+      cmd = "#{DSX} -d -e=1e-#{e_value_exp.to_s} -l=#{length} #{hmm} #{base_name}_hmmscan_#{e_for_hmmscan.to_s} #{input} #{base_name}__#{hmm}__ee#{e_value_exp.to_s}_#{length}"
       run_command( cmd )
+      puts
 
     end
 
     # Echoes cmd to standard output, then executes it with backticks. The
     # return value is whatever the command wrote to its standard output; run
     # does not use it.
     # NOTE(review): the exit status ($?) is never inspected, so a step that
     # exits non-zero does not stop the following steps -- confirm this is
     # intended.
-    def run_command( cmd )
+    def run_command cmd
       puts cmd
       `#{cmd}`
     end
 
     # Returns n without a trailing "_ni.fasta", ".fasta", "_ni.fsa" or ".fsa"
     # (compared case-insensitively); any other name is returned unchanged.
     # Each "_ni" variant is tested before its plain extension so that the
     # "_ni" part is removed as well.
     # Every slice ends at n.length - (suffix length + 1), i.e. at the last
     # character in front of the suffix.
     # NOTE(review): when n consists of the suffix alone, that end index is
     # -1, which Ruby reads as "last character", so n comes back unchanged
     # instead of as an empty string.
+    def get_base_name n
+      if n.downcase.end_with?( "_ni.fasta" )
+        n[ 0 .. n.length - 10 ]
+      elsif n.downcase.end_with?( ".fasta" )
+        n[ 0 .. n.length - 7 ]
+      elsif n.downcase.end_with?( "_ni.fsa" )
+        n[ 0 .. n.length - 8 ]
+      elsif n.downcase.end_with?( ".fsa" )
+        n[ 0 .. n.length - 5 ]
+      else
+        n
+      end
+    end
+
     # Prints msg on standard output, framed by one empty line before and
     # after, then terminates the script.
     # NOTE(review): Kernel#exit without an argument reports success
     # (status 0) and the message goes to stdout rather than stderr -- confirm
     # that nothing calling this script relies on a failure status.
+    def error msg
+      puts
+      puts msg
+      puts
+      exit
+    end
+
   end
 
   p = RunPhyloPipeline.new()