in progress
authorcmzmasek <chris.zma@outlook.com>
Thu, 16 Feb 2017 01:10:56 +0000 (17:10 -0800)
committercmzmasek <chris.zma@outlook.com>
Thu, 16 Feb 2017 01:10:56 +0000 (17:10 -0800)
13 files changed:
forester/ruby/evoruby/exe/fae.rb [deleted file]
forester/ruby/evoruby/exe/fastx.rb [new file with mode: 0755]
forester/ruby/evoruby/exe/hsa.rb
forester/ruby/evoruby/exe/hsp.rb [changed mode: 0755->0644]
forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb
forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb
forester/ruby/evoruby/lib/evo/tool/domains_to_forester.rb
forester/ruby/evoruby/lib/evo/tool/fasta_extractor.rb
forester/ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb
forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb
forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb
forester/ruby/evoruby/lib/evo/util/constants.rb
forester/ruby/scripts/hmm_split.rb

diff --git a/forester/ruby/evoruby/exe/fae.rb b/forester/ruby/evoruby/exe/fae.rb
deleted file mode 100755 (executable)
index f75af89..0000000
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/usr/local/bin/ruby -w
-#
-# = exe/fae
-#
-# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
-#
-# $Id: fae.rb,v 1.1 2008/09/10 02:16:34 cmzmasek Exp $
-
-
-require 'lib/evo/tools/fasta_extractor'
-
-module Evoruby
-    
-    mse = FastaExtractor.new()
-    
-    mse.run()
-    
-end  # module Evoruby
\ No newline at end of file
diff --git a/forester/ruby/evoruby/exe/fastx.rb b/forester/ruby/evoruby/exe/fastx.rb
new file mode 100755 (executable)
index 0000000..62c720a
--- /dev/null
@@ -0,0 +1,17 @@
+#!/usr/local/bin/ruby -w
+#
+# = exe/fastx
+#
+#
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
+
+require 'lib/evo/tool/fasta_extractor'
+
+module Evoruby
+    
+    mse = FastaExtractor.new()
+    
+    mse.run()
+    
+end  # module Evoruby
\ No newline at end of file
index a79da24..b5987e4 100755 (executable)
@@ -2,12 +2,8 @@
 #
 # = exe/hsp
 #
-# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
-#
-# $Id: hsp.rb,v 1.1 2009/11/25 05:42:04 cmzmasek Exp $
-#
-# last modified: 11/24/2009
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
 
 require 'lib/evo/tool/hmmscan_analysis'
 
old mode 100755 (executable)
new mode 100644 (file)
index 7b09d13..105d3dc
@@ -2,19 +2,15 @@
 #
 # = exe/hsp
 #
-# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
-#
-# $Id: hsp.rb,v 1.1 2009/11/25 05:42:04 cmzmasek Exp $
-#
-# last modified: 11/24/2009
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
 
 require 'lib/evo/tool/hmmscan_summary'
 
 module Evoruby
 
-    hsp = HmmscanSummary.new()
+  hsp = HmmscanSummary.new()
 
-    hsp.run()
+  hsp.run()
 
-end  # module Evoruby
+end  # module Evoruby
\ No newline at end of file
index 39a0088..5186bf5 100644 (file)
@@ -1,11 +1,8 @@
 #
 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanDomainExtractor class
 #
-# Copyright::  Copyright (C) 2012 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
-#
-# $Id:  $
-
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
 
 require 'lib/evo/util/constants'
 require 'lib/evo/msa/msa_factory'
@@ -14,31 +11,27 @@ require 'lib/evo/io/writer/fasta_writer'
 require 'lib/evo/io/parser/fasta_parser'
 require 'lib/evo/io/parser/hmmscan_parser'
 
-
-
 module Evoruby
-
   class HmmscanDomainExtractor
 
     ADD_TO_CLOSE_PAIRS = 0
-
     def initialize
     end
 
     # raises ArgumentError, IOError, StandardError
     def parse( domain_id,
-        hmmscan_output,
-        fasta_sequence_file,
-        outfile,
-        passed_seqs_outfile,
-        failed_seqs_outfile,
-        e_value_threshold,
-        length_threshold,
-        add_position,
-        add_domain_number,
-        add_species,
-        min_linker,
-        log )
+      hmmscan_output,
+      fasta_sequence_file,
+      outfile,
+      passed_seqs_outfile,
+      failed_seqs_outfile,
+      e_value_threshold,
+      length_threshold,
+      add_position,
+      add_domain_number,
+      add_species,
+      min_linker,
+      log )
 
       Util.check_file_for_readability( hmmscan_output )
       Util.check_file_for_readability( fasta_sequence_file )
@@ -106,7 +99,7 @@ module Evoruby
         i_e_value = r.i_e_value
 
         if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
-             ( ( length_threshold <= 0 )   || ( env_to - env_from + 1 ) >= length_threshold.to_f ) )
+        ( ( length_threshold <= 0 )   || ( env_to - env_from + 1 ) >= length_threshold.to_f ) )
           hmmscan_datas << HmmsearchData.new( sequence, number, out_of, env_from, env_to, i_e_value )
           if ( number > max_domain_copy_number_per_protein )
             max_domain_copy_number_sequence    = sequence
@@ -137,21 +130,21 @@ module Evoruby
         if number == out_of
           if !hmmscan_datas.empty?
             process_hmmscan_datas( hmmscan_datas,
-              in_msa,
-              add_position,
-              add_domain_number,
-              add_species,
-              out_msa,
-              out_msa_singles,
-              out_msa_pairs,
-              out_msa_isolated,
-              min_linker,
-              out_msa_single_domains_protein_seqs,
-              out_msa_close_pairs_protein_seqs,
-              out_msa_close_pairs_only_protein_seqs,
-              out_msa_isolated_protein_seqs,
-              out_msa_isolated_only_protein_seqs,
-              out_msa_isolated_and_close_pair_protein_seqs )
+            in_msa,
+            add_position,
+            add_domain_number,
+            add_species,
+            out_msa,
+            out_msa_singles,
+            out_msa_pairs,
+            out_msa_isolated,
+            min_linker,
+            out_msa_single_domains_protein_seqs,
+            out_msa_close_pairs_protein_seqs,
+            out_msa_close_pairs_only_protein_seqs,
+            out_msa_isolated_protein_seqs,
+            out_msa_isolated_only_protein_seqs,
+            out_msa_isolated_and_close_pair_protein_seqs )
             domain_pass_counter += hmmscan_datas.length
             if passed_seqs.find_by_name_start( sequence, true ).length < 1
               add_sequence( sequence, in_msa, passed_seqs )
@@ -193,11 +186,11 @@ module Evoruby
       write_msa( failed_seqs, failed_seqs_outfile )
 
       if out_msa_pairs
-        write_msa( out_msa_pairs, outfile +"_" + min_linker.to_s + ".fasta")
+        write_msa( out_msa_pairs, outfile + "_" + min_linker.to_s + ".fasta")
       end
 
       if out_msa_singles
-        write_msa( out_msa_singles, outfile +"_singles.fasta")
+        write_msa( out_msa_singles, outfile + "_singles.fasta")
       end
 
       if out_msa_isolated
@@ -205,7 +198,7 @@ module Evoruby
       end
 
       if out_msa_single_domains_protein_seqs
-        write_msa( out_msa_single_domains_protein_seqs, outfile +"_proteins_with_singles.fasta" )
+        write_msa( out_msa_single_domains_protein_seqs, outfile + "_proteins_with_singles.fasta" )
       end
 
       if out_msa_close_pairs_protein_seqs
@@ -230,9 +223,9 @@ module Evoruby
 
       if min_linker
         if ( out_msa_single_domains_protein_seqs.get_number_of_seqs +
-             out_msa_close_pairs_only_protein_seqs.get_number_of_seqs +
-             out_msa_isolated_only_protein_seqs.get_number_of_seqs +
-             out_msa_isolated_and_close_pair_protein_seqs.get_number_of_seqs ) != passed_seqs.get_number_of_seqs
+        out_msa_close_pairs_only_protein_seqs.get_number_of_seqs +
+        out_msa_isolated_only_protein_seqs.get_number_of_seqs +
+        out_msa_isolated_and_close_pair_protein_seqs.get_number_of_seqs ) != passed_seqs.get_number_of_seqs
           error_msg = "this should not have happened"
           raise StandardError, error_msg
         end
@@ -263,7 +256,6 @@ module Evoruby
 
     end # parse
 
-
     private
 
     def write_msa( msa, filename )
@@ -279,7 +271,6 @@ module Evoruby
       end
     end
 
-
     def add_sequence( sequence_name, in_msa, add_to_msa )
       seqs = in_msa.find_by_name_start( sequence_name, true )
       if ( seqs.length < 1 )
@@ -295,21 +286,21 @@ module Evoruby
     end
 
     def process_hmmscan_datas( hmmscan_datas,
-        in_msa,
-        add_position,
-        add_domain_number,
-        add_species,
-        out_msa,
-        out_msa_singles,
-        out_msa_pairs,
-        out_msa_isolated,
-        min_linker,
-        out_msa_single_domains_protein_seqs,
-        out_msa_close_pairs_protein_seqs,
-        out_msa_close_pairs_only_protein_seqs,
-        out_msa_isolated_protein_seqs,
-        out_msa_isolated_only_protein_seqs,
-        out_msa_isolated_and_close_pair_protein_seqs )
+      in_msa,
+      add_position,
+      add_domain_number,
+      add_species,
+      out_msa,
+      out_msa_singles,
+      out_msa_pairs,
+      out_msa_isolated,
+      min_linker,
+      out_msa_single_domains_protein_seqs,
+      out_msa_close_pairs_protein_seqs,
+      out_msa_close_pairs_only_protein_seqs,
+      out_msa_isolated_protein_seqs,
+      out_msa_isolated_only_protein_seqs,
+      out_msa_isolated_and_close_pair_protein_seqs )
 
       actual_out_of = hmmscan_datas.size
       saw_close_pair = false
@@ -327,28 +318,28 @@ module Evoruby
         seq_name =  hmmscan_data.seq_name
 
         extract_domain( seq_name,
-          index + 1,
-          actual_out_of,
-          hmmscan_data.env_from,
-          hmmscan_data.env_to,
-          in_msa,
-          out_msa,
-          add_position,
-          add_domain_number,
-          add_species )
+        index + 1,
+        actual_out_of,
+        hmmscan_data.env_from,
+        hmmscan_data.env_to,
+        in_msa,
+        out_msa,
+        add_position,
+        add_domain_number,
+        add_species )
 
         if min_linker
           if actual_out_of == 1
             extract_domain( seq_name,
-              1,
-              1,
-              hmmscan_data.env_from,
-              hmmscan_data.env_to,
-              in_msa,
-              out_msa_singles,
-              add_position,
-              add_domain_number,
-              add_species )
+            1,
+            1,
+            hmmscan_data.env_from,
+            hmmscan_data.env_to,
+            in_msa,
+            out_msa_singles,
+            add_position,
+            add_domain_number,
+            add_species )
             if out_msa_single_domains_protein_seqs.find_by_name_start( seq_name, true ).length < 1
               add_sequence( seq_name, in_msa, out_msa_single_domains_protein_seqs )
             else
@@ -361,20 +352,20 @@ module Evoruby
             last = index == hmmscan_datas.length - 1
 
             if ( ( first && ( ( hmmscan_datas[ index + 1 ].env_from - hmmscan_data.env_to ) > min_linker) )  ||
-                 ( last && ( ( hmmscan_data.env_from - hmmscan_datas[ index - 1 ].env_to ) > min_linker ) ) ||
-                 ( !first && !last &&  ( ( hmmscan_datas[ index + 1 ].env_from - hmmscan_data.env_to ) > min_linker ) &&
-                   ( ( hmmscan_data.env_from - hmmscan_datas[ index - 1 ].env_to ) > min_linker ) ) )
+            ( last && ( ( hmmscan_data.env_from - hmmscan_datas[ index - 1 ].env_to ) > min_linker ) ) ||
+            ( !first && !last &&  ( ( hmmscan_datas[ index + 1 ].env_from - hmmscan_data.env_to ) > min_linker ) &&
+            ( ( hmmscan_data.env_from - hmmscan_datas[ index - 1 ].env_to ) > min_linker ) ) )
 
               extract_domain( seq_name,
-                index + 1,
-                actual_out_of,
-                hmmscan_data.env_from,
-                hmmscan_data.env_to,
-                in_msa,
-                out_msa_isolated,
-                add_position,
-                add_domain_number,
-                add_species )
+              index + 1,
+              actual_out_of,
+              hmmscan_data.env_from,
+              hmmscan_data.env_to,
+              in_msa,
+              out_msa_isolated,
+              add_position,
+              add_domain_number,
+              add_species )
               saw_isolated = true
 
             elsif !first
@@ -396,15 +387,15 @@ module Evoruby
               end
 
               extract_domain( seq_name,
-                index.to_s  + "+" + ( index + 1 ).to_s,
-                actual_out_of,
-                from,
-                to,
-                in_msa,
-                out_msa_pairs,
-                add_position,
-                add_domain_number,
-                add_species )
+              index.to_s  + "+" + ( index + 1 ).to_s,
+              actual_out_of,
+              from,
+              to,
+              in_msa,
+              out_msa_pairs,
+              add_position,
+              add_domain_number,
+              add_species )
               saw_close_pair = true
             end
           end
@@ -456,15 +447,15 @@ module Evoruby
     end # def process_hmmscan_data
 
     def extract_domain( sequence,
-        number,
-        out_of,
-        seq_from,
-        seq_to,
-        in_msa,
-        out_msa,
-        add_position,
-        add_domain_number,
-        add_species )
+      number,
+      out_of,
+      seq_from,
+      seq_to,
+      in_msa,
+      out_msa,
+      add_position,
+      add_domain_number,
+      add_species )
       if number.is_a?( Fixnum ) && ( number < 1 || out_of < 1 || number > out_of )
         error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s
         raise StandardError, error_msg
index a8144ec..ac03d43 100644 (file)
@@ -13,9 +13,9 @@ module Evoruby
   class DomainSequenceExtractor
 
     PRG_NAME       = "dsx"
-    PRG_VERSION    = "2.001"
-    PRG_DESC       = "extraction of domain sequences from hmmscan output"
-    PRG_DATE       = "20170213"
+    PRG_VERSION    = "2.002"
+    PRG_DESC       = "Extraction of domain sequences from hmmscan output"
+    PRG_DATE       = "20170214"
     WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
 
     E_VALUE_THRESHOLD_OPTION           = 'e'
@@ -23,7 +23,6 @@ module Evoruby
     ADD_POSITION_OPTION                = 'p'
     ADD_DOMAIN_NUMBER_OPTION           = 'd'
     ADD_SPECIES                        = 's'
-    MIN_LINKER_OPT                     = 'ml'
     LOG_FILE_SUFFIX                    = '_domain_seq_extr.log'
     PASSED_SEQS_SUFFIX                 = '_with_passing_domains.fasta'
     FAILED_SEQS_SUFFIX                 = '_with_no_passing_domains.fasta'
@@ -38,7 +37,10 @@ module Evoruby
       WWW,
       STDOUT )
 
-      ld = Constants::LINE_DELIMITER
+      if ( ARGV == nil || ( ARGV.length < 1 )  )
+        print_help
+        exit( -1 )
+      end
 
       begin
         cla = CommandLineArguments.new( ARGV )
@@ -52,7 +54,7 @@ module Evoruby
         exit( 0 )
       end
 
-      if ( cla.get_number_of_files != 4 )
+      unless ( cla.get_number_of_files == 2 || cla.get_number_of_files == 3 || cla.get_number_of_files == 4 )
         print_help
         exit( -1 )
       end
@@ -63,7 +65,6 @@ module Evoruby
       allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION )
       allowed_opts.push( LENGTH_THRESHOLD_OPTION )
       allowed_opts.push( ADD_SPECIES )
-      allowed_opts.push( MIN_LINKER_OPT )
 
       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
       if ( disallowed.length > 0 )
@@ -72,75 +73,101 @@ module Evoruby
         STDOUT )
       end
 
-      domain_id           = cla.get_file_name( 0 )
-      hmmsearch_output    = cla.get_file_name( 1 )
-      fasta_sequence_file = cla.get_file_name( 2 )
-      outfile             = cla.get_file_name( 3 )
-
-      if outfile.downcase.end_with?( ".fasta" )
-        outfile = outfile[ 0 .. outfile.length - 7 ]
-      elsif outfile.downcase.end_with?( ".fsa" )
-        outfile = outfile[ 0 .. outfile.length - 5 ]
-      end
-
-      add_position = false
-      if ( cla.is_option_set?( ADD_POSITION_OPTION ) )
-        add_position = true
-      end
-
-      add_domain_number           = false
-      if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) )
-        add_domain_number = true
-      end
-
-      add_species = false
-      if cla.is_option_set? ADD_SPECIES
-        add_species = true
-      end
-
-      e_value_threshold = -1.0
+      e_value_threshold = 0.1
       if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
         begin
           e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
         rescue ArgumentError => e
-          Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
         end
         if ( e_value_threshold < 0.0 )
-          Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
+          Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
         end
       end
 
-      length_threshold = -1
+      length_threshold = 50
       if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) )
         begin
           length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION )
         rescue ArgumentError => e
-          Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
         end
         if ( length_threshold < 0)
-          Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT )
+          Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT )
         end
       end
 
-      min_linker = nil
-      if ( cla.is_option_set?( MIN_LINKER_OPT ) )
-        begin
-          min_linker = cla.get_option_value_as_int( MIN_LINKER_OPT )
-        rescue ArgumentError => e
-          Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+      domain_id           = cla.get_file_name( 0 )
+      hmmscan_output      = cla.get_file_name( 1 )
+      fasta_sequence_file = ""
+      outfile             = ""
+
+      if (cla.get_number_of_files == 4)
+        fasta_sequence_file = cla.get_file_name( 2 )
+        outfile             = cla.get_file_name( 3 )
+      elsif (cla.get_number_of_files == 2 || cla.get_number_of_files == 3)
+        if (cla.get_number_of_files == 3)
+          fasta_sequence_file = cla.get_file_name( 2 )
+        else
+          hmmscan_index = hmmscan_output.index(Constants::HMMSCAN)
+          if ( hmmscan_index != nil )
+            prefix = hmmscan_output[0 .. hmmscan_index-1 ]
+            suffix = Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX
+            files = Dir.entries( "." )
+            matching_files = Util.get_matching_files( files, prefix, suffix)
+            if matching_files.length < 1
+              Util.fatal_error( PRG_NAME, 'no file matching [' + prefix +
+              '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
+            end
+            if matching_files.length > 1
+              Util.fatal_error( PRG_NAME, 'more than one file matching [' +
+              prefix  + '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
+            end
+            fasta_sequence_file = matching_files[ 0 ]
+          else
+            Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' )
+          end
         end
-        if ( !min_linker || min_linker > 100 || min_linker < -100 )
-          Forester::Util.fatal_error( PRG_NAME, "unexpected value for min linker " + min_linker.to_s, STDOUT )
+        hmmscan_index = hmmscan_output.index(Constants::HMMSCAN)
+        if ( hmmscan_index != nil )
+          outfile = hmmscan_output.sub(Constants::HMMSCAN, "_") + "_" + domain_id
+          e = e_value_threshold >= 1 ? e_value_threshold.to_i : e_value_threshold.to_s.sub!('.','_')
+          outfile += "_E" + e.to_s
+          outfile += "_L" + length_threshold.to_i.to_s
+        else
+          Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' )
         end
       end
 
+      if outfile.downcase.end_with?( ".fasta" )
+        outfile = outfile[ 0 .. outfile.length - 7 ]
+      elsif outfile.downcase.end_with?( ".fsa" )
+        outfile = outfile[ 0 .. outfile.length - 5 ]
+      end
+
+      add_position = false
+      if ( cla.is_option_set?( ADD_POSITION_OPTION ) )
+        add_position = true
+      end
+
+      add_domain_number = false
+      if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) )
+        add_domain_number = true
+      end
+
+      add_species = false
+      if cla.is_option_set?( ADD_SPECIES )
+        add_species = true
+      end
+
       log = String.new
+      ld = Constants::LINE_DELIMITER
 
       puts()
       puts( "Domain                                 : " + domain_id )
       log << "Domain                                 : " + domain_id + ld
-      puts( "Hmmscan outputfile                     : " + hmmsearch_output )
-      log << "Hmmscan outputfile                     : " + hmmsearch_output + ld
+      puts( "Hmmscan outputfile                     : " + hmmscan_output )
+      log << "Hmmscan outputfile                     : " + hmmscan_output + ld
       puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file )
       log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld
       puts( "Outputfile                             : " + outfile + ".fasta" )
@@ -165,11 +192,6 @@ module Evoruby
         puts( "Length threshold  : no threshold" )
         log << "Length threshold  : no threshold" + ld
       end
-      if ( min_linker )
-        puts( "Min linker        : " + min_linker.to_s )
-        log << "Min linker        :  " + min_linker.to_s +  ld
-
-      end
 
       if ( add_position )
         puts( "Add positions (rel to complete seq) to extracted domains: true" )
@@ -193,7 +215,7 @@ module Evoruby
       begin
         parser = HmmscanDomainExtractor.new()
         domain_count = parser.parse( domain_id,
-        hmmsearch_output,
+        hmmscan_output,
         fasta_sequence_file,
         outfile,
         outfile + PASSED_SEQS_SUFFIX,
@@ -203,7 +225,7 @@ module Evoruby
         add_position,
         add_domain_number,
         add_species,
-        min_linker,
+        false,
         log )
       rescue ArgumentError, IOError => e
         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
@@ -239,14 +261,21 @@ module Evoruby
       puts()
       puts( "Usage:" )
       puts()
-      puts( "  " + PRG_NAME + ".rb [options] <domain> <hmmscan outputfile> <file containing complete sequences in fasta format> <outputfile>" )
+      puts( "  " + PRG_NAME + ".rb [options] <domain> <hmmscan outputfile> [file containing complete sequences in fasta format] [outputfile]" )
+      puts()
+      puts( "  options: -" + E_VALUE_THRESHOLD_OPTION  + "=<f> : iE-value threshold, default is 0.1" )
+      puts( "           -" + LENGTH_THRESHOLD_OPTION   + "=<i> : length threshold, default is 50" )
+      puts( "           -" + ADD_DOMAIN_NUMBER_OPTION  + "     : to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" )
+      puts( "           -" + ADD_POSITION_OPTION  + "     : to add positions (rel to complete seq) to extracted domains" )
+      puts( "           -" + ADD_SPECIES  + "     : to add species [in brackets]" )
       puts()
-      puts( "  options: -" + E_VALUE_THRESHOLD_OPTION  + "=<f>: iE-value threshold, default is no threshold" )
-      puts( "           -" + LENGTH_THRESHOLD_OPTION   + "=<i>: length threshold, default is no threshold" )
-      puts( "           -" + ADD_POSITION_OPTION  + ": to add positions (rel to complete seq) to extracted domains" )
-      puts( "           -" + ADD_DOMAIN_NUMBER_OPTION  + ": to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" )
-      puts( "           -" + ADD_SPECIES  + ": to add species [in brackets]" )
-      puts( "           -" + MIN_LINKER_OPT  + "=<i>: to extract pairs of same domains with a distance inbetween shorter than a given value" )
+      puts( "Examples:" )
+      puts
+      puts( "  " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 P53_ni.fasta P53_#{Constants::PFAM_V_FOR_EX}_10_Pkinase_E1_0e-06_L50" )
+      puts
+      puts( "  " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 P53_ni.fasta" )
+      puts
+      puts( "  " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10" )
       puts()
     end
 
index b928468..48459ae 100644 (file)
@@ -183,7 +183,7 @@ module Evoruby
         if ( cla.get_number_of_files == 2 )
           original_sequences_file = cla.get_file_name( 1 )
         else
-          hmmscan_index = domains_list_file.index("hmmscan")
+          hmmscan_index = domains_list_file.index(Constants::HMMSCAN)
           if ( hmmscan_index != nil )
             prefix = domains_list_file[0 .. hmmscan_index-1 ]
             suffix = Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX
@@ -198,14 +198,18 @@ module Evoruby
               prefix  + '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
             end
             original_sequences_file = matching_files[ 0 ]
+          else
+            Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' )
           end
+
         end
         outfile = domains_list_file
         if (outfile.end_with?(Constants::DOMAIN_TABLE_SUFFIX) )
           outfile = outfile.chomp(Constants::DOMAIN_TABLE_SUFFIX)
         end
         if ( e_value_threshold >= 0.0 )
-          outfile = outfile + Constants::DOMAINS_TO_FORESTER_EVALUE_CUTOFF_SUFFIX + e_value_threshold.to_s
+          e = e_value_threshold >= 1 ? e_value_threshold.to_i : e_value_threshold.to_s.sub!('.','_')
+          outfile = outfile + Constants::DOMAINS_TO_FORESTER_EVALUE_CUTOFF_SUFFIX + e.to_s
         end
         outfile = outfile + Constants::DOMAINS_TO_FORESTER_OUTFILE_SUFFIX
       end
@@ -248,7 +252,7 @@ module Evoruby
 
       puts
       Util.print_message( PRG_NAME, "wrote: " + outfile )
-      Util.print_message( PRG_NAME, "next steps in standard analysis pipeline: hmmsearch followed by dsx.rb")
+      Util.print_message( PRG_NAME, "next step in standard analysis pipeline: dsx.rb")
       Util.print_message( PRG_NAME, 'OK' )
       puts
 
@@ -262,16 +266,18 @@ module Evoruby
       puts
       puts( "  " + PRG_NAME + ".rb [options] <domain table (parsed hmmpfam output)> [file containing complete sequences in fasta format] [outputfile]" )
       puts()
-      puts( "  options: -" + E_VALUE_THRESHOLD_OPTION  + "=<f> : E-value threshold, default is no threshold" )
-      puts( "               -" + OVERWRITE_IF_SAME_FROM_TO_OPTION  + " : overwrite domain with same start and end with domain with better E-value" )
+      puts( "  options: -" + E_VALUE_THRESHOLD_OPTION  + "=<f>: E-value threshold, default is no threshold" )
+      puts( "           -" + OVERWRITE_IF_SAME_FROM_TO_OPTION  + "    : overwrite domain with same start and end with domain with better E-value" )
       puts
+      puts( "  [next step in standard analysis pipeline: dsx.rb]")
+      puts()
       puts( "Examples:" )
       puts
-      puts( "  " + PRG_NAME + ".rb P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table P53_ni.fasta P53_hmmscan_300_10.dff" )
+      puts( "  " + PRG_NAME + ".rb -o P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table P53_ni.fasta P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10.dff" )
       puts
-      puts( "  " + PRG_NAME + ".rb P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table P53_ni.fasta" )
+      puts( "  " + PRG_NAME + ".rb -o P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table P53_ni.fasta" )
       puts
-      puts( "  " + PRG_NAME + ".rb P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table" )
+      puts( "  " + PRG_NAME + ".rb -o P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table" )
       puts()
     end
 
index b2d0d5c..d9ad2a8 100644 (file)
 #
 # = lib/evo/apps/fasta_extractor.rb - FastaExtractor class
 #
-# Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
-#
-# $Id: fasta_extractor.rb,v 1.2 2010/12/13 19:00:11 cmzmasek Exp $
-
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
 
-require 'lib/evo/util/util'
 require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
 require 'lib/evo/util/command_line_arguments'
 
-
 module Evoruby
-
-    class FastaExtractor
-
-        PRG_NAME                           = "fae"
-        PRG_VERSION                        = "1.0.0"
-        PRG_DESC                           = "extraction of nucleotide sequences from a fasta file by names from wublast search"
-        PRG_DATE                           = "2008.08.09"
-        COPYRIGHT                          = "2008-2009 Christian M Zmasek"
-        CONTACT                            = "phylosoft@gmail.com"
-        WWW                                = "www.phylosoft.org"
-        HELP_OPTION_1                      = 'help'
-        HELP_OPTION_2                      = 'h'
-
-
-        def run()
-
-            Util.print_program_information( PRG_NAME,
-                PRG_VERSION,
-                PRG_DESC ,
-                PRG_DATE,
-                COPYRIGHT,
-                CONTACT,
-                WWW,
-                STDOUT )
-
-            ld = Constants::LINE_DELIMITER
-
-            begin
-                cla = CommandLineArguments.new( ARGV )
-            rescue ArgumentError => e
-                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
-            end
-
-            if ( cla.is_option_set?( HELP_OPTION_1 ) ||
-                     cla.is_option_set?( HELP_OPTION_2 ) )
-                print_help
-                exit( 0 )
-            end
-
-            if ( cla.get_number_of_files != 3 )
-                print_help
-                exit( -1 )
-            end
-
-            allowed_opts = Array.new
-
-            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
-            if ( disallowed.length > 0 )
-                Util.fatal_error( PRG_NAME,
-                    "unknown option(s): " + disallowed,
-                    STDOUT )
+  class FastaExtractor
+
+    PRG_NAME       = "fastx"
+    PRG_VERSION    = "1.000"
+    PRG_DESC       = "extraction of molecular sequences from a fasta file"
+    PRG_DATE       = "170215"
+    WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
+
+    HELP_OPTION_1                      = 'help'
+    HELP_OPTION_2                      = 'h'
+    def run()
+
+      Util.print_program_information( PRG_NAME,
+      PRG_VERSION,
+      PRG_DESC ,
+      PRG_DATE,
+      WWW,
+      STDOUT )
+
+      if ( ARGV == nil || ( ARGV.length < 1 )  )
+        print_help
+        exit( -1 )
+      end
+
+      begin
+        cla = CommandLineArguments.new( ARGV )
+      rescue ArgumentError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+      if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+      cla.is_option_set?( HELP_OPTION_2 ) )
+        print_help
+        exit( 0 )
+      end
+
+      if ( cla.get_number_of_files != 3 )
+        print_help
+        exit( -1 )
+      end
+
+      allowed_opts = Array.new
+
+      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+      if ( disallowed.length > 0 )
+        Util.fatal_error( PRG_NAME,
+        "unknown option(s): " + disallowed,
+        STDOUT )
+      end
+
+      input_file  = cla.get_file_name( 0 )
+      query       = cla.get_file_name( 1 )
+      output_file = cla.get_file_name( 2 )
+
+      if  !File.exist?( input_file )
+        Util.fatal_error( PRG_NAME, "error: input file [#{input_file}] does not exist" )
+      end
+      if File.exist?( output_file   )
+        Util.fatal_error( PRG_NAME, "error: [#{output_file}] already exists" )
+      end
+
+      results = extract_sequences( query, input_file, output_file )
+
+      Util.print_message( PRG_NAME, "matched: " + results )
+      Util.print_message( PRG_NAME, "wrote:   " + output_file )
+      Util.print_message( PRG_NAME, "OK" )
+
+    end
+
+    def extract_sequences( query, fasta_file, output_file )
+      output = File.open( output_file, "a" )
+      matching_state = false
+      matches = 0
+      total = 0
+      File.open( fasta_file ) do | file |
+        while line = file.gets
+          if !Util.is_string_empty?( line )
+            if line =~ /^\s*>/
+              total += 1
+              if total % 10000 == 0
+                STDOUT.write "\r#{matches}/#{total}"
+                STDOUT.flush
+              end
+              if line =~ /#{query}/
+                matching_state = true
+                matches += 1
+                output.print( line )
+              else
+                matching_state = false
+              end
+            elsif matching_state
+              output.print( line )
             end
-
-            input_file  = cla.get_file_name( 0 )
-            names_file  = cla.get_file_name( 1 )
-            output_file = cla.get_file_name( 2 )
-
-            if  !File.exist?( input_file )
-                Util.fatal_error( PRG_NAME, "error: input file [#{input_file}] does not exist" )
-            end
-            if  !File.exist?( names_file )
-                Util.fatal_error( PRG_NAME, "error: names file [#{names_file}] does not exist" )
-            end
-            if File.exist?( output_file   )
-                Util.fatal_error( PRG_NAME, "error: [#{output_file }] already exists" )
-            end
-
-            names = extract_names_with_frames( names_file )
-
-            extract_sequences( names, input_file, output_file )
-
-            puts
-            Util.print_message( PRG_NAME, "OK" )
-            puts
-
+          end
         end
-
-
-        def extract_names_with_frames( names_file )
-            names = Hash.new()
-            File.open( names_file ) do | file |
-                while line = file.gets
-                    if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
-                        if ( line =~ /(\S+)\s+([+|-]\d)\s+\d+\s+(\S+)/ )
-                            name  = $1
-                            frame = $2
-                            e     = $3
-                            names[ name ] =  "[" + frame + "] [" + e + "]"
-                        end
-                    end
-                end
-            end
-            names
-        end
-
-        def extract_sequences( names, fasta_file, output_file )
-            output = File.open( output_file, "a" )
-            matching_state = false
-            counter = 0
-            File.open( fasta_file ) do | file |
-                while line = file.gets
-                    if !Util.is_string_empty?( line )
-                        if ( line =~ /\s*>\s*(.+)/ )
-                            name = $1
-                            if names.has_key?( name )
-                                matching_state = true
-                                counter += 1
-                                puts counter.to_s + ". " +name + " " + names[ name ]
-                                output.print( ">" + name + " " + names[ name ] )
-                                output.print( Evoruby::Constants::LINE_DELIMITER )
-                            else
-                                matching_state = false
-                            end
-                        elsif matching_state
-                            output.print( line )
-                        end
-                    end
-                end
-            end
-            output.close()
-        end
-
-        def print_help()
-            puts( "Usage:" )
-            puts()
-            puts( "  " + PRG_NAME + ".rb <input fasta file> <names file based on blast output> <output file>" )
-            puts()
-        end
-
-    end # class FastaExtractor
+      end
+      output.close()
+      matches.to_s + "/" + total.to_s
+    end
+
+    def print_help()
+      puts( "Usage:" )
+      puts()
+      puts( "  " + PRG_NAME + ".rb <input fasta file> <query> <output file>" )
+      puts()
+      puts( "Examples:" )
+      puts
+      puts( "  " + PRG_NAME + ".rb Pfam-A.fasta kinase kinases" )
+      puts()
+    end
+
+  end # class FastaExtractor
 end
\ No newline at end of file
index f702b55..203d110 100644 (file)
@@ -1,11 +1,8 @@
 #
 # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
 #
-# Copyright::  Copyright (C) 2012 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
-#
-# $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
-#
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
 
 require 'lib/evo/util/constants'
 require 'lib/evo/util/util'
index caff316..6e9afb6 100644 (file)
@@ -153,7 +153,7 @@ module Evoruby
       end
 
       puts()
-      puts( "hmmpfam outputfile  : " + inpath )
+      puts( "hmmscan outputfile  : " + inpath )
       puts( "outputfile          : " + outpath )
 
       if ( i_e_value_threshold >= 0.0 )
@@ -469,9 +469,11 @@ module Evoruby
       puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + "=<f>: E-value threshold for full protein sequences, only for protein architectures summary" )
       puts( "           -" + SPECIES_OPTION + "=<s> : species for protein architectures summary" )
       puts()
-      puts( "Example:" )
+      puts( "  [next step in standard analysis pipeline: d2f.rb]")
       puts()
-      puts( "  " + "hmmscan --nobias --domtblout P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 -E 10 Pfam-A.hmm P53_ni.fasta" )
+      puts( "Examples:" )
+      puts()
+      puts( "  " + "hmmscan --max --domtblout P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 -E 10 Pfam-A.hmm P53_ni.fasta" )
       puts()
       puts( "  " + PRG_NAME + ".rb P53_hmmscan_300_10" )
       puts()
index 6a4bb5f..1d8a25b 100644 (file)
@@ -19,7 +19,7 @@ module Evoruby
   class TaxonomyProcessor
 
     PRG_NAME       = "tap"
-    PRG_DATE       = "170213"
+    PRG_DATE       = "170214"
     PRG_DESC       = "Replacement of labels in multiple sequence files"
     PRG_VERSION    = "2.004"
     WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
@@ -208,6 +208,8 @@ module Evoruby
       puts( "  options: -" + EXTRACT_TAXONOMY_OPTION + "    : to extract taxonomy information from bracketed expressions" )
       puts( "           -" + ANNOTATION_OPTION + "=<s>: to add an annotation to all entries" )
       puts()
+      puts( "  [next steps in standard analysis pipeline: hmmscan followed by hsp.rb]")
+      puts()
       puts( "Example:" )
       puts()
       puts( "  " + PRG_NAME + ".rb P53.fasta" )
index b54801b..c9e15f9 100644 (file)
@@ -14,6 +14,7 @@ module Evoruby
     ID_NORMALIZED_FASTA_FILE_SUFFIX          = "_ni.fasta"
     ID_MAP_FILE_SUFFIX                       = ".nim"
     DOMAIN_TABLE_SUFFIX                      = "_domain_table"
+    HMMSCAN                                  = "_hmmscan_"
     DOMAINS_TO_FORESTER_OUTFILE_SUFFIX       = ".dff"
     DOMAINS_TO_FORESTER_EVALUE_CUTOFF_SUFFIX = "_dtfE"
     
index 8ae000d..69edd90 100755 (executable)
@@ -1,80 +1,68 @@
 #!/usr/local/bin/ruby -w
 #
-# = hmm_split 
-#
-# Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
-#
-# $Id: hmm_split.rb,v 1.5 2008/11/17 22:32:43 cmzmasek Exp $
-#
-# To split a Pfam HMM file into one file for each HMM.
+# = hmm_split
 #
-
+# Copyright::    Copyright (C) 2017 Christian M Zmasek
+# License::      GNU Lesser General Public License (LGPL)
 
 module ForesterScripts
-    
-    if RUBY_VERSION !~ /1.9/
-                      puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " )
-                      exit( -1 )
-                end     
-    
-    
-    if ( ARGV == nil || ARGV.length != 3 )
-        puts( "usage: hmm_split.rb <Pfam HMM file> <outfile suffix> <outdir>" )         
-        exit( -1 )
-    end    
-       
-       hmmfile = ARGV[ 0 ]
-       suffix  = ARGV[ 1 ]
-       outdir  = ARGV[ 2 ]
-     
-       if ( !File.exists?( outdir ) )
-           puts( "outdir [" + outdir + "] does not exist" )
-           exit( -1 ) 
-       end 
-       if ( !File.exists?( hmmfile ) ) 
-           puts( "Pfam HMM file [" + hmmfile + "] does not exist" )
-           exit( -1 ) 
-       end                
-       
-       data = String.new
-       name = String.new
-       line_count = 0
-       count = 0
-       
-       File.open( hmmfile ) do | file |
-           while line = file.gets
-               data = data + line
-               line_count += 1
-               if ( line =~ /NAME\s+(.+)/ )
-                   if name.length > 0
-                       puts( "Pfam HMM file [" + hmmfile + "] format error [line: " + line + "]" )
-                       exit( -1 )                        
-                   end    
-                   name = $1    
-               elsif ( line =~ /\/\// )
-                   if name.length < 1
-                       puts( "Pfam HMM file [" + hmmfile + "] format error [line: " + line + "]" )
-                       exit( -1 )                        
-                   end
-                                       
-                   outfile = outdir + '/' + name + suffix
-                   if ( File.exists?( outfile ) ) 
-                       puts( "file [" + outfile + "] already exists" )
-                       exit( -1 )  
-                   end                   
-                   open( outfile, 'w' ) do | out |
-                       out.write( data )
-                   end
-                   count += 1
-                   puts( count.to_s + ": " + name )
-                   data = String.new
-                   name = String.new                                      
-               end
-           end
-       end 
-         
-       puts()
-       puts( "wrote " + count.to_s + " individual HMM files to " + outdir )    
-    
+
+  if ( ARGV == nil || ARGV.length != 3 )
+    puts( "usage: hmm_split.rb <Pfam HMM file> <outfile suffix> <outdir>" )
+    exit( -1 )
+  end
+
+  hmmfile = ARGV[ 0 ]
+  suffix  = ARGV[ 1 ]
+  outdir  = ARGV[ 2 ]
+
+  if ( !File.exist?( outdir ) )
+    puts( "outdir [" + outdir + "] does not exist" )
+    exit( -1 )
+  end
+  if ( !File.exist?( hmmfile ) )
+    puts( "Pfam HMM file [" + hmmfile + "] does not exist" )
+    exit( -1 )
+  end
+
+  data = String.new
+  name = String.new
+  line_count = 0
+  count = 0
+
+  File.open( hmmfile ) do | file |
+    while line = file.gets
+      data = data + line
+      line_count += 1
+      if ( line =~ /NAME\s+(.+)/ )
+        if name.length > 0
+          puts( "Pfam HMM file [" + hmmfile + "] format error [line: " + line + "]" )
+          exit( -1 )
+        end
+        name = $1
+      elsif ( line =~ /\/\// )
+        if name.length < 1
+          puts( "Pfam HMM file [" + hmmfile + "] format error [line: " + line + "]" )
+          exit( -1 )
+        end
+
+        outfile = outdir + '/' + name + suffix
+        if ( File.exist?( outfile ) )
+          puts( "file [" + outfile + "] already exists" )
+          exit( -1 )
+        end
+        open( outfile, 'w' ) do | out |
+          out.write( data )
+        end
+        count += 1
+        puts( count.to_s + ": " + name )
+        data = String.new
+        name = String.new
+      end
+    end
+  end
+
+  puts()
+  puts( "wrote " + count.to_s + " individual HMM files to " + outdir )
+
 end    
\ No newline at end of file