in progress
authorcmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 26 Sep 2012 03:16:38 +0000 (03:16 +0000)
committercmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 26 Sep 2012 03:16:38 +0000 (03:16 +0000)
forester/ruby/evoruby/exe/dsx_old.rb [new file with mode: 0755]
forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor.rb
forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor_old.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb

diff --git a/forester/ruby/evoruby/exe/dsx_old.rb b/forester/ruby/evoruby/exe/dsx_old.rb
new file mode 100755 (executable)
index 0000000..1ff35f7
--- /dev/null
@@ -0,0 +1,20 @@
+#!/usr/local/bin/ruby -w
+#
+# = exe/dsx
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: dsx.rb,v 1.3 2008/08/28 17:09:06 cmzmasek Exp $
+#
+# last modified: 06/11/2007
+
+require 'lib/evo/apps/domain_sequence_extractor'
+
+module Evoruby
+
+    dsx = DomainSequenceExtractor.new()
+
+    dsx.run()
+
+end  # module Evoruby
index ac6199b..517042a 100644 (file)
 #
 # = lib/evo/apps/domain_sequence_extractor.rb - DomainSequenceExtractor class
 #
-# Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
+# Copyright::  Copyright (C) 2012 Christian M. Zmasek
 # License::    GNU Lesser General Public License (LGPL)
 #
-# $Id: domain_sequence_extractor.rb,v 1.19 2010/12/13 19:00:11 cmzmasek Exp $
+# $Id:Exp $
 
 
 require 'lib/evo/util/constants'
 require 'lib/evo/util/util'
 require 'lib/evo/util/command_line_arguments'
-require 'lib/evo/io/parser/hmmsearch_domain_extractor'
+require 'lib/evo/io/parser/hmmscan_domain_extractor'
 
 module Evoruby
 
-    class DomainSequenceExtractor
-
-        PRG_NAME       = "dsx"
-        PRG_VERSION    = "1.1.0"
-        PRG_DESC       = "extraction of domain sequences from hmmsearch output"
-        PRG_DATE       = "2008.01.03"
-        COPYRIGHT      = "2008-2009 Christian M Zmasek"
-        CONTACT        = "phylosoft@gmail.com"
-        WWW            = "www.phylosoft.org"
-
-        E_VALUE_THRESHOLD_OPTION           = 'e'
-        LENGTH_THRESHOLD_OPTION            = 'l'
-        ADD_POSITION_OPTION                = 'p'
-        ADD_DOMAIN_NUMBER_OPTION           = 'd'
-        ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT  = 'dd'
-        ADD_DOMAIN_NUMBER_OPTION_AS_LETTER = 'dl'
-        TRIM_OPTION                        = 't'
-        LOG_FILE_SUFFIX                    = '_domain_seq_extr.log'
-        PASSED_SEQS_SUFFIX                 = '_domain_seq_extr_passed'
-        FAILED_SEQS_SUFFIX                 = '_domain_seq_extr_failed'
-        HELP_OPTION_1                      = 'help'
-        HELP_OPTION_2                      = 'h'
-
-        def run()
-
-            Util.print_program_information( PRG_NAME,
-                PRG_VERSION,
-                PRG_DESC ,
-                PRG_DATE,
-                COPYRIGHT,
-                CONTACT,
-                WWW,
-                STDOUT )
-
-            ld = Constants::LINE_DELIMITER
-
-            begin
-                cla = CommandLineArguments.new( ARGV )
-            rescue ArgumentError
-                Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT )
-            end
-
-            if ( cla.is_option_set?( HELP_OPTION_1 ) ||
-                     cla.is_option_set?( HELP_OPTION_2 ) )
-                print_help
-                exit( 0 )
-            end
-
-            if ( cla.get_number_of_files != 3 )
-                print_help
-                exit( -1 )
-            end
-
-            allowed_opts = Array.new
-            allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
-            allowed_opts.push( ADD_POSITION_OPTION )
-            allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION )
-            allowed_opts.push( LENGTH_THRESHOLD_OPTION )
-            allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT )
-            allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER )
-            allowed_opts.push( TRIM_OPTION )
-
-            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
-            if ( disallowed.length > 0 )
-                Util.fatal_error( PRG_NAME,
-                    "unknown option(s): " + disallowed,
-                    STDOUT )
-            end
-
-            hmmsearch_output    = cla.get_file_name( 0 )
-            fasta_sequence_file = cla.get_file_name( 1 )
-            outfile             = cla.get_file_name( 2 )
-
-            add_position = false
-            if ( cla.is_option_set?( ADD_POSITION_OPTION ) )
-                add_position = true
-            end
-
-            trim = false
-            if ( cla.is_option_set?( TRIM_OPTION ) )
-                trim = true
-            end
-
-            add_domain_number           = false
-            add_domain_number_as_letter = false
-            add_domain_number_as_digit  = false
-
-            if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) )
-                add_domain_number = true
-            end
-            if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER ) )
-                add_domain_number_as_letter = true
-            end
-            if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT ) )
-                add_domain_number_as_digit = true
-            end
-
-            if ( add_domain_number_as_letter && add_domain_number_as_digit )
-                puts( "attempt to add domain number as letter and digit at the same time" )
-                print_help
-                exit( -1 )
-            end
-
-            e_value_threshold = -1.0
-            if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
-                begin
-                    e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
-                rescue ArgumentError => e
-                    Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
-                end
-                if ( e_value_threshold < 0.0 )
-                    Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
-                end
-            end
-
-            length_threshold = -1
-            if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) )
-                begin
-                    length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION )
-                rescue ArgumentError => e
-                    Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
-                end
-                if ( length_threshold < 0)
-                    Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT )
-                end
-            end
-
-            log = String.new
-
-            puts()
-            puts( "Hmmsearch outputfile                   : " + hmmsearch_output )
-            log << "Hmmsearch outputfile                   : " + hmmsearch_output + ld
-            puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file )
-            log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld
-            puts( "Outputfile                             : " + outfile )
-            log << "Outputfile                             : " + outfile + ld
-            puts( "Passed sequences outfile (fasta)       : " + outfile + PASSED_SEQS_SUFFIX )
-            log << "Passed sequences outfile (fasta)       : " + outfile + PASSED_SEQS_SUFFIX + ld
-            puts( "Failed sequences outfile (fasta)       : " + outfile + FAILED_SEQS_SUFFIX )
-            log << "Failed sequences outfile (fasta)       : " + outfile + FAILED_SEQS_SUFFIX + ld
-            puts( "Logfile                                : " + outfile + LOG_FILE_SUFFIX )
-            log <<  "Logfile                                : " + outfile + LOG_FILE_SUFFIX + ld
-            if ( e_value_threshold >= 0.0 )
-                puts( "E-value threshold : " + e_value_threshold.to_s )
-                log << "E-value threshold : " + e_value_threshold.to_s + ld
-            else
-                puts( "E-value threshold : no threshold" )
-                log << "E-value threshold : no threshold" + ld
-            end
-            if ( length_threshold > 0 )
-                puts( "Length threshold  : " + length_threshold.to_s )
-                log << "Length threshold  : " + length_threshold.to_s + ld
-            else
-                puts( "Length threshold  : no threshold" )
-                log << "Length threshold  : no threshold" + ld
-            end
-
-            if ( trim )
-                puts( "Trim last 2 chars : true" )
-                log << "Trim last 2 chars : true" + ld
-            else
-                puts( "Trim names        : false" )
-                log << "Trim names        : false" + ld
-            end
-
-
-            if ( add_position )
-                puts( "Add positions (rel to complete seq) to extracted domains: true" )
-                log << "Add positions (rel to complete seq) to extracted domains: true" + ld
-            else
-                puts( "Add positions (rel to complete seq) to extracted domains: false" )
-                log << "Add positions (rel to complete seq) to extracted domains: false" + ld
-            end
-
-            if ( add_domain_number || add_domain_number_as_digit || add_domain_number_as_letter )
-                puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" )
-                log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld
-            else
-                puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" )
-                log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld
-            end
-
-            puts
-
-            domain_count = 0
-            begin
-                parser = HmmsearchDomainExtractor.new()
-                domain_count = parser.parse( hmmsearch_output,
-                    fasta_sequence_file,
-                    outfile,
-                    outfile + PASSED_SEQS_SUFFIX,
-                    outfile + FAILED_SEQS_SUFFIX,
-                    e_value_threshold,
-                    length_threshold,
-                    add_position,
-                    add_domain_number,
-                    add_domain_number_as_digit,
-                    add_domain_number_as_letter,
-                    trim,
-                    log )
-            rescue ArgumentError, IOError, StandardError => e
-                Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
-            rescue Exception => e
-                Util.fatal_error( PRG_NAME, "unexpected exception!: " + e.to_s, STDOUT )
-            end
-
-            puts
-            Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" )
-            Util.print_message( PRG_NAME, "wrote;               " + outfile )
-            Util.print_message( PRG_NAME, "wrote:               " + outfile + LOG_FILE_SUFFIX )
-            Util.print_message( PRG_NAME, "(wrote:              " + outfile + PASSED_SEQS_SUFFIX + ")" )
-            Util.print_message( PRG_NAME, "(wrote:              " + outfile + FAILED_SEQS_SUFFIX + ")" )
-
-            begin
-                f = File.open( outfile + LOG_FILE_SUFFIX, 'a' )
-                f.print( log )
-                f.close
-            rescue Exception => e
-                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
-            end
-
-            puts
-            Util.print_message( PRG_NAME, "OK" )
-            puts
-
+  class DomainSequenceExtractor
+
+    PRG_NAME       = "dsx"
+    PRG_VERSION    = "1.000"
+    PRG_DESC       = "extraction of domain sequences from hmmscan output"
+    PRG_DATE       = "20120906"
+    COPYRIGHT      = "2012 Christian M Zmasek"
+    CONTACT        = "phylosoft@gmail.com"
+    WWW            = "www.phylosoft.org"
+
+    E_VALUE_THRESHOLD_OPTION           = 'e'
+    LENGTH_THRESHOLD_OPTION            = 'l'
+    ADD_POSITION_OPTION                = 'p'
+    ADD_DOMAIN_NUMBER_OPTION           = 'd'
+    ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT  = 'dd'
+    ADD_DOMAIN_NUMBER_OPTION_AS_LETTER = 'dl'
+    ADD_SPECIES                        = 's'
+    MIN_LINKER_OPT                        = 'ml'
+    TRIM_OPTION                        = 't'
+    LOG_FILE_SUFFIX                    = '_domain_seq_extr.log'
+    PASSED_SEQS_SUFFIX                 = '_domain_seq_extr_passed'
+    FAILED_SEQS_SUFFIX                 = '_domain_seq_extr_failed'
+    HELP_OPTION_1                      = 'help'
+    HELP_OPTION_2                      = 'h'
+
+    def run()
+
+      Util.print_program_information( PRG_NAME,
+        PRG_VERSION,
+        PRG_DESC ,
+        PRG_DATE,
+        COPYRIGHT,
+        CONTACT,
+        WWW,
+        STDOUT )
+
+      ld = Constants::LINE_DELIMITER
+
+      begin
+        cla = CommandLineArguments.new( ARGV )
+      rescue ArgumentError
+        Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT )
+      end
+
+      if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+           cla.is_option_set?( HELP_OPTION_2 ) )
+        print_help
+        exit( 0 )
+      end
+
+      if ( cla.get_number_of_files != 4 )
+        print_help
+        exit( -1 )
+      end
+
+      allowed_opts = Array.new
+      allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
+      allowed_opts.push( ADD_POSITION_OPTION )
+      allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION )
+      allowed_opts.push( LENGTH_THRESHOLD_OPTION )
+      allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT )
+      allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER )
+      allowed_opts.push( TRIM_OPTION )
+      allowed_opts.push( ADD_SPECIES )
+      allowed_opts.push( MIN_LINKER_OPT )
+
+      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+      if ( disallowed.length > 0 )
+        Util.fatal_error( PRG_NAME,
+          "unknown option(s): " + disallowed,
+          STDOUT )
+      end
+
+      domain_id           = cla.get_file_name( 0 )
+      hmmsearch_output    = cla.get_file_name( 1 )
+      fasta_sequence_file = cla.get_file_name( 2 )
+      outfile             = cla.get_file_name( 3 )
+
+      add_position = false
+      if ( cla.is_option_set?( ADD_POSITION_OPTION ) )
+        add_position = true
+      end
+
+      trim = false
+      if ( cla.is_option_set?( TRIM_OPTION ) )
+        trim = true
+      end
+
+      add_domain_number           = false
+      add_domain_number_as_letter = false
+      add_domain_number_as_digit  = false
+
+      if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) )
+        add_domain_number = true
+      end
+      if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER ) )
+        add_domain_number_as_letter = true
+      end
+      if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT ) )
+        add_domain_number_as_digit = true
+      end
+
+      add_species = false
+      if cla.is_option_set? ADD_SPECIES
+        add_species = true
+      end
+
+      if ( add_domain_number_as_letter && add_domain_number_as_digit )
+        puts( "attempt to add domain number as letter and digit at the same time" )
+        print_help
+        exit( -1 )
+      end
+
+      e_value_threshold = -1.0
+      if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
+        begin
+          e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
+        rescue ArgumentError => e
+          Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
         end
-
-        def print_help()
-            puts()
-            puts( "Usage:" )
-            puts()
-            puts( "  " + PRG_NAME + ".rb [options] <hmmsearch outputfile> <file containing complete sequences in fasta format> <outputfile>" )
-            puts()
-            puts( "  options: -" + E_VALUE_THRESHOLD_OPTION  + "=<f>: E-value threshold, default is no threshold" )
-            puts( "           -" + LENGTH_THRESHOLD_OPTION   + "=<i>: length threshold, default is no threshold" )
-            puts( "           -" + ADD_POSITION_OPTION  + ": to add positions (rel to complete seq) to extracted domains" )
-            puts( "           -" + ADD_DOMAIN_NUMBER_OPTION  + ": to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" )
-            puts( "           -" + ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT  + ": to add numbers to extracted domains as digit (example \"domain2\")" )
-            puts( "           -" + ADD_DOMAIN_NUMBER_OPTION_AS_LETTER  + ": to add numbers to extracted domains as letter (example \"domaina\")" )
-            puts( "           -" + TRIM_OPTION  + ": to remove the last 2 characters from sequence names" )
-            puts()
+        if ( e_value_threshold < 0.0 )
+          Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
         end
-
-    end # class DomainSequenceExtractor
+      end
+
+      length_threshold = -1
+      if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) )
+        begin
+          length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION )
+        rescue ArgumentError => e
+          Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+        if ( length_threshold < 0)
+          Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT )
+        end
+      end
+      
+      
+       min_linker = nil
+       if ( cla.is_option_set?( MIN_LINKER_OPT ) )
+        begin
+           min_linker = cla.get_option_value_as_int( MIN_LINKER_OPT )
+        rescue ArgumentError => e
+          Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+         if ( !min_linker || min_linker > 100 || min_linker < -100 )
+          Forester::Util.fatal_error( PRG_NAME, "unexpected value for min linker " + min_linker.to_s, STDOUT )
+        end
+      end
+      
+
+      log = String.new
+
+      puts()
+      puts( "Domain                                 : " + domain_id )
+      log << "Domain                                 : " + domain_id + ld
+      puts( "Hmmscan outputfile                     : " + hmmsearch_output )
+      log << "Hmmscan outputfile                     : " + hmmsearch_output + ld
+      puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file )
+      log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld
+      puts( "Outputfile                             : " + outfile )
+      log << "Outputfile                             : " + outfile + ld
+      puts( "Passed sequences outfile (fasta)       : " + outfile + PASSED_SEQS_SUFFIX )
+      log << "Passed sequences outfile (fasta)       : " + outfile + PASSED_SEQS_SUFFIX + ld
+      puts( "Failed sequences outfile (fasta)       : " + outfile + FAILED_SEQS_SUFFIX )
+      log << "Failed sequences outfile (fasta)       : " + outfile + FAILED_SEQS_SUFFIX + ld
+      puts( "Logfile                                : " + outfile + LOG_FILE_SUFFIX )
+      log <<  "Logfile                                : " + outfile + LOG_FILE_SUFFIX + ld
+      if ( e_value_threshold >= 0.0 )
+        puts( "E-value threshold : " + e_value_threshold.to_s )
+        log << "E-value threshold : " + e_value_threshold.to_s + ld
+      else
+        puts( "E-value threshold : no threshold" )
+        log << "E-value threshold : no threshold" + ld
+      end
+      if ( length_threshold > 0 )
+        puts( "Length threshold  : " + length_threshold.to_s )
+        log << "Length threshold  : " + length_threshold.to_s + ld
+      else
+        puts( "Length threshold  : no threshold" )
+        log << "Length threshold  : no threshold" + ld
+      end
+
+      if ( trim )
+        puts( "Trim last 2 chars : true" )
+        log << "Trim last 2 chars : true" + ld
+      else
+        puts( "Trim names        : false" )
+        log << "Trim names        : false" + ld
+      end
+      
+      if ( min_linker )
+        puts( "Min linker        : " + min_linker.to_s )
+        log << "Min linker        :  " + min_linker.to_s +  ld
+      
+      end
+
+
+      if ( add_position )
+        puts( "Add positions (rel to complete seq) to extracted domains: true" )
+        log << "Add positions (rel to complete seq) to extracted domains: true" + ld
+      else
+        puts( "Add positions (rel to complete seq) to extracted domains: false" )
+        log << "Add positions (rel to complete seq) to extracted domains: false" + ld
+      end
+
+      if ( add_domain_number || add_domain_number_as_digit || add_domain_number_as_letter )
+        puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" )
+        log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld
+      else
+        puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" )
+        log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld
+      end
+
+      puts
+
+      domain_count = 0
+      begin
+        parser = HmmscanDomainExtractor.new()
+        domain_count = parser.parse( domain_id,
+          hmmsearch_output,
+          fasta_sequence_file,
+          outfile,
+          outfile + PASSED_SEQS_SUFFIX,
+          outfile + FAILED_SEQS_SUFFIX,
+          e_value_threshold,
+          length_threshold,
+          add_position,
+          add_domain_number,
+          add_domain_number_as_digit,
+          add_domain_number_as_letter,
+          trim,
+          add_species,
+          min_linker,
+          log )
+      rescue ArgumentError, IOError, StandardError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "unexpected exception!: " + e.to_s, STDOUT )
+      end
+
+      puts
+      Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" )
+      Util.print_message( PRG_NAME, "wrote;               " + outfile )
+      Util.print_message( PRG_NAME, "wrote:               " + outfile + LOG_FILE_SUFFIX )
+      Util.print_message( PRG_NAME, "(wrote:              " + outfile + PASSED_SEQS_SUFFIX + ")" )
+      Util.print_message( PRG_NAME, "(wrote:              " + outfile + FAILED_SEQS_SUFFIX + ")" )
+
+      begin
+        f = File.open( outfile + LOG_FILE_SUFFIX, 'a' )
+        f.print( log )
+        f.close
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+      puts
+      Util.print_message( PRG_NAME, "OK" )
+      puts
+
+    end
+
+    def print_help()
+      puts()
+      puts( "Usage:" )
+      puts()
+      puts( "  " + PRG_NAME + ".rb [options] <domain> <hmmscan outputfile> <file containing complete sequences in fasta format> <outputfile>" )
+      puts()
+      puts( "  options: -" + E_VALUE_THRESHOLD_OPTION  + "=<f>: E-value threshold, default is no threshold" )
+      puts( "           -" + LENGTH_THRESHOLD_OPTION   + "=<i>: length threshold, default is no threshold" )
+      puts( "           -" + ADD_POSITION_OPTION  + ": to add positions (rel to complete seq) to extracted domains" )
+      puts( "           -" + ADD_DOMAIN_NUMBER_OPTION  + ": to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" )
+      puts( "           -" + ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT  + ": to add numbers to extracted domains as digit (example \"domain2\")" )
+      puts( "           -" + ADD_DOMAIN_NUMBER_OPTION_AS_LETTER  + ": to add numbers to extracted domains as letter (example \"domaina\")" )
+      puts( "           -" + TRIM_OPTION  + ": to remove the last 2 characters from sequence names" )
+      puts( "           -" + ADD_SPECIES  + ": to add species [in brackets]" )
+      puts( "           -" + MIN_LINKER_OPT  + "=<i>: to extract pairs of same domains with a distance inbetween shorter than a given value" )
+      puts()
+    end
+
+  end # class DomainSequenceExtractor
 
 end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor_old.rb b/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor_old.rb
new file mode 100644 (file)
index 0000000..ac6199b
--- /dev/null
@@ -0,0 +1,262 @@
+#
+# = lib/evo/apps/domain_sequence_extractor.rb - DomainSequenceExtractor class
+#
+# Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: domain_sequence_extractor.rb,v 1.19 2010/12/13 19:00:11 cmzmasek Exp $
+
+
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/util/command_line_arguments'
+require 'lib/evo/io/parser/hmmsearch_domain_extractor'
+
+module Evoruby
+
+    class DomainSequenceExtractor
+
+        PRG_NAME       = "dsx"
+        PRG_VERSION    = "1.1.0"
+        PRG_DESC       = "extraction of domain sequences from hmmsearch output"
+        PRG_DATE       = "2008.01.03"
+        COPYRIGHT      = "2008-2009 Christian M Zmasek"
+        CONTACT        = "phylosoft@gmail.com"
+        WWW            = "www.phylosoft.org"
+
+        E_VALUE_THRESHOLD_OPTION           = 'e'
+        LENGTH_THRESHOLD_OPTION            = 'l'
+        ADD_POSITION_OPTION                = 'p'
+        ADD_DOMAIN_NUMBER_OPTION           = 'd'
+        ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT  = 'dd'
+        ADD_DOMAIN_NUMBER_OPTION_AS_LETTER = 'dl'
+        TRIM_OPTION                        = 't'
+        LOG_FILE_SUFFIX                    = '_domain_seq_extr.log'
+        PASSED_SEQS_SUFFIX                 = '_domain_seq_extr_passed'
+        FAILED_SEQS_SUFFIX                 = '_domain_seq_extr_failed'
+        HELP_OPTION_1                      = 'help'
+        HELP_OPTION_2                      = 'h'
+
+        def run()
+
+            Util.print_program_information( PRG_NAME,
+                PRG_VERSION,
+                PRG_DESC ,
+                PRG_DATE,
+                COPYRIGHT,
+                CONTACT,
+                WWW,
+                STDOUT )
+
+            ld = Constants::LINE_DELIMITER
+
+            begin
+                cla = CommandLineArguments.new( ARGV )
+            rescue ArgumentError
+                Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT )
+            end
+
+            if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+                     cla.is_option_set?( HELP_OPTION_2 ) )
+                print_help
+                exit( 0 )
+            end
+
+            if ( cla.get_number_of_files != 3 )
+                print_help
+                exit( -1 )
+            end
+
+            allowed_opts = Array.new
+            allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
+            allowed_opts.push( ADD_POSITION_OPTION )
+            allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION )
+            allowed_opts.push( LENGTH_THRESHOLD_OPTION )
+            allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT )
+            allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER )
+            allowed_opts.push( TRIM_OPTION )
+
+            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+            if ( disallowed.length > 0 )
+                Util.fatal_error( PRG_NAME,
+                    "unknown option(s): " + disallowed,
+                    STDOUT )
+            end
+
+            hmmsearch_output    = cla.get_file_name( 0 )
+            fasta_sequence_file = cla.get_file_name( 1 )
+            outfile             = cla.get_file_name( 2 )
+
+            add_position = false
+            if ( cla.is_option_set?( ADD_POSITION_OPTION ) )
+                add_position = true
+            end
+
+            trim = false
+            if ( cla.is_option_set?( TRIM_OPTION ) )
+                trim = true
+            end
+
+            add_domain_number           = false
+            add_domain_number_as_letter = false
+            add_domain_number_as_digit  = false
+
+            if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) )
+                add_domain_number = true
+            end
+            if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER ) )
+                add_domain_number_as_letter = true
+            end
+            if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT ) )
+                add_domain_number_as_digit = true
+            end
+
+            if ( add_domain_number_as_letter && add_domain_number_as_digit )
+                puts( "attempt to add domain number as letter and digit at the same time" )
+                print_help
+                exit( -1 )
+            end
+
+            e_value_threshold = -1.0
+            if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
+                begin
+                    e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
+                rescue ArgumentError => e
+                    Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+                end
+                if ( e_value_threshold < 0.0 )
+                    Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
+                end
+            end
+
+            length_threshold = -1
+            if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) )
+                begin
+                    length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION )
+                rescue ArgumentError => e
+                    Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+                end
+                if ( length_threshold < 0)
+                    Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT )
+                end
+            end
+
+            log = String.new
+
+            puts()
+            puts( "Hmmsearch outputfile                   : " + hmmsearch_output )
+            log << "Hmmsearch outputfile                   : " + hmmsearch_output + ld
+            puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file )
+            log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld
+            puts( "Outputfile                             : " + outfile )
+            log << "Outputfile                             : " + outfile + ld
+            puts( "Passed sequences outfile (fasta)       : " + outfile + PASSED_SEQS_SUFFIX )
+            log << "Passed sequences outfile (fasta)       : " + outfile + PASSED_SEQS_SUFFIX + ld
+            puts( "Failed sequences outfile (fasta)       : " + outfile + FAILED_SEQS_SUFFIX )
+            log << "Failed sequences outfile (fasta)       : " + outfile + FAILED_SEQS_SUFFIX + ld
+            puts( "Logfile                                : " + outfile + LOG_FILE_SUFFIX )
+            log <<  "Logfile                                : " + outfile + LOG_FILE_SUFFIX + ld
+            if ( e_value_threshold >= 0.0 )
+                puts( "E-value threshold : " + e_value_threshold.to_s )
+                log << "E-value threshold : " + e_value_threshold.to_s + ld
+            else
+                puts( "E-value threshold : no threshold" )
+                log << "E-value threshold : no threshold" + ld
+            end
+            if ( length_threshold > 0 )
+                puts( "Length threshold  : " + length_threshold.to_s )
+                log << "Length threshold  : " + length_threshold.to_s + ld
+            else
+                puts( "Length threshold  : no threshold" )
+                log << "Length threshold  : no threshold" + ld
+            end
+
+            if ( trim )
+                puts( "Trim last 2 chars : true" )
+                log << "Trim last 2 chars : true" + ld
+            else
+                puts( "Trim names        : false" )
+                log << "Trim names        : false" + ld
+            end
+
+
+            if ( add_position )
+                puts( "Add positions (rel to complete seq) to extracted domains: true" )
+                log << "Add positions (rel to complete seq) to extracted domains: true" + ld
+            else
+                puts( "Add positions (rel to complete seq) to extracted domains: false" )
+                log << "Add positions (rel to complete seq) to extracted domains: false" + ld
+            end
+
+            if ( add_domain_number || add_domain_number_as_digit || add_domain_number_as_letter )
+                puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" )
+                log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld
+            else
+                puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" )
+                log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld
+            end
+
+            puts
+
+            domain_count = 0
+            begin
+                parser = HmmsearchDomainExtractor.new()
+                domain_count = parser.parse( hmmsearch_output,
+                    fasta_sequence_file,
+                    outfile,
+                    outfile + PASSED_SEQS_SUFFIX,
+                    outfile + FAILED_SEQS_SUFFIX,
+                    e_value_threshold,
+                    length_threshold,
+                    add_position,
+                    add_domain_number,
+                    add_domain_number_as_digit,
+                    add_domain_number_as_letter,
+                    trim,
+                    log )
+            rescue ArgumentError, IOError, StandardError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+            rescue Exception => e
+                Util.fatal_error( PRG_NAME, "unexpected exception!: " + e.to_s, STDOUT )
+            end
+
+            puts
+            Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" )
+            Util.print_message( PRG_NAME, "wrote;               " + outfile )
+            Util.print_message( PRG_NAME, "wrote:               " + outfile + LOG_FILE_SUFFIX )
+            Util.print_message( PRG_NAME, "(wrote:              " + outfile + PASSED_SEQS_SUFFIX + ")" )
+            Util.print_message( PRG_NAME, "(wrote:              " + outfile + FAILED_SEQS_SUFFIX + ")" )
+
+            begin
+                f = File.open( outfile + LOG_FILE_SUFFIX, 'a' )
+                f.print( log )
+                f.close
+            rescue Exception => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+
+            puts
+            Util.print_message( PRG_NAME, "OK" )
+            puts
+
+        end
+
+        def print_help()
+            puts()
+            puts( "Usage:" )
+            puts()
+            puts( "  " + PRG_NAME + ".rb [options] <hmmsearch outputfile> <file containing complete sequences in fasta format> <outputfile>" )
+            puts()
+            puts( "  options: -" + E_VALUE_THRESHOLD_OPTION  + "=<f>: E-value threshold, default is no threshold" )
+            puts( "           -" + LENGTH_THRESHOLD_OPTION   + "=<i>: length threshold, default is no threshold" )
+            puts( "           -" + ADD_POSITION_OPTION  + ": to add positions (rel to complete seq) to extracted domains" )
+            puts( "           -" + ADD_DOMAIN_NUMBER_OPTION  + ": to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" )
+            puts( "           -" + ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT  + ": to add numbers to extracted domains as digit (example \"domain2\")" )
+            puts( "           -" + ADD_DOMAIN_NUMBER_OPTION_AS_LETTER  + ": to add numbers to extracted domains as letter (example \"domaina\")" )
+            puts( "           -" + TRIM_OPTION  + ": to remove the last 2 characters from sequence names" )
+            puts()
+        end
+
+    end # class DomainSequenceExtractor
+
+end # module Evoruby
index 417820c..bf7cb71 100644 (file)
@@ -38,6 +38,7 @@ module Evoruby
         add_domain_number_as_letter,
         trim_name,
         add_species,
+        min_linker,
         log )
 
       Util.check_file_for_readability( hmmsearch_output )
@@ -56,8 +57,13 @@ module Evoruby
       end
 
       out_msa = Msa.new
+
       failed_seqs = Msa.new
       passed_seqs = Msa.new
+      out_msa_pairs = nil
+      if min_linker
+        out_msa_pairs = Msa.new
+      end
 
       ld = Constants::LINE_DELIMITER
 
@@ -70,6 +76,12 @@ module Evoruby
       failed_species_counts         = Hash.new
       passed_species_counts         = Hash.new
 
+      prev_sequence = nil
+      prev_number   = nil
+      prev_env_from = nil
+      prev_env_to   = nil
+      prev_i_e_value  = nil
+
       File.open( hmmsearch_output ) do | file |
         while line = file.gets
           if !is_ignorable?( line ) && line =~ /^\S+\s+/
@@ -115,6 +127,34 @@ module Evoruby
                 add_sequence( sequence, in_msa, passed_seqs )
                 proteins_with_passing_domains += 1
               end
+
+              if min_linker
+                if  sequence == prev_sequence  && ( ( ( e_value_threshold.to_f < 0.0 ) || ( prev_i_e_value <= e_value_threshold  ) ) &&
+                     ( ( length_threshold.to_f <= 0 )   || (  ( prev_env_to - prev_env_from + 1 ) >= length_threshold.to_f    ) )  )
+                  diff = env_from - prev_env_to
+                  if diff <= min_linker
+                    extract_domain( sequence,
+                      prev_number.to_s + "/" + number.to_s,
+                      out_of,
+                      prev_env_from,
+                      env_to,
+                      in_msa,
+                      out_msa_pairs,
+                      false,
+                      true,
+                      false,
+                      false,
+                      trim_name ,
+                      add_species )
+                  end
+                end
+                prev_sequence = sequence
+                prev_number   = number
+                prev_env_from = env_from
+                prev_env_to   = env_to
+                prev_i_e_value  = i_e_value
+              end
+
             else
               print( domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" )
               log << domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)"
@@ -168,6 +208,15 @@ module Evoruby
         raise IOError, error_msg
       end
 
+      if out_msa_pairs
+        begin
+          io.write_to_file( out_msa_pairs, outfile+"_" + min_linker.to_s, w )
+        rescue Exception
+          error_msg = "could not write to \"" + outfile+"_" + min_linker.to_s + "\""
+          raise IOError, error_msg
+        end
+      end
+
       begin
         io.write_to_file( passed_seqs, passed_seqs_outfile, w )
       rescue Exception
@@ -200,7 +249,6 @@ module Evoruby
 
     private
 
-
     def add_sequence( sequence_name, in_msa, add_to_msa )
       seqs = in_msa.find_by_name_start( sequence_name, true )
       if ( seqs.length < 1 )
@@ -229,11 +277,11 @@ module Evoruby
         add_domain_number_as_letter,
         trim_name,
         add_species )
-      if ( number < 1 || out_of < 1 || number > out_of )
+      if  number.is_a?( Fixnum ) && ( number < 1 || out_of < 1 || number > out_of )
         error_msg = "impossible: number=" + number.to_s + ", out of=" + out_of.to_s
         raise ArgumentError, error_msg
       end
-      if ( seq_from < 1 || seq_to < 1 || seq_from >= seq_to )
+      if  seq_from < 1 || seq_to < 1 || seq_from >= seq_to
         error_msg = "impossible: seq-f=" + seq_from.to_s + ", seq-t=" + seq_to.to_s
         raise ArgumentError, error_msg
       end
@@ -262,15 +310,15 @@ module Evoruby
       end
 
       if out_of != 1
-        if ( add_domain_number_as_digit )
+        if add_domain_number_as_digit
           seq.set_name( seq.get_name + number.to_s )
-        elsif ( add_domain_number_as_letter )
+        elsif add_domain_number_as_letter
           if number > 25
             error_msg = 'too many identical domains per sequence, cannot use letters to distinguish them'
             raise StandardError, error_msg
           end
           seq.set_name( seq.get_name + ( number + 96 ).chr )
-        elsif ( add_domain_number )
+        elsif add_domain_number
           seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
         end
       end
@@ -292,7 +340,7 @@ module Evoruby
       end
 
       out_msa.add_sequence( seq )
-      
+
     end
 
     def count_species( sequence, species_counts_map )