in progress
authorcmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Fri, 7 Sep 2012 17:24:58 +0000 (17:24 +0000)
committercmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Fri, 7 Sep 2012 17:24:58 +0000 (17:24 +0000)
forester/ruby/evoruby/exe/dsx2.rb [new file with mode: 0755]
forester/ruby/evoruby/exe/select_same_gn.rb
forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor_new.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/apps/domains_to_forester.rb
forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/io/parser/hmmsearch_domain_extractor.rb

diff --git a/forester/ruby/evoruby/exe/dsx2.rb b/forester/ruby/evoruby/exe/dsx2.rb
new file mode 100755 (executable)
index 0000000..dc793d1
--- /dev/null
@@ -0,0 +1,20 @@
+#!/usr/local/bin/ruby -w
+#
+# = exe/dsx
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: dsx.rb,v 1.3 2008/08/28 17:09:06 cmzmasek Exp $
+#
+# last modified: 06/11/2007
+
+require 'lib/evo/apps/domain_sequence_extractor_new'
+
+module Evoruby
+
+    dsx = DomainSequenceExtractor.new()
+
+    dsx.run()
+
+end  # module Evoruby
index 7ab2c91..5458eef 100755 (executable)
@@ -77,6 +77,6 @@ module Evoruby
     end
   end
   w = FastaWriter.new
-  w.write(unique_genes_msa, "uniques.fasta")
-  w.write(longest_non_unique_genes_msa, "non_uniques_longest.fasta")
+  w.write(unique_genes_msa, "seqs_from_unique_genes.fasta")
+  w.write(longest_non_unique_genes_msa, "longest_seqs_from_nonunique_genes.fasta")
 end
diff --git a/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor_new.rb b/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor_new.rb
new file mode 100644 (file)
index 0000000..efa3968
--- /dev/null
@@ -0,0 +1,266 @@
+#
+# = lib/evo/apps/domain_sequence_extractor.rb - DomainSequenceExtractor class
+#
+# Copyright::  Copyright (C) 2012 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id:Exp $
+
+
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/util/command_line_arguments'
+require 'lib/evo/io/parser/hmmscan_domain_extractor'
+
+module Evoruby
+
+  class DomainSequenceExtractor
+
+    PRG_NAME       = "dsx"
+    PRG_VERSION    = "1.000"
+    PRG_DESC       = "extraction of domain sequences from hmmscan output"
+    PRG_DATE       = "20120906"
+    COPYRIGHT      = "2012 Christian M Zmasek"
+    CONTACT        = "phylosoft@gmail.com"
+    WWW            = "www.phylosoft.org"
+
+    E_VALUE_THRESHOLD_OPTION           = 'e'
+    LENGTH_THRESHOLD_OPTION            = 'l'
+    ADD_POSITION_OPTION                = 'p'
+    ADD_DOMAIN_NUMBER_OPTION           = 'd'
+    ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT  = 'dd'
+    ADD_DOMAIN_NUMBER_OPTION_AS_LETTER = 'dl'
+    TRIM_OPTION                        = 't'
+    LOG_FILE_SUFFIX                    = '_domain_seq_extr.log'
+    PASSED_SEQS_SUFFIX                 = '_domain_seq_extr_passed'
+    FAILED_SEQS_SUFFIX                 = '_domain_seq_extr_failed'
+    HELP_OPTION_1                      = 'help'
+    HELP_OPTION_2                      = 'h'
+
+    def run()
+
+      Util.print_program_information( PRG_NAME,
+        PRG_VERSION,
+        PRG_DESC ,
+        PRG_DATE,
+        COPYRIGHT,
+        CONTACT,
+        WWW,
+        STDOUT )
+
+      ld = Constants::LINE_DELIMITER
+
+      begin
+        cla = CommandLineArguments.new( ARGV )
+      rescue ArgumentError
+        Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT )
+      end
+
+      if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+           cla.is_option_set?( HELP_OPTION_2 ) )
+        print_help
+        exit( 0 )
+      end
+
+      if ( cla.get_number_of_files != 4 )
+        print_help
+        exit( -1 )
+      end
+
+      allowed_opts = Array.new
+      allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
+      allowed_opts.push( ADD_POSITION_OPTION )
+      allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION )
+      allowed_opts.push( LENGTH_THRESHOLD_OPTION )
+      allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT )
+      allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER )
+      allowed_opts.push( TRIM_OPTION )
+
+      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+      if ( disallowed.length > 0 )
+        Util.fatal_error( PRG_NAME,
+          "unknown option(s): " + disallowed,
+          STDOUT )
+      end
+
+      domain_id           = cla.get_file_name( 0 )
+      hmmsearch_output    = cla.get_file_name( 1 )
+      fasta_sequence_file = cla.get_file_name( 2 )
+      outfile             = cla.get_file_name( 3 )
+
+      add_position = false
+      if ( cla.is_option_set?( ADD_POSITION_OPTION ) )
+        add_position = true
+      end
+
+      trim = false
+      if ( cla.is_option_set?( TRIM_OPTION ) )
+        trim = true
+      end
+
+      add_domain_number           = false
+      add_domain_number_as_letter = false
+      add_domain_number_as_digit  = false
+
+      if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) )
+        add_domain_number = true
+      end
+      if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER ) )
+        add_domain_number_as_letter = true
+      end
+      if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT ) )
+        add_domain_number_as_digit = true
+      end
+
+      if ( add_domain_number_as_letter && add_domain_number_as_digit )
+        puts( "attempt to add domain number as letter and digit at the same time" )
+        print_help
+        exit( -1 )
+      end
+
+      e_value_threshold = -1.0
+      if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
+        begin
+          e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
+        rescue ArgumentError => e
+          Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+        if ( e_value_threshold < 0.0 )
+          Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
+        end
+      end
+
+      length_threshold = -1
+      if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) )
+        begin
+          length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION )
+        rescue ArgumentError => e
+          Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+        if ( length_threshold < 0)
+          Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT )
+        end
+      end
+
+      log = String.new
+
+      puts()
+      puts( "Domain                                 : " + domain_id )
+      log << "Domain                                 : " + domain_id + ld
+      puts( "Hmmscan outputfile                     : " + hmmsearch_output )
+      log << "Hmmscan outputfile                     : " + hmmsearch_output + ld
+      puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file )
+      log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld
+      puts( "Outputfile                             : " + outfile )
+      log << "Outputfile                             : " + outfile + ld
+      puts( "Passed sequences outfile (fasta)       : " + outfile + PASSED_SEQS_SUFFIX )
+      log << "Passed sequences outfile (fasta)       : " + outfile + PASSED_SEQS_SUFFIX + ld
+      puts( "Failed sequences outfile (fasta)       : " + outfile + FAILED_SEQS_SUFFIX )
+      log << "Failed sequences outfile (fasta)       : " + outfile + FAILED_SEQS_SUFFIX + ld
+      puts( "Logfile                                : " + outfile + LOG_FILE_SUFFIX )
+      log <<  "Logfile                                : " + outfile + LOG_FILE_SUFFIX + ld
+      if ( e_value_threshold >= 0.0 )
+        puts( "E-value threshold : " + e_value_threshold.to_s )
+        log << "E-value threshold : " + e_value_threshold.to_s + ld
+      else
+        puts( "E-value threshold : no threshold" )
+        log << "E-value threshold : no threshold" + ld
+      end
+      if ( length_threshold > 0 )
+        puts( "Length threshold  : " + length_threshold.to_s )
+        log << "Length threshold  : " + length_threshold.to_s + ld
+      else
+        puts( "Length threshold  : no threshold" )
+        log << "Length threshold  : no threshold" + ld
+      end
+
+      if ( trim )
+        puts( "Trim last 2 chars : true" )
+        log << "Trim last 2 chars : true" + ld
+      else
+        puts( "Trim names        : false" )
+        log << "Trim names        : false" + ld
+      end
+
+
+      if ( add_position )
+        puts( "Add positions (rel to complete seq) to extracted domains: true" )
+        log << "Add positions (rel to complete seq) to extracted domains: true" + ld
+      else
+        puts( "Add positions (rel to complete seq) to extracted domains: false" )
+        log << "Add positions (rel to complete seq) to extracted domains: false" + ld
+      end
+
+      if ( add_domain_number || add_domain_number_as_digit || add_domain_number_as_letter )
+        puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" )
+        log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld
+      else
+        puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" )
+        log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld
+      end
+
+      puts
+
+      domain_count = 0
+      begin
+        parser = HmmscanDomainExtractor.new()
+        domain_count = parser.parse( domain_id,
+          hmmsearch_output,
+          fasta_sequence_file,
+          outfile,
+          outfile + PASSED_SEQS_SUFFIX,
+          outfile + FAILED_SEQS_SUFFIX,
+          e_value_threshold,
+          length_threshold,
+          add_position,
+          add_domain_number,
+          add_domain_number_as_digit,
+          add_domain_number_as_letter,
+          trim,
+          log )
+      rescue ArgumentError, IOError, StandardError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "unexpected exception!: " + e.to_s, STDOUT )
+      end
+
+      puts
+      Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" )
+      Util.print_message( PRG_NAME, "wrote;               " + outfile )
+      Util.print_message( PRG_NAME, "wrote:               " + outfile + LOG_FILE_SUFFIX )
+      Util.print_message( PRG_NAME, "(wrote:              " + outfile + PASSED_SEQS_SUFFIX + ")" )
+      Util.print_message( PRG_NAME, "(wrote:              " + outfile + FAILED_SEQS_SUFFIX + ")" )
+
+      begin
+        f = File.open( outfile + LOG_FILE_SUFFIX, 'a' )
+        f.print( log )
+        f.close
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+      puts
+      Util.print_message( PRG_NAME, "OK" )
+      puts
+
+    end
+
+    def print_help()
+      puts()
+      puts( "Usage:" )
+      puts()
+      puts( "  " + PRG_NAME + ".rb [options] <domain> <hmmscan outputfile> <file containing complete sequences in fasta format> <outputfile>" )
+      puts()
+      puts( "  options: -" + E_VALUE_THRESHOLD_OPTION  + "=<f>: E-value threshold, default is no threshold" )
+      puts( "           -" + LENGTH_THRESHOLD_OPTION   + "=<i>: length threshold, default is no threshold" )
+      puts( "           -" + ADD_POSITION_OPTION  + ": to add positions (rel to complete seq) to extracted domains" )
+      puts( "           -" + ADD_DOMAIN_NUMBER_OPTION  + ": to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" )
+      puts( "           -" + ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT  + ": to add numbers to extracted domains as digit (example \"domain2\")" )
+      puts( "           -" + ADD_DOMAIN_NUMBER_OPTION_AS_LETTER  + ": to add numbers to extracted domains as letter (example \"domaina\")" )
+      puts( "           -" + TRIM_OPTION  + ": to remove the last 2 characters from sequence names" )
+      puts()
+    end
+
+  end # class DomainSequenceExtractor
+
+end # module Evoruby
index a03cc2a..a031150 100644 (file)
@@ -4,7 +4,7 @@
 # Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
 # License::    GNU Lesser General Public License (LGPL)
 #
-# $Id: domains_to_forester.rb,v 1.11 2010/12/13 19:00:11 cmzmasek Exp $
+# $Id: Exp $
 #
 # last modified: 06/11/2007
 
@@ -18,235 +18,235 @@ require 'lib/evo/sequence/domain_structure'
 
 module Evoruby
 
-    class DomainsToForester
-
-        PRG_NAME       = "d2f"
-        PRG_DESC       = "parsed hmmpfam output to forester format"
-        PRG_VERSION    = "1.0.0"
-        PRG_DATE       = "2007.12.18"
-        COPYRIGHT      = "2007 Christian M Zmasek"
-        CONTACT        = "phylosoft@gmail.com"
-        WWW            = "www.phylosoft.org"
-
-        E_VALUE_THRESHOLD_OPTION         = "e"
-        OVERWRITE_IF_SAME_FROM_TO_OPTION = "o"
-        HELP_OPTION_1                    = "help"
-        HELP_OPTION_2                    = "h"
-
-        def parse( domains_list_file,
-                original_seqs_file,
-                outfile,
-                column_delimiter,
-                e_value_threshold,
-                overwrite_if_same_from_to )
-            Util.check_file_for_readability( domains_list_file )
-            Util.check_file_for_readability( original_seqs_file )
-            Util.check_file_for_writability( outfile )
-
-            domain_structures = Hash.new() # protein name is key, domain structure is value
-
-            f = MsaFactory.new
-
-            original_seqs = f.create_msa_from_file( original_seqs_file, FastaParser.new )
-            if ( original_seqs.get_number_of_seqs < 1 )
-                error_msg = "\"" + original_seqs_file + "\" appears devoid of sequences in fasta-format"
-                raise ArgumentError, error_msg
+  class DomainsToForester
+
+    PRG_NAME       = "d2f"
+    PRG_DESC       = "parsed hmmpfam output to forester format"
+    PRG_VERSION    = "1.001"
+    PRG_DATE       = "20120807"
+    COPYRIGHT      = "2012 Christian M Zmasek"
+    CONTACT        = "phylosoft@gmail.com"
+    WWW            = "www.phylosoft.org"
+
+    E_VALUE_THRESHOLD_OPTION         = "e"
+    OVERWRITE_IF_SAME_FROM_TO_OPTION = "o"
+    HELP_OPTION_1                    = "help"
+    HELP_OPTION_2                    = "h"
+
+    def parse( domains_list_file,
+        original_seqs_file,
+        outfile,
+        column_delimiter,
+        e_value_threshold,
+        overwrite_if_same_from_to )
+      Util.check_file_for_readability( domains_list_file )
+      Util.check_file_for_readability( original_seqs_file )
+      Util.check_file_for_writability( outfile )
+
+      domain_structures = Hash.new() # protein name is key, domain structure is value
+
+      f = MsaFactory.new
+
+      original_seqs = f.create_msa_from_file( original_seqs_file, FastaParser.new )
+      if ( original_seqs.get_number_of_seqs < 1 )
+        error_msg = "\"" + original_seqs_file + "\" appears devoid of sequences in fasta-format"
+        raise ArgumentError, error_msg
+      end
+
+      File.open( domains_list_file ) do | file |
+        while line = file.gets
+          if !is_ignorable?( line )
+            a = line.split( column_delimiter )
+            l = a.length
+            if ( ( l < 4 ) || ( e_value_threshold >= 0.0 && l < 5 ) )
+              error_msg = "unexpected format at line: " + line
+              raise IOError, error_msg
             end
-
-            File.open( domains_list_file ) do | file |
-                while line = file.gets
-                    if ( !is_ignorable?( line ) )
-                        a = line.split( column_delimiter )
-                        l = a.length
-                        if ( ( l < 4 ) || ( e_value_threshold >= 0.0 && l < 5 ) )
-                            error_msg = "unexpected format at line: " + line
-                            raise IOError, error_msg
-                        end
-                        protein_name = a[ 0 ]
-                        domain_name  = a[ 1 ]
-                        seq_from     = -1
-                        seq_to       = -1
-                        begin
-                            seq_from = a[ 2 ].to_i
-                        rescue Exception
-                            error_msg = "failed to parse seq from from \"" + a[ 2 ] + "\" [line: " + line + "]"
-                            raise IOError, error_msg
-                        end
-                        begin
-                            seq_to = a[ 3 ].to_i
-                        rescue Exception
-                            error_msg = "failed to parse seq to from \"" + a[ 3 ] + "\" [line: " + line + "]"
-                            raise IOError, error_msg
-                        end
-
-                        e_value = -1
-                        if ( l > 4 )
-                            begin
-                                e_value = a[ 4 ].to_f
-                            rescue Exception
-                                error_msg = "failed to parse E-value from \"" + a[ 4 ] + "\" [line: " + line + "]"
-                                raise IOError, error_msg
-                            end
-                        end
-
-                        seq = original_seqs.get_by_name( protein_name, true, false )
-
-                        total_length = seq.get_length
-
-                        if ( ( ( e_value_threshold < 0.0 ) || ( e_value <= e_value_threshold ) )  )
-                            pd = ProteinDomain.new( domain_name, seq_from, seq_to, "", e_value )
-                            ds = nil
-                            if ( domain_structures.has_key?( protein_name ) )
-                                ds = domain_structures[ protein_name ]
-                            else
-                                ds = DomainStructure.new( total_length )
-                                domain_structures[ protein_name ] = ds
-                            end
-                            ds.add_domain( pd, overwrite_if_same_from_to )
-                        end
-
-                    end
-                end
-            end
-
-            out = File.open( outfile, "a" )
-            ds = domain_structures.sort
-            for d in ds
-                protein_name     = d[ 0 ]
-                domain_structure = d[ 1 ]
-                out.print( protein_name.to_s )
-                out.print( ":" )
-                out.print( domain_structure.to_NHX )
-                out.print( Constants::LINE_DELIMITER  )
-            end
-
-            out.flush()
-            out.close()
-
-        end # parse
-
-
-
-
-        def run()
-
-            Util.print_program_information( PRG_NAME,
-                PRG_VERSION,
-                PRG_DESC,
-                PRG_DATE,
-                COPYRIGHT,
-                CONTACT,
-                WWW,
-                STDOUT )
-
+            protein_name = a[ 0 ]
+            domain_name  = a[ 1 ]
+            seq_from     = -1
+            seq_to       = -1
             begin
-                cla = CommandLineArguments.new( ARGV )
-            rescue ArgumentError => e
-                Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
-            end
-
-            if ( cla.is_option_set?( HELP_OPTION_1 ) ||
-                     cla.is_option_set?( HELP_OPTION_2 ) )
-                print_help
-                exit( 0 )
-            end
-
-            if ( cla.get_number_of_files != 3 )
-                print_help
-                exit( -1 )
+              seq_from = a[ 2 ].to_i
+            rescue Exception
+              error_msg = "failed to parse seq from from \"" + a[ 2 ] + "\" [line: " + line + "]"
+              raise IOError, error_msg
             end
-
-            allowed_opts = Array.new
-            allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
-            allowed_opts.push( OVERWRITE_IF_SAME_FROM_TO_OPTION )
-
-            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
-            if ( disallowed.length > 0 )
-                Util.fatal_error( PRG_NAME,
-                    "unknown option(s): " + disallowed,
-                    STDOUT )
+            begin
+              seq_to = a[ 3 ].to_i
+            rescue Exception
+              error_msg = "failed to parse seq to from \"" + a[ 3 ] + "\" [line: " + line + "]"
+              raise IOError, error_msg
             end
 
-            domains_list_file       = cla.get_file_name( 0 )
-            original_sequences_file = cla.get_file_name( 1 )
-            outfile                 = cla.get_file_name( 2 )
-
-
-            e_value_threshold = -1.0
-            if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
-                begin
-                    e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
-                rescue ArgumentError => e
-                    Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
-                end
-                if ( e_value_threshold < 0.0 )
-                    Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
-                end
-            end
-            overwrite_if_same_from_to = false
-            if ( cla.is_option_set?( OVERWRITE_IF_SAME_FROM_TO_OPTION ) )
-                overwrite_if_same_from_to = true
+            e_value = -1
+            if l > 4
+              begin
+                e_value = a[ 4 ].to_f
+              rescue Exception
+                error_msg = "failed to parse E-value from \"" + a[ 4 ] + "\" [line: " + line + "]"
+                raise IOError, error_msg
+              end
             end
 
-            puts()
-            puts( "Domains list file                      : " + domains_list_file )
-            puts( "Fasta sequencefile (complete sequences): " + original_sequences_file )
-            puts( "Outputfile                             : " + outfile )
-            if ( e_value_threshold >= 0.0 )
-                puts( "E-value threshold                      : " + e_value_threshold.to_s )
-            else
-                puts( "E-value threshold                      : no threshold" )
-            end
-            if ( overwrite_if_same_from_to )
-                puts( "Overwrite if same from and to          : true" )
-            else
-                puts( "Overwrite if same from and to          : false" )
-            end
+            seq = original_seqs.get_by_name_start( protein_name )
 
-            puts
+            total_length = seq.get_length
 
-            begin
-                parse( domains_list_file,
-                    original_sequences_file,
-                    outfile,
-                    " ",
-                    e_value_threshold,
-                    overwrite_if_same_from_to )
-
-            rescue ArgumentError, IOError, StandardError => e
-                Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
-            rescue Exception => e
-                Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT )
+            if ( ( ( e_value_threshold < 0.0 ) || ( e_value <= e_value_threshold ) )  )
+              pd = ProteinDomain.new( domain_name, seq_from, seq_to, "", e_value )
+              ds = nil
+              if ( domain_structures.has_key?( protein_name ) )
+                ds = domain_structures[ protein_name ]
+              else
+                ds = DomainStructure.new( total_length )
+                domain_structures[ protein_name ] = ds
+              end
+              ds.add_domain( pd, overwrite_if_same_from_to )
             end
 
-
-            puts
-            Util.print_message( PRG_NAME, 'OK' )
-            puts
-
+          end
         end
-
-        private
-
-        def print_help()
-            puts()
-            puts( "Usage:" )
-            puts()
-            puts( "  " + PRG_NAME + ".rb [options] <domains list file (parsed hmmpfam output)> <file containing complete sequences in fasta format> <outputfile>" )
-            puts()
-            puts( "  options: -" + E_VALUE_THRESHOLD_OPTION  + "=<f> : E-value threshold, default is no threshold" )
-            puts( "               -" + OVERWRITE_IF_SAME_FROM_TO_OPTION  + " : overwrite domain with same start and end with domain with better E-value" )
-            puts()
+      end
+
+      out = File.open( outfile, "a" )
+      ds = domain_structures.sort
+      for d in ds
+        protein_name     = d[ 0 ]
+        domain_structure = d[ 1 ]
+        out.print( protein_name.to_s )
+        out.print( ":" )
+        out.print( domain_structure.to_NHX )
+        out.print( Constants::LINE_DELIMITER  )
+      end
+
+      out.flush()
+      out.close()
+
+    end # parse
+
+
+
+
+    def run()
+
+      Util.print_program_information( PRG_NAME,
+        PRG_VERSION,
+        PRG_DESC,
+        PRG_DATE,
+        COPYRIGHT,
+        CONTACT,
+        WWW,
+        STDOUT )
+
+      begin
+        cla = CommandLineArguments.new( ARGV )
+      rescue ArgumentError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+      end
+
+      if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+           cla.is_option_set?( HELP_OPTION_2 ) )
+        print_help
+        exit( 0 )
+      end
+
+      if cla.get_number_of_files != 3
+        print_help
+        exit( -1 )
+      end
+
+      allowed_opts = Array.new
+      allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
+      allowed_opts.push( OVERWRITE_IF_SAME_FROM_TO_OPTION )
+
+      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+      if ( disallowed.length > 0 )
+        Util.fatal_error( PRG_NAME,
+          "unknown option(s): " + disallowed,
+          STDOUT )
+      end
+
+      domains_list_file       = cla.get_file_name( 0 )
+      original_sequences_file = cla.get_file_name( 1 )
+      outfile                 = cla.get_file_name( 2 )
+
+
+      e_value_threshold = -1.0
+      if cla.is_option_set?( E_VALUE_THRESHOLD_OPTION )
+        begin
+          e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
         end
-
-
-
-        def is_ignorable?( line )
-            return ( line !~ /[A-Za-z0-9-]/ || line =~ /^\s*#/)
+        if ( e_value_threshold < 0.0 )
+          Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
         end
-
-
-    end # class DomainsToForester
+      end
+      overwrite_if_same_from_to = false
+      if ( cla.is_option_set?( OVERWRITE_IF_SAME_FROM_TO_OPTION ) )
+        overwrite_if_same_from_to = true
+      end
+
+      puts
+      puts( "Domains list file                      : " + domains_list_file )
+      puts( "Fasta sequencefile (complete sequences): " + original_sequences_file )
+      puts( "Outputfile                             : " + outfile )
+      if ( e_value_threshold >= 0.0 )
+        puts( "E-value threshold                      : " + e_value_threshold.to_s )
+      else
+        puts( "E-value threshold                      : no threshold" )
+      end
+      if ( overwrite_if_same_from_to )
+        puts( "Overwrite if same from and to          : true" )
+      else
+        puts( "Overwrite if same from and to          : false" )
+      end
+
+      puts
+
+      begin
+        parse( domains_list_file,
+          original_sequences_file,
+          outfile,
+          " ",
+          e_value_threshold,
+          overwrite_if_same_from_to )
+
+      rescue ArgumentError, IOError, StandardError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT )
+      end
+
+
+      puts
+      Util.print_message( PRG_NAME, 'OK' )
+      puts
+
+    end
+
+    private
+
+    def print_help()
+      puts
+      puts( "Usage:" )
+      puts
+      puts( "  " + PRG_NAME + ".rb [options] <domains list file (parsed hmmpfam output)> <file containing complete sequences in fasta format> <outputfile>" )
+      puts()
+      puts( "  options: -" + E_VALUE_THRESHOLD_OPTION  + "=<f> : E-value threshold, default is no threshold" )
+      puts( "               -" + OVERWRITE_IF_SAME_FROM_TO_OPTION  + " : overwrite domain with same start and end with domain with better E-value" )
+      puts
+    end
+
+
+
+    def is_ignorable?( line )
+      return ( line !~ /[A-Za-z0-9-]/ || line =~ /^\s*#/)
+    end
+
+
+  end # class DomainsToForester
 
 
 end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb
new file mode 100644 (file)
index 0000000..bd3ae8b
--- /dev/null
@@ -0,0 +1,307 @@
+#
+# = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanDomainExtractor class
+#
+# Copyright::  Copyright (C) 2012 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id:  $
+
+
+require 'lib/evo/util/constants'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/io/msa_io'
+require 'lib/evo/io/writer/fasta_writer'
+require 'lib/evo/io/parser/fasta_parser'
+
+
+module Evoruby
+
+  class HmmscanDomainExtractor
+
+    TRIM_BY = 2
+
+    def initialize
+    end
+
+    # raises ArgumentError, IOError, StandardError
+    def parse( domain_id,
+        hmmsearch_output,
+        fasta_sequence_file,
+        outfile,
+        passed_seqs_outfile,
+        failed_seqs_outfile,
+        e_value_threshold,
+        length_threshold,
+        add_position,
+        add_domain_number,
+        add_domain_number_as_digit,
+        add_domain_number_as_letter,
+        trim_name,
+        log )
+
+      Util.check_file_for_readability( hmmsearch_output )
+      Util.check_file_for_readability( fasta_sequence_file )
+      Util.check_file_for_writability( outfile )
+      Util.check_file_for_writability( passed_seqs_outfile )
+      Util.check_file_for_writability( failed_seqs_outfile )
+
+      in_msa = nil
+      factory = MsaFactory.new()
+      in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
+
+      if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
+        error_msg = "could not find fasta sequences in " + fasta_sequence_file
+        raise IOError, error_msg
+      end
+
+      out_msa = Msa.new
+      failed_seqs = Msa.new
+      passed_seqs = Msa.new
+
+      ld = Constants::LINE_DELIMITER
+
+      domain_pass_counter     = 0
+      domain_fail_counter     = 0
+      proteins_with_passing_domains = 0
+      proteins_with_failing_domains = 0
+      max_domain_copy_number_per_protein = -1
+      max_domain_copy_number_sequence    = ''
+      failed_species_counts         = Hash.new
+      passed_species_counts         = Hash.new
+
+      File.open( hmmsearch_output ) do | file |
+        while line = file.gets
+          if !is_ignorable?( line ) && line =~ /^\S+\s+/
+
+            #         tn      acc     tlen    query   acc     qlen    Evalue  score   bias    #       of      c-E     i-E     score   bias    hf      ht      af      at      ef      et      acc     desc
+            #         1       2       3       4       5       6       7       8       9       10      11      12      13      14      15      16      17      18      19      20      21      22      23
+            line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/
+
+            target_name = $1
+            if domain_id != target_name
+              next
+            end
+            
+            
+            sequence = $4
+            number   = $10.to_i
+            out_of   = $11.to_i
+            env_from = $20.to_i
+            env_to   = $21.to_i
+            i_e_value  = $13.to_f
+            if ( number > max_domain_copy_number_per_protein )
+              max_domain_copy_number_sequence    = sequence
+              max_domain_copy_number_per_protein = number
+            end
+            if ( ( ( e_value_threshold.to_f < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
+                 ( ( length_threshold.to_f <= 0 )   || ( env_to - env_from + 1 ) >= length_threshold.to_f )  )
+              extract_domain( sequence,
+                number,
+                out_of,
+                env_from,
+                env_to,
+                in_msa,
+                out_msa,
+                add_position,
+                add_domain_number,
+                add_domain_number_as_digit,
+                add_domain_number_as_letter,
+                trim_name )
+              domain_pass_counter += 1
+              count_species( sequence, passed_species_counts )
+              if passed_seqs.find_by_name_start( sequence, true ).length < 1
+                add_sequence( sequence, in_msa, passed_seqs )
+                proteins_with_passing_domains += 1
+              end
+            else
+              print( domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" )
+              log << domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)"
+              if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) )
+                print( " iE=" + i_e_value.to_s )
+                log << " iE=" + i_e_value.to_s
+              end
+              if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f )
+                le = env_to - env_from + 1
+                print( " l=" + le.to_s )
+                log << " l=" + le.to_s
+              end
+              print( Constants::LINE_DELIMITER )
+              log << Constants::LINE_DELIMITER
+              domain_fail_counter  += 1
+              count_species( sequence, failed_species_counts )
+              if failed_seqs.find_by_name_start( sequence, true ).length < 1
+                add_sequence( sequence, in_msa, failed_seqs )
+                proteins_with_failing_domains += 1
+              end
+            end
+          end
+        end
+      end
+
+      if domain_pass_counter < 1
+        error_msg = "no domain sequences were extracted"
+        raise StandardError, error_msg
+      end
+
+      log << Constants::LINE_DELIMITER
+      puts( "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s )
+      log << "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s
+      log << Constants::LINE_DELIMITER
+
+      if ( max_domain_copy_number_per_protein > 1 )
+        puts( "First protein with this copy number: " + max_domain_copy_number_sequence )
+        log << "First protein with this copy number: " + max_domain_copy_number_sequence
+        log << Constants::LINE_DELIMITER
+      end
+
+      io = MsaIO.new()
+      w = FastaWriter.new()
+      w.set_line_width( 60 )
+      w.clean( true )
+
+      begin
+        io.write_to_file( out_msa, outfile, w )
+      rescue Exception
+        error_msg = "could not write to \"" + outfile + "\""
+        raise IOError, error_msg
+      end
+
+      begin
+        io.write_to_file( passed_seqs, passed_seqs_outfile, w )
+      rescue Exception
+        error_msg = "could not write to \"" + passed_seqs_outfile + "\""
+        raise IOError, error_msg
+      end
+
+      begin
+        io.write_to_file( failed_seqs, failed_seqs_outfile, w )
+      rescue Exception
+        error_msg = "could not write to \"" + failed_seqs_outfile + "\""
+        raise IOError, error_msg
+      end
+
+      log << ld
+      log << "passing domains              : " + domain_pass_counter.to_s + ld
+      log << "failing domains              : " + domain_fail_counter.to_s + ld
+      log << "proteins with passing domains: " + proteins_with_passing_domains.to_s + ld
+      log << "proteins with failing domains: " + proteins_with_failing_domains.to_s + ld
+      log << ld
+      log << 'passing domains counts per species: ' << ld
+      passed_species_counts.each_pair { | species, count | log << "#{species}: #{count}" << ld }
+      log << ld
+      log << 'failing domains counts per species: ' << ld
+      failed_species_counts.each_pair { | species, count | log << "#{species}: #{count}" << ld }
+      log << ld
+      return domain_pass_counter
+
+    end # parse
+
+    private
+
+
+    def add_sequence( sequence_name, in_msa, add_to_msa )
+      seqs = in_msa.find_by_name_start( sequence_name, true )
+      if ( seqs.length < 1 )
+        error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
+        raise StandardError, error_msg
+      end
+      if ( seqs.length > 1 )
+        error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
+        raise StandardError, error_msg
+      end
+      seq = in_msa.get_sequence( seqs[ 0 ] )
+      add_to_msa.add_sequence( seq )
+    end
+
+    # raises ArgumentError, StandardError
+    def extract_domain( sequence,
+        number,
+        out_of,
+        seq_from,
+        seq_to,
+        in_msa,
+        out_msa,
+        add_position,
+        add_domain_number,
+        add_domain_number_as_digit,
+        add_domain_number_as_letter,
+        trim_name )
+      if ( number < 1 || out_of < 1 || number > out_of )
+        error_msg = "impossible: number=" + number.to_s + ", out of=" + out_of.to_s
+        raise ArgumentError, error_msg
+      end
+      if ( seq_from < 1 || seq_to < 1 || seq_from >= seq_to )
+        error_msg = "impossible: seq-f=" + seq_from.to_s + ", seq-t=" + seq_to.to_s
+        raise ArgumentError, error_msg
+      end
+      seqs = in_msa.find_by_name_start( sequence, true )
+      if seqs.length < 1
+        error_msg = "sequence \"" + sequence + "\" not found in sequence file"
+        raise StandardError, error_msg
+      end
+      if seqs.length > 1
+        error_msg = "sequence \"" + sequence + "\" not unique in sequence file"
+        raise StandardError, error_msg
+      end
+      # hmmsearch is 1 based, wheres sequences are 0 bases in this package.
+      seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
+      
+      seq.set_name( seq.get_name.split[ 0 ] )
+      
+      if add_position
+        seq.set_name( seq.get_name + "_" + seq_from.to_s + "-" + seq_to.to_s )
+      end
+
+      if trim_name
+        seq.set_name( seq.get_name[ 0, seq.get_name.length - TRIM_BY ] )
+      end
+
+      if out_of != 1
+        if ( add_domain_number_as_digit )
+          seq.set_name( seq.get_name + number.to_s )
+        elsif ( add_domain_number_as_letter )
+          if number > 25
+            error_msg = 'too many identical domains per sequence, cannot use letters to distinguish them'
+            raise StandardError, error_msg
+          end
+          seq.set_name( seq.get_name + ( number + 96 ).chr )
+        elsif ( add_domain_number )
+          seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
+        end
+      end
+
+      # if ( seq.get_name.length > 10 )
+      #   error_msg = "sequence name [" + seq.get_name + "] is longer than 10 characters"
+      #   raise StandardError, error_msg
+      # end
+
+      out_msa.add_sequence( seq )
+    end
+
+    def count_species( sequence, species_counts_map )
+      species = get_species( sequence )
+      if species != nil
+        if !species_counts_map.has_key?( species )
+          species_counts_map[ species ] = 1
+        else
+          species_counts_map[ species ] = species_counts_map[ species ] + 1
+        end
+      end
+    end
+
+    def get_species( sequence_name )
+      if sequence_name =~ /^.+_(.+)$/
+        return $1
+      else
+        return nil
+      end
+    end
+
+    def is_ignorable?( line )
+      return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
+    end
+
+  end # class HmmscanDomainExtractor
+
+end # module Evoruby
+
index d228024..238d13c 100644 (file)
@@ -76,7 +76,6 @@ module Evoruby
                         #         1       2       3       4       5       6       7       8       9       10      11      12      13      14      15      16      17      18      19      20      21      22      23
                         line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/
 
-                        # line =~ /^(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)/
                         sequence = $1
                         number   = $10.to_i
                         out_of   = $11.to_i