refactored
authorcmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Tue, 2 Oct 2012 18:53:34 +0000 (18:53 +0000)
committercmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Tue, 2 Oct 2012 18:53:34 +0000 (18:53 +0000)
13 files changed:
forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/tool/domains_to_forester.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/tool/evo_nursery.rb [new file with mode: 0755]
forester/ruby/evoruby/lib/evo/tool/fasta_extractor.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/tool/fasta_taxonomy_processor.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/tool/hmmscan_parser.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/tool/msa_processor.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/tool/new_tap.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/tool/phylogenies_decorator.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/tool/phylogeny_factory.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/tool/tseq_taxonomy_processor.rb [new file with mode: 0644]

diff --git a/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb
new file mode 100644 (file)
index 0000000..efe6f6a
--- /dev/null
@@ -0,0 +1,268 @@
+#
+# = lib/evo/apps/domain_sequence_extractor.rb - DomainSequenceExtractor class
+#
+# Copyright::  Copyright (C) 2012 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id:Exp $
+
+
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/util/command_line_arguments'
+require 'lib/evo/io/parser/hmmscan_domain_extractor'
+
+module Evoruby
+
+  class DomainSequenceExtractor
+
+    PRG_NAME       = "dsx"
+    PRG_VERSION    = "2.000"
+    PRG_DESC       = "extraction of domain sequences from hmmscan output"
+    PRG_DATE       = "20121001"
+    COPYRIGHT      = "2012 Christian M Zmasek"
+    CONTACT        = "phylosoft@gmail.com"
+    WWW            = "www.phylosoft.org"
+
+    E_VALUE_THRESHOLD_OPTION           = 'e'
+    LENGTH_THRESHOLD_OPTION            = 'l'
+    ADD_POSITION_OPTION                = 'p'
+    ADD_DOMAIN_NUMBER_OPTION           = 'd'
+    ADD_SPECIES                        = 's'
+    MIN_LINKER_OPT                     = 'ml'
+    LOG_FILE_SUFFIX                    = '_domain_seq_extr.log'
+    PASSED_SEQS_SUFFIX                 = '_with_passing_domains.fasta'
+    FAILED_SEQS_SUFFIX                 = '_with_no_passing_domains.fasta'
+    HELP_OPTION_1                      = 'help'
+    HELP_OPTION_2                      = 'h'
+
+    def run()
+
+      Util.print_program_information( PRG_NAME,
+        PRG_VERSION,
+        PRG_DESC ,
+        PRG_DATE,
+        COPYRIGHT,
+        CONTACT,
+        WWW,
+        STDOUT )
+
+      ld = Constants::LINE_DELIMITER
+
+      begin
+        cla = CommandLineArguments.new( ARGV )
+      rescue ArgumentError
+        Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT )
+      end
+
+      if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+           cla.is_option_set?( HELP_OPTION_2 ) )
+        print_help
+        exit( 0 )
+      end
+
+      if ( cla.get_number_of_files != 4 )
+        print_help
+        exit( -1 )
+      end
+
+      allowed_opts = Array.new
+      allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
+      allowed_opts.push( ADD_POSITION_OPTION )
+      allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION )
+      allowed_opts.push( LENGTH_THRESHOLD_OPTION )
+      allowed_opts.push( ADD_SPECIES )
+      allowed_opts.push( MIN_LINKER_OPT )
+
+      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+      if ( disallowed.length > 0 )
+        Util.fatal_error( PRG_NAME,
+          "unknown option(s): " + disallowed,
+          STDOUT )
+      end
+
+      domain_id           = cla.get_file_name( 0 )
+      hmmsearch_output    = cla.get_file_name( 1 )
+      fasta_sequence_file = cla.get_file_name( 2 )
+      outfile             = cla.get_file_name( 3 )
+
+      if outfile.downcase.end_with?( ".fasta" )
+        outfile = outfile[ 0 .. outfile.length - 7 ]
+      elsif outfile.downcase.end_with?( ".fsa" )
+        outfile = outfile[ 0 .. outfile.length - 5 ]
+      end
+
+
+      add_position = false
+      if ( cla.is_option_set?( ADD_POSITION_OPTION ) )
+        add_position = true
+      end
+
+      add_domain_number           = false
+      if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) )
+        add_domain_number = true
+      end
+
+      add_species = false
+      if cla.is_option_set? ADD_SPECIES
+        add_species = true
+      end
+
+      e_value_threshold = -1.0
+      if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
+        begin
+          e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
+        rescue ArgumentError => e
+          Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+        if ( e_value_threshold < 0.0 )
+          Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
+        end
+      end
+
+      length_threshold = -1
+      if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) )
+        begin
+          length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION )
+        rescue ArgumentError => e
+          Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+        if ( length_threshold < 0)
+          Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT )
+        end
+      end
+
+
+      min_linker = nil
+      if ( cla.is_option_set?( MIN_LINKER_OPT ) )
+        begin
+          min_linker = cla.get_option_value_as_int( MIN_LINKER_OPT )
+        rescue ArgumentError => e
+          Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+        if ( !min_linker || min_linker > 100 || min_linker < -100 )
+          Forester::Util.fatal_error( PRG_NAME, "unexpected value for min linker " + min_linker.to_s, STDOUT )
+        end
+      end
+
+
+      log = String.new
+
+      puts()
+      puts( "Domain                                 : " + domain_id )
+      log << "Domain                                 : " + domain_id + ld
+      puts( "Hmmscan outputfile                     : " + hmmsearch_output )
+      log << "Hmmscan outputfile                     : " + hmmsearch_output + ld
+      puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file )
+      log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld
+      puts( "Outputfile                             : " + outfile + ".fasta" )
+      log << "Outputfile                             : " + outfile + ld
+      puts( "Passed sequences outfile (fasta)       : " + outfile + PASSED_SEQS_SUFFIX )
+      log << "Passed sequences outfile (fasta)       : " + outfile + PASSED_SEQS_SUFFIX + ld
+      puts( "Failed sequences outfile (fasta)       : " + outfile + FAILED_SEQS_SUFFIX )
+      log << "Failed sequences outfile (fasta)       : " + outfile + FAILED_SEQS_SUFFIX + ld
+      puts( "Logfile                                : " + outfile + LOG_FILE_SUFFIX )
+      log <<  "Logfile                                : " + outfile + LOG_FILE_SUFFIX + ld
+      if ( e_value_threshold >= 0.0 )
+        puts( "E-value threshold : " + e_value_threshold.to_s )
+        log << "E-value threshold : " + e_value_threshold.to_s + ld
+      else
+        puts( "E-value threshold : no threshold" )
+        log << "E-value threshold : no threshold" + ld
+      end
+      if ( length_threshold > 0 )
+        puts( "Length threshold  : " + length_threshold.to_s )
+        log << "Length threshold  : " + length_threshold.to_s + ld
+      else
+        puts( "Length threshold  : no threshold" )
+        log << "Length threshold  : no threshold" + ld
+      end
+      if ( min_linker )
+        puts( "Min linker        : " + min_linker.to_s )
+        log << "Min linker        :  " + min_linker.to_s +  ld
+
+      end
+
+
+      if ( add_position )
+        puts( "Add positions (rel to complete seq) to extracted domains: true" )
+        log << "Add positions (rel to complete seq) to extracted domains: true" + ld
+      else
+        puts( "Add positions (rel to complete seq) to extracted domains: false" )
+        log << "Add positions (rel to complete seq) to extracted domains: false" + ld
+      end
+
+      if ( add_domain_number )
+        puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" )
+        log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld
+      else
+        puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" )
+        log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld
+      end
+
+      puts
+
+      domain_count = 0
+      begin
+        parser = HmmscanDomainExtractor.new()
+        domain_count = parser.parse( domain_id,
+          hmmsearch_output,
+          fasta_sequence_file,
+          outfile,
+          outfile + PASSED_SEQS_SUFFIX,
+          outfile + FAILED_SEQS_SUFFIX,
+          e_value_threshold,
+          length_threshold,
+          add_position,
+          add_domain_number,
+          add_species,
+          min_linker,
+          log )
+      rescue ArgumentError, IOError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+
+      rescue Exception => e
+        puts e.backtrace
+        Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT )
+
+      end
+
+      puts
+      Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" )
+      Util.print_message( PRG_NAME, "wrote;               " + outfile + ".fasta")
+      Util.print_message( PRG_NAME, "wrote:               " + outfile + LOG_FILE_SUFFIX )
+      Util.print_message( PRG_NAME, "wrote:               " + outfile + PASSED_SEQS_SUFFIX )
+      Util.print_message( PRG_NAME, "wrote:               " + outfile + FAILED_SEQS_SUFFIX )
+
+      begin
+        f = File.open( outfile + LOG_FILE_SUFFIX, 'a' )
+        f.print( log )
+        f.close
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+      puts
+      Util.print_message( PRG_NAME, "OK" )
+      puts
+
+    end
+
+    def print_help()
+      puts()
+      puts( "Usage:" )
+      puts()
+      puts( "  " + PRG_NAME + ".rb [options] <domain> <hmmscan outputfile> <file containing complete sequences in fasta format> <outputfile>" )
+      puts()
+      puts( "  options: -" + E_VALUE_THRESHOLD_OPTION  + "=<f>: E-value threshold, default is no threshold" )
+      puts( "           -" + LENGTH_THRESHOLD_OPTION   + "=<i>: length threshold, default is no threshold" )
+      puts( "           -" + ADD_POSITION_OPTION  + ": to add positions (rel to complete seq) to extracted domains" )
+      puts( "           -" + ADD_DOMAIN_NUMBER_OPTION  + ": to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" )
+      puts( "           -" + ADD_SPECIES  + ": to add species [in brackets]" )
+      puts( "           -" + MIN_LINKER_OPT  + "=<i>: to extract pairs of same domains with a distance inbetween shorter than a given value" )
+      puts()
+    end
+
+  end # class DomainSequenceExtractor
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/tool/domains_to_forester.rb b/forester/ruby/evoruby/lib/evo/tool/domains_to_forester.rb
new file mode 100644 (file)
index 0000000..85af666
--- /dev/null
@@ -0,0 +1,261 @@
+#
+# = lib/evo/apps/domains_to_forester - DomainsToForester class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: Exp $
+#
+# last modified: 06/11/2007
+
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/util/command_line_arguments'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/io/parser/fasta_parser'
+require 'lib/evo/sequence/protein_domain'
+require 'lib/evo/sequence/domain_structure'
+
+module Evoruby
+
+  class DomainsToForester
+
+    PRG_NAME       = "d2f"
+    PRG_DESC       = "parsed hmmpfam output to forester format"
+    PRG_VERSION    = "1.001"
+    PRG_DATE       = "20120807"
+    COPYRIGHT      = "2012 Christian M Zmasek"
+    CONTACT        = "phylosoft@gmail.com"
+    WWW            = "www.phylosoft.org"
+
+    E_VALUE_THRESHOLD_OPTION         = "e"
+    OVERWRITE_IF_SAME_FROM_TO_OPTION = "o"
+    HELP_OPTION_1                    = "help"
+    HELP_OPTION_2                    = "h"
+
+    def parse( domains_list_file,
+        original_seqs_file,
+        outfile,
+        column_delimiter,
+        e_value_threshold,
+        overwrite_if_same_from_to )
+      Util.check_file_for_readability( domains_list_file )
+      Util.check_file_for_readability( original_seqs_file )
+      Util.check_file_for_writability( outfile )
+
+      domain_structures = Hash.new() # protein name is key, domain structure is value
+
+      f = MsaFactory.new
+
+      original_seqs = f.create_msa_from_file( original_seqs_file, FastaParser.new )
+      if ( original_seqs.get_number_of_seqs < 1 )
+        error_msg = "\"" + original_seqs_file + "\" appears devoid of sequences in fasta-format"
+        raise ArgumentError, error_msg
+      end
+
+      File.open( domains_list_file ) do | file |
+        while line = file.gets
+          if !is_ignorable?( line )
+            
+            a = line.split( column_delimiter )
+            l = a.length
+            if ( ( l < 4 ) || ( e_value_threshold >= 0.0 && l < 5 ) )
+              error_msg = "unexpected format at line: " + line
+              raise IOError, error_msg
+            end
+            protein_name = a[ 0 ]
+            domain_name  = a[ 1 ]
+            seq_from     = -1
+            seq_to       = -1
+            ##########################################
+            if domain_name =~ /RRM_\d/
+              puts "ignoring " + line 
+              next
+            end
+            ##########################################
+            
+            
+            begin
+              seq_from = a[ 2 ].to_i
+            rescue Exception
+              error_msg = "failed to parse seq from from \"" + a[ 2 ] + "\" [line: " + line + "]"
+              raise IOError, error_msg
+            end
+            begin
+              seq_to = a[ 3 ].to_i
+            rescue Exception
+              error_msg = "failed to parse seq to from \"" + a[ 3 ] + "\" [line: " + line + "]"
+              raise IOError, error_msg
+            end
+
+            e_value = -1
+            if l > 4
+              begin
+                e_value = a[ 4 ].to_f
+              rescue Exception
+                error_msg = "failed to parse E-value from \"" + a[ 4 ] + "\" [line: " + line + "]"
+                raise IOError, error_msg
+              end
+            end
+
+            seq = original_seqs.get_by_name_start( protein_name )
+
+            total_length = seq.get_length
+
+            if ( ( ( e_value_threshold < 0.0 ) || ( e_value <= e_value_threshold ) )  )
+              pd = ProteinDomain.new( domain_name, seq_from, seq_to, "", e_value )
+              ds = nil
+              if ( domain_structures.has_key?( protein_name ) )
+                ds = domain_structures[ protein_name ]
+              else
+                ds = DomainStructure.new( total_length )
+                domain_structures[ protein_name ] = ds
+              end
+              ds.add_domain( pd, overwrite_if_same_from_to )
+            end
+
+          end
+        end
+      end
+
+      out = File.open( outfile, "a" )
+      ds = domain_structures.sort
+      for d in ds
+        protein_name     = d[ 0 ]
+        domain_structure = d[ 1 ]
+        out.print( protein_name.to_s )
+        out.print( ": " )
+        out.print( domain_structure.to_NHX )
+        out.print( Constants::LINE_DELIMITER  )
+      end
+
+      out.flush()
+      out.close()
+
+    end # parse
+
+
+
+
+    def run()
+
+      Util.print_program_information( PRG_NAME,
+        PRG_VERSION,
+        PRG_DESC,
+        PRG_DATE,
+        COPYRIGHT,
+        CONTACT,
+        WWW,
+        STDOUT )
+
+      begin
+        cla = CommandLineArguments.new( ARGV )
+      rescue ArgumentError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+      end
+
+      if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+           cla.is_option_set?( HELP_OPTION_2 ) )
+        print_help
+        exit( 0 )
+      end
+
+      if cla.get_number_of_files != 3
+        print_help
+        exit( -1 )
+      end
+
+      allowed_opts = Array.new
+      allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
+      allowed_opts.push( OVERWRITE_IF_SAME_FROM_TO_OPTION )
+
+      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+      if ( disallowed.length > 0 )
+        Util.fatal_error( PRG_NAME,
+          "unknown option(s): " + disallowed,
+          STDOUT )
+      end
+
+      domains_list_file       = cla.get_file_name( 0 )
+      original_sequences_file = cla.get_file_name( 1 )
+      outfile                 = cla.get_file_name( 2 )
+
+
+      e_value_threshold = -1.0
+      if cla.is_option_set?( E_VALUE_THRESHOLD_OPTION )
+        begin
+          e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+        if ( e_value_threshold < 0.0 )
+          Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
+        end
+      end
+      overwrite_if_same_from_to = false
+      if ( cla.is_option_set?( OVERWRITE_IF_SAME_FROM_TO_OPTION ) )
+        overwrite_if_same_from_to = true
+      end
+
+      puts
+      puts( "Domains list file                      : " + domains_list_file )
+      puts( "Fasta sequencefile (complete sequences): " + original_sequences_file )
+      puts( "Outputfile                             : " + outfile )
+      if ( e_value_threshold >= 0.0 )
+        puts( "E-value threshold                      : " + e_value_threshold.to_s )
+      else
+        puts( "E-value threshold                      : no threshold" )
+      end
+      if ( overwrite_if_same_from_to )
+        puts( "Overwrite if same from and to          : true" )
+      else
+        puts( "Overwrite if same from and to          : false" )
+      end
+
+      puts
+
+      begin
+        parse( domains_list_file,
+          original_sequences_file,
+          outfile,
+          " ",
+          e_value_threshold,
+          overwrite_if_same_from_to )
+
+      rescue ArgumentError, IOError, StandardError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT )
+      end
+
+
+      puts
+      Util.print_message( PRG_NAME, 'OK' )
+      puts
+
+    end
+
+    private
+
+    def print_help()
+      puts
+      puts( "Usage:" )
+      puts
+      puts( "  " + PRG_NAME + ".rb [options] <domains list file (parsed hmmpfam output)> <file containing complete sequences in fasta format> <outputfile>" )
+      puts()
+      puts( "  options: -" + E_VALUE_THRESHOLD_OPTION  + "=<f> : E-value threshold, default is no threshold" )
+      puts( "               -" + OVERWRITE_IF_SAME_FROM_TO_OPTION  + " : overwrite domain with same start and end with domain with better E-value" )
+      puts
+    end
+
+
+
+    def is_ignorable?( line )
+      return ( line !~ /[A-Za-z0-9-]/ || line =~ /^\s*#/)
+    end
+
+
+  end # class DomainsToForester
+
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/tool/evo_nursery.rb b/forester/ruby/evoruby/lib/evo/tool/evo_nursery.rb
new file mode 100755 (executable)
index 0000000..e6f653f
--- /dev/null
@@ -0,0 +1,317 @@
+#
+# = lib/evo/apps/evo_nursery.rb - EvoNursery class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: evo_nursery.rb,v 1.11 2010/12/13 19:00:11 cmzmasek Exp $
+
+
+
+require 'lib/evo/soft/fastme'
+require 'lib/evo/soft/tree_puzzle'
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/util/command_line_arguments'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/io/msa_io'
+require 'lib/evo/io/writer/phylip_sequential_writer'
+require 'lib/evo/io/parser/general_msa_parser'
+require 'lib/evo/io/writer/msa_writer'
+
+require 'iconv'
+
+module Evoruby
+
+    class EvoNursery
+        GAP_RATIO           = 0.50
+        GAP_RATIO_FOR_SEQS  = 0.75
+        MIN_LENGTH          = 30
+        MIN_SEQS            = 4
+        MAX_SEQS            = 3000
+        MAX_ALN_FILE_SIZE   = 5000000
+        MODEL               = :auto
+        RATES               = :uniform
+        FASTME_INITIAL_TREE = :GME
+        ALN_NAME            = '_align_'
+        TREE_PUZZLE_OUTDIST = TreePuzzle::OUTDIST
+        TREE_PUZZLE_OUTFILE = TreePuzzle::OUTFILE
+        FASTME_OUTTREE      = FastMe::OUTTREE
+        FASTME_OUTPUT_D     = FastMe::OUTPUT_D
+
+        PRG_NAME       = "evo_nursery"
+        PRG_DATE       = "2012.03.21"
+        PRG_DESC       = "pfam alignments to evolutionary trees"
+        PRG_VERSION    = "0.20"
+        COPYRIGHT      = "2009-2012 Christian M Zmasek"
+        CONTACT        = "phylosoft@gmail.com"
+        WWW            = "www.phylosoft.org"
+
+        HELP_OPTION_1       = "help"
+        HELP_OPTION_2       = "h"
+
+        def run
+
+            Util.print_program_information( PRG_NAME,
+                PRG_VERSION,
+                PRG_DESC,
+                PRG_DATE,
+                COPYRIGHT,
+                CONTACT,
+                WWW,
+                STDOUT )
+
+            if RUBY_VERSION !~ /1.9/
+                puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " )
+                exit( -1 )
+            end
+
+            forester_home = Util.get_env_variable_value( Constants::FORESTER_HOME_ENV_VARIABLE )
+            java_home = Util.get_env_variable_value( Constants::JAVA_HOME_ENV_VARIABLE )
+            decorator = java_home + '/bin/java -cp ' + forester_home + '/java/forester.jar org.forester.application.decorator'
+
+            if ( ARGV == nil || ARGV.length != 1 )
+                help
+                exit( -1 )
+            end
+
+            begin
+                cla = CommandLineArguments.new( ARGV )
+            rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+
+            if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+                     cla.is_option_set?( HELP_OPTION_2 ) )
+                help
+                exit( 0 )
+            end
+
+            output_dir = cla.get_file_name( 0 )
+
+            if output_dir !~ /\/$/
+                output_dir = output_dir + '/'
+            end
+
+            if !File.exists?( output_dir )
+                Util.fatal_error( PRG_NAME, output_dir.to_s + " does not exist", STDOUT )
+            end
+            ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' )
+            files = Dir.entries( "." )
+            skipped = Array.new
+            counter = 1
+            analyzed = 0;
+            begin
+                files.each { |pfam_aln_file|
+                    if ( !File.directory?( pfam_aln_file ) &&
+                             pfam_aln_file !~ /^\./ &&
+                             pfam_aln_file !~ /.+\.tre$/  )
+
+                        tree_out_file = output_dir + File.basename( pfam_aln_file ) + ".xml"
+
+                        if File.exists?( tree_out_file )
+                            puts
+                            puts
+                            puts "***** skipping " + File.basename( pfam_aln_file ) + ", already exists"
+                            puts
+                            skipped.push( File.basename( pfam_aln_file ) + " [already exists]" )
+                            next
+                        end
+
+                        puts
+                        puts counter.to_s + ": " + pfam_aln_file.to_str
+                        counter += 1
+                        if File.size( pfam_aln_file ) > MAX_ALN_FILE_SIZE
+                            puts "***** skipping, file size: " +  File.size( pfam_aln_file ).to_s
+                            skipped.push( File.basename( pfam_aln_file ) + " [file size: " +  File.size( pfam_aln_file ).to_s + "]" )
+                            next
+                        end
+
+                        f = MsaFactory.new()
+                        msa = f.create_msa_from_file( pfam_aln_file, GeneralMsaParser.new() )
+
+                        if msa.get_number_of_seqs < MIN_SEQS || msa.get_number_of_seqs > MAX_SEQS
+                            puts "***** skipping, seqs: " + msa.get_number_of_seqs.to_s
+                            skipped.push( File.basename( pfam_aln_file ) + " [seqs: " +  msa.get_number_of_seqs.to_s + "]" )
+                            next
+                        end
+
+                        msa.remove_gap_columns_w_gap_ratio!( GAP_RATIO )
+
+                        length = msa.get_length
+                        if length < MIN_LENGTH
+                            puts "***** skipping, length: " + length.to_s
+                            skipped.push( File.basename( pfam_aln_file ) + " [length: " +  length.to_s + "]" )
+                            next
+                        end
+
+                        msa.remove_sequences_by_gap_ratio!( GAP_RATIO_FOR_SEQS )
+
+                        if msa.get_number_of_seqs < MIN_SEQS
+                            puts "***** skipping, seqs: " + msa.get_number_of_seqs.to_s
+                            skipped.push( File.basename( pfam_aln_file ) + " [seqs: " +  msa.get_number_of_seqs.to_s + "]" )
+                            next
+                        end
+
+                        map_file = output_dir + File.basename( pfam_aln_file ) + ".map"
+                        f = File.open( map_file, 'a' )
+                        for i in 0 ... msa.get_number_of_seqs
+                            name = msa.get_sequence( i ).get_name()
+                            name =~ /(.+)_(.+)\/.+/
+                            acc = $1
+                            tax_code = $2
+
+                            mapping_str = i.to_s
+                            mapping_str << "\t"
+                            mapping_str << 'TAXONOMY_CODE:'
+                            mapping_str << tax_code
+                            mapping_str << "\t"
+                            mapping_str << 'SEQ_SYMBOL:'
+                            mapping_str << ( acc + '_' + tax_code )
+                            mapping_str << "\t"
+                            if ( acc.length < 6 )
+                                acc = acc + '_' + tax_code
+                            end
+                            mapping_str << 'SEQ_ACCESSION:'
+                            mapping_str << acc
+                            mapping_str << "\t"
+                            mapping_str << 'SEQ_ACCESSION_SOURCE:UniProtKB'
+                            mapping_str << "\t"
+                            mapping_str << 'NODE_NAME:'
+                            mapping_str << name
+                            f.print( mapping_str )
+                            f.print( "\n" )
+                            name = msa.get_sequence( i ).set_name( i.to_s )
+                        end
+                        f.close
+
+                        io = MsaIO.new()
+                        w = MsaWriter
+                        w = PhylipSequentialWriter.new()
+                        w.clean( true )
+                        w.set_max_name_length( 10 )
+                        if File.exists?( output_dir + ALN_NAME )
+                            File.unlink( output_dir + ALN_NAME )
+                        end
+                        io.write_to_file( msa, output_dir + ALN_NAME, w )
+
+                        tp = TreePuzzle.new()
+                        tp.run( output_dir + ALN_NAME,
+                            MODEL,
+                            RATES,
+                            msa.get_number_of_seqs )
+
+                        File.rename( output_dir + ALN_NAME, output_dir  + File.basename( pfam_aln_file ) + ".aln" )
+
+                        fastme = FastMe.new()
+                        fastme.run( TREE_PUZZLE_OUTDIST, 0, FASTME_INITIAL_TREE )
+
+                        pfam_acc = nil
+                        pfam_de = nil
+                        File.open( pfam_aln_file ) do |file|
+                            while line = file.gets
+                                line = ic.iconv( line )
+                                if line =~ /^#=AC\s+(.+)/
+                                    pfam_acc = $1
+                                end
+                                if line =~ /^#=DE\s+(.+)/
+                                    pfam_de = $1
+                                end
+                                if pfam_acc && pfam_de
+                                    break
+                                end
+                            end
+                        end
+                        if !pfam_acc || !pfam_de
+                            Util.fatal_error( PRG_NAME, "problem with " + pfam_aln_file.to_s, STDOUT )
+                        end
+
+                        puzzle_model = nil
+                        File.open( TREE_PUZZLE_OUTFILE ) do |file|
+                            while line = file.gets
+                                line = ic.iconv( line )
+                                if line =~ /^Model\s+of\s+substitution:\s+(.+)/
+                                    puzzle_model = $1
+                                    break
+                                end
+                            end
+                        end
+                        if !puzzle_model
+                            Util.fatal_error( PRG_NAME, "problem with puzzle outfile: " + TREE_PUZZLE_OUTFILE.to_s, STDOUT )
+                        end
+
+                        desc = pfam_de
+                        desc << ' | '
+                        desc << 'ML pwd estimation by TREE-PUZZLE version '
+                        desc << TreePuzzle::VERSION
+                        desc << ', model: '
+                        desc << puzzle_model
+                        desc << ', rates: '
+                        desc << RATES.to_s
+                        desc << '; tree estimation by FastME version '
+                        desc << FastMe::VERSION
+                        desc << ', initial tree: '
+                        desc << FASTME_INITIAL_TREE.to_s
+                        desc << '; aln length: '
+                        desc << msa.get_length.to_s
+
+                        cmd = decorator + " -table -p -pn=\"" + pfam_aln_file +
+                         "\" -pi=pfam:" + pfam_acc +
+                         " -pd=\"" + desc + "\" " +
+                         FASTME_OUTTREE + ' ' +
+                         map_file + ' ' + tree_out_file
+
+                        IO.popen( cmd , 'r+' ) do | pipe |
+                            pipe.close_write
+                        end
+                        analyzed += 1
+
+                        File.unlink( map_file )
+                        File.unlink(TREE_PUZZLE_OUTDIST)
+                        File.unlink( TREE_PUZZLE_OUTFILE )
+                        File.unlink( FASTME_OUTPUT_D )
+                    end
+                }
+            rescue ArgumentError, IOError, StandardError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+            end
+
+            puts()
+            puts( 'Skipped:' )
+            puts()
+            for i in 0 ... skipped.size
+                puts i.to_s + ": " + skipped[ i ]
+            end
+
+            puts()
+            puts( 'Skipped : ' + skipped.size.to_s + ' alignments' )
+            puts( 'Analyzed: ' +  analyzed.to_s    + ' alignments' )
+
+            puts( 'Min gap ratio for col del  : ' + GAP_RATIO.to_s )
+            puts( 'Min gap ratio for seq del  : ' + GAP_RATIO_FOR_SEQS.to_s )
+            puts( 'Minimal aln length         : ' + MIN_LENGTH.to_s )
+            puts( 'Minimal number of sequences: ' + MIN_SEQS.to_s )
+            puts( 'Maximal number of sequences: ' + MAX_SEQS.to_s )
+            puts( 'Maximal aln file size      : ' + MAX_ALN_FILE_SIZE.to_s )
+            puts( 'Model              : ' + MODEL.to_s )
+            puts( 'FastME initial tree: ' + FASTME_INITIAL_TREE.to_s )
+
+            puts()
+            puts( '[' + PRG_NAME + '] > OK' )
+            puts()
+
+        end  # run
+
+        private
+
+        def help
+            puts( "Usage:" )
+            puts()
+            puts( "  " + PRG_NAME + ".rb <output dir> " )
+            puts()
+        end
+
+
+    end # class EvoNursery
+
+end # module Evoruby
\ No newline at end of file
diff --git a/forester/ruby/evoruby/lib/evo/tool/fasta_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/fasta_extractor.rb
new file mode 100644 (file)
index 0000000..b2d0d5c
--- /dev/null
@@ -0,0 +1,146 @@
+#
+# = lib/evo/apps/fasta_extractor.rb - FastaExtractor class
+#
+# Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: fasta_extractor.rb,v 1.2 2010/12/13 19:00:11 cmzmasek Exp $
+
+
+require 'lib/evo/util/util'
+require 'lib/evo/util/constants'
+require 'lib/evo/util/command_line_arguments'
+
+
+module Evoruby
+
+    class FastaExtractor
+
+        PRG_NAME                           = "fae"
+        PRG_VERSION                        = "1.0.0"
+        PRG_DESC                           = "extraction of nucleotide sequences from a fasta file by names from wublast search"
+        PRG_DATE                           = "2008.08.09"
+        COPYRIGHT                          = "2008-2009 Christian M Zmasek"
+        CONTACT                            = "phylosoft@gmail.com"
+        WWW                                = "www.phylosoft.org"
+        HELP_OPTION_1                      = 'help'
+        HELP_OPTION_2                      = 'h'
+
+
+        def run()
+
+            Util.print_program_information( PRG_NAME,
+                PRG_VERSION,
+                PRG_DESC ,
+                PRG_DATE,
+                COPYRIGHT,
+                CONTACT,
+                WWW,
+                STDOUT )
+
+            ld = Constants::LINE_DELIMITER
+
+            begin
+                cla = CommandLineArguments.new( ARGV )
+            rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+
+            if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+                     cla.is_option_set?( HELP_OPTION_2 ) )
+                print_help
+                exit( 0 )
+            end
+
+            if ( cla.get_number_of_files != 3 )
+                print_help
+                exit( -1 )
+            end
+
+            allowed_opts = Array.new
+
+            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+            if ( disallowed.length > 0 )
+                Util.fatal_error( PRG_NAME,
+                    "unknown option(s): " + disallowed,
+                    STDOUT )
+            end
+
+            input_file  = cla.get_file_name( 0 )
+            names_file  = cla.get_file_name( 1 )
+            output_file = cla.get_file_name( 2 )
+
+            if  !File.exist?( input_file )
+                Util.fatal_error( PRG_NAME, "error: input file [#{input_file}] does not exist" )
+            end
+            if  !File.exist?( names_file )
+                Util.fatal_error( PRG_NAME, "error: names file [#{names_file}] does not exist" )
+            end
+            if File.exist?( output_file   )
+                Util.fatal_error( PRG_NAME, "error: [#{output_file }] already exists" )
+            end
+
+            names = extract_names_with_frames( names_file )
+
+            extract_sequences( names, input_file, output_file )
+
+            puts
+            Util.print_message( PRG_NAME, "OK" )
+            puts
+
+        end
+
+
+        def extract_names_with_frames( names_file )
+            names = Hash.new()
+            File.open( names_file ) do | file |
+                while line = file.gets
+                    if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
+                        if ( line =~ /(\S+)\s+([+|-]\d)\s+\d+\s+(\S+)/ )
+                            name  = $1
+                            frame = $2
+                            e     = $3
+                            names[ name ] =  "[" + frame + "] [" + e + "]"
+                        end
+                    end
+                end
+            end
+            names
+        end
+
+        def extract_sequences( names, fasta_file, output_file )
+            output = File.open( output_file, "a" )
+            matching_state = false
+            counter = 0
+            File.open( fasta_file ) do | file |
+                while line = file.gets
+                    if !Util.is_string_empty?( line )
+                        if ( line =~ /\s*>\s*(.+)/ )
+                            name = $1
+                            if names.has_key?( name )
+                                matching_state = true
+                                counter += 1
+                                puts counter.to_s + ". " +name + " " + names[ name ]
+                                output.print( ">" + name + " " + names[ name ] )
+                                output.print( Evoruby::Constants::LINE_DELIMITER )
+                            else
+                                matching_state = false
+                            end
+                        elsif matching_state
+                            output.print( line )
+                        end
+                    end
+                end
+            end
+            output.close()
+        end
+
+        def print_help()
+            puts( "Usage:" )
+            puts()
+            puts( "  " + PRG_NAME + ".rb <input fasta file> <names file based on blast output> <output file>" )
+            puts()
+        end
+
+    end # class FastaExtractor
+end
\ No newline at end of file
diff --git a/forester/ruby/evoruby/lib/evo/tool/fasta_taxonomy_processor.rb b/forester/ruby/evoruby/lib/evo/tool/fasta_taxonomy_processor.rb
new file mode 100644 (file)
index 0000000..6ae3cf1
--- /dev/null
@@ -0,0 +1,205 @@
+#
+# = lib/evo/apps/fasta_taxonomy_processor - FastaTaxonomyProcessor class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: fasta_taxonomy_processor.rb,v 1.4 2010/12/13 19:00:11 cmzmasek Exp $
+
+
+require 'lib/evo/util/util'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/msa/msa'
+require 'lib/evo/io/msa_io'
+require 'lib/evo/io/parser/sp_taxonomy_parser'
+require 'lib/evo/io/parser/fasta_parser'
+require 'lib/evo/io/writer/fasta_writer'
+require 'lib/evo/io/writer/phylip_sequential_writer'
+require 'lib/evo/util/command_line_arguments'
+require 'lib/evo/apps/tseq_taxonomy_processor'
+
+module Evoruby
+
+    class FastaTaxonomyProcessor
+
+        PRG_NAME       = "fasta_tap"
+        PRG_DATE       = "2009.01.20"
+        PRG_DESC       = "preprocessing of multiple sequence files in ncbi fasta format"
+        PRG_VERSION    = "1.00"
+        COPYRIGHT      = "2009 Christian M Zmasek"
+        CONTACT        = "phylosoft@gmail.com"
+        WWW            = "www.phylosoft.org"
+
+        def initialize()
+            @tax_ids_to_sp_taxonomies = Hash.new()
+        end
+
+        def run()
+
+            Util.print_program_information( PRG_NAME,
+                PRG_VERSION,
+                PRG_DESC,
+                PRG_DATE,
+                COPYRIGHT,
+                CONTACT,
+                WWW,
+                STDOUT )
+
+            if  ARGV == nil || ARGV.length != 4
+                puts( "Usage: #{PRG_NAME}.rb <sp taxonomy file> <sequences in ncbi fasta format> <name for fasta outfile> <name for map outfile>" )
+                puts()
+                exit( -1 )
+            end
+
+            begin
+                cla = CommandLineArguments.new( ARGV )
+            rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+            allowed_opts = Array.new
+            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+            if ( disallowed.length > 0 )
+                Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
+            end
+
+            sp_taxonomy_infile = cla.get_file_name( 0 )
+            sequences_infile = cla.get_file_name( 1 )
+            sequences_outfile = cla.get_file_name( 2 )
+            mapping_outfile = cla.get_file_name( 3 )
+
+            Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile )
+            Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile )
+            Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile )
+            Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile )
+
+            sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile )
+
+            Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile )
+
+            fasta_parser = FastaParser.new
+            msa_fac = MsaFactory.new
+
+            seqs = msa_fac.create_msa_from_file( sequences_infile, fasta_parser )
+
+            Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile )
+
+            removed = seqs.remove_redundant_sequences!( true, true )
+
+            if removed.size > 0
+                Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
+                removed.each { | seq_name |
+                    puts seq_name
+                }
+                Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" )
+            end
+
+            mapping_out = File.open( mapping_outfile, "a" )
+
+            for i in 0 ... seqs.get_number_of_seqs
+                seq = seqs.get_sequence( i )
+                seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) )
+            end
+
+            io = MsaIO.new()
+
+            w = FastaWriter.new()
+
+            w.set_max_name_length( 10 )
+            w.clean( true )
+            begin
+                io.write_to_file( seqs, sequences_outfile, w )
+            rescue Exception => e
+                Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
+            end
+            mapping_out.close()
+
+            Util.print_message( PRG_NAME, "wrote: " + mapping_outfile )
+            Util.print_message( PRG_NAME, "wrote: " + sequences_outfile )
+            Util.print_message( PRG_NAME, "OK" )
+
+        end
+
+        private
+
+        def modify_name( seq, i, sp_taxonomies, mapping_outfile )
+
+            #i = i + 1792
+            
+            seq_desc = seq.get_name
+
+            taxonomy_sn = nil
+
+            if seq_desc =~ /\[(.+)\]/
+                taxonomy_sn = $1
+            else
+                Util.fatal_error( PRG_NAME, "no taxonomy in [" + seq_desc + "]"  )
+            end
+
+            matching_sp_taxonomy = nil
+
+            sp_taxonomies.each { |sp_taxonomy|
+                if ( sp_taxonomy.scientific_name == taxonomy_sn )
+                    matching_sp_taxonomy = sp_taxonomy
+                end
+            }
+
+            if  matching_sp_taxonomy == nil
+                Util.fatal_error( PRG_NAME, "taxonomy [" + taxonomy_sn + "] for [" + seq_desc + "] not found" )
+            end
+
+            new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code
+
+            gi = nil
+            if seq_desc =~ /gi\|(.+?)\|/
+                gi = $1
+            else
+              Util.fatal_error( PRG_NAME, "no gi in [" + seq_desc + "]"  )
+            end
+
+            seq_name = ""
+
+            if seq_desc =~ /\|\s*([^|]+?)\s*\[/
+                seq_name = $1
+            end
+
+            if  seq_name =~ /\[.+\]$/
+                # Redundant taxonomy information hides here.
+                seq_name = seq_name.sub(/\[.+\]$/, '')
+            end
+            if  seq_name =~ /^\s*hypothetical\s+protein\s*/i
+                # Pointless information.
+                seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' )
+            end
+            if  seq_name =~ /^\s*conserved\s+hypothetical\s+protein\s*/i
+                # Pointless information.
+                seq_name = seq_name.sub( /^\s*conserved\s+hypothetical\s+protein\s*/i, '' )
+            end
+
+            if gi != nil
+            mapping_outfile.print( new_name + "\t" +
+                 TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
+                 TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" +
+                 TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" +
+                 TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
+                 TseqTaxonomyProcessor::SEQ_ACCESSION + gi.to_s + "\t" +
+                 TseqTaxonomyProcessor::SEQ_ACCESSION_SOURCE + "gi" + "\t" +
+                 TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" +
+                 TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string +
+                 Constants::LINE_DELIMITER )
+            else
+                 mapping_outfile.print( new_name + "\t" +
+                 TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
+                 TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" +
+                 TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" +
+                 TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
+                 TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" +
+                 TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string +
+                 Constants::LINE_DELIMITER )
+                
+            end    
+            new_name
+        end
+
+    end
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/tool/hmmscan_parser.rb b/forester/ruby/evoruby/lib/evo/tool/hmmscan_parser.rb
new file mode 100644 (file)
index 0000000..c2f8177
--- /dev/null
@@ -0,0 +1,265 @@
+#
+# = lib/evo/apps/hmmscan_parser.rb - HmmscanParser class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
+#
+# last modified: 11/24/2009
+
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/util/command_line_arguments'
+
+module Evoruby
+
+    class HmmscanParser
+
+        PRG_NAME       = "hsp"
+        PRG_VERSION    = "1.0.1"
+        PRG_DESC       = "hmmscan parser"
+        PRG_DATE       = "2009.11.24"
+        COPYRIGHT      = "2009 Christian M Zmasek"
+        CONTACT        = "phylosoft@gmail.com"
+        WWW            = "www.phylosoft.org"
+
+        DELIMITER_OPTION              = "d"
+        E_VALUE_THRESHOLD_OPTION      = "e"
+        IGNORE_DUF_OPTION             = "i"
+        PARSE_OUT_DESCRIPITION_OPTION = "a"
+        HELP_OPTION_1                 = "help"
+        HELP_OPTION_2                 = "h"
+
+        def initialize
+            @domain_counts = Hash.new
+        end
+
+        # raises ArgumentError, IOError
+        def parse( inpath,
+                outpath,
+                column_delimiter,
+                e_value_threshold,
+                ignore_dufs,
+                get_descriptions )
+            Util.check_file_for_readability( inpath )
+            Util.check_file_for_writability( outpath )
+
+            outfile = File.open( outpath, "a" )
+
+            query    = String.new
+            desc     = String.new
+            model    = String.new
+            env_from = String.new
+            env_to   = String.new
+            i_e_value  = String.new
+
+            queries_count = 0
+
+            nl = Constants::LINE_DELIMITER
+
+            File.open( inpath ) do | file |
+                while line = file.gets
+                    if !HmmscanParser.is_ignorable?( line ) && line =~ /^\S+\s+\S/
+
+                        #         tn      acc     tlen    query   acc     qlen    Evalue  score   bias    #       of      c-E     i-E     score   bias    hf      ht      af      at      ef      et      acc     desc
+                        #         1       2       3       4       5       6       7       8       9       10      11      12      13      14      15      16      17      18      19      20      21      22      23
+                        line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/
+
+                        model     = $1
+                        query     = $4
+                        i_e_value = $13.to_f
+                        env_from  = $20.to_i
+                        env_to    = $21.to_i
+
+                        if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
+                                 ( !ignore_dufs || ( model !~ /^DUF\d+/ ) ) )
+                            count_model( model )
+                            outfile.print( query +
+                                 column_delimiter )
+                            if ( get_descriptions )
+                                outfile.print( desc +
+                                     column_delimiter )
+                            end
+                            outfile.print( model +
+                                 column_delimiter +
+                                 env_from.to_s +
+                                 column_delimiter +
+                                 env_to.to_s +
+                                 column_delimiter +
+                                 i_e_value.to_s )
+                            outfile.print( nl )
+                        end
+                    end
+                end # while line = file.gets
+            end
+            outfile.flush()
+            outfile.close()
+
+            return queries_count
+
+        end # def parse
+
+        def count_model( model )
+            if ( @domain_counts.has_key?( model ) )
+                count = @domain_counts[ model ].to_i
+                count += 1
+                @domain_counts[ model ] = count
+            else
+                @domain_counts[ model ] = 1
+            end
+        end
+
+
+        def get_domain_counts()
+            return @domain_counts
+        end
+
+        def run()
+
+            Util.print_program_information( PRG_NAME,
+                PRG_VERSION,
+                PRG_DESC,
+                PRG_DATE,
+                COPYRIGHT,
+                CONTACT,
+                WWW,
+                STDOUT )
+
+            begin
+                cla = CommandLineArguments.new( ARGV )
+            rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+            end
+
+            if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+                     cla.is_option_set?( HELP_OPTION_2 ) )
+                print_help
+                exit( 0 )
+            end
+
+            if ( cla.get_number_of_files != 2 )
+                print_help
+                exit( -1 )
+            end
+
+            allowed_opts = Array.new
+            allowed_opts.push( DELIMITER_OPTION )
+            allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
+            allowed_opts.push( IGNORE_DUF_OPTION )
+            allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
+
+            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+            if ( disallowed.length > 0 )
+                Util.fatal_error( PRG_NAME,
+                    "unknown option(s): " + disallowed,
+                    STDOUT )
+            end
+
+            inpath = cla.get_file_name( 0 )
+            outpath = cla.get_file_name( 1 )
+
+            column_delimiter = "\t"
+            if ( cla.is_option_set?( DELIMITER_OPTION ) )
+                begin
+                    column_delimiter = cla.get_option_value( DELIMITER_OPTION )
+                rescue ArgumentError => e
+                    Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+                end
+            end
+
+            e_value_threshold = -1.0
+            if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
+                begin
+                    e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
+                rescue ArgumentError => e
+                    Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+                end
+                if ( e_value_threshold < 0.0 )
+                    Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
+                end
+            end
+
+            ignore_dufs = false
+            if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
+                ignore_dufs = true
+            end
+
+            parse_descriptions = false
+            if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
+                parse_descriptions = true
+            end
+
+            puts()
+            puts( "hmmpfam outputfile: " + inpath )
+            puts( "outputfile        : " + outpath )
+            if ( e_value_threshold >= 0.0 )
+                puts( "E-value threshold : " + e_value_threshold.to_s )
+            else
+                puts( "E-value threshold : no threshold" )
+            end
+            if ( parse_descriptions )
+                puts( "parse descriptions: true" )
+            else
+                puts( "parse descriptions: false" )
+            end
+            if ( ignore_dufs )
+                puts( "ignore DUFs       : true" )
+            else
+                puts( "ignore DUFs       : false" )
+            end
+            if ( column_delimiter == "\t" )
+                puts( "column delimiter  : TAB" )
+            else
+                puts( "column delimiter  : " + column_delimiter )
+            end
+            puts()
+
+            begin
+                queries_count = parse( inpath,
+                    outpath,
+                    column_delimiter,
+                    e_value_threshold,
+                    ignore_dufs,
+                    parse_descriptions )
+            rescue ArgumentError, IOError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+            end
+            domain_counts = get_domain_counts()
+
+            puts
+            puts( "read output for a total of " + queries_count.to_s + " query sequences" )
+            puts
+            puts( "domain counts (considering potential E-value threshold and ignoring of DUFs):" )
+            puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
+            puts
+            puts( Util.draw_histogram( domain_counts, "#" ) )
+            puts
+            Util.print_message( PRG_NAME, 'OK' )
+            puts
+
+        end # def run()
+
+        def print_help()
+            puts( "Usage:" )
+            puts()
+            puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
+            puts()
+            puts( "  options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
+            puts( "           -" + E_VALUE_THRESHOLD_OPTION  + ": E-value threshold, default is no threshold" )
+            puts( "           -" + PARSE_OUT_DESCRIPITION_OPTION  + ": parse query description (in addition to query name)" )
+            puts( "           -" + IGNORE_DUF_OPTION  + ": ignore DUFs" )
+            puts()
+        end
+
+
+        private
+
+
+        def HmmscanParser.is_ignorable?( line )
+            return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
+        end
+
+    end # class HmmscanParser
+
+end # module Evoruby
\ No newline at end of file
diff --git a/forester/ruby/evoruby/lib/evo/tool/msa_processor.rb b/forester/ruby/evoruby/lib/evo/tool/msa_processor.rb
new file mode 100644 (file)
index 0000000..6299417
--- /dev/null
@@ -0,0 +1,848 @@
+#
+# = lib/evo/apps/msa_processor.rb - MsaProcessor class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: msa_processor.rb,v 1.33 2010/12/13 19:00:10 cmzmasek Exp $
+#
+
+require 'date'
+require 'set'
+
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/util/command_line_arguments'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/io/msa_io'
+require 'lib/evo/io/writer/phylip_sequential_writer'
+require 'lib/evo/io/writer/nexus_writer'
+require 'lib/evo/io/writer/fasta_writer'
+require 'lib/evo/io/parser/fasta_parser'
+require 'lib/evo/io/parser/general_msa_parser'
+require 'lib/evo/io/writer/msa_writer'
+
+module Evoruby
+
+  class MsaProcessor
+
+    PRG_NAME       = "msa_pro"
+    PRG_DATE       = "2012.05.11"
+    PRG_DESC       = "processing of multiple sequence alignments"
+    PRG_VERSION    = "1.06"
+    COPYRIGHT      = "2008-2010 Christian M Zmasek"
+    CONTACT        = "phylosoft@gmail.com"
+    WWW            = "www.phylosoft.org"
+
+
+    NAME_LENGTH_DEFAULT                = 10
+    WIDTH_DEFAULT_FASTA                = 60
+    INPUT_TYPE_OPTION                  = "i"
+    OUTPUT_TYPE_OPTION                 = "o"
+    MAXIMAL_NAME_LENGTH_OPTION         = "n"
+    WIDTH_OPTION                       = "w"
+    CLEAN_UP_SEQ_OPTION                = "c"
+    REM_RED_OPTION                     = "rem_red"
+    REMOVE_GAP_COLUMNS_OPTION          = "rgc"
+    REMOVE_GAP_ONLY_COLUMNS            = "rgoc"
+    REMOVE_COLUMNS_GAP_RATIO_OPTION    = "rr"
+    REMOVE_ALL_GAP_CHARACTERS_OPTION   = "rg"
+    REMOVE_ALL_SEQUENCES_LISTED_OPTION = "r"
+    KEEP_ONLY_SEQUENCES_LISTED_OPTION  = "k"
+
+    KEEP_MATCHING_SEQUENCES_OPTION     = "mk"
+    REMOVE_MATCHING_SEQUENCES_OPTION   = "mr"
+
+    TRIM_OPTION                        = "t"
+    REMOVE_SEQS_GAP_RATIO_OPTION       = "rsgr"
+    REMOVE_SEQS_NON_GAP_LENGTH_OPTION  = "rsl"
+    SPLIT                              = "split"
+    LOG_SUFFIX                         = "_msa_pro.log"
+    HELP_OPTION_1                      = "help"
+    HELP_OPTION_2                      = "h"
+
+
+    def initialize()
+      @input_format_set = false
+      @output_format_set = false
+      @fasta_input      = false
+      @phylip_input     = true
+      @name_length      = NAME_LENGTH_DEFAULT
+      @name_length_set  = false
+      @width            = WIDTH_DEFAULT_FASTA     # fasta only
+      @pi_output        = true
+      @fasta_output     = false
+      @nexus_output     = false
+      @clean            = false  # phylip only
+      @rgc              = false
+      @rgoc             = false
+      @rg               = false  # fasta only
+      @rem_red          = false
+      @rgr              = -1
+      @rsgr             = -1
+      @rsl              = -1
+      @remove_matching  = nil
+      @keep_matching    = nil
+
+      @seqs_name_file   = nil
+      @remove_seqs      = false
+      @keep_seqs        = false
+      @trim             = false
+      @split            = -1
+      @first            = -1
+      @last             = -1
+    end
+
+
+    def run()
+
+      Util.print_program_information( PRG_NAME,
+        PRG_VERSION,
+        PRG_DESC,
+        PRG_DATE,
+        COPYRIGHT,
+        CONTACT,
+        WWW,
+        STDOUT )
+
+      if ( ARGV == nil || ARGV.length < 1 )
+        Util.print_message( PRG_NAME, "Illegal number of arguments" )
+        print_help
+        exit( -1 )
+      end
+
+      begin
+        cla = CommandLineArguments.new( ARGV )
+      rescue ArgumentError => e
+        Util.fatal_error( PRG_NAME, "Error: " + e.to_s, STDOUT )
+      end
+
+      if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+           cla.is_option_set?( HELP_OPTION_2 ) )
+        print_help
+        exit( 0 )
+      end
+
+      if ( cla.get_number_of_files != 2 || ARGV.length < 2 )
+        Util.print_message( PRG_NAME, "Illegal number of arguments" )
+        print_help
+        exit( -1 )
+      end
+
+      allowed_opts = Array.new
+      allowed_opts.push( INPUT_TYPE_OPTION )
+      allowed_opts.push( OUTPUT_TYPE_OPTION )
+      allowed_opts.push( MAXIMAL_NAME_LENGTH_OPTION )
+      allowed_opts.push( WIDTH_OPTION )
+      allowed_opts.push( CLEAN_UP_SEQ_OPTION )
+      allowed_opts.push( REMOVE_GAP_COLUMNS_OPTION )
+      allowed_opts.push( REMOVE_GAP_ONLY_COLUMNS )
+      allowed_opts.push( REMOVE_COLUMNS_GAP_RATIO_OPTION )
+      allowed_opts.push( REMOVE_ALL_GAP_CHARACTERS_OPTION )
+      allowed_opts.push( REMOVE_ALL_SEQUENCES_LISTED_OPTION )
+      allowed_opts.push( KEEP_ONLY_SEQUENCES_LISTED_OPTION )
+      allowed_opts.push( TRIM_OPTION )
+      allowed_opts.push( REMOVE_SEQS_GAP_RATIO_OPTION )
+      allowed_opts.push( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
+      allowed_opts.push( SPLIT )
+      allowed_opts.push( REM_RED_OPTION )
+      allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION )
+      allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION )
+
+      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+      if ( disallowed.length > 0 )
+        Util.fatal_error( PRG_NAME,
+          "unknown option(s): " + disallowed )
+      end
+
+      input = cla.get_file_name( 0 )
+      output = cla.get_file_name( 1 )
+
+      analyze_command_line( cla )
+
+      begin
+        Util.check_file_for_readability( input )
+      rescue ArgumentError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+      begin
+        Util.check_file_for_writability( output )
+      rescue ArgumentError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+      if ( @rg )
+        set_pi_output( false )
+        set_fasta_output( true )
+        set_nexus_output( false )
+      end
+
+      if ( !@input_format_set )
+        fasta_like = false
+        begin
+          fasta_like = Util.looks_like_fasta?( input )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+        end
+        @fasta_input = fasta_like
+        @phylip_input = !fasta_like
+        if ( !@output_format_set )
+          @fasta_output = fasta_like
+          @pi_output = !fasta_like
+          @nexus_output = false
+        end
+      end
+
+      ld = Constants::LINE_DELIMITER
+      log = PRG_NAME + " " + PRG_VERSION + " [" + PRG_DATE + "]" + " LOG" + ld
+      now = DateTime.now
+      log << "Date/time: " + now.to_s + ld
+
+      puts()
+      puts( "Input alignment  : " + input )
+      log << "Input alignment  : " + input + ld
+      puts( "Output alignment : " + output )
+      log << "Output alignment : " + output + ld
+      if ( @phylip_input )
+        puts( "Input is         : Phylip, or something like it" )
+        log << "Input is         : Phylip, or something like it" + ld
+      elsif ( @fasta_input )
+        puts( "Input is         : Fasta" )
+        log << "Input is         : Fasta" + ld
+      end
+      if( @rgr >= 0 )
+        puts( "Max col gap ratio: " + @rgr.to_s )
+        log << "Max col gap ratio: " + @rgr.to_s + ld
+      elsif ( @rgc )
+        puts( "Remove gap colums" )
+        log << "Remove gap colums" + ld
+      elsif( @rgoc )
+        puts( "Remove gap only colums" )
+        log << "Remove gap only colums" + ld
+      end
+      if ( @clean )
+        puts( "Clean up         : true" )
+        log << "Clean up         : true" + ld
+      end
+
+      if ( @pi_output )
+        puts( "Output is        : Phylip interleaved" )
+        log << "Output is        : Phylip interleaved" + ld
+      elsif ( @fasta_output )
+        puts( "Output is        : Fasta" )
+        log << "Output is        : Fasta" + ld
+        if ( @width )
+          puts( "Width            : " + @width.to_s )
+          log << "Width            : " + @width.to_s + ld
+        end
+        if ( @rg )
+          puts( "Remove all gap characters (alignment is destroyed)" )
+          log << "Remove all gap characters (alignment is destroyed)" + ld
+        end
+      elsif ( @nexus_output )
+        puts( "Output is        : Nexus" )
+        log << "Output is        : Nexus" + ld
+      end
+      if ( @name_length_set || !@fasta_output )
+        puts( "Max name length  : " + @name_length.to_s )
+        log << "Max name length  : " + @name_length.to_s + ld
+      end
+      if( @rsgr >= 0 )
+        puts( "Remove sequences for which the gap ratio > " + @rsgr.to_s )
+        log << "Remove sequences for which the gap ratio > " + @rsgr.to_s + ld
+      end
+      if( @rsl >= 0 )
+        puts( "Remove sequences with less than "  + @rsl.to_s + " non-gap characters" )
+        log << "Remove sequences with less than "  + @rsl.to_s + " non-gap characters" + ld
+      end
+      if ( @remove_seqs )
+        puts( "Remove sequences listed in: " + @seqs_name_file )
+        log << "Remove sequences listed in: " + @seqs_name_file + ld
+      elsif ( @keep_seqs )
+        puts( "Keep only sequences listed in: " + @seqs_name_file )
+        log << "Keep only sequences listed in: " + @seqs_name_file + ld
+      end
+      if ( @trim )
+        puts( "Keep only columns from: "+ @first.to_s + " to " + @last.to_s )
+        log << "Keep only columns from: "+ @first.to_s + " to " + @last.to_s + ld
+      end
+      if ( @rem_red )
+        puts( "Remove redundant sequences: true" )
+        log << "Remove redundant sequences: true" + ld
+      end
+      if ( @split > 0 )
+        puts( "Split            : " + @split.to_s )
+        log << "Split            : " + @split.to_s + ld
+      end
+      puts()
+
+      f = MsaFactory.new()
+
+      msa = nil
+
+      begin
+        if ( @phylip_input )
+          msa = f.create_msa_from_file( input, GeneralMsaParser.new() )
+        elsif ( @fasta_input )
+          msa = f.create_msa_from_file( input, FastaParser.new() )
+        end
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+      end
+
+      if ( msa.is_aligned() )
+        Util.print_message( PRG_NAME, "Length of original alignment         : " + msa.get_length.to_s )
+        log << "Length of original alignment         : " + msa.get_length.to_s + ld
+        gp = msa.calculate_gap_proportion
+        Util.print_message( PRG_NAME, "Gap-proportion of original alignment : " + gp.to_s )
+        log << "Gap-proportion of original alignment : " +  gp.to_s + ld
+      else
+        Util.print_message( PRG_NAME, "the input is not aligned" )
+        log << "The input is not aligned" + ld
+      end
+
+      all_names = Set.new()
+      for i in 0 ... msa.get_number_of_seqs()
+        current_name = msa.get_sequence( i ).get_name
+        if all_names.include?( current_name )
+          Util.print_warning_message( PRG_NAME, "sequence name [" + current_name + "] is not unique" )
+        else
+          all_names.add( current_name )
+        end
+      end
+
+      begin
+
+        if ( @remove_seqs || @keep_seqs )
+          names = Util.file2array( @seqs_name_file, true )
+          if ( names == nil ||  names.length() < 1 )
+            error_msg = "file \"" + @seqs_name_file.to_s + "\" appears empty"
+            Util.fatal_error( PRG_NAME, error_msg )
+          end
+
+          if ( @remove_seqs )
+            c = 0
+            for i in 0 ... names.length()
+              to_delete = msa.find_by_name( names[ i ], true, false )
+              if ( to_delete.length() < 1 )
+                error_msg = "sequence name \"" + names[ i ] + "\" not found"
+                Util.fatal_error( PRG_NAME, error_msg )
+              elsif ( to_delete.length() > 1 )
+                error_msg = "sequence name \"" + names[ i ] + "\" is not unique"
+                Util.fatal_error( PRG_NAME, error_msg )
+              else
+                msa.remove_sequence!( to_delete[ 0 ] )
+                c += 1
+              end
+            end
+            Util.print_message( PRG_NAME, "Removed " + c.to_s + " sequences" )
+            log <<  "Removed " + c.to_s + " sequences" + ld
+          elsif ( @keep_seqs )
+            msa_new = Msa.new()
+            r = 0
+            k = 0
+            for j in 0 ... msa.get_number_of_seqs()
+              if ( names.include?( msa.get_sequence( j ).get_name() ) )
+                msa_new.add_sequence( msa.get_sequence( j ) )
+                k += 1
+              else
+                r += 1
+              end
+            end
+            msa = msa_new
+            Util.print_message( PRG_NAME, "Kept    " + k.to_s + " sequences" )
+            log << "Kept    " + k.to_s + " sequences" + ld
+            Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" )
+            log << "removed " + r.to_s + " sequences" + ld
+          end
+        end
+
+        if ( @trim )
+          msa.trim!( @first, @last )
+        end
+        if( @rgr >= 0 )
+          msa.remove_gap_columns_w_gap_ratio!( @rgr )
+        elsif ( @rgc )
+          msa.remove_gap_columns!()
+        elsif( @rgoc )
+          msa.remove_gap_only_columns!()
+        end
+        if( @rsgr >= 0 )
+          n = msa.get_number_of_seqs()
+          removed = msa.remove_sequences_by_gap_ratio!( @rsgr )
+          k = msa.get_number_of_seqs()
+          r = n - k
+          Util.print_message( PRG_NAME, "Kept    " + k.to_s + " sequences" )
+          log << "Kept    " + k.to_s + " sequences" + ld
+          Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences"  )
+          log << "Removed " + r.to_s + " sequences:" + ld
+          removed.each { | seq_name |
+            log << "         " + seq_name  + ld
+          }
+        end
+        if( @rsl >= 0 )
+          n = msa.get_number_of_seqs()
+          removed = msa.remove_sequences_by_non_gap_length!( @rsl )
+          k = msa.get_number_of_seqs()
+          r = n - k
+          Util.print_message( PRG_NAME, "Kept    " + k.to_s + " sequences" )
+          log << "Kept    " + k.to_s + " sequences" + ld
+          Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" )
+          log << "Removed " + r.to_s + " sequences:" + ld
+          removed.each { | seq_name |
+            log << "         " + seq_name  + ld
+          }
+        end
+        if ( @keep_matching )
+          n = msa.get_number_of_seqs
+          to_be_removed = Set.new
+          for ii in 0 ...  n
+            seq = msa.get_sequence( ii )
+            if !seq.get_name.downcase.index( @keep_matching.downcase )
+              to_be_removed.add( ii )
+            end
+          end
+          to_be_removed_ary = to_be_removed.to_a.sort.reverse
+          to_be_removed_ary.each { | index |
+            msa.remove_sequence!( index )
+          }
+          # msa = sort( msa )
+        end
+        if ( @remove_matching )
+          n = msa.get_number_of_seqs
+          to_be_removed = Set.new
+          for iii in 0 ... n
+
+            seq = msa.get_sequence( iii )
+
+            if seq.get_name.downcase.index( @remove_matching.downcase )
+              to_be_removed.add( iii )
+            end
+          end
+          to_be_removed_ary = to_be_removed.to_a.sort.reverse
+          to_be_removed_ary.each { | index |
+            msa.remove_sequence!( index )
+          }
+          msa = sort( msa )
+        end
+
+
+
+        if ( @split > 0 )
+          begin
+            msas = msa.split( @split, true )
+            io = MsaIO.new()
+            w = MsaWriter
+            if ( @pi_output )
+              w = PhylipSequentialWriter.new()
+              w.clean( @clean )
+              w.set_max_name_length( @name_length )
+            elsif( @fasta_output )
+              w = FastaWriter.new()
+              w.set_line_width( @width )
+              if ( @rg )
+                w.remove_gap_chars( true )
+                Util.print_warning_message( PRG_NAME, "removing gap character, the output is likely to become unaligned" )
+                log << "removing gap character, the output is likely to become unaligned" + ld
+              end
+              w.clean( @clean )
+              if ( @name_length_set )
+                w.set_max_name_length( @name_length )
+              end
+            elsif( @nexus_output )
+              w = NexusWriter.new()
+              w.clean( @clean )
+              w.set_max_name_length( @name_length )
+            end
+            i = 0
+            for m in msas
+              i = i + 1
+              io.write_to_file( m, output + "_" + i.to_s, w )
+            end
+            Util.print_message( PRG_NAME, "wrote " + msas.length.to_s + " files"  )
+            log << "wrote " + msas.length.to_s + " files" + ld
+          rescue Exception => e
+            Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+          end
+
+        end
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+      end
+
+      if ( @split <= 0 )
+
+        unless ( @rg )
+          if ( msa.is_aligned() )
+            Util.print_message( PRG_NAME, "Length of processed alignment        : " + msa.get_length.to_s )
+            log <<  "Length of processed alignment        : " + msa.get_length.to_s + ld
+            gp = msa.calculate_gap_proportion
+            Util.print_message( PRG_NAME, "Gap-proportion of processed alignment: " + gp.to_s )
+            log << "Gap-proportion of processed alignment: " +  gp.to_s + ld
+          else
+            Util.print_warning_message( PRG_NAME, "output is not aligned" )
+            log << "output is not aligned" + ld
+          end
+        end
+
+        if @rem_red
+          removed = msa.remove_redundant_sequences!( true, false )
+          if removed.size > 0
+            identicals = msa.get_identical_seqs_detected
+            log << "the following " + identicals.size.to_s + " sequences are identical:" + ld
+            identicals.each { | s |
+              log << s + ld
+            }
+            log << "ignoring the following " + removed.size.to_s + " redundant sequences:" + ld
+            removed.each { | seq_name |
+              log << seq_name + ld
+            }
+            Util.print_message( PRG_NAME, "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" )
+            log << "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" + ld
+          end
+        end
+
+        io = MsaIO.new()
+
+        w = MsaWriter
+
+        if ( @pi_output )
+          w = PhylipSequentialWriter.new()
+          w.clean( @clean )
+          w.set_max_name_length( @name_length )
+        elsif( @fasta_output )
+          w = FastaWriter.new()
+          w.set_line_width( @width )
+          if ( @rg )
+            w.remove_gap_chars( true )
+            Util.print_warning_message( PRG_NAME, "removing gap characters, the output is likely to become unaligned"  )
+            log << "removing gap character, the output is likely to become unaligned" + ld
+          end
+          w.clean( @clean )
+          if ( @name_length_set )
+            w.set_max_name_length( @name_length )
+          end
+        elsif( @nexus_output )
+          w = NexusWriter.new()
+          w.clean( @clean )
+          w.set_max_name_length( @name_length )
+        end
+
+
+        begin
+          io.write_to_file( msa, output, w )
+        rescue Exception => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+        end
+
+        begin
+          f = File.open( output + LOG_SUFFIX, 'a' )
+          f.print( log )
+          f.close
+        rescue Exception => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+        end
+
+
+      end
+      Util.print_message( PRG_NAME, "OK" )
+      puts
+    end
+
+
+    private
+
+    def sort( msa )
+      names = Set.new
+      for i in 0 ... msa.get_number_of_seqs
+        name = msa.get_sequence( i ).get_name
+        names.add( name )
+      end
+      sorted_ary = names.to_a.sort
+      new_msa = Msa.new
+      sorted_ary.each { | seq_name |
+        seq = msa.get_sequence( msa.find_by_name( seq_name, true, false )[ 0 ] )
+        new_msa.add_sequence( seq )
+      }
+      new_msa
+    end
+
+    def set_fasta_input( fi = true )
+      @fasta_input = fi
+      @input_format_set = true
+    end
+    def set_phylip_input( pi = true )
+      @phylip_input = pi
+      @input_format_set = true
+    end
+    def set_name_length( i )
+      @name_length = i
+      @name_length_set = true
+    end
+    def set_width( i )
+      @width = i
+    end
+    def set_fasta_output( fo = true )
+      @fasta_output = fo
+      @output_format_set = true
+    end
+    def set_pi_output( pso = true )
+      @pi_output = pso
+      @output_format_set = true
+    end
+    def set_nexus_output( nexus = true )
+      @nexus_output = nexus
+      @output_format_set = true
+    end
+    def set_clean( c = true )
+      @clean = c
+    end
+    def set_remove_gap_columns( rgc = true )
+      @rgc = rgc
+    end
+    def set_remove_gap_only_columns( rgoc = true )
+      @rgoc = rgoc
+    end
+    def set_remove_gaps( rg = true )
+      @rg = rg
+    end
+    def set_remove_gap_ratio( rgr )
+      @rgr = rgr
+    end
+    def set_remove_seqs_gap_ratio( rsgr )
+      @rsgr = rsgr
+    end
+    def set_remove_seqs_min_non_gap_length( rsl )
+      @rsl = rsl
+    end
+    def set_remove_seqs( file )
+      @seqs_name_file = file
+      @remove_seqs    = true
+      @keep_seqs      = false
+    end
+    def set_keep_seqs( file )
+      @seqs_name_file = file
+      @keep_seqs      = true
+      @remove_seqs    = false
+    end
+    def set_trim( first, last )
+      @trim            = true
+      @first           = first
+      @last            = last
+    end
+    def set_remove_matching( remove )
+      @remove_matching  = remove
+    end
+    def set_keep_matching( keep )
+      @keep_matching = keep
+    end
+    def set_rem_red( rr )
+      @rem_red = rr
+    end
+
+
+
+    def set_split( s )
+      if ( s > 0 )
+        @split            = s
+        @clean            = false  # phylip only
+        @rgc              = false
+        @rgoc             = false
+        @rg               = false  # fasta only
+        @rgr              = -1
+        @rsgr             = -1
+        @rsl              = -1
+        @seqs_name_file   = nil
+        @remove_seqs      = false
+        @keep_seqs        = false
+        @trim             = false
+        @first            = -1
+        @last             = -1
+      end
+    end
+    def analyze_command_line( cla )
+      if ( cla.is_option_set?( INPUT_TYPE_OPTION ) )
+        begin
+          type = cla.get_option_value( INPUT_TYPE_OPTION )
+          if ( type == "p" )
+            set_phylip_input( true )
+            set_fasta_input( false )
+          elsif ( type == "f" )
+            set_fasta_input( true )
+            set_phylip_input( false )
+          end
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( OUTPUT_TYPE_OPTION ) )
+        begin
+          type = cla.get_option_value( OUTPUT_TYPE_OPTION )
+          if ( type == "p" )
+            set_pi_output( true )
+            set_fasta_output( false )
+            set_nexus_output( false )
+          elsif ( type == "f" )
+            set_pi_output( false )
+            set_fasta_output( true )
+            set_nexus_output( false )
+          elsif ( type == "n" )
+            set_pi_output( false )
+            set_fasta_output( false )
+            set_nexus_output( true )
+          end
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( MAXIMAL_NAME_LENGTH_OPTION ) )
+        begin
+          l = cla.get_option_value_as_int( MAXIMAL_NAME_LENGTH_OPTION )
+          set_name_length( l )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( WIDTH_OPTION ) )
+        begin
+          w = cla.get_option_value_as_int( WIDTH_OPTION )
+          set_width( w )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( CLEAN_UP_SEQ_OPTION ) )
+        set_clean( true )
+      end
+      if ( cla.is_option_set?( REMOVE_GAP_COLUMNS_OPTION ) )
+        set_remove_gap_columns( true )
+      end
+      if ( cla.is_option_set?( REM_RED_OPTION ) )
+        set_rem_red( true )
+      end
+      if ( cla.is_option_set?( REMOVE_GAP_ONLY_COLUMNS ) )
+        set_remove_gap_only_columns( true )
+      end
+      if ( cla.is_option_set?( REMOVE_ALL_GAP_CHARACTERS_OPTION ) )
+        set_remove_gaps( true )
+      end
+      if ( cla.is_option_set?( REMOVE_COLUMNS_GAP_RATIO_OPTION ) )
+        begin
+          f = cla.get_option_value_as_float( REMOVE_COLUMNS_GAP_RATIO_OPTION )
+          set_remove_gap_ratio( f )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( REMOVE_ALL_SEQUENCES_LISTED_OPTION ) )
+        begin
+          s = cla.get_option_value( REMOVE_ALL_SEQUENCES_LISTED_OPTION )
+          set_remove_seqs( s )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( KEEP_ONLY_SEQUENCES_LISTED_OPTION ) )
+        begin
+          s = cla.get_option_value( KEEP_ONLY_SEQUENCES_LISTED_OPTION )
+          set_keep_seqs( s )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( TRIM_OPTION ) )
+        begin
+          s = cla.get_option_value( TRIM_OPTION )
+          if ( s =~ /(\d+)-(\d+)/ )
+            set_trim( $1.to_i(), $2.to_i() )
+          else
+            puts( "illegal argument" )
+            print_help
+            exit( -1 )
+          end
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( REMOVE_SEQS_GAP_RATIO_OPTION ) )
+        begin
+          f = cla.get_option_value_as_float( REMOVE_SEQS_GAP_RATIO_OPTION )
+          set_remove_seqs_gap_ratio( f )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( REMOVE_SEQS_NON_GAP_LENGTH_OPTION ) )
+        begin
+          f = cla.get_option_value_as_int( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
+          set_remove_seqs_min_non_gap_length( f )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( SPLIT ) )
+        begin
+          s = cla.get_option_value_as_int( SPLIT )
+          set_split( s )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+
+      end
+      if ( cla.is_option_set?( REMOVE_MATCHING_SEQUENCES_OPTION ) )
+        begin
+          s = cla.get_option_value( REMOVE_MATCHING_SEQUENCES_OPTION )
+          set_remove_matching( s )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( KEEP_MATCHING_SEQUENCES_OPTION ) )
+        begin
+          s = cla.get_option_value( KEEP_MATCHING_SEQUENCES_OPTION )
+          set_keep_matching( s )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+
+
+    end
+
+    def print_help()
+      puts()
+      puts( "Usage:" )
+      puts()
+      puts( "  " + PRG_NAME + ".rb [options] <input alignment> <output>" )
+      puts()
+      puts( "  options: -" + INPUT_TYPE_OPTION + "=<input type>: f for fasta, p for phylip selex type" )
+      puts( "           -" + OUTPUT_TYPE_OPTION + "=<output type>: f for fasta, n for nexus, p for phylip sequential (default)" )
+      puts( "           -" + MAXIMAL_NAME_LENGTH_OPTION + "=<n>: n=maximal name length (default for phylip 10, for fasta: unlimited )" )
+      puts( "           -" + WIDTH_OPTION + "=<n>: n=width (fasta output only, default is 60)" )
+      puts( "           -" + CLEAN_UP_SEQ_OPTION + ": clean up sequences" )
+      puts( "           -" + REMOVE_GAP_COLUMNS_OPTION + ": remove gap columns" )
+      puts( "           -" + REMOVE_GAP_ONLY_COLUMNS + ": remove gap-only columns" )
+      puts( "           -" + REMOVE_COLUMNS_GAP_RATIO_OPTION + "=<n>: remove columns for which ( seqs with gap / number of sequences > n )" )
+      puts( "           -" + REMOVE_ALL_GAP_CHARACTERS_OPTION + ": remove all gap characters (destroys alignment, fasta output only)" )
+      puts( "           -" + REMOVE_ALL_SEQUENCES_LISTED_OPTION + "=<file>: remove all sequences listed in file" )
+      puts( "           -" + KEEP_ONLY_SEQUENCES_LISTED_OPTION + "=<file>: keep only sequences listed in file" )
+      puts( "           -" + TRIM_OPTION + "=<first>-<last>: remove columns before first and after last" )
+      puts( "           -" + REMOVE_SEQS_GAP_RATIO_OPTION + "=<n>: remove sequences for which the gap ratio > n (after column operations)" )
+      puts( "           -" + REMOVE_SEQS_NON_GAP_LENGTH_OPTION + "=<n> remove sequences with less than n non-gap characters (after column operations)" )
+      puts( "           -" + REMOVE_MATCHING_SEQUENCES_OPTION + "=<s> remove all sequences with names containing s" )
+      puts( "           -" + KEEP_MATCHING_SEQUENCES_OPTION + "=<s> keep only sequences with names containing s" )
+      puts( "           -" + SPLIT + "=<n> split a fasta file into n files of equal number of sequences (expect for " )
+      puts( "            last one), cannot be used with other options" )
+      puts( "           -" + REM_RED_OPTION + ": remove redundant sequences" )
+      puts()
+    end
+
+
+
+
+
+  end # class MsaProcessor
+
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb
new file mode 100644 (file)
index 0000000..b499c72
--- /dev/null
@@ -0,0 +1,551 @@
+#
+# = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class
+#
+# Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: multi_sequence_extractor.rb,v 1.10 2010/12/13 19:00:11 cmzmasek Exp $
+
+
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/msa/msa'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/io/msa_io'
+require 'lib/evo/io/parser/fasta_parser'
+require 'lib/evo/io/writer/fasta_writer'
+require 'lib/evo/util/command_line_arguments'
+
+
+
+module Evoruby
+
+  class MultiSequenceExtractor
+
+    PRG_NAME                           = "mse"
+    PRG_VERSION                        = "1.02"
+    PRG_DESC                           = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
+    PRG_DATE                           = "2012.07.20"
+    COPYRIGHT                          = "2008-2012 Christian M Zmasek"
+    CONTACT                            = "phylosoft@gmail.com"
+    WWW                                = "www.phylosoft.org"
+    HELP_OPTION_1                      = 'help'
+    HELP_OPTION_2                      = 'h'
+
+    EXT_OPTION                          = 'e'
+    EXTRACT_LINKERS_OPTION              = 'l'
+    LOG_SUFFIX                          = ".mse_log"
+    LINKERS_SUFFIX                      = ".linkers"
+    FASTA_SUFFIX                        = ".fasta"
+    FASTA_WITH_NORMALIZED_IDS_SUFFIX    = ".ni.fasta"
+    NORMALIZED_IDS_MAP_SUFFIX           = ".nim"
+    PROTEINS_LIST_FILE_SEPARATOR        = "\t"
+
+
+    def run()
+
+      Util.print_program_information( PRG_NAME,
+        PRG_VERSION,
+        PRG_DESC ,
+        PRG_DATE,
+        COPYRIGHT,
+        CONTACT,
+        WWW,
+        STDOUT )
+
+      ld = Constants::LINE_DELIMITER
+
+      begin
+        cla = CommandLineArguments.new( ARGV )
+      rescue ArgumentError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+      if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+           cla.is_option_set?( HELP_OPTION_2 ) )
+        print_help
+        exit( 0 )
+      end
+
+      if ( cla.get_number_of_files != 4 && cla.get_number_of_files != 5 )
+        print_help
+        exit( -1 )
+      end
+
+      allowed_opts = Array.new
+      allowed_opts.push(EXT_OPTION)
+      allowed_opts.push(EXTRACT_LINKERS_OPTION)
+
+      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+      if ( disallowed.length > 0 )
+        Util.fatal_error( PRG_NAME,
+          "unknown option(s): " + disallowed,
+          STDOUT )
+      end
+
+      seq_names_files_suffix = cla.get_file_name( 0 )
+      input_dir              = cla.get_file_name( 1 )
+      out_dir                = cla.get_file_name( 2 )
+      out_dir_doms           = cla.get_file_name( 3 )
+      mapping_file            = nil
+
+      if ( cla.get_number_of_files == 5 )
+        mapping_file = cla.get_file_name( 4 )
+        begin
+          Util.check_file_for_readability( mapping_file )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+        end
+      end
+
+      extension = 0
+      if cla.is_option_set?(EXT_OPTION)
+        extension = cla.get_option_value_as_int(EXT_OPTION)
+        if extension < 0
+          extension = 0
+        end
+      end
+
+      extract_linkers = false
+      if cla.is_option_set?(EXTRACT_LINKERS_OPTION)
+        extract_linkers = true
+      end
+
+      if  !File.exist?( input_dir )
+        Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
+      end
+      if  !File.exist?( out_dir )
+        Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
+      end
+      if  !File.exist?( out_dir_doms )
+        Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" )
+      end
+      if !File.directory?( input_dir )
+        Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
+      end
+      if !File.directory?( out_dir )
+        Util.fatal_error( PRG_NAME, "error:  [#{out_dir}] is not a directory" )
+      end
+      if !File.directory?( out_dir_doms )
+        Util.fatal_error( PRG_NAME, "error:  [#{out_dir_doms}] is not a directory" )
+      end
+
+
+      log = String.new
+
+      log << "Program            : " + PRG_NAME + ld
+      log << "Version            : " + PRG_VERSION + ld
+      log << "Program date       : " + PRG_DATE + ld
+
+      puts()
+      puts( "Sequence names files suffix: " + seq_names_files_suffix )
+      log << "Sequence names files suffix: " + seq_names_files_suffix + ld
+      puts( "Input dir                  : " + input_dir )
+      log << "Input dir                  : " + input_dir + ld
+      puts( "Output dir                 : " + out_dir )
+      log << "Output dir                 : " + out_dir + ld
+      puts( "Output dir domains         : " + out_dir_doms )
+      log << "Output dir domains         : " + out_dir_doms + ld
+      if ( mapping_file != nil )
+        puts( "Mapping file               : " + mapping_file )
+        log << "Mapping file               : " + mapping_file + ld
+      end
+      if extension > 0
+        puts( "Extension                  : " + extension.to_s )
+        log << "Extension                  : " + extension.to_s + ld
+      end
+      if extract_linkers
+        puts( "Extract linkers            : true" )
+        log << "Extract linkers            : true" + ld
+      end
+      log << "Date                       : " + Time.now.to_s + ld
+      puts
+
+      if ( mapping_file != nil )
+        species_codes_to_paths = extract_mappings( mapping_file )
+      end
+
+      input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
+
+      counter = 0
+
+      input_files.each { |input_file|
+        counter += 1
+        puts
+        puts
+        puts counter.to_s + "/" + input_files.size.to_s
+        read_seq_family_file( input_file,
+          seq_names_files_suffix,
+          input_dir,
+          species_codes_to_paths,
+          log,
+          out_dir,
+          out_dir_doms,
+          mapping_file,
+          extension,
+          extract_linkers )
+      }
+      puts
+      Util.print_message( PRG_NAME, "OK" )
+      puts
+
+    end
+
+
+    def read_seq_family_file( input_file,
+        seq_names_files_suffix,
+        input_dir,
+        species_codes_to_paths,
+        log,
+        out_dir,
+        out_dir_doms,
+        mapping_file,
+        extension,
+        extract_linkers )
+
+      begin
+        Util.check_file_for_readability( input_file )
+      rescue ArgumentError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+      basename = File.basename( input_file, seq_names_files_suffix )
+      out_file_path_fasta_file                = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
+      out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
+      out_file_path_ids_map                   = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX
+      doms_out_file_path_fasta_file           = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains" + FASTA_SUFFIX
+      doms_ext_out_file_path_fasta_file           = nil
+      if extension > 0
+        doms_ext_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains_ext_" + extension.to_s + FASTA_SUFFIX
+      end
+      begin
+        Util.check_file_for_writability( out_file_path_fasta_file )
+        Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
+        Util.check_file_for_writability( out_file_path_ids_map  )
+        Util.check_file_for_writability( doms_out_file_path_fasta_file  )
+      rescue ArgumentError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+      ids_map_writer = nil
+      begin
+        ids_map_writer = File.open( out_file_path_ids_map, 'a' )
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+      current_species         = ""
+      current_msa            = nil
+      new_msa                = Msa.new
+      new_msa_normalized_ids = Msa.new
+      new_msa_domains = Msa.new
+      new_msa_domains_extended = Msa.new
+      per_species_counter = 0
+      linkers = ""
+
+      puts basename
+
+      File.open( input_file ) do | file |
+        while line = file.gets
+          line.strip!
+          if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
+            values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
+            mod_line = nil
+            if ( values.length < 2 )
+              Util.fatal_error( PRG_NAME, "unexpected format: " + line )
+            end
+            species = values[ 0 ]
+            if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA"  || species == "EIMTE"
+              next
+            end
+            seq_name = values[ 1 ]
+            domain_ranges = nil
+            if ( values.length > 3 )
+              domain_ranges_block = values[ 3 ]
+              domain_ranges = domain_ranges_block.split( "/" )
+            end
+            if ( species != current_species )
+              current_species = species
+              my_file = input_dir + Constants::FILE_SEPARATOR + current_species
+
+              if ( !File.exist?( my_file ) )
+                if species_codes_to_paths == nil
+                  Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
+                elsif ( !species_codes_to_paths.has_key?( current_species ) )
+                  Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
+                end
+                my_file = species_codes_to_paths[ current_species ]
+              end
+              my_path = File.expand_path( my_file )
+              my_readlink = my_path
+              if ( File.symlink?( my_path ) )
+                my_readlink = File.readlink( my_path )
+              end
+              current_msa = read_fasta_file( my_file )
+
+              if ( per_species_counter > 0 )
+                print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
+                per_species_counter = 0
+              end
+              puts " " + current_species + " [" + my_readlink + "]"
+              log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER
+            end
+            puts "   " + seq_name
+            log << "   " + seq_name + Constants::LINE_DELIMITER
+            per_species_counter = per_species_counter + 1
+            seq = nil
+
+            if current_msa.find_by_name_start( seq_name, true ).size > 0
+              begin
+                seq = current_msa.get_by_name_start( seq_name, true ).copy
+              rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+              end
+            else
+              # Not found, try finding by partial match.
+              begin
+                seq = current_msa.get_by_name( seq_name, true, true )
+              rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+              end
+            end
+
+            normalized_id = per_species_counter.to_s( 16 ).upcase +
+             "_" + current_species
+
+            per_species_counter.to_i
+
+            ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER )
+
+            orig_name = nil
+            if seq != nil
+              orig_name = seq.get_name
+              seq.set_name( seq.get_name + " [" + current_species + "]" )
+              new_msa.add_sequence( seq )
+            else
+              Util.fatal_error( PRG_NAME, "unexected error: seq is nil" )
+            end
+
+            if domain_ranges != nil
+              first = true
+              prev_to = -1
+
+              domain_ranges.each { |range|
+                if range != nil && range.length > 0
+                  s = range.split("-")
+                  from = s[ 0 ].to_i
+                  to = s[ 1 ].to_i
+                  new_msa_domains.add_sequence( Sequence.new( orig_name +
+                       " [" + from.to_s +
+                       "-" + to.to_s +
+                       "] [" + basename + "] [" +
+                       current_species + "]",
+                      seq.get_sequence_as_string[from..to] ) )
+                  if extension > 0
+                    from_e = from - extension
+                    if from_e < 0
+                      from_e = 0
+                    end
+                    to_e = to + extension
+                    if to_e > seq.get_sequence_as_string.length - 1
+                      to_e = seq.get_sequence_as_string.length - 1
+                    end
+                    new_msa_domains_extended.add_sequence( Sequence.new( orig_name +
+                         " [" + from.to_s +
+                         "-" + to.to_s  +
+                         "] [extended by " +
+                         extension.to_s +
+                         "] [" + basename + "] [" +
+                         current_species + "]",
+                        seq.get_sequence_as_string[ from_e..to_e ] ) )
+                  end # extension > 0
+                  if  extract_linkers
+                    if first
+                      first = false
+                      f = 0
+                      t = from - 1
+                      if extension > 0
+                        f = from - extension
+                      end
+                      mod_line = line + "\t[" + get_linker_sequence( f, t, seq ) + "|"
+                    else
+                      mod_line += get_linker_sequence( prev_to + 1, from - 1, seq ) + "|"
+                    end
+                    prev_to = to
+                  end
+                end # range != nil && range.length > 0
+              }
+              if extract_linkers && prev_to > 0
+                f = prev_to + 1
+                t = seq.get_sequence_as_string.length - 1
+                if extension > 0
+                  t = prev_to + extension
+                end
+                mod_line += get_linker_sequence( f, t, seq ) + "]"
+              end
+            end
+
+            new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
+            if mod_line
+              linkers << mod_line + Constants::LINE_DELIMITER
+            end
+          end # !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
+        end # while line = file.gets
+
+      end
+
+      ids_map_writer.close
+
+      if ( per_species_counter > 0 )
+        print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
+      end
+
+      io = MsaIO.new()
+
+      fasta_writer = FastaWriter.new()
+      fasta_writer.remove_gap_chars
+      fasta_writer.clean
+
+      begin
+        io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+      if new_msa_domains != nil
+        begin
+          io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer )
+        rescue Exception => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+        end
+      end
+
+      if extension > 0 && new_msa_domains_extended != nil
+        begin
+          io.write_to_file( new_msa_domains_extended, doms_ext_out_file_path_fasta_file, fasta_writer )
+        rescue Exception => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+        end
+      end
+
+      begin
+        io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+      begin
+        f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LOG_SUFFIX , 'a' )
+        f.print( log )
+        f.close
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+      if extract_linkers && linkers != nil
+        begin
+          f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LINKERS_SUFFIX , 'a' )
+          f.print( linkers )
+          f.close
+        rescue Exception => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+        end
+      end
+
+
+    end
+
+
+    def get_linker_sequence( from, to, seq )
+      if from < 0
+        from = 0
+      end
+      if to > seq.get_sequence_as_string.length - 1
+        to = seq.get_sequence_as_string.length - 1
+      end
+      if from > to
+        return ""
+      else
+        return from.to_s + "-" + to.to_s + ":" + seq.get_sequence_as_string[ from..to ]
+      end
+    end
+
+    def obtain_inputfiles( input_dir, seq_names_files_suffix )
+      input_files = Array.new()
+      Dir.foreach( input_dir ) { |file_name|
+        if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
+          input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
+        end
+      }
+      input_files
+    end
+
+    def extract_mappings( mapping_file )
+      species_code_to_path = Hash.new()
+      File.open( mapping_file ) do | file |
+        while line = file.gets
+          if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
+            if ( line =~ /(\S+)\s+(\S+)/ )
+              species = $1
+              path = $2
+              if ( species_code_to_path.has_key?( species ) )
+                Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
+              end
+              if ( species_code_to_path.has_value?( path ) )
+                Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
+              end
+              if ( !File.exist?( path ) )
+                Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
+              end
+              if ( !File.file?( path ) )
+                Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
+              end
+              if ( !File.readable?( path ) )
+                Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
+              end
+              if ( File.size( path ) < 10000 )
+                Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
+              end
+              if ( !Util.looks_like_fasta?( path ) )
+                Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
+              end
+              species_code_to_path[ species ] = path
+              puts species + " -> " + path
+            end
+          end
+        end
+      end
+      species_code_to_path
+    end
+
+    def print_counts( per_species_counter, log, ld )
+      puts "   [sum: " + per_species_counter.to_s + "]"
+      log << "   [sum: " + per_species_counter.to_s + "]" + ld
+    end
+
+    def read_fasta_file( input )
+      f = MsaFactory.new()
+      msa = nil
+      begin
+        msa = f.create_msa_from_file( input, FastaParser.new() )
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+      msa
+    end
+
+    def print_help()
+      puts( "Usage:" )
+      puts()
+      puts( "  " + PRG_NAME + ".rb <sequence names file suffix> <input dir containing sequence names files " +
+         "and possibly genome multiple-sequence ('fasta') files> <output directory for sequences> <output directory for domains> [mapping file for " +
+         "genome multiple-sequence ('fasta') files not in input dir]" )
+      puts()
+      puts( "  option: -" + EXT_OPTION  + "=<int>: to extend extracted domains" )
+      puts( "          -" + EXTRACT_LINKERS_OPTION  + "      : to extract linkers" )
+      puts()
+      puts( "  " + "Example: \"mse.rb .prot . protein_seqs domain_seqs ../genome_locations.txt\"" )
+      puts()
+    end
+
+  end # class MultiSequenceExtractor
+end
\ No newline at end of file
diff --git a/forester/ruby/evoruby/lib/evo/tool/new_tap.rb b/forester/ruby/evoruby/lib/evo/tool/new_tap.rb
new file mode 100644 (file)
index 0000000..1dc7431
--- /dev/null
@@ -0,0 +1,167 @@
+#
+# = lib/evo/apps/ -  class
+#
+# Copyright::  Copyright (C) 2009 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: new_tap.rb,v 1.4 2010/12/13 19:00:11 cmzmasek Exp $
+
+
+require 'lib/evo/util/util'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/msa/msa'
+require 'lib/evo/io/msa_io'
+require 'lib/evo/io/parser/fasta_parser'
+require 'lib/evo/io/parser/general_msa_parser'
+require 'lib/evo/io/writer/fasta_writer'
+require 'lib/evo/io/writer/phylip_sequential_writer'
+require 'lib/evo/util/command_line_arguments'
+
+module Evoruby
+
+    class TaxonomyProcessor
+
+        PRG_NAME       = ""
+        PRG_DATE       = "2009.10.09"
+        PRG_DESC       = "replacement of labels in multiple sequence files"
+        PRG_VERSION    = "1.00"
+        COPYRIGHT      = "2009 Christian M Zmasek"
+        CONTACT        = "phylosoft@gmail.com"
+        WWW            = "www.phylosoft.org"
+
+        REMOVE_REDUNDANT_SEQS_OPTION = "rr"
+        
+        def initialize()
+            @taxonomies = Hash.new()
+        end
+
+        def run()
+
+            Util.print_program_information( PRG_NAME,
+                PRG_VERSION,
+                PRG_DESC,
+                PRG_DATE,
+                COPYRIGHT,
+                CONTACT,
+                WWW,
+                STDOUT )
+
+            if ( ARGV == nil || ( ARGV.length != 3 && ARGV.length != 4 ) )
+                puts( "Usage: #{PRG_NAME}.rb <input sequences> <output sequences> <output map>" )
+                puts()
+                puts( "  options: -" + REMOVE_REDUNDANT_SEQS_OPTION + ": to remove redundant sequences" )
+                puts()
+                exit( -1 )
+            end
+
+            begin
+                cla = CommandLineArguments.new( ARGV )
+            rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+            
+            input     = cla.get_file_name( 0 )
+            output    = cla.get_file_name( 1 )
+            map_file = cla.get_file_name( 2 )
+
+            allowed_opts = Array.new
+            allowed_opts.push( REMOVE_REDUNDANT_SEQS_OPTION ) 
+            
+            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+            if ( disallowed.length > 0 )
+                Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
+            end
+
+            
+            remove_redudant = false
+            if ( cla.is_option_set?( REMOVE_REDUNDANT_SEQS_OPTION ) )
+                remove_redudant = true
+            end
+
+            if ( File.exists?( output ) )
+                Util.fatal_error( PRG_NAME, "outfile [" + output + "] already exists" )
+            end
+            if ( File.exists?( map_file ) )
+                Util.fatal_error( PRG_NAME, "map file [" + map_file + "] already exists" )
+            end
+            if ( !File.exists?( input) )
+                Util.fatal_error( PRG_NAME, "infile [" + input + "] does not exist" )
+            end
+           
+            fasta_like = Util.looks_like_fasta?( input )
+
+            puts()
+            puts( "Input alignment : " + input )
+            puts( "Output alignment: " + output )
+            puts( "Output map      : " + map_file )
+            if ( fasta_like )
+                puts( "Format          : Fasta"  )
+            else
+                puts( "Format          : Phylip like" )
+            end
+            puts()
+
+            species_map = Hash.new
+           
+            f = MsaFactory.new()
+            begin
+                if ( fasta_like )
+                    msa = f.create_msa_from_file( input, FastaParser.new() )
+                else
+                    msa = f.create_msa_from_file( input, GeneralMsaParser.new() )
+                end
+            rescue Exception => e
+                Util.fatal_error( PRG_NAME, "failed to read file: " + e.to_s )
+            end
+
+            if ( msa == nil || msa.get_number_of_seqs() < 1 )
+                Util.fatal_error( PRG_NAME, "failed to read MSA" )
+            end
+            begin
+                Util.check_file_for_writability( map_file )
+            rescue Exception => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_, STDOUT )
+            end
+
+            if ( remove_redudant ) 
+                removed = msa.remove_redundant_sequences!( true )
+                if removed.size > 0
+                    Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
+                    removed.each { | seq_name |
+                        puts seq_name
+                    }
+                    Util.print_message( PRG_NAME, "will process " + msa.get_number_of_seqs.to_s + " non redundant sequences" )
+                end
+            end
+
+            lf = File.open( map_file, "a" )
+            for i in 0 ... msa.get_number_of_seqs
+                seq  = msa.get_sequence( i )
+            end
+
+            io = MsaIO.new()
+            w = nil
+            if ( fasta_like )
+                w = FastaWriter.new()
+            else
+                w = PhylipSequentialWriter.new()
+            end
+            w.set_max_name_length( 10 )
+            w.clean( true )
+            begin
+                io.write_to_file( msa, output, w )
+            rescue Exception => e
+                Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
+            end
+            lf.close()
+            if ( @taxonomies.length > 0 )
+                Util.print_message( PRG_NAME, "number of unique taxonomies: " + @taxonomies.length.to_s )
+            end
+            Util.print_message( PRG_NAME, "wrote: " + map_file )
+            Util.print_message( PRG_NAME, "wrote: " + output )
+            Util.print_message( PRG_NAME, "OK" )
+        end
+
+    end # class 
+
+end # module Evoruby
\ No newline at end of file
diff --git a/forester/ruby/evoruby/lib/evo/tool/phylogenies_decorator.rb b/forester/ruby/evoruby/lib/evo/tool/phylogenies_decorator.rb
new file mode 100644 (file)
index 0000000..69a6bc4
--- /dev/null
@@ -0,0 +1,299 @@
+#!/usr/local/bin/ruby -w
+#
+# = lib/evo/apps/phylogenies_decorator
+#
+# Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# decoration of phylogenies with sequence/species names and domain architectures
+#
+# $Id: phylogenies_decorator.rb,v 1.34 2010/12/13 19:00:11 cmzmasek Exp $
+#
+# Environment variable FORESTER_HOME needs to point to the appropriate
+# directory (e.g. setenv FORESTER_HOME $HOME/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester-atv/)
+
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/util/command_line_arguments'
+
+require 'date'
+
+module Evoruby
+
+    class PhylogeniesDecorator
+
+        DECORATOR_OPTIONS_SEQ_NAMES = '-r=1 -mdn'
+        # -mdn is a hidden expert option to rename e.g. "6_ORYLA3" to "6_[3]_ORYLA"
+        #DECORATOR_OPTIONS_SEQ_NAMES = '-sn -r=1'
+        DECORATOR_OPTIONS_DOMAINS = '-r=1'
+        IDS_MAPFILE_SUFFIX        = '.nim'
+        DOMAINS_MAPFILE_SUFFIX    = '.dff'
+        SLEEP_TIME                = 0.1
+        REMOVE_NI                 = true
+        TMP_FILE                  = '___PD___'
+        LOG_FILE                  = '00_phylogenies_decorator.log'
+        FORESTER_HOME             = ENV[Constants::FORESTER_HOME_ENV_VARIABLE]
+        JAVA_HOME                 = ENV[Constants::JAVA_HOME_ENV_VARIABLE]
+
+        PRG_NAME       = "phylogenies_decorator"
+        PRG_DATE       = "2008.09.02"
+        PRG_DESC       = "decoration of phylogenies with sequence/species names and domain architectures"
+        PRG_VERSION    = "1.0.1"
+        COPYRIGHT      = "2008-2009 Christian M Zmasek"
+        CONTACT        = "phylosoft@gmail.com"
+        WWW            = "www.phylosoft.org"
+
+        IDS_ONLY_OPTION     = "n"
+        DOMAINS_ONLY_OPTION = "d"
+        HELP_OPTION_1       = "help"
+        HELP_OPTION_2       = "h"
+
+        NL = Constants::LINE_DELIMITER
+
+        def run
+
+            Util.print_program_information( PRG_NAME,
+                PRG_VERSION,
+                PRG_DESC,
+                PRG_DATE,
+                COPYRIGHT,
+                CONTACT,
+                WWW,
+                STDOUT )
+
+            if ( ARGV == nil || ARGV.length > 3 || ARGV.length < 2  )
+                print_help
+                exit( -1 )
+            end
+
+            if FORESTER_HOME == nil || FORESTER_HOME.length < 1
+                Util.fatal_error( PRG_NAME, "apparently environment variable #{Constants::FORESTER_HOME_ENV_VARIABLE} has not been set" )
+            end
+            if JAVA_HOME == nil ||  JAVA_HOME.length < 1
+                Util.fatal_error( PRG_NAME, "apparently environment variable #{Constants::JAVA_HOME_ENV_VARIABLE} has not been set" )
+            end
+
+            if !File.exist?( FORESTER_HOME )
+                Util.fatal_error( PRG_NAME, '[' + FORESTER_HOME + '] does not exist' )
+            end
+            if !File.exist?( JAVA_HOME )
+                Util.fatal_error( PRG_NAME, '[' + JAVA_HOME + '] does not exist' )
+            end
+
+            decorator = JAVA_HOME + '/bin/java -cp ' + FORESTER_HOME + '/java/forester.jar org.forester.application.decorator'
+
+            begin
+                cla = CommandLineArguments.new( ARGV )
+            rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+
+            if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+                     cla.is_option_set?( HELP_OPTION_2 ) )
+                print_help
+                exit( 0 )
+            end
+
+            if File.exist?( LOG_FILE )
+                Util.fatal_error( PRG_NAME, 'logfile [' + LOG_FILE + '] already exists' )
+            end
+
+            allowed_opts = Array.new
+            allowed_opts.push( IDS_ONLY_OPTION )
+            allowed_opts.push( DOMAINS_ONLY_OPTION )
+
+            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+            if ( disallowed.length > 0 )
+                Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
+            end
+
+            ids_only = false
+            domains_only = false
+
+            in_suffix = cla.get_file_name( 0 )
+            out_suffix = cla.get_file_name( 1 )
+
+            if cla.is_option_set?( IDS_ONLY_OPTION )
+                ids_only = true
+            end
+            if cla.is_option_set?( DOMAINS_ONLY_OPTION )
+                domains_only = true
+            end
+
+            if ( ids_only && domains_only )
+                Util.fatal_error( PRG_NAME, 'attempt to use ids only and domains only at the same time' )
+            end
+
+            log = String.new
+
+            now = DateTime.now
+            log << "Program              : " + PRG_NAME + NL
+            log << "Version              : " + PRG_VERSION + NL
+            log << "Program date         : " + PRG_DATE + NL
+            log << "Options for seq names: " + DECORATOR_OPTIONS_SEQ_NAMES + NL
+            log << "Options for domains  : " + DECORATOR_OPTIONS_DOMAINS + NL
+            log << "FORESTER_HOME        : " + FORESTER_HOME + NL
+            log << "JAVA_HOME            : " + JAVA_HOME + NL + NL
+            log << "Date/time: " + now.to_s + NL
+            log << "Directory: " + Dir.getwd  + NL + NL
+
+            Util.print_message( PRG_NAME, 'input suffix     : ' + in_suffix )
+            Util.print_message( PRG_NAME, 'output suffix    : ' + out_suffix )
+
+            log << 'input suffix     : ' + in_suffix + NL
+            log << 'output suffix    : ' + out_suffix + NL
+
+            if ( File.exists?( TMP_FILE ) )
+                File.delete( TMP_FILE )
+            end
+
+            files = Dir.entries( "." )
+
+            counter = 0
+
+            files.each { | phylogeny_file |
+                if ( !File.directory?( phylogeny_file ) &&
+                         phylogeny_file !~ /^\./ &&
+                         phylogeny_file !~ /^00/ &&
+                         phylogeny_file !~ /#{out_suffix}$/ &&
+                         phylogeny_file =~ /#{in_suffix}$/ )
+                    begin
+                        Util.check_file_for_readability( phylogeny_file )
+                    rescue ArgumentError
+                        Util.fatal_error( PRG_NAME, 'can not read from: ' + phylogeny_file + ': '+ $! )
+                    end
+
+                    counter += 1
+
+                    outfile = phylogeny_file.sub( /#{in_suffix}$/, out_suffix )
+
+                    if REMOVE_NI
+                        outfile = outfile.sub( /_ni_/, '_' )
+                    end
+
+                    if File.exists?( outfile )
+                        msg = counter.to_s + ': ' + phylogeny_file + ' -> ' +  outfile +
+                         ' : already exists, skipping'
+                        Util.print_message( PRG_NAME, msg  )
+                        log << msg + NL
+                        next
+                    end
+
+                    Util.print_message( PRG_NAME, counter.to_s + ': ' + phylogeny_file + ' -> ' +  outfile )
+                    log << counter.to_s + ': ' + phylogeny_file + ' -> ' +  outfile + NL
+
+                    phylogeny_id = get_id( phylogeny_file )
+
+                    ids_mapfile_name = nil
+                    domains_mapfile_name = nil
+
+                    if ids_only
+                        ids_mapfile_name = get_file( files, phylogeny_id, IDS_MAPFILE_SUFFIX )
+                    elsif domains_only
+                        domains_mapfile_name = get_file( files, phylogeny_id, DOMAINS_MAPFILE_SUFFIX )
+                    else
+                        ids_mapfile_name = get_file( files, phylogeny_id, IDS_MAPFILE_SUFFIX )
+                        domains_mapfile_name = get_file( files, phylogeny_id, DOMAINS_MAPFILE_SUFFIX )
+                    end
+
+                    if domains_mapfile_name != nil
+                        begin
+                            Util.check_file_for_readability( domains_mapfile_name )
+                        rescue ArgumentError
+                            Util.fatal_error( PRG_NAME, 'failed to read from [#{domains_mapfile_name}]: ' + $! )
+                        end
+                    end
+
+                    if ids_mapfile_name != nil
+                        begin
+                            Util.check_file_for_readability( ids_mapfile_name )
+                        rescue ArgumentError
+                            Util.fatal_error( PRG_NAME, 'failed to read from [#{ids_mapfile_name}]: ' + $! )
+                        end
+                    end
+
+                    if domains_mapfile_name != nil
+                        if ids_mapfile_name != nil
+                            my_outfile = TMP_FILE
+                        else
+                            my_outfile = outfile
+                        end
+                        cmd = decorator + ' ' + DECORATOR_OPTIONS_DOMAINS + ' ' +
+                         '-f=d ' + phylogeny_file + ' ' +
+                         domains_mapfile_name + ' ' + my_outfile
+                        execute_cmd( cmd, log )
+                    end
+
+                    if ids_mapfile_name != nil
+                        if domains_mapfile_name != nil
+                            my_infile = TMP_FILE
+                        else
+                            my_infile = phylogeny_file
+                        end
+                        cmd = decorator + ' ' +  DECORATOR_OPTIONS_SEQ_NAMES + ' ' +
+                         '-f=s ' + my_infile + ' ' +
+                         ids_mapfile_name + ' ' + outfile
+                        execute_cmd( cmd, log )
+                    end
+
+                    if ( File.exists?( TMP_FILE ) )
+                        File.delete( TMP_FILE )
+                    end
+                end
+            }
+            open( LOG_FILE, 'w' ) do | f |
+                f.write( log )
+            end
+            puts
+            Util.print_message( PRG_NAME, 'OK' )
+            puts
+        end # def run
+
+        def execute_cmd( cmd, log )
+            log << 'excuting ' + cmd + NL
+            IO.popen( cmd , 'r+' ) do | pipe |
+                pipe.close_write
+                log << pipe.read + NL + NL
+            end
+            sleep( SLEEP_TIME )
+        end
+
+
+        def get_id( phylogeny_file_name )
+            phylogeny_file_name =~ /^([^_]+)/
+            $1
+        end
+
+        def get_file( files_in_dir, phylogeny_id, suffix_pattern )
+            matching_files = Array.new
+            files_in_dir.each { | file |
+
+                if ( !File.directory?( file ) &&
+                         file !~ /^\./ &&
+                         file !~ /^00/ &&
+                         file =~ /^#{phylogeny_id}.*#{suffix_pattern}$/ )
+                    matching_files << file
+                end
+            }
+            if matching_files.length < 1
+                Util.fatal_error( PRG_NAME, 'no file matching [' + phylogeny_id +
+                     '_] [' + suffix_pattern + '] present in current directory' )
+            elsif matching_files.length > 1
+                Util.fatal_error( PRG_NAME, 'more than one file matching [' + phylogeny_id +
+                     '_] [' + suffix_pattern + '] present in current directory' )
+            end
+            matching_files[ 0 ]
+        end
+
+        def print_help()
+            puts( "Usage:" )
+            puts()
+            puts( "  " + PRG_NAME + ".rb [options] <suffix of intrees to be decorated> <suffix for decorated outtrees> " )
+            puts()
+            puts( "  options: -" + IDS_ONLY_OPTION + ": decorate with sequence/species names only" )
+            puts( "           -" + DOMAINS_ONLY_OPTION + ": decorate with domain structures" )
+            puts()
+        end
+    end # class PhylogenyiesDecorator
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/tool/phylogeny_factory.rb b/forester/ruby/evoruby/lib/evo/tool/phylogeny_factory.rb
new file mode 100644 (file)
index 0000000..999541e
--- /dev/null
@@ -0,0 +1,267 @@
+#
+# = lib/evo/apps/phylogeny_factory - PhylogenyFactory class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: phylogeny_factory.rb,v 1.32 2010/12/13 19:00:11 cmzmasek Exp $
+
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/util/command_line_arguments'
+
+require 'set'
+require 'date'
+
+module Evoruby
+
+    class PhylogenyFactory
+
+        PRG_NAME       = "phylogeny_factory"
+        PRG_DATE       = "2010.05.26"
+        PRG_DESC       = "automated phylogeny reconstruction using queing system"
+        PRG_VERSION    = "1.1"
+        COPYRIGHT      = "2010 Christian M Zmasek"
+        CONTACT        = "phylosoft@gmail.com"
+        WWW            = "www.phylosoft.org"
+
+        USE_JOB_SUBMISSION_SYSTEM_OPTION  = 's'
+        LOG_FILE                          = '00_phylogeny_factory.log'
+        TEMPLATE_FILE                     = '00_phylogeny_factory.template'
+        PBS_O_WORKDIR                     = '$PBS_O_WORKDIR/'
+        MIN_LENGTH_DEFAULT                = 40
+        WALLTIME                          = '100:00:00'
+        QUEUE                             = 'default'
+
+        TMP_CMD_FILE_SUFFIX = '_QSUB'
+
+        HMM                 = 'HMM'
+        RSL                 = 'RSL'
+
+        OPTION_OPEN          = '%['
+        OPTION_CLOSE          = ']%'
+
+        WAIT                 = 1.0
+
+        NL = Constants::LINE_DELIMITER
+
+        def run
+
+            Util.print_program_information( PRG_NAME,
+                PRG_VERSION,
+                PRG_DESC,
+                PRG_DATE,
+                COPYRIGHT,
+                CONTACT,
+                WWW,
+                STDOUT )
+
+            begin
+                cla = CommandLineArguments.new( ARGV )
+            rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+
+            allowed_opts = Array.new
+            allowed_opts.push( USE_JOB_SUBMISSION_SYSTEM_OPTION )
+
+            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+            if ( disallowed.length > 0 )
+                Util.fatal_error( PRG_NAME,
+                    "unknown option(s): " + disallowed,
+                    STDOUT )
+            end
+
+            if File.exists?( LOG_FILE )
+                puts( '[' + PRG_NAME + '] > log file [' + LOG_FILE + '] already exists' )
+                exit( -1 )
+            end
+
+            if !File.exists?( TEMPLATE_FILE )
+                puts( '[' + PRG_NAME + '] > template file [' + TEMPLATE_FILE + '] not found' )
+                exit( -1 )
+            end
+
+            use_job_submission_system = false
+            if cla.is_option_set?( USE_JOB_SUBMISSION_SYSTEM_OPTION )
+                use_job_submission_system = true
+            end
+
+            log = String.new
+
+            now = DateTime.now
+            log << "Program     : " + PRG_NAME + NL
+            log << "Version     : " + PRG_VERSION + NL
+            log << "Program date: " + PRG_DATE + NL + NL
+            log << "Date/time   : " + now.to_s + NL
+            log << "Directory   : " + Dir.getwd  + NL + NL
+
+            puts( '[' + PRG_NAME + '] > reading ' + TEMPLATE_FILE )
+
+            paths       = Hash.new  # path placeholder -> full path
+            min_lengths = Hash.new  # alignment id -> minimal length
+            hmms        = Hash.new  # alignment id -> hmm
+            options     = Hash.new  # option placeholder -> option
+            ids         = Set.new
+
+            commands    = Array.new
+
+            log <<  "////////////////////////////////////////////////////////////////// #{NL}"
+            log << "Template file [" + TEMPLATE_FILE + "]:#{NL}"
+
+            command = String.new
+
+            open( TEMPLATE_FILE ).each { | line |
+                log << line
+                if ( line =~ /^#/ )
+                elsif ( line =~ /^\$\s*(\S+)\s*=\s*(\S+)/ )
+                    paths[ $1 ] = $2
+                    puts( '[' + PRG_NAME + '] > paths      : ' + $1 + ' => ' + $2 )
+
+                elsif ( line =~ /^%\s*#{HMM}\s*(\S+)\s*=\s*(\S+)/ )
+                    hmms[ $1 ] = $2
+                    puts( '[' + PRG_NAME + '] > hmms       : ' + $1 + ' => ' + $2 )
+
+                elsif ( line =~ /^%\s*#{RSL}\s*(\S+)\s*=\s*(\S+)/ )
+                    min_lengths[ $1 ] = $2
+                    puts( '[' + PRG_NAME + '] > min lengths: ' + $1 + ' => ' + $2 )
+
+                elsif ( line =~ /^%\s*(\S+)\s*=\s*(\S+)/ )
+                    options[ $1 ] = $2
+                    puts( '[' + PRG_NAME + '] > options    : ' + $1 + ' => ' + $2 )
+
+                elsif ( line =~ /^>\s*(.+)/ )
+                    command = command + $1 + ";#{NL}"
+
+                elsif ( line =~ /^-/  )
+                    commands << prepare( command, paths )
+                    command = String.new
+                end
+            }
+            log << "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ #{NL}#{NL}"
+
+            files = Dir.entries( "." )
+
+            files.each { | file |
+                if ( !File.directory?( file ) &&
+                         file !~ /^\./ &&
+                         file !~ /#{TEMPLATE_FILE}/ &&
+                         file !~ /.bck$/ &&
+                         file !~ /.log$/ &&
+                         file !~ /nohup/ &&
+                         file !~ /^00/ )
+                    aln_name = file.to_str
+                    id = get_id( aln_name )
+                    if !ids.include?( id )
+                        ids.add( id )
+                    end
+                    puts( '[' + PRG_NAME + '] > file [id]  : ' + aln_name + ' [' + id + ']' )
+                    commands.each do | cmd |
+
+                        cmd = subst_hmm( cmd, aln_name, hmms )
+                        cmd = subst_min_length( cmd, aln_name, min_lengths )
+                        cmd = subst_options( cmd, options )
+                        if use_job_submission_system
+                            cmd = subst_aln_name( cmd, PBS_O_WORKDIR + aln_name )
+                        else
+                            cmd = subst_aln_name( cmd, aln_name )
+                        end
+
+                        if ( cmd =~ /%/ )
+                            cmd =~ /(%.*?%)/
+                            problem = $1
+                            puts( '[' + PRG_NAME + '] > WARNING    : [' + id + '] command still contains placeholder: ' + problem )
+                            log << "WARNING: command still contains placeholder: " + cmd + NL
+                        else
+                            tmp_cmd_file = file.to_str[ 0..4 ] + TMP_CMD_FILE_SUFFIX
+                            if ( File.exists?( tmp_cmd_file ) )
+                                File.delete( tmp_cmd_file )
+                            end
+                            if use_job_submission_system
+                                open( tmp_cmd_file, 'w' ) do |f|
+                                    f.write( cmd )
+                                end
+                            end
+
+                            log << cmd + NL
+
+                            if use_job_submission_system
+                                IO.popen( 'qsub -q ' + QUEUE  + ' -l walltime=' + WALLTIME + ' ' + tmp_cmd_file , 'r+' ) do | pipe |
+                                    pipe.close_write
+                                end
+                            else
+                                spawn( 'nohup ' + cmd + ' &', STDERR => "/dev/null" )
+                            end
+
+                            sleep( WAIT )
+                            if ( File.exists?( tmp_cmd_file ) )
+                                File.delete( tmp_cmd_file )
+                            end
+                        end
+                    end
+                end
+            }
+
+            open( LOG_FILE, 'w' ) do | f |
+                f.write( log )
+            end
+
+            puts()
+            puts( '[' + PRG_NAME + '] > OK' )
+            puts()
+
+        end # def run
+
+        def prepare( command, paths )
+            paths.each_pair{ | name, full |
+                command = command.gsub( name, full )
+            }
+            command
+        end
+
+        def subst_options( command, options )
+            opt_placeholders = command.scan( /%\[\S+\]%/ )
+            opt_placeholders.each { | opt_placeholder |
+                opt_placeholder = opt_placeholder.gsub( OPTION_OPEN , '' )
+                opt_placeholder = opt_placeholder.gsub( OPTION_CLOSE, '' )
+                opt_value = options[ opt_placeholder ]
+                if ( opt_value != nil && opt_value.size > 0 )
+                    command = command.gsub( OPTION_OPEN + opt_placeholder + OPTION_CLOSE, opt_value )
+                end
+            }
+            command
+        end
+
+        def subst_aln_name( command, aln_name )
+            command = command.gsub( '$', aln_name )
+            command
+        end
+
+        def subst_hmm( command, aln_name, hmms )
+            id = get_id( aln_name )
+            hmm = hmms[ id ]
+            if ( hmm != nil && hmm.size > 0 )
+                command = command.gsub( OPTION_OPEN + HMM + OPTION_CLOSE, hmm )
+            end
+            command
+        end
+
+        def subst_min_length( command, aln_name, min_lengths )
+            id = get_id( aln_name )
+            min_length = min_lengths[ id ]
+            if ( min_length != nil && min_length.size > 0 )
+                command = command.gsub( OPTION_OPEN + RSL + OPTION_CLOSE, min_length )
+            else
+                command = command.gsub( OPTION_OPEN + RSL + OPTION_CLOSE, MIN_LENGTH_DEFAULT.to_s )
+            end
+            command
+        end
+
+        def get_id( aln_name )
+            aln_name =~ /^([^_]+)/
+            $1
+        end
+
+    end # class PhylogenyFactory
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb b/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb
new file mode 100644 (file)
index 0000000..d316f85
--- /dev/null
@@ -0,0 +1,317 @@
+#
+# = lib/evo/apps/taxonomy_processor - TaxonomyProcessor class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: taxonomy_processor.rb,v 1.26 2010/12/13 19:00:11 cmzmasek Exp $
+
+
+require 'lib/evo/util/util'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/msa/msa'
+require 'lib/evo/io/msa_io'
+require 'lib/evo/io/parser/fasta_parser'
+require 'lib/evo/io/parser/general_msa_parser'
+require 'lib/evo/io/writer/fasta_writer'
+require 'lib/evo/io/writer/phylip_sequential_writer'
+require 'lib/evo/util/command_line_arguments'
+
+module Evoruby
+
+  class TaxonomyProcessor
+
+    PRG_NAME       = "tap"
+    PRG_DATE       = "2012.09.27"
+    PRG_DESC       = "replacement of species names in multiple sequence files"
+    PRG_VERSION    = "1.02"
+    COPYRIGHT      = "2012 Christian M Zmasek"
+    CONTACT        = "phylosoft@gmail.com"
+    WWW            = "www.phylosoft.org"
+
+    SIMPLE = true
+
+    EXTRACT_TAXONOMY_OPTION = "t"
+
+    def initialize()
+      @taxonomies = Hash.new()
+    end
+
+    def run()
+
+      Util.print_program_information( PRG_NAME,
+        PRG_VERSION,
+        PRG_DESC,
+        PRG_DATE,
+        COPYRIGHT,
+        CONTACT,
+        WWW,
+        STDOUT )
+
+      if ( ARGV == nil || (  ARGV.length != 1 && ARGV.length != 3 && ARGV.length != 4 && ARGV.length != 5 && ARGV.length != 6 ) )
+        puts( "Usage: #{PRG_NAME}.rb [options] [input map file] <input sequences> [output sequences] [output id list]" )
+        puts()
+        puts( "  options: -" + EXTRACT_TAXONOMY_OPTION + ": to extract taxonomy information from bracketed expression" )
+        puts()
+        exit( -1 )
+      end
+
+      begin
+        cla = CommandLineArguments.new( ARGV )
+      rescue ArgumentError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+
+      mapfile   = nil
+      input     = nil
+      output    = nil
+      list_file = nil
+
+
+
+      if cla.get_number_of_files == 4
+        mapfile   = cla.get_file_name( 0 )
+        input     = cla.get_file_name( 1 )
+        output    = cla.get_file_name( 2 )
+        list_file = cla.get_file_name( 3 )
+      elsif cla.get_number_of_files == 3
+        input     = cla.get_file_name( 0 )
+        output    = cla.get_file_name( 1 )
+        list_file = cla.get_file_name( 2 )
+      elsif cla.get_number_of_files == 1
+        input     = cla.get_file_name( 0 )
+        i = nil
+        if input.downcase.end_with?( ".fasta" )
+          i = input[ 0 .. input.length - 7 ]
+        elsif input.downcase.end_with?( ".fsa" )
+          i = input[ 0 .. input.length - 5 ]
+        else
+          i = input
+        end
+        output    = i + "_ni.fasta"
+        list_file = i + ".nim"
+      end
+
+
+      allowed_opts = Array.new
+      allowed_opts.push( EXTRACT_TAXONOMY_OPTION )
+
+      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+      if ( disallowed.length > 0 )
+        Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
+      end
+
+      extract_taxonomy = false
+      if ( cla.is_option_set?( EXTRACT_TAXONOMY_OPTION ) )
+        extract_taxonomy = true
+      end
+
+      if ( File.exists?( output ) )
+        Util.fatal_error( PRG_NAME, "outfile [" + output + "] already exists" )
+      end
+      if ( File.exists?( list_file ) )
+        Util.fatal_error( PRG_NAME, "list file [" + list_file + "] already exists" )
+      end
+      if ( !File.exists?( input) )
+        Util.fatal_error( PRG_NAME, "infile [" + input + "] does not exist" )
+      end
+      if ( mapfile != nil && !File.exists?( mapfile ) )
+        Util.fatal_error( PRG_NAME, "mapfile [" + mapfile + "] does not exist" )
+      end
+
+      fasta_like = Util.looks_like_fasta?( input )
+
+      puts()
+      if mapfile != nil
+        puts( "Map file        : " + mapfile )
+      end
+      puts( "Input alignment : " + input )
+      puts( "Output alignment: " + output )
+      puts( "Name list       : " + list_file )
+      if ( fasta_like )
+        puts( "Format          : Fasta"  )
+      else
+        puts( "Format          : Phylip like" )
+      end
+      if ( extract_taxonomy )
+        puts( "Extract taxonomy: true"  )
+      end
+      puts()
+
+      species_map = Hash.new
+      if mapfile != nil
+        File.open( mapfile ) do | file |
+          while line = file.gets
+            if ( line =~/(.+)#(.+)/ || line =~/(.+)\s+(.+)/ )
+              species_map[ $1 ] = $2
+              Util.print_message( PRG_NAME, "mapping: " + $1 + ' => ' + $2 )
+            end
+          end
+        end
+      end
+
+      f = MsaFactory.new()
+      begin
+        if ( fasta_like )
+          msa = f.create_msa_from_file( input, FastaParser.new() )
+        else
+          msa = f.create_msa_from_file( input, GeneralMsaParser.new() )
+        end
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "failed to read file: " + e.to_s )
+      end
+
+      if ( msa == nil || msa.get_number_of_seqs() < 1 )
+        Util.fatal_error( PRG_NAME, "failed to read MSA" )
+      end
+      begin
+        Util.check_file_for_writability( list_file )
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_, STDOUT )
+      end
+
+      #removed = msa.remove_redundant_sequences!( true )
+      #if removed.size > 0
+      #  Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
+      #  removed.each { | seq_name |
+      #    puts seq_name
+      #  }
+      #  Util.print_message( PRG_NAME, "will process " + msa.get_number_of_seqs.to_s + " non redundant sequences" )
+      #end
+
+      lf = File.open( list_file, "a" )
+      for i in 0 ... msa.get_number_of_seqs
+        seq  = msa.get_sequence( i )
+        seq.set_name( Util::normalize_seq_name( modify_name( seq.get_name(), i, lf, species_map, extract_taxonomy ), 10 ) )
+      end
+
+      io = MsaIO.new()
+      w = nil
+      if ( fasta_like )
+        w = FastaWriter.new()
+      else
+        w = PhylipSequentialWriter.new()
+      end
+      w.set_max_name_length( 10 )
+      w.clean( true )
+      begin
+        io.write_to_file( msa, output, w )
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
+      end
+      lf.close()
+      if ( @taxonomies.length > 0 )
+        Util.print_message( PRG_NAME, "number of unique taxonomies: " + @taxonomies.length.to_s )
+      end
+      Util.print_message( PRG_NAME, "wrote: " + list_file )
+      Util.print_message( PRG_NAME, "wrote: " + output )
+      Util.print_message( PRG_NAME, "OK" )
+    end
+
+    private
+
+    def modify_name( desc, counter, file, species_map, extract_taxonomy )
+      new_desc = nil
+      my_species = nil
+      # if desc =~ /^>?\s*\S{1,10}_([0-9A-Z]{3,5})/
+      if desc =~ /^>?\s*\S{1,10}_([A-Z]{3,5})/
+        new_desc = counter.to_s( 16 ) + "_" + $1
+      elsif SIMPLE
+        new_desc = counter.to_s( 16 )
+      elsif extract_taxonomy
+        if ( desc.count( "[" ) != desc.count( "]" ) )
+          Util.fatal_error( PRG_NAME, "illegal bracket count in: " + desc )
+        end
+        species = nil
+        species_map.each_key do | key |
+          if desc =~ /[\b|_]#{key}\b/  # Added boundaries to prevent e.g. RAT matching ARATH.
+            species = species_map[ key ]
+            new_desc = counter.to_s( 16 ) + "_" + species
+            break
+          end
+        end
+        if species == nil
+          if desc =~/.*\[(\S{3,}?)\]/
+            species = $1
+            species.strip!
+            species.upcase!
+            species.gsub!( /\s+/, " " )
+            species.gsub!( /-/, "" )
+            species.gsub!( /\)/, "" )
+            species.gsub!( /\(/, "" )
+            species.gsub!( /\'/, "" )
+            if species =~ /\S+\s\S+/ || species =~ /\S{3,5}/
+              if species =~ /(\S+)\s(\S+)/
+                code = $1[ 0..2 ] + $2[ 0..1 ]
+              elsif  species =~ /\S{3,5}/
+                code = species
+              elsif species.count( " " ) > 2
+                species =~ /(\S+)\s+(\S+)\s+(\S+)$/
+                third_last = $1
+                second_last = $2
+                last = $3
+                code = code[ 0 ] + third_last[ 0 ] + second_last[ 0 ] + last[ 0 ] + last[ last.size - 1 ]
+              elsif species.count( " " ) > 1
+                species =~ /(\S+)\s+(\S+)$/
+                second_last = $1
+                last = $2
+                code = code[ 0..1 ] + second_last[ 0 ] + last[ 0 ] + last[ last.size - 1 ]
+              end
+              new_desc = counter.to_s( 16 ) + "_" + code
+              if @taxonomies.has_key?( code )
+                if ( !@taxonomies.has_value?( species ) )
+                  Util.fatal_error( PRG_NAME, "code [#{code}] is not unique in [#{desc}]" )
+                end
+              else
+                if ( @taxonomies.has_value?( species ) )
+                  Util.fatal_error( PRG_NAME, "genome [#{species}] is not unique in [#{desc}]" )
+                else
+                  @taxonomies[ code ] = species
+                end
+              end
+            else
+              Util.fatal_error( PRG_NAME, "illegal format [#{species}] in: " + desc )
+            end
+          else
+            Util.fatal_error( PRG_NAME, "illegal format in: " + desc )
+          end
+        end
+      else
+        species = nil
+        my_species = nil
+        species_map.each_key do | key |
+          if desc =~ /#{key}/
+            species = species_map[ key ]
+            species = species.gsub( /\s+/, "" )
+            species = species.gsub( /_/, " " )
+            my_species = species
+            if species =~ /(\S+)\s+(\S+)/
+              species = $1[0..2] + $2[0..1]
+            end
+            species = species.gsub( /\s+/, "" )
+            species = species.slice(0, 5)
+            species.upcase!
+            break
+          end
+        end
+        if species == nil
+          Util.fatal_error( PRG_NAME, "species not found in: " + desc  )
+        else
+          new_desc = counter.to_s( 16 ) + "_" + species
+        end
+      end
+      if new_desc == nil
+        Util.fatal_error( PRG_NAME, "failed to extract species from: " + desc  )
+      end
+      if my_species != nil
+        file.print( new_desc + ": " + desc + " [" + my_species + "]" + "\n" )
+      else
+        file.print( new_desc + ": " + desc + "\n" )
+      end
+      new_desc
+    end
+
+  end # class TaxonomyProcessor
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/tool/tseq_taxonomy_processor.rb b/forester/ruby/evoruby/lib/evo/tool/tseq_taxonomy_processor.rb
new file mode 100644 (file)
index 0000000..f708247
--- /dev/null
@@ -0,0 +1,190 @@
+#
+# = lib/evo/apps/tseq_taxonomy_processor - TseqTaxonomyProcessor class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: tseq_taxonomy_processor.rb,v 1.6 2010/12/13 19:00:11 cmzmasek Exp $
+
+
+require 'lib/evo/util/util'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/msa/msa'
+require 'lib/evo/io/msa_io'
+require 'lib/evo/io/parser/sp_taxonomy_parser'
+require 'lib/evo/io/parser/ncbi_tseq_parser'
+require 'lib/evo/io/writer/fasta_writer'
+require 'lib/evo/io/writer/phylip_sequential_writer'
+require 'lib/evo/util/command_line_arguments'
+
+module Evoruby
+
+    class TseqTaxonomyProcessor
+
+        PRG_NAME       = "tseq_tap"
+        PRG_DATE       = "2009.01.06"
+        PRG_DESC       = "preprocessing of multiple sequence files in ncbi tseq xml format"
+        PRG_VERSION    = "1.02"
+        COPYRIGHT      = "2009 Christian M Zmasek"
+        CONTACT        = "phylosoft@gmail.com"
+        WWW            = "www.phylosoft.org"
+
+        TAXONOMY_CODE           = "TAXONOMY_CODE:"
+        TAXONOMY_ID             = "TAXONOMY_ID:"
+        TAXONOMY_ID_TYPE        = "TAXONOMY_ID_TYPE:"
+        TAXONOMY_SN             = "TAXONOMY_SN:"
+        TAXONOMY_CN             = "TAXONOMY_CN:"
+        SEQ_ACCESSION           = "SEQ_ACCESSION:"
+        SEQ_ACCESSION_SOURCE    = "SEQ_ACCESSION_SOURCE:"
+        SEQ_SECONDARY_ACCESSION = "SEQ_SECONDARY_ACCESSION:"
+        SEQ_SYMBOL              = "SEQ_SYMBOL:"
+        SEQ_NAME                = "SEQ_NAME:"
+        SEQ_MOL_SEQ             = "SEQ_MOL_SEQ:"
+
+        def initialize()
+            @tax_ids_to_sp_taxonomies = Hash.new()
+        end
+
+        def run()
+
+            Util.print_program_information( PRG_NAME,
+                PRG_VERSION,
+                PRG_DESC,
+                PRG_DATE,
+                COPYRIGHT,
+                CONTACT,
+                WWW,
+                STDOUT )
+
+            if  ARGV == nil || ARGV.length != 4
+                puts( "Usage: #{PRG_NAME}.rb <sp taxonomy file> <sequences in tseq xml format> <name for fasta outfile> <name for map outfile>" )
+                puts()
+
+                exit( -1 )
+            end
+
+            begin
+                cla = CommandLineArguments.new( ARGV )
+            rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+            allowed_opts = Array.new
+            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+            if ( disallowed.length > 0 )
+                Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
+            end
+
+            sp_taxonomy_infile = cla.get_file_name( 0 )
+            sequences_infile = cla.get_file_name( 1 )
+            sequences_outfile = cla.get_file_name( 2 )
+            mapping_outfile = cla.get_file_name( 3 )
+
+            Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile )
+            Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile )
+            Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile )
+            Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile )
+
+            sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile )
+
+            Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile )
+
+            tseq_parser = NcbiTSeqParser.new
+            msa_fac = MsaFactory.new
+
+            seqs = msa_fac.create_msa_from_file( sequences_infile, tseq_parser )
+
+            Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile )
+
+            removed = seqs.remove_redundant_sequences!( true, true )
+
+            if removed.size > 0
+                Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
+                removed.each { | seq_name |
+                    puts seq_name
+                }
+                Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" )
+            end
+
+            mapping_out = File.open( mapping_outfile, "a" )
+
+            for i in 0 ... seqs.get_number_of_seqs
+                seq = seqs.get_sequence( i )
+                if seq.get_taxonomy == nil
+                    Util.fatal_error( PRG_NAME, "sequence [" + seq.get_name + "] has no taxonomy information" )
+                end
+                seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) )
+            end
+
+            io = MsaIO.new()
+
+            w = FastaWriter.new()
+
+            w.set_max_name_length( 10 )
+            w.clean( true )
+            begin
+                io.write_to_file( seqs, sequences_outfile, w )
+            rescue Exception => e
+                Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
+            end
+            mapping_out.close()
+
+            Util.print_message( PRG_NAME, "wrote: " + mapping_outfile )
+            Util.print_message( PRG_NAME, "wrote: " + sequences_outfile )
+            Util.print_message( PRG_NAME, "OK" )
+
+        end
+
+        private
+
+        def modify_name( seq, i, sp_taxonomies, mapping_outfile )
+
+            tax_id = seq.get_taxonomy.get_id
+            matching_sp_taxonomy = nil
+
+            if @tax_ids_to_sp_taxonomies.has_key?( tax_id )
+                # This is so that a second lookup will be much faster.
+                matching_sp_taxonomy = @tax_ids_to_sp_taxonomies[ tax_id ]
+            else
+                sp_taxonomies.each { |sp_taxonomy|
+                    if ( sp_taxonomy.id == tax_id )
+                        if  matching_sp_taxonomy != nil
+                            Util.fatal_error( PRG_NAME, "taxonomy id [" + tax_id.to_s + "] is not unique" )
+                        end
+                        matching_sp_taxonomy = sp_taxonomy
+                        @tax_ids_to_sp_taxonomies[ tax_id ] = sp_taxonomy
+                    end
+                }
+            end
+            if  matching_sp_taxonomy == nil
+                Util.fatal_error( PRG_NAME, "taxonomy id [" + tax_id.to_s + "] for [" +  seq.get_taxonomy.get_name + "] not found" )
+            end
+
+            new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code
+
+            seq_name = seq.get_name
+            if  seq_name =~ /\[.+\]$/
+                # Redundant taxonomy information hides here.
+                seq_name = seq_name.sub(/\[.+\]$/, '')
+            end
+            if  seq_name =~ /^\s*hypothetical\s+protein\s*/i
+                # Pointless information.
+                seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' )
+            end
+
+            mapping_outfile.print( new_name + "\t" +
+                 TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
+                 TAXONOMY_ID + tax_id + "\t" +
+                 TAXONOMY_ID_TYPE + seq.get_taxonomy.get_id_source + "\t" +
+                 TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
+                 SEQ_ACCESSION + seq.get_accession + "\t" +
+                 SEQ_ACCESSION_SOURCE + seq.get_accession_source + "\t" +
+                 SEQ_SYMBOL + seq.get_symbol + "\t" +
+                 SEQ_NAME + seq_name + "\t" +
+                 SEQ_MOL_SEQ + seq.get_sequence_as_string +
+                 Constants::LINE_DELIMITER )
+            new_name
+        end
+
+    end 
+
+end # module Evoruby
\ No newline at end of file