initial commit
author cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 9 Feb 2011 01:20:06 +0000 (01:20 +0000)
committer cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 9 Feb 2011 01:20:06 +0000 (01:20 +0000)
72 files changed:
forester/ruby/00_README.txt [new file with mode: 0644]
forester/ruby/evoruby/exe/d2f.rb [new file with mode: 0755]
forester/ruby/evoruby/exe/dsx.rb [new file with mode: 0755]
forester/ruby/evoruby/exe/evo_nursery.rb [new file with mode: 0755]
forester/ruby/evoruby/exe/fae.rb [new file with mode: 0755]
forester/ruby/evoruby/exe/fasta_tap.rb [new file with mode: 0755]
forester/ruby/evoruby/exe/hsp.rb [new file with mode: 0755]
forester/ruby/evoruby/exe/msa_pro.rb [new file with mode: 0755]
forester/ruby/evoruby/exe/mse.rb [new file with mode: 0755]
forester/ruby/evoruby/exe/phylogenies_decorator.rb [new file with mode: 0755]
forester/ruby/evoruby/exe/phylogeny_factory.rb [new file with mode: 0644]
forester/ruby/evoruby/exe/run_phylo_pipeline.rb [new file with mode: 0755]
forester/ruby/evoruby/exe/tap.rb [new file with mode: 0755]
forester/ruby/evoruby/exe/test.rb [new file with mode: 0755]
forester/ruby/evoruby/exe/tseq_tap.rb [new file with mode: 0755]
forester/ruby/evoruby/files/00_phylogeny_factory.template [new file with mode: 0644]
forester/ruby/evoruby/files/00_sample_tap_mapfile [new file with mode: 0644]
forester/ruby/evoruby/files/test/fasta_file.txt [new file with mode: 0644]
forester/ruby/evoruby/files/test/general_msa_file.txt [new file with mode: 0644]
forester/ruby/evoruby/files/test/ncbi_tseq.xml [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/apps/domains_to_forester.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/apps/evo_nursery.rb [new file with mode: 0755]
forester/ruby/evoruby/lib/evo/apps/fasta_extractor.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/apps/fasta_taxonomy_processor.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/apps/hmmscan_parser.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/apps/msa_processor.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/apps/multi_sequence_extractor.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/apps/new_tap.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/apps/phylogenies_decorator.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/apps/phylogeny_factory.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/apps/taxonomy_processor.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/apps/tseq_taxonomy_processor.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/io/msa_io.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/io/parser/basic_table_parser.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/io/parser/fasta_parser.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/io/parser/general_msa_parser.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/io/parser/hmmsearch_domain_extractor.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/io/parser/msa_parser.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/io/parser/ncbi_tseq_parser.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/io/parser/sp_taxonomy_parser.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/io/writer/fasta_writer.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/io/writer/msa_writer.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/io/writer/nexus_writer.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/io/writer/phylip_sequential_writer.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/msa/msa.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/msa/msa_factory.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/sequence/domain_structure.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/sequence/protein_domain.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/sequence/sequence.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/soft/fastme.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/soft/raxml.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/soft/resource_locations.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/soft/tree_puzzle.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/table/basic_table.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/taxonomy/sp_taxonomy.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/taxonomy/taxonomy.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/util/command_line_arguments.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/util/constants.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/util/util.rb [new file with mode: 0644]
forester/ruby/scripts/delete_ext_nodes.rb [new file with mode: 0755]
forester/ruby/scripts/hmm_split.rb [new file with mode: 0755]
forester/ruby/scripts/parameters.rb_dir_qsub [new file with mode: 0644]
forester/ruby/scripts/pfam2go_reformat.rb [new file with mode: 0755]
forester/ruby/scripts/pfam_summarize.rb [new file with mode: 0755]
forester/ruby/scripts/pfam_to_scop.rb [new file with mode: 0755]
forester/ruby/scripts/rb_dir_qsub.rb [new file with mode: 0644]
forester/ruby/scripts/rb_dir_x.rb [new file with mode: 0644]
forester/ruby/scripts/rb_qsub.rb [new file with mode: 0755]
forester/ruby/scripts/replace.rb [new file with mode: 0755]
forester/ruby/scripts/replace_id.rb [new file with mode: 0644]
forester/ruby/scripts/scoptastic.rb [new file with mode: 0755]

diff --git a/forester/ruby/00_README.txt b/forester/ruby/00_README.txt
new file mode 100644 (file)
index 0000000..28f1768
--- /dev/null
@@ -0,0 +1,5 @@
+This folder contains the "evoruby" Ruby classes.
+They are not (yet) essential for the rest of the FORESTER package.
+
+To use evoruby, add "path/to/forester-atv/ruby/evoruby" to RUBYLIB.
+(e.g., for C shell: "setenv RUBYLIB $HOME/forester-atv/ruby/evoruby:$HOME/some/other/rubylibrary")
\ No newline at end of file
diff --git a/forester/ruby/evoruby/exe/d2f.rb b/forester/ruby/evoruby/exe/d2f.rb
new file mode 100755 (executable)
index 0000000..cb48db9
--- /dev/null
@@ -0,0 +1,20 @@
+#!/usr/local/bin/ruby -w
+#
+# = exe/d2f
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: d2f.rb,v 1.3 2008/08/28 17:09:06 cmzmasek Exp $
+#
+# last modified: 06/11/2007
+
+require 'lib/evo/apps/domains_to_forester'
+
+module Evoruby
+
+    dtf = DomainsToForester.new()
+
+    dtf.run()
+
+end  # module Evoruby
diff --git a/forester/ruby/evoruby/exe/dsx.rb b/forester/ruby/evoruby/exe/dsx.rb
new file mode 100755 (executable)
index 0000000..1ff35f7
--- /dev/null
@@ -0,0 +1,20 @@
+#!/usr/local/bin/ruby -w
+#
+# = exe/dsx
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: dsx.rb,v 1.3 2008/08/28 17:09:06 cmzmasek Exp $
+#
+# last modified: 06/11/2007
+
+require 'lib/evo/apps/domain_sequence_extractor'
+
+module Evoruby
+
+    dsx = DomainSequenceExtractor.new()
+
+    dsx.run()
+
+end  # module Evoruby
diff --git a/forester/ruby/evoruby/exe/evo_nursery.rb b/forester/ruby/evoruby/exe/evo_nursery.rb
new file mode 100755 (executable)
index 0000000..0ea157d
--- /dev/null
@@ -0,0 +1,20 @@
+#!/usr/local/bin/ruby -W0
+#
+# = exe/evo_nursery
+#
+# Copyright::  Copyright (C) 2009-2010 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: evo_nursery.rb,v 1.1 2009/10/07 21:59:41 cmzmasek Exp $
+#
+# last modified: 2009/10/07
+
+require 'lib/evo/apps/evo_nursery'
+
+module Evoruby
+
+    en = EvoNursery.new()
+
+    en.run()
+
+end  # module Evoruby
\ No newline at end of file
diff --git a/forester/ruby/evoruby/exe/fae.rb b/forester/ruby/evoruby/exe/fae.rb
new file mode 100755 (executable)
index 0000000..009fa0b
--- /dev/null
@@ -0,0 +1,19 @@
+#!/usr/local/bin/ruby -w
+#
+# = exe/fae
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: fae.rb,v 1.1 2008/09/10 02:16:34 cmzmasek Exp $
+
+
+require 'lib/evo/apps/fasta_extractor'
+
+module Evoruby
+    
+    mse = FastaExtractor.new()
+    
+    mse.run()
+    
+end  # module Evoruby
\ No newline at end of file
diff --git a/forester/ruby/evoruby/exe/fasta_tap.rb b/forester/ruby/evoruby/exe/fasta_tap.rb
new file mode 100755 (executable)
index 0000000..60afce9
--- /dev/null
@@ -0,0 +1,19 @@
+#!/usr/local/bin/ruby -w
+#
+# = exe/fasta_tap
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: fasta_tap.rb,v 1.1 2009/01/20 20:44:54 cmzmasek Exp $
+
+
+require 'lib/evo/apps/fasta_taxonomy_processor'
+
+module Evoruby
+
+    tap = FastaTaxonomyProcessor.new()
+
+    tap.run()
+
+end  # module Evoruby
diff --git a/forester/ruby/evoruby/exe/hsp.rb b/forester/ruby/evoruby/exe/hsp.rb
new file mode 100755 (executable)
index 0000000..cf0febc
--- /dev/null
@@ -0,0 +1,20 @@
+#!/usr/local/bin/ruby -w
+#
+# = exe/hsp
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: hsp.rb,v 1.1 2009/11/25 05:42:04 cmzmasek Exp $
+#
+# last modified: 11/24/2009
+
+require 'lib/evo/apps/hmmscan_parser'
+
+module Evoruby
+
+    hsp = HmmscanParser.new()
+
+    hsp.run()
+
+end  # module Evoruby
diff --git a/forester/ruby/evoruby/exe/msa_pro.rb b/forester/ruby/evoruby/exe/msa_pro.rb
new file mode 100755 (executable)
index 0000000..3fd2566
--- /dev/null
@@ -0,0 +1,20 @@
+#!/usr/local/bin/ruby -w
+#
+# = exe/msa_pro
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: msa_pro.rb,v 1.4 2008/08/28 17:09:06 cmzmasek Exp $
+#
+
+
+require 'lib/evo/apps/msa_processor'
+
+module Evoruby
+
+    mp = MsaProcessor.new()
+
+    mp.run()
+
+end  # module Evoruby
diff --git a/forester/ruby/evoruby/exe/mse.rb b/forester/ruby/evoruby/exe/mse.rb
new file mode 100755 (executable)
index 0000000..d0d7612
--- /dev/null
@@ -0,0 +1,19 @@
+#!/usr/local/bin/ruby -w
+#
+# = exe/mse
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: mse.rb,v 1.2 2008/08/28 17:09:06 cmzmasek Exp $
+
+
+require 'lib/evo/apps/multi_sequence_extractor'
+
+module Evoruby
+    
+    mse = MultiSequenceExtractor.new()
+    
+    mse.run()
+    
+end  # module Evoruby
\ No newline at end of file
diff --git a/forester/ruby/evoruby/exe/phylogenies_decorator.rb b/forester/ruby/evoruby/exe/phylogenies_decorator.rb
new file mode 100755 (executable)
index 0000000..a23861b
--- /dev/null
@@ -0,0 +1,20 @@
+#!/usr/local/bin/ruby -w
+#
+# = exe/phylogenies_decorator
+#
+# Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: phylogenies_decorator.rb,v 1.3 2008/08/28 17:09:06 cmzmasek Exp $
+#
+
+require 'lib/evo/apps/phylogenies_decorator'
+
+module Evoruby
+
+    fd = PhylogeniesDecorator.new
+
+    fd.run
+
+end  # module Evoruby
diff --git a/forester/ruby/evoruby/exe/phylogeny_factory.rb b/forester/ruby/evoruby/exe/phylogeny_factory.rb
new file mode 100644 (file)
index 0000000..4682571
--- /dev/null
@@ -0,0 +1,20 @@
+#!/usr/local/bin/ruby -w
+# 
+# = exe/phylogeny_factory
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: phylogeny_factory.rb,v 1.5 2008/08/28 17:09:06 cmzmasek Exp $
+#
+
+require 'lib/evo/apps/phylogeny_factory'
+
+module Evoruby
+
+    pf = PhylogenyFactory.new
+
+    pf.run
+
+end  # module Evoruby
diff --git a/forester/ruby/evoruby/exe/run_phylo_pipeline.rb b/forester/ruby/evoruby/exe/run_phylo_pipeline.rb
new file mode 100755 (executable)
index 0000000..d68c58b
--- /dev/null
@@ -0,0 +1,81 @@
+#!/usr/local/bin/ruby -w
+#
+# = run_phylo_pipeline
+#
+# Copyright::  Copyright (C) 2010 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: run_phylo_pipeline.rb,v 1.15 2010/10/09 02:35:42 cmzmasek Exp $
+#
+#
+
+
+#  hmmscan --nobias --domtblout <BACTH_CHIPI>_hmmscan_240_10 -E 10 /home/czmasek/DATA/PFAM/PFAM240/Pfam-A.hmm <BACTH_CHIPI>.fasta
+
+#  hsp <BACTH_CHIPI>_hmmscan_240_10 <BACTH_CHIPI>_hmmscan_240_10_domain_table
+
+#  d2f -e=10 <BACTH_CHIPI>_hmmscan_240_10_domain_table <BACTH_CHIPI>.fasta <BACTH_CHIPI>_hmmscan_240_10.dff
+
+# hmmsearch --nobias -E 1000 --domtblout <BACTH_CHIPI>.hmmsearch_SusD  <~/DATA/PFAM/PFAM240/PFAM_A_HMMs/SusD.hmm> BACTH_CHIPI.fasta
+
+# dsx -dd -e=<1e-2> -l=<200> <BACTH_CHIPI>.hmmsearch_SusD <BACTH_CHIPI>.fasta BACTH_CHIPI_e2_200
+
+
+module Evoruby
+
+  class RunPhyloPipeline
+
+    def run
+      unless ARGV.length == 4
+        puts "arguments are: [inputfile].fasta [hmm-name] [min-length] [neg e-value exponent]"
+        exit
+      end
+
+      hmmscan   = "/home/czmasek/SOFTWARE/HMMER/hmmer-3.0b3/src/hmmscan"
+      hmmsearch = "/home/czmasek/SOFTWARE/HMMER/hmmer-3.0b3/src/hmmsearch"
+      hsp       = "/home/czmasek/SOFTWARE/FORESTER/DEV/forester-atv/ruby/evoruby/exe/hsp.rb"
+      d2f       = "/home/czmasek/SOFTWARE/FORESTER/DEV/forester-atv/ruby/evoruby/exe/d2f.rb"
+      dsx       = "/home/czmasek/SOFTWARE/FORESTER/DEV/forester-atv/ruby/evoruby/exe/dsx.rb"
+
+      base_name   = ARGV[ 0 ]
+      hmm         = ARGV[ 1 ]
+      length      = ARGV[ 2 ]
+      e_value_exp = ARGV[ 3 ]
+      do_domain_combination_analysis = true
+
+      if do_domain_combination_analysis
+
+        cmd = "#{hmmscan} --nobias --domtblout #{base_name}_hmmscan_240_10 -E 10 /home/czmasek/DATA/PFAM/PFAM240/Pfam-A.hmm #{base_name}.fasta"
+        run_command( cmd )
+
+        cmd = "#{hsp} #{base_name}_hmmscan_240_10 #{base_name}_hmmscan_240_10_domain_table"
+        run_command( cmd )
+
+        cmd = "#{d2f} -e=10 #{base_name}_hmmscan_240_10_domain_table #{base_name}.fasta #{base_name}_hmmscan_240_10.dff"
+        run_command( cmd )
+
+      end
+
+      cmd = "#{hmmsearch} --nobias -E 1000 --domtblout #{base_name}.hmmsearch_#{hmm}  ~/DATA/PFAM/PFAM240/PFAM_A_HMMs/#{hmm}.hmm #{base_name}.fasta"
+      run_command( cmd )
+
+      cmd = "#{dsx} -dd -e=1e-#{e_value_exp.to_s} -l=#{length} #{base_name}.hmmsearch_#{hmm} #{base_name}.fasta #{base_name}_e#{e_value_exp.to_s}_#{length}"
+      run_command( cmd )
+
+    end
+
+    def run_command( cmd )
+      puts cmd
+      `#{cmd}`
+    end
+
+  end
+
+  p = RunPhyloPipeline.new()
+
+  p.run()
+
+end
+
+
+
diff --git a/forester/ruby/evoruby/exe/tap.rb b/forester/ruby/evoruby/exe/tap.rb
new file mode 100755 (executable)
index 0000000..12af133
--- /dev/null
@@ -0,0 +1,20 @@
+#!/usr/local/bin/ruby -w
+#
+# = exe/tap
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: tap.rb,v 1.3 2008/08/28 17:09:06 cmzmasek Exp $
+#
+# last modified: 05/18/2007
+
+require 'lib/evo/apps/taxonomy_processor'
+
+module Evoruby
+
+    tap = TaxonomyProcessor.new()
+
+    tap.run()
+
+end  # module Evoruby
diff --git a/forester/ruby/evoruby/exe/test.rb b/forester/ruby/evoruby/exe/test.rb
new file mode 100755 (executable)
index 0000000..8a4431d
--- /dev/null
@@ -0,0 +1,1171 @@
+#!/usr/local/bin/ruby -w
+#
+# = exe/test - Test class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: test.rb,v 1.18 2010/10/08 22:04:17 cmzmasek Exp $
+#
+# last modified: 05/15/2007
+
+
+require 'lib/evo/util/constants'
+require 'lib/evo/taxonomy/taxonomy'
+require 'lib/evo/sequence/sequence'
+require 'lib/evo/msa/msa'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/sequence/domain_structure'
+require 'lib/evo/sequence/protein_domain'
+require 'lib/evo/table/basic_table'
+require 'lib/evo/io/msa_io'
+require 'lib/evo/io/writer/phylip_sequential_writer'
+require 'lib/evo/io/writer/nexus_writer'
+require 'lib/evo/io/writer/fasta_writer'
+require 'lib/evo/io/parser/fasta_parser'
+require 'lib/evo/io/parser/ncbi_tseq_parser'
+require 'lib/evo/io/parser/hmmsearch_domain_extractor'
+require 'lib/evo/apps/domain_sequence_extractor'
+require 'lib/evo/apps/hmmscan_parser'
+require 'lib/evo/apps/domains_to_forester'
+require 'lib/evo/io/parser/general_msa_parser'
+require 'lib/evo/io/parser/basic_table_parser'
+require 'lib/evo/util/command_line_arguments'
+require 'lib/evo/soft/fastme'
+require 'lib/evo/soft/tree_puzzle'
+
+
+
+module Evoruby
+
+    class Test
+
+        GENERAL_MSA_FILE = "files/test/general_msa_file.txt"
+        FASTA_FILE       = "files/test/fasta_file.txt"
+        TSEQ_FILE        = "files/test/ncbi_tseq.xml"
+
+        def initialize()
+            @failures  = 0
+            @successes = 0
+        end
+
+
+
+        def test_taxonomy()
+            begin
+                tax = Taxonomy.new( "pig" )
+
+                if tax.get_name != "pig"
+                    return false
+                end
+
+                tax1 = Taxonomy.new( "dog", "id", "source" )
+                tax2 = tax1.copy
+
+                if tax2.get_name != "dog"
+                    return false
+                end
+                if tax2.get_id != "id"
+                    return false
+                end
+                if tax2.get_id_source != "source"
+                    return false
+                end
+
+                if !( tax1 == tax2 )
+                    return false
+                end
+
+                if !( tax1 == tax1 )
+                    return false
+                end
+
+                tax3 = Taxonomy.new( "dog", "id"  )
+                if ( tax1 == tax3 )
+                    return false
+                end
+
+                tax4 = Taxonomy.new( "dog" )
+                tax5 = Taxonomy.new( "dog" )
+                if !( tax4 == tax5 )
+                    return false
+                end
+
+            rescue Exception => e
+                puts()
+                puts( e.to_s )
+                puts()
+                return false
+            end
+            return true
+        end
+
+
+        def test_sequence()
+            begin
+                seq = Sequence.new( "seq1", "WLIQ" )
+                if ( seq.get_length != 4 )
+                    return false
+                end
+                if ( seq.get_residue( 3 ) != "Q" )
+                    return false
+                end
+                seq.append!( "E?-*X_Y" )
+                if ( seq.get_length != 11 )
+                    return false
+                end
+                if ( seq.get_residue( 3 ) != "Q" )
+                    return false
+                end
+                if ( seq.get_residue( 4 ) != "E" )
+                    return false
+                end
+                seq.append!( "A V_" )
+                if ( seq.get_length != 15 )
+                    return false
+                end
+                if ( !Test::same?( seq.get_gap_length, 5 ) )
+                    return false
+                end
+                if ( !Test::same?( seq.get_gap_ratio, 5.0 / 15.0 ) )
+                    return false
+                end
+                seq.delete_residue!( 0 )
+                seq.delete_residue!( 2 )
+                seq2 = seq.copy()
+                seq.delete_residue!( 0 )
+                seq.delete_residue!( 0 )
+                seq = nil
+                if ( seq2.get_length != 13 )
+                    return false
+                end
+                if ( seq2.get_sequence_as_string != "LIE?-*X_YA V_" )
+                    return false
+                end
+                if ( seq2.get_slice( 2, 2 ) != "E?" )
+                    return false
+                end
+                if ( seq2.get_slice( 0, 1 ) != "L" )
+                    return false
+                end
+                if ( seq2.get_subsequence( 1, 4 ).get_sequence_as_string != "IE?-" )
+                    return false
+                end
+                if ( seq2.get_name() != "seq1" )
+                    return false
+                end
+                if ( seq2.get_slice!( 2, 2 ) != "E?" )
+                    return false
+                end
+                if ( seq2.get_sequence_as_string != "LI-*X_YA V_" )
+                    return false
+                end
+                if ( seq2.get_length != 11 )
+                    return false
+                end
+                if ( seq2.get_character_code( 0 ) != 76 )
+                    return false
+                end
+                str_0 = " Li-*X_YA V_ 3 3    1212 ?? B1J OU.Z "
+                if ( Util.clean_seq_str( str_0 ) != "LI-X-YAV-XXXXXX-X" )
+                    return false
+                end
+
+                tax = Taxonomy.new( "dog", "tax_id", "tax_source" )
+                seqn = Sequence.new( "seqn", "VVVVV", "acc", "acc source", tax, "symbol", "2accession", "2source" )
+                seqc = seqn.copy
+                if ( seqc.get_name() != "seqn" )
+                    return false
+                end
+                if ( seqc.get_accession() != "acc" )
+                    return false
+                end
+                if ( seqc.get_accession_source() != "acc source" )
+                    return false
+                end
+                if ( seqc.get_taxonomy.get_name != "dog" )
+                    return false
+                end
+                if ( seqc.get_taxonomy.get_id != "tax_id" )
+                    return false
+                end
+                if ( seqc.get_symbol != "symbol" )
+                    return false
+                end
+                if ( seqc.get_secondary_accession != "2accession" )
+                    return false
+                end
+                if ( seqc.get_secondary_accession_source != "2source" )
+                    return false
+                end
+
+            rescue Exception => e
+                puts()
+                puts( e.to_s )
+                puts()
+                return false
+            end
+            return true
+        end
+
+
+        def test_msa()
+            begin
+                msa = Msa.new()
+                seq0 = Sequence.new( "seq 0", "a-*-_ x-ijklmnopqrstuvwxyz" )
+                seq1 = Sequence.new( "seq 1", "ab--_ X-ijklmnopqrstuvwxyz" )
+                seq2 = Sequence.new( "seq 2", "abc-_?x-ijklmnopqrstuvwxyz" )
+                seq3 = Sequence.new( "seq 3", "abcd_?x-ijklmnopqrstuvwxyz" )
+                seq4 = Sequence.new( "seq 4", "abcde?x-ijklmnopqrstuvwxyz" )
+                seq5 = Sequence.new( "seq 5", "abcdefx-ijklmnopqrstuvwxyz" )
+                msa.add_sequence( seq0 );
+                msa.add_sequence( seq1 );
+                msa.add_sequence( seq2 );
+                msa.add_sequence( seq3 );
+                msa.add_sequence( seq4 );
+                msa.add_sequence( seq5 );
+                msa.add(                      "seq 6", "abcdefg-ijklmnopqrstuvwxyz" );
+                if ( msa.get_sequence( 0 ).get_name() != "seq 0" )
+                    return false
+                end
+                if ( msa.get_by_name( "Eq 1", false, true ).get_name != "seq 1" )
+                    return false
+                end
+                if ( msa.find_by_name( "Eq 2", false, true )[ 0 ] != 2 )
+                    return false
+                end
+                if ( !msa.is_aligned )
+                    return false
+                end
+                if ( msa.get_number_of_seqs != 7 )
+                    return false
+                end
+                if ( msa.get_length != 26 )
+                    return false
+                end
+                msa.add( "seq 7", "abcdefgqijklmnopqrstuvwxyz" );
+                if ( msa.get_number_of_seqs != 8 )
+                    return false
+                end
+                msa.remove_sequence!( 7 )
+                if ( msa.get_number_of_seqs != 7 )
+                    return false
+                end
+                msa.remove_gap_only_columns!()
+                if ( msa.get_length() != 25 )
+                    return false
+                end
+                if ( msa.get_by_name( "seq 0" ).get_sequence_as_string != "a-*-_ xijklmnopqrstuvwxyz" )
+                    return false
+                end
+                msa.remove_gap_columns_w_gap_ratio!( 6.1 / 7.0 )
+                if ( msa.get_length() != 25 )
+                    return false
+                end
+                msa.remove_gap_columns_w_gap_ratio!( 6.0 / 7.0 )
+                if ( msa.get_length() != 25 )
+                    return false
+                end
+                if ( msa.get_by_name( "seq 0" ).get_sequence_as_string != "a-*-_ xijklmnopqrstuvwxyz" )
+                    return false
+                end
+                msa.remove_gap_columns_w_gap_ratio!( 5.0 / 7.0 )
+                if ( msa.get_length() != 25 )
+                    return false
+                end
+                if ( msa.get_by_name( "seq 0" ).get_sequence_as_string != "a-*-_ xijklmnopqrstuvwxyz" )
+                    return false
+                end
+                msa.remove_gap_columns_w_gap_ratio!( 2.0 / 7.0 )
+                if ( msa.get_length() != 23 )
+                    return false
+                end
+                if ( msa.get_by_name( "seq 0" ).get_sequence_as_string != "a-* xijklmnopqrstuvwxyz" )
+                    puts msa.get_by_name( "seq 0" ).get_sequence_as_string
+                    return false
+                end
+                msa.remove_gap_columns_w_gap_ratio!( 1.0 / 7.0 )
+                if ( msa.get_length() != 21 )
+                    return false
+                end
+                if ( msa.get_by_name( "seq 0" ).get_sequence_as_string != "a-xijklmnopqrstuvwxyz" )
+                    return false
+                end
+                msa2 = Evoruby::Msa.new()
+                msa2.add( "seq0", "abcdefgh" );
+                msa2.add( "seq1", "a-cdefgh" );
+                msa2.add( "seq2", "a--defgh" );
+                msa2.add( "seq3", "a---efgh" );
+                msa2.add( "seq4", "a----fgh" );
+                msa2.add( "seq5", "a" );
+                if ( msa2.is_aligned )
+                    return false
+                end
+                msa2.remove_sequence!( 5 )
+                if ( !msa2.is_aligned )
+                    return false
+                end
+                if ( msa2.get_number_of_seqs != 5 )
+                    return false
+                end
+                msa2.remove_gap_only_columns!()
+
+                if ( msa2.get_length != 8 )
+                    return false
+                end
+
+                msa2.remove_sequences_by_gap_ratio!( 4.0 / 8.0 )
+                if ( msa2.get_number_of_seqs != 5 )
+                    return false
+                end
+                msa2.remove_sequences_by_gap_ratio!( 3.0 / 8.0 )
+                if ( msa2.get_number_of_seqs != 4 )
+                    return false
+                end
+                msa2.remove_sequences_by_gap_ratio!( 1.0 / 8.0 )
+                if ( msa2.get_number_of_seqs != 2 )
+                    return false
+                end
+                msa2.remove_sequences_by_gap_ratio!( 0.0 )
+                if ( msa2.get_number_of_seqs != 1 )
+                    return false
+                end
+                msa2.add( "seq1", "a-cdefgh" );
+                msa2.add( "seq2", "a--defgh" );
+                msa2.add( "seq3", "a---efgh" );
+                msa2.add( "seq4", "a----fgh" );
+
+                msa2.remove_sequences_by_non_gap_length!( 4 )
+                if ( msa2.get_number_of_seqs != 5 )
+                    return false
+                end
+                msa2.remove_sequences_by_non_gap_length!( 5 )
+                if ( msa2.get_number_of_seqs != 4 )
+                    return false
+                end
+                msa2.remove_sequences_by_non_gap_length!( 8 )
+                if ( msa2.get_number_of_seqs != 1 )
+                    return false
+                end
+                msa2.add( "seq1", "a-cdefgh" );
+                msa2.add( "seq2", "a--defgh" );
+                msa2.add( "seq3", "a---efgh" );
+                msa2.add( "seq4", "a----fgh" );
+                msa2.trim!( 0, 7 )
+                if ( msa2.get_by_name( "seq0" ).get_sequence_as_string != "abcdefgh" )
+                    return false
+                end
+                msa2.trim!( 3, 4 )
+                if ( msa2.get_by_name( "seq0" ).get_sequence_as_string != "de" )
+                    return false
+                end
+                msa3 = Evoruby::Msa.new()
+                msa3.add( "seq0", "abcdefgh-abcdef--*" );
+                msa3.add( "seq1", "b-deefgh-a____f--*" );
+                msa3.add( "seq2", "A________abcdef--*" );
+                msa3.add( "seq3", "A   Efgh---------*" );
+                msa3.add( "seq4", "    eFhh---------*" );
+                msa3.add( "seq5", "----------------ee" );
+                if ( !Test::same?( msa3.calculate_overlap( 0, 0 ), 14 ) )
+                    return false
+                end
+                if ( !Test::same?( msa3.calculate_overlap( 0, 1 ), 9 ) )
+                    return false
+                end
+                if ( !Test::same?( msa3.calculate_overlap( 0, 5 ), 0 ) )
+                    return false
+                end
+                if ( !Test::same?( msa3.calculate_overlap( 4, 5 ), 0 ) )
+                    return false
+                end
+                if ( !msa3.overlap?( 2, 3 ) )
+                    return false
+                end
+                if ( msa3.overlap?( 2, 3, 2 ) )
+                    return false
+                end
+                if ( msa3.overlap?( 4, 5 ) )
+                    return false
+                end
+                if ( !Test::same?( msa3.calculate_identities( 4, 5 ), 0 ) )
+                    return false
+                end
+                if ( !Test::same?( msa3.calculate_identities( 3, 4 ), 3 ) )
+                    return false
+                end
+                if ( msa3.split_into_overlapping_msa.length != 3 )
+                    return false
+                end
+                if ( msa3.split_into_overlapping_msa( 5 ).length != 4 )
+                    return false
+                end
+
+
+                msa4 = Msa.new()
+                seq0 = Sequence.new( "seq 0", "ABCDED" )
+                seq1 = Sequence.new( "seq 1", "ABCDEE" )
+                seq2 = Sequence.new( "seq 2", "abcded" )
+                seq3 = Sequence.new( "seq 3", " ABCDEE" )
+                seq4 = Sequence.new( "seq 4", "ABCDEV" )
+                seq5 = Sequence.new( "seq 5", "ABCDED" )
+                seq6 = Sequence.new( "seq 6", "AB.DEI" )
+                seq7 = Sequence.new( "seq 7", "aB-DEi*" )
+                seq8 = Sequence.new( "seq 8", "ABCDED" )
+                seq9 = Sequence.new( "seq 9", "ABCDED" )
+                seq10 = Sequence.new( "seq 10", "ABCDED" )
+                seq11 = Sequence.new( "seq 11", "ABCDED" )
+                msa4.add_sequence( seq0 );
+                msa4.add_sequence( seq1 );
+                msa4.add_sequence( seq2 );
+                msa4.add_sequence( seq3 );
+                msa4.add_sequence( seq4 );
+                msa4.add_sequence( seq5 );
+                msa4.add_sequence( seq6 );
+                msa4.add_sequence( seq7 );
+                msa4.add_sequence( seq8 );
+                msa4.add_sequence( seq9 );
+                msa4.add_sequence( seq10 );
+                msa4.add_sequence( seq11 );
+
+                msa4.remove_redundant_sequences!
+
+                puts msa4.to_str
+
+                if msa4.get_number_of_seqs != 4
+                    return false
+                end
+
+                if msa4.get_sequence( 0 ).get_name != "seq 0"
+                    return false
+                end
+                if msa4.get_sequence( 1 ).get_name != "seq 1"
+                    return false
+                end
+                if msa4.get_sequence( 2 ).get_name != "seq 4"
+                    return false
+                end
+                if msa4.get_sequence( 3 ).get_name != "seq 6"
+                    return false
+                end
+
+            rescue Exception => e
+                puts()
+                puts( e.to_s )
+                puts()
+                return false
+            end
+            return true
+        end
+
+        # Smoke test for MsaFactory: passes when the constructor does not raise.
+        # Returns true on success; on any exception prints its message
+        # (framed by blank lines) and returns false.
+        def test_msa_factory()
+            factory = MsaFactory.new()
+            true
+        rescue Exception => error
+            puts()
+            puts( error.to_s )
+            puts()
+            false
+        end
+
+        # Smoke test for DomainStructure: passes when an instance can be
+        # constructed with the argument 190 without raising.
+        # Returns true on success; on any exception prints its message
+        # (framed by blank lines) and returns false.
+        def test_domain_structure()
+            structure = DomainStructure.new( 190 )
+            true
+        rescue Exception => error
+            puts()
+            puts( error.to_s )
+            puts()
+            false
+        end
+
+        # Smoke test for ProteinDomain: passes when an instance can be
+        # constructed from a fixed set of sample arguments without raising.
+        # Returns true on success; on any exception prints its message
+        # (framed by blank lines) and returns false.
+        def test_protein_domain()
+            domain = ProteinDomain.new( "domain", 23, 466, "d1", 0.4 )
+            true
+        rescue Exception => error
+            puts()
+            puts( error.to_s )
+            puts()
+            false
+        end
+
+        # Tests BasicTable value storage.
+        # Expects that writing a cell twice keeps the last value and that a cell
+        # which was never written reads back as the empty string.
+        # Returns true when both expectations hold, false otherwise; on any
+        # exception prints its message (framed by blank lines) and returns false.
+        def test_basic_table()
+            table = BasicTable.new()
+            table.set_value( 233, 923, "snake" )
+            # Second write to the same cell must replace "snake".
+            table.set_value( 233, 923, "lizard" )
+            return false if table.get_value_as_string( 233, 923 ) != "lizard"
+            # Cell ( 33, 23 ) was never set.
+            return false if table.get_value_as_string( 33, 23 ) != ""
+            true
+        rescue Exception => error
+            puts()
+            puts( error.to_s )
+            puts()
+            false
+        end
+
+        # Smoke test for MsaIO: passes when the constructor does not raise.
+        # Returns true on success; on any exception prints its message
+        # (framed by blank lines) and returns false.
+        def test_msa_io()
+            io = MsaIO.new()
+            true
+        rescue Exception => error
+            puts()
+            puts( error.to_s )
+            puts()
+            false
+        end
+
+        # Smoke test for PhylipSequentialWriter: passes when the constructor
+        # does not raise.
+        # Returns true on success; on any exception prints its message
+        # (framed by blank lines) and returns false.
+        def test_phylip_sequentialwriter()
+            writer = PhylipSequentialWriter.new()
+            true
+        rescue Exception => error
+            puts()
+            puts( error.to_s )
+            puts()
+            false
+        end
+
+        # Smoke test for NexusWriter: passes when the constructor does not raise.
+        # Returns true on success; on any exception prints its message
+        # (framed by blank lines) and returns false.
+        def test_nexus_writer()
+            writer = NexusWriter.new()
+            true
+        rescue Exception => error
+            puts()
+            puts( error.to_s )
+            puts()
+            false
+        end
+
+        # Smoke test for FastaWriter: passes when the constructor does not raise.
+        # Returns true on success; on any exception prints its message
+        # (framed by blank lines) and returns false.
+        def test_fasta_writer()
+            writer = FastaWriter.new()
+            true
+        rescue Exception => error
+            puts()
+            puts( error.to_s )
+            puts()
+            false
+        end
+
+        # Tests GeneralMsaParser by reading GENERAL_MSA_FILE through MsaFactory.
+        #
+        # path_to_evoruby - directory the file name is appended to; when it is
+        #                   not empty, Constants::FILE_SEPARATOR is inserted
+        #                   between the directory and the file name.
+        #
+        # Expects an alignment of length 29 holding 7 sequences whose names and
+        # residues match the table below.
+        # Returns true when every expectation holds, false otherwise; on any
+        # exception prints its message (framed by blank lines) and returns false.
+        def test_general_msa_parser( path_to_evoruby )
+            parser  = GeneralMsaParser.new()
+            factory = MsaFactory.new()
+            sep = if Util::is_string_empty?( path_to_evoruby )
+                ""
+            else
+                Constants::FILE_SEPARATOR
+            end
+            msa = factory.create_msa_from_file( path_to_evoruby + sep + GENERAL_MSA_FILE, parser )
+
+            return false if msa.get_length() != 29
+            return false if msa.get_number_of_seqs() != 7
+
+            # Expected [ name, residues ] per sequence, in alignment order.
+            expected = [
+                [ "sequence0", "ABCDE.GHIJKLMNOPQR.TUVWabcxy0" ],
+                [ "sequence1", "abcdefghijklmnopqrstuvwabcxy1" ],
+                [ "sequence2", "abcdefghijkl---x_-*?_XXabcxy2" ],
+                [ "sequence3", "12345678901234567890123abcxy3" ],
+                [ "sequence4", "--------------------------xy4" ],
+                [ "sequence5", "a*c*ef****************wabcxy5" ],
+                [ "sequence6", "ururufhfghfgftgfhftgfttabcxy6" ]
+            ]
+
+            # Fetch all sequences before comparing any of them.
+            sequences = ( 0...expected.length ).map { | i | msa.get_sequence( i ) }
+
+            sequences.zip( expected ).each do | seq, wanted |
+                name, residues = wanted
+                return false if seq.get_name() != name
+                return false if seq.get_sequence_as_string() != residues
+            end
+
+            true
+        rescue Exception => error
+            puts()
+            puts( error.to_s )
+            puts()
+            false
+        end
+
+        # Tests FastaParser by reading FASTA_FILE through MsaFactory.
+        #
+        # path_to_evoruby - directory the file name is appended to; when it is
+        #                   not empty, Constants::FILE_SEPARATOR is inserted
+        #                   between the directory and the file name.
+        #
+        # Expects an alignment of length 6 holding 4 sequences whose names and
+        # residues match the table below.
+        # Returns true when every expectation holds, false otherwise; on any
+        # exception prints its message (framed by blank lines) and returns false.
+        def test_fasta_parser( path_to_evoruby )
+            parser  = FastaParser.new()
+            factory = MsaFactory.new()
+            sep = if Util::is_string_empty?( path_to_evoruby )
+                ""
+            else
+                Constants::FILE_SEPARATOR
+            end
+            msa = factory.create_msa_from_file( path_to_evoruby + sep + FASTA_FILE, parser )
+
+            return false if msa.get_length() != 6
+            return false if msa.get_number_of_seqs() != 4
+
+            # Expected [ name, residues ] per sequence, in alignment order.
+            expected = [
+                [ "sequence 0", "ABCDEF" ],
+                [ "sequence 1", "abcdef" ],
+                [ "sequence 2", "123456" ],
+                [ "sequence 3", "a-c--f" ]
+            ]
+
+            # Fetch all sequences before comparing any of them.
+            sequences = ( 0...expected.length ).map { | i | msa.get_sequence( i ) }
+
+            sequences.zip( expected ).each do | seq, wanted |
+                name, residues = wanted
+                return false if seq.get_name() != name
+                return false if seq.get_sequence_as_string() != residues
+            end
+
+            true
+        rescue Exception => error
+            puts()
+            puts( error.to_s )
+            puts()
+            false
+        end
+
+        # Tests NcbiTSeqParser by reading TSEQ_FILE through MsaFactory.
+        #
+        # path_to_evoruby - directory the file name is appended to; when it is
+        #                   not empty, Constants::FILE_SEPARATOR is inserted
+        #                   between the directory and the file name.
+        #
+        # Expects 9 sequences and checks name, accession, accession source and
+        # taxonomy (name, id, id source) of the sequences at index 0, 1 and 8;
+        # for index 0 the residues are compared as well.
+        # Returns true when every expectation holds, false otherwise; on any
+        # exception prints its message (framed by blank lines) and returns false.
+        def test_ncbi_tseq_parser( path_to_evoruby )
+            tseq_parser = NcbiTSeqParser.new
+            factory = MsaFactory.new
+            sep = if Util::is_string_empty?( path_to_evoruby )
+                ""
+            else
+                Constants::FILE_SEPARATOR
+            end
+            msa = factory.create_msa_from_file( path_to_evoruby + sep + TSEQ_FILE, tseq_parser )
+
+            return false if msa.get_number_of_seqs() != 9
+
+            first  = msa.get_sequence( 0 )
+            second = msa.get_sequence( 1 )
+            ninth  = msa.get_sequence( 8 )
+
+            # Sequence at index 0.
+            return false if first.get_name() != "SusD [Bacteroides thetaiotaomicron VPI-5482]"
+            return false if first.get_sequence_as_string() != "MKTKYIKQLFSAALIAVLSSGVTSCINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQENQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSCIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK"
+            return false if first.get_accession != "29341016"
+            return false if first.get_accession_source != "gi"
+            return false if first.get_taxonomy.get_name != "Bacteroides thetaiotaomicron VPI-5482"
+            return false if first.get_taxonomy.get_id != "226186"
+            return false if first.get_taxonomy.get_id_source != "ncbi"
+
+            # Sequence at index 1.
+            return false if second.get_name() != "SusD, outer membrane protein [Bacteroides thetaiotaomicron VPI-5482]"
+            return false if second.get_accession != "29349109"
+            return false if second.get_accession_source != "gi"
+            return false if second.get_taxonomy.get_name != "Bacteroides thetaiotaomicron VPI-5482"
+            return false if second.get_taxonomy.get_id != "226186"
+            return false if second.get_taxonomy.get_id_source != "ncbi"
+
+            # Sequence at index 8 (the last of the nine).
+            return false if ninth.get_name() != "Chain A, B. Thetaiotaomicron Susd With Maltotriose"
+            return false if ninth.get_accession != "pdb|3CKB|A"
+            return false if ninth.get_accession_source != "ncbi"
+            return false if ninth.get_taxonomy.get_name != "Bacteroides thetaiotaomicron"
+            return false if ninth.get_taxonomy.get_id != "818"
+            return false if ninth.get_taxonomy.get_id_source != "ncbi"
+
+            true
+        rescue Exception => error
+            puts()
+            puts( error.to_s )
+            puts()
+            false
+        end
+
+        # Smoke test for Evoruby::HmmsearchDomainExtractor: passes when the
+        # constructor does not raise.
+        # Returns true on success; on any exception prints its message
+        # (framed by blank lines) and returns false.
+        def test_hmmsearch_domain_extractor()
+            extractor = Evoruby::HmmsearchDomainExtractor.new()
+            true
+        rescue Exception => error
+            puts()
+            puts( error.to_s )
+            puts()
+            false
+        end
+
+        # Smoke test for Evoruby::DomainSequenceExtractor: passes when the
+        # constructor does not raise.
+        # Returns true on success; on any exception prints its message
+        # (framed by blank lines) and returns false.
+        def test_domain_sequence_extractor()
+            extractor = Evoruby::DomainSequenceExtractor.new()
+            true
+        rescue Exception => error
+            puts()
+            puts( error.to_s )
+            puts()
+            false
+        end
+
+        # Smoke test for Evoruby::HmmscanParser: passes when the constructor
+        # does not raise.
+        # Returns true on success; on any exception prints its message
+        # (framed by blank lines) and returns false.
+        def test_hmmscan_parser()
+            parser = Evoruby::HmmscanParser.new()
+            true
+        rescue Exception => error
+            puts()
+            puts( error.to_s )
+            puts()
+            false
+        end
+
+        # Smoke test for Evoruby::DomainsToForester: passes when the
+        # constructor does not raise.
+        # Returns true on success; on any exception prints its message
+        # (framed by blank lines) and returns false.
+        def test_domains_to_forester()
+            converter = Evoruby::DomainsToForester.new()
+            true
+        rescue Exception => error
+            puts()
+            puts( error.to_s )
+            puts()
+            false
+        end
+
+
+        # Smoke test for Evoruby::BasicTableParser: passes when the constructor
+        # does not raise.
+        # Returns true on success; on any exception prints its message
+        # (framed by blank lines) and returns false.
+        def test_basic_table_parser()
+            parser = Evoruby::BasicTableParser.new()
+            true
+        rescue Exception => error
+            puts()
+            puts( error.to_s )
+            puts()
+            false
+        end
+
+
+        # Smoke test for CommandLineArguments: passes when an instance can be
+        # constructed from an empty argument array without raising.
+        # Returns true on success; on any exception prints its message
+        # (framed by blank lines) and returns false.
+        def test_cla()
+            arguments = CommandLineArguments.new( Array.new )
+            true
+        rescue Exception => error
+            puts()
+            puts( error.to_s )
+            puts()
+            false
+        end
+
+        # Tests the TreePuzzle wrapper: passes when constructing it and calling
+        # run with a fixed alignment file and fixed settings does not raise.
+        # Returns true on success; on any exception prints its message
+        # (framed by blank lines) and returns false.
+        #
+        # NOTE(review): the input file is an absolute path inside one developer's
+        # home directory; presumably this test fails wherever that file is
+        # missing -- confirm and consider moving the fixture into the project.
+        def test_tree_puzzle()
+            wrapper = TreePuzzle.new()
+            wrapper.run( '/home/czmasek/scratch/small.aln', :wag, :uniform, 200 )
+            true
+        rescue Exception => error
+            puts()
+            puts( error.to_s )
+            puts()
+            false
+        end
+
+        # Tests the FastMe wrapper: passes when constructing it and calling run
+        # with a fixed input file and fixed settings does not raise.
+        # Returns true on success; on any exception prints its message
+        # (framed by blank lines) and returns false.
+        #
+        # NOTE(review): the input file is an absolute path inside one developer's
+        # home directory; presumably this test fails wherever that file is
+        # missing -- confirm and consider moving the fixture into the project.
+        def test_fastme()
+            wrapper = FastMe.new()
+            wrapper.run( '/home/czmasek/scratch/outdist', 0, :GME )
+            true
+        rescue Exception => error
+            puts()
+            puts( error.to_s )
+            puts()
+            false
+        end
+
+
+        # Runs the complete self-test suite and reports on standard output.
+        #
+        # Prints the Ruby and evoruby versions, then runs every test in a fixed
+        # order, printing "--- <label>: ok" or "--- <label>: FAILED" for each and
+        # counting the outcome in @successes or @failures. Finishes with the
+        # versions again, the elapsed wall-clock time in seconds, both counters
+        # and a final "OK" (no failures) or "NOT ok".
+        #
+        # When the evoruby directory cannot be determined, a warning is printed
+        # and the file based tests are run with an empty path.
+        def run()
+
+            t0 = Time.now
+
+            puts
+            puts "ruby version " + RUBY_VERSION
+            puts Constants::EVORUBY + " version " + Constants::EVORUBY_VERSION
+            puts
+
+            path_to_evoruby = Test.get_path_to_evoruby()
+
+            if ( Util.is_string_empty?( path_to_evoruby ) )
+                path_to_evoruby = ""
+                puts()
+                puts( "Warning! Path to evoruby could not be established. Some tests might fail." )
+                puts()
+            end
+
+            # Each entry pairs the progress label, exactly as it is printed, with
+            # a deferred call of the test; the tests run in this order.
+            tests = [
+                [ "--- Taxonomy: ",                   lambda { test_taxonomy() } ],
+                [ "--- Sequence: ",                   lambda { test_sequence() } ],
+                [ "--- Msa: ",                        lambda { test_msa() } ],
+                [ "--- MsaFactory: ",                 lambda { test_msa_factory() } ],
+                [ "--- DomainStructure: ",            lambda { test_domain_structure() } ],
+                [ "--- ProteinDomain: ",              lambda { test_protein_domain() } ],
+                [ "--- BasicTable: ",                 lambda { test_basic_table() } ],
+                [ "--- MsaIO: ",                      lambda { test_msa_io() } ],
+                [ "--- PhylipSequentialWriter: ",     lambda { test_phylip_sequentialwriter() } ],
+                [ "--- FastaWriter : ",               lambda { test_fasta_writer() } ],
+                [ "--- NexusWriter: ",                lambda { test_nexus_writer() } ],
+                [ "--- FastaParser: ",                lambda { test_fasta_parser( path_to_evoruby ) } ],
+                [ "--- NCBI Tseq parser: ",           lambda { test_ncbi_tseq_parser( path_to_evoruby ) } ],
+                [ "--- GeneralMsaParser: ",           lambda { test_general_msa_parser( path_to_evoruby ) } ],
+                [ "--- Hmmsearch domain extractor: ", lambda { test_hmmsearch_domain_extractor() } ],
+                [ "--- Domain sequence extractor: ",  lambda { test_domain_sequence_extractor() } ],
+                [ "--- Hmmscan parser: ",             lambda { test_hmmscan_parser() } ],
+                [ "--- Domains 2 forester: ",         lambda { test_domains_to_forester() } ],
+                [ "--- BasicTableParser: ",           lambda { test_basic_table_parser() } ],
+                [ "--- TreePuzzle (wrapper): ",       lambda { test_tree_puzzle() } ],
+                [ "--- FastMe (wrapper): ",           lambda { test_fastme() } ],
+                [ "--- CLA: ",                        lambda { test_cla() } ]
+            ]
+
+            tests.each do | label, test |
+                # The label is printed before the test runs, so any output of
+                # the test itself appears between the label and the verdict.
+                print( label )
+                if ( test.call )
+                    puts( "ok" )
+                    @successes += 1
+                else
+                    puts( "FAILED" )
+                    @failures += 1
+                end
+            end
+
+            puts
+            puts "ruby version " + RUBY_VERSION
+            puts Constants::EVORUBY + " version " + Constants::EVORUBY_VERSION
+            puts
+
+            # Time - Time yields the elapsed seconds as a Float. Formatting that
+            # number directly keeps the microseconds zero-padded and does not
+            # wrap the seconds at 60.
+            elapsed = Time.now - t0
+            puts( "Time            : " + sprintf( "%.6f", elapsed ) + "s" )
+            puts()
+
+            puts( "Successful tests: " + @successes.to_s )
+            puts( "Failed tests    : " + @failures.to_s )
+            puts()
+            if ( @failures < 1 )
+                puts( "OK" )
+            else
+                puts( "NOT ok" )
+            end
+
+            puts()
+        end
+
+        private
+
+        # Tolerant numeric comparison.
+        # Returns true when n and m differ by less than 0.000001.
+        def Test.same?( n, m )
+            difference = ( n - m ).abs
+            difference < 0.000001
+        end
+
+        # Looks for the evoruby directory among the entries of the RUBYLIB
+        # environment variable.
+        #
+        # Returns the first entry matching /evoruby/, or nil when no entry
+        # matches or RUBYLIB is not set.
+        def Test.get_path_to_evoruby()
+            # ENV[] yields nil for an unset variable; fall back to "" so the
+            # caller receives nil instead of a NoMethodError from nil.split.
+            rubylib = ENV['RUBYLIB'] || ""
+            # File::PATH_SEPARATOR is ":" on Unix-like systems and ";" on
+            # Windows, where ":" would cut paths apart at the drive letter.
+            rubylib.split( File::PATH_SEPARATOR ).find { | path | path =~ /evoruby/ }
+        end
+
+    end # class Test
+
+
+    # Executed when this file is loaded: build the suite and run it at once.
+    suite = Test.new()
+    suite.run()
+
+
+end # module Evoruby
+
diff --git a/forester/ruby/evoruby/exe/tseq_tap.rb b/forester/ruby/evoruby/exe/tseq_tap.rb
new file mode 100755 (executable)
index 0000000..e0075fa
--- /dev/null
@@ -0,0 +1,19 @@
+#!/usr/local/bin/ruby -w
+#
+# = exe/tseq_tap
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: tseq_tap.rb,v 1.1 2008/12/31 06:00:08 cmzmasek Exp $
+
+
+require 'lib/evo/apps/tseq_taxonomy_processor'
+
+module Evoruby
+
+    # Create a TseqTaxonomyProcessor and start it right away.
+    processor = TseqTaxonomyProcessor.new
+    processor.run
+
+end  # module Evoruby
diff --git a/forester/ruby/evoruby/files/00_phylogeny_factory.template b/forester/ruby/evoruby/files/00_phylogeny_factory.template
new file mode 100644 (file)
index 0000000..fda13d4
--- /dev/null
@@ -0,0 +1,100 @@
+# $Id: 00_phylogeny_factory.template,v 1.9 2008/11/25 22:25:52 cmzmasek Exp $
+
+# Naming convention if alignment-specific parameters
+# are to be used (such as HMMs for hmmalign):
+#  the substring before the first underscore is a
+#  unique identifier and needs to match the identifiers
+#  in '% <parameter-type> <unique-id>=<value>' statements 
+#  Example:
+#  alignment name      : 'bcl2_new_alignment'
+#  parameter statements: '% HMM bcl2=Bcl2_ls' 
+#                        '% RSL bcl2=60' 
+$ PROBCONS=/home/czmasek/SOFTWARE/PROBCONS/probcons_v1_12/probcons
+$ DIALIGN_TX=/home/czmasek/SOFTWARE/DIALIGNTX/DIALIGN-TX_1.0.1/source/dialign-tx
+$ DIALIGN_DIR=/home/czmasek/SOFTWARE/DIALIGNTX/DIALIGN-TX_1.0.1/conf
+$ MAFFT=/home/czmasek/SOFTWARE/MAFFT/mafft-6.240/src/mafft
+$ T_COFFEE=/home/czmasek/SOFTWARE/T_COFFEE/T-COFFEE_distribution_Version_6.78/bin_linux/t_coffee
+$ MUSCLE=/home/czmasek/SOFTWARE/MUSCLE/muscle3.7/muscle
+$ CLUSTALW=/home/czmasek/SOFTWARE/CLUSTALW/clustalw-2.0.9/src/clustalw2
+$ KALIGN=/home/czmasek/SOFTWARE/KALIGN/kalign203/kalign
+$ HMMALIGN=/home/czmasek/SOFTWARE/HMMER/hmmer-2.3.2/src/hmmalign
+$ MSA_PRO=/home/czmasek/SOFTWARE/FORESTER/DEV/forester-atv/ruby/evoruby/exe/msa_pro.rb
+$ PHYLO_PL=/home/czmasek/SOFTWARE/FORESTER/DEV/forester-atv/perl/phylo_pl.pl 
+
+# Default value is 40.
+% RSL SRCR=50
+% RSL NACHT=50
+% RSL TIR=60
+% RSL Bcl2=100
+% RSL homeobox=40
+
+% PHYLO_OPT=-WIB100q@1nxbwS21
+
+% TMP_DIR  = /home/czmasek/tmp/
+
+# Need to give full path for HMM files.
+% HMM NACHT=/home/czmasek/DATA/PFAM/NACHT_ls_cz.hmm
+% HMM TIR=/home/czmasek/DATA/PFAM/PFAM_LS/TIR.ls.hmm
+% HMM SRCR=/home/czmasek/DATA/PFAM/PFAM_LS/SRCR.ls.hmm
+% HMM Homeobox=/home/czmasek/DATA/PFAM/PFAM_LS/Homeobox.ls.hmm
+% HMM Cofilin_ADF=/home/czmasek/DATA/PFAM/PFAM_LS/Cofilin_ADF.ls.hmm
+
+
+> KALIGN $ -o $_kalign
+> MSA_PRO -o=p -n=10 -rr=0.5 -rsl=%[RSL]% $_kalign $_kalign_05_%[RSL]%.aln
+> MSA_PRO -o=n -n=10 -rr=0.5 -rsl=%[RSL]% $_kalign $_kalign_05_%[RSL]%.nex
+> PHYLO_PL %[PHYLO_OPT]% $_kalign_05_%[RSL]%.aln $_kalign_05_%[RSL]% %[TMP_DIR]%
+- 
+
+> HMMALIGN -q -m -o $_hmmalign_m %[HMM]% $ 
+> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_hmmalign_m $_hmmalign_m_05_%[RSL]%.aln
+> MSA_PRO -o=n -n=10 -rr=0.5 -c -rsl=%[RSL]% $_hmmalign_m $_hmmalign_m_05_%[RSL]%.nex
+> PHYLO_PL %[PHYLO_OPT]% $_hmmalign_m_05_%[RSL]%.aln $_hmmalign_m_05_%[RSL]% %[TMP_DIR]%
+- 
+
+> HMMALIGN -q -o $_hmmalign %[HMM]% $ 
+> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_hmmalign $_hmmalign_05_%[RSL]%.aln
+> MSA_PRO -o=n -n=10 -rr=0.5 -c -rsl=%[RSL]% $_hmmalign $_hmmalign_05_%[RSL]%.nex
+> PHYLO_PL %[PHYLO_OPT]% $_hmmalign_05_%[RSL]%.aln $_hmmalign_05_%[RSL]% %[TMP_DIR]%
+- 
+
+> PROBCONS $ > $_probcons
+> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_probcons $_probcons_05_%[RSL]%.aln
+> MSA_PRO -o=n -n=10 -rr=0.5 -c -rsl=%[RSL]% $_probcons $_probcons_05_%[RSL]%.nex
+> PHYLO_PL %[PHYLO_OPT]% $_probcons_05_%[RSL]%.aln $_probcons_05_%[RSL]% %[TMP_DIR]%
+-  
+    
+> DIALIGN_TX DIALIGN_DIR $ $_dialigntx
+> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_dialigntx $_dialigntx_05_%[RSL]%.aln
+> MSA_PRO -o=n -n=10 -rr=0.5 -c -rsl=%[RSL]% $_dialigntx $_dialigntx_05_%[RSL]%.nex
+> PHYLO_PL %[PHYLO_OPT]% $_dialigntx_05_%[RSL]%.aln $_dialigntx_05_%[RSL]% %[TMP_DIR]%
+-
+    
+> MAFFT --maxiterate 1000 --localpair --quiet $ > $_mafft
+> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_mafft $_mafft_05_%[RSL]%.aln
+> MSA_PRO -o=n -n=10 -rr=0.5 -c -rsl=%[RSL]% $_mafft $_mafft_05_%[RSL]%.nex
+> PHYLO_PL %[PHYLO_OPT]% $_mafft_05_%[RSL]%.aln $_mafft_05_%[RSL]% %[TMP_DIR]%
+-
+    
+#> T_COFFEE $ -outfile $_tcoffee
+#> rm $.dnd
+#> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_tcoffee $_tcoffee_05_%[RSL]%.aln
+#> MSA_PRO -o=n -n=10 -rr=0.5 -c -rsl=%[RSL]% $_tcoffee $_tcoffee_05_%[RSL]%.nex
+#> PHYLO_PL %[PHYLO_OPT]% $_tcoffee_05_%[RSL]%.aln $_tcoffee_05_%[RSL]% %[TMP_DIR]%
+#> rm $.dnd
+#> rm $.html
+#-
+    
+> MUSCLE  -maxiters 1000 -maxtrees 100 -clw -in $ -out $_muscle
+> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_muscle $_muscle_05_%[RSL]%.aln
+> MSA_PRO -o=n -n=10 -rr=0.5 -c -rsl=%[RSL]% $_muscle $_muscle_05_%[RSL]%.nex
+> PHYLO_PL %[PHYLO_OPT]%  $_muscle_05_%[RSL]%.aln  $_muscle_05_%[RSL]% %[TMP_DIR]%
+-
+
+> CLUSTALW $ -outfile=$_clustalw
+> rm $.dnd
+> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_clustalw $_clustalw_05_%[RSL]%.aln
+> MSA_PRO -o=n -n=10 -rr=0.5 -c -rsl=%[RSL]% $_clustalw $_clustalw_05_%[RSL]%.nex
+> PHYLO_PL %[PHYLO_OPT]% $_clustalw_05_%[RSL]%.aln $_clustalw_05_%[RSL]% %[TMP_DIR]%
+> rm $.dnd
+-
diff --git a/forester/ruby/evoruby/files/00_sample_tap_mapfile b/forester/ruby/evoruby/files/00_sample_tap_mapfile
new file mode 100644 (file)
index 0000000..aa0606c
--- /dev/null
@@ -0,0 +1,46 @@
+# $Id: 00_sample_tap_mapfile,v 1.2 2008/08/29 23:58:31 cmzmasek Exp $
+
+RAT#RAT
+Geodia cydonium#GEOCY
+Lubomirskia baicalensis#LUBBA
+Suberites domuncula#SUBDO
+Hydra vulgaris#HYDAT
+Apis mellifera#APIME
+Drosophila pseudoobscura#DROPS
+Aedes aegypti#AEDAE
+Tribolium castaneum#TRICA
+Caenorhabditis briggsae#CAEBR
+HUMAN#HUMAN
+Branchiostoma floridae#BRAFL
+amphioxus#BRAFL
+Brafl1#BRAFL
+Ciona intestinalis#CIOIN
+ciona#CIOIN
+cow#BOVIN
+dog#CANFA
+fugu#FUGRU
+mouse#MOUSE
+MOUSE#MOUSE
+Mus musculus#MOUSE
+Rattus norvegicus#RAT
+tetraodon#TETNG
+Tetraodon nigroviridis#TETNG
+urchin#STRPU
+Xenopus laevis#XENTR
+xenopus#XENTR
+zebrafish#BRARE
+Danio rerio#BRARE
+chicken#CHICK
+celegans#CAEEL
+Caenorhabditis elegans#CAEEL
+fruitfly#DROME
+Drosophila melanogaster#DROME
+Haemonchus contortus#HAECO
+Nematostella vectensis#NEMVE
+anemone#NEMVE
+human#HUMAN
+HUMAN#HUMAN
+Human#HUMAN
+Homo sapiens#HUMAN
+Chlamydomonas reinhardtii#CHLRE
+Monosiga brevicollis#MONBE
diff --git a/forester/ruby/evoruby/files/test/fasta_file.txt b/forester/ruby/evoruby/files/test/fasta_file.txt
new file mode 100644 (file)
index 0000000..d696196
--- /dev/null
@@ -0,0 +1,22 @@
+7 26
+# 7 26 is not needed and ignored
+CLUSTAL 
+PROBCONS 
+>sequence 0
+ABCDEF 
+>sequence 1       
+a
+b         
+c
+//comment 
+d           
+e
+!!comment  
+f  
+  
+  >  sequence 2  
+123456   
+ >  sequence 3   
+
+   a-c--f   
+
diff --git a/forester/ruby/evoruby/files/test/general_msa_file.txt b/forester/ruby/evoruby/files/test/general_msa_file.txt
new file mode 100644 (file)
index 0000000..5061707
--- /dev/null
@@ -0,0 +1,53 @@
+7 26
+# 7 26 is not needed and ignored
+
+sequence0 ABCDE GHIJ
+ KLMNOPQR TUVW
+sequence1 abcdefghi
+ jklm
+ nopq
+ rstu
+ vw
+sequence2 abcde       
+       fghijkl---x_-*?_XX           
+sequence3 12345678901234567890123                
+
+ # this is_a_comment            
+
+
+sequence4 ----------             
+ -------------                 
+
+sequence5 a*c*ef****************w   
+
+       % this is_another_comment
+
+sequence6 ururufhfghfgftgfhftgftt
+
+   // this is_yet_another_comment
+   !! this is_yet_another_comment
+
+ *  -- *
+
+sequence0 a
+ bc
+sequence1 a
+ b
+ c
+sequence2 abc
+
+sequence3 abc
+          *..
+sequence4 ---                
+sequence5 abc              
+ ...   
+sequence6 abc                          
+
+xy0
+xy1
+xy2
+xy3
+xy4
+xy5
+xy6
diff --git a/forester/ruby/evoruby/files/test/ncbi_tseq.xml b/forester/ruby/evoruby/files/test/ncbi_tseq.xml
new file mode 100644 (file)
index 0000000..2e3018f
--- /dev/null
@@ -0,0 +1,104 @@
+<?xml version="1.0"?>
+<!DOCTYPE TSeqSet PUBLIC "-//NCBI//NCBI TSeq/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_TSeq.dtd">
+<TSeqSet>
+<TSeq>
+  <TSeq_seqtype value="protein"/>
+  <TSeq_gi>29341016</TSeq_gi>
+  <TSeq_accver>AAO78806.1</TSeq_accver>
+  <TSeq_sid>gnl|mbpwusl|BT3701</TSeq_sid>
+  <TSeq_taxid>226186</TSeq_taxid>
+  <TSeq_orgname>Bacteroides thetaiotaomicron VPI-5482</TSeq_orgname>
+  <TSeq_defline>SusD [Bacteroides thetaiotaomicron VPI-5482]</TSeq_defline>
+  <TSeq_length>551</TSeq_length>
+  <TSeq_sequence>MKTKYIKQLFSAALIAVLSSGVTSCINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQENQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSCIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK</TSeq_sequence>
+</TSeq>
+
+<TSeq>
+  <TSeq_seqtype value="protein"/>
+  <TSeq_gi>29349109</TSeq_gi>
+  <TSeq_accver>NP_812612.1</TSeq_accver>
+  <TSeq_sid>gnl|REF_mbpwusl|BT3701</TSeq_sid>
+  <TSeq_taxid>226186</TSeq_taxid>
+  <TSeq_orgname>Bacteroides thetaiotaomicron VPI-5482</TSeq_orgname>
+  <TSeq_defline>SusD, outer membrane protein [Bacteroides thetaiotaomicron VPI-5482]</TSeq_defline>
+  <TSeq_length>551</TSeq_length>
+  <TSeq_sequence>MKTKYIKQLFSAALIAVLSSGVTSCINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQENQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSCIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK</TSeq_sequence>
+</TSeq>
+
+<TSeq>
+  <TSeq_seqtype value="protein"/>
+  <TSeq_gi>1478026</TSeq_gi>
+  <TSeq_accver>AAB42172.1</TSeq_accver>
+  <TSeq_taxid>818</TSeq_taxid>
+  <TSeq_orgname>Bacteroides thetaiotaomicron</TSeq_orgname>
+  <TSeq_defline>outer membrane protein</TSeq_defline>
+  <TSeq_length>554</TSeq_length>
+  <TSeq_sequence>MKTKYIKQLFSAALIAVLSSGVTSCINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQENQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSCIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPASHATYPDTDIPLFPFGRSIPDTCGSHFPPGRRRHRRHQLNFAKRAQLYKKGTEPLTEQETNRDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK</TSeq_sequence>
+</TSeq>
+
+<TSeq>
+  <TSeq_seqtype value="protein"/>
+  <TSeq_gi>188596440</TSeq_gi>
+  <TSeq_sid>pdb|3CK9|B</TSeq_sid>
+  <TSeq_taxid>818</TSeq_taxid>
+  <TSeq_orgname>Bacteroides thetaiotaomicron</TSeq_orgname>
+  <TSeq_defline>Chain B, B. Thetaiotaomicron Susd With Maltoheptaose</TSeq_defline>
+  <TSeq_length>527</TSeq_length>
+  <TSeq_sequence>GINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQKNQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSCIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK</TSeq_sequence>
+</TSeq>
+
+<TSeq>
+  <TSeq_seqtype value="protein"/>
+  <TSeq_gi>188596439</TSeq_gi>
+  <TSeq_sid>pdb|3CK9|A</TSeq_sid>
+  <TSeq_taxid>818</TSeq_taxid>
+  <TSeq_orgname>Bacteroides thetaiotaomicron</TSeq_orgname>
+  <TSeq_defline>Chain A, B. Thetaiotaomicron Susd With Maltoheptaose</TSeq_defline>
+  <TSeq_length>527</TSeq_length>
+  <TSeq_sequence>GINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQKNQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSCIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK</TSeq_sequence>
+</TSeq>
+
+<TSeq>
+  <TSeq_seqtype value="protein"/>
+  <TSeq_gi>171849127</TSeq_gi>
+  <TSeq_sid>pdb|3CKC|B</TSeq_sid>
+  <TSeq_taxid>818</TSeq_taxid>
+  <TSeq_orgname>Bacteroides thetaiotaomicron</TSeq_orgname>
+  <TSeq_defline>Chain B, B. Thetaiotaomicron Susd</TSeq_defline>
+  <TSeq_length>527</TSeq_length>
+  <TSeq_sequence>GINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQKNQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSXIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK</TSeq_sequence>
+</TSeq>
+
+<TSeq>
+  <TSeq_seqtype value="protein"/>
+  <TSeq_gi>171849126</TSeq_gi>
+  <TSeq_sid>pdb|3CKC|A</TSeq_sid>
+  <TSeq_taxid>818</TSeq_taxid>
+  <TSeq_orgname>Bacteroides thetaiotaomicron</TSeq_orgname>
+  <TSeq_defline>Chain A, B. Thetaiotaomicron Susd</TSeq_defline>
+  <TSeq_length>527</TSeq_length>
+  <TSeq_sequence>GINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQKNQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSXIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK</TSeq_sequence>
+</TSeq>
+
+<TSeq>
+  <TSeq_seqtype value="protein"/>
+  <TSeq_gi>171849125</TSeq_gi>
+  <TSeq_sid>pdb|3CKB|B</TSeq_sid>
+  <TSeq_taxid>818</TSeq_taxid>
+  <TSeq_orgname>Bacteroides thetaiotaomicron</TSeq_orgname>
+  <TSeq_defline>Chain B, B. Thetaiotaomicron Susd With Maltotriose</TSeq_defline>
+  <TSeq_length>527</TSeq_length>
+  <TSeq_sequence>GINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQKNQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSCIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK</TSeq_sequence>
+</TSeq>
+
+<TSeq>
+  <TSeq_seqtype value="protein"/>
+  <TSeq_sid>pdb|3CKB|A</TSeq_sid>
+  <TSeq_taxid>818</TSeq_taxid>
+  <TSeq_orgname>Bacteroides thetaiotaomicron</TSeq_orgname>
+  <TSeq_defline>Chain A, B. Thetaiotaomicron Susd With Maltotriose</TSeq_defline>
+  <TSeq_length>527</TSeq_length>
+  <TSeq_sequence>GINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQKNQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSCIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK</TSeq_sequence>
+</TSeq>
+
+</TSeqSet>
\ No newline at end of file
diff --git a/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor.rb
new file mode 100644 (file)
index 0000000..ac6199b
--- /dev/null
@@ -0,0 +1,262 @@
+#
+# = lib/evo/apps/domain_sequence_extractor.rb - DomainSequenceExtractor class
+#
+# Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: domain_sequence_extractor.rb,v 1.19 2010/12/13 19:00:11 cmzmasek Exp $
+
+
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/util/command_line_arguments'
+require 'lib/evo/io/parser/hmmsearch_domain_extractor'
+
+module Evoruby
+
+    class DomainSequenceExtractor
+
+        PRG_NAME       = "dsx"
+        PRG_VERSION    = "1.1.0"
+        PRG_DESC       = "extraction of domain sequences from hmmsearch output"
+        PRG_DATE       = "2008.01.03"
+        COPYRIGHT      = "2008-2009 Christian M Zmasek"
+        CONTACT        = "phylosoft@gmail.com"
+        WWW            = "www.phylosoft.org"
+
+        E_VALUE_THRESHOLD_OPTION           = 'e'
+        LENGTH_THRESHOLD_OPTION            = 'l'
+        ADD_POSITION_OPTION                = 'p'
+        ADD_DOMAIN_NUMBER_OPTION           = 'd'
+        ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT  = 'dd'
+        ADD_DOMAIN_NUMBER_OPTION_AS_LETTER = 'dl'
+        TRIM_OPTION                        = 't'
+        LOG_FILE_SUFFIX                    = '_domain_seq_extr.log'
+        PASSED_SEQS_SUFFIX                 = '_domain_seq_extr_passed'
+        FAILED_SEQS_SUFFIX                 = '_domain_seq_extr_failed'
+        HELP_OPTION_1                      = 'help'
+        HELP_OPTION_2                      = 'h'
+
+        def run()
+
+            Util.print_program_information( PRG_NAME,
+                PRG_VERSION,
+                PRG_DESC ,
+                PRG_DATE,
+                COPYRIGHT,
+                CONTACT,
+                WWW,
+                STDOUT )
+
+            ld = Constants::LINE_DELIMITER
+
+            begin
+                cla = CommandLineArguments.new( ARGV )
+            rescue ArgumentError
+                Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT )
+            end
+
+            if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+                     cla.is_option_set?( HELP_OPTION_2 ) )
+                print_help
+                exit( 0 )
+            end
+
+            if ( cla.get_number_of_files != 3 )
+                print_help
+                exit( -1 )
+            end
+
+            allowed_opts = Array.new
+            allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
+            allowed_opts.push( ADD_POSITION_OPTION )
+            allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION )
+            allowed_opts.push( LENGTH_THRESHOLD_OPTION )
+            allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT )
+            allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER )
+            allowed_opts.push( TRIM_OPTION )
+
+            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+            if ( disallowed.length > 0 )
+                Util.fatal_error( PRG_NAME,
+                    "unknown option(s): " + disallowed,
+                    STDOUT )
+            end
+
+            hmmsearch_output    = cla.get_file_name( 0 )
+            fasta_sequence_file = cla.get_file_name( 1 )
+            outfile             = cla.get_file_name( 2 )
+
+            add_position = false
+            if ( cla.is_option_set?( ADD_POSITION_OPTION ) )
+                add_position = true
+            end
+
+            trim = false
+            if ( cla.is_option_set?( TRIM_OPTION ) )
+                trim = true
+            end
+
+            add_domain_number           = false
+            add_domain_number_as_letter = false
+            add_domain_number_as_digit  = false
+
+            if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) )
+                add_domain_number = true
+            end
+            if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER ) )
+                add_domain_number_as_letter = true
+            end
+            if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT ) )
+                add_domain_number_as_digit = true
+            end
+
+            if ( add_domain_number_as_letter && add_domain_number_as_digit )
+                puts( "attempt to add domain number as letter and digit at the same time" )
+                print_help
+                exit( -1 )
+            end
+
+            e_value_threshold = -1.0
+            if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
+                begin
+                    e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
+                rescue ArgumentError => e
+                    Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+                end
+                if ( e_value_threshold < 0.0 )
+                    Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
+                end
+            end
+
+            length_threshold = -1
+            if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) )
+                begin
+                    length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION )
+                rescue ArgumentError => e
+                    Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+                end
+                if ( length_threshold < 0)
+                    Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT )
+                end
+            end
+
+            log = String.new
+
+            puts()
+            puts( "Hmmsearch outputfile                   : " + hmmsearch_output )
+            log << "Hmmsearch outputfile                   : " + hmmsearch_output + ld
+            puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file )
+            log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld
+            puts( "Outputfile                             : " + outfile )
+            log << "Outputfile                             : " + outfile + ld
+            puts( "Passed sequences outfile (fasta)       : " + outfile + PASSED_SEQS_SUFFIX )
+            log << "Passed sequences outfile (fasta)       : " + outfile + PASSED_SEQS_SUFFIX + ld
+            puts( "Failed sequences outfile (fasta)       : " + outfile + FAILED_SEQS_SUFFIX )
+            log << "Failed sequences outfile (fasta)       : " + outfile + FAILED_SEQS_SUFFIX + ld
+            puts( "Logfile                                : " + outfile + LOG_FILE_SUFFIX )
+            log <<  "Logfile                                : " + outfile + LOG_FILE_SUFFIX + ld
+            if ( e_value_threshold >= 0.0 )
+                puts( "E-value threshold : " + e_value_threshold.to_s )
+                log << "E-value threshold : " + e_value_threshold.to_s + ld
+            else
+                puts( "E-value threshold : no threshold" )
+                log << "E-value threshold : no threshold" + ld
+            end
+            if ( length_threshold > 0 )
+                puts( "Length threshold  : " + length_threshold.to_s )
+                log << "Length threshold  : " + length_threshold.to_s + ld
+            else
+                puts( "Length threshold  : no threshold" )
+                log << "Length threshold  : no threshold" + ld
+            end
+
+            if ( trim )
+                puts( "Trim last 2 chars : true" )
+                log << "Trim last 2 chars : true" + ld
+            else
+                puts( "Trim names        : false" )
+                log << "Trim names        : false" + ld
+            end
+
+
+            if ( add_position )
+                puts( "Add positions (rel to complete seq) to extracted domains: true" )
+                log << "Add positions (rel to complete seq) to extracted domains: true" + ld
+            else
+                puts( "Add positions (rel to complete seq) to extracted domains: false" )
+                log << "Add positions (rel to complete seq) to extracted domains: false" + ld
+            end
+
+            if ( add_domain_number || add_domain_number_as_digit || add_domain_number_as_letter )
+                puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" )
+                log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld
+            else
+                puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" )
+                log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld
+            end
+
+            puts
+
+            domain_count = 0
+            begin
+                parser = HmmsearchDomainExtractor.new()
+                domain_count = parser.parse( hmmsearch_output,
+                    fasta_sequence_file,
+                    outfile,
+                    outfile + PASSED_SEQS_SUFFIX,
+                    outfile + FAILED_SEQS_SUFFIX,
+                    e_value_threshold,
+                    length_threshold,
+                    add_position,
+                    add_domain_number,
+                    add_domain_number_as_digit,
+                    add_domain_number_as_letter,
+                    trim,
+                    log )
+            rescue ArgumentError, IOError, StandardError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+            rescue Exception => e
+                Util.fatal_error( PRG_NAME, "unexpected exception!: " + e.to_s, STDOUT )
+            end
+
+            puts
+            Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" )
+            Util.print_message( PRG_NAME, "wrote;               " + outfile )
+            Util.print_message( PRG_NAME, "wrote:               " + outfile + LOG_FILE_SUFFIX )
+            Util.print_message( PRG_NAME, "(wrote:              " + outfile + PASSED_SEQS_SUFFIX + ")" )
+            Util.print_message( PRG_NAME, "(wrote:              " + outfile + FAILED_SEQS_SUFFIX + ")" )
+
+            begin
+                f = File.open( outfile + LOG_FILE_SUFFIX, 'a' )
+                f.print( log )
+                f.close
+            rescue Exception => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+
+            puts
+            Util.print_message( PRG_NAME, "OK" )
+            puts
+
+        end
+
+        def print_help()
+            puts()
+            puts( "Usage:" )
+            puts()
+            puts( "  " + PRG_NAME + ".rb [options] <hmmsearch outputfile> <file containing complete sequences in fasta format> <outputfile>" )
+            puts()
+            puts( "  options: -" + E_VALUE_THRESHOLD_OPTION  + "=<f>: E-value threshold, default is no threshold" )
+            puts( "           -" + LENGTH_THRESHOLD_OPTION   + "=<i>: length threshold, default is no threshold" )
+            puts( "           -" + ADD_POSITION_OPTION  + ": to add positions (rel to complete seq) to extracted domains" )
+            puts( "           -" + ADD_DOMAIN_NUMBER_OPTION  + ": to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" )
+            puts( "           -" + ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT  + ": to add numbers to extracted domains as digit (example \"domain2\")" )
+            puts( "           -" + ADD_DOMAIN_NUMBER_OPTION_AS_LETTER  + ": to add numbers to extracted domains as letter (example \"domaina\")" )
+            puts( "           -" + TRIM_OPTION  + ": to remove the last 2 characters from sequence names" )
+            puts()
+        end
+
+    end # class DomainSequenceExtractor
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/apps/domains_to_forester.rb b/forester/ruby/evoruby/lib/evo/apps/domains_to_forester.rb
new file mode 100644 (file)
index 0000000..a03cc2a
--- /dev/null
@@ -0,0 +1,252 @@
+#
+# = lib/evo/apps/domains_to_forester.rb - DomainsToForester class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: domains_to_forester.rb,v 1.11 2010/12/13 19:00:11 cmzmasek Exp $
+#
+# last modified: 06/11/2007
+
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/util/command_line_arguments'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/io/parser/fasta_parser'
+require 'lib/evo/sequence/protein_domain'
+require 'lib/evo/sequence/domain_structure'
+
+module Evoruby
+
+    class DomainsToForester
+
+        PRG_NAME       = "d2f"
+        PRG_DESC       = "parsed hmmpfam output to forester format"
+        PRG_VERSION    = "1.0.0"
+        PRG_DATE       = "2007.12.18"
+        COPYRIGHT      = "2007 Christian M Zmasek"
+        CONTACT        = "phylosoft@gmail.com"
+        WWW            = "www.phylosoft.org"
+
+        E_VALUE_THRESHOLD_OPTION         = "e"
+        OVERWRITE_IF_SAME_FROM_TO_OPTION = "o"
+        HELP_OPTION_1                    = "help"
+        HELP_OPTION_2                    = "h"
+
+        def parse( domains_list_file,
+                original_seqs_file,
+                outfile,
+                column_delimiter,
+                e_value_threshold,
+                overwrite_if_same_from_to )
+            Util.check_file_for_readability( domains_list_file )
+            Util.check_file_for_readability( original_seqs_file )
+            Util.check_file_for_writability( outfile )
+
+            domain_structures = Hash.new() # protein name is key, domain structure is value
+
+            f = MsaFactory.new
+
+            original_seqs = f.create_msa_from_file( original_seqs_file, FastaParser.new )
+            if ( original_seqs.get_number_of_seqs < 1 )
+                error_msg = "\"" + original_seqs_file + "\" appears devoid of sequences in fasta-format"
+                raise ArgumentError, error_msg
+            end
+
+            File.open( domains_list_file ) do | file |
+                while line = file.gets
+                    if ( !is_ignorable?( line ) )
+                        a = line.split( column_delimiter )
+                        l = a.length
+                        if ( ( l < 4 ) || ( e_value_threshold >= 0.0 && l < 5 ) )
+                            error_msg = "unexpected format at line: " + line
+                            raise IOError, error_msg
+                        end
+                        protein_name = a[ 0 ]
+                        domain_name  = a[ 1 ]
+                        seq_from     = -1
+                        seq_to       = -1
+                        begin
+                            seq_from = a[ 2 ].to_i
+                        rescue Exception
+                            error_msg = "failed to parse seq from from \"" + a[ 2 ] + "\" [line: " + line + "]"
+                            raise IOError, error_msg
+                        end
+                        begin
+                            seq_to = a[ 3 ].to_i
+                        rescue Exception
+                            error_msg = "failed to parse seq to from \"" + a[ 3 ] + "\" [line: " + line + "]"
+                            raise IOError, error_msg
+                        end
+
+                        e_value = -1
+                        if ( l > 4 )
+                            begin
+                                e_value = a[ 4 ].to_f
+                            rescue Exception
+                                error_msg = "failed to parse E-value from \"" + a[ 4 ] + "\" [line: " + line + "]"
+                                raise IOError, error_msg
+                            end
+                        end
+
+                        seq = original_seqs.get_by_name( protein_name, true, false )
+
+                        total_length = seq.get_length
+
+                        if ( ( ( e_value_threshold < 0.0 ) || ( e_value <= e_value_threshold ) )  )
+                            pd = ProteinDomain.new( domain_name, seq_from, seq_to, "", e_value )
+                            ds = nil
+                            if ( domain_structures.has_key?( protein_name ) )
+                                ds = domain_structures[ protein_name ]
+                            else
+                                ds = DomainStructure.new( total_length )
+                                domain_structures[ protein_name ] = ds
+                            end
+                            ds.add_domain( pd, overwrite_if_same_from_to )
+                        end
+
+                    end
+                end
+            end
+
+            out = File.open( outfile, "a" )
+            ds = domain_structures.sort
+            for d in ds
+                protein_name     = d[ 0 ]
+                domain_structure = d[ 1 ]
+                out.print( protein_name.to_s )
+                out.print( ":" )
+                out.print( domain_structure.to_NHX )
+                out.print( Constants::LINE_DELIMITER  )
+            end
+
+            out.flush()
+            out.close()
+
+        end # parse
+
+
+
+
+        def run()
+
+            Util.print_program_information( PRG_NAME,
+                PRG_VERSION,
+                PRG_DESC,
+                PRG_DATE,
+                COPYRIGHT,
+                CONTACT,
+                WWW,
+                STDOUT )
+
+            begin
+                cla = CommandLineArguments.new( ARGV )
+            rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+            end
+
+            if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+                     cla.is_option_set?( HELP_OPTION_2 ) )
+                print_help
+                exit( 0 )
+            end
+
+            if ( cla.get_number_of_files != 3 )
+                print_help
+                exit( -1 )
+            end
+
+            allowed_opts = Array.new
+            allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
+            allowed_opts.push( OVERWRITE_IF_SAME_FROM_TO_OPTION )
+
+            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+            if ( disallowed.length > 0 )
+                Util.fatal_error( PRG_NAME,
+                    "unknown option(s): " + disallowed,
+                    STDOUT )
+            end
+
+            domains_list_file       = cla.get_file_name( 0 )
+            original_sequences_file = cla.get_file_name( 1 )
+            outfile                 = cla.get_file_name( 2 )
+
+
+            e_value_threshold = -1.0
+            if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
+                begin
+                    e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
+                rescue ArgumentError => e
+                    Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+                end
+                if ( e_value_threshold < 0.0 )
+                    Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
+                end
+            end
+            overwrite_if_same_from_to = false
+            if ( cla.is_option_set?( OVERWRITE_IF_SAME_FROM_TO_OPTION ) )
+                overwrite_if_same_from_to = true
+            end
+
+            puts()
+            puts( "Domains list file                      : " + domains_list_file )
+            puts( "Fasta sequencefile (complete sequences): " + original_sequences_file )
+            puts( "Outputfile                             : " + outfile )
+            if ( e_value_threshold >= 0.0 )
+                puts( "E-value threshold                      : " + e_value_threshold.to_s )
+            else
+                puts( "E-value threshold                      : no threshold" )
+            end
+            if ( overwrite_if_same_from_to )
+                puts( "Overwrite if same from and to          : true" )
+            else
+                puts( "Overwrite if same from and to          : false" )
+            end
+
+            puts
+
+            begin
+                parse( domains_list_file,
+                    original_sequences_file,
+                    outfile,
+                    " ",
+                    e_value_threshold,
+                    overwrite_if_same_from_to )
+
+            rescue ArgumentError, IOError, StandardError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+            rescue Exception => e
+                Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT )
+            end
+
+
+            puts
+            Util.print_message( PRG_NAME, 'OK' )
+            puts
+
+        end
+
+        private
+
+        def print_help()
+            puts()
+            puts( "Usage:" )
+            puts()
+            puts( "  " + PRG_NAME + ".rb [options] <domains list file (parsed hmmpfam output)> <file containing complete sequences in fasta format> <outputfile>" )
+            puts()
+            puts( "  options: -" + E_VALUE_THRESHOLD_OPTION  + "=<f> : E-value threshold, default is no threshold" )
+            puts( "               -" + OVERWRITE_IF_SAME_FROM_TO_OPTION  + " : overwrite domain with same start and end with domain with better E-value" )
+            puts()
+        end
+
+
+
+        def is_ignorable?( line )
+            return ( line !~ /[A-Za-z0-9-]/ || line =~ /^\s*#/)
+        end
+
+
+    end # class DomainsToForester
+
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/apps/evo_nursery.rb b/forester/ruby/evoruby/lib/evo/apps/evo_nursery.rb
new file mode 100755 (executable)
index 0000000..2f634d6
--- /dev/null
@@ -0,0 +1,317 @@
+#
+# = lib/evo/apps/evo_nursery.rb - EvoNursery class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: evo_nursery.rb,v 1.11 2010/12/13 19:00:11 cmzmasek Exp $
+
+
+
+require 'lib/evo/soft/fastme'
+require 'lib/evo/soft/tree_puzzle'
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/util/command_line_arguments'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/io/msa_io'
+require 'lib/evo/io/writer/phylip_sequential_writer'
+require 'lib/evo/io/parser/general_msa_parser'
+require 'lib/evo/io/writer/msa_writer'
+
+require 'iconv'
+
+module Evoruby
+
+    class EvoNursery
+        GAP_RATIO           = 0.75                # handed to remove_gap_columns_w_gap_ratio! in run
+        GAP_RATIO_FOR_SEQS  = 0.75                # handed to remove_sequences_by_gap_ratio! in run
+        MIN_LENGTH          = 40                  # alignments shorter than this after column removal are skipped
+        MIN_SEQS            = 4                   # alignments with fewer sequences are skipped
+        MAX_SEQS            = 1600                # alignments with more sequences are skipped
+        MAX_ALN_FILE_SIZE   = 4000000             # compared against File.size; larger files are skipped
+        MODEL               = :auto               # handed to TreePuzzle#run
+        RATES               = :uniform            # handed to TreePuzzle#run
+        FASTME_INITIAL_TREE = :GME                # handed to FastMe#run
+        ALN_NAME            = '_align_'           # name of the temporary alignment file in the output directory
+        TREE_PUZZLE_OUTDIST = TreePuzzle::OUTDIST # read by FastMe#run, deleted after each alignment
+        TREE_PUZZLE_OUTFILE = TreePuzzle::OUTFILE # scanned for the substitution model, deleted after each alignment
+        FASTME_OUTTREE      = FastMe::OUTTREE     # handed to the decorator command
+        FASTME_OUTPUT_D     = FastMe::OUTPUT_D    # deleted after each alignment
+        # Program information printed at start-up.
+        PRG_NAME       = "evo_nursery"
+        PRG_DATE       = "2009.10.15"
+        PRG_DESC       = "pfam alignments to evolutionary trees"
+        PRG_VERSION    = "0.20"
+        COPYRIGHT      = "2009-2010 Christian M Zmasek"
+        CONTACT        = "phylosoft@gmail.com"
+        WWW            = "www.phylosoft.org"
+        # Either option makes run print the usage text and exit.
+        HELP_OPTION_1       = "help"
+        HELP_OPTION_2       = "h"
+
+        def run
+            # Entry point: processes each alignment file in the current directory; result files go to the output directory given on the command line.
+            Util.print_program_information( PRG_NAME,
+                PRG_VERSION,
+                PRG_DESC,
+                PRG_DATE,
+                COPYRIGHT,
+                CONTACT,
+                WWW,
+                STDOUT )
+            # NOTE(review): the dot in /1.9/ is unescaped, so a version such as "2.1.9" also passes this check.
+            if RUBY_VERSION !~ /1.9/
+                puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " )
+                exit( -1 )
+            end
+            # Both environment variables are needed to assemble the java command line for forester's decorator.
+            forester_home = Util.get_env_variable_value( Constants::FORESTER_HOME_ENV_VARIABLE )
+            java_home = Util.get_env_variable_value( Constants::JAVA_HOME_ENV_VARIABLE )
+            decorator = java_home + '/bin/java -cp ' + forester_home + '/java/forester.jar org.forester.application.decorator'
+            # Exactly one command line argument is accepted.
+            if ( ARGV == nil || ARGV.length != 1 )
+                help
+                exit( -1 )
+            end
+            # Parse the command line.
+            begin
+                cla = CommandLineArguments.new( ARGV )
+            rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+            # Help was asked for.
+            if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+                     cla.is_option_set?( HELP_OPTION_2 ) )
+                help
+                exit( 0 )
+            end
+            # The one file argument names the output directory.
+            output_dir = cla.get_file_name( 0 )
+            # Make sure it ends in a slash, since file names get appended to it below.
+            if output_dir !~ /\/$/
+                output_dir = output_dir + '/'
+            end
+            # The output directory has to exist already.
+            if !File.exists?( output_dir )
+                Util.fatal_error( PRG_NAME, output_dir.to_s + " does not exist", STDOUT )
+            end
+            ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' ) # applied to every line read from the Pfam and TREE-PUZZLE files below
+            files = Dir.entries( "." ) # alignments are looked for in the current working directory
+            skipped = Array.new
+            counter = 1
+            analyzed = 0;
+            begin
+                files.each { |pfam_aln_file|
+                    if ( !File.directory?( pfam_aln_file ) &&
+                             pfam_aln_file !~ /^\./ &&
+                             pfam_aln_file !~ /.+\.tre$/  )
+                        # Directories, dot-files and *.tre files are not treated as alignments.
+                        tree_out_file = output_dir + File.basename( pfam_aln_file ) + ".xml"
+                        # Do not redo alignments whose result file is already present.
+                        if File.exists?( tree_out_file )
+                            puts
+                            puts
+                            puts "***** skipping " + File.basename( pfam_aln_file ) + ", already exists"
+                            puts
+                            skipped.push( File.basename( pfam_aln_file ) + " [already exists]" )
+                            next
+                        end
+                        # Progress line; counter numbers the files that get past the "already exists" check.
+                        puts
+                        puts counter.to_s + ": " + pfam_aln_file.to_str
+                        counter += 1
+                        if File.size( pfam_aln_file ) > MAX_ALN_FILE_SIZE
+                            puts "***** skipping, file size: " +  File.size( pfam_aln_file ).to_s
+                            skipped.push( File.basename( pfam_aln_file ) + " [file size: " +  File.size( pfam_aln_file ).to_s + "]" )
+                            next
+                        end
+                        # Read the alignment with the general MSA parser.
+                        f = MsaFactory.new()
+                        msa = f.create_msa_from_file( pfam_aln_file, GeneralMsaParser.new() )
+                        # Filter: the number of sequences must lie within [MIN_SEQS, MAX_SEQS].
+                        if msa.get_number_of_seqs < MIN_SEQS || msa.get_number_of_seqs > MAX_SEQS
+                            puts "***** skipping, seqs: " + msa.get_number_of_seqs.to_s
+                            skipped.push( File.basename( pfam_aln_file ) + " [seqs: " +  msa.get_number_of_seqs.to_s + "]" )
+                            next
+                        end
+                        # Filter: remove columns by gap ratio, then require a minimal remaining length.
+                        msa.remove_gap_columns_w_gap_ratio!( GAP_RATIO )
+
+                        length = msa.get_length
+                        if length < MIN_LENGTH
+                            puts "***** skipping, length: " + length.to_s
+                            skipped.push( File.basename( pfam_aln_file ) + " [length: " +  length.to_s + "]" )
+                            next
+                        end
+                        # Filter: remove sequences by gap ratio, then re-check the minimal number of sequences.
+                        msa.remove_sequences_by_gap_ratio!( GAP_RATIO_FOR_SEQS )
+
+                        if msa.get_number_of_seqs < MIN_SEQS
+                            puts "***** skipping, seqs: " + msa.get_number_of_seqs.to_s
+                            skipped.push( File.basename( pfam_aln_file ) + " [seqs: " +  msa.get_number_of_seqs.to_s + "]" )
+                            next
+                        end
+                        # Write one mapping line per sequence (index plus annotation fields) and rename each sequence to its index.
+                        map_file = output_dir + File.basename( pfam_aln_file ) + ".map"
+                        f = File.open( map_file, 'a' )
+                        for i in 0 ... msa.get_number_of_seqs
+                            name = msa.get_sequence( i ).get_name()
+                            name =~ /(.+)_(.+)\/.+/ # NOTE(review): acc and tax_code are nil when the name does not have this form
+                            acc = $1
+                            tax_code = $2
+
+                            mapping_str = i.to_s
+                            mapping_str << "\t"
+                            mapping_str << 'TAXONOMY_CODE:'
+                            mapping_str << tax_code
+                            mapping_str << "\t"
+                            mapping_str << 'SEQ_SYMBOL:'
+                            mapping_str << ( acc + '_' + tax_code )
+                            mapping_str << "\t"
+                            if ( acc.length < 6 ) # short accessions are reported together with the taxonomy code
+                                acc = acc + '_' + tax_code
+                            end
+                            mapping_str << 'SEQ_ACCESSION:'
+                            mapping_str << acc
+                            mapping_str << "\t"
+                            mapping_str << 'SEQ_ACCESSION_SOURCE:UniProtKB'
+                            mapping_str << "\t"
+                            mapping_str << 'NODE_NAME:'
+                            mapping_str << name
+                            f.print( mapping_str )
+                            f.print( "\n" )
+                            name = msa.get_sequence( i ).set_name( i.to_s )
+                        end
+                        f.close
+                        # Write the renamed alignment with PhylipSequentialWriter to a temporary file in the output directory.
+                        io = MsaIO.new()
+                        w = MsaWriter # NOTE(review): overwritten on the next line
+                        w = PhylipSequentialWriter.new()
+                        w.clean( true )
+                        w.set_max_name_length( 10 )
+                        if File.exists?( output_dir + ALN_NAME )
+                            File.unlink( output_dir + ALN_NAME )
+                        end
+                        io.write_to_file( msa, output_dir + ALN_NAME, w )
+                        # Run TREE-PUZZLE on the temporary alignment, then keep the alignment as <file>.aln.
+                        tp = TreePuzzle.new()
+                        tp.run( output_dir + ALN_NAME,
+                            MODEL,
+                            RATES,
+                            msa.get_number_of_seqs )
+
+                        File.rename( output_dir + ALN_NAME, output_dir  + File.basename( pfam_aln_file ) + ".aln" )
+                        # Run FastME on the TREE-PUZZLE distance output.
+                        fastme = FastMe.new()
+                        fastme.run( TREE_PUZZLE_OUTDIST, 0, FASTME_INITIAL_TREE )
+                        # Pull the "#=AC" and "#=DE" values from the alignment file; both are required.
+                        pfam_acc = nil
+                        pfam_de = nil
+                        File.open( pfam_aln_file ) do |file|
+                            while line = file.gets
+                                line = ic.iconv( line )
+                                if line =~ /^#=AC\s+(.+)/
+                                    pfam_acc = $1
+                                end
+                                if line =~ /^#=DE\s+(.+)/
+                                    pfam_de = $1
+                                end
+                                if pfam_acc && pfam_de
+                                    break
+                                end
+                            end
+                        end
+                        if !pfam_acc || !pfam_de
+                            Util.fatal_error( PRG_NAME, "problem with " + pfam_aln_file.to_s, STDOUT )
+                        end
+                        # Pull the substitution model reported in the TREE-PUZZLE output file.
+                        puzzle_model = nil
+                        File.open( TREE_PUZZLE_OUTFILE ) do |file|
+                            while line = file.gets
+                                line = ic.iconv( line )
+                                if line =~ /^Model\s+of\s+substitution:\s+(.+)/
+                                    puzzle_model = $1
+                                    break
+                                end
+                            end
+                        end
+                        if !puzzle_model
+                            Util.fatal_error( PRG_NAME, "problem with puzzle outfile: " + TREE_PUZZLE_OUTFILE.to_s, STDOUT )
+                        end
+                        # Assemble a free-text description of how the tree was made.
+                        desc = pfam_de # same String object: the appends below also modify pfam_de
+                        desc << ' | '
+                        desc << 'ML pwd estimation by TREE-PUZZLE version '
+                        desc << TreePuzzle::VERSION
+                        desc << ', model: '
+                        desc << puzzle_model
+                        desc << ', rates: '
+                        desc << RATES.to_s
+                        desc << '; tree estimation by FastME version '
+                        desc << FastMe::VERSION
+                        desc << ', initial tree: '
+                        desc << FASTME_INITIAL_TREE.to_s
+                        desc << '; aln length: '
+                        desc << msa.get_length.to_s
+                        # NOTE(review): file names and desc are spliced into the command string without any shell escaping.
+                        cmd = decorator + " -table -p -pn=\"" + pfam_aln_file +
+                         "\" -pi=pfam:" + pfam_acc +
+                         " -pd=\"" + desc + "\" " +
+                         FASTME_OUTTREE + ' ' +
+                         map_file + ' ' + tree_out_file
+                        # Run the decorator; its input side is closed right away and its output is not read.
+                        IO.popen( cmd , 'r+' ) do | pipe |
+                            pipe.close_write
+                        end
+                        analyzed += 1
+                        # Remove the mapping file and the TREE-PUZZLE/FastME output files listed here.
+                        File.unlink( map_file )
+                        File.unlink(TREE_PUZZLE_OUTDIST)
+                        File.unlink( TREE_PUZZLE_OUTFILE )
+                        File.unlink( FASTME_OUTPUT_D )
+                    end
+                }
+            rescue ArgumentError, IOError, StandardError => e # ArgumentError and IOError are StandardErrors themselves
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+            end
+            # Final report: skipped alignments, counts, and the settings used.
+            puts()
+            puts( 'Skipped:' )
+            puts()
+            for i in 0 ... skipped.size
+                puts i.to_s + ": " + skipped[ i ]
+            end
+
+            puts()
+            puts( 'Skipped : ' + skipped.size.to_s + ' alignments' )
+            puts( 'Analyzed: ' +  analyzed.to_s    + ' alignments' )
+
+            puts( 'Min gap ratio for col del  : ' + GAP_RATIO.to_s )
+            puts( 'Min gap ratio for seq del  : ' + GAP_RATIO_FOR_SEQS.to_s )
+            puts( 'Minimal aln length         : ' + MIN_LENGTH.to_s )
+            puts( 'Minimal number of sequences: ' + MIN_SEQS.to_s )
+            puts( 'Maximal number of sequences: ' + MAX_SEQS.to_s )
+            puts( 'Maximal aln file size      : ' + MAX_ALN_FILE_SIZE.to_s )
+            puts( 'Model              : ' + MODEL.to_s )
+            puts( 'FastME initial tree: ' + FASTME_INITIAL_TREE.to_s )
+
+            puts()
+            puts( '[' + PRG_NAME + '] > OK' )
+            puts()
+
+        end  # run
+
+        private
+
+        def help
+            puts( "Usage:" )
+            puts()
+            puts( "  " + PRG_NAME + ".rb <output dir> " )
+            puts()
+        end
+
+
+    end # class EvoNursery
+
+end # module Evoruby
\ No newline at end of file
diff --git a/forester/ruby/evoruby/lib/evo/apps/fasta_extractor.rb b/forester/ruby/evoruby/lib/evo/apps/fasta_extractor.rb
new file mode 100644 (file)
index 0000000..b2d0d5c
--- /dev/null
@@ -0,0 +1,146 @@
+#
+# = lib/evo/apps/fasta_extractor.rb - FastaExtractor class
+#
+# Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: fasta_extractor.rb,v 1.2 2010/12/13 19:00:11 cmzmasek Exp $
+
+
+require 'lib/evo/util/util'
+require 'lib/evo/util/constants'
+require 'lib/evo/util/command_line_arguments'
+
+
+module Evoruby
+
+    class FastaExtractor
+
+        PRG_NAME                           = "fae"
+        PRG_VERSION                        = "1.0.0"
+        PRG_DESC                           = "extraction of nucleotide sequences from a fasta file by names from wublast search"
+        PRG_DATE                           = "2008.08.09"
+        COPYRIGHT                          = "2008-2009 Christian M Zmasek"
+        CONTACT                            = "phylosoft@gmail.com"
+        WWW                                = "www.phylosoft.org"
+        HELP_OPTION_1                      = 'help'
+        HELP_OPTION_2                      = 'h'
+
+
+        def run()
+
+            Util.print_program_information( PRG_NAME,
+                PRG_VERSION,
+                PRG_DESC ,
+                PRG_DATE,
+                COPYRIGHT,
+                CONTACT,
+                WWW,
+                STDOUT )
+
+            ld = Constants::LINE_DELIMITER
+
+            begin
+                cla = CommandLineArguments.new( ARGV )
+            rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+
+            if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+                     cla.is_option_set?( HELP_OPTION_2 ) )
+                print_help
+                exit( 0 )
+            end
+
+            if ( cla.get_number_of_files != 3 )
+                print_help
+                exit( -1 )
+            end
+
+            allowed_opts = Array.new
+
+            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+            if ( disallowed.length > 0 )
+                Util.fatal_error( PRG_NAME,
+                    "unknown option(s): " + disallowed,
+                    STDOUT )
+            end
+
+            input_file  = cla.get_file_name( 0 )
+            names_file  = cla.get_file_name( 1 )
+            output_file = cla.get_file_name( 2 )
+
+            if  !File.exist?( input_file )
+                Util.fatal_error( PRG_NAME, "error: input file [#{input_file}] does not exist" )
+            end
+            if  !File.exist?( names_file )
+                Util.fatal_error( PRG_NAME, "error: names file [#{names_file}] does not exist" )
+            end
+            if File.exist?( output_file   )
+                Util.fatal_error( PRG_NAME, "error: [#{output_file }] already exists" )
+            end
+
+            names = extract_names_with_frames( names_file )
+
+            extract_sequences( names, input_file, output_file )
+
+            puts
+            Util.print_message( PRG_NAME, "OK" )
+            puts
+
+        end
+
+
+        def extract_names_with_frames( names_file )
+            names = Hash.new()
+            File.open( names_file ) do | file |
+                while line = file.gets
+                    if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
+                        if ( line =~ /(\S+)\s+([+|-]\d)\s+\d+\s+(\S+)/ )
+                            name  = $1
+                            frame = $2
+                            e     = $3
+                            names[ name ] =  "[" + frame + "] [" + e + "]"
+                        end
+                    end
+                end
+            end
+            names
+        end
+
+        def extract_sequences( names, fasta_file, output_file )
+            output = File.open( output_file, "a" )
+            matching_state = false
+            counter = 0
+            File.open( fasta_file ) do | file |
+                while line = file.gets
+                    if !Util.is_string_empty?( line )
+                        if ( line =~ /\s*>\s*(.+)/ )
+                            name = $1
+                            if names.has_key?( name )
+                                matching_state = true
+                                counter += 1
+                                puts counter.to_s + ". " +name + " " + names[ name ]
+                                output.print( ">" + name + " " + names[ name ] )
+                                output.print( Evoruby::Constants::LINE_DELIMITER )
+                            else
+                                matching_state = false
+                            end
+                        elsif matching_state
+                            output.print( line )
+                        end
+                    end
+                end
+            end
+            output.close()
+        end
+
+        def print_help()
+            puts( "Usage:" )
+            puts()
+            puts( "  " + PRG_NAME + ".rb <input fasta file> <names file based on blast output> <output file>" )
+            puts()
+        end
+
+    end # class FastaExtractor
+end
\ No newline at end of file
diff --git a/forester/ruby/evoruby/lib/evo/apps/fasta_taxonomy_processor.rb b/forester/ruby/evoruby/lib/evo/apps/fasta_taxonomy_processor.rb
new file mode 100644 (file)
index 0000000..6ae3cf1
--- /dev/null
@@ -0,0 +1,205 @@
+#
+# = lib/evo/apps/fasta_taxonomy_processor.rb - FastaTaxonomyProcessor class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: fasta_taxonomy_processor.rb,v 1.4 2010/12/13 19:00:11 cmzmasek Exp $
+
+
+require 'lib/evo/util/util'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/msa/msa'
+require 'lib/evo/io/msa_io'
+require 'lib/evo/io/parser/sp_taxonomy_parser'
+require 'lib/evo/io/parser/fasta_parser'
+require 'lib/evo/io/writer/fasta_writer'
+require 'lib/evo/io/writer/phylip_sequential_writer'
+require 'lib/evo/util/command_line_arguments'
+require 'lib/evo/apps/tseq_taxonomy_processor'
+
+module Evoruby
+
+    class FastaTaxonomyProcessor
+
+        PRG_NAME       = "fasta_tap"
+        PRG_DATE       = "2009.01.20"
+        PRG_DESC       = "preprocessing of multiple sequence files in ncbi fasta format"
+        PRG_VERSION    = "1.00"
+        COPYRIGHT      = "2009 Christian M Zmasek"
+        CONTACT        = "phylosoft@gmail.com"
+        WWW            = "www.phylosoft.org"
+
+        def initialize()
+            @tax_ids_to_sp_taxonomies = Hash.new()
+        end
+
+        def run()
+
+            Util.print_program_information( PRG_NAME,
+                PRG_VERSION,
+                PRG_DESC,
+                PRG_DATE,
+                COPYRIGHT,
+                CONTACT,
+                WWW,
+                STDOUT )
+
+            if  ARGV == nil || ARGV.length != 4
+                puts( "Usage: #{PRG_NAME}.rb <sp taxonomy file> <sequences in ncbi fasta format> <name for fasta outfile> <name for map outfile>" )
+                puts()
+                exit( -1 )
+            end
+
+            begin
+                cla = CommandLineArguments.new( ARGV )
+            rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+            allowed_opts = Array.new
+            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+            if ( disallowed.length > 0 )
+                Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
+            end
+
+            sp_taxonomy_infile = cla.get_file_name( 0 )
+            sequences_infile = cla.get_file_name( 1 )
+            sequences_outfile = cla.get_file_name( 2 )
+            mapping_outfile = cla.get_file_name( 3 )
+
+            Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile )
+            Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile )
+            Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile )
+            Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile )
+
+            sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile )
+
+            Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile )
+
+            fasta_parser = FastaParser.new
+            msa_fac = MsaFactory.new
+
+            seqs = msa_fac.create_msa_from_file( sequences_infile, fasta_parser )
+
+            Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile )
+
+            removed = seqs.remove_redundant_sequences!( true, true )
+
+            if removed.size > 0
+                Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
+                removed.each { | seq_name |
+                    puts seq_name
+                }
+                Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" )
+            end
+
+            mapping_out = File.open( mapping_outfile, "a" )
+
+            for i in 0 ... seqs.get_number_of_seqs
+                seq = seqs.get_sequence( i )
+                seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) )
+            end
+
+            io = MsaIO.new()
+
+            w = FastaWriter.new()
+
+            w.set_max_name_length( 10 )
+            w.clean( true )
+            begin
+                io.write_to_file( seqs, sequences_outfile, w )
+            rescue Exception => e
+                Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
+            end
+            mapping_out.close()
+
+            Util.print_message( PRG_NAME, "wrote: " + mapping_outfile )
+            Util.print_message( PRG_NAME, "wrote: " + sequences_outfile )
+            Util.print_message( PRG_NAME, "OK" )
+
+        end
+
+        private
+
+        def modify_name( seq, i, sp_taxonomies, mapping_outfile )
+
+            #i = i + 1792
+            
+            seq_desc = seq.get_name
+
+            taxonomy_sn = nil
+
+            if seq_desc =~ /\[(.+)\]/
+                taxonomy_sn = $1
+            else
+                Util.fatal_error( PRG_NAME, "no taxonomy in [" + seq_desc + "]"  )
+            end
+
+            matching_sp_taxonomy = nil
+
+            sp_taxonomies.each { |sp_taxonomy|
+                if ( sp_taxonomy.scientific_name == taxonomy_sn )
+                    matching_sp_taxonomy = sp_taxonomy
+                end
+            }
+
+            if  matching_sp_taxonomy == nil
+                Util.fatal_error( PRG_NAME, "taxonomy [" + taxonomy_sn + "] for [" + seq_desc + "] not found" )
+            end
+
+            new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code
+
+            gi = nil
+            if seq_desc =~ /gi\|(.+?)\|/
+                gi = $1
+            else
+              Util.fatal_error( PRG_NAME, "no gi in [" + seq_desc + "]"  )
+            end
+
+            seq_name = ""
+
+            if seq_desc =~ /\|\s*([^|]+?)\s*\[/
+                seq_name = $1
+            end
+
+            if  seq_name =~ /\[.+\]$/
+                # Redundant taxonomy information hides here.
+                seq_name = seq_name.sub(/\[.+\]$/, '')
+            end
+            if  seq_name =~ /^\s*hypothetical\s+protein\s*/i
+                # Pointless information.
+                seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' )
+            end
+            if  seq_name =~ /^\s*conserved\s+hypothetical\s+protein\s*/i
+                # Pointless information.
+                seq_name = seq_name.sub( /^\s*conserved\s+hypothetical\s+protein\s*/i, '' )
+            end
+
+            if gi != nil
+            mapping_outfile.print( new_name + "\t" +
+                 TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
+                 TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" +
+                 TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" +
+                 TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
+                 TseqTaxonomyProcessor::SEQ_ACCESSION + gi.to_s + "\t" +
+                 TseqTaxonomyProcessor::SEQ_ACCESSION_SOURCE + "gi" + "\t" +
+                 TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" +
+                 TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string +
+                 Constants::LINE_DELIMITER )
+            else
+                 mapping_outfile.print( new_name + "\t" +
+                 TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
+                 TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" +
+                 TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" +
+                 TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
+                 TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" +
+                 TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string +
+                 Constants::LINE_DELIMITER )
+                
+            end    
+            new_name
+        end
+
+    end
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/apps/hmmscan_parser.rb b/forester/ruby/evoruby/lib/evo/apps/hmmscan_parser.rb
new file mode 100644 (file)
index 0000000..c2f8177
--- /dev/null
@@ -0,0 +1,265 @@
+#
+# = lib/evo/apps/hmmscan_parser.rb - HmmscanParser class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
+#
+# last modified: 11/24/2009
+
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/util/command_line_arguments'
+
+module Evoruby
+
+    class HmmscanParser
+
+        PRG_NAME       = "hsp"
+        PRG_VERSION    = "1.0.1"
+        PRG_DESC       = "hmmscan parser"
+        PRG_DATE       = "2009.11.24"
+        COPYRIGHT      = "2009 Christian M Zmasek"
+        CONTACT        = "phylosoft@gmail.com"
+        WWW            = "www.phylosoft.org"
+
+        DELIMITER_OPTION              = "d"
+        E_VALUE_THRESHOLD_OPTION      = "e"
+        IGNORE_DUF_OPTION             = "i"
+        PARSE_OUT_DESCRIPITION_OPTION = "a"
+        HELP_OPTION_1                 = "help"
+        HELP_OPTION_2                 = "h"
+
+        def initialize
+            @domain_counts = Hash.new
+        end
+
+        # raises ArgumentError, IOError
+        def parse( inpath,
+                outpath,
+                column_delimiter,
+                e_value_threshold,
+                ignore_dufs,
+                get_descriptions )
+            Util.check_file_for_readability( inpath )
+            Util.check_file_for_writability( outpath )
+
+            outfile = File.open( outpath, "a" )
+
+            query    = String.new
+            desc     = String.new
+            model    = String.new
+            env_from = String.new
+            env_to   = String.new
+            i_e_value  = String.new
+
+            queries_count = 0
+
+            nl = Constants::LINE_DELIMITER
+
+            File.open( inpath ) do | file |
+                while line = file.gets
+                    if !HmmscanParser.is_ignorable?( line ) && line =~ /^\S+\s+\S/
+
+                        #         tn      acc     tlen    query   acc     qlen    Evalue  score   bias    #       of      c-E     i-E     score   bias    hf      ht      af      at      ef      et      acc     desc
+                        #         1       2       3       4       5       6       7       8       9       10      11      12      13      14      15      16      17      18      19      20      21      22      23
+                        line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/
+
+                        model     = $1
+                        query     = $4
+                        i_e_value = $13.to_f
+                        env_from  = $20.to_i
+                        env_to    = $21.to_i
+
+                        if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
+                                 ( !ignore_dufs || ( model !~ /^DUF\d+/ ) ) )
+                            count_model( model )
+                            outfile.print( query +
+                                 column_delimiter )
+                            if ( get_descriptions )
+                                outfile.print( desc +
+                                     column_delimiter )
+                            end
+                            outfile.print( model +
+                                 column_delimiter +
+                                 env_from.to_s +
+                                 column_delimiter +
+                                 env_to.to_s +
+                                 column_delimiter +
+                                 i_e_value.to_s )
+                            outfile.print( nl )
+                        end
+                    end
+                end # while line = file.gets
+            end
+            outfile.flush()
+            outfile.close()
+
+            return queries_count
+
+        end # def parse
+
+        def count_model( model )
+            if ( @domain_counts.has_key?( model ) )
+                count = @domain_counts[ model ].to_i
+                count += 1
+                @domain_counts[ model ] = count
+            else
+                @domain_counts[ model ] = 1
+            end
+        end
+
+
+        def get_domain_counts()
+            return @domain_counts
+        end
+
+        def run()
+
+            Util.print_program_information( PRG_NAME,
+                PRG_VERSION,
+                PRG_DESC,
+                PRG_DATE,
+                COPYRIGHT,
+                CONTACT,
+                WWW,
+                STDOUT )
+
+            begin
+                cla = CommandLineArguments.new( ARGV )
+            rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+            end
+
+            if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+                     cla.is_option_set?( HELP_OPTION_2 ) )
+                print_help
+                exit( 0 )
+            end
+
+            if ( cla.get_number_of_files != 2 )
+                print_help
+                exit( -1 )
+            end
+
+            allowed_opts = Array.new
+            allowed_opts.push( DELIMITER_OPTION )
+            allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
+            allowed_opts.push( IGNORE_DUF_OPTION )
+            allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
+
+            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+            if ( disallowed.length > 0 )
+                Util.fatal_error( PRG_NAME,
+                    "unknown option(s): " + disallowed,
+                    STDOUT )
+            end
+
+            inpath = cla.get_file_name( 0 )
+            outpath = cla.get_file_name( 1 )
+
+            column_delimiter = "\t"
+            if ( cla.is_option_set?( DELIMITER_OPTION ) )
+                begin
+                    column_delimiter = cla.get_option_value( DELIMITER_OPTION )
+                rescue ArgumentError => e
+                    Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+                end
+            end
+
+            e_value_threshold = -1.0
+            if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
+                begin
+                    e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
+                rescue ArgumentError => e
+                    Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+                end
+                if ( e_value_threshold < 0.0 )
+                    Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
+                end
+            end
+
+            ignore_dufs = false
+            if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
+                ignore_dufs = true
+            end
+
+            parse_descriptions = false
+            if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
+                parse_descriptions = true
+            end
+
+            puts()
+            puts( "hmmpfam outputfile: " + inpath )
+            puts( "outputfile        : " + outpath )
+            if ( e_value_threshold >= 0.0 )
+                puts( "E-value threshold : " + e_value_threshold.to_s )
+            else
+                puts( "E-value threshold : no threshold" )
+            end
+            if ( parse_descriptions )
+                puts( "parse descriptions: true" )
+            else
+                puts( "parse descriptions: false" )
+            end
+            if ( ignore_dufs )
+                puts( "ignore DUFs       : true" )
+            else
+                puts( "ignore DUFs       : false" )
+            end
+            if ( column_delimiter == "\t" )
+                puts( "column delimiter  : TAB" )
+            else
+                puts( "column delimiter  : " + column_delimiter )
+            end
+            puts()
+
+            begin
+                queries_count = parse( inpath,
+                    outpath,
+                    column_delimiter,
+                    e_value_threshold,
+                    ignore_dufs,
+                    parse_descriptions )
+            rescue ArgumentError, IOError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+            end
+            domain_counts = get_domain_counts()
+
+            puts
+            puts( "read output for a total of " + queries_count.to_s + " query sequences" )
+            puts
+            puts( "domain counts (considering potential E-value threshold and ignoring of DUFs):" )
+            puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
+            puts
+            puts( Util.draw_histogram( domain_counts, "#" ) )
+            puts
+            Util.print_message( PRG_NAME, 'OK' )
+            puts
+
+        end # def run()
+
+        def print_help()
+            puts( "Usage:" )
+            puts()
+            puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
+            puts()
+            puts( "  options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
+            puts( "           -" + E_VALUE_THRESHOLD_OPTION  + ": E-value threshold, default is no threshold" )
+            puts( "           -" + PARSE_OUT_DESCRIPITION_OPTION  + ": parse query description (in addition to query name)" )
+            puts( "           -" + IGNORE_DUF_OPTION  + ": ignore DUFs" )
+            puts()
+        end
+
+
+        private
+
+
+        def HmmscanParser.is_ignorable?( line )
+            return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
+        end
+
+    end # class HmmscanParser
+
+end # module Evoruby
\ No newline at end of file
diff --git a/forester/ruby/evoruby/lib/evo/apps/msa_processor.rb b/forester/ruby/evoruby/lib/evo/apps/msa_processor.rb
new file mode 100644 (file)
index 0000000..708a9b7
--- /dev/null
@@ -0,0 +1,839 @@
+#
+# = lib/evo/apps/msa_processor.rb - MsaProcessor class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: msa_processor.rb,v 1.33 2010/12/13 19:00:10 cmzmasek Exp $
+#
+
+require 'date'
+require 'set'
+
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/util/command_line_arguments'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/io/msa_io'
+require 'lib/evo/io/writer/phylip_sequential_writer'
+require 'lib/evo/io/writer/nexus_writer'
+require 'lib/evo/io/writer/fasta_writer'
+require 'lib/evo/io/parser/fasta_parser'
+require 'lib/evo/io/parser/general_msa_parser'
+require 'lib/evo/io/writer/msa_writer'
+
+module Evoruby
+
+  class MsaProcessor
+
+    # Program metadata; run passes these to Util.print_program_information.
+    PRG_NAME       = "msa_pro"
+    PRG_DATE       = "2010.03.19"
+    PRG_DESC       = "processing of multiple sequence alignments"
+    PRG_VERSION    = "1.05"
+    COPYRIGHT      = "2008-2010 Christian M Zmasek"
+    CONTACT        = "phylosoft@gmail.com"
+    WWW            = "www.phylosoft.org"
+
+
+    # Defaults used by initialize for the maximal name length and the
+    # Fasta line width.
+    NAME_LENGTH_DEFAULT                = 10
+    WIDTH_DEFAULT_FASTA                = 60
+    # Names of the command line options (given as -<name> or -<name>=<value>);
+    # print_help describes the meaning of each one.
+    INPUT_TYPE_OPTION                  = "i"
+    OUTPUT_TYPE_OPTION                 = "o"
+    MAXIMAL_NAME_LENGTH_OPTION         = "n"
+    WIDTH_OPTION                       = "w"
+    CLEAN_UP_SEQ_OPTION                = "c"
+    REM_RED_OPTION                     = "rem_red"
+    REMOVE_GAP_COLUMNS_OPTION          = "rgc"
+    REMOVE_GAP_ONLY_COLUMNS            = "rgoc"
+    REMOVE_COLUMNS_GAP_RATIO_OPTION    = "rr"
+    REMOVE_ALL_GAP_CHARACTERS_OPTION   = "rg"
+    REMOVE_ALL_SEQUENCES_LISTED_OPTION = "r"
+    KEEP_ONLY_SEQUENCES_LISTED_OPTION  = "k"
+
+    KEEP_MATCHING_SEQUENCES_OPTION     = "mk"
+    REMOVE_MATCHING_SEQUENCES_OPTION   = "mr"
+
+    TRIM_OPTION                        = "t"
+    REMOVE_SEQS_GAP_RATIO_OPTION       = "rsgr"
+    REMOVE_SEQS_NON_GAP_LENGTH_OPTION  = "rsl"
+    SPLIT                              = "split"
+    # Appended to the output path to form the name of the log file.
+    LOG_SUFFIX                         = "_msa_pro.log"
+    HELP_OPTION_1                      = "help"
+    HELP_OPTION_2                      = "h"
+
+
+    def initialize()
+      @input_format_set = false
+      @output_format_set = false
+      @fasta_input      = false
+      @phylip_input     = true
+      @name_length      = NAME_LENGTH_DEFAULT
+      @name_length_set  = false
+      @width            = WIDTH_DEFAULT_FASTA     # fasta only
+      @pi_output        = true
+      @fasta_output     = false
+      @nexus_output     = false
+      @clean            = false  # phylip only
+      @rgc              = false
+      @rgoc             = false
+      @rg               = false  # fasta only
+      @rem_red          = false
+      @rgr              = -1
+      @rsgr             = -1
+      @rsl              = -1
+      @remove_matching  = nil
+      @keep_matching    = nil
+
+      @seqs_name_file   = nil
+      @remove_seqs      = false
+      @keep_seqs        = false
+      @trim             = false
+      @split            = -1
+      @first            = -1
+      @last             = -1
+    end
+
+
+    # Command line entry point of msa_pro.
+    #
+    # Reads options and the two file arguments (input alignment, output)
+    # from ARGV, loads the alignment, applies the requested operations
+    # (keep/remove sequences, trimming, gap column removal, sequence
+    # filters, name matching) and writes the result, either as one output
+    # file or, when splitting was requested, as several files named
+    # <output>_<i>. A textual log is collected in the local "log" string
+    # along the way.
+    #
+    # Errors are reported through Util.fatal_error.
+    def run()
+
+      Util.print_program_information( PRG_NAME,
+        PRG_VERSION,
+        PRG_DESC,
+        PRG_DATE,
+        COPYRIGHT,
+        CONTACT,
+        WWW,
+        STDOUT )
+
+      if ( ARGV == nil || ARGV.length < 1 )
+        Util.print_message( PRG_NAME, "Illegal number of arguments" )
+        print_help
+        exit( -1 )
+      end
+
+      begin
+        cla = CommandLineArguments.new( ARGV )
+      rescue ArgumentError => e
+        Util.fatal_error( PRG_NAME, "Error: " + e.to_s, STDOUT )
+      end
+
+      if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+           cla.is_option_set?( HELP_OPTION_2 ) )
+        print_help
+        exit( 0 )
+      end
+
+      if ( cla.get_number_of_files != 2 || ARGV.length < 2 )
+        Util.print_message( PRG_NAME, "Illegal number of arguments" )
+        print_help
+        exit( -1 )
+      end
+
+      allowed_opts = Array.new
+      allowed_opts.push( INPUT_TYPE_OPTION )
+      allowed_opts.push( OUTPUT_TYPE_OPTION )
+      allowed_opts.push( MAXIMAL_NAME_LENGTH_OPTION )
+      allowed_opts.push( WIDTH_OPTION )
+      allowed_opts.push( CLEAN_UP_SEQ_OPTION )
+      allowed_opts.push( REMOVE_GAP_COLUMNS_OPTION )
+      allowed_opts.push( REMOVE_GAP_ONLY_COLUMNS )
+      allowed_opts.push( REMOVE_COLUMNS_GAP_RATIO_OPTION )
+      allowed_opts.push( REMOVE_ALL_GAP_CHARACTERS_OPTION )
+      allowed_opts.push( REMOVE_ALL_SEQUENCES_LISTED_OPTION )
+      allowed_opts.push( KEEP_ONLY_SEQUENCES_LISTED_OPTION )
+      allowed_opts.push( TRIM_OPTION )
+      allowed_opts.push( REMOVE_SEQS_GAP_RATIO_OPTION )
+      allowed_opts.push( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
+      allowed_opts.push( SPLIT )
+      allowed_opts.push( REM_RED_OPTION )
+      allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION )
+      allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION )
+
+      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+      if ( disallowed.length > 0 )
+        Util.fatal_error( PRG_NAME,
+          "unknown option(s): " + disallowed )
+      end
+
+      input = cla.get_file_name( 0 )
+      output = cla.get_file_name( 1 )
+
+      # copy the command line options into the instance settings
+      analyze_command_line( cla )
+
+      begin
+        Util.check_file_for_readability( input )
+      rescue ArgumentError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+      begin
+        Util.check_file_for_writability( output )
+      rescue ArgumentError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+      # removing all gap characters forces Fasta output
+      if ( @rg )
+        set_pi_output( false )
+        set_fasta_output( true )
+        set_nexus_output( false )
+      end
+
+      # no input type given: decide via Util.looks_like_fasta?; the output
+      # format follows the input format unless it was set explicitly
+      if ( !@input_format_set )
+        fasta_like = false
+        begin
+          fasta_like = Util.looks_like_fasta?( input )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+        end
+        @fasta_input = fasta_like
+        @phylip_input = !fasta_like
+        if ( !@output_format_set )
+          @fasta_output = fasta_like
+          @pi_output = !fasta_like
+          @nexus_output = false
+        end
+      end
+
+      # from here on, each setting is echoed to STDOUT and appended to log
+      ld = Constants::LINE_DELIMITER
+      log = PRG_NAME + " " + PRG_VERSION + " [" + PRG_DATE + "]" + " LOG" + ld
+      now = DateTime.now
+      log << "Date/time: " + now.to_s + ld
+
+      puts()
+      puts( "Input alignment  : " + input )
+      log << "Input alignment  : " + input + ld
+      puts( "Output alignment : " + output )
+      log << "Output alignment : " + output + ld
+      if ( @phylip_input )
+        puts( "Input is         : Phylip, or something like it" )
+        log << "Input is         : Phylip, or something like it" + ld
+      elsif ( @fasta_input )
+        puts( "Input is         : Fasta" )
+        log << "Input is         : Fasta" + ld
+      end
+      if( @rgr >= 0 )
+        puts( "Max col gap ratio: " + @rgr.to_s )
+        log << "Max col gap ratio: " + @rgr.to_s + ld
+      elsif ( @rgc )
+        puts( "Remove gap colums" )
+        log << "Remove gap colums" + ld
+      elsif( @rgoc )
+        puts( "Remove gap only colums" )
+        log << "Remove gap only colums" + ld
+      end
+      if ( @clean )
+        puts( "Clean up         : true" )
+        log << "Clean up         : true" + ld
+      end
+
+      if ( @pi_output )
+        puts( "Output is        : Phylip interleaved" )
+        log << "Output is        : Phylip interleaved" + ld
+      elsif ( @fasta_output )
+        puts( "Output is        : Fasta" )
+        log << "Output is        : Fasta" + ld
+        if ( @width )
+          puts( "Width            : " + @width.to_s )
+          log << "Width            : " + @width.to_s + ld
+        end
+        if ( @rg )
+          puts( "Remove all gap characters (alignment is destroyed)" )
+          log << "Remove all gap characters (alignment is destroyed)" + ld
+        end
+      elsif ( @nexus_output )
+        puts( "Output is        : Nexus" )
+        log << "Output is        : Nexus" + ld
+      end
+      if ( @name_length_set || !@fasta_output )
+        puts( "Max name length  : " + @name_length.to_s )
+        log << "Max name length  : " + @name_length.to_s + ld
+      end
+      if( @rsgr >= 0 )
+        puts( "Remove sequences for which the gap ratio > " + @rsgr.to_s )
+        log << "Remove sequences for which the gap ratio > " + @rsgr.to_s + ld
+      end
+      if( @rsl >= 0 )
+        puts( "Remove sequences with less than "  + @rsl.to_s + " non-gap characters" )
+        log << "Remove sequences with less than "  + @rsl.to_s + " non-gap characters" + ld
+      end
+      if ( @remove_seqs )
+        puts( "Remove sequences listed in: " + @seqs_name_file )
+        log << "Remove sequences listed in: " + @seqs_name_file + ld
+      elsif ( @keep_seqs )
+        puts( "Keep only sequences listed in: " + @seqs_name_file )
+        log << "Keep only sequences listed in: " + @seqs_name_file + ld
+      end
+      if ( @trim )
+        puts( "Keep only columns from: "+ @first.to_s + " to " + @last.to_s )
+        log << "Keep only columns from: "+ @first.to_s + " to " + @last.to_s + ld
+      end
+      if ( @rem_red )
+        puts( "Remove redundant sequences: true" )
+        log << "Remove redundant sequences: true" + ld
+      end
+      if ( @split > 0 )
+        puts( "Split            : " + @split.to_s )
+        log << "Split            : " + @split.to_s + ld
+      end
+      puts()
+
+      # read the alignment with the parser matching the input format
+      f = MsaFactory.new()
+
+      msa = nil
+
+      begin
+        if ( @phylip_input )
+          msa = f.create_msa_from_file( input, GeneralMsaParser.new() )
+        elsif ( @fasta_input )
+          msa = f.create_msa_from_file( input, FastaParser.new() )
+        end
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+      end
+
+      if ( msa.is_aligned() )
+        Util.print_message( PRG_NAME, "Length of original alignment : " + msa.get_length.to_s )
+        log << "Length of original alignment : " + msa.get_length.to_s + ld
+      else
+        Util.print_message( PRG_NAME, "the input is not aligned" )
+        log << "The input is not aligned" + ld
+      end
+
+      # duplicate sequence names only trigger a warning; processing goes on
+      all_names = Set.new()
+      for i in 0 ... msa.get_number_of_seqs()
+        current_name = msa.get_sequence( i ).get_name
+        if all_names.include?( current_name )
+          Util.print_warning_message( PRG_NAME, "sequence name [" + current_name + "] is not unique" )
+        else
+          all_names.add( current_name )
+        end
+      end
+
+      # NOTE(review): the rescue clauses of this section catch Exception,
+      # which includes SystemExit; if Util.fatal_error terminates through
+      # exit, the fatal errors raised inside the section would presumably be
+      # intercepted and reported a second time - confirm.
+      begin
+
+        if ( @remove_seqs || @keep_seqs )
+          names = Util.file2array( @seqs_name_file, true )
+          if ( names == nil ||  names.length() < 1 )
+            error_msg = "file \"" + @seqs_name_file.to_s + "\" appears empty"
+            Util.fatal_error( PRG_NAME, error_msg )
+          end
+
+          if ( @remove_seqs )
+            # every listed name has to match exactly one sequence
+            c = 0
+            for i in 0 ... names.length()
+              to_delete = msa.find_by_name( names[ i ], true, false )
+              if ( to_delete.length() < 1 )
+                error_msg = "sequence name \"" + names[ i ] + "\" not found"
+                Util.fatal_error( PRG_NAME, error_msg )
+              elsif ( to_delete.length() > 1 )
+                error_msg = "sequence name \"" + names[ i ] + "\" is not unique"
+                Util.fatal_error( PRG_NAME, error_msg )
+              else
+                msa.remove_sequence!( to_delete[ 0 ] )
+                c += 1
+              end
+            end
+            Util.print_message( PRG_NAME, "Removed " + c.to_s + " sequences" )
+            log <<  "Removed " + c.to_s + " sequences" + ld
+          elsif ( @keep_seqs )
+            # build a new alignment from the sequences whose name is listed
+            msa_new = Msa.new()
+            r = 0
+            k = 0
+            for j in 0 ... msa.get_number_of_seqs()
+              if ( names.include?( msa.get_sequence( j ).get_name() ) )
+                msa_new.add_sequence( msa.get_sequence( j ) )
+                k += 1
+              else
+                r += 1
+              end
+            end
+            msa = msa_new
+            Util.print_message( PRG_NAME, "Kept    " + k.to_s + " sequences" )
+            log << "Kept    " + k.to_s + " sequences" + ld
+            Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" )
+            log << "removed " + r.to_s + " sequences" + ld
+          end
+        end
+
+        # column operations come first, sequence filters afterwards
+        if ( @trim )
+          msa.trim!( @first, @last )
+        end
+        if( @rgr >= 0 )
+          msa.remove_gap_columns_w_gap_ratio!( @rgr )
+        elsif ( @rgc )
+          msa.remove_gap_columns!()
+        elsif( @rgoc )
+          msa.remove_gap_only_columns!()
+        end
+        if( @rsgr >= 0 )
+          n = msa.get_number_of_seqs()
+          removed = msa.remove_sequences_by_gap_ratio!( @rsgr )
+          k = msa.get_number_of_seqs()
+          r = n - k
+          Util.print_message( PRG_NAME, "Kept    " + k.to_s + " sequences" )
+          log << "Kept    " + k.to_s + " sequences" + ld
+          Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences"  )
+          log << "Removed " + r.to_s + " sequences:" + ld
+          removed.each { | seq_name |
+            log << "         " + seq_name  + ld
+          }
+        end
+        if( @rsl >= 0 )
+          n = msa.get_number_of_seqs()
+          removed = msa.remove_sequences_by_non_gap_length!( @rsl )
+          k = msa.get_number_of_seqs()
+          r = n - k
+          Util.print_message( PRG_NAME, "Kept    " + k.to_s + " sequences" )
+          log << "Kept    " + k.to_s + " sequences" + ld
+          Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" )
+          log << "Removed " + r.to_s + " sequences:" + ld
+          removed.each { | seq_name |
+            log << "         " + seq_name  + ld
+          }
+        end
+        if ( @keep_matching )
+          # case-insensitive substring match on the sequence name; indices
+          # are removed from the highest down so the remaining ones stay valid
+          n = msa.get_number_of_seqs
+          to_be_removed = Set.new
+          for ii in 0 ...  n
+            seq = msa.get_sequence( ii )
+            if !seq.get_name.downcase.index( @keep_matching.downcase )
+              to_be_removed.add( ii )
+            end
+          end
+          to_be_removed_ary = to_be_removed.to_a.sort.reverse
+          to_be_removed_ary.each { | index |
+            msa.remove_sequence!( index )
+          }
+          # msa = sort( msa )
+        end
+        if ( @remove_matching )
+          n = msa.get_number_of_seqs
+          to_be_removed = Set.new
+          for iii in 0 ... n
+
+            seq = msa.get_sequence( iii )
+
+            if seq.get_name.downcase.index( @remove_matching.downcase )
+              to_be_removed.add( iii )
+            end
+          end
+          to_be_removed_ary = to_be_removed.to_a.sort.reverse
+          to_be_removed_ary.each { | index |
+            msa.remove_sequence!( index )
+          }
+          msa = sort( msa )
+        end
+
+
+
+        # split mode: write each part to <output>_<i>
+        if ( @split > 0 )
+          begin
+            msas = msa.split( @split, true )
+            io = MsaIO.new()
+            # initial value only; replaced below once an output format matches
+            w = MsaWriter
+            if ( @pi_output )
+              w = PhylipSequentialWriter.new()
+              w.clean( @clean )
+              w.set_max_name_length( @name_length )
+            elsif( @fasta_output )
+              w = FastaWriter.new()
+              w.set_line_width( @width )
+              if ( @rg )
+                w.remove_gap_chars( true )
+                Util.print_warning_message( PRG_NAME, "removing gap character, the output is likely to become unaligned" )
+                log << "removing gap character, the output is likely to become unaligned" + ld
+              end
+              w.clean( @clean )
+              if ( @name_length_set )
+                w.set_max_name_length( @name_length )
+              end
+            elsif( @nexus_output )
+              w = NexusWriter.new()
+              w.clean( @clean )
+              w.set_max_name_length( @name_length )
+            end
+            i = 0
+            for m in msas
+              i = i + 1
+              io.write_to_file( m, output + "_" + i.to_s, w )
+            end
+            Util.print_message( PRG_NAME, "wrote " + msas.length.to_s + " files"  )
+            log << "wrote " + msas.length.to_s + " files" + ld
+          rescue Exception => e
+            Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+          end
+
+        end
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+      end
+
+      # regular (non-split) mode: one output file plus the log file; the log
+      # file is written in this branch only
+      if ( @split <= 0 )
+
+        unless ( @rg )
+          if ( msa.is_aligned() )
+            Util.print_message( PRG_NAME, "length of processed alignment: " + msa.get_length.to_s )
+            log <<  "length of processed alignment: " + msa.get_length.to_s + ld
+          else
+            Util.print_warning_message( PRG_NAME, "output is not aligned" )
+            log << "output is not aligned" + ld
+          end
+        end
+
+        if @rem_red
+          removed = msa.remove_redundant_sequences!( true, true )
+          if removed.size > 0
+            Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
+            log << "going to ignore the following " + removed.size.to_s + " redundant sequences:" + ld
+            removed.each { | seq_name |
+              puts seq_name
+              log << seq_name + ld
+            }
+            Util.print_message( PRG_NAME, "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" )
+            log << "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" + ld
+          end
+        end
+
+        io = MsaIO.new()
+
+        # initial value only; replaced below once an output format matches
+        w = MsaWriter
+
+        if ( @pi_output )
+          w = PhylipSequentialWriter.new()
+          w.clean( @clean )
+          w.set_max_name_length( @name_length )
+        elsif( @fasta_output )
+          w = FastaWriter.new()
+          w.set_line_width( @width )
+          if ( @rg )
+            w.remove_gap_chars( true )
+            Util.print_warning_message( PRG_NAME, "removing gap characters, the output is likely to become unaligned"  )
+            log << "removing gap character, the output is likely to become unaligned" + ld
+          end
+          w.clean( @clean )
+          if ( @name_length_set )
+            w.set_max_name_length( @name_length )
+          end
+        elsif( @nexus_output )
+          w = NexusWriter.new()
+          w.clean( @clean )
+          w.set_max_name_length( @name_length )
+        end
+
+
+        begin
+          io.write_to_file( msa, output, w )
+        rescue Exception => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+        end
+
+        # the log is appended ('a') to <output> + LOG_SUFFIX
+        begin
+          f = File.open( output + LOG_SUFFIX, 'a' )
+          f.print( log )
+          f.close
+        rescue Exception => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+        end
+
+
+      end
+      Util.print_message( PRG_NAME, "OK" )
+      puts
+    end
+
+
+    private
+
+    def sort( msa )
+      names = Set.new
+      for i in 0 ... msa.get_number_of_seqs
+        name = msa.get_sequence( i ).get_name
+        names.add( name )
+      end
+      sorted_ary = names.to_a.sort
+      new_msa = Msa.new
+      sorted_ary.each { | seq_name |
+        seq = msa.get_sequence( msa.find_by_name( seq_name, true, false )[ 0 ] )
+        new_msa.add_sequence( seq )
+      }
+      new_msa
+    end
+
+    # Setters for the processing options. The input and output format
+    # setters additionally record that the format was chosen explicitly,
+    # which switches off the automatic format detection in run.
+
+    # Fasta input on/off; marks the input format as set.
+    def set_fasta_input( fi = true )
+      @fasta_input = fi
+      @input_format_set = true
+    end
+    # Phylip-like input on/off; marks the input format as set.
+    def set_phylip_input( pi = true )
+      @phylip_input = pi
+      @input_format_set = true
+    end
+    # Maximal sequence name length; also remembers that it was given.
+    def set_name_length( i )
+      @name_length = i
+      @name_length_set = true
+    end
+    # Line width handed to the Fasta writer.
+    def set_width( i )
+      @width = i
+    end
+    # Fasta output on/off; marks the output format as set.
+    def set_fasta_output( fo = true )
+      @fasta_output = fo
+      @output_format_set = true
+    end
+    # Phylip output on/off; marks the output format as set.
+    def set_pi_output( pso = true )
+      @pi_output = pso
+      @output_format_set = true
+    end
+    # Nexus output on/off; marks the output format as set.
+    def set_nexus_output( nexus = true )
+      @nexus_output = nexus
+      @output_format_set = true
+    end
+    # Clean-up flag passed on to the writers.
+    def set_clean( c = true )
+      @clean = c
+    end
+    # Removal of gap columns on/off.
+    def set_remove_gap_columns( rgc = true )
+      @rgc = rgc
+    end
+    # Removal of gap-only columns on/off.
+    def set_remove_gap_only_columns( rgoc = true )
+      @rgoc = rgoc
+    end
+    # Removal of all gap characters on/off.
+    def set_remove_gaps( rg = true )
+      @rg = rg
+    end
+    # Gap ratio for removing columns; run applies it when it is >= 0.
+    def set_remove_gap_ratio( rgr )
+      @rgr = rgr
+    end
+    # Gap ratio for removing sequences; run applies it when it is >= 0.
+    def set_remove_seqs_gap_ratio( rsgr )
+      @rsgr = rsgr
+    end
+    # Minimal number of non-gap characters; run applies it when it is >= 0.
+    def set_remove_seqs_min_non_gap_length( rsl )
+      @rsl = rsl
+    end
+    # Requests removal of the sequences named in file; cancels "keep" mode.
+    def set_remove_seqs( file )
+      @seqs_name_file = file
+      @remove_seqs    = true
+      @keep_seqs      = false
+    end
+    # Requests keeping only the sequences named in file; cancels "remove" mode.
+    def set_keep_seqs( file )
+      @seqs_name_file = file
+      @keep_seqs      = true
+      @remove_seqs    = false
+    end
+    # Requests trimming to the columns first .. last.
+    def set_trim( first, last )
+      @trim            = true
+      @first           = first
+      @last            = last
+    end
+    # Name fragment of the sequences to remove (nil: no such filter).
+    def set_remove_matching( remove )
+      @remove_matching  = remove
+    end
+    # Name fragment of the sequences to keep (nil: no such filter).
+    def set_keep_matching( keep )
+      @keep_matching = keep
+    end
+    # Removal of redundant sequences on/off.
+    def set_rem_red( rr )
+      @rem_red = rr
+    end
+
+
+
+    def set_split( s )
+      if ( s > 0 )
+        @split            = s
+        @clean            = false  # phylip only
+        @rgc              = false
+        @rgoc             = false
+        @rg               = false  # fasta only
+        @rgr              = -1
+        @rsgr             = -1
+        @rsl              = -1
+        @seqs_name_file   = nil
+        @remove_seqs      = false
+        @keep_seqs        = false
+        @trim             = false
+        @first            = -1
+        @last             = -1
+      end
+    end
+    def analyze_command_line( cla )
+      if ( cla.is_option_set?( INPUT_TYPE_OPTION ) )
+        begin
+          type = cla.get_option_value( INPUT_TYPE_OPTION )
+          if ( type == "p" )
+            set_phylip_input( true )
+            set_fasta_input( false )
+          elsif ( type == "f" )
+            set_fasta_input( true )
+            set_phylip_input( false )
+          end
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( OUTPUT_TYPE_OPTION ) )
+        begin
+          type = cla.get_option_value( OUTPUT_TYPE_OPTION )
+          if ( type == "p" )
+            set_pi_output( true )
+            set_fasta_output( false )
+            set_nexus_output( false )
+          elsif ( type == "f" )
+            set_pi_output( false )
+            set_fasta_output( true )
+            set_nexus_output( false )
+          elsif ( type == "n" )
+            set_pi_output( false )
+            set_fasta_output( false )
+            set_nexus_output( true )
+          end
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( MAXIMAL_NAME_LENGTH_OPTION ) )
+        begin
+          l = cla.get_option_value_as_int( MAXIMAL_NAME_LENGTH_OPTION )
+          set_name_length( l )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( WIDTH_OPTION ) )
+        begin
+          w = cla.get_option_value_as_int( WIDTH_OPTION )
+          set_width( w )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( CLEAN_UP_SEQ_OPTION ) )
+        set_clean( true )
+      end
+      if ( cla.is_option_set?( REMOVE_GAP_COLUMNS_OPTION ) )
+        set_remove_gap_columns( true )
+      end
+      if ( cla.is_option_set?( REM_RED_OPTION ) )
+        set_rem_red( true )
+      end
+      if ( cla.is_option_set?( REMOVE_GAP_ONLY_COLUMNS ) )
+        set_remove_gap_only_columns( true )
+      end
+      if ( cla.is_option_set?( REMOVE_ALL_GAP_CHARACTERS_OPTION ) )
+        set_remove_gaps( true )
+      end
+      if ( cla.is_option_set?( REMOVE_COLUMNS_GAP_RATIO_OPTION ) )
+        begin
+          f = cla.get_option_value_as_float( REMOVE_COLUMNS_GAP_RATIO_OPTION )
+          set_remove_gap_ratio( f )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( REMOVE_ALL_SEQUENCES_LISTED_OPTION ) )
+        begin
+          s = cla.get_option_value( REMOVE_ALL_SEQUENCES_LISTED_OPTION )
+          set_remove_seqs( s )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( KEEP_ONLY_SEQUENCES_LISTED_OPTION ) )
+        begin
+          s = cla.get_option_value( KEEP_ONLY_SEQUENCES_LISTED_OPTION )
+          set_keep_seqs( s )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( TRIM_OPTION ) )
+        begin
+          s = cla.get_option_value( TRIM_OPTION )
+          if ( s =~ /(\d+)-(\d+)/ )
+            set_trim( $1.to_i(), $2.to_i() )
+          else
+            puts( "illegal argument" )
+            print_help
+            exit( -1 )
+          end
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( REMOVE_SEQS_GAP_RATIO_OPTION ) )
+        begin
+          f = cla.get_option_value_as_float( REMOVE_SEQS_GAP_RATIO_OPTION )
+          set_remove_seqs_gap_ratio( f )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( REMOVE_SEQS_NON_GAP_LENGTH_OPTION ) )
+        begin
+          f = cla.get_option_value_as_int( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
+          set_remove_seqs_min_non_gap_length( f )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( SPLIT ) )
+        begin
+          s = cla.get_option_value_as_int( SPLIT )
+          set_split( s )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+
+      end
+      if ( cla.is_option_set?( REMOVE_MATCHING_SEQUENCES_OPTION ) )
+        begin
+          s = cla.get_option_value( REMOVE_MATCHING_SEQUENCES_OPTION )
+          set_remove_matching( s )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+      if ( cla.is_option_set?( KEEP_MATCHING_SEQUENCES_OPTION ) )
+        begin
+          s = cla.get_option_value( KEEP_MATCHING_SEQUENCES_OPTION )
+          set_keep_matching( s )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+
+
+    end
+
+    def print_help()
+      puts()
+      puts( "Usage:" )
+      puts()
+      puts( "  " + PRG_NAME + ".rb [options] <input alignment> <output>" )
+      puts()
+      puts( "  options: -" + INPUT_TYPE_OPTION + "=<input type>: f for fasta, p for phylip selex type" )
+      puts( "           -" + OUTPUT_TYPE_OPTION + "=<output type>: f for fasta, n for nexus, p for phylip sequential (default)" )
+      puts( "           -" + MAXIMAL_NAME_LENGTH_OPTION + "=<n>: n=maximal name length (default for phylip 10, for fasta: unlimited )" )
+      puts( "           -" + WIDTH_OPTION + "=<n>: n=width (fasta output only, default is 60)" )
+      puts( "           -" + CLEAN_UP_SEQ_OPTION + ": clean up sequences" )
+      puts( "           -" + REMOVE_GAP_COLUMNS_OPTION + ": remove gap columns" )
+      puts( "           -" + REMOVE_GAP_ONLY_COLUMNS + ": remove gap-only columns" )
+      puts( "           -" + REMOVE_COLUMNS_GAP_RATIO_OPTION + "=<n>: remove columns for which ( seqs with gap / number of sequences > n )" )
+      puts( "           -" + REMOVE_ALL_GAP_CHARACTERS_OPTION + ": remove all gap characters (destroys alignment, fasta output only)" )
+      puts( "           -" + REMOVE_ALL_SEQUENCES_LISTED_OPTION + "=<file>: remove all sequences listed in file" )
+      puts( "           -" + KEEP_ONLY_SEQUENCES_LISTED_OPTION + "=<file>: keep only sequences listed in file" )
+      puts( "           -" + TRIM_OPTION + "=<first>-<last>: remove columns before first and after last" )
+      puts( "           -" + REMOVE_SEQS_GAP_RATIO_OPTION + "=<n>: remove sequences for which the gap ratio > n (after column operations)" )
+      puts( "           -" + REMOVE_SEQS_NON_GAP_LENGTH_OPTION + "=<n> remove sequences with less than n non-gap characters (after column operations)" )
+      puts( "           -" + REMOVE_MATCHING_SEQUENCES_OPTION + "=<s> remove all sequences with names containing s" )
+      puts( "           -" + KEEP_MATCHING_SEQUENCES_OPTION + "=<s> keep only sequences with names containing s" )
+      puts( "           -" + SPLIT + "=<n> split a fasta file into n files of equal number of sequences (expect for " )
+      puts( "            last one), cannot be used with other options" )
+      puts( "           -" + REM_RED_OPTION + ": remove redundant sequences" )
+      puts()
+    end
+
+
+
+
+
+  end # class MsaProcessor
+
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/apps/multi_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/apps/multi_sequence_extractor.rb
new file mode 100644 (file)
index 0000000..7f9ffe8
--- /dev/null
@@ -0,0 +1,395 @@
+#
+# = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class
+#
+# Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: multi_sequence_extractor.rb,v 1.10 2010/12/13 19:00:11 cmzmasek Exp $
+
+
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/msa/msa'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/io/msa_io'
+require 'lib/evo/io/parser/fasta_parser'
+require 'lib/evo/io/writer/fasta_writer'
+require 'lib/evo/util/command_line_arguments'
+
+
+
+module Evoruby
+
+    class MultiSequenceExtractor
+
+        # Program identification, handed to Util.print_program_information by run.
+        PRG_NAME                           = "mse"
+        PRG_VERSION                        = "1.0.0"
+        PRG_DESC                           = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
+        PRG_DATE                           = "2008.08.13"
+        COPYRIGHT                          = "2008-2009 Christian M Zmasek"
+        CONTACT                            = "phylosoft@gmail.com"
+        WWW                                = "www.phylosoft.org"
+        # Either of these options makes run print the usage text and exit with status 0.
+        HELP_OPTION_1                      = 'help'
+        HELP_OPTION_2                      = 'h'
+
+        # Suffixes appended to the base name of a sequence names file to form the
+        # names of the files written into the output directory.
+        LOG_SUFFIX                          = ".mse_log"
+        FASTA_SUFFIX                        = ".fasta"
+        FASTA_WITH_NORMALIZED_IDS_SUFFIX    = ".ni.fasta"
+        NORMALIZED_IDS_MAP_SUFFIX           = ".nim"
+        # Column separator of the sequence names files (first column: species,
+        # second column: sequence name).
+        PROTEINS_LIST_FILE_SEPARATOR        = "\t"
+        # When true, parsed genome files are kept in a hash keyed by species and
+        # reused instead of being parsed again.
+        CACHE_GENOMES                       = false
+
+        def run()
+
+            Util.print_program_information( PRG_NAME,
+                PRG_VERSION,
+                PRG_DESC ,
+                PRG_DATE,
+                COPYRIGHT,
+                CONTACT,
+                WWW,
+                STDOUT )
+
+            ld = Constants::LINE_DELIMITER
+
+            begin
+                cla = CommandLineArguments.new( ARGV )
+            rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+
+            if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+                     cla.is_option_set?( HELP_OPTION_2 ) )
+                print_help
+                exit( 0 )
+            end
+
+            if ( cla.get_number_of_files != 3 && cla.get_number_of_files != 4 )
+                print_help
+                exit( -1 )
+            end
+
+            allowed_opts = Array.new
+
+            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+            if ( disallowed.length > 0 )
+                Util.fatal_error( PRG_NAME,
+                    "unknown option(s): " + disallowed,
+                    STDOUT )
+            end
+
+            seq_names_files_suffix = cla.get_file_name( 0 )
+            input_dir              = cla.get_file_name( 1 )
+            out_dir                = cla.get_file_name( 2 )
+            mapping_file            = nil
+
+            if ( cla.get_number_of_files == 4 )
+                mapping_file = cla.get_file_name( 3 )
+                begin
+                    Util.check_file_for_readability( mapping_file )
+                rescue ArgumentError => e
+                    Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+                end
+            end
+
+            if  !File.exist?( input_dir )
+                Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
+            end
+            if  !File.exist?( out_dir )
+                Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
+            end
+            if !File.directory?( input_dir )
+                Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
+            end
+            if !File.directory?( out_dir )
+                Util.fatal_error( PRG_NAME, "error:  [#{out_dir}] is not a directory" )
+            end
+
+
+            log = String.new
+
+            log << "Program            : " + PRG_NAME + ld
+            log << "Version            : " + PRG_VERSION + ld
+            log << "Program date       : " + PRG_DATE + ld
+
+            puts()
+            puts( "Sequence names files suffix: " + seq_names_files_suffix )
+            log << "Sequence names files suffix: " + seq_names_files_suffix + ld
+            puts( "Input dir                  : " + input_dir )
+            log << "Input dir                  : " + input_dir + ld
+            puts( "Output dir                 : " + out_dir )
+            log << "Output dir                 : " + out_dir + ld
+            if ( mapping_file != nil )
+                puts( "Mapping file               : " + mapping_file )
+                log << "Mapping file               : " + mapping_file + ld
+            end
+            log << "Date                       : " + Time.now.to_s + ld
+            puts
+
+            if ( mapping_file != nil )
+                species_codes_to_paths = extract_mappings( mapping_file )
+            end
+
+            input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
+
+            counter = 0
+            species_to_genomes = Hash.new()
+
+            input_files.each { |input_file|
+                counter += 1
+                puts
+                puts
+                puts counter.to_s + "/" + input_files.size.to_s
+                read_seq_family_file( input_file,
+                    seq_names_files_suffix,
+                    input_dir,
+                    species_codes_to_paths,
+                    species_to_genomes,
+                    log,
+                    out_dir,
+                    mapping_file )
+            }
+            puts
+            Util.print_message( PRG_NAME, "OK" )
+            puts
+
+        end
+
+
+        def read_seq_family_file( input_file,
+                seq_names_files_suffix,
+                input_dir,
+                species_codes_to_paths,
+                species_to_genomes,
+                log,
+                out_dir,
+                mapping_file )
+
+            begin
+                Util.check_file_for_readability( input_file )
+            rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+
+            basename = File.basename( input_file, seq_names_files_suffix )
+            out_file_path_fasta_file                = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
+            out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
+            out_file_path_ids_map                   = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX 
+            begin
+                Util.check_file_for_writability( out_file_path_fasta_file )
+                Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
+                Util.check_file_for_writability( out_file_path_ids_map  )
+            rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+          
+            ids_map_writer = nil
+            begin
+                ids_map_writer = File.open( out_file_path_ids_map, 'a' )
+            rescue Exception => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+            
+            current_species         = ""
+            current_msa            = nil
+            new_msa                = Msa.new
+            new_msa_normalized_ids = Msa.new
+            per_species_counter = 0
+
+            puts basename
+
+            File.open( input_file ) do | file |
+                while line = file.gets
+                    if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
+                        values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
+                        if ( values.length < 2 )
+                            Util.fatal_error( PRG_NAME, "unexpected format: " + line )
+                        end
+                        species = values[ 0 ]
+                        seq_name = values[ 1 ]
+                        if ( species != current_species )
+                            current_species = species
+                            my_file = input_dir + Constants::FILE_SEPARATOR + current_species
+
+                            if ( !File.exist?( my_file ) )
+                                if species_codes_to_paths == nil
+                                    Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
+                                elsif ( !species_codes_to_paths.has_key?( current_species ) )
+                                    Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
+                                end
+                                my_file = species_codes_to_paths[ current_species ]
+                            end
+                            my_path = File.expand_path( my_file )
+                            my_readlink = my_path
+                            if ( File.symlink?( my_path ) )
+                                my_readlink = File.readlink( my_path )
+                            end
+                            current_msa = nil
+                            if ( CACHE_GENOMES && species_to_genomes.has_key?( species ) )
+                                current_msa = species_to_genomes[ species ]
+                            else
+                                current_msa = read_fasta_file( my_file )
+                                if CACHE_GENOMES
+                                    species_to_genomes[ species ] = current_msa
+                                end
+                            end
+
+                            if ( per_species_counter > 0 )
+                                print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
+                                per_species_counter = 0
+                            end
+                            puts " " + current_species + " [" + my_readlink + "]"
+                            log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER
+                        end
+                        puts "   " + seq_name
+                        log << "   " + seq_name + Constants::LINE_DELIMITER
+                        per_species_counter = per_species_counter + 1
+                        seq = nil
+                        
+                        if current_msa.find_by_name_start( seq_name, true ).size > 0
+                            begin
+                                seq = current_msa.get_by_name_start( seq_name, true ).copy
+                            rescue ArgumentError => e
+                                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+                            end
+                        elsif    
+                            # Not found, try finding by partial match.
+                            begin
+                                seq = current_msa.get_by_name( seq_name, true, true )
+                            rescue ArgumentError => e
+                                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+                            end
+                        end
+
+                        normalized_id = per_species_counter.to_s( 16 ).upcase +
+                         "_" + current_species
+                        
+                        per_species_counter.to_i
+                        
+                        ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER )
+                        
+                        if ( seq != nil )
+                            seq.set_name( seq.get_name + " [" + current_species + "]" )
+                            new_msa.add_sequence( seq )
+                        else
+                            Util.fatal_error( PRG_NAME, "unexected error: seq is nil" )
+                        end
+                        
+                        new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
+                       
+                    end
+                end
+
+            end
+         
+            ids_map_writer.close
+           
+            if ( per_species_counter > 0 )
+                print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
+            end
+
+            io = MsaIO.new()
+
+            fasta_writer = FastaWriter.new()
+            fasta_writer.remove_gap_chars
+            fasta_writer.clean
+            
+            begin
+                io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
+            rescue Exception => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+            
+            begin
+                io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
+            rescue Exception => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+
+            begin
+                f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LOG_SUFFIX , 'a' )
+                f.print( log )
+                f.close
+            rescue Exception => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+
+        end
+
+        def obtain_inputfiles( input_dir, seq_names_files_suffix )
+            input_files = Array.new()
+            Dir.foreach( input_dir ) { |file_name|
+                if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
+                    input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
+                end
+            }
+            input_files
+        end
+
+        def extract_mappings( mapping_file )
+            species_code_to_path = Hash.new()
+            File.open( mapping_file ) do | file |
+                while line = file.gets
+                    if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
+                        if ( line =~ /(\S+)\s+(\S+)/ )
+                            species = $1
+                            path = $2
+                            if ( species_code_to_path.has_key?( species ) )
+                                Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
+                            end
+                            if ( species_code_to_path.has_value?( path ) )
+                                Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
+                            end
+                            if ( !File.exist?( path ) )
+                                Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
+                            end
+                            if ( !File.file?( path ) )
+                                Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
+                            end
+                            if ( !File.readable?( path ) )
+                                Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
+                            end
+                            if ( File.size( path ) < 10000 )
+                                Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
+                            end
+                            if ( !Util.looks_like_fasta?( path ) )
+                                Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
+                            end
+                            species_code_to_path[ species ] = path
+                            puts species + " -> " + path
+                        end
+                    end
+                end
+            end
+            species_code_to_path
+        end
+
+        def print_counts( per_species_counter, log, ld )
+            puts "   [sum: " + per_species_counter.to_s + "]"
+            log << "   [sum: " + per_species_counter.to_s + "]" + ld
+        end
+
+        def read_fasta_file( input )
+            f = MsaFactory.new()
+            msa = nil
+            begin
+                msa = f.create_msa_from_file( input, FastaParser.new() )
+            rescue Exception => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+            msa
+        end
+
+        def print_help()
+            puts( "Usage:" )
+            puts()
+            puts( "  " + PRG_NAME + ".rb <sequence names file suffix> <input dir containing sequence names files " +
+                 "and possibly genome multiple-sequence ('fasta') files> <output directory> [mapping file for " +
+                 "genome multiple-sequence ('fasta') files not in input dir]" )
+            puts()
+            puts( "  " + "Example: \"mse.rb .prot . seqs ../genome_locations.txt\"" )
+            puts()
+        end
+
+    end # class MultiSequenceExtractor
+end
\ No newline at end of file
diff --git a/forester/ruby/evoruby/lib/evo/apps/new_tap.rb b/forester/ruby/evoruby/lib/evo/apps/new_tap.rb
new file mode 100644 (file)
index 0000000..1dc7431
--- /dev/null
@@ -0,0 +1,167 @@
+#
+# = lib/evo/apps/new_tap.rb - TaxonomyProcessor class
+#
+# Copyright::  Copyright (C) 2009 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: new_tap.rb,v 1.4 2010/12/13 19:00:11 cmzmasek Exp $
+
+
+require 'lib/evo/util/util'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/msa/msa'
+require 'lib/evo/io/msa_io'
+require 'lib/evo/io/parser/fasta_parser'
+require 'lib/evo/io/parser/general_msa_parser'
+require 'lib/evo/io/writer/fasta_writer'
+require 'lib/evo/io/writer/phylip_sequential_writer'
+require 'lib/evo/util/command_line_arguments'
+
+module Evoruby
+
+    class TaxonomyProcessor
+
+        # Program identification, handed to Util.print_program_information by run.
+        # NOTE(review): PRG_NAME is empty, so the usage line printed by run starts
+        # with ".rb" - presumably the program name still has to be filled in.
+        PRG_NAME       = ""
+        PRG_DATE       = "2009.10.09"
+        PRG_DESC       = "replacement of labels in multiple sequence files"
+        PRG_VERSION    = "1.00"
+        COPYRIGHT      = "2009 Christian M Zmasek"
+        CONTACT        = "phylosoft@gmail.com"
+        WWW            = "www.phylosoft.org"
+
+        # Command line option which makes run remove redundant sequences from the
+        # input alignment before it is written.
+        REMOVE_REDUNDANT_SEQS_OPTION = "rr"
+        
+        def initialize()
+            @taxonomies = Hash.new()
+        end
+
+        def run()
+
+            Util.print_program_information( PRG_NAME,
+                PRG_VERSION,
+                PRG_DESC,
+                PRG_DATE,
+                COPYRIGHT,
+                CONTACT,
+                WWW,
+                STDOUT )
+
+            if ( ARGV == nil || ( ARGV.length != 3 && ARGV.length != 4 ) )
+                puts( "Usage: #{PRG_NAME}.rb <input sequences> <output sequences> <output map>" )
+                puts()
+                puts( "  options: -" + REMOVE_REDUNDANT_SEQS_OPTION + ": to remove redundant sequences" )
+                puts()
+                exit( -1 )
+            end
+
+            begin
+                cla = CommandLineArguments.new( ARGV )
+            rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+            
+            input     = cla.get_file_name( 0 )
+            output    = cla.get_file_name( 1 )
+            map_file = cla.get_file_name( 2 )
+
+            allowed_opts = Array.new
+            allowed_opts.push( REMOVE_REDUNDANT_SEQS_OPTION ) 
+            
+            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+            if ( disallowed.length > 0 )
+                Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
+            end
+
+            
+            remove_redudant = false
+            if ( cla.is_option_set?( REMOVE_REDUNDANT_SEQS_OPTION ) )
+                remove_redudant = true
+            end
+
+            if ( File.exists?( output ) )
+                Util.fatal_error( PRG_NAME, "outfile [" + output + "] already exists" )
+            end
+            if ( File.exists?( map_file ) )
+                Util.fatal_error( PRG_NAME, "map file [" + map_file + "] already exists" )
+            end
+            if ( !File.exists?( input) )
+                Util.fatal_error( PRG_NAME, "infile [" + input + "] does not exist" )
+            end
+           
+            fasta_like = Util.looks_like_fasta?( input )
+
+            puts()
+            puts( "Input alignment : " + input )
+            puts( "Output alignment: " + output )
+            puts( "Output map      : " + map_file )
+            if ( fasta_like )
+                puts( "Format          : Fasta"  )
+            else
+                puts( "Format          : Phylip like" )
+            end
+            puts()
+
+            species_map = Hash.new
+           
+            f = MsaFactory.new()
+            begin
+                if ( fasta_like )
+                    msa = f.create_msa_from_file( input, FastaParser.new() )
+                else
+                    msa = f.create_msa_from_file( input, GeneralMsaParser.new() )
+                end
+            rescue Exception => e
+                Util.fatal_error( PRG_NAME, "failed to read file: " + e.to_s )
+            end
+
+            if ( msa == nil || msa.get_number_of_seqs() < 1 )
+                Util.fatal_error( PRG_NAME, "failed to read MSA" )
+            end
+            begin
+                Util.check_file_for_writability( map_file )
+            rescue Exception => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_, STDOUT )
+            end
+
+            if ( remove_redudant ) 
+                removed = msa.remove_redundant_sequences!( true )
+                if removed.size > 0
+                    Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
+                    removed.each { | seq_name |
+                        puts seq_name
+                    }
+                    Util.print_message( PRG_NAME, "will process " + msa.get_number_of_seqs.to_s + " non redundant sequences" )
+                end
+            end
+
+            lf = File.open( map_file, "a" )
+            for i in 0 ... msa.get_number_of_seqs
+                seq  = msa.get_sequence( i )
+            end
+
+            io = MsaIO.new()
+            w = nil
+            if ( fasta_like )
+                w = FastaWriter.new()
+            else
+                w = PhylipSequentialWriter.new()
+            end
+            w.set_max_name_length( 10 )
+            w.clean( true )
+            begin
+                io.write_to_file( msa, output, w )
+            rescue Exception => e
+                Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
+            end
+            lf.close()
+            if ( @taxonomies.length > 0 )
+                Util.print_message( PRG_NAME, "number of unique taxonomies: " + @taxonomies.length.to_s )
+            end
+            Util.print_message( PRG_NAME, "wrote: " + map_file )
+            Util.print_message( PRG_NAME, "wrote: " + output )
+            Util.print_message( PRG_NAME, "OK" )
+        end
+
+    end # class 
+
+end # module Evoruby
\ No newline at end of file
diff --git a/forester/ruby/evoruby/lib/evo/apps/phylogenies_decorator.rb b/forester/ruby/evoruby/lib/evo/apps/phylogenies_decorator.rb
new file mode 100644 (file)
index 0000000..69a6bc4
--- /dev/null
@@ -0,0 +1,299 @@
+#!/usr/local/bin/ruby -w
+#
+# = lib/evo/apps/phylogenies_decorator
+#
+# Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# decoration of phylogenies with sequence/species names and domain architectures
+#
+# $Id: phylogenies_decorator.rb,v 1.34 2010/12/13 19:00:11 cmzmasek Exp $
+#
+# Environment variable FORESTER_HOME needs to point to the appropriate
+# directory (e.g. setenv FORESTER_HOME $HOME/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester-atv/)
+
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/util/command_line_arguments'
+
+require 'date'
+
+module Evoruby
+
+    class PhylogeniesDecorator
+
+        # Options passed to the forester decorator when decorating with
+        # sequence/species names.
+        DECORATOR_OPTIONS_SEQ_NAMES = '-r=1 -mdn'
+        # -mdn is a hidden expert option to rename e.g. "6_ORYLA3" to "6_[3]_ORYLA"
+        #DECORATOR_OPTIONS_SEQ_NAMES = '-sn -r=1'
+        # Options passed to the forester decorator when decorating with domains.
+        DECORATOR_OPTIONS_DOMAINS = '-r=1'
+        # Suffixes used to find the ids map file and the domains map file which
+        # belong to a phylogeny file.
+        IDS_MAPFILE_SUFFIX        = '.nim'
+        DOMAINS_MAPFILE_SUFFIX    = '.dff'
+        # Seconds to sleep after each executed decorator command.
+        SLEEP_TIME                = 0.1
+        # When true, the first "_ni_" in an output file name is replaced by "_".
+        REMOVE_NI                 = true
+        # Intermediate file used when a phylogeny is decorated with both domains
+        # and names.
+        TMP_FILE                  = '___PD___'
+        # Log file written at the end of run; run aborts if it already exists.
+        LOG_FILE                  = '00_phylogenies_decorator.log'
+        # Installation directories, read from the environment when the class is loaded.
+        FORESTER_HOME             = ENV[Constants::FORESTER_HOME_ENV_VARIABLE]
+        JAVA_HOME                 = ENV[Constants::JAVA_HOME_ENV_VARIABLE]
+
+        # Program identification, handed to Util.print_program_information by run.
+        PRG_NAME       = "phylogenies_decorator"
+        PRG_DATE       = "2008.09.02"
+        PRG_DESC       = "decoration of phylogenies with sequence/species names and domain architectures"
+        PRG_VERSION    = "1.0.1"
+        COPYRIGHT      = "2008-2009 Christian M Zmasek"
+        CONTACT        = "phylosoft@gmail.com"
+        WWW            = "www.phylosoft.org"
+
+        # Command line options; ids only and domains only are mutually exclusive.
+        IDS_ONLY_OPTION     = "n"
+        DOMAINS_ONLY_OPTION = "d"
+        HELP_OPTION_1       = "help"
+        HELP_OPTION_2       = "h"
+
+        NL = Constants::LINE_DELIMITER
+
+        def run
+
+            Util.print_program_information( PRG_NAME,
+                PRG_VERSION,
+                PRG_DESC,
+                PRG_DATE,
+                COPYRIGHT,
+                CONTACT,
+                WWW,
+                STDOUT )
+
+            if ( ARGV == nil || ARGV.length > 3 || ARGV.length < 2  )
+                print_help
+                exit( -1 )
+            end
+
+            if FORESTER_HOME == nil || FORESTER_HOME.length < 1
+                Util.fatal_error( PRG_NAME, "apparently environment variable #{Constants::FORESTER_HOME_ENV_VARIABLE} has not been set" )
+            end
+            if JAVA_HOME == nil ||  JAVA_HOME.length < 1
+                Util.fatal_error( PRG_NAME, "apparently environment variable #{Constants::JAVA_HOME_ENV_VARIABLE} has not been set" )
+            end
+
+            if !File.exist?( FORESTER_HOME )
+                Util.fatal_error( PRG_NAME, '[' + FORESTER_HOME + '] does not exist' )
+            end
+            if !File.exist?( JAVA_HOME )
+                Util.fatal_error( PRG_NAME, '[' + JAVA_HOME + '] does not exist' )
+            end
+
+            decorator = JAVA_HOME + '/bin/java -cp ' + FORESTER_HOME + '/java/forester.jar org.forester.application.decorator'
+
+            begin
+                cla = CommandLineArguments.new( ARGV )
+            rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+
+            if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+                     cla.is_option_set?( HELP_OPTION_2 ) )
+                print_help
+                exit( 0 )
+            end
+
+            if File.exist?( LOG_FILE )
+                Util.fatal_error( PRG_NAME, 'logfile [' + LOG_FILE + '] already exists' )
+            end
+
+            allowed_opts = Array.new
+            allowed_opts.push( IDS_ONLY_OPTION )
+            allowed_opts.push( DOMAINS_ONLY_OPTION )
+
+            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+            if ( disallowed.length > 0 )
+                Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
+            end
+
+            ids_only = false
+            domains_only = false
+
+            in_suffix = cla.get_file_name( 0 )
+            out_suffix = cla.get_file_name( 1 )
+
+            if cla.is_option_set?( IDS_ONLY_OPTION )
+                ids_only = true
+            end
+            if cla.is_option_set?( DOMAINS_ONLY_OPTION )
+                domains_only = true
+            end
+
+            if ( ids_only && domains_only )
+                Util.fatal_error( PRG_NAME, 'attempt to use ids only and domains only at the same time' )
+            end
+
+            log = String.new
+
+            now = DateTime.now
+            log << "Program              : " + PRG_NAME + NL
+            log << "Version              : " + PRG_VERSION + NL
+            log << "Program date         : " + PRG_DATE + NL
+            log << "Options for seq names: " + DECORATOR_OPTIONS_SEQ_NAMES + NL
+            log << "Options for domains  : " + DECORATOR_OPTIONS_DOMAINS + NL
+            log << "FORESTER_HOME        : " + FORESTER_HOME + NL
+            log << "JAVA_HOME            : " + JAVA_HOME + NL + NL
+            log << "Date/time: " + now.to_s + NL
+            log << "Directory: " + Dir.getwd  + NL + NL
+
+            Util.print_message( PRG_NAME, 'input suffix     : ' + in_suffix )
+            Util.print_message( PRG_NAME, 'output suffix    : ' + out_suffix )
+
+            log << 'input suffix     : ' + in_suffix + NL
+            log << 'output suffix    : ' + out_suffix + NL
+
+            if ( File.exists?( TMP_FILE ) )
+                File.delete( TMP_FILE )
+            end
+
+            files = Dir.entries( "." )
+
+            counter = 0
+
+            files.each { | phylogeny_file |
+                if ( !File.directory?( phylogeny_file ) &&
+                         phylogeny_file !~ /^\./ &&
+                         phylogeny_file !~ /^00/ &&
+                         phylogeny_file !~ /#{out_suffix}$/ &&
+                         phylogeny_file =~ /#{in_suffix}$/ )
+                    begin
+                        Util.check_file_for_readability( phylogeny_file )
+                    rescue ArgumentError
+                        Util.fatal_error( PRG_NAME, 'can not read from: ' + phylogeny_file + ': '+ $! )
+                    end
+
+                    counter += 1
+
+                    outfile = phylogeny_file.sub( /#{in_suffix}$/, out_suffix )
+
+                    if REMOVE_NI
+                        outfile = outfile.sub( /_ni_/, '_' )
+                    end
+
+                    if File.exists?( outfile )
+                        msg = counter.to_s + ': ' + phylogeny_file + ' -> ' +  outfile +
+                         ' : already exists, skipping'
+                        Util.print_message( PRG_NAME, msg  )
+                        log << msg + NL
+                        next
+                    end
+
+                    Util.print_message( PRG_NAME, counter.to_s + ': ' + phylogeny_file + ' -> ' +  outfile )
+                    log << counter.to_s + ': ' + phylogeny_file + ' -> ' +  outfile + NL
+
+                    phylogeny_id = get_id( phylogeny_file )
+
+                    ids_mapfile_name = nil
+                    domains_mapfile_name = nil
+
+                    if ids_only
+                        ids_mapfile_name = get_file( files, phylogeny_id, IDS_MAPFILE_SUFFIX )
+                    elsif domains_only
+                        domains_mapfile_name = get_file( files, phylogeny_id, DOMAINS_MAPFILE_SUFFIX )
+                    else
+                        ids_mapfile_name = get_file( files, phylogeny_id, IDS_MAPFILE_SUFFIX )
+                        domains_mapfile_name = get_file( files, phylogeny_id, DOMAINS_MAPFILE_SUFFIX )
+                    end
+
+                    if domains_mapfile_name != nil
+                        begin
+                            Util.check_file_for_readability( domains_mapfile_name )
+                        rescue ArgumentError
+                            Util.fatal_error( PRG_NAME, 'failed to read from [#{domains_mapfile_name}]: ' + $! )
+                        end
+                    end
+
+                    if ids_mapfile_name != nil
+                        begin
+                            Util.check_file_for_readability( ids_mapfile_name )
+                        rescue ArgumentError
+                            Util.fatal_error( PRG_NAME, 'failed to read from [#{ids_mapfile_name}]: ' + $! )
+                        end
+                    end
+
+                    if domains_mapfile_name != nil
+                        if ids_mapfile_name != nil
+                            my_outfile = TMP_FILE
+                        else
+                            my_outfile = outfile
+                        end
+                        cmd = decorator + ' ' + DECORATOR_OPTIONS_DOMAINS + ' ' +
+                         '-f=d ' + phylogeny_file + ' ' +
+                         domains_mapfile_name + ' ' + my_outfile
+                        execute_cmd( cmd, log )
+                    end
+
+                    if ids_mapfile_name != nil
+                        if domains_mapfile_name != nil
+                            my_infile = TMP_FILE
+                        else
+                            my_infile = phylogeny_file
+                        end
+                        cmd = decorator + ' ' +  DECORATOR_OPTIONS_SEQ_NAMES + ' ' +
+                         '-f=s ' + my_infile + ' ' +
+                         ids_mapfile_name + ' ' + outfile
+                        execute_cmd( cmd, log )
+                    end
+
+                    if ( File.exists?( TMP_FILE ) )
+                        File.delete( TMP_FILE )
+                    end
+                end
+            }
+            open( LOG_FILE, 'w' ) do | f |
+                f.write( log )
+            end
+            puts
+            Util.print_message( PRG_NAME, 'OK' )
+            puts
+        end # def run
+
+        def execute_cmd( cmd, log )
+            log << 'excuting ' + cmd + NL
+            IO.popen( cmd , 'r+' ) do | pipe |
+                pipe.close_write
+                log << pipe.read + NL + NL
+            end
+            sleep( SLEEP_TIME )
+        end
+
+
+        def get_id( phylogeny_file_name )
+            phylogeny_file_name =~ /^([^_]+)/
+            $1
+        end
+
+        def get_file( files_in_dir, phylogeny_id, suffix_pattern )
+            matching_files = Array.new
+            files_in_dir.each { | file |
+
+                if ( !File.directory?( file ) &&
+                         file !~ /^\./ &&
+                         file !~ /^00/ &&
+                         file =~ /^#{phylogeny_id}.*#{suffix_pattern}$/ )
+                    matching_files << file
+                end
+            }
+            if matching_files.length < 1
+                Util.fatal_error( PRG_NAME, 'no file matching [' + phylogeny_id +
+                     '_] [' + suffix_pattern + '] present in current directory' )
+            elsif matching_files.length > 1
+                Util.fatal_error( PRG_NAME, 'more than one file matching [' + phylogeny_id +
+                     '_] [' + suffix_pattern + '] present in current directory' )
+            end
+            matching_files[ 0 ]
+        end
+
+        def print_help()
+            puts( "Usage:" )
+            puts()
+            puts( "  " + PRG_NAME + ".rb [options] <suffix of intrees to be decorated> <suffix for decorated outtrees> " )
+            puts()
+            puts( "  options: -" + IDS_ONLY_OPTION + ": decorate with sequence/species names only" )
+            puts( "           -" + DOMAINS_ONLY_OPTION + ": decorate with domain structures" )
+            puts()
+        end
+    end # class PhylogenyiesDecorator
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/apps/phylogeny_factory.rb b/forester/ruby/evoruby/lib/evo/apps/phylogeny_factory.rb
new file mode 100644 (file)
index 0000000..999541e
--- /dev/null
@@ -0,0 +1,267 @@
+#
+# = lib/evo/apps/phylogeny_factory - PhylogenyFactory class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: phylogeny_factory.rb,v 1.32 2010/12/13 19:00:11 cmzmasek Exp $
+
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/util/command_line_arguments'
+
+require 'set'
+require 'date'
+
+module Evoruby
+
+    class PhylogenyFactory
+
+        # Program metadata; passed to Util.print_program_information by #run.
+        PRG_NAME       = "phylogeny_factory"
+        PRG_DATE       = "2010.05.26"
+        PRG_DESC       = "automated phylogeny reconstruction using queing system"
+        PRG_VERSION    = "1.1"
+        COPYRIGHT      = "2010 Christian M Zmasek"
+        CONTACT        = "phylosoft@gmail.com"
+        WWW            = "www.phylosoft.org"
+
+        # Command line option that makes #run submit each command with qsub
+        # instead of launching it locally with nohup.
+        USE_JOB_SUBMISSION_SYSTEM_OPTION  = 's'
+        # Log file written at the end of #run; #run refuses to start if it exists.
+        LOG_FILE                          = '00_phylogeny_factory.log'
+        # Template file read by #run from the current directory.
+        TEMPLATE_FILE                     = '00_phylogeny_factory.template'
+        # Prefix put in front of the alignment name when commands go through qsub.
+        PBS_O_WORKDIR                     = '$PBS_O_WORKDIR/'
+        # Substituted for the RSL placeholder when the template has no value for an id.
+        MIN_LENGTH_DEFAULT                = 40
+        # Passed to qsub as "-l walltime=...".
+        WALLTIME                          = '100:00:00'
+        # Passed to qsub as "-q ...".
+        QUEUE                             = 'default'
+
+        # Suffix of the temporary command file handed to qsub.
+        TMP_CMD_FILE_SUFFIX = '_QSUB'
+
+        # Keywords of the "% HMM <id> = <value>" and "% RSL <id> = <value>"
+        # template lines; also the names of the matching %[...]% placeholders.
+        HMM                 = 'HMM'
+        RSL                 = 'RSL'
+
+        # Delimiters of option placeholders in command lines, e.g. %[HMM]%.
+        OPTION_OPEN          = '%['
+        OPTION_CLOSE          = ']%'
+
+        # Seconds to sleep after launching each command.
+        WAIT                 = 1.0
+
+        # Line delimiter used when composing commands and the log.
+        NL = Constants::LINE_DELIMITER
+
+        # Reads TEMPLATE_FILE from the current directory, expands the command
+        # blocks it defines for every alignment file found in the current
+        # directory, launches the resulting commands (through qsub when the
+        # -s option is set, otherwise locally with nohup) and finally writes
+        # LOG_FILE.
+        #
+        # Template lines are interpreted by their first character:
+        #   #                 comment
+        #   $ NAME = PATH     path placeholder (see #prepare)
+        #   % HMM ID = VALUE  value of the HMM placeholder for alignment id ID
+        #   % RSL ID = VALUE  value of the RSL placeholder for alignment id ID
+        #   % NAME = VALUE    value of the option placeholder NAME
+        #   > COMMAND         appends COMMAND to the current command block
+        #   -                 ends the current command block
+        #
+        # Exits with status -1 if LOG_FILE already exists or TEMPLATE_FILE is
+        # missing.
+        def run
+
+            Util.print_program_information( PRG_NAME,
+                PRG_VERSION,
+                PRG_DESC,
+                PRG_DATE,
+                COPYRIGHT,
+                CONTACT,
+                WWW,
+                STDOUT )
+
+            begin
+                cla = CommandLineArguments.new( ARGV )
+            rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+
+            allowed_opts = Array.new
+            allowed_opts.push( USE_JOB_SUBMISSION_SYSTEM_OPTION )
+
+            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+            if ( disallowed.length > 0 )
+                Util.fatal_error( PRG_NAME,
+                    "unknown option(s): " + disallowed,
+                    STDOUT )
+            end
+
+            # File.exist? is used throughout: File.exists? was removed in Ruby 3.2.
+            if File.exist?( LOG_FILE )
+                puts( '[' + PRG_NAME + '] > log file [' + LOG_FILE + '] already exists' )
+                exit( -1 )
+            end
+
+            if !File.exist?( TEMPLATE_FILE )
+                puts( '[' + PRG_NAME + '] > template file [' + TEMPLATE_FILE + '] not found' )
+                exit( -1 )
+            end
+
+            use_job_submission_system = false
+            if cla.is_option_set?( USE_JOB_SUBMISSION_SYSTEM_OPTION )
+                use_job_submission_system = true
+            end
+
+            log = String.new
+
+            now = DateTime.now
+            log << "Program     : " + PRG_NAME + NL
+            log << "Version     : " + PRG_VERSION + NL
+            log << "Program date: " + PRG_DATE + NL + NL
+            log << "Date/time   : " + now.to_s + NL
+            log << "Directory   : " + Dir.getwd  + NL + NL
+
+            puts( '[' + PRG_NAME + '] > reading ' + TEMPLATE_FILE )
+
+            paths       = Hash.new  # path placeholder -> full path
+            min_lengths = Hash.new  # alignment id -> minimal length
+            hmms        = Hash.new  # alignment id -> hmm
+            options     = Hash.new  # option placeholder -> option
+            ids         = Set.new
+
+            commands    = Array.new
+
+            log <<  "////////////////////////////////////////////////////////////////// #{NL}"
+            log << "Template file [" + TEMPLATE_FILE + "]:#{NL}"
+
+            command = String.new
+
+            # File.foreach closes the template once all lines are read; the
+            # former open( ... ).each left the file handle open.
+            File.foreach( TEMPLATE_FILE ) { | line |
+                log << line
+                if ( line =~ /^#/ )
+                elsif ( line =~ /^\$\s*(\S+)\s*=\s*(\S+)/ )
+                    paths[ $1 ] = $2
+                    puts( '[' + PRG_NAME + '] > paths      : ' + $1 + ' => ' + $2 )
+
+                elsif ( line =~ /^%\s*#{HMM}\s*(\S+)\s*=\s*(\S+)/ )
+                    hmms[ $1 ] = $2
+                    puts( '[' + PRG_NAME + '] > hmms       : ' + $1 + ' => ' + $2 )
+
+                elsif ( line =~ /^%\s*#{RSL}\s*(\S+)\s*=\s*(\S+)/ )
+                    min_lengths[ $1 ] = $2
+                    puts( '[' + PRG_NAME + '] > min lengths: ' + $1 + ' => ' + $2 )
+
+                elsif ( line =~ /^%\s*(\S+)\s*=\s*(\S+)/ )
+                    options[ $1 ] = $2
+                    puts( '[' + PRG_NAME + '] > options    : ' + $1 + ' => ' + $2 )
+
+                elsif ( line =~ /^>\s*(.+)/ )
+                    command = command + $1 + ";#{NL}"
+
+                elsif ( line =~ /^-/  )
+                    # End of a command block: resolve the path placeholders
+                    # known so far and start collecting the next block.
+                    commands << prepare( command, paths )
+                    command = String.new
+                end
+            }
+            log << "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ #{NL}#{NL}"
+
+            files = Dir.entries( "." )
+
+            files.each { | file |
+                # The dots of ".bck" and ".log" are escaped: unescaped they
+                # matched any character, so that e.g. a file named
+                # "x_catalog" was silently skipped.
+                if ( !File.directory?( file ) &&
+                         file !~ /^\./ &&
+                         file !~ /#{TEMPLATE_FILE}/ &&
+                         file !~ /\.bck$/ &&
+                         file !~ /\.log$/ &&
+                         file !~ /nohup/ &&
+                         file !~ /^00/ )
+                    aln_name = file.to_str
+                    id = get_id( aln_name )
+                    ids.add( id )
+                    puts( '[' + PRG_NAME + '] > file [id]  : ' + aln_name + ' [' + id + ']' )
+                    commands.each do | cmd |
+
+                        cmd = subst_hmm( cmd, aln_name, hmms )
+                        cmd = subst_min_length( cmd, aln_name, min_lengths )
+                        cmd = subst_options( cmd, options )
+                        if use_job_submission_system
+                            cmd = subst_aln_name( cmd, PBS_O_WORKDIR + aln_name )
+                        else
+                            cmd = subst_aln_name( cmd, aln_name )
+                        end
+
+                        if ( cmd =~ /%/ )
+                            # A lone '%' has no closing partner; report the rest
+                            # of its line then (formerly nil was concatenated,
+                            # which raised a TypeError).
+                            problem = cmd[ /%.*?%/ ] || cmd[ /%.*/ ]
+                            puts( '[' + PRG_NAME + '] > WARNING    : [' + id + '] command still contains placeholder: ' + problem )
+                            log << "WARNING: command still contains placeholder: " + cmd + NL
+                        else
+                            tmp_cmd_file = file.to_str[ 0..4 ] + TMP_CMD_FILE_SUFFIX
+                            if ( File.exist?( tmp_cmd_file ) )
+                                File.delete( tmp_cmd_file )
+                            end
+                            if use_job_submission_system
+                                open( tmp_cmd_file, 'w' ) do |f|
+                                    f.write( cmd )
+                                end
+                            end
+
+                            log << cmd + NL
+
+                            if use_job_submission_system
+                                IO.popen( 'qsub -q ' + QUEUE  + ' -l walltime=' + WALLTIME + ' ' + tmp_cmd_file , 'r+' ) do | pipe |
+                                    pipe.close_write
+                                end
+                            else
+                                spawn( 'nohup ' + cmd + ' &', STDERR => "/dev/null" )
+                            end
+
+                            # Pause between launches, then remove the
+                            # temporary qsub command file (if one was written).
+                            sleep( WAIT )
+                            if ( File.exist?( tmp_cmd_file ) )
+                                File.delete( tmp_cmd_file )
+                            end
+                        end
+                    end
+                end
+            }
+
+            open( LOG_FILE, 'w' ) do | f |
+                f.write( log )
+            end
+
+            puts()
+            puts( '[' + PRG_NAME + '] > OK' )
+            puts()
+
+        end # def run
+
+        # Replaces every path placeholder (a key of paths) occurring in
+        # command by the corresponding full path and returns the result.
+        def prepare( command, paths )
+            paths.inject( command ) { | expanded, ( placeholder, full_path ) |
+                expanded.gsub( placeholder, full_path )
+            }
+        end
+
+        # Replaces each option placeholder of the form %[NAME]% in command by
+        # options[ NAME ] and returns the resulting string. Placeholders whose
+        # name is not a key of options, or whose value is empty, are left
+        # untouched.
+        def subst_options( command, options )
+            # The name is matched non-greedily: with a greedy \S+ two
+            # placeholders not separated by whitespace (e.g. "%[A]%%[B]%")
+            # were taken for one unknown placeholder and never replaced.
+            # The block form inserts the value literally.
+            command.gsub( /%\[(\S+?)\]%/ ) do | placeholder |
+                opt_value = options[ $1 ]
+                ( opt_value.nil? || opt_value.size < 1 ) ? placeholder : opt_value
+            end
+        end
+
+        # Returns a copy of command in which every '$' is replaced by aln_name.
+        def subst_aln_name( command, aln_name )
+            aln_placeholder = '$'
+            command.gsub( aln_placeholder, aln_name )
+        end
+
+        # Replaces the HMM placeholder in command by the value that hmms holds
+        # for the id of aln_name. Returns command unchanged when hmms has no
+        # (or an empty) value for that id.
+        def subst_hmm( command, aln_name, hmms )
+            hmm = hmms[ get_id( aln_name ) ]
+            return command if hmm == nil || hmm.size < 1
+            command.gsub( OPTION_OPEN + HMM + OPTION_CLOSE, hmm )
+        end
+
+        # Replaces the RSL placeholder in command by the minimal length that
+        # min_lengths holds for the id of aln_name, or by MIN_LENGTH_DEFAULT
+        # when there is no (or an empty) value for that id.
+        def subst_min_length( command, aln_name, min_lengths )
+            configured = min_lengths[ get_id( aln_name ) ]
+            min_length = ( configured != nil && configured.size > 0 ) ? configured : MIN_LENGTH_DEFAULT.to_s
+            command.gsub( OPTION_OPEN + RSL + OPTION_CLOSE, min_length )
+        end
+
+        # Extracts the alignment id from a file name: the run of
+        # non-underscore characters matched by /^([^_]+)/.
+        # Returns nil when the name does not match (e.g. it starts with '_').
+        def get_id( aln_name )
+            id_match = /^([^_]+)/.match( aln_name )
+            id_match ? id_match[ 1 ] : nil
+        end
+
+    end # class PhylogenyFactory
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/apps/taxonomy_processor.rb b/forester/ruby/evoruby/lib/evo/apps/taxonomy_processor.rb
new file mode 100644 (file)
index 0000000..d688844
--- /dev/null
@@ -0,0 +1,297 @@
+#
+# = lib/evo/apps/taxonomy_processor - TaxonomyProcessor class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: taxonomy_processor.rb,v 1.26 2010/12/13 19:00:11 cmzmasek Exp $
+
+
+require 'lib/evo/util/util'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/msa/msa'
+require 'lib/evo/io/msa_io'
+require 'lib/evo/io/parser/fasta_parser'
+require 'lib/evo/io/parser/general_msa_parser'
+require 'lib/evo/io/writer/fasta_writer'
+require 'lib/evo/io/writer/phylip_sequential_writer'
+require 'lib/evo/util/command_line_arguments'
+
+module Evoruby
+
+  class TaxonomyProcessor
+
+    # Program metadata; passed to Util.print_program_information by #run.
+    PRG_NAME       = "tap"
+    PRG_DATE       = "2010.02.24"
+    PRG_DESC       = "replacement of species names in multiple sequence files"
+    PRG_VERSION    = "1.01"
+    COPYRIGHT      = "2010 Christian M Zmasek"
+    CONTACT        = "phylosoft@gmail.com"
+    WWW            = "www.phylosoft.org"
+
+    # Command line option that switches on taxonomy extraction in #modify_name.
+    EXTRACT_TAXONOMY_OPTION = "t"
+
+    # Starts with an empty table of the taxonomies seen so far
+    # (code => species, filled by #modify_name).
+    def initialize
+      @taxonomies = {}
+    end
+
+    # Command line entry point: reads the input alignment (Fasta or Phylip
+    # like, decided by Util.looks_like_fasta?), replaces every sequence name
+    # by the short name computed by #modify_name, writes the renamed
+    # alignment to the output file and the old/new name list to the list
+    # file.
+    #
+    # Expects 3 file names (input, output, list file) or 4 (map file first).
+    # Terminates through Util.fatal_error if an output file already exists,
+    # an input file is missing or the alignment cannot be read or written.
+    def run()
+
+      Util.print_program_information( PRG_NAME,
+        PRG_VERSION,
+        PRG_DESC,
+        PRG_DATE,
+        COPYRIGHT,
+        CONTACT,
+        WWW,
+        STDOUT )
+
+      if ( ARGV == nil || ( ARGV.length != 3 && ARGV.length != 4 && ARGV.length != 5 && ARGV.length != 6 ) )
+        puts( "Usage: #{PRG_NAME}.rb [options] [input map file] <input sequences> <output sequences> <output id list>" )
+        puts()
+        puts( "  options: -" + EXTRACT_TAXONOMY_OPTION + ": to extract taxonomy information from bracketed expression" )
+        puts()
+        exit( -1 )
+      end
+
+      begin
+        cla = CommandLineArguments.new( ARGV )
+      rescue ArgumentError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+      mapfile   = nil
+      input     = nil
+      output    = nil
+      list_file = nil
+
+      if cla.get_number_of_files == 4
+        mapfile   = cla.get_file_name( 0 )
+        input     = cla.get_file_name( 1 )
+        output    = cla.get_file_name( 2 )
+        list_file = cla.get_file_name( 3 )
+      elsif cla.get_number_of_files == 3
+        input     = cla.get_file_name( 0 )
+        output    = cla.get_file_name( 1 )
+        list_file = cla.get_file_name( 2 )
+      end
+
+
+      allowed_opts = Array.new
+      allowed_opts.push( EXTRACT_TAXONOMY_OPTION )
+
+      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+      if ( disallowed.length > 0 )
+        Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
+      end
+
+      # The argument count check above also lets through combinations that
+      # contain neither 3 nor 4 file names; without this check the nil file
+      # names crashed the File tests below.
+      if ( input == nil || output == nil || list_file == nil )
+        Util.fatal_error( PRG_NAME, "expected 3 or 4 file names but got " + cla.get_number_of_files.to_s )
+      end
+
+      extract_taxonomy = false
+      if ( cla.is_option_set?( EXTRACT_TAXONOMY_OPTION ) )
+        extract_taxonomy = true
+      end
+
+      # File.exist? is used throughout: File.exists? was removed in Ruby 3.2.
+      if ( File.exist?( output ) )
+        Util.fatal_error( PRG_NAME, "outfile [" + output + "] already exists" )
+      end
+      if ( File.exist?( list_file ) )
+        Util.fatal_error( PRG_NAME, "list file [" + list_file + "] already exists" )
+      end
+      if ( !File.exist?( input) )
+        Util.fatal_error( PRG_NAME, "infile [" + input + "] does not exist" )
+      end
+      if ( mapfile != nil && !File.exist?( mapfile ) )
+        Util.fatal_error( PRG_NAME, "mapfile [" + mapfile + "] does not exist" )
+      end
+
+      fasta_like = Util.looks_like_fasta?( input )
+
+      puts()
+      if mapfile != nil
+        puts( "Map file        : " + mapfile )
+      end
+      puts( "Input alignment : " + input )
+      puts( "Output alignment: " + output )
+      puts( "Name list       : " + list_file )
+      if ( fasta_like )
+        puts( "Format          : Fasta"  )
+      else
+        puts( "Format          : Phylip like" )
+      end
+      if ( extract_taxonomy )
+        puts( "Extract taxonomy: true"  )
+      end
+      puts()
+
+      # Map file lines are either "<key>#<value>" or "<key> <value>".
+      species_map = Hash.new
+      if mapfile != nil
+        File.open( mapfile ) do | file |
+          while line = file.gets
+            if ( line =~/(.+)#(.+)/ || line =~/(.+)\s+(.+)/ )
+              species_map[ $1 ] = $2
+              Util.print_message( PRG_NAME, "mapping: " + $1 + ' => ' + $2 )
+            end
+          end
+        end
+      end
+
+      f = MsaFactory.new()
+      begin
+        if ( fasta_like )
+          msa = f.create_msa_from_file( input, FastaParser.new() )
+        else
+          msa = f.create_msa_from_file( input, GeneralMsaParser.new() )
+        end
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "failed to read file: " + e.to_s )
+      end
+
+      if ( msa == nil || msa.get_number_of_seqs() < 1 )
+        Util.fatal_error( PRG_NAME, "failed to read MSA" )
+      end
+      begin
+        Util.check_file_for_writability( list_file )
+      rescue Exception => e
+        # Was "e.to_", an undefined method that masked the real error.
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+      end
+
+      #removed = msa.remove_redundant_sequences!( true )
+      #if removed.size > 0
+      #  Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
+      #  removed.each { | seq_name |
+      #    puts seq_name
+      #  }
+      #  Util.print_message( PRG_NAME, "will process " + msa.get_number_of_seqs.to_s + " non redundant sequences" )
+      #end
+
+      lf = File.open( list_file, "a" )
+      for i in 0 ... msa.get_number_of_seqs
+        seq  = msa.get_sequence( i )
+        seq.set_name( Util::normalize_seq_name( modify_name( seq.get_name(), i, lf, species_map, extract_taxonomy ), 10 ) )
+      end
+
+      io = MsaIO.new()
+      w = nil
+      if ( fasta_like )
+        w = FastaWriter.new()
+      else
+        w = PhylipSequentialWriter.new()
+      end
+      w.set_max_name_length( 10 )
+      w.clean( true )
+      begin
+        io.write_to_file( msa, output, w )
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
+      end
+      lf.close()
+      if ( @taxonomies.length > 0 )
+        Util.print_message( PRG_NAME, "number of unique taxonomies: " + @taxonomies.length.to_s )
+      end
+      Util.print_message( PRG_NAME, "wrote: " + list_file )
+      Util.print_message( PRG_NAME, "wrote: " + output )
+      Util.print_message( PRG_NAME, "OK" )
+    end
+
+    private
+
+    # Computes the short name "<counter in hex>_<species code>" for the
+    # sequence description desc, appends a "<new name>: <desc>" line to file
+    # and returns the new name.
+    #
+    # The species code is taken from, in this order:
+    # 1. a leading "<name>_<CODE>" token in desc (CODE: 3-5 digits or
+    #    capital letters);
+    # 2. if extract_taxonomy is set: the value of the first key of
+    #    species_map found in desc, or else the bracketed expression "[...]"
+    #    in desc (recorded in @taxonomies to detect clashes);
+    # 3. otherwise: the value of the first key of species_map found in desc,
+    #    shortened to at most five capital letters.
+    # Reports a fatal error through Util.fatal_error when no code can be
+    # derived.
+    def modify_name( desc, counter, file, species_map, extract_taxonomy )
+      new_desc = nil
+      my_species = nil
+      if desc =~ /^>?\s*\S{1,10}_([0-9A-Z]{3,5})/
+        new_desc = counter.to_s( 16 ) + "_" + $1
+      elsif extract_taxonomy
+        if ( desc.count( "[" ) != desc.count( "]" ) )
+          Util.fatal_error( PRG_NAME, "illegal bracket count in: " + desc )
+        end
+        species = nil
+        species_map.each_key do | key |
+          # Boundaries prevent e.g. RAT matching ARATH. This used to be
+          # written as the character class [\b|_], in which "\b" means
+          # backspace, so that only '|' and '_' were accepted before the key.
+          if desc =~ /(?:\b|_)#{key}\b/
+            species = species_map[ key ]
+            new_desc = counter.to_s( 16 ) + "_" + species
+            break
+          end
+        end
+        if species == nil
+          if desc =~/.*\[(\S{3,}?)\]/
+            species = $1
+            species.strip!
+            species.upcase!
+            species.gsub!( /\s+/, " " )
+            species.gsub!( /-/, "" )
+            species.gsub!( /\)/, "" )
+            species.gsub!( /\(/, "" )
+            species.gsub!( /\'/, "" )
+            if species =~ /\S+\s\S+/ || species =~ /\S{3,5}/
+              # Two further branches for names of three and more words were
+              # removed: they could never be reached behind these two cases
+              # and read the not yet assigned variable "code".
+              if species =~ /(\S+)\s(\S+)/
+                code = $1[ 0..2 ] + $2[ 0..1 ]
+              else
+                code = species
+              end
+              new_desc = counter.to_s( 16 ) + "_" + code
+              if @taxonomies.has_key?( code )
+                # Compare with the species stored for this very code; testing
+                # has_value? missed a clash whenever the species was already
+                # stored under a different code.
+                if ( @taxonomies[ code ] != species )
+                  Util.fatal_error( PRG_NAME, "code [#{code}] is not unique in [#{desc}]" )
+                end
+              else
+                if ( @taxonomies.has_value?( species ) )
+                  Util.fatal_error( PRG_NAME, "genome [#{species}] is not unique in [#{desc}]" )
+                else
+                  @taxonomies[ code ] = species
+                end
+              end
+            else
+              Util.fatal_error( PRG_NAME, "illegal format [#{species}] in: " + desc )
+            end
+          else
+            Util.fatal_error( PRG_NAME, "illegal format in: " + desc )
+          end
+        end
+      else
+        species = nil
+        my_species = nil
+        species_map.each_key do | key |
+          if desc =~ /#{key}/
+            species = species_map[ key ]
+            species = species.gsub( /\s+/, "" )
+            species = species.gsub( /_/, " " )
+            # Full species name, written to the list file below.
+            my_species = species
+            if species =~ /(\S+)\s+(\S+)/
+              species = $1[0..2] + $2[0..1]
+            end
+            species = species.gsub( /\s+/, "" )
+            species = species.slice(0, 5)
+            species.upcase!
+            break
+          end
+        end
+        if species == nil
+          Util.fatal_error( PRG_NAME, "species not found in: " + desc  )
+        else
+          new_desc = counter.to_s( 16 ) + "_" + species
+        end
+      end
+      if new_desc == nil
+        Util.fatal_error( PRG_NAME, "failed to extract species from: " + desc  )
+      end
+      if my_species != nil
+        file.print( new_desc + ": " + desc + " [" + my_species + "]" + "\n" )
+      else
+        file.print( new_desc + ": " + desc + "\n" )
+      end
+      new_desc
+    end
+
+  end # class TaxonomyProcessor
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/apps/tseq_taxonomy_processor.rb b/forester/ruby/evoruby/lib/evo/apps/tseq_taxonomy_processor.rb
new file mode 100644 (file)
index 0000000..f708247
--- /dev/null
@@ -0,0 +1,190 @@
+#
+# = lib/evo/apps/tseq_taxonomy_processor - TseqTaxonomyProcessor class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: tseq_taxonomy_processor.rb,v 1.6 2010/12/13 19:00:11 cmzmasek Exp $
+
+
+require 'lib/evo/util/util'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/msa/msa'
+require 'lib/evo/io/msa_io'
+require 'lib/evo/io/parser/sp_taxonomy_parser'
+require 'lib/evo/io/parser/ncbi_tseq_parser'
+require 'lib/evo/io/writer/fasta_writer'
+require 'lib/evo/io/writer/phylip_sequential_writer'
+require 'lib/evo/util/command_line_arguments'
+
+module Evoruby
+
+    class TseqTaxonomyProcessor
+
+        # Program metadata; passed to Util.print_program_information by #run.
+        PRG_NAME       = "tseq_tap"
+        PRG_DATE       = "2009.01.06"
+        PRG_DESC       = "preprocessing of multiple sequence files in ncbi tseq xml format"
+        PRG_VERSION    = "1.02"
+        COPYRIGHT      = "2009 Christian M Zmasek"
+        CONTACT        = "phylosoft@gmail.com"
+        WWW            = "www.phylosoft.org"
+
+        # Field labels of the tab separated lines that #modify_name writes to
+        # the mapping file. TAXONOMY_CN and SEQ_SECONDARY_ACCESSION are not
+        # used by the methods of this class.
+        TAXONOMY_CODE           = "TAXONOMY_CODE:"
+        TAXONOMY_ID             = "TAXONOMY_ID:"
+        TAXONOMY_ID_TYPE        = "TAXONOMY_ID_TYPE:"
+        TAXONOMY_SN             = "TAXONOMY_SN:"
+        TAXONOMY_CN             = "TAXONOMY_CN:"
+        SEQ_ACCESSION           = "SEQ_ACCESSION:"
+        SEQ_ACCESSION_SOURCE    = "SEQ_ACCESSION_SOURCE:"
+        SEQ_SECONDARY_ACCESSION = "SEQ_SECONDARY_ACCESSION:"
+        SEQ_SYMBOL              = "SEQ_SYMBOL:"
+        SEQ_NAME                = "SEQ_NAME:"
+        SEQ_MOL_SEQ             = "SEQ_MOL_SEQ:"
+
+        # Starts with an empty cache of taxonomy lookups
+        # (taxonomy id => matching entry, filled by #modify_name).
+        def initialize
+            @tax_ids_to_sp_taxonomies = {}
+        end
+
+        # Command line entry point. Expects exactly four file names: the
+        # taxonomy file, the sequences in tseq xml format, the Fasta output
+        # file and the mapping output file. Reads the sequences, removes
+        # redundant ones, renames every remaining sequence with #modify_name
+        # (which also writes one line per sequence to the mapping file) and
+        # writes the renamed sequences in Fasta format.
+        def run()
+
+            Util.print_program_information( PRG_NAME,
+                PRG_VERSION,
+                PRG_DESC,
+                PRG_DATE,
+                COPYRIGHT,
+                CONTACT,
+                WWW,
+                STDOUT )
+
+            if  ARGV == nil || ARGV.length != 4
+                puts( "Usage: #{PRG_NAME}.rb <sp taxonomy file> <sequences in tseq xml format> <name for fasta outfile> <name for map outfile>" )
+                puts()
+
+                exit( -1 )
+            end
+
+            begin
+                cla = CommandLineArguments.new( ARGV )
+            rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            end
+            # No options are allowed: the list of allowed options stays empty.
+            allowed_opts = Array.new
+            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+            if ( disallowed.length > 0 )
+                Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
+            end
+
+            sp_taxonomy_infile = cla.get_file_name( 0 )
+            sequences_infile = cla.get_file_name( 1 )
+            sequences_outfile = cla.get_file_name( 2 )
+            mapping_outfile = cla.get_file_name( 3 )
+
+            Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile )
+            Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile )
+            Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile )
+            Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile )
+
+            sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile )
+
+            Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile )
+
+            tseq_parser = NcbiTSeqParser.new
+            msa_fac = MsaFactory.new
+
+            seqs = msa_fac.create_msa_from_file( sequences_infile, tseq_parser )
+
+            Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile )
+
+            # removed holds the names of the sequences that were dropped.
+            removed = seqs.remove_redundant_sequences!( true, true )
+
+            if removed.size > 0
+                Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
+                removed.each { | seq_name |
+                    puts seq_name
+                }
+                Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" )
+            end
+
+            mapping_out = File.open( mapping_outfile, "a" )
+
+            # Rename every sequence; a sequence without taxonomy is fatal.
+            for i in 0 ... seqs.get_number_of_seqs
+                seq = seqs.get_sequence( i )
+                if seq.get_taxonomy == nil
+                    Util.fatal_error( PRG_NAME, "sequence [" + seq.get_name + "] has no taxonomy information" )
+                end
+                seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) )
+            end
+
+            io = MsaIO.new()
+
+            w = FastaWriter.new()
+
+            w.set_max_name_length( 10 )
+            w.clean( true )
+            begin
+                io.write_to_file( seqs, sequences_outfile, w )
+            rescue Exception => e
+                Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
+            end
+            mapping_out.close()
+
+            Util.print_message( PRG_NAME, "wrote: " + mapping_outfile )
+            Util.print_message( PRG_NAME, "wrote: " + sequences_outfile )
+            Util.print_message( PRG_NAME, "OK" )
+
+        end
+
+        private
+
+        # Computes the short name "<i in hex>_<taxonomy code>" for seq, writes
+        # one tab separated line describing the sequence to mapping_outfile
+        # (an open, writable IO) and returns the new name.
+        #
+        # The taxonomy code comes from the entry of sp_taxonomies whose id
+        # equals the taxonomy id of seq. Reports a fatal error through
+        # Util.fatal_error when that id is missing from, or not unique in,
+        # sp_taxonomies.
+        def modify_name( seq, i, sp_taxonomies, mapping_outfile )
+
+            tax_id = seq.get_taxonomy.get_id
+            matching_sp_taxonomy = nil
+
+            if @tax_ids_to_sp_taxonomies.has_key?( tax_id )
+                # This is so that a second lookup will be much faster.
+                matching_sp_taxonomy = @tax_ids_to_sp_taxonomies[ tax_id ]
+            else
+                # The whole list is scanned (no early exit) so that a
+                # duplicated id is detected.
+                sp_taxonomies.each { |sp_taxonomy|
+                    if ( sp_taxonomy.id == tax_id )
+                        if  matching_sp_taxonomy != nil
+                            Util.fatal_error( PRG_NAME, "taxonomy id [" + tax_id.to_s + "] is not unique" )
+                        end
+                        matching_sp_taxonomy = sp_taxonomy
+                        @tax_ids_to_sp_taxonomies[ tax_id ] = sp_taxonomy
+                    end
+                }
+            end
+            if  matching_sp_taxonomy == nil
+                Util.fatal_error( PRG_NAME, "taxonomy id [" + tax_id.to_s + "] for [" +  seq.get_taxonomy.get_name + "] not found" )
+            end
+
+            new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code
+
+            # Drop a trailing "[...]" (redundant taxonomy information) and a
+            # leading "hypothetical protein" (pointless information); sub
+            # leaves the name alone when a pattern does not match.
+            seq_name = seq.get_name.sub( /\[.+\]$/, '' )
+            seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' )
+
+            # tax_id goes through to_s here as it does in the error messages
+            # above, so that a non-String id cannot break the concatenation.
+            mapping_outfile.print( new_name + "\t" +
+                 TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
+                 TAXONOMY_ID + tax_id.to_s + "\t" +
+                 TAXONOMY_ID_TYPE + seq.get_taxonomy.get_id_source + "\t" +
+                 TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
+                 SEQ_ACCESSION + seq.get_accession + "\t" +
+                 SEQ_ACCESSION_SOURCE + seq.get_accession_source + "\t" +
+                 SEQ_SYMBOL + seq.get_symbol + "\t" +
+                 SEQ_NAME + seq_name + "\t" +
+                 SEQ_MOL_SEQ + seq.get_sequence_as_string +
+                 Constants::LINE_DELIMITER )
+            new_name
+        end
+
+    end 
+
+end # module Evoruby
\ No newline at end of file
diff --git a/forester/ruby/evoruby/lib/evo/io/msa_io.rb b/forester/ruby/evoruby/lib/evo/io/msa_io.rb
new file mode 100644 (file)
index 0000000..6f96f01
--- /dev/null
@@ -0,0 +1,24 @@
+#
+# = lib/evo/io/msa_io.rb - MsaIO class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: msa_io.rb,v 1.2 2007/06/12 04:51:35 cmzmasek Exp $
+#
+# last modified: 05/16/2007
+
+module Evoruby
+
+  # Writes multiple sequence alignments to files; the output format is
+  # determined by the writer object handed to #write_to_file.
+  class MsaIO
+
+        def initialize()
+        end
+
+        # Writes msa to the file at path by calling
+        # msa_writer.write( msa, path ) and returns what that call returns.
+        def write_to_file( msa, path, msa_writer )
+            msa_writer.write( msa, path )
+        end
+
+  end # class MsaIO
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/io/parser/basic_table_parser.rb b/forester/ruby/evoruby/lib/evo/io/parser/basic_table_parser.rb
new file mode 100644 (file)
index 0000000..31f27b4
--- /dev/null
@@ -0,0 +1,41 @@
+#
+# = lib/evo/io/parser/basic_table_parser - BasicTableParser class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: basic_table_parser.rb,v 1.3 2007/09/28 03:12:10 cmzmasek Exp $
+#
+# last modified: 05/16/2007
+
+module Evoruby
+
+    # Parses delimiter separated text files into BasicTable objects.
+    class BasicTableParser
+
+        START_OF_COMMENT_LINE_CHAR = "#"
+
+        # Reads the file at path and returns a BasicTable with one row per
+        # data line. Each line is split at column_delimiter and the values
+        # are stored with surrounding whitespace removed. Lines that
+        # Util.is_string_empty? reports as empty and lines starting with
+        # START_OF_COMMENT_LINE_CHAR are skipped.
+        #
+        # raises ArgumentError
+        def BasicTableParser.parse( path, column_delimiter )
+            Util.check_file_for_readability( path )
+            table = BasicTable.new
+            row = 0
+            File.open( path ) do | file |
+                while line = file.gets
+                    if ( !Util.is_string_empty?( line ) &&
+                         !line.slice( 0, 1 ).eql?( START_OF_COMMENT_LINE_CHAR ) )
+                        line.split( column_delimiter ).each_with_index { | value, col |
+                            # String#strip! returns nil when there is nothing
+                            # to strip, which stored nil for every value that
+                            # was already clean; strip always returns the value.
+                            table.set_value( row, col, value.strip )
+                        }
+                        row += 1
+                    end
+                end
+            end
+            return table
+        end
+
+    end # class BasicTableParser
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/io/parser/fasta_parser.rb b/forester/ruby/evoruby/lib/evo/io/parser/fasta_parser.rb
new file mode 100644 (file)
index 0000000..2e3d51a
--- /dev/null
@@ -0,0 +1,77 @@
+#
+# = lib/evo/io/parser/fasta_parser - FastaParser class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: fasta_parser.rb,v 1.11 2010/10/08 22:04:17 cmzmasek Exp $
+#
+# last modified: 05/17/2007
+
+require 'lib/evo/io/parser/msa_parser'
+require 'lib/evo/msa/msa'
+
+require 'iconv'
+
+module Evoruby
+
+    class FastaParser < MsaParser
+
+        # The parser keeps no state between calls of #parse.
+        def initialize; end
+
+        # Reads the Fasta formatted file at path and returns its sequences as
+        # a Msa. A line starting with '>' begins a new sequence and supplies
+        # its name; the following lines are concatenated, with all whitespace
+        # removed, into its residues. Lines accepted by #can_ignore? are
+        # skipped. Raises IOError for a line that fits none of these cases.
+        def parse( path )
+            Util.check_file_for_readability( path )
+            msa         = Msa.new
+            seq_name    = String.new()
+            residues    = String.new()
+            header_seen = false
+            converter   = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' )
+            File.open( path ) do | file |
+                file.each_line do | raw_line |
+                    line = converter.iconv( raw_line )
+                    next if can_ignore?( line, header_seen )
+                    if line =~ /^\s*>\s*(.+)/
+                        # New header: store the sequence collected so far.
+                        header_seen = true
+                        add_seq( seq_name, residues, msa )
+                        seq_name = $1
+                        residues = String.new()
+                    elsif line =~ /^\s*(.+)/
+                        if seq_name.length < 1
+                            raise IOError, "format error at: " + line
+                        end
+                        # was: seq = $1.rstrip
+                        residues = residues + $1.gsub(/\s+/, '')
+                    else
+                        raise IOError, "Unexpected line: " + line
+                    end
+                end
+            end
+            # Store the last sequence of the file.
+            add_seq( seq_name, residues, msa )
+            msa
+        end
+
+        private
+
+        # Adds the sequence to msa unless its name or its residues are empty.
+        def add_seq( name, seq, msa )
+            return if name.length < 1 || seq.length < 1
+            msa.add( name, seq )
+        end
+
+        # Tells whether line carries no sequence data: it is blank, starts
+        # (after optional whitespace) with '#', '%', '//' or '!!', or - as
+        # long as no sequence header has been seen - is matched by
+        # /^\s*[^>]/. The result is to be used as a truth value only.
+        def can_ignore?( line, saw_first_seq )
+            blank          = line !~ /\S/
+            hash_comment   = line =~ /^\s*#/
+            percent_line   = line =~ /^\s*%/
+            slashes_line   = line =~ /^\s*\/\//
+            bangs_line     = line =~ /^\s*!!/
+            before_headers = !saw_first_seq && line =~/^\s*[^>]/
+            return ( blank || hash_comment || percent_line || slashes_line ||
+                 bangs_line || before_headers )
+        end
+
+    end # class FastaParser
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/io/parser/general_msa_parser.rb b/forester/ruby/evoruby/lib/evo/io/parser/general_msa_parser.rb
new file mode 100644 (file)
index 0000000..6a170e3
--- /dev/null
@@ -0,0 +1,108 @@
+#
+# = lib/evo/io/parser/general_msa_parser - GeneralMsaParser class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: general_msa_parser.rb,v 1.8 2009/10/08 22:44:54 cmzmasek Exp $
+#
+# last modified: 2009/10/08
+
+require 'lib/evo/io/parser/msa_parser'
+require 'lib/evo/msa/msa'
+
+require 'iconv'
+
+module Evoruby
+
+    class GeneralMsaParser < MsaParser
+
+        # Intentionally empty. It replaces the initializer of the MsaParser
+        # base class, which raises TypeError, so that this parser can be
+        # instantiated.
+        def initialize
+        end
+
+        # Parses an alignment file with "name residues" lines, optionally
+        # split into several blocks (interleaved), into an Msa.
+        #
+        # Blocks are separated by lines for which can_ignore? is true. A
+        # program name line (see is_program_name_line?) is skipped, but only
+        # if it is the very first line of the file.
+        #
+        # path:: the alignment file to read
+        #
+        # Returns the Msa that was built.
+        # Raises IOError for a line that cannot be interpreted.
+        # NOTE(review): Util.check_file_for_readability presumably raises for
+        # an unreadable path - confirm against Util.
+        def parse( path )
+            Util.check_file_for_readability( path )
+            # index of the current block; -1 until the first sequence line
+            block                       = -1
+            # 0-based position of the current sequence line within its block
+            current_seq_index_per_block = -1
+            # name seen on the most recent named line (nil after an unnamed one)
+            current_name                = nil
+            # starts out true so that the first sequence line opens block 0
+            saw_ignorable = true
+            is_first      = true
+            msa = Msa.new
+            # NOTE(review): UTF-8 to UTF-8 with //IGNORE presumably just drops
+            # invalid byte sequences - confirm.
+            ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' )
+            File.open( path ) do | file |
+                while line = file.gets
+                    line = ic.iconv( line )
+                    if can_ignore?( line )
+                        saw_ignorable = true
+                    elsif ( is_first && is_program_name_line?( line ) ) 
+                        # program banner: skipped, and not counted as a block separator
+                    elsif( line =~ /^\S+\s+.+\s*$/ || line =~ /^\s+.+\s*$/ || line =~ /^\S+\s*$/ )
+                        # the first sequence line after ignorable lines opens a new block
+                        if ( saw_ignorable )
+                            block += 1
+                            current_seq_index_per_block = -1
+                            saw_ignorable = false
+                        end
+                        current_seq_index_per_block += 1
+                        if ( line =~ /^(\S+)\s+(.+?)\s*$/ )
+                            # "name residues": whitespace inside the residues becomes '.'
+                            name = $1
+                            seq  = $2.gsub( /\s/, '.' )
+                            # NOTE(review): the meaning of the two boolean flags of
+                            # find_by_name is not visible here - confirm in Msa.
+                            a = msa.find_by_name( name, false, false )
+                            if ( a.length < 1 )
+                                # first occurrence of this name: new sequence
+                                msa.add( name, seq )
+                            elsif ( a.length == 1 )
+                                # name seen in an earlier block: extend that sequence
+                                msa.get_sequence( a[ 0 ] ).append!( seq )
+                            else
+                                error_msg = "Unexpected error at line: " + line
+                                raise IOError, error_msg
+                            end
+                            current_name = name
+                        elsif ( line =~ /^\s+(.+?)\s*$/ )
+                            # line starting with whitespace: continuation of the
+                            # most recently named sequence
+                            # NOTE(review): current_name is nil here if the previous
+                            # sequence line was an unnamed one - confirm that
+                            # find_by_name copes with nil.
+                            seq = $1.gsub( /\s/, '.' )
+                            a = msa.find_by_name( current_name, false, false )
+                            if ( a.length != 1  )
+                                error_msg = "Unexpected error at line: " + line
+                                raise IOError, error_msg
+                            else
+                                msa.get_sequence( a[ 0 ] ).append!( seq )
+                            end
+
+                        elsif ( line =~ /^(\S+)\s*$/ )
+                            # a single token: residues without a name, assigned to the
+                            # sequence at the same position within the block; only
+                            # possible once the first block has supplied the names
+                            seq = $1
+                            if block == 0
+                                error_msg = "First block cannot contain unnamed sequences"
+                                raise IOError, error_msg
+                            else
+                                msa.get_sequence( current_seq_index_per_block ).append!( seq )
+                            end
+                            current_name = nil
+                        end
+                    else
+                        error_msg = "Unexpected line: " + line
+                        raise IOError, error_msg
+                    end
+                    # cleared after every line, including ignorable ones
+                    if ( is_first )
+                        is_first = false
+                    end
+                end
+            end
+            return msa
+        end # def parse( path )
+
+        private
+
+        # Tells whether a line carries no sequence data: lines without any
+        # letter or any of - ? * _ . as well as lines starting with
+        # whitespace followed by * . or :, and lines starting (after optional
+        # whitespace) with #, %, // or !!.
+        # The result is only meaningful as a truth value.
+        def can_ignore?( line )
+            no_payload = line !~ /[A-Za-z\-?\*_\.]/
+            return no_payload if no_payload
+            [ /^\s+[*\.:]/, /^\s*#/, /^\s*%/, /^\s*\/\// ].each do | pattern |
+                hit = line =~ pattern
+                return hit if hit
+            end
+            line =~ /^\s*!!/
+        end
+        
+        # Tells whether a line starts with the banner of one of the known
+        # alignment programs ("CLUSTAL ", "MUSCLE (", "PROBCONS ").
+        # The result is only meaningful as a truth value.
+        def is_program_name_line?( line )
+            [ /^CLUSTAL\s/, /^MUSCLE\s\(/ ].each do | banner |
+                at = line =~ banner
+                return at if at
+            end
+            line =~ /^PROBCONS\s/
+        end
+    end # class GeneralMsaParser
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmsearch_domain_extractor.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmsearch_domain_extractor.rb
new file mode 100644 (file)
index 0000000..d228024
--- /dev/null
@@ -0,0 +1,298 @@
+#
+# = lib/evo/io/parser/hmmsearch_domain_extractor.rb - HmmsearchDomainExtractor class
+#
+# Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: hmmsearch_domain_extractor.rb,v 1.24 2009/11/25 06:30:24 cmzmasek Exp $
+
+
+require 'lib/evo/util/constants'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/io/msa_io'
+require 'lib/evo/io/writer/fasta_writer'
+require 'lib/evo/io/parser/fasta_parser'
+
+
+module Evoruby
+
+    class HmmsearchDomainExtractor
+
+        # Number of trailing characters that extract_domain cuts off a
+        # sequence name when its trim_name argument is set.
+        TRIM_BY = 2
+
+        # No instance state is set up.
+        def initialize
+        end
+
+        # Extracts the domains listed in a tabular hmmsearch output from the
+        # sequences of a fasta file. Domains meeting the thresholds are cut out
+        # and written to outfile; the full-length sequences having at least one
+        # passing, respectively failing, domain are written to
+        # passed_seqs_outfile, respectively failed_seqs_outfile. Failing
+        # domains and summary numbers are reported on stdout and in log.
+        #
+        # hmmsearch_output::    table with one domain per row and the 23
+        #                       whitespace separated columns named in the
+        #                       comment inside the loop (NOTE(review): this
+        #                       looks like hmmsearch --domtblout - confirm)
+        # fasta_sequence_file:: fasta file holding the searched sequences
+        # outfile::             receives the extracted domain sequences
+        # passed_seqs_outfile:: receives sequences with a passing domain
+        # failed_seqs_outfile:: receives sequences with a failing domain
+        # e_value_threshold::   maximal i-Evalue of a domain; not applied if negative
+        # length_threshold::    minimal envelope length of a domain; not applied if <= 0
+        # add_position, add_domain_number, add_domain_number_as_digit,
+        # add_domain_number_as_letter, trim_name::
+        #                       handed through unchanged to extract_domain
+        # log::                 receives all messages via <<
+        #
+        # Returns the number of passing domains.
+        #
+        # Raises IOError if no fasta sequences are found, if a data row of the
+        # table does not have the expected columns, or if an output file cannot
+        # be written; StandardError if no domain passed; ArgumentError and
+        # StandardError as raised by extract_domain and add_sequence.
+        def parse( hmmsearch_output,
+                fasta_sequence_file,
+                outfile,
+                passed_seqs_outfile,
+                failed_seqs_outfile,
+                e_value_threshold,
+                length_threshold,
+                add_position,
+                add_domain_number,
+                add_domain_number_as_digit,
+                add_domain_number_as_letter,
+                trim_name,
+                log )
+
+            Util.check_file_for_readability( hmmsearch_output )
+            Util.check_file_for_readability( fasta_sequence_file )
+            Util.check_file_for_writability( outfile )
+            Util.check_file_for_writability( passed_seqs_outfile )
+            Util.check_file_for_writability( failed_seqs_outfile )
+
+            factory = MsaFactory.new()
+            in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
+
+            if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
+                error_msg = "could not find fasta sequences in " + fasta_sequence_file
+                raise IOError, error_msg
+            end
+
+            out_msa = Msa.new
+            failed_seqs = Msa.new
+            passed_seqs = Msa.new
+
+            ld = Constants::LINE_DELIMITER
+
+            domain_pass_counter     = 0
+            domain_fail_counter     = 0
+            proteins_with_passing_domains = 0
+            proteins_with_failing_domains = 0
+            max_domain_copy_number_per_protein = -1
+            max_domain_copy_number_sequence    = ''
+            failed_species_counts         = Hash.new
+            passed_species_counts         = Hash.new
+
+            File.open( hmmsearch_output ) do | file |
+                while line = file.gets
+                    # only rows starting with a non-whitespace token are data rows
+                    next if is_ignorable?( line ) || line !~ /^\S+\s+/
+
+                    #         tn      acc     tlen    query   acc     qlen    Evalue  score   bias    #       of      c-E     i-E     score   bias    hf      ht      af      at      ef      et      acc     desc
+                    #         1       2       3       4       5       6       7       8       9       10      11      12      13      14      15      16      17      18      19      20      21      22      23
+                    unless line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/
+                        # without this check all captures would be nil and the row
+                        # would be processed as a domain of a nameless sequence
+                        error_msg = "unexpected format in hmmsearch output: " + line
+                        raise IOError, error_msg
+                    end
+                    # read the captures right away, before another match replaces them
+                    sequence   = $1
+                    number     = $10.to_i
+                    out_of     = $11.to_i
+                    i_e_value  = $13.to_f
+                    env_from   = $20.to_i
+                    env_to     = $21.to_i
+                    env_length = env_to - env_from + 1
+
+                    if ( number > max_domain_copy_number_per_protein )
+                        max_domain_copy_number_sequence    = sequence
+                        max_domain_copy_number_per_protein = number
+                    end
+                    if ( ( ( e_value_threshold.to_f < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
+                             ( ( length_threshold.to_f <= 0 )   || env_length >= length_threshold.to_f )  )
+                        HmmsearchDomainExtractor.extract_domain( sequence,
+                            number,
+                            out_of,
+                            env_from,
+                            env_to,
+                            in_msa,
+                            out_msa,
+                            add_position,
+                            add_domain_number,
+                            add_domain_number_as_digit,
+                            add_domain_number_as_letter,
+                            trim_name )
+                        domain_pass_counter += 1
+                        count_species( sequence, passed_species_counts )
+                        # a protein is counted once, however many of its domains pass
+                        if !passed_seqs.has?( sequence, true, false )
+                            HmmsearchDomainExtractor.add_sequence( sequence, in_msa, passed_seqs )
+                            proteins_with_passing_domains += 1
+                        end
+                    else
+                        print( domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" )
+                        log << domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)"
+                        if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) )
+                            print( " iE=" + i_e_value.to_s )
+                            log << " iE=" + i_e_value.to_s
+                        end
+                        if ( ( length_threshold.to_f > 0 ) && env_length < length_threshold.to_f )
+                            print( " l=" + env_length.to_s )
+                            log << " l=" + env_length.to_s
+                        end
+                        print( ld )
+                        log << ld
+                        domain_fail_counter  += 1
+                        count_species( sequence, failed_species_counts )
+                        # a protein is counted once, however many of its domains fail
+                        if !failed_seqs.has?( sequence, true, false )
+                            HmmsearchDomainExtractor.add_sequence( sequence, in_msa, failed_seqs )
+                            proteins_with_failing_domains += 1
+                        end
+                    end
+                end
+            end
+
+            if domain_pass_counter < 1
+                error_msg = "no domain sequences were extracted"
+                raise StandardError, error_msg
+            end
+
+            log << ld
+            puts( "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s )
+            log << "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s
+            log << ld
+
+            if ( max_domain_copy_number_per_protein > 1 )
+                puts( "First protein with this copy number: " + max_domain_copy_number_sequence )
+                log << "First protein with this copy number: " + max_domain_copy_number_sequence
+                log << ld
+            end
+
+            io = MsaIO.new()
+            w = FastaWriter.new()
+            w.set_line_width( 60 )
+            w.clean( true )
+
+            [ [ out_msa, outfile ],
+              [ passed_seqs, passed_seqs_outfile ],
+              [ failed_seqs, failed_seqs_outfile ] ].each do | msa, path |
+                begin
+                    io.write_to_file( msa, path, w )
+                rescue StandardError
+                    # StandardError only: interrupts and exit requests must not be
+                    # turned into an IOError
+                    error_msg = "could not write to \"" + path + "\""
+                    raise IOError, error_msg
+                end
+            end
+
+            log << ld
+            log << "passing domains              : " + domain_pass_counter.to_s + ld
+            log << "failing domains              : " + domain_fail_counter.to_s + ld
+            log << "proteins with passing domains: " + proteins_with_passing_domains.to_s + ld
+            log << "proteins with failing domains: " + proteins_with_failing_domains.to_s + ld
+            log << ld
+            log << 'passing domains counts per species: ' << ld
+            passed_species_counts.each_pair { | species, count | log << "#{species}: #{count}" << ld }
+            log << ld
+            log << 'failing domains counts per species: ' << ld
+            failed_species_counts.each_pair { | species, count | log << "#{species}: #{count}" << ld }
+            log << ld
+            return domain_pass_counter
+
+        end # parse
+
+        private
+
+
+        # Looks up sequence_name in in_msa and adds the sequence found to
+        # add_to_msa.
+        # Raises StandardError if the lookup yields no or more than one
+        # sequence.
+        def HmmsearchDomainExtractor.add_sequence( sequence_name, in_msa, add_to_msa )
+            matches = in_msa.find_by_name( sequence_name, true, false )
+            case
+            when matches.length < 1
+                raise StandardError, "sequence \"" + sequence_name + "\" not found in sequence file"
+            when matches.length > 1
+                raise StandardError, "sequence \"" + sequence_name + "\" not unique in sequence file"
+            end
+            add_to_msa.add_sequence( in_msa.get_sequence( matches[ 0 ] ) )
+        end
+
+        # Cuts one domain out of a sequence of in_msa, names it and adds it to
+        # out_msa.
+        #
+        # sequence::     name under which the full-length sequence is looked
+        #                up in in_msa
+        # number::       1-based number of this domain within the sequence
+        # out_of::       total number of domains of the sequence
+        # seq_from::     1-based (hmmsearch) start position of the domain
+        # seq_to::       1-based (hmmsearch) end position of the domain
+        # in_msa::       source of the full-length sequences
+        # out_msa::      receives the domain sequence
+        # add_position:: if set, "_<seq_from>-<seq_to>" is appended to the name
+        # trim_name::    if set, the last TRIM_BY characters of the name are
+        #                cut off (this happens after add_position)
+        # add_domain_number_as_digit, add_domain_number_as_letter,
+        # add_domain_number::
+        #                only used if out_of is not 1; the first one that is
+        #                set, in this order, appends the domain number as
+        #                digits, as a letter (1 = "a" ... 26 = "z"), or as
+        #                "~<number>-<out_of>"
+        #
+        # Raises ArgumentError for impossible number/out_of or seq_from/seq_to
+        # combinations; StandardError if the sequence lookup does not yield
+        # exactly one sequence, if a letter is requested for a domain number
+        # above 26, or if the resulting name is longer than 10 characters.
+        def HmmsearchDomainExtractor.extract_domain( sequence,
+                number,
+                out_of,
+                seq_from,
+                seq_to,
+                in_msa,
+                out_msa,
+                add_position,
+                add_domain_number,
+                add_domain_number_as_digit,
+                add_domain_number_as_letter,
+                trim_name )
+            if ( number < 1 || out_of < 1 || number > out_of )
+                error_msg = "impossible: number=" + number.to_s + ", out of=" + out_of.to_s
+                raise ArgumentError, error_msg
+            end
+            if ( seq_from < 1 || seq_to < 1 || seq_from >= seq_to )
+                error_msg = "impossible: seq-f=" + seq_from.to_s + ", seq-t=" + seq_to.to_s
+                raise ArgumentError, error_msg
+            end
+            seqs = in_msa.find_by_name( sequence, true, false )
+            if ( seqs.length < 1 )
+                error_msg = "sequence \"" + sequence + "\" not found in sequence file"
+                raise StandardError, error_msg
+            end
+            if ( seqs.length > 1 )
+                error_msg = "sequence \"" + sequence + "\" not unique in sequence file"
+                raise StandardError, error_msg
+            end
+            # hmmsearch is 1-based, whereas sequences are 0-based in this package.
+            seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
+            if ( add_position )
+                seq.set_name( seq.get_name + "_" + seq_from.to_s + "-" + seq_to.to_s )
+            end
+
+            if ( trim_name )
+                seq.set_name( seq.get_name[ 0, seq.get_name.length - TRIM_BY ] )
+            end
+
+            if ( out_of != 1 )
+                if ( add_domain_number_as_digit )
+                    seq.set_name( seq.get_name + number.to_s )
+                elsif ( add_domain_number_as_letter )
+                    # 26 letters are available: 1 maps to "a" and 26 to "z"
+                    if number > 26
+                        error_msg = 'too many identical domains per sequence, cannot use letters to distinguish them'
+                        raise StandardError, error_msg
+                    end
+                    # 97 is the character code of "a"
+                    seq.set_name( seq.get_name + ( number + 96 ).chr )
+                elsif ( add_domain_number )
+                    seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
+                end
+            end
+
+            if ( seq.get_name.length > 10 )
+                error_msg = "sequence name [" + seq.get_name + "] is longer than 10 characters"
+                raise StandardError, error_msg
+            end
+
+            out_msa.add_sequence( seq )
+        end
+
+        # Increments the counter kept in species_counts_map for the species
+        # that get_species derives from the sequence name; names without a
+        # species part leave the map untouched.
+        def count_species( sequence, species_counts_map )
+            species = get_species( sequence )
+            return nil if species == nil
+            seen_before = species_counts_map.has_key?( species )
+            species_counts_map[ species ] = seen_before ? species_counts_map[ species ] + 1 : 1
+        end
+
+        # Returns what follows the last underscore of sequence_name, or nil if
+        # the name has no underscore with at least one character on each side.
+        def get_species( sequence_name )
+            return nil unless sequence_name =~ /^.+_(.+)$/
+            $1
+        end
+
+        # Tells whether a line of the hmmsearch output holds no data: lines
+        # without any letter, digit or "-", and lines starting with "#".
+        # The result is only meaningful as a truth value.
+        def is_ignorable?( line )
+            return true if line !~ /[A-Za-z0-9-]/
+            line =~/^#/
+        end
+
+    end # class HmmsearchDomainExtractor
+
+end # module Evoruby
+
diff --git a/forester/ruby/evoruby/lib/evo/io/parser/msa_parser.rb b/forester/ruby/evoruby/lib/evo/io/parser/msa_parser.rb
new file mode 100644 (file)
index 0000000..e1d1b56
--- /dev/null
@@ -0,0 +1,22 @@
+#
+# = lib/evo/io/parser/msa_parser - MsaParser class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: msa_parser.rb,v 1.2 2007/06/12 04:51:34 cmzmasek Exp $
+#
+# last modified: 05/16/2007
+
+module Evoruby
+
+    # Abstract base class of the alignment parsers.
+    class MsaParser
+        # Always raises TypeError: this class is not meant to be instantiated.
+        def initialize
+            raise( TypeError, "Cannot instanciate abstract class MsaParser" )
+        end
+
+        # Does nothing and returns nil in this base class.
+        def parse( path ); end
+    end
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/io/parser/ncbi_tseq_parser.rb b/forester/ruby/evoruby/lib/evo/io/parser/ncbi_tseq_parser.rb
new file mode 100644 (file)
index 0000000..994755a
--- /dev/null
@@ -0,0 +1,153 @@
+#
+# = lib/evo/io/parser/ncbi_tseq_parser - NcbiTSeqParser class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: ncbi_tseq_parser.rb,v 1.5 2009/01/07 02:48:20 cmzmasek Exp $
+
+
+require 'lib/evo/io/parser/msa_parser'
+require 'lib/evo/taxonomy/taxonomy'
+require 'lib/evo/msa/msa'
+
+require 'iconv'
+
+module Evoruby
+
+    class NcbiTSeqParser < MsaParser
+
+        # Names of the XML elements that parse recognizes inside a TSeq element.
+        TSEQ_SEQ = "TSeq_sequence"
+        TSEQ_DEFLINE = "TSeq_defline"
+        TSEQ_ORGNAME = "TSeq_orgname"
+        TSEQ_TAXID = "TSeq_taxid"
+        TSEQ_SID = "TSeq_sid"
+        TSEQ_ACCVER = "TSeq_accver"
+        TSEQ_GI = "TSeq_gi"
+        TSEQ_TYPE = "TSeq_seqtype"
+        TSEQ_LENGTH = "TSeq_length"
+
+        # Intentionally empty. It replaces the initializer of the MsaParser
+        # base class so that this parser can be instantiated.
+        def initialize
+        end
+
+
+        #  <TSeqSet>
+        #<TSeq>
+        #  <TSeq_seqtype value="protein"/>
+        #  <TSeq_gi>29341016</TSeq_gi>
+        #  <TSeq_accver>AAO78806.1</TSeq_accver>
+        #  <TSeq_sid>gnl|mbpwusl|BT3701</TSeq_sid>
+        #  <TSeq_taxid>226186</TSeq_taxid>
+        #  <TSeq_orgname>Bacteroides thetaiotaomicron VPI-5482</TSeq_orgname>
+        #  <TSeq_defline>SusD [Bacteroides thetaiotaomicron VPI-5482]</TSeq_defline>
+        #  <TSeq_length>551</TSeq_length>
+        #  <TSeq_sequence>MKTKYIKQLFSAALIAVLSSGVTSCINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQENQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSCIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK</TSeq_sequence>
+        #</TSeq>
+
+        # Reads a NCBI TSeq XML file line by line and returns its sequences.
+        #
+        # Only lines between a <TSeq> and a </TSeq> line are interpreted, and
+        # each of them has to hold one complete element. Blank lines and lines
+        # outside of a TSeq element are skipped. On </TSeq> a Sequence is
+        # created from the values collected for that element:
+        # - the identifier is the gi if present, else the sid, else the accver;
+        # - the symbol is the next of sid/accver (in this order) that was not
+        #   used as identifier;
+        # - a Taxonomy is attached if a taxid or an organism name was given.
+        #
+        # path:: the file to read
+        #
+        # Returns an Msa with one Sequence per TSeq element.
+        # Raises IOError, naming the 1-based line number, for a line inside a
+        # TSeq element that is not recognized.
+        def parse( path )
+            Util.check_file_for_readability( path )
+            seqs = Msa.new
+
+            in_seq        = false
+            gi = nil
+            accver = nil
+            sid = nil
+            taxid = nil
+            orgname = nil
+            defline = nil
+            seq_str = nil
+            # incremented before a line is examined, so it has to start at 0
+            # for the first line to be reported as line 1
+            line_counter = 0
+            ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' )
+            File.open( path ) do | file |
+                while line = file.gets
+                    line = ic.iconv( line )
+                    line_counter += 1
+                    if can_ignore?( line )
+                        # blank line
+                    elsif line =~ /^\s*<TSeq>/
+                        in_seq = true
+                    elsif in_seq
+                        if line =~ /^\s*<\/TSeq>/
+                            in_seq = false
+                            taxonomy = nil
+                            if taxid != nil || orgname != nil
+                                id_source = nil
+                                if taxid != nil
+                                    id_source = "ncbi"
+                                end
+                                taxonomy = Taxonomy.new( orgname, taxid , id_source )
+                            end
+                            id = nil
+                            id_source = nil
+                            symbol = nil
+                            if gi != nil
+                                id = gi
+                                id_source = "gi"
+                                if sid != nil
+                                    symbol = sid
+                                elsif accver != nil
+                                    symbol = accver
+                                end
+                            elsif sid != nil
+                                id = sid
+                                id_source = "ncbi"
+                                if accver != nil
+                                    symbol = accver
+                                end
+                            elsif accver != nil
+                                id = accver
+                                id_source = "ncbi"
+                            end
+
+                            sequence = Sequence.new( defline,
+                                seq_str,
+                                id,
+                                id_source,
+                                taxonomy,
+                                symbol )
+
+                            seqs.add_sequence( sequence )
+                            # forget everything before the next TSeq element
+                            gi = nil
+                            accver = nil
+                            sid = nil
+                            taxid = nil
+                            orgname = nil
+                            defline = nil
+                            seq_str = nil
+                        elsif line =~ /^\s*<#{TSEQ_GI}>(\d+)<\/#{TSEQ_GI}>/
+                            gi = $1
+                        elsif line =~ /^\s*<#{TSEQ_ACCVER}>(.+)<\/#{TSEQ_ACCVER}>/
+                            accver = $1
+                        elsif line =~ /^\s*<#{TSEQ_SID}>(.+)<\/#{TSEQ_SID}>/
+                            sid = $1
+                        elsif line =~ /^\s*<#{TSEQ_TAXID}>(\d+)<\/#{TSEQ_TAXID}>/
+                            taxid = $1
+                        elsif line =~ /^\s*<#{TSEQ_ORGNAME}>(.+)<\/#{TSEQ_ORGNAME}>/
+                            orgname = $1
+                        elsif line =~ /^\s*<#{TSEQ_DEFLINE}>(.+)<\/#{TSEQ_DEFLINE}>/
+                            defline = $1
+                        elsif line =~ /^\s*<#{TSEQ_SEQ}>(.+)<\/#{TSEQ_SEQ}>/
+                            seq_str = $1
+                        elsif line =~ /^\s*<#{TSEQ_TYPE}/
+                            # sequence type: recognized, but not used
+                        elsif line =~ /^\s*<#{TSEQ_LENGTH}/
+                            # sequence length: recognized, but not used
+                        else
+                            error_msg = "unexpected line format at line #{line_counter}: " + line
+                            raise IOError, error_msg
+                        end
+                    end
+                end
+            end
+            return seqs
+        end
+
+        private
+
+        # True for lines that consist of whitespace only.
+        def can_ignore?( line )
+            has_content = line =~ /\S/
+            !has_content
+        end
+
+    end # class NcbiTSeqParser
+
+end # module Evoruby
\ No newline at end of file
diff --git a/forester/ruby/evoruby/lib/evo/io/parser/sp_taxonomy_parser.rb b/forester/ruby/evoruby/lib/evo/io/parser/sp_taxonomy_parser.rb
new file mode 100644 (file)
index 0000000..cdc8af8
--- /dev/null
@@ -0,0 +1,42 @@
+#
+# = lib/evo/io/parser/sp_taxonomy_parser - SpTaxonomyParser class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: sp_taxonomy_parser.rb,v 1.2 2008/12/31 03:21:45 cmzmasek Exp $
+
+
+module Evoruby
+
+    require 'lib/evo/taxonomy/sp_taxonomy'
+
+    # Collects SpTaxonomy objects from a text file with lines of the form
+    # "<code> <letter> <id>: N=<scientific name>".
+    # NOTE(review): this looks like the UniProt species list format - confirm.
+    class SpTaxonomyParser
+
+        START_OF_COMMENT_LINE_CHAR = "#"
+
+        # Reads the file at path and returns an Array with one SpTaxonomy
+        # (code, id, scientific name) for every line containing a code of 3 to
+        # 5 capital letters or digits, a single capital letter, a numeric id
+        # followed by ":" and "N=<scientific name>". All other lines are
+        # skipped silently.
+        #
+        # NOTE(review): the original comment on this method said "raises
+        # ArgumentError"; that would have to come from
+        # Util.check_file_for_readability - confirm.
+        def SpTaxonomyParser.parse( path )
+            Util.check_file_for_readability( path )
+            sp_taxonomies = Array.new
+            File.open( path ) do | file |
+                while line = file.gets
+                    next if Util.is_string_empty?( line )
+                    if line =~ /([A-Z0-9]{3,5})\s+[A-Z]\s+(\d+):\s+N=(.+)/
+                        sp_taxonomies.push( SpTaxonomy.new( $1, $2, $3 ) )
+                    end
+                end
+            end
+            sp_taxonomies
+        end
+    end # class SpTaxonomyParser
+
+end # module Evoruby
\ No newline at end of file
diff --git a/forester/ruby/evoruby/lib/evo/io/writer/fasta_writer.rb b/forester/ruby/evoruby/lib/evo/io/writer/fasta_writer.rb
new file mode 100644 (file)
index 0000000..26f7461
--- /dev/null
@@ -0,0 +1,86 @@
+#
+# = lib/evo/io/writer/fasta_writer.rb - FastaWriter class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: fasta_writer.rb,v 1.6 2008/09/12 23:52:11 cmzmasek Exp $
+#
+# last modified: 05/16/2007
+
+require 'lib/evo/io/writer/msa_writer'
+
+module Evoruby
+
+    # Writes the sequences of an Msa in fasta format.
+    class FastaWriter < MsaWriter
+
+        # number of residues per line
+        LINE_WIDTH_DEFAULT      = 60
+        # 0 means that names are written as they are
+        MAX_NAME_LENGTH_DEFAULT = 0
+
+        def initialize()
+            @line_width       = LINE_WIDTH_DEFAULT
+            @max_name_length  = MAX_NAME_LENGTH_DEFAULT
+            @remove_gap_chars = false
+            @clean            = false
+        end
+
+
+        # Sets the number of residues written per line; values below 1 select
+        # LINE_WIDTH_DEFAULT.
+        def set_line_width( line_width = LINE_WIDTH_DEFAULT )
+            if ( line_width < 1 )
+                line_width = LINE_WIDTH_DEFAULT
+            end
+            @line_width = line_width
+        end
+
+        # Sets the length handed to Util.normalize_seq_name for every name;
+        # values below 1 select MAX_NAME_LENGTH_DEFAULT, which leaves names
+        # unchanged.
+        def set_max_name_length( length = MAX_NAME_LENGTH_DEFAULT )
+            if ( length < 1 )
+                length = MAX_NAME_LENGTH_DEFAULT
+            end
+            @max_name_length = length
+        end
+
+        # Switches the omission of gap characters on or off.
+        def remove_gap_chars( remove_gap_chars = true )
+            @remove_gap_chars = remove_gap_chars
+        end
+
+        # Switches passing every residue through Util.clean_seq_str on or off.
+        def clean( clean = true )
+            @clean = clean
+        end
+
+        # Appends all sequences of msa to the file at path: a ">name" line
+        # followed by the residues in lines of @line_width characters.
+        # Returns nil.
+        def write( msa, path )
+            Util.check_file_for_writability( path )
+            # block form, so that the file is closed even if writing raises
+            File.open( path, "a" ) do | f |
+                ( 0 ... msa.get_number_of_seqs() ).each do | i |
+                    seq_obj = msa.get_sequence( i )
+                    name = seq_obj.get_name()
+                    f.print( ">" )
+                    if ( @max_name_length != MAX_NAME_LENGTH_DEFAULT )
+                        name = Util.normalize_seq_name( name, @max_name_length )
+                    end
+                    f.print( name )
+                    # number of residues written so far for this sequence
+                    counter = 0
+                    ( 0 ... seq_obj.get_length() ).each do | j |
+                        next if @remove_gap_chars && Util.is_aa_gap_character?( seq_obj.get_character_code( j ) )
+                        char = seq_obj.get_residue( j )
+                        if ( @clean )
+                            char = Util.clean_seq_str( char )
+                            # residue removed by cleaning: nothing to write
+                            next if char.length < 1
+                        end
+                        # the line break is written before the first residue of
+                        # a line, which also ends the name line
+                        if counter % @line_width == 0
+                            f.print( Evoruby::Constants::LINE_DELIMITER )
+                        end
+                        f.print( char )
+                        counter += 1
+                    end
+                    f.print( Evoruby::Constants::LINE_DELIMITER )
+                end
+            end
+            nil
+        end
+
+    end # class FastaWriter
+
+end # module Evoruby
+
diff --git a/forester/ruby/evoruby/lib/evo/io/writer/msa_writer.rb b/forester/ruby/evoruby/lib/evo/io/writer/msa_writer.rb
new file mode 100644 (file)
index 0000000..36eec15
--- /dev/null
@@ -0,0 +1,30 @@
+#
+# = lib/evo/io/writer/msa_writer.rb - MsaWriter class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: msa_writer.rb,v 1.2 2007/06/12 04:51:35 cmzmasek Exp $
+#
+# last modified: 05/16/2007
+
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+
+module Evoruby
+
+    # Abstract base class of the alignment writers.
+    class MsaWriter
+
+        # Always raises TypeError: this class is not meant to be instantiated.
+        def initialize
+            raise( TypeError, "Cannot instanciate abstract class MsaWriter" )
+        end
+
+        # Does nothing and returns nil in this base class.
+        def set_max_name_length( length ); end
+
+        # Does nothing and returns nil in this base class.
+        def write( msa, path ); end
+
+    end # class MsaWriter
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/io/writer/nexus_writer.rb b/forester/ruby/evoruby/lib/evo/io/writer/nexus_writer.rb
new file mode 100644 (file)
index 0000000..17a0c77
--- /dev/null
@@ -0,0 +1,82 @@
+#
+# = lib/evo/io/writer/nexus_writer.rb - NexusWriter class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: nexus_writer.rb,v 1.4 2009/11/04 01:50:59 cmzmasek Exp $
+#
+# last modified: 05/16/2007
+
+require 'lib/evo/io/writer/msa_writer'
+
+module Evoruby
+
+    # Writes an aligned Msa as a nexus data block.
+    class NexusWriter < MsaWriter
+
+        MAX_NAME_LENGTH_DEFAULT = 10
+
+        def initialize()
+            @max_name_length = MAX_NAME_LENGTH_DEFAULT
+            @clean           = false
+        end
+
+        # Sets the length handed to Util.normalize_seq_name for every name;
+        # values below 1 select MAX_NAME_LENGTH_DEFAULT.
+        def set_max_name_length( length = MAX_NAME_LENGTH_DEFAULT )
+            if length < 1
+                length = MAX_NAME_LENGTH_DEFAULT
+            end
+            @max_name_length = length
+        end
+
+        # Switches passing every sequence through Util.clean_seq_str on or off.
+        def clean( clean = true )
+            @clean = clean
+        end
+
+        # Appends msa as a non-interleaved nexus "Data" block to the file at
+        # path. Trailing whitespace is removed from the names and remaining
+        # whitespace is replaced by "_" before they are normalized to the
+        # maximal name length.
+        # Returns nil. Raises StandardError if msa is not aligned.
+        def write( msa, path )
+            if ( !msa.is_aligned() )
+                error_msg = "attempt to write unaligned msa in nexus format"
+                raise StandardError, error_msg, caller
+            end
+
+            Util.check_file_for_writability( path )
+
+            ld = Evoruby::Constants::LINE_DELIMITER
+
+            # block form, so that the file is closed even if writing raises
+            File.open( path, "a" ) do | f |
+                f.print( "Begin Data;" )
+                f.print( ld )
+                f.print( "   Dimensions NTax=" )
+                f.print( msa.get_number_of_seqs().to_s() )
+                f.print( " NChar=" )
+                f.print( msa.get_length().to_s() )
+                f.print( ";" )
+                f.print( ld )
+                f.print( "   Format DataType=Protein Interleave=No gap=-;" )
+                f.print( ld )
+                f.print( "   Matrix" )
+                f.print( ld )
+                ( 0 ... msa.get_number_of_seqs() ).each do | i |
+                    seq_obj = msa.get_sequence( i )
+                    name = seq_obj.get_name()
+                    seq  = seq_obj.get_sequence_as_string()
+                    name = name.gsub( /\s+$/, '')
+                    name = name.gsub( /\s+/, '_')
+                    name = Util.normalize_seq_name( name, @max_name_length )
+                    f.print( "      " )
+                    f.print( name )
+                    f.print( " " )
+                    if ( @clean )
+                        seq = Util.clean_seq_str( seq )
+                    end
+                    f.print( seq )
+                    f.print( ld )
+                end
+                f.print( "   ;" )
+                f.print( ld )
+                f.print( "End;" )
+                f.print( ld )
+            end
+            nil
+        end
+
+    end # class NexusWriter
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/io/writer/phylip_sequential_writer.rb b/forester/ruby/evoruby/lib/evo/io/writer/phylip_sequential_writer.rb
new file mode 100644 (file)
index 0000000..15f4c5c
--- /dev/null
@@ -0,0 +1,70 @@
+#
+# = lib/evo/io/writer/phylip_sequential_writer.rb - PhylipSequentialWriter class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: phylip_sequential_writer.rb,v 1.4 2008/09/03 00:31:38 cmzmasek Exp $
+#
+# last modified: 05/16/2007
+
+require 'lib/evo/io/writer/msa_writer'
+
+module Evoruby
+
+    class PhylipSequentialWriter < MsaWriter
+
+        MAX_NAME_LENGTH_DEFAULT = 10
+
+        def initialize()
+            @max_name_length = MAX_NAME_LENGTH_DEFAULT
+            @clean           = false
+        end
+
+        def set_max_name_length( length = MAX_NAME_LENGTH_DEFAULT )
+            if length < 1
+                length = MAX_NAME_LENGTH_DEFAULT
+            end
+            @max_name_length = length
+        end
+
+        def clean( clean = true )
+            @clean = clean
+        end
+
+        def write( msa, path )
+            if ( !msa.is_aligned() )
+                error_msg = "attempt to write unaligned msa in phylip sequential format"
+                raise StandardError, error_msg, caller
+            end
+
+
+            Util.check_file_for_writability( path )
+
+            f = File.open( path, "a" )
+
+            f.print( msa.get_number_of_seqs().to_s() )
+            f.print( " " )
+            f.print( msa.get_length().to_s() )
+            f.print( Evoruby::Constants::LINE_DELIMITER )
+            for i in 0 ... msa.get_number_of_seqs()
+                seq_obj = msa.get_sequence( i )
+                name = seq_obj.get_name()
+                seq  = seq_obj.get_sequence_as_string()
+                name = name.gsub( /\s+$/, '')
+                name = name.gsub( /\s+/, '_')
+                name = Util.normalize_seq_name( name, @max_name_length )
+                f.print( name )
+                f.print( " " )
+                if ( @clean )
+                    seq = Util.clean_seq_str( seq )
+                end
+                f.print( seq )
+                f.print( Evoruby::Constants::LINE_DELIMITER )
+            end
+            f.close()
+        end
+
+    end # class PhylipSequentialWriter
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/msa/msa.rb b/forester/ruby/evoruby/lib/evo/msa/msa.rb
new file mode 100644 (file)
index 0000000..2924646
--- /dev/null
@@ -0,0 +1,513 @@
+#
+# = lib/evo/msa/msa.rb - Msa class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: msa.rb,v 1.11 2009/01/03 00:42:08 cmzmasek Exp $
+#
+
+
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/sequence/sequence'
+
+module Evoruby
+
+    class Msa
+
+        def initialize()
+            @sequences = Array.new()
+        end
+
+
+        def add_sequence( sequence )
+            @sequences.push( sequence )
+        end
+
+        def add( name, molecular_sequence_str )
+            add_sequence( Sequence.new( name, molecular_sequence_str ) )
+        end
+
+        def get_sequence( index )
+            if ( index < 0 || index > get_number_of_seqs() - 1 )
+                error_msg = "attempt to get sequence " <<
+                 index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
+                 " sequences"
+                raise ArgumentError, error_msg
+            end
+            return @sequences[ index ]
+        end
+
+        def remove_sequence!( index )
+            if ( index < 0 || index > get_number_of_seqs() - 1 )
+                error_msg = "attempt to remove sequence " <<
+                 index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
+                 " sequences"
+                raise ArgumentError, error_msg
+            end
+            @sequences.delete_at( index )
+        end
+
+        def is_aligned()
+            if ( get_number_of_seqs < 1 )
+                return false
+            else
+                l = @sequences[ 0 ].get_length()
+                for i in 0 ... get_number_of_seqs()
+                    if ( get_sequence( i ).get_length() != l )
+                        return false
+                    end
+                end
+            end
+            return true
+        end
+
+        def find_by_name( name, case_sensitive, partial_match )
+            indices = Array.new()
+            for i in 0 ... get_number_of_seqs()
+                current_name = get_sequence( i ).get_name()
+                if !case_sensitive
+                    current_name = current_name.downcase
+                    name = name.downcase
+                end
+                if current_name == name ||
+                     ( partial_match && current_name.include?( name ) )
+                    indices.push( i )
+                end
+            end
+            indices
+        end
+
+        def find_by_name_start( name, case_sensitive )
+            indices = Array.new()
+            for i in 0 ... get_number_of_seqs()
+                get_sequence( i ).get_name() =~ /^\s*(\S+)/
+                current_name = $1
+                if !case_sensitive
+                    current_name = current_name.downcase
+                    name = name.downcase
+                end
+                if  ( current_name == name )
+                    indices.push( i )
+                end
+            end
+            indices
+        end
+
+        def has?( name, case_sensitive = true, partial_match = false )
+            for i in 0 ... get_number_of_seqs()
+                current_name = get_sequence( i ).get_name()
+                if !case_sensitive
+                    current_name = current_name.downcase
+                    name = name.downcase
+                end
+                if current_name == name ||
+                     ( partial_match && current_name.include?( name ) )
+                    return true
+                end
+            end
+            false
+        end
+
+        # throws ArgumentError
+        def get_by_name( name, case_sensitive = true, partial_match = false )
+            indices = find_by_name( name, case_sensitive, partial_match )
+            if ( indices.length > 1 )
+                error_msg = "\"" + name + "\" not unique"
+                raise ArgumentError, error_msg
+            elsif ( indices.length < 1 )
+                error_msg = "\"" + name + "\" not found"
+                raise ArgumentError, error_msg
+            end
+            get_sequence( indices[ 0 ] )
+        end
+
+        # throws ArgumentError
+        def get_by_name_start( name, case_sensitive = true )
+            indices = find_by_name_start( name, case_sensitive )
+            if ( indices.length > 1 )
+                error_msg = "\"" + name + "\" not unique"
+                raise ArgumentError, error_msg
+            elsif ( indices.length < 1 )
+                error_msg = "\"" + name + "\" not found"
+                raise ArgumentError, error_msg
+            end
+            get_sequence( indices[ 0 ] )
+        end
+
+
+        def get_sub_alignment( seq_numbers )
+            msa = Msa.new()
+            for i in 0 ... seq_numbers.length()
+                msa.add_sequence( get_sequence( seq_numbers[ i ] ).copy() )
+            end
+            return msa
+        end
+
+        def get_number_of_seqs()
+            @sequences.length
+        end
+
+        def get_length()
+            if ( !is_aligned() )
+                error_msg = "attempt to get length of unaligned msa"
+                raise StandardError, error_msg, caller
+            end
+            if ( get_number_of_seqs() < 1 )
+                -1
+            else
+                @sequences[ 0 ].get_length()
+            end
+        end
+
+        def to_str()
+            s = String.new()
+            for i in 0...get_number_of_seqs()
+                s += @sequences[ i ].to_str + Constants::LINE_DELIMITER
+            end
+            s
+        end
+
+        def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 )
+            if ( !is_aligned() )
+                error_msg = "attempt to get overlap diagram of unaligned msa"
+                raise StandardError, error_msg, caller
+            end
+            for i in 0 ... get_number_of_seqs()
+                io.print( Util.normalize_seq_name( get_sequence( i ).get_name(), max_name_length ) )
+                for j in 0 ... get_number_of_seqs()
+                    if i == j
+                        io.print( " " )
+                    else
+                        if overlap?( i, j, min_overlap )
+                            io.print( "+" )
+                        else
+                            io.print( "-" )
+                        end
+                    end
+                end
+                io.print( Evoruby::Constants::LINE_DELIMITER )
+            end
+        end
+
+        #returns array of Msa with an overlap of min_overlap
+        def split_into_overlapping_msa( min_overlap = 1 )
+            if ( !is_aligned() )
+                error_msg = "attempt to split into overlapping msas of unaligned msa"
+                raise StandardError, error_msg, caller
+            end
+            msas = Array.new()
+            bins = get_overlaps( min_overlap )
+            for i in 0 ... bins.length
+                msas.push( get_sub_alignment( bins[ i ] ) )
+            end
+            msas
+        end
+
+        def overlap?( index_1, index_2, min_overlap = 1 )
+            seq_1 = get_sequence( index_1 )
+            seq_2 = get_sequence( index_2 )
+            overlap_count = 0
+            for i in 0...seq_1.get_length()
+                if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
+                     !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
+                    overlap_count += 1
+                    if overlap_count >= min_overlap
+                        return true
+                    end
+                end
+            end
+            return false
+        end
+
+        def calculate_overlap( index_1, index_2 )
+            seq_1 = get_sequence( index_1 )
+            seq_2 = get_sequence( index_2 )
+            overlap_count = 0
+            for i in 0...seq_1.get_length()
+                if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
+                     !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
+                    overlap_count += 1
+                end
+            end
+            overlap_count
+        end
+
+        def calculate_identities( index_1, index_2 )
+            seq_1 = get_sequence( index_1 )
+            seq_2 = get_sequence( index_2 )
+            identities_count = 0
+            for i in 0...seq_1.get_length()
+                if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
+                     !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) &&
+                     seq_1.get_character_code( i ) != 63 &&
+                     ( seq_1.get_residue( i ).downcase() ==
+                         seq_2.get_residue( i ).downcase() )
+                    identities_count += 1
+                end
+            end
+            identities_count
+        end
+
+        def remove_gap_only_columns!()
+            remove_columns!( get_gap_only_columns() )
+        end
+
+        def remove_gap_columns!()
+            remove_columns!( get_gap_columns() )
+        end
+
+        # removes columns for which seqs with gap / number of sequences > gap_ratio
+        def remove_gap_columns_w_gap_ratio!( gap_ratio )
+            remove_columns!( get_gap_columns_w_gap_ratio( gap_ratio ) )
+        end
+
+
+        def remove_sequences_by_gap_ratio!( gap_ratio )
+            if ( !is_aligned() )
+                error_msg = "attempt to remove sequences by gap ratio on unaligned msa"
+                raise StandardError, error_msg, caller
+            end
+            n = get_number_of_seqs
+            removed = Array.new
+            for s in 0 ... n
+                if ( get_sequence( ( n - 1 ) - s  ).get_gap_ratio() > gap_ratio )
+                    if ( Evoruby::Constants::VERBOSE )
+                        puts( "removed: " + get_sequence( ( n - 1 ) - s  ).get_name )
+                    end
+                    removed << get_sequence( ( n - 1 ) - s  ).get_name
+                    remove_sequence!( ( n - 1 ) - s  )
+                end
+            end
+            removed
+        end
+
+
+        def remove_redundant_sequences!( consider_taxonomy = false, verbose = false )
+            n = get_number_of_seqs
+            removed = Array.new
+            to_be_removed = Set.new
+            for i in 0 ... ( n - 1 )
+                for j in ( i + 1 ) ... n
+                    if !to_be_removed.include?( i ) && !to_be_removed.include?( j )
+                        if  !consider_taxonomy ||
+                             ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) ||
+                                 ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) )
+                            if Util.clean_seq_str( get_sequence( i ).get_sequence_as_string ) ==
+                                 Util.clean_seq_str( get_sequence( j ).get_sequence_as_string )
+                                to_be_removed.add( j )
+                                if verbose
+                                    tax_i = ""
+                                    tax_j = ""
+                                    if get_sequence( i ).get_taxonomy != nil
+                                        tax_i = get_sequence( i ).get_taxonomy.get_name
+                                    end
+                                    if get_sequence( j ).get_taxonomy != nil
+                                        tax_j = get_sequence( j ).get_taxonomy.get_name
+                                    end
+                                    puts get_sequence( i ).get_name + " [#{tax_i}] == " + get_sequence( j ).get_name + " [#{tax_j}]"
+                                end
+                            end
+                        end
+                    end
+                end
+            end
+            to_be_removed_ary = to_be_removed.to_a.sort.reverse
+
+            to_be_removed_ary.each { | index |
+                removed.push( get_sequence( index ).get_name )
+                remove_sequence!( index )
+            }
+            removed
+        end
+
+
+        def remove_sequences_by_non_gap_length!( min_non_gap_length )
+            if ( !is_aligned() )
+                error_msg = "attempt to remove sequences by non gap length on unaligned msa"
+                raise StandardError, error_msg, caller
+            end
+            n = get_number_of_seqs
+            l = get_length
+            removed = Array.new
+            for s in 0 ... n
+                if ( ( l - get_sequence( ( n - 1 ) - s ).get_gap_length ) < min_non_gap_length )
+                    if ( Evoruby::Constants::VERBOSE )
+                        puts( "removed: " + get_sequence( ( n - 1 ) - s  ).get_name )
+                    end
+                    removed << get_sequence( ( n - 1 ) - s  ).get_name
+                    remove_sequence!( ( n - 1 ) - s )
+                end
+            end
+            removed
+        end
+
+        def trim!( first, last )
+            cols = Array.new()
+            for i in 0 ... get_length()
+                if ( i < first || i > last )
+                    cols.push( i )
+                end
+            end
+            remove_columns!( cols )
+        end
+
+        def get_gap_only_columns()
+            if ( !is_aligned() )
+                error_msg = "attempt to get gap only columns of unaligned msa"
+                raise StandardError, error_msg, caller
+            end
+            cols = Array.new()
+            for c in 0 ... get_length
+                nogap_char_found = false
+                for s in 0 ... get_number_of_seqs
+                    unless Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
+                        nogap_char_found = true
+                        break
+                    end
+                end
+                unless nogap_char_found
+                    cols.push( c )
+                end
+            end
+            return cols
+        end
+
+        def get_gap_columns()
+            if ( !is_aligned() )
+                error_msg = "attempt to get gap columns of unaligned msa"
+                raise StandardError, error_msg, caller
+            end
+            cols = Array.new()
+            for c in 0 ... get_length
+                gap_char_found = false
+                for s in 0 ... get_number_of_seqs
+                    if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
+                        gap_char_found = true
+                        break
+                    end
+                end
+                if gap_char_found
+                    cols.push( c )
+                end
+            end
+            return cols
+        end
+
+        # gap_ratio = seqs with gap / number of sequences
+        # returns column indices for which seqs with gap / number of sequences > gap_ratio
+        def get_gap_columns_w_gap_ratio( gap_ratio )
+            if ( !is_aligned() )
+                error_msg = "attempt to get gap columns with gap_ratio of unaligned msa"
+                raise StandardError, error_msg, caller
+            end
+            if ( gap_ratio < 0 || gap_ratio > 1 )
+                error_msg = "gap ratio must be between 0 and 1 inclusive"
+                raise ArgumentError, error_msg, caller
+            end
+            cols = Array.new()
+            for c in 0 ... get_length
+                gap_chars_found = 0
+                for s in 0 ... get_number_of_seqs
+                    if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
+                        gap_chars_found += 1
+                    end
+                end
+                if ( ( gap_chars_found.to_f / get_number_of_seqs ) > gap_ratio )
+                    cols.push( c )
+                end
+            end
+            return cols
+        end
+
+
+        # Split an alignment into n alignemnts of equal size, except last one
+        def split( n, verbose = false )
+            if ( n < 2 || n > get_number_of_seqs )
+                error_msg = "attempt to split into less than two or more than the number of sequences"
+                raise StandardError, error_msg, caller
+            end
+            msas = Array.new()
+            r = get_number_of_seqs % n
+            x = get_number_of_seqs / n
+            for i in 0 ... n
+                msa = Msa.new()
+                s = 0
+
+                if ( ( r > 0 ) && ( i == ( n - 1 ) ) )
+                    y = x + r
+                    if ( verbose )
+                        puts( i.to_s + ": " + y.to_s )
+                    end
+                    for j in 0 ... y
+                        msa.add_sequence( get_sequence( ( i * x ) + j ) )
+                    end
+                else
+                    if ( verbose )
+                        puts( i.to_s + ": " + x.to_s )
+                    end
+                    for j in 0 ... x
+                        msa.add_sequence( get_sequence( ( i * x ) + j ) )
+                    end
+                end
+                msas.push( msa )
+            end
+            msas
+        end
+
+
+        private
+
+        def get_overlaps( min_overlap = 1 )
+            if ( !is_aligned() )
+                error_msg = "attempt to get overlaps of unaligned msa"
+                raise StandardError, error_msg, caller
+            end
+            bins = Array.new()
+            for i in 0 ... get_number_of_seqs()
+                found_bin = false
+                for j in 0 ... bins.length
+                    current_bin = bins[ j ]
+                    # does seq i overlap with all seqs in current_bin?
+                    all_overlap = true
+                    for z in 0 ... current_bin.length
+                        unless overlap?( i, current_bin[ z ], min_overlap )
+                            all_overlap = false
+                            break
+                        end
+                    end
+                    if all_overlap
+                        current_bin.push( i )
+                        found_bin = true
+                    end
+                end
+                unless found_bin
+                    new_bin = Array.new()
+                    new_bin.push( i )
+                    bins.push( new_bin )
+                end
+            end
+            return bins
+        end
+
+        def remove_columns!( cols )
+            if ( !is_aligned() )
+                error_msg = "attempt to remove columns of unaligned msa"
+                raise StandardError, error_msg, caller
+            end
+            cols.reverse!()
+            for c in 0 ... cols.length()
+                col = cols[ c ]
+                for s in 0 ... get_number_of_seqs()
+                    get_sequence( s ).delete_residue!( col )
+                end
+            end
+            return self
+        end
+
+
+    end # class Msa
+
+end # module Evoruby
+
diff --git a/forester/ruby/evoruby/lib/evo/msa/msa_factory.rb b/forester/ruby/evoruby/lib/evo/msa/msa_factory.rb
new file mode 100644 (file)
index 0000000..551161e
--- /dev/null
@@ -0,0 +1,24 @@
+#
+# = lib/evo/msa/msa_factory.rb - MsaFactory class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: msa_factory.rb,v 1.2 2007/06/12 04:51:34 cmzmasek Exp $
+#
+# last modified: 05/16/2007
+
+module Evoruby
+
+    class MsaFactory
+
+        def initialize
+        end
+
+        def create_msa_from_file( path, msa_parser )
+            msa_parser.parse( path )
+        end
+
+    end # class MsaFactory
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/sequence/domain_structure.rb b/forester/ruby/evoruby/lib/evo/sequence/domain_structure.rb
new file mode 100644 (file)
index 0000000..8189218
--- /dev/null
@@ -0,0 +1,73 @@
+#
+# = lib/evo/sequence/domain_structure.rb - DomainStructure class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: domain_structure.rb,v 1.2 2007/06/12 04:51:33 cmzmasek Exp $
+#
+# last modified: 05/16/2007
+
+require 'lib/evo/util/constants'
+
+module Evoruby
+
+    class DomainStructure
+
+        def initialize( total_length )
+            @domains = Hash.new
+            @total_length = total_length
+        end
+
+        def add_domain( domain, overwrite_if_same_from_to )
+            key = domain.get_from
+            if ( @domains.has_key?( key ) )
+                prev_domain = @domains[ key ]
+                if ( prev_domain.get_to == domain.get_to )
+                    puts( "WARNING: more than one domain at the same location [" +
+                        key.to_s + "-" + domain.get_to.to_s + "]: " + prev_domain.get_name + " and " + domain.get_name)
+                    if ( overwrite_if_same_from_to )
+                        puts( "         ignored the one with higher E-value [" +
+                        prev_domain.get_confidence().to_s + " vs " + domain.get_confidence().to_s + "]" )
+                        if prev_domain.get_confidence() < domain.get_confidence()
+                            return # keep previous one
+                        else
+                            @domains[ key ] = domain
+                            return
+                        end
+                    end
+                end
+
+                while ( @domains.has_key?( key ) )
+                    key = key + 0.0001
+                end
+
+            end
+            @domains[ key ] = domain
+        end
+
+        def to_NHX
+            str = String.new
+            str << get_total_length.to_s
+            a = @domains.sort
+            for d in a
+                domain = d[ 1 ]
+                str << Evoruby::Constants::DOMAIN_STRUCTURE_NHX_SEPARATOR
+                str << domain.get_from.to_s
+                str << Evoruby::Constants::DOMAIN_STRUCTURE_NHX_SEPARATOR
+                str << domain.get_to.to_s
+                str << Evoruby::Constants::DOMAIN_STRUCTURE_NHX_SEPARATOR
+                str << domain.get_confidence.to_s
+                str << Evoruby::Constants::DOMAIN_STRUCTURE_NHX_SEPARATOR
+                str << domain.get_name
+            end
+            return str
+        end
+
+        def get_total_length
+            return @total_length
+        end
+
+    end # class DomainStructure
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/sequence/protein_domain.rb b/forester/ruby/evoruby/lib/evo/sequence/protein_domain.rb
new file mode 100644 (file)
index 0000000..d9edb52
--- /dev/null
@@ -0,0 +1,45 @@
+#
+# = lib/evo/sequence/protein_domain.rb - ProteinDomain class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: protein_domain.rb,v 1.2 2007/06/12 04:51:33 cmzmasek Exp $
+#
+# last modified: 05/16/2007
+
+module Evoruby
+
+    class ProteinDomain
+
+        def initialize( name, from, to, id, confidence )
+            @name       = String.new( name )
+            @from       = from
+            @to         = to
+            @id         = String.new( id )
+            @confidence = confidence
+        end
+
+        def get_name()
+            return @name
+        end
+
+        def get_from()
+            return @from
+        end
+
+        def get_to()
+            return @to
+        end
+
+        def get_id()
+            return @id
+        end
+
+        def get_confidence()
+            return @confidence
+        end
+
+    end # class ProteinDomain
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/sequence/sequence.rb b/forester/ruby/evoruby/lib/evo/sequence/sequence.rb
new file mode 100644 (file)
index 0000000..77b8d7a
--- /dev/null
@@ -0,0 +1,165 @@
+#
+# = lib/evo/sequence/sequence.rb - Sequence class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: sequence.rb,v 1.10 2009/01/07 02:48:20 cmzmasek Exp $
+
+require 'set'
+
+module Evoruby
+
+    class Sequence
+
+        def initialize( name,
+                molecular_sequence_str,
+                accession = nil,
+                accession_source = nil,
+                taxonomy = nil,
+                symbol = nil,
+                secondary_accession = nil,
+                secondary_accession_source = nil )
+            @name               = String.new( name.strip() )
+            @molecular_sequence = String.new( molecular_sequence_str )
+            if ( accession == nil )
+                @accession = String.new()
+            else
+                @accession = String.new( accession.strip() )
+            end
+            if ( accession_source == nil )
+                @accession_source = String.new()
+            else
+                @accession_source = String.new( accession_source.strip() )
+            end
+            @taxonomy = taxonomy
+            if ( symbol == nil )
+                @symbol = String.new()
+            else
+                @symbol = String.new( symbol.strip() )
+            end
+            if ( secondary_accession == nil )
+                @secondary_accession = String.new()
+            else
+                @secondary_accession = String.new( secondary_accession.strip() )
+            end
+            if ( secondary_accession_source == nil )
+                @secondary_accession_source = String.new()
+            else
+                @secondary_accession_source = String.new( secondary_accession_source.strip() )
+            end
+        end
+
+        def copy
+            if get_taxonomy == nil
+                Sequence.new( get_name, get_sequence_as_string, get_accession, get_accession_source, nil, get_symbol, get_secondary_accession, get_secondary_accession_source )
+            else
+                Sequence.new( get_name, get_sequence_as_string, get_accession, get_accession_source, get_taxonomy.copy, get_symbol, get_secondary_accession, get_secondary_accession_source )
+            end
+        end
+
+        def get_name()
+            @name
+        end
+
+        def set_name( name )
+            @name = name
+        end
+
+        def get_sequence_as_string()
+            @molecular_sequence
+        end
+
+        def get_accession()
+            @accession
+        end
+
+        def get_accession_source()
+            @accession_source
+        end
+
+        def get_secondary_accession()
+            @secondary_accession
+        end
+
+        def get_secondary_accession_source()
+            @secondary_accession_source
+        end
+
+        def get_symbol()
+            @symbol
+        end
+
+        def get_taxonomy()
+            @taxonomy
+        end
+
+        def get_length()
+            @molecular_sequence.length
+        end
+
+        def get_residue( position )
+            get_slice( position, 1 )
+        end
+
+        def get_character_code( position )
+            @molecular_sequence.getbyte( position )
+        end
+
+        def get_gap_ratio()
+            return get_gap_length().to_f / get_length()
+        end
+
+        def get_gap_length()
+            counter = 0
+            for i in 0 ... get_length()
+                if ( Util.is_aa_gap_character?( get_character_code( i ) ) )
+                    counter += 1
+                end
+            end
+            return counter;
+        end
+
+        def delete_residue!( position )
+            if ( position < 0 || position >= get_length() )
+                error_msg = "attempt to delete residue at postion out of range"
+                raise ArgumentError, error_msg
+            end
+            @molecular_sequence.slice!( position )
+        end
+
+        def get_slice( start, length )
+            if ( start < 0 || start + length > get_length() )
+                error_msg = "attempt to get sequence residue(s) at postion out of range"
+                raise ArgumentError, error_msg
+            end
+            @molecular_sequence.slice( start, length )
+        end
+
+        def get_slice!( start, length )
+            if ( start < 0 || start + length > get_length() )
+                error_msg = "attempt to get sequence residue(s) at postion out of range"
+                raise ArgumentError, error_msg
+            end
+            @molecular_sequence.slice!( start, length )
+        end
+
+        def get_subsequence( first, last )
+            if ( last < first )
+                error_msg = "attempt to get subsequence from " + first + " to " + last
+                raise ArgumentError, error_msg
+            end
+            return Sequence.new( get_name, @molecular_sequence.slice( first, last - first + 1 ) )
+        end
+
+        def append!( molecular_sequence_str )
+            @molecular_sequence.concat( molecular_sequence_str )
+        end
+
+        def to_str()
+            return "[" + @name + "] " + @molecular_sequence
+        end
+
+    end # class Sequence
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/soft/fastme.rb b/forester/ruby/evoruby/lib/evo/soft/fastme.rb
new file mode 100644 (file)
index 0000000..a59756f
--- /dev/null
@@ -0,0 +1,70 @@
+#
+# = lib/soft/fastme - FastMe class
+#
+# Copyright::  Copyright (C) 2009 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: fastme.rb,v 1.3 2009/10/08 22:44:54 cmzmasek Exp $
+#
+# last modified: 2009/10/06
+
+require 'lib/evo/soft/resource_locations'
+require 'lib/evo/util/util'
+
+module Evoruby
+
+    class FastMe
+
+        # Wrapper around an external FastME executable; the path of the executable is read
+        # from the environment variable named by ResourceLocations::FASTME_HOME_ENV_VARIABLE.
+
+        VERBOSE = false
+
+        OUTTREE      = 'output.tre'
+        OUTPUT_D     = 'output.d'
+        VERSION      = '2.0'
+
+        # raises StandardError/ArgumentError (via Util) if the variable is unset or the file unusable
+        def initialize
+            @fast_me_home = Util.get_env_variable_value( ResourceLocations::FASTME_HOME_ENV_VARIABLE )
+            Util.check_file_for_readability( @fast_me_home )
+        end
+
+        # Runs the executable on +pwd_file+ (passed with "-i") and returns what it wrote to stdout.
+        # bootstrap_number: Integer >= 0; "-n <number>" is only added when it is greater than 1
+        # initial_tree:     :BME, :GME or :NJ
+        # raises ArgumentError for a nil/negative bootstrap number or an unknown initial tree
+        def run( pwd_file, bootstrap_number, initial_tree )
+            Util.check_file_for_readability( pwd_file )
+            if bootstrap_number.nil? || bootstrap_number < 0
+                # interpolate: the offending value is nil or a number, which String#+ rejects
+                raise ArgumentError, "illegal bootstrap number: #{bootstrap_number.inspect}"
+            end
+            init_tree_option = determine_initial_tree( initial_tree )
+            input = "-b #{init_tree_option} -i #{pwd_file}"
+            input << " -n #{bootstrap_number}" if bootstrap_number > 1
+            input << " -s b"
+            command = @fast_me_home + " " + input
+            puts command if VERBOSE
+            IO.popen( command, 'r+' ) do |io|
+                io.close_write
+                return io.read
+            end
+        end
+
+        private
+
+        # Translates the initial-tree symbol into the text placed after "-b".
+        def determine_initial_tree( initial_tree )
+            case initial_tree
+            when :BME then "BME"
+            when :GME then "GME"
+            when :NJ  then "NJ"
+            else
+                raise ArgumentError, "unknown initial tree"
+            end
+        end
+
+    end # class FastMe
+
+end # module Evoruby
\ No newline at end of file
diff --git a/forester/ruby/evoruby/lib/evo/soft/raxml.rb b/forester/ruby/evoruby/lib/evo/soft/raxml.rb
new file mode 100644 (file)
index 0000000..92122b9
--- /dev/null
@@ -0,0 +1,52 @@
+#
+# = lib/soft/raxml - Raxml class
+#
+# Copyright::  Copyright (C) 2009 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: raxml.rb,v 1.1 2009/10/07 00:08:35 cmzmasek Exp $
+#
+# last modified: 2009/10/06
+
+require 'lib/evo/soft/resource_locations'
+require 'lib/evo/util/util'
+
+module Evoruby
+    
+    class Raxml
+
+        # NOTE(review): despite its name this class reads FASTME_HOME and builds the same
+        # option string as FastMe; it looks like an unfinished copy -- confirm before use.
+        VERBOSE = true
+
+        def initialize
+            @fast_me_home = Util.get_env_variable_value( ResourceLocations::FASTME_HOME_ENV_VARIABLE )
+            Util.check_file_for_readability( @fast_me_home )
+        end
+
+        # Starts the configured executable, writes the option line to its stdin and returns
+        # what it wrote to stdout.
+        # raises ArgumentError for a nil/negative bootstrap number or an initial tree other
+        # than "BME", "GME" or "NJ"
+        def run( pwd_file, bootstrap_number, initial_tree )
+            Util.check_file_for_readability( pwd_file )
+            if bootstrap_number.nil? || bootstrap_number < 0
+                raise ArgumentError, "illegal bootstrap number: #{bootstrap_number.inspect}"
+            end
+            unless [ "BME", "GME", "NJ" ].include?( initial_tree )
+                raise ArgumentError, "illegal initial tree: #{initial_tree.inspect}"
+            end
+            # double quotes: a single-quoted literal would send the placeholders verbatim
+            input = "-b #{initial_tree} -i #{pwd_file}"
+            input << " -n #{bootstrap_number}" if bootstrap_number > 1
+            input << " -s b"
+            puts @fast_me_home + " " + input if VERBOSE
+            IO.popen( @fast_me_home, 'r+' ) do |io|
+                io.puts input
+                io.close_write
+                return io.read
+            end
+        end
+    end # class Raxml
+    
+end # module Evoruby
\ No newline at end of file
diff --git a/forester/ruby/evoruby/lib/evo/soft/resource_locations.rb b/forester/ruby/evoruby/lib/evo/soft/resource_locations.rb
new file mode 100644 (file)
index 0000000..65772b5
--- /dev/null
@@ -0,0 +1,21 @@
+#
+# = lib/soft/resource_locations - ResourceLocations class
+#
+# Copyright::  Copyright (C) 2009 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: resource_locations.rb,v 1.1 2009/10/06 22:22:46 cmzmasek Exp $
+#
+# last modified: 2009/10/06
+
+module Evoruby
+
+    class ResourceLocations
+        # Names of the environment variables that hold the paths of external executables.
+        FASTME_HOME_ENV_VARIABLE = 'FASTME_HOME'
+        TREEPUZZLE_HOME_ENV_VARIABLE = 'TREEPUZZLE_HOME'
+       
+    end # ResourceLocations
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/soft/tree_puzzle.rb b/forester/ruby/evoruby/lib/evo/soft/tree_puzzle.rb
new file mode 100644 (file)
index 0000000..416b896
--- /dev/null
@@ -0,0 +1,110 @@
+#
+# = lib/soft/tree_puzzle - TreePuzzle  class
+#
+# Copyright::  Copyright (C) 2009 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: tree_puzzle.rb,v 1.5 2009/10/08 22:44:54 cmzmasek Exp $
+#
+# last modified: 2009/10/06
+
+require 'lib/evo/soft/resource_locations'
+require 'lib/evo/util/util'
+
+module Evoruby
+
+    class TreePuzzle
+        # Drives the interactive TREE-PUZZLE program by piping menu answers to its stdin.
+        VERBOSE = false
+        
+        OUTDIST = 'outdist'
+        OUTFILE = 'outfile'
+        VERSION = '5.2'
+        
+        def initialize
+            @tree_puzzle_home = Util.get_env_variable_value( ResourceLocations::TREEPUZZLE_HOME_ENV_VARIABLE )
+            Util.check_file_for_readability( @tree_puzzle_home )
+        end
+
+        # Writes the alignment file name followed by the menu answers to the program's stdin
+        # and returns what the program wrote to stdout.
+        # model: :pam, :jtt, :mtrev24, :blosum62, :vt, :wag, :auto
+        # rate_heterogeneity: :gamma8, :inv1_var1, :inv1_gamma8, :uniform (else ArgumentError)
+        def run( alignment_file, model, rate_heterogeneity, number_of_seqs )
+            Util.check_file_for_readability( alignment_file )
+            # Work on a copy: appending to alignment_file itself would alter the caller's String.
+            input = alignment_file.dup
+            input << "\nk\nk"
+            input << "\nk" if number_of_seqs <= 257
+            input << determine_model_option( model )
+            input << determine_rate_heterogeneity_option( rate_heterogeneity )
+            input << "\ny\n"
+
+            puts @tree_puzzle_home + " " + input if VERBOSE
+            IO.popen( @tree_puzzle_home, 'r+' ) do |io|
+                io.puts input
+                io.close_write
+                return io.read
+            end
+        end
+
+        private
+
+        # "Model of substitution" order for DQO TREE-PUZZLE 5.0:
+        # Auto
+        # m -> Dayhoff (Dayhoff et al. 1978)
+        # m -> JTT (Jones et al. 1992)
+        # m -> mtREV24 (Adachi-Hasegawa 1996)
+        # m -> BLOSUM62 (Henikoff-Henikoff 92)
+        # m -> VT (Mueller-Vingron 2000)
+        # m -> WAG (Whelan-Goldman 2000)
+        # m -> Auto
+        def determine_model_option( model )
+            cmd = nil
+            if ( model == :pam )
+                cmd = "\nm"
+            elsif ( model == :jtt )
+                cmd = "\nm\nm"
+            elsif ( model == :mtrev24 )
+                cmd = "\nm\nm\nm"
+            elsif ( model == :blosum62 )
+                cmd = "\nm\nm\nm\nm"
+            elsif ( model == :vt )
+                cmd = "\nm\nm\nm\nm\nm"
+            elsif ( model == :wag )
+                cmd = "\nm\nm\nm\nm\nm\nm"
+            elsif ( model == :auto )
+                cmd = ""
+            else
+                error_msg = "unknown model"
+                raise ArgumentError, error_msg
+            end
+            cmd
+        end
+
+
+        # Model of rate heterogeneity:
+        #    "8 Gamma distributed rates"
+        #    "Two rates (1 invariable + 1 variable)"
+        #    "Mixed (1 invariable + 8 Gamma rates)"
+        #    otherwise: Uniform rate
+        def determine_rate_heterogeneity_option( rates )
+            opt = nil
+            if ( rates == :gamma8 )
+                opt = "\nw"
+            elsif ( rates == :inv1_var1 )
+                opt = "\nw\nw"
+            elsif ( rates == :inv1_gamma8 )
+                opt = "\nw\nw\nw"
+            elsif ( rates == :uniform )
+                opt = ""
+            else
+                error_msg = "unknown rate heterogeneity option"
+                raise ArgumentError, error_msg
+            end
+            return opt
+        end
+
+    end # class TreePuzzle
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/table/basic_table.rb b/forester/ruby/evoruby/lib/evo/table/basic_table.rb
new file mode 100644 (file)
index 0000000..8e5901a
--- /dev/null
@@ -0,0 +1,122 @@
+# = lib/evo/table/basic_table.rb - BasicTable class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: basic_table.rb,v 1.3 2007/09/28 03:12:10 cmzmasek Exp $
+#
+# last modified: 05/16/2007
+
+#require 'lib/evo/util/constants'
+
+module Evoruby
+
+  class BasicTable
+        # Sparse table addressed by zero-based (row, column) pairs; unset cells read as nil.
+        def initialize()
+            @rows = Hash.new
+            @max_row = 0
+            @max_col = 0
+        end
+
+        # Stores +value+ at (row, col), growing the table bounds if needed.
+        # raises ArgumentError if row or col is negative
+        def set_value( row, col, value )
+            if ( ( row < 0 ) || ( col < 0 ) )
+                raise( ArgumentError, "attempt to use negative values for row or column" )
+            end
+            if ( row > get_max_row() )
+                set_max_row( row )
+            end
+            if ( col > get_max_col() )
+                set_max_col( col )
+            end
+            row_map = ( @rows[ row ] ||= Hash.new )
+            row_map[ col ] = value
+        end
+
+        # Like get_value, but converted with to_s (an unset cell yields "").
+        # raises ArgumentError
+        def get_value_as_string( row, col )
+            return ( get_value( row, col ) ).to_s
+        end
+
+        # Returns the value at (row, col), or nil if that cell was never set.
+        # raises ArgumentError if row or col lies outside 0 .. max row / max column
+        def get_value( row, col )
+            if ( ( row > get_max_row() ) || ( row < 0 ) )
+                raise( ArgumentError, "value for row (" + row.to_s +
+                         ") is out of range [max row: " + get_max_row().to_s + "]" )
+            elsif ( ( col > get_max_col() ) || ( col < 0 ) )
+                raise( ArgumentError, "value for column (" + col.to_s +
+                         ") is out of range [max column: " + get_max_col().to_s + "]" )
+            end
+            row_map = @rows[ row ]
+            if ( ( row_map == nil ) || ( row_map.length < 1 ) )
+                return nil
+            end
+            return row_map[ col ]
+        end
+
+        def get_max_col()
+            return @max_col
+        end
+
+        def get_max_row()
+            return @max_row
+        end
+
+        # Builds a Hash from the cells of +key_col+ to the cells of +value_col+, skipping rows
+        # in which either cell is nil.
+        # raises ArgumentError if a key occurs more than once or a column is out of range
+        def get_columns_as_map( key_col, value_col )
+            map = Hash.new
+            for row in 0 .. get_max_row
+                key = get_value( row, key_col )
+                value = get_value( row, value_col )
+                if ( ( key != nil ) && ( value != nil ) )
+                    if ( map.has_key?( key ) )
+                        raise( ArgumentError, "attempt to use non-unique table value as key [" +
+                                        key.to_s + "]" )
+                    end
+                    map[ key ] = value
+                end
+            end
+            return map
+        end
+
+        # For every row: a line of column indices followed by a line of "row: value" pairs.
+        def to_s
+            str = String.new
+            for row in 0 .. get_max_row
+               for col in 0 .. get_max_col
+                   str << col.to_s << " "
+               end
+               str << Evoruby::Constants::LINE_DELIMITER
+               for col in 0 .. get_max_col
+                   str << row.to_s << ": "
+                   # as string: String#<< raises on nil and treats an Integer as a codepoint
+                   str << get_value_as_string( row, col ) << " "
+               end
+               str << Evoruby::Constants::LINE_DELIMITER
+            end
+            return str
+        end
+
+        private
+
+        def get_row( row )
+            return @rows[ row ]
+        end
+
+        def set_max_col( max_col )
+            @max_col = max_col
+        end
+
+        def set_max_row( max_row )
+            @max_row = max_row
+        end
+
+    end # class BasicTable
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/taxonomy/sp_taxonomy.rb b/forester/ruby/evoruby/lib/evo/taxonomy/sp_taxonomy.rb
new file mode 100644 (file)
index 0000000..ffda5df
--- /dev/null
@@ -0,0 +1,38 @@
+#
+# = lib/evo/taxonomy/sp_taxonomy.rb - SpTaxonomy class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: sp_taxonomy.rb,v 1.1 2008/12/30 05:28:00 cmzmasek Exp $
+
+
+
+module Evoruby
+
+    class SpTaxonomy
+        # Taxonomy record: code, id, scientific name and an optional common name.
+        attr_accessor :code, :id, :scientific_name, :common_name
+
+        # Stores whitespace-stripped copies of the arguments; a nil +common_name+ becomes "".
+        def initialize( code, id, scientific_name, common_name = nil )
+            @code = String.new( code.strip() )
+            @id = String.new( id.strip() )
+            @scientific_name = String.new( scientific_name.strip() )
+            @common_name = common_name.nil? ? String.new() : String.new( common_name.strip() )
+        end
+
+        # Returns an independent SpTaxonomy with the same field values.
+        def copy
+            # Must be SpTaxonomy: Taxonomy#initialize accepts at most three arguments.
+            return SpTaxonomy.new( code, id, scientific_name, common_name  )
+        end
+
+        # "<code> <id>: N=<scientific_name>"
+        def to_str()
+            code + " " + id + ": N=" + scientific_name
+        end
+
+    end # class SpTaxonomy
+
+end # module Evoruby
\ No newline at end of file
diff --git a/forester/ruby/evoruby/lib/evo/taxonomy/taxonomy.rb b/forester/ruby/evoruby/lib/evo/taxonomy/taxonomy.rb
new file mode 100644 (file)
index 0000000..47f011b
--- /dev/null
@@ -0,0 +1,65 @@
+#
+# = lib/evo/taxonomy/taxonomy.rb - Taxonomy class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: taxonomy.rb,v 1.2 2009/01/03 00:19:08 cmzmasek Exp $
+
+
+
+module Evoruby
+
+    class Taxonomy
+        # A taxonomy name with an optional identifier and the source of that identifier.
+        # Stores whitespace-stripped copies of the arguments; nil +id+/+id_source+ become "".
+        def initialize( name, id = nil, id_source = nil )
+            @name = String.new( name.strip() )
+            if ( id == nil )
+                @id = String.new()
+            else
+                @id = String.new( id.strip() )
+            end
+            if ( id_source == nil )
+                @id_source = String.new()
+            else
+                @id_source = String.new( id_source.strip() )
+            end
+        end
+
+        # True if +taxonomy+ offers get_name, get_id and get_id_source and all three are equal.
+        def == ( taxonomy )
+            # nil and unrelated objects (e.g. a String) are unequal instead of raising NoMethodError
+            unless [ :get_name, :get_id, :get_id_source ].all? { |m| taxonomy.respond_to?( m ) }
+                return false
+            end
+            return ( ( get_name == taxonomy.get_name ) &&
+                     ( get_id == taxonomy.get_id ) &&
+                     ( get_id_source == taxonomy.get_id_source ) )
+        end
+
+        def copy
+            return Taxonomy.new( get_name, get_id, get_id_source )
+        end
+
+        def get_name()
+            @name
+        end
+
+        def get_id()
+            @id
+        end
+
+        def get_id_source()
+            @id_source
+        end
+
+        # The name, prefixed with "[<id>] " when an id is present.
+        def to_str()
+            # @id is always a String (see initialize), so empty? needs no nil check or helper
+            get_id.empty? ? @name : "[" + get_id + "] " + @name
+        end
+
+    end # class Taxonomy
+
+end # module Evoruby
\ No newline at end of file
diff --git a/forester/ruby/evoruby/lib/evo/util/command_line_arguments.rb b/forester/ruby/evoruby/lib/evo/util/command_line_arguments.rb
new file mode 100644 (file)
index 0000000..1ad9924
--- /dev/null
@@ -0,0 +1,177 @@
+#
+# = lib/evo/util/command_line_arguments.rb - CommandLineArguments class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: command_line_arguments.rb,v 1.2 2007/06/12 04:51:34 cmzmasek Exp $
+#
+# last modified: 05/16/2007
+
+module Evoruby
+
+    class CommandLineArguments
+
+        # Sorts an argument list into three groups:
+        #   options          "-name" or "-name=value"
+        #   extended options "--name" or "--name=value"
+        #   file names       everything else
+
+        OPTIONS_PREFIX          = "-"
+        EXTENDED_OPTIONS_PREFIX = "--"
+        OPTIONS_SEPARATOR       = "="
+
+        # args: the argument strings to parse (for example ARGV)
+        # raises ArgumentError for an empty option name, an empty value after "=",
+        # or an option name that is given more than once
+        def initialize( args )
+            @options  = Hash.new
+            @extended_options = Hash.new
+            @file_names = Array.new
+            parse_arguments( args )
+        end
+
+        # All arguments that did not start with "-", in their original order.
+        def get_file_names
+            @file_names
+        end
+
+        # The i-th file name (nil if there is none at that index).
+        def get_file_name( i )
+            @file_names[ i ]
+        end
+
+        # How many file names were given.
+        def get_number_of_files()
+            @file_names.length
+        end
+
+        # True if option_name was given, with or without a value.
+        def is_option_set?( option_name )
+            get_all_options.has_key?( option_name )
+        end
+
+        # Returns the value that was given for option_name.
+        # raises ArgumentError if the option is absent or was given without a value
+        def get_option_value( option_name )
+            given = get_all_options
+            unless given.has_key?( option_name )
+                raise( ArgumentError, "option \"" + option_name +
+                     "\" is not set", caller )
+            end
+            value = given[ option_name ]
+            if ( value == nil || value.length < 1 )
+                raise( ArgumentError, "value for option \"" +
+                     option_name + "\" is not set", caller )
+            end
+            value
+        end
+
+        # Value of option_name converted with to_i.
+        def get_option_value_as_int( option_name )
+            get_option_value( option_name ).to_i
+        end
+
+        # Value of option_name converted with to_f.
+        def get_option_value_as_float( option_name )
+            get_option_value( option_name ).to_f
+        end
+
+        # mandatory_options (Array)
+        # Returns an Array of those mandatory option names that were not given.
+        def validate_mandatory_options( mandatory_options )
+            given = get_all_options
+            absent = Array.new
+            mandatory_options.each do | name |
+                absent.push( name ) unless given.has_key?( name )
+            end
+            absent
+        end
+
+        # mandatory_options (Array)
+        # Same result as validate_mandatory_options, joined with ", ".
+        def validate_mandatory_options_as_str( mandatory_options )
+            absent = validate_mandatory_options( mandatory_options )
+            absent.join( ", " )
+        end
+
+        # allowed_options (Array)
+        # Returns an Array of those given option names that are not in allowed_options.
+        def validate_allowed_options( allowed_options )
+            given = get_all_options
+            given.keys.reject do | name |
+                allowed_options.include?( name )
+            end
+        end
+
+        # allowed_options (Array)
+        # Same result as validate_allowed_options, joined with ", ".
+        def validate_allowed_options_as_str( allowed_options )
+            unexpected = validate_allowed_options( allowed_options )
+            unexpected.join( ", " )
+        end
+
+        private
+
+        # A new Hash with the options first, followed by the extended options.
+        def get_all_options
+            combined = get_options_list.dup
+            combined.update( get_extended_options_list )
+            combined
+        end
+
+        # Dispatches every argument by its prefix; the longer prefix "--" is tested first.
+        def parse_arguments( args )
+            args.each do | arg |
+                if arg.start_with?( EXTENDED_OPTIONS_PREFIX )
+                    parse_option( arg[ EXTENDED_OPTIONS_PREFIX.length .. -1 ],
+                                  get_extended_options_list )
+                elsif arg.start_with?( OPTIONS_PREFIX )
+                    parse_option( arg[ OPTIONS_PREFIX.length .. -1 ],
+                                  get_options_list )
+                else
+                    get_file_names.push( arg )
+                end
+            end
+        end
+
+        # option:      the argument without its prefix ("name" or "name=value")
+        # options_map: the Hash that receives the entry; valueless options are stored as ""
+        # raises ArgumentError
+        def parse_option( option, options_map )
+            # Split at the first separator only, so a value may itself contain "=".
+            key, separator, value = option.partition( OPTIONS_SEPARATOR )
+            # Without a separator, partition yields the whole text as key and "" as value.
+            has_value = !separator.empty?
+            if key.empty?
+                raise( ArgumentError, "attempt to set option with an empty name" )
+            end
+            if has_value && value.empty?
+                raise( ArgumentError, "attempt to set option with an empty value" )
+            end
+            if get_all_options.has_key?( key )
+                # Only the "name=value" form repeats the full argument in the message.
+                detail = has_value ? " [" + option + "]" : ""
+                raise( ArgumentError, "attempt to set option \"" +
+                        key + "\" mutiple times" + detail )
+            end
+            options_map[ key ] = value
+        end
+
+        # The Array that collects file names.
+        def get_file_names_list
+            @file_names
+        end
+
+        # The Hash that collects "-" options.
+        def get_options_list
+            @options
+        end
+
+        # The Hash that collects "--" options.
+        def get_extended_options_list
+            @extended_options
+        end
+
+    end # class CommandLineArguments
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/util/constants.rb b/forester/ruby/evoruby/lib/evo/util/constants.rb
new file mode 100644 (file)
index 0000000..6546478
--- /dev/null
@@ -0,0 +1,33 @@
+#
+# = lib/evo/util/constants.rb - Constants class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: constants.rb,v 1.3 2007/12/21 04:13:33 cmzmasek Exp $
+#
+# last modified: 05/11/2007
+
+module Evoruby
+
+    class Constants
+        # Constants shared by the evoruby library.
+        VERBOSE = true
+
+        EVORUBY_VERSION = '1.0'
+        # Names of environment variables (not the values they hold).
+        FORESTER_HOME_ENV_VARIABLE = 'FORESTER_HOME'
+        JAVA_HOME_ENV_VARIABLE     = 'JAVA_HOME'
+
+        EVORUBY         = 'evoruby'
+        # Line terminator written by the printing helpers in Util.
+        LINE_DELIMITER  = "\n"
+        # Path separator as reported by File::SEPARATOR.
+        FILE_SEPARATOR  = File::SEPARATOR
+
+        DOMAIN_STRUCTURE_NHX_SEPARATOR = '>'
+
+
+    end # class Constants
+
+end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/util/util.rb b/forester/ruby/evoruby/lib/evo/util/util.rb
new file mode 100644 (file)
index 0000000..234a625
--- /dev/null
@@ -0,0 +1,240 @@
+#
+# = lib/evo/util/util.rb - Util class
+#
+# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: util.rb,v 1.17 2009/10/06 22:22:46 cmzmasek Exp $
+#
+# last modified: 05/15/2007
+
+require 'lib/evo/util/constants'
+
+module Evoruby
+
+    class Util
+
+        # Truncates or right-pads +name+ with spaces so that it is +length+ characters long.
+        def Util.normalize_seq_name( name, length )
+            if name.length > length
+                name = name[ 0, length ]
+            elsif name.length < length
+                for i in 0 ... length - name.length
+                    name = name + " "
+                end
+            end
+            name
+        end
+
+        #  def Util.normalize_mol_sequence( seq )
+        #      new_seq = String.new()
+        #      for i in 0 ... seq.length
+        #          c = seq.get_slice( i )
+        #          if is_aa_gap_character?( c )
+        #              new_seq = new_seq + "-"
+        #          else
+        #              new_seq = new_seq + c
+        #          end
+        #      end
+        #      new_seq
+        #  end
+
+        # Returns true if char_code is <= 32 (control characters, space) or the code of * - . _
+        def Util.is_aa_gap_character?( char_code )
+            return ( char_code <= 32  || char_code == 42 || char_code == 45 || char_code == 46 ||char_code == 95  )
+        end
+
+        # Deletes *, digits, and whitespace, replaces BJOUZ? with X, and replaces non-(letters, -) with -
+        def Util.clean_seq_str( seq_str )
+            seq_str = seq_str.upcase
+            seq_str = seq_str.gsub( /\s+/, '' )
+            seq_str = seq_str.gsub( /\d+/, '' )
+            seq_str = seq_str.gsub( '*', '' )
+            seq_str = seq_str.gsub( /[BJOUZ?]/, 'X' )
+            seq_str = seq_str.gsub( /[^A-Z\-]/, '-' )
+            seq_str
+        end
+
+        # raises ArgumentError unless path is an existing, regular, readable, non-empty file
+        def Util.check_file_for_readability( path )
+            unless ( File.exist?( path ) )
+                raise ArgumentError, "file [#{path}] does not exist"
+            end
+            unless ( File.file?( path ) )
+                raise ArgumentError, "file [#{path}] is not a regular file"
+            end
+            unless ( File.readable?( path ) )
+                raise ArgumentError, "file [#{path}] is not a readable file"
+            end
+            if ( File.zero?( path ) )
+                raise ArgumentError, "file [#{path}] is empty"
+            end
+        end
+
+        # Checks that +path+ can be created as a new file.
+        # raises ArgumentError if path already exists or its directory is not writable
+        def Util.check_file_for_writability( path )
+            if File.directory?( path )
+                raise ArgumentError, "file [#{path}] is an existing directory"
+            elsif File.exist?( path )
+                raise ArgumentError, "file [#{path}] already exists"
+            elsif !File.writable?( File.dirname( path ) )
+                # path itself does not exist here, so test the directory that would contain it
+                raise ArgumentError, "file [#{path}] is not writeable"
+            end
+        end
+
+        def Util.fatal_error_if_not_writable( prg_name, path )
+            begin
+                Util.check_file_for_writability( path )
+            rescue ArgumentError => e
+                Util.fatal_error( prg_name, e.to_s )
+            end
+        end
+
+        def Util.fatal_error_if_not_readable( prg_name, path )
+            begin
+                Util.check_file_for_readability( path )
+            rescue ArgumentError => e
+                Util.fatal_error( prg_name, e.to_s )
+            end
+        end
+
+        # Returns the value of environment variable +env_variable+; StandardError if unset or empty.
+        def Util.get_env_variable_value( env_variable ) 
+            value = ENV[env_variable]
+            if value == nil || value.empty?
+                raise StandardError, "apparently environment variable #{env_variable} has not been set"
+            end
+            value
+        end
+
+        # Returns the non-blank lines of the file at +path+, stripped of surrounding whitespace.
+        # If +split_by_semicolon+ is true, a line containing ";" contributes one stripped,
+        # non-empty element per semicolon-separated part instead.
+        # raises ArgumentError
+        def Util.file2array( path, split_by_semicolon )
+            Util.check_file_for_readability( path )
+            a = Array.new()
+            File.open( path ) do | file |
+                while line = file.gets
+                    if ( line =~ /^\s*(\S.*?)\s*$/ )
+                        s = $1
+                        if ( split_by_semicolon && s =~/;/ )
+                            # strip (not strip!, which returns nil when nothing changes) and
+                            # append every part instead of overwriting a single slot
+                            s.split( /;/ ).each do | part |
+                                part = part.strip
+                                a.push( part ) unless part.empty?
+                            end
+                        else
+                            a.push( s )
+                        end
+                    end
+                end
+            end
+            return a
+        end
+
+        # Prints a banner (name, version, date, ruby version, description, copyright, contact)
+        # to +io+. Calls exit( -1 ) when run on a ruby older than 1.9.
+        def Util.print_program_information( prg_name,
+                prg_version,
+                prg_desc,
+                date,
+                copyright,
+                contact,
+                www,
+                io = STDOUT )
+
+            if RUBY_VERSION.to_f < 1.9
+                puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x or later" )
+                exit( -1 )
+            end
+
+            ruby_version = RUBY_VERSION
+            l = prg_name.length + prg_version.length + date.length + ruby_version.length + 12
+            io.print( Evoruby::Constants::LINE_DELIMITER )
+            io.print( prg_name + " " + prg_version + " [" + date + "] [ruby " + ruby_version + "]")
+            io.print( Evoruby::Constants::LINE_DELIMITER )
+            l.times {
+                io.print( "_" )
+            }
+            io.print( Constants::LINE_DELIMITER )
+            io.print( Constants::LINE_DELIMITER )
+            io.print( prg_desc )
+            io.print( Constants::LINE_DELIMITER )
+            io.print( Constants::LINE_DELIMITER )
+            io.print( "Copyright (C) " + copyright )
+            io.print( Constants::LINE_DELIMITER )
+            io.print( "Contact: " + contact )
+            io.print( Constants::LINE_DELIMITER )
+            io.print( "         " + www )
+            io.print( Constants::LINE_DELIMITER )
+            io.print( Constants::LINE_DELIMITER )
+        end
+
+        # Prints +message+ to +io+ and terminates the process with exit( -1 ).
+        def Util.fatal_error( prg_name, message, io = STDOUT )
+            io.print( Constants::LINE_DELIMITER )
+            if ( !Util.is_string_empty?( prg_name ) )
+                io.print( "[" + prg_name + "] > " + message )
+            else
+                io.print( " > " + message )
+            end
+            io.print( Constants::LINE_DELIMITER )
+            io.print( Constants::LINE_DELIMITER )
+            exit( -1 )
+        end
+
+        def Util.print_message( prg_name, message, io = STDOUT )
+            if ( !Util.is_string_empty?( prg_name ) )
+                io.print( "[" + prg_name + "] > " + message )
+            else
+                io.print( " > " + message )
+            end
+            io.print( Constants::LINE_DELIMITER )
+        end
+
+        def Util.print_warning_message( prg_name, message, io = STDOUT )
+            if ( !Util.is_string_empty?( prg_name ) )
+                io.print( "[" + prg_name + "] > WARNING: " + message )
+            else
+                io.print( " > WARNING: " + message )
+            end
+            io.print( Constants::LINE_DELIMITER )
+        end
+
+        def Util.is_string_empty?( s )
+            return ( s == nil || s.length < 1 )
+        end
+
+        # From "Ruby Cookbook"
+        # counts_hash: key is a "name", value is the count (integer)
+        def Util.draw_histogram( counts_hash, char = "#" )
+            return "" if counts_hash.empty?   # max of an empty list is nil
+            pairs = counts_hash.keys.collect { |x| [ x.to_s, counts_hash[ x ] ] }.sort
+            largest_key_size = pairs.max { |x, y| x[ 0 ].size <=> y[ 0 ].size }[ 0 ].size
+            pairs.inject( "" ) do | s, kv |
+                s << "#{ kv[ 0 ].ljust( largest_key_size ) }  | #{ char*kv[ 1 ] }" + Constants::LINE_DELIMITER
+            end
+        end
+
+        def Util.looks_like_fasta?( path )
+            Util.check_file_for_readability( path )
+            File.open( path ) do | file |
+                while line = file.gets
+                    if ( line !~ /\S/ || line =~ /^\s*#/ )   # blank or "#" line: keep looking
+                    elsif line =~ /^\s*>\s*(.+)/
+                        return true
+                    else
+                        return false
+                    end
+                end
+            end
+            raise IOError, "unexpected format"
+        end
+
+    end # class Util
+
+end # module Evoruby
diff --git a/forester/ruby/scripts/delete_ext_nodes.rb b/forester/ruby/scripts/delete_ext_nodes.rb
new file mode 100755 (executable)
index 0000000..4296241
--- /dev/null
@@ -0,0 +1,58 @@
+#!/usr/local/bin/ruby -w
+
+infile = ARGV[ 0 ]
+
+# Echoes every line of the input file whose first whitespace-delimited token is one of
+# the names listed below. Lines are ignored outright when they start with a 3-5 character
+# code of digits/upper-case letters followed by whitespace, a tab, "{" or "f_<digit>".
+unless infile
+    puts "no infile"
+    exit
+end
+
+# Names that qualify a line for output.
+metazoa_choanoflagellata = %w[
+    Metazoa_Choanoflagellata
+    Metazoa
+    Bilateria_Cnidaria
+    Bilateria
+    Deuterostomia
+    Chordata
+    Urochordata_Vertebrata
+    Vertebrata
+    Tetrapoda
+    Amniota
+    Eutheria
+    Euarchontoglires
+    Primates
+    Rodentia
+    Teleostei
+    Euteleostei
+    Smegmamorpha
+    Tetraodontiformes
+    Urochordata
+    Ascidiacea
+    Urochordata
+    Protostomia
+    Ecdysozoa
+    Arthropoda
+    Insecta
+    Lepidoptera_Diptera_Hymenoptera
+    Diptera
+    Culicoidea
+    Hymenoptera
+    Nematoda
+    Annelida_Mollusca
+    Annelida
+]
+
+# A line matching any of these patterns is never printed,
+# whatever its first token is.
+skip_patterns = [ /^[0-9A-Z]{3,5}\s/, /^\t/, /^{/, /^f_\d/ ]
+
+File.foreach( infile ) do | line |
+    next if skip_patterns.any? { | pattern | line =~ pattern }
+    # first run of non-whitespace characters (nil for a blank line)
+    first = line[ /(\S+)/, 1 ]
+    puts( line ) if metazoa_choanoflagellata.include?( first )
+end
diff --git a/forester/ruby/scripts/hmm_split.rb b/forester/ruby/scripts/hmm_split.rb
new file mode 100755 (executable)
index 0000000..8ae000d
--- /dev/null
@@ -0,0 +1,80 @@
+#!/usr/local/bin/ruby -w
+#
+# = hmm_split 
+#
+# Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: hmm_split.rb,v 1.5 2008/11/17 22:32:43 cmzmasek Exp $
+#
+# To split a Pfam HMM file into one file for each HMM.
+#
+
+
+module ForesterScripts
+    
+    if RUBY_VERSION !~ /1.9/
+                      puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " )
+                      exit( -1 )
+                end     
+    
+    
+    if ( ARGV == nil || ARGV.length != 3 )
+        puts( "usage: hmm_split.rb <Pfam HMM file> <outfile suffix> <outdir>" )         
+        exit( -1 )
+    end    
+       
+       hmmfile = ARGV[ 0 ]
+       suffix  = ARGV[ 1 ]
+       outdir  = ARGV[ 2 ]
+     
+       if ( !File.exists?( outdir ) )
+           puts( "outdir [" + outdir + "] does not exist" )
+           exit( -1 ) 
+       end 
+       if ( !File.exists?( hmmfile ) ) 
+           puts( "Pfam HMM file [" + hmmfile + "] does not exist" )
+           exit( -1 ) 
+       end                
+       
+       data = String.new
+       name = String.new
+       line_count = 0
+       count = 0
+       
+       File.open( hmmfile ) do | file |
+           while line = file.gets
+               data = data + line
+               line_count += 1
+               if ( line =~ /NAME\s+(.+)/ )
+                   if name.length > 0
+                       puts( "Pfam HMM file [" + hmmfile + "] format error [line: " + line + "]" )
+                       exit( -1 )                        
+                   end    
+                   name = $1    
+               elsif ( line =~ /\/\// )
+                   if name.length < 1
+                       puts( "Pfam HMM file [" + hmmfile + "] format error [line: " + line + "]" )
+                       exit( -1 )                        
+                   end
+                                       
+                   outfile = outdir + '/' + name + suffix
+                   if ( File.exists?( outfile ) ) 
+                       puts( "file [" + outfile + "] already exists" )
+                       exit( -1 )  
+                   end                   
+                   open( outfile, 'w' ) do | out |
+                       out.write( data )
+                   end
+                   count += 1
+                   puts( count.to_s + ": " + name )
+                   data = String.new
+                   name = String.new                                      
+               end
+           end
+       end 
+         
+       puts()
+       puts( "wrote " + count.to_s + " individual HMM files to " + outdir )    
+    
+end    
\ No newline at end of file
diff --git a/forester/ruby/scripts/parameters.rb_dir_qsub b/forester/ruby/scripts/parameters.rb_dir_qsub
new file mode 100644 (file)
index 0000000..c06697d
--- /dev/null
@@ -0,0 +1,5 @@
+# $Id: parameters.rb_dir_qsub,v 1.3 2007/12/20 04:07:13 cmzmasek Exp $
+
+PRG:     /home/czmasek/SOFTWARE/HMMER/hmmer-2.3.2/src/hmmpfam
+OPT:     -E 20 -A 0 /home/czmasek/DATA/PFAM/Pfam_ls
+SUFFIX:  _hmmpfam_22_20_ls
\ No newline at end of file
diff --git a/forester/ruby/scripts/pfam2go_reformat.rb b/forester/ruby/scripts/pfam2go_reformat.rb
new file mode 100755 (executable)
index 0000000..7d2bbb5
--- /dev/null
@@ -0,0 +1,90 @@
+#!/usr/local/bin/ruby -w
+#
+# = pfam2go_reformat
+#
+# Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: pfam2go_reformat.rb,v 1.4 2008/11/27 01:41:36 cmzmasek Exp $
+#
+# Reformat pfam2go to a "association" file suitable as input
+# for microarray GO enrichment/overrepresentation-type analyses,
+# and create a file listing all mapped Pfams as well.
+
+
+module ForesterScripts
+
+    require 'set'
+
+    if RUBY_VERSION !~ /1.9/
+        puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " )
+        exit( -1 )
+    end
+
+    if ( ARGV == nil || ARGV.length != 2 )
+        puts( "usage: pfam2go_reformat.rb <pfam2go file> <outfiles base>" )
+        exit( -1 )
+    end
+
+    infile  = ARGV[ 0 ]
+    outfilebase = ARGV[ 1 ]
+    outfile_sgd_style = outfilebase + "_sgd_style_associations"
+    outfile_simple_map = outfilebase + "_basic_associations"
+    outfile_all_pfams = outfilebase + "_all_associated_pfams"
+
+    pfams = SortedSet.new
+
+    if ( File.exists?( outfile_sgd_style ) )
+        puts( "outfile [" +  outfile_sgd_style + "] already exists" )
+        exit( -1 )
+    end
+    if ( File.exists?( outfile_simple_map ) )
+        puts( "outfile [" +  outfile_simple_map + "] already exists" )
+        exit( -1 )
+    end
+    if ( File.exists?( outfile_all_pfams ) )
+        puts( "outfile [" + outfile_all_pfams + "] already exists" )
+        exit( -1 )
+    end
+    if ( !File.exists?( infile) )
+        puts( "infile [" + infile + "] does not exist" )
+        exit( -1 )
+    end
+
+    out_str_sgd = String.new
+    out_str_basic = String.new
+
+    File.open( infile ) do | file |
+        while line = file.gets
+            if line =~ /^\s*Pfam:PF(\d+)\s+(\S+)\s.+(GO:\d+)\s*$/
+                pfam_id = $1
+                pfam_name = $2
+                go_id = $3
+                new_line = "PFAM" + "\t" + pfam_name + "\t" + pfam_name + "\t\t" + go_id + "\t" + "PF:" + pfam_id + "\t\t\t\t\t\t\t\t\t"
+                out_str_sgd = out_str_sgd + new_line + "\n"
+                out_str_basic = out_str_basic + pfam_name + "\t" + go_id + "\n"
+                pfams.add( pfam_name )
+            end
+        end
+    end
+
+    open(  outfile_sgd_style, 'w' ) do |file|
+        file.write( out_str_sgd )
+    end
+    open( outfile_simple_map, 'w' ) do |file|
+        file.write( out_str_basic )
+    end
+    open( outfile_all_pfams, 'w' ) do |file|
+        pfams.each { |pfam|
+            file.write( pfam )
+            file.write( "\n" )
+        }
+    end
+    puts( "number of associated pfams         : " + pfams.size.to_s )
+    puts( "wrote assocations in sgd style to  : " + outfile_sgd_style )
+    puts( "wrote assocations in basic style to: " + outfile_simple_map )
+    puts( "wrote all associated pfams to      : " + outfile_all_pfams )
+    puts( "OK")
+
+end
+
diff --git a/forester/ruby/scripts/pfam_summarize.rb b/forester/ruby/scripts/pfam_summarize.rb
new file mode 100755 (executable)
index 0000000..52228fa
--- /dev/null
@@ -0,0 +1,116 @@
+#!/usr/local/bin/ruby -w
+#
+# = pfam_summarize
+#
+# Copyright::  Copyright (C) 2008-2009 Christian M. Zmasek. All rights reserved.
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: pfam_summarize.rb,v 1.2 2008/08/28 17:09:07 cmzmasek Exp $
+#
+# This extracts ID, AC, DE, TP, and DR values from Pfam data files.
+#
+# Created 2008-06-25 in San Diego, CA, USA by CMZ
+#
+# Usage: pfam_summarize.rb <infile: Pfam data file such as Pfam-A.full> <outfile>
+
+require 'iconv'
+
+module ForesterScripts
+    if RUBY_VERSION !~ /1.9/
+                      puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " )
+                      exit( -1 )
+                end     
+    SEP = "\t"
+    LINE_DELIMITER  = "\n"
+
+    if ( ARGV == nil || ARGV.length != 2 )
+        puts( "usage: pfam_summarize.rb <infile: Pfam data file such as Pfam-A.full> <outfile>" )
+        exit( -1 )
+    end
+
+    pfamfile = ARGV[ 0 ]
+    outfile  = ARGV[ 1 ]
+
+    if ( !File.exists?( pfamfile ) )
+        puts( "Pfam data file [" + pfamfile + "] does not exist" )
+        exit( -1 )
+    end
+    if ( File.exists?( outfile ) )
+        puts( "outfile [" + outfile + "] already exists" )
+        exit( -1 )
+    end
+
+    ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' )
+
+    id = nil
+    ac = nil
+    de = nil
+    tp = nil
+    dr = Array.new()
+    line_count = 0
+    count = 0
+
+    out = File.open( outfile, 'w' )
+
+    File.open( pfamfile ) do | file |
+        while line = file.gets
+            line_count += 1
+
+            line = ic.iconv( line )
+
+            if ( line =~ /#=GF ID\s+(.+)/ )
+                if ( id != nil )
+                    puts( "Pfam data file [" + pfamfile + "] format error [line: " + line + "]" )
+                    exit( -1 )
+                end
+                id = $1
+            elsif ( line =~ /#=GF AC\s+(.+)/ )
+                ac = $1
+            elsif ( line =~ /#=GF DE\s+(.+)/ )
+                de = $1
+            elsif ( line =~ /#=GF TP\s+(.+)/ )
+                tp = $1
+            elsif ( line =~ /#=GF DR\s+(.+)/ )
+                dr.push( $1 )
+            elsif ( line =~ /^\/\// )
+                if ( id == nil || ac == nil )
+                    puts( "Pfam data file [" + pfamfile + "] format error [line: " + line + "]" )
+                    exit( -1 )
+                end
+                out.write( id )
+                out.write( SEP )
+                out.write( ac )
+                out.write( SEP )
+                out.write( tp )
+                out.write( SEP )
+                out.write( '[' )
+                out.write( de )
+                out.write( ']' )
+                out.write( SEP )
+                out.write( '[' )
+                dr.each { |d|
+                    out.write( d )
+                    out.write( ' ' )
+                }
+                out.write( ']' )
+                out.write( LINE_DELIMITER )
+
+                id = nil
+                ac = nil
+                de = nil
+                tp = nil
+                dr = Array.new()
+                count += 1
+            end
+        end
+    end
+
+    out.close
+
+    puts()
+    puts( "Summarized data for " + count.to_s + " individual Pfams to " + outfile )
+    puts( "OK" )
+    puts()
+
+end # module ForesterScripts
+
diff --git a/forester/ruby/scripts/pfam_to_scop.rb b/forester/ruby/scripts/pfam_to_scop.rb
new file mode 100755 (executable)
index 0000000..b8cdef2
--- /dev/null
@@ -0,0 +1,103 @@
+#!/usr/local/bin/ruby -w
+#
+# = pfam_to_scop
+#
+# Copyright::  Copyright (C) 2008-2009 Christian M. Zmasek. All rights reserved.
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: pfam_to_scop.rb,v 1.2 2008/08/28 17:09:07 cmzmasek Exp $
+#
+# This extracts ID and SCOP fa (or fa and sf) from Pfam data files.
+#
+# Created 2008-06-25 in San Diego, CA, USA by CMZ
+#
+# Usage: pfam_to_scop.rb <infile: Pfam data file such as Pfam-A.full> <outfile>
+
+require 'iconv'
+
+module ForesterScripts
+
+    if RUBY_VERSION !~ /1.9/
+                      puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " )
+                      exit( -1 )
+                end      
+    
+    SF = true
+
+    SEP = "\t"
+    LINE_DELIMITER  = "\n"
+
+    if ( ARGV == nil || ARGV.length != 2 )
+        puts( "usage: pfam_to_scop.rb <infile: Pfam data file such as Pfam-A.full> <outfile>" )
+        exit( -1 )
+    end
+
+    pfamfile = ARGV[ 0 ]
+    outfile  = ARGV[ 1 ]
+
+    if ( !File.exists?( pfamfile ) )
+        puts( "Pfam data file [" + pfamfile + "] does not exist" )
+        exit( -1 )
+    end
+    if ( File.exists?( outfile ) )
+        puts( "outfile [" + outfile + "] already exists" )
+        exit( -1 )
+    end
+
+    ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' )
+
+    id = nil
+    scops = Array.new()
+    line_count = 0
+    count = 0
+    scop_count = 0
+
+    out = File.open( outfile, 'w' )
+
+    File.open( pfamfile ) do | file |
+        while line = file.gets
+            line_count += 1
+
+            line = ic.iconv( line )
+
+            if ( line =~ /#=GF ID\s+(.+)/ )
+                if ( id != nil )
+                    puts( "Pfam data file [" + pfamfile + "] format error [line: " + line + "]" )
+                    exit( -1 )
+                end
+                id = $1
+            elsif ( line =~ /#=GF\s+DR\s+SCOP;\s+(\w+);\s+fa/ )
+                scops.push( $1 )
+            elsif ( SF && line =~ /#=GF\s+DR\s+SCOP;\s+(\w+);\s+sf/ )
+                scops.push( $1 )
+            elsif ( line =~ /^\/\// )
+                if ( id == nil )
+                    puts( "Pfam data file [" + pfamfile + "] format error [line: " + line + "]" )
+                    exit( -1 )
+                end
+                scops.each { |s|
+                    out.write( id )
+                    out.write( SEP )
+                    out.write( s )
+                    out.write( LINE_DELIMITER )
+                    scop_count += 1
+                }
+                id = nil
+                scops = Array.new()
+                count += 1
+            end
+        end
+    end
+
+    out.close
+
+    puts()
+    if ( SF )
+        puts( "Extracted #{scop_count} scop fa and sf identifiers for #{count.to_s} individual Pfams to " + outfile )
+    else
+        puts( "Extracted #{scop_count} scop fa identifiers for #{count.to_s} individual Pfams to " + outfile )
+    end
+    puts( "OK" )
+    puts()
+
+end # module ForesterScripts
\ No newline at end of file
diff --git a/forester/ruby/scripts/rb_dir_qsub.rb b/forester/ruby/scripts/rb_dir_qsub.rb
new file mode 100644 (file)
index 0000000..c2749bb
--- /dev/null
@@ -0,0 +1,159 @@
+#!/usr/local/bin/ruby -w
+#
+# = rb_dir_qsub
+#
+# Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: rb_dir_qsub.rb,v 1.15 2009/11/07 02:06:59 cmzmasek Exp $
+#
+# To execute qsub commands.
+# Submits PRG for every file in the current directory.
+#
+# Examples for PARAMETER_FILE:
+#
+# PRG:     /home/user/SOFTWARE/HMMER/hmmer-2.3.2/src/hmmpfam
+# OPT:     -E 20 -A 0 /home/user/DATA/PFAM/Pfam_ls
+# SUFFIX:  _hmmpfam_22_20_ls
+#
+# PRG:     /home/user/SOFTWARE/WUBLAST/tblastn
+# OPT:
+# VOPT:    AMPQU
+# VOPT:    HYDMA
+# SUFFIX:  _blast
+#
+# PRG:     /home/czmasek/SOFTWARE/HMMER/hmmer-3.0b2/binaries/intel-linux-x86_64/hmmscan
+# OPT:     -E 2 --notextw --qformat fasta /home/czmasek/DATA/PFAM/PFAM240/Pfam-A.hmm
+# SUFFIX:  .hmmscan30b2_240
+# OUTPUT:  --domtblout
+
+
+module ForesterScripts
+
+    if RUBY_VERSION !~ /1.9/
+        puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " )
+        exit( -1 )
+    end
+
+    PARAMETER_FILE = 'parameters.rb_dir_qsub'
+    SLEEP          = 1.0
+    REMOVE_SUFFIX  = true
+
+    PRG         = 'PRG:'
+    OPT         = 'OPT:'
+    VOPT        = 'VOPT:'
+    OUTPUT_OPT  = 'OUTPUT:'
+    SUFFIX      = 'SUFFIX:'
+    INPUT_PART  = 'INPUT_PART:'
+
+
+    PBS_O_WORKDIR       = '$PBS_O_WORKDIR/'
+    TMP_CMD_FILE_SUFFIX = '__QSUB'
+    NAME                = 'rb_dir_qsub'
+
+    if ( !File.exists?( PARAMETER_FILE ) )
+        puts( '[' + NAME + '] > parameters file "' + PARAMETER_FILE + '" not found' )
+        Process.exit!
+    end
+    puts( '[' + NAME + '] > reading ' + PARAMETER_FILE )
+
+    prg = ''
+    opt = ''
+    vopts = Array.new
+    suffix = ''
+    input_part = ''
+    output_opt = ''
+    open( PARAMETER_FILE ).each { |line|
+        if ( line.length > 1 && line =~ /^[^#]\S+/ )
+            if line =~ /^#{PRG}\s+(\S+)/
+                prg = $1
+            end
+            if line =~ /^\s*#{OPT}\s+(\S+.+)/
+                opt = $1
+            end
+            if line =~ /^\s*#{VOPT}\s+(\S+.+)/
+                vopts.push( $1 )
+            end
+            if line =~ /^\s*#{SUFFIX}\s+(\S+)/
+                suffix = $1
+            end
+            if line =~ /^\s*#{INPUT_PART}\s+(\S+)/
+                input_part = $1
+            end
+            if line =~ /^\s*#{OUTPUT_OPT}\s+(\S+.+)/
+                output_opt = $1
+            end
+        end
+    }
+    if ( prg.length < 1 )
+        puts( '[' + NAME + '] > no program name found in parameters file "' + PARAMETER_FILE + '"' )
+        Process.exit!
+    end
+    puts( '[' + NAME + '] > program: ' + prg )
+    puts( '[' + NAME + '] > option :  ' + opt )
+    vopts.each { |vopt|
+        puts( '[' + NAME + '] > voption:  ' + vopt )
+    }
+    puts( '[' + NAME + '] > suffix :  ' + suffix )
+    if ( input_part.length > 0 )
+        puts( '[' + NAME + '] > input:  ' + input_part )
+    end
+    if ( output_opt.length > 0 )
+        puts( '[' + NAME + '] > output opt :  ' + output_opt )
+    end
+    if vopts.empty?
+        vopts.push( "" )
+    end
+
+    files = Dir.entries( "." )
+
+    files.each { |file|
+        if ( !File.directory?( file ) && file !~ /^\./ && file !~ /#{PARAMETER_FILE}/ )
+
+            if ( input_part.length > 0 && file !~ /#{input_part}/ )
+                next
+            end
+            vopts.each { |vopt|
+                cmd = ""
+                outputfile = file.to_str
+                if REMOVE_SUFFIX
+                    if outputfile =~ /(.+)\..{1,5}/
+                        outputfile = $1
+                    end
+                end
+                if output_opt.length > 0
+                    cmd = prg + ' ' +
+                     output_opt + ' ' + PBS_O_WORKDIR + outputfile + suffix + ' ' +
+                     opt + ' ' +
+                     PBS_O_WORKDIR + file.to_str +
+                     ' > /dev/null'
+                elsif vopt.length > 0
+                    cmd = prg + ' ' + opt + ' ' + vopt + ' ' + PBS_O_WORKDIR + file.to_str +
+                     ' > ' + PBS_O_WORKDIR + vopt + "_" + outputfile + suffix
+                else
+                    cmd = prg + ' ' + opt + ' ' + PBS_O_WORKDIR + file.to_str +
+                     ' > ' + PBS_O_WORKDIR + outputfile + suffix
+                end
+                tmp_cmd_file = file.to_str + TMP_CMD_FILE_SUFFIX
+                if File.exists?( tmp_cmd_file )
+                    File.delete( tmp_cmd_file )
+                end
+                open( tmp_cmd_file, 'w' ) do |f|
+                    f.write( cmd )
+                end
+                puts( '[' + NAME + '] > excuting ' + cmd )
+                IO.popen( 'qsub ' + tmp_cmd_file , 'r+' ) do |pipe|
+                    pipe.close_write
+                    puts pipe.read
+                end
+                sleep( SLEEP )
+                if File.exists?( tmp_cmd_file )
+                    File.delete( tmp_cmd_file )
+                end
+            }
+        end
+    }
+    puts( '[' + NAME + '] > OK.' )
+    puts
+
+end
diff --git a/forester/ruby/scripts/rb_dir_x.rb b/forester/ruby/scripts/rb_dir_x.rb
new file mode 100644 (file)
index 0000000..b0e9ae1
--- /dev/null
@@ -0,0 +1,128 @@
+#!/usr/local/bin/ruby -w
+#
+# = rb_dir_x
+#
+# Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: rb_dir_x.rb,v 1.8 2008/09/16 23:31:39 cmzmasek Exp $
+#
+# To execute commands in the background (via nohup).
+# Runs PRG for every file in the current directory.
+#
+# Examples for PARAMETER_FILE:
+#
+# PRG:     /home/user/SOFTWARE/HMMER/hmmer-2.3.2/src/hmmpfam
+# OPT:     -E 20 -A 0 /home/user/DATA/PFAM/Pfam_ls
+# SUFFIX:  _hmmpfam_22_20_ls
+#
+# PRG:     /home/user/SOFTWARE/WUBLAST/tblastn
+# OPT:
+# VOPT:    AMPQU
+# VOPT:    HYDMA
+# SUFFIX:  _blast
+
+
+module ForesterScripts
+
+    if RUBY_VERSION !~ /1.9/
+        puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " )
+        exit( -1 )
+    end
+
+    PARAMETER_FILE    = 'parameters.rb_dir_x'
+    SLEEP = 1.0
+    SPAWN = true
+
+    PRG         = 'PRG:'
+    OPT         = 'OPT:'
+    VOPT        = 'VOPT:'
+    OUTPUT_OPT  = 'OUTPUT:' # TODO e.g. > or -o
+    SUFFIX      = 'SUFFIX:'
+    INPUT_PART  = 'INPUT_PART:'
+
+    NAME        = 'rb_dir_x'
+
+    if ( !File.exists?( PARAMETER_FILE ) )
+        puts( '[' + NAME + '] > parameters file "' + PARAMETER_FILE + '" not found' )
+        Process.exit!
+    end
+    puts( '[' + NAME + '] > reading ' + PARAMETER_FILE )
+
+    prg = ''
+    opt = ''
+    vopts = Array.new
+    suffix = ''
+    input_part = ''
+    open( PARAMETER_FILE ).each { |line|
+        if ( line.length > 1 && line =~ /^[^#]\S+/ )
+            if line =~ /^#{PRG}\s+(\S+)/
+                prg = $1
+            end
+            if line =~ /^\s*#{OPT}\s+(\S+.+)/
+                opt = $1
+            end
+            if line =~ /^\s*#{VOPT}\s+(\S+.+)/
+                vopts.push( $1 )
+            end
+            if line =~ /^\s*#{SUFFIX}\s+(\S+)/
+                suffix = $1
+            end
+            if line =~ /^\s*#{INPUT_PART}\s+(\S+)/
+                input_part = $1
+            end
+        end
+    }
+    if ( prg.length < 1 )
+        puts( '[' + NAME + '] > no program name found in parameters file "' + PARAMETER_FILE + '"' )
+        Process.exit!
+    end
+    puts( '[' + NAME + '] > program: ' + prg )
+    puts( '[' + NAME + '] > option :  ' + opt )
+    vopts.each { |vopt|
+        puts( '[' + NAME + '] > voption:  ' + vopt )
+    }
+    puts( '[' + NAME + '] > suffix :  ' + suffix )
+    if ( input_part.length > 0 )
+        puts( '[' + NAME + '] > input  :  ' + input_part )
+    end
+    if vopts.empty?
+        vopts.push( "" )
+    end
+
+    files = Dir.entries( "." )
+
+    files.each { |file|
+        if ( !File.directory?( file ) && file !~ /^\./ && file !~ /#{PARAMETER_FILE}/ )
+
+            if ( input_part.length > 0 && file !~ /#{input_part}/ )
+                next
+            end
+            vopts.each { |vopt|
+                cmd = ""
+                if vopt.length > 0
+                    cmd = 'nohup ' + prg + ' ' + opt + ' ' + vopt + ' ' + file.to_str +
+                     ' > ' + vopt + "_" + file.to_str + suffix + ' &'
+                else
+                    cmd = 'nohup ' + prg + ' ' + opt + ' ' + file.to_str +
+                     ' > ' + file.to_str + suffix + ' &'
+                end
+
+                puts( '[' + NAME + '] > excuting ' + cmd )
+                if SPAWN
+                    spawn( cmd, STDERR => "/dev/null" )
+                else
+                    IO.popen( cmd , 'r+' ) do |pipe|
+                        pipe.close_write
+                        puts pipe.read
+                    end
+                end
+                sleep( SLEEP )
+
+            }
+        end
+    }
+    puts( '[' + NAME + '] > OK.' )
+    puts
+
+end
diff --git a/forester/ruby/scripts/rb_qsub.rb b/forester/ruby/scripts/rb_qsub.rb
new file mode 100755 (executable)
index 0000000..829c3c6
--- /dev/null
@@ -0,0 +1,59 @@
+#!/usr/local/bin/ruby -w
+#
+# = rb_qsub 
+#
+# Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: rb_qsub.rb,v 1.6 2008/08/30 19:57:59 cmzmasek Exp $
+#
+# last modified: 11/13/2007
+#
+#
+# To execute qsub commands.
+# Each line l (unless preceded by a # or space) in file
+# 'commands.qsub' is executed as 'qsub l'
+
+
+module ForesterScripts
+
+    if RUBY_VERSION !~ /1.9/
+        puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " )
+        exit( -1 )
+    end     
+    
+    CMDS_FILE    = 'commands.qsub'
+    TMP_CMD_FILE = '__QSUB_RB_CMD__'
+    PRG_NAME     = 'rb_qsub'
+
+    if ( !File.exists?( CMDS_FILE ) ) 
+        puts( '[' +PRG_NAME + '] > commands file "' + CMDS_FILE + '" not found' )
+        Process.exit!          
+    end    
+    
+    puts( '[' +PRG_NAME + '] > reading ' + CMDS_FILE )
+
+    open( CMDS_FILE ).each { |line| 
+        if ( line.length > 1 && line =~ /^[^#]\S+/ )
+            if ( File.exists?( TMP_CMD_FILE ) ) 
+                File.delete( TMP_CMD_FILE ) 
+            end
+            open( TMP_CMD_FILE, 'w' ) do |f|
+                f.write( line )
+            end 
+            puts( '[' +PRG_NAME + '] > excuting ' + line )
+            IO.popen( 'qsub ' + TMP_CMD_FILE , 'r+' ) do |pipe|
+                pipe.close_write
+                puts pipe.read
+            end
+            if ( File.exists?( TMP_CMD_FILE ) ) 
+                File.delete( TMP_CMD_FILE ) 
+            end
+            sleep( 10.0 )
+        end
+    }
+    puts( '[' +PRG_NAME + '] > OK.' )
+    puts
+
+end
+
diff --git a/forester/ruby/scripts/replace.rb b/forester/ruby/scripts/replace.rb
new file mode 100755 (executable)
index 0000000..6672c62
--- /dev/null
@@ -0,0 +1,81 @@
+#!/usr/local/bin/ruby -w
+#
+# = replace 
+#
+# Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: replace.rb,v 1.5 2008/08/28 17:09:07 cmzmasek Exp $
+#
+# To replace multiple strings in file.
+# Map file contains instructions for replacement (one on each line)
+# in the following format (by example): old#new
+#
+
+
+module ForesterScripts
+    
+    if RUBY_VERSION !~ /1.9/
+                      puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " )
+                      exit( -1 )
+                end     
+    
+    if ( ARGV == nil || ARGV.length != 3 )
+        puts( "usage: replace.rb <map-file> <infile> <outfile>" )         
+        exit( -1 )
+    end    
+    mapfile = ARGV[ 0 ]
+    infile  = ARGV[ 1 ]
+    outfile = ARGV[ 2 ]
+    
+    
+    if ( File.exists?( outfile ) ) 
+        puts( "outfile [" + outfile + "] already exists" )
+        exit( -1 )  
+    end
+    if ( !File.exists?( infile) )
+        puts( "infile [" + infile + "] does not exist" )
+        exit( -1 ) 
+    end 
+    if ( !File.exists?( mapfile ) ) 
+        puts( "mapfile [" + mapfile + "] does not exist" )
+        exit( -1 ) 
+    end                
+    
+    old_new_map = Hash.new
+    
+    File.open( mapfile ) do | file |
+        while line = file.gets
+            if ( line =~/(\S+)\s*#\s*(\S+)/ )
+                old_new_map[ $1 ] = $2      
+                puts( $1 + ' => ' + $2 )     
+            end
+        end
+    end 
+    
+    if ( old_new_map.size < 1 ) 
+        puts( "mapping file was empty" )         
+        exit( -1 )    
+    end   
+    
+    data_str = String.new
+    
+    File.open( infile ) do | file |
+        while line = file.gets
+            data_str =  data_str + line.chomp
+        end 
+    end     
+    
+    old_new_map.each_pair{ |old, new|
+        data_str = data_str.gsub( old, new )
+    }
+    
+    open( outfile, 'w' ) do |file|
+        file.write( data_str )
+    end      
+    
+    puts( "wrote " + outfile )
+    
+end
+
+    
\ No newline at end of file
diff --git a/forester/ruby/scripts/replace_id.rb b/forester/ruby/scripts/replace_id.rb
new file mode 100644 (file)
index 0000000..df2c2ed
--- /dev/null
@@ -0,0 +1,88 @@
+#!/usr/local/bin/ruby -w
+#
+# = replace_id 
+#
+# Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: replace_id.rb,v 1.8 2008/08/28 17:09:07 cmzmasek Exp $
+#
+# To replace (by way of example) '123_CHI5' with '123_CHICKEN5'
+# given a mapping file containing '123_CHICKEN'
+# (in the form '123_CHICKEN: some description which is ignored').
+#
+# Note. This will break if the species id ends with a number (as is 
+# in the case for many bacteria).
+
+
+module ForesterScripts
+    
+if RUBY_VERSION !~ /1.9/
+                  puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " )
+                  exit( -1 )
+            end 
+
+    NUMBER_OF_LETTERS = 3
+
+    if ( ARGV == nil || ARGV.length != 3 )
+        puts( "usage: replace_id.rb <map-file> <infile> <outfile>" )         
+        exit( -1 )
+    end    
+    mapfile = ARGV[ 0 ]
+    infile  = ARGV[ 1 ]
+    outfile = ARGV[ 2 ]
+    
+    
+    if ( File.exists?( outfile ) ) 
+        puts( "outfile [" + outfile + "] already exists" )
+        exit( -1 )  
+    end
+    if ( !File.exists?( infile) )
+        puts( "infile [" + infile + "] does not exist" )
+        exit( -1 ) 
+    end 
+    if ( !File.exists?( mapfile ) ) 
+        puts( "mapfile [" + mapfile + "] does not exist" )
+        exit( -1 ) 
+    end                
+    
+    number_to_complete_id_map = Hash.new
+    
+    File.open( mapfile ) do | file |
+        while line = file.gets
+            if ( line =~ /(\d+_\S+)\s*:/ )
+                complete_id = $1
+                complete_id =~ /(\d+)_\S+/
+                number_to_complete_id_map[ $1 ] = complete_id     
+                puts( $1 + ' => ' + complete_id )
+            end
+        end
+    end 
+    
+    if ( number_to_complete_id_map.size < 1 ) 
+        puts( "mapping file was empty" )         
+        exit( -1 )    
+    end   
+    
+    data_str = String.new
+    
+    File.open( infile ) do | file |
+        while line = file.gets
+            data_str = data_str + line.chomp
+        end 
+    end     
+    
+    replacements = 0
+    number_to_complete_id_map.each_pair{ |number, id|
+        data_str.gsub!( /\b#{number}_[A-Z]{#{NUMBER_OF_LETTERS}}/, id )         
+    }
+    
+    open( outfile, 'w' ) do |file|
+        file.write( data_str )
+    end      
+    
+    puts( "wrote " + outfile )
+    puts( "OK" )
+    
+end
+
diff --git a/forester/ruby/scripts/scoptastic.rb b/forester/ruby/scripts/scoptastic.rb
new file mode 100755 (executable)
index 0000000..52e2ab5
--- /dev/null
@@ -0,0 +1,163 @@
+#!/usr/local/bin/ruby -w
+#
+# = scoptastic
+#
+# Copyright::  Copyright (C) 2008-2009 Christian M. Zmasek.
+#              All rights reserved.
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id: scoptastic.rb,v 1.3 2008/08/28 17:09:07 cmzmasek Exp $
+#
+# To create Pfam id to SCOP mappings, one for each of four levels of SCOP
+# classification.
+#
+# Created 2008-06-25 in San Diego, CA, USA by CMZ
+#
+# Usage: scoptastic.rb <Pfam id to ac map file, e.g.
+# pfam_summarize.rb output> <Pfam ac to SCOP classification map file> <Pfam id
+# to SCOP outfile root>
+
+
+require 'iconv'
+
+module ForesterScripts
+
+    # Builds Pfam id to SCOP mappings at four levels of the SCOP hierarchy.
+    #
+    # Reads two input files,
+    #   1. a Pfam id to Pfam accession map, and
+    #   2. a Pfam accession to SCOP classification map,
+    # joins them on the accession, and writes four tab-separated files named
+    # <outfile root> + one of the *_LEVEL_SUFFIX constants below. Each holds
+    # the Pfam id together with the SCOP identifier truncated to that level.
+    #
+    # Every failure prints a message to stdout and calls exit( -1 ).
+
+    # The script relies on the 'iconv' library and therefore insists on Ruby
+    # 1.9.x. The pattern is anchored and its dots are escaped so that other
+    # versions that merely contain "1.9" (e.g. "2.1.9") are rejected as well.
+    if RUBY_VERSION !~ /\A1\.9\./
+        puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " )
+        exit( -1 )
+    end
+
+    # Suffixes appended to the outfile root; one output file per SCOP level.
+    CLASS_LEVEL_SUFFIX       = "_SCOP_2_CLASS"
+    FOLD_LEVEL_SUFFIX        = "_SCOP_3_FOLD"
+    SUPERFAMILY_LEVEL_SUFFIX = "_SCOP_4_SUPERFAMILY"
+    FAMILY_LEVEL_SUFFIX      = "_SCOP_5_FAMILY"
+
+    SEP = "\t"
+    LINE_DELIMITER  = "\n"
+
+    # Each output suffix paired with the number of leading dot-separated
+    # components of the SCOP identifier that are written at that level.
+    levels = [ [ CLASS_LEVEL_SUFFIX,       1 ],
+               [ FOLD_LEVEL_SUFFIX,        2 ],
+               [ SUPERFAMILY_LEVEL_SUFFIX, 3 ],
+               [ FAMILY_LEVEL_SUFFIX,      4 ] ]
+
+    # Exactly three command line arguments are required.
+    if ( ARGV.length != 3 )
+        puts( "usage: scoptastic.rb <Pfam id to ac map file, e.g. pfam_summarize.rb output> <Pfam ac to SCOP classification map file> <Pfam id to SCOP outfile root>" )
+        exit( -1 )
+    end
+
+    pfam_id_to_ac   = ARGV[ 0 ]
+    pfam_ac_to_scop = ARGV[ 1 ]
+    outfile         = ARGV[ 2 ]
+
+    # Both input files must exist.
+    if ( !File.exist?( pfam_id_to_ac ) )
+        puts( "Pfam id to ac map file [" + pfam_id_to_ac + "] does not exist" )
+        exit( -1 )
+    end
+    if ( !File.exist?( pfam_ac_to_scop ) )
+        puts( "Pfam ac to SCOP classification map file [" + pfam_ac_to_scop + "] does not exist" )
+        exit( -1 )
+    end
+
+    # None of the four output files may exist yet; nothing is overwritten.
+    levels.each do | suffix, _depth |
+        if ( File.exist?( outfile + suffix ) )
+            puts( "Outfile [" + outfile + suffix + "] already exists" )
+            exit( -1 )
+        end
+    end
+
+    # NOTE(review): 'UTF-8//IGNORE' presumably makes the converter drop byte
+    # sequences that are not valid UTF-8 -- confirm against the Iconv docs.
+    ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' )
+
+    # Pfam accession (PF<digits>) => Pfam id
+    pfam_ac_to_id_map = Hash.new
+    # Pfam accession (PF<digits>) => SCOP classification (e.g. "a.1.2.3")
+    pfam_ac_to_scop_map = Hash.new
+
+    # Read the Pfam id to accession map: "<id> <PF<digits>>" on each line.
+    count = 0
+    File.open( pfam_id_to_ac ) do | file |
+        file.each_line do | raw_line |
+            line = ic.iconv( raw_line )
+            # Skip comment lines and lines containing only whitespace.
+            next if ( line =~ /^#/ || line !~ /\S/ )
+            if ( line =~ /^(\S+)\s+(PF\d+)/ )
+                pfam_ac_to_id_map[ $2 ] = $1
+                count += 1
+            else
+                puts( "Pfam id to ac map file [" + pfam_id_to_ac + "] format error [line: " + line + "]" )
+                exit( -1 )
+            end
+        end
+    end
+    puts()
+    puts( "Extracted #{count} Pfam id to ac mappings from file [#{pfam_id_to_ac}]" )
+
+    # Read the Pfam accession to SCOP map: "<PF<digits>>[.<digits>] <SCOP>" on
+    # each line; an optional ".<digits>" suffix of the accession is not kept.
+    count = 0
+    File.open( pfam_ac_to_scop ) do | file |
+        file.each_line do | raw_line |
+            line = ic.iconv( raw_line )
+            next if ( line =~ /^#/ || line !~ /\S/ )
+            if ( line =~ /^(PF\d+)\.?\d*\s+([a-z]\.\d+\.\d+\.\d+)/ )
+                pfam_ac_to_scop_map[ $1 ] = $2
+                count += 1
+            else
+                puts( "Pfam ac to SCOP classification map file [" + pfam_ac_to_scop + "] format error [line: " + line + "]" )
+                exit( -1 )
+            end
+        end
+    end
+
+    puts( "Extracted #{count} Pfam ac to SCOP classification mappings from file [#{pfam_ac_to_scop}]" )
+
+    # Verify every accession up front, before any output file is created, so
+    # that a missing mapping cannot leave partially written files behind
+    # (they would make the next run fail with "already exists").
+    pfam_ac_to_scop_map.each_key do | pfam_ac |
+        if ( !pfam_ac_to_id_map.has_key?( pfam_ac ) )
+            puts( "Pfam ac #{pfam_ac} not found in Pfam id to ac map file [" + pfam_id_to_ac + "]" )
+            exit( -1 )
+        end
+    end
+
+    # Write one "<Pfam id><TAB><SCOP prefix>" line per mapping to each of the
+    # four files; the block form of File.open closes each file when done.
+    levels.each do | suffix, depth |
+        File.open( outfile + suffix, 'w' ) do | out |
+            pfam_ac_to_scop_map.each do | pfam_ac, scop |
+                scop_at_level = scop.split( "." ).first( depth ).join( "." )
+                out.write( pfam_ac_to_id_map[ pfam_ac ] + SEP + scop_at_level + LINE_DELIMITER )
+            end
+        end
+    end
+    count = pfam_ac_to_scop_map.size
+
+    puts()
+    puts( "Wrote #{count} Pfam id to SCOP mappings to files '#{outfile + CLASS_LEVEL_SUFFIX}', '#{outfile + FOLD_LEVEL_SUFFIX}', '#{outfile + SUPERFAMILY_LEVEL_SUFFIX}', and '#{ outfile + FAMILY_LEVEL_SUFFIX}'" )
+    puts( "OK" )
+    puts()
+
+end # module ForesterScripts
+