From: cmzmasek@gmail.com Date: Wed, 9 Feb 2011 01:20:06 +0000 (+0000) Subject: initial commit X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=253a714c2440558d7c70fe30cac5d7b093798968;p=jalview.git initial commit --- diff --git a/forester/ruby/00_README.txt b/forester/ruby/00_README.txt new file mode 100644 index 0000000..28f1768 --- /dev/null +++ b/forester/ruby/00_README.txt @@ -0,0 +1,5 @@ +This folder contains the "evoruby" Ruby classes. +They are not (yet) essential for the rest of the FORESTER package. + +To use evoruby, add "path/to/forester-atv/ruby/evoruby" to RUBYLIB. +(e.g., for C shell: "setenv RUBYLIB $HOME/forester-atv/ruby/evoruby:$HOME/some/other/rubylibrary") \ No newline at end of file diff --git a/forester/ruby/evoruby/exe/d2f.rb b/forester/ruby/evoruby/exe/d2f.rb new file mode 100755 index 0000000..cb48db9 --- /dev/null +++ b/forester/ruby/evoruby/exe/d2f.rb @@ -0,0 +1,20 @@ +#!/usr/local/bin/ruby -w +# +# = exe/d2f +# +# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: d2f.rb,v 1.3 2008/08/28 17:09:06 cmzmasek Exp $ +# +# last modified: 06/11/2007 + +require 'lib/evo/apps/domains_to_forester' + +module Evoruby + + dtf = DomainsToForester.new() + + dtf.run() + +end # module Evoruby diff --git a/forester/ruby/evoruby/exe/dsx.rb b/forester/ruby/evoruby/exe/dsx.rb new file mode 100755 index 0000000..1ff35f7 --- /dev/null +++ b/forester/ruby/evoruby/exe/dsx.rb @@ -0,0 +1,20 @@ +#!/usr/local/bin/ruby -w +# +# = exe/dsx +# +# Copyright:: Copyright (C) 2006-2007 Christian M. 
Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: dsx.rb,v 1.3 2008/08/28 17:09:06 cmzmasek Exp $ +# +# last modified: 06/11/2007 + +require 'lib/evo/apps/domain_sequence_extractor' + +module Evoruby + + dsx = DomainSequenceExtractor.new() + + dsx.run() + +end # module Evoruby diff --git a/forester/ruby/evoruby/exe/evo_nursery.rb b/forester/ruby/evoruby/exe/evo_nursery.rb new file mode 100755 index 0000000..0ea157d --- /dev/null +++ b/forester/ruby/evoruby/exe/evo_nursery.rb @@ -0,0 +1,20 @@ +#!/usr/local/bin/ruby -W0 +# +# = exe/evo_nursery +# +# Copyright:: Copyright (C) 2009-2010 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: evo_nursery.rb,v 1.1 2009/10/07 21:59:41 cmzmasek Exp $ +# +# last modified: 2009/10/07 + +require 'lib/evo/apps/evo_nursery' + +module Evoruby + + en = EvoNursery.new() + + en.run() + +end # module Evoruby \ No newline at end of file diff --git a/forester/ruby/evoruby/exe/fae.rb b/forester/ruby/evoruby/exe/fae.rb new file mode 100755 index 0000000..009fa0b --- /dev/null +++ b/forester/ruby/evoruby/exe/fae.rb @@ -0,0 +1,19 @@ +#!/usr/local/bin/ruby -w +# +# = exe/fae +# +# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: fae.rb,v 1.1 2008/09/10 02:16:34 cmzmasek Exp $ + + +require 'lib/evo/apps/fasta_extractor' + +module Evoruby + + mse = FastaExtractor.new() + + mse.run() + +end # module Evoruby \ No newline at end of file diff --git a/forester/ruby/evoruby/exe/fasta_tap.rb b/forester/ruby/evoruby/exe/fasta_tap.rb new file mode 100755 index 0000000..60afce9 --- /dev/null +++ b/forester/ruby/evoruby/exe/fasta_tap.rb @@ -0,0 +1,19 @@ +#!/usr/local/bin/ruby -w +# +# = exe/fasta_tap +# +# Copyright:: Copyright (C) 2006-2007 Christian M. 
Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: fasta_tap.rb,v 1.1 2009/01/20 20:44:54 cmzmasek Exp $ + + +require 'lib/evo/apps/fasta_taxonomy_processor' + +module Evoruby + + tap = FastaTaxonomyProcessor.new() + + tap.run() + +end # module Evoruby diff --git a/forester/ruby/evoruby/exe/hsp.rb b/forester/ruby/evoruby/exe/hsp.rb new file mode 100755 index 0000000..cf0febc --- /dev/null +++ b/forester/ruby/evoruby/exe/hsp.rb @@ -0,0 +1,20 @@ +#!/usr/local/bin/ruby -w +# +# = exe/hsp +# +# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: hsp.rb,v 1.1 2009/11/25 05:42:04 cmzmasek Exp $ +# +# last modified: 11/24/2009 + +require 'lib/evo/apps/hmmscan_parser' + +module Evoruby + + hsp = HmmscanParser.new() + + hsp.run() + +end # module Evoruby diff --git a/forester/ruby/evoruby/exe/msa_pro.rb b/forester/ruby/evoruby/exe/msa_pro.rb new file mode 100755 index 0000000..3fd2566 --- /dev/null +++ b/forester/ruby/evoruby/exe/msa_pro.rb @@ -0,0 +1,20 @@ +#!/usr/local/bin/ruby -w +# +# = exe/msa_pro +# +# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: msa_pro.rb,v 1.4 2008/08/28 17:09:06 cmzmasek Exp $ +# + + +require 'lib/evo/apps/msa_processor' + +module Evoruby + + mp = MsaProcessor.new() + + mp.run() + +end # module Evoruby diff --git a/forester/ruby/evoruby/exe/mse.rb b/forester/ruby/evoruby/exe/mse.rb new file mode 100755 index 0000000..d0d7612 --- /dev/null +++ b/forester/ruby/evoruby/exe/mse.rb @@ -0,0 +1,19 @@ +#!/usr/local/bin/ruby -w +# +# = exe/mse +# +# Copyright:: Copyright (C) 2006-2007 Christian M. 
Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: mse.rb,v 1.2 2008/08/28 17:09:06 cmzmasek Exp $ + + +require 'lib/evo/apps/multi_sequence_extractor' + +module Evoruby + + mse = MultiSequenceExtractor.new() + + mse.run() + +end # module Evoruby \ No newline at end of file diff --git a/forester/ruby/evoruby/exe/phylogenies_decorator.rb b/forester/ruby/evoruby/exe/phylogenies_decorator.rb new file mode 100755 index 0000000..a23861b --- /dev/null +++ b/forester/ruby/evoruby/exe/phylogenies_decorator.rb @@ -0,0 +1,20 @@ +#!/usr/local/bin/ruby -w +# +# = exe/phylogenies_decorator +# +# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: phylogenies_decorator.rb,v 1.3 2008/08/28 17:09:06 cmzmasek Exp $ +# + + +require 'lib/evo/apps/phylogenies_decorator' + +module Evoruby + + fd = PhylogeniesDecorator.new + + fd.run + +end # module Evoruby diff --git a/forester/ruby/evoruby/exe/phylogeny_factory.rb b/forester/ruby/evoruby/exe/phylogeny_factory.rb new file mode 100644 index 0000000..4682571 --- /dev/null +++ b/forester/ruby/evoruby/exe/phylogeny_factory.rb @@ -0,0 +1,20 @@ +#!/usr/local/bin/ruby -w +# +# = exe/phylogeny_factory +# +# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: phylogeny_factory.rb,v 1.5 2008/08/28 17:09:06 cmzmasek Exp $ +# + + +require 'lib/evo/apps/phylogeny_factory' + +module Evoruby + + pf = PhylogenyFactory.new + + pf.run + +end # module Evoruby diff --git a/forester/ruby/evoruby/exe/run_phylo_pipeline.rb b/forester/ruby/evoruby/exe/run_phylo_pipeline.rb new file mode 100755 index 0000000..d68c58b --- /dev/null +++ b/forester/ruby/evoruby/exe/run_phylo_pipeline.rb @@ -0,0 +1,81 @@ +#!/usr/local/bin/ruby -w +# +# = run_phylo_pipeline +# +# Copyright:: Copyright (C) 2010 Christian M. 
Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: run_phylo_pipeline.rb,v 1.15 2010/10/09 02:35:42 cmzmasek Exp $ +# +# + + +# hmmscan --nobias --domtblout _hmmscan_240_10 -E 10 /home/czmasek/DATA/PFAM/PFAM240/Pfam-A.hmm .fasta + +# hsp _hmmscan_240_10 _hmmscan_240_10_domain_table + +# d2f -e=10 _hmmscan_240_10_domain_table .fasta _hmmscan_240_10.dff + +# hmmsearch --nobias -E 1000 --domtblout .hmmsearch_SusD <~/DATA/PFAM/PFAM240/PFAM_A_HMMs/SusD.hmm> BACTH_CHIPI.fasta + +# dsx -dd -e=<1e-2> -l=<200> .hmmsearch_SusD .fasta BACTH_CHIPI_e2_200 + + +module Evoruby + + class RunPhyloPipeline + + def run + unless ARGV.length == 4 + puts "arguments are: [inputfile].fasta [hmm-name] [min-length] [neg e-value exponent]" + exit + end + + hmmscan = "/home/czmasek/SOFTWARE/HMMER/hmmer-3.0b3/src/hmmscan" + hmmsearch = "/home/czmasek/SOFTWARE/HMMER/hmmer-3.0b3/src/hmmsearch" + hsp = "/home/czmasek/SOFTWARE/FORESTER/DEV/forester-atv/ruby/evoruby/exe/hsp.rb" + d2f = "/home/czmasek/SOFTWARE/FORESTER/DEV/forester-atv/ruby/evoruby/exe/d2f.rb" + dsx = "/home/czmasek/SOFTWARE/FORESTER/DEV/forester-atv/ruby/evoruby/exe/dsx.rb" + + base_name = ARGV[ 0 ] + hmm = ARGV[ 1 ] + length = ARGV[ 2 ] + e_value_exp = ARGV[ 3 ] + do_domain_combination_analysis = true + + if do_domain_combination_analysis + + cmd = "#{hmmscan} --nobias --domtblout #{base_name}_hmmscan_240_10 -E 10 /home/czmasek/DATA/PFAM/PFAM240/Pfam-A.hmm #{base_name}.fasta" + run_command( cmd ) + + cmd = "#{hsp} #{base_name}_hmmscan_240_10 #{base_name}_hmmscan_240_10_domain_table" + run_command( cmd ) + + cmd = "#{d2f} -e=10 #{base_name}_hmmscan_240_10_domain_table #{base_name}.fasta #{base_name}_hmmscan_240_10.dff" + run_command( cmd ) + + end + + cmd = "#{hmmsearch} --nobias -E 1000 --domtblout #{base_name}.hmmsearch_#{hmm} ~/DATA/PFAM/PFAM240/PFAM_A_HMMs/#{hmm}.hmm #{base_name}.fasta" + run_command( cmd ) + + cmd = "#{dsx} -dd -e=1e-#{e_value_exp.to_s} -l=#{length} #{base_name}.hmmsearch_#{hmm} 
#{base_name}.fasta #{base_name}_e#{e_value_exp.to_s}_#{length}" + run_command( cmd ) + + end + + def run_command( cmd ) + puts cmd + `#{cmd}` + end + + end + + p = RunPhyloPipeline.new() + + p.run() + +end + + + diff --git a/forester/ruby/evoruby/exe/tap.rb b/forester/ruby/evoruby/exe/tap.rb new file mode 100755 index 0000000..12af133 --- /dev/null +++ b/forester/ruby/evoruby/exe/tap.rb @@ -0,0 +1,20 @@ +#!/usr/local/bin/ruby -w +# +# = exe/tap +# +# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: tap.rb,v 1.3 2008/08/28 17:09:06 cmzmasek Exp $ +# +# last modified: 05/18/2007 + +require 'lib/evo/apps/taxonomy_processor' + +module Evoruby + + tap = TaxonomyProcessor.new() + + tap.run() + +end # module Evoruby diff --git a/forester/ruby/evoruby/exe/test.rb b/forester/ruby/evoruby/exe/test.rb new file mode 100755 index 0000000..8a4431d --- /dev/null +++ b/forester/ruby/evoruby/exe/test.rb @@ -0,0 +1,1171 @@ +#!/usr/local/bin/ruby -w +# +# = exe/test - Test class +# +# Copyright:: Copyright (C) 2006-2007 Christian M. 
Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: test.rb,v 1.18 2010/10/08 22:04:17 cmzmasek Exp $ +# +# last modified: 05/15/2007 + + +require 'lib/evo/util/constants' +require 'lib/evo/taxonomy/taxonomy' +require 'lib/evo/sequence/sequence' +require 'lib/evo/msa/msa' +require 'lib/evo/msa/msa_factory' +require 'lib/evo/sequence/domain_structure' +require 'lib/evo/sequence/protein_domain' +require 'lib/evo/table/basic_table' +require 'lib/evo/io/msa_io' +require 'lib/evo/io/writer/phylip_sequential_writer' +require 'lib/evo/io/writer/nexus_writer' +require 'lib/evo/io/writer/fasta_writer' +require 'lib/evo/io/parser/fasta_parser' +require 'lib/evo/io/parser/ncbi_tseq_parser' +require 'lib/evo/io/parser/hmmsearch_domain_extractor' +require 'lib/evo/apps/domain_sequence_extractor' +require 'lib/evo/apps/hmmscan_parser' +require 'lib/evo/apps/domains_to_forester' +require 'lib/evo/io/parser/general_msa_parser' +require 'lib/evo/io/parser/basic_table_parser' +require 'lib/evo/util/command_line_arguments' +require 'lib/evo/soft/fastme' +require 'lib/evo/soft/tree_puzzle' + + + +module Evoruby + + class Test + + GENERAL_MSA_FILE = "files/test/general_msa_file.txt" + FASTA_FILE = "files/test/fasta_file.txt" + TSEQ_FILE = "files/test/ncbi_tseq.xml" + + def initialize() + @failures = 0 + @successes = 0 + end + + + + def test_taxonomy() + begin + tax = Taxonomy.new( "pig" ) + + if tax.get_name != "pig" + return false + end + + tax1 = Taxonomy.new( "dog", "id", "source" ) + tax2 = tax1.copy + + if tax2.get_name != "dog" + return false + end + if tax2.get_id != "id" + return false + end + if tax2.get_id_source != "source" + return false + end + + if !( tax1 == tax2 ) + return false + end + + if !( tax1 == tax1 ) + return false + end + + tax3 = Taxonomy.new( "dog", "id" ) + if ( tax1 == tax3 ) + return false + end + + tax4 = Taxonomy.new( "dog" ) + tax5 = Taxonomy.new( "dog" ) + if !( tax4 == tax5 ) + return false + end + + rescue Exception => e + puts() 
+ puts( e.to_s ) + puts() + return false + end + return true + end + + + def test_sequence() + begin + seq = Sequence.new( "seq1", "WLIQ" ) + if ( seq.get_length != 4 ) + return false + end + if ( seq.get_residue( 3 ) != "Q" ) + return false + end + seq.append!( "E?-*X_Y" ) + if ( seq.get_length != 11 ) + return false + end + if ( seq.get_residue( 3 ) != "Q" ) + return false + end + if ( seq.get_residue( 4 ) != "E" ) + return false + end + seq.append!( "A V_" ) + if ( seq.get_length != 15 ) + return false + end + if ( !Test::same?( seq.get_gap_length, 5 ) ) + return false + end + if ( !Test::same?( seq.get_gap_ratio, 5.0 / 15.0 ) ) + return false + end + seq.delete_residue!( 0 ) + seq.delete_residue!( 2 ) + seq2 = seq.copy() + seq.delete_residue!( 0 ) + seq.delete_residue!( 0 ) + seq = nil + if ( seq2.get_length != 13 ) + return false + end + if ( seq2.get_sequence_as_string != "LIE?-*X_YA V_" ) + return false + end + if ( seq2.get_slice( 2, 2 ) != "E?" ) + return false + end + if ( seq2.get_slice( 0, 1 ) != "L" ) + return false + end + if ( seq2.get_subsequence( 1, 4 ).get_sequence_as_string != "IE?-" ) + return false + end + if ( seq2.get_name() != "seq1" ) + return false + end + if ( seq2.get_slice!( 2, 2 ) != "E?" ) + return false + end + if ( seq2.get_sequence_as_string != "LI-*X_YA V_" ) + return false + end + if ( seq2.get_length != 11 ) + return false + end + if ( seq2.get_character_code( 0 ) != 76 ) + return false + end + str_0 = " Li-*X_YA V_ 3 3 1212 ?? 
B1J OU.Z " + if ( Util.clean_seq_str( str_0 ) != "LI-X-YAV-XXXXXX-X" ) + return false + end + + tax = Taxonomy.new( "dog", "tax_id", "tax_source" ) + seqn = Sequence.new( "seqn", "VVVVV", "acc", "acc source", tax, "symbol", "2accession", "2source" ) + seqc = seqn.copy + if ( seqc.get_name() != "seqn" ) + return false + end + if ( seqc.get_accession() != "acc" ) + return false + end + if ( seqc.get_accession_source() != "acc source" ) + return false + end + if ( seqc.get_taxonomy.get_name != "dog" ) + return false + end + if ( seqc.get_taxonomy.get_id != "tax_id" ) + return false + end + if ( seqc.get_symbol != "symbol" ) + return false + end + if ( seqc.get_secondary_accession != "2accession" ) + return false + end + if ( seqc.get_secondary_accession_source != "2source" ) + return false + end + + rescue Exception => e + puts() + puts( e.to_s ) + puts() + return false + end + return true + end + + + def test_msa() + begin + msa = Msa.new() + seq0 = Sequence.new( "seq 0", "a-*-_ x-ijklmnopqrstuvwxyz" ) + seq1 = Sequence.new( "seq 1", "ab--_ X-ijklmnopqrstuvwxyz" ) + seq2 = Sequence.new( "seq 2", "abc-_?x-ijklmnopqrstuvwxyz" ) + seq3 = Sequence.new( "seq 3", "abcd_?x-ijklmnopqrstuvwxyz" ) + seq4 = Sequence.new( "seq 4", "abcde?x-ijklmnopqrstuvwxyz" ) + seq5 = Sequence.new( "seq 5", "abcdefx-ijklmnopqrstuvwxyz" ) + msa.add_sequence( seq0 ); + msa.add_sequence( seq1 ); + msa.add_sequence( seq2 ); + msa.add_sequence( seq3 ); + msa.add_sequence( seq4 ); + msa.add_sequence( seq5 ); + msa.add( "seq 6", "abcdefg-ijklmnopqrstuvwxyz" ); + if ( msa.get_sequence( 0 ).get_name() != "seq 0" ) + return false + end + if ( msa.get_by_name( "Eq 1", false, true ).get_name != "seq 1" ) + return false + end + if ( msa.find_by_name( "Eq 2", false, true )[ 0 ] != 2 ) + return false + end + if ( !msa.is_aligned ) + return false + end + if ( msa.get_number_of_seqs != 7 ) + return false + end + if ( msa.get_length != 26 ) + return false + end + msa.add( "seq 7", "abcdefgqijklmnopqrstuvwxyz" 
); + if ( msa.get_number_of_seqs != 8 ) + return false + end + msa.remove_sequence!( 7 ) + if ( msa.get_number_of_seqs != 7 ) + return false + end + msa.remove_gap_only_columns!() + if ( msa.get_length() != 25 ) + return false + end + if ( msa.get_by_name( "seq 0" ).get_sequence_as_string != "a-*-_ xijklmnopqrstuvwxyz" ) + return false + end + msa.remove_gap_columns_w_gap_ratio!( 6.1 / 7.0 ) + if ( msa.get_length() != 25 ) + return false + end + msa.remove_gap_columns_w_gap_ratio!( 6.0 / 7.0 ) + if ( msa.get_length() != 25 ) + return false + end + if ( msa.get_by_name( "seq 0" ).get_sequence_as_string != "a-*-_ xijklmnopqrstuvwxyz" ) + return false + end + msa.remove_gap_columns_w_gap_ratio!( 5.0 / 7.0 ) + if ( msa.get_length() != 25 ) + return false + end + if ( msa.get_by_name( "seq 0" ).get_sequence_as_string != "a-*-_ xijklmnopqrstuvwxyz" ) + return false + end + msa.remove_gap_columns_w_gap_ratio!( 2.0 / 7.0 ) + if ( msa.get_length() != 23 ) + return false + end + if ( msa.get_by_name( "seq 0" ).get_sequence_as_string != "a-* xijklmnopqrstuvwxyz" ) + puts msa.get_by_name( "seq 0" ).get_sequence_as_string + return false + end + msa.remove_gap_columns_w_gap_ratio!( 1.0 / 7.0 ) + if ( msa.get_length() != 21 ) + return false + end + if ( msa.get_by_name( "seq 0" ).get_sequence_as_string != "a-xijklmnopqrstuvwxyz" ) + return false + end + msa2 = Evoruby::Msa.new() + msa2.add( "seq0", "abcdefgh" ); + msa2.add( "seq1", "a-cdefgh" ); + msa2.add( "seq2", "a--defgh" ); + msa2.add( "seq3", "a---efgh" ); + msa2.add( "seq4", "a----fgh" ); + msa2.add( "seq5", "a" ); + if ( msa2.is_aligned ) + return false + end + msa2.remove_sequence!( 5 ) + if ( !msa2.is_aligned ) + return false + end + if ( msa2.get_number_of_seqs != 5 ) + return false + end + msa2.remove_gap_only_columns!() + + if ( msa2.get_length != 8 ) + return false + end + + msa2.remove_sequences_by_gap_ratio!( 4.0 / 8.0 ) + if ( msa2.get_number_of_seqs != 5 ) + return false + end + 
msa2.remove_sequences_by_gap_ratio!( 3.0 / 8.0 ) + if ( msa2.get_number_of_seqs != 4 ) + return false + end + msa2.remove_sequences_by_gap_ratio!( 1.0 / 8.0 ) + if ( msa2.get_number_of_seqs != 2 ) + return false + end + msa2.remove_sequences_by_gap_ratio!( 0.0 ) + if ( msa2.get_number_of_seqs != 1 ) + return false + end + msa2.add( "seq1", "a-cdefgh" ); + msa2.add( "seq2", "a--defgh" ); + msa2.add( "seq3", "a---efgh" ); + msa2.add( "seq4", "a----fgh" ); + + msa2.remove_sequences_by_non_gap_length!( 4 ) + if ( msa2.get_number_of_seqs != 5 ) + return false + end + msa2.remove_sequences_by_non_gap_length!( 5 ) + if ( msa2.get_number_of_seqs != 4 ) + return false + end + msa2.remove_sequences_by_non_gap_length!( 8 ) + if ( msa2.get_number_of_seqs != 1 ) + return false + end + msa2.add( "seq1", "a-cdefgh" ); + msa2.add( "seq2", "a--defgh" ); + msa2.add( "seq3", "a---efgh" ); + msa2.add( "seq4", "a----fgh" ); + msa2.trim!( 0, 7 ) + if ( msa2.get_by_name( "seq0" ).get_sequence_as_string != "abcdefgh" ) + return false + end + msa2.trim!( 3, 4 ) + if ( msa2.get_by_name( "seq0" ).get_sequence_as_string != "de" ) + return false + end + msa3 = Evoruby::Msa.new() + msa3.add( "seq0", "abcdefgh-abcdef--*" ); + msa3.add( "seq1", "b-deefgh-a____f--*" ); + msa3.add( "seq2", "A________abcdef--*" ); + msa3.add( "seq3", "A Efgh---------*" ); + msa3.add( "seq4", " eFhh---------*" ); + msa3.add( "seq5", "----------------ee" ); + if ( !Test::same?( msa3.calculate_overlap( 0, 0 ), 14 ) ) + return false + end + if ( !Test::same?( msa3.calculate_overlap( 0, 1 ), 9 ) ) + return false + end + if ( !Test::same?( msa3.calculate_overlap( 0, 5 ), 0 ) ) + return false + end + if ( !Test::same?( msa3.calculate_overlap( 4, 5 ), 0 ) ) + return false + end + if ( !msa3.overlap?( 2, 3 ) ) + return false + end + if ( msa3.overlap?( 2, 3, 2 ) ) + return false + end + if ( msa3.overlap?( 4, 5 ) ) + return false + end + if ( !Test::same?( msa3.calculate_identities( 4, 5 ), 0 ) ) + return false + end + if ( 
!Test::same?( msa3.calculate_identities( 3, 4 ), 3 ) ) + return false + end + if ( msa3.split_into_overlapping_msa.length != 3 ) + return false + end + if ( msa3.split_into_overlapping_msa( 5 ).length != 4 ) + return false + end + + + msa4 = Msa.new() + seq0 = Sequence.new( "seq 0", "ABCDED" ) + seq1 = Sequence.new( "seq 1", "ABCDEE" ) + seq2 = Sequence.new( "seq 2", "abcded" ) + seq3 = Sequence.new( "seq 3", " ABCDEE" ) + seq4 = Sequence.new( "seq 4", "ABCDEV" ) + seq5 = Sequence.new( "seq 5", "ABCDED" ) + seq6 = Sequence.new( "seq 6", "AB.DEI" ) + seq7 = Sequence.new( "seq 7", "aB-DEi*" ) + seq8 = Sequence.new( "seq 8", "ABCDED" ) + seq9 = Sequence.new( "seq 9", "ABCDED" ) + seq10 = Sequence.new( "seq 10", "ABCDED" ) + seq11 = Sequence.new( "seq 11", "ABCDED" ) + msa4.add_sequence( seq0 ); + msa4.add_sequence( seq1 ); + msa4.add_sequence( seq2 ); + msa4.add_sequence( seq3 ); + msa4.add_sequence( seq4 ); + msa4.add_sequence( seq5 ); + msa4.add_sequence( seq6 ); + msa4.add_sequence( seq7 ); + msa4.add_sequence( seq8 ); + msa4.add_sequence( seq9 ); + msa4.add_sequence( seq10 ); + msa4.add_sequence( seq11 ); + + msa4.remove_redundant_sequences! 
+ + puts msa4.to_str + + if msa4.get_number_of_seqs != 4 + return false + end + + if msa4.get_sequence( 0 ).get_name != "seq 0" + return false + end + if msa4.get_sequence( 1 ).get_name != "seq 1" + return false + end + if msa4.get_sequence( 2 ).get_name != "seq 4" + return false + end + if msa4.get_sequence( 3 ).get_name != "seq 6" + return false + end + + rescue Exception => e + puts() + puts( e.to_s ) + puts() + return false + end + return true + end + + def test_msa_factory() + begin + f = MsaFactory.new() + rescue Exception => e + puts() + puts( e.to_s ) + puts() + return false + end + return true + end + + def test_domain_structure() + begin + ds = DomainStructure.new( 190 ) + rescue Exception => e + puts() + puts( e.to_s ) + puts() + return false + end + return true + end + + def test_protein_domain() + begin + ds = ProteinDomain.new( "domain", 23, 466, "d1", 0.4 ) + rescue Exception => e + puts() + puts( e.to_s ) + puts() + return false + end + return true + end + + def test_basic_table() + begin + t = BasicTable.new() + t.set_value( 233, 923, "snake" ) + t.set_value( 233, 923, "lizard" ) + if ( t.get_value_as_string( 233, 923 ) != "lizard" ) + return false + end + if ( t.get_value_as_string( 33, 23 ) != "" ) + return false + end + rescue Exception => e + puts() + puts( e.to_s ) + puts() + return false + end + return true + end + + def test_msa_io() + begin + msaio = MsaIO.new() + rescue Exception => e + puts() + puts( e.to_s ) + puts() + return false + end + return true + end + + def test_phylip_sequentialwriter() + begin + p = PhylipSequentialWriter.new() + rescue Exception => e + puts() + puts( e.to_s ) + puts() + return false + end + return true + end + + def test_nexus_writer() + begin + n = NexusWriter.new() + rescue Exception => e + puts() + puts( e.to_s ) + puts() + return false + end + return true + end + + def test_fasta_writer() + begin + f = FastaWriter.new() + rescue Exception => e + puts() + puts( e.to_s ) + puts() + return false + end + 
return true + end + + def test_general_msa_parser( path_to_evoruby ) + begin + g = GeneralMsaParser.new() + f = MsaFactory.new() + sep = "" + if ( !Util::is_string_empty?( path_to_evoruby ) ) + sep = Constants::FILE_SEPARATOR + end + msa = f.create_msa_from_file( path_to_evoruby + + sep + + GENERAL_MSA_FILE, g ) + + if ( msa.get_length() != 29 ) + return false + end + if ( msa.get_number_of_seqs() != 7 ) + return false + end + + seq0 = msa.get_sequence( 0 ) + seq1 = msa.get_sequence( 1 ) + seq2 = msa.get_sequence( 2 ) + seq3 = msa.get_sequence( 3 ) + seq4 = msa.get_sequence( 4 ) + seq5 = msa.get_sequence( 5 ) + seq6 = msa.get_sequence( 6 ) + + if ( seq0.get_name() != "sequence0" ) + return false + end + if ( seq0.get_sequence_as_string() != "ABCDE.GHIJKLMNOPQR.TUVWabcxy0" ) + return false + end + + if ( seq1.get_name() != "sequence1" ) + return false + end + if ( seq1.get_sequence_as_string() != "abcdefghijklmnopqrstuvwabcxy1" ) + return false + end + + if ( seq2.get_name() != "sequence2" ) + return false + end + if ( seq2.get_sequence_as_string() != "abcdefghijkl---x_-*?_XXabcxy2" ) + return false + end + + if ( seq3.get_name() != "sequence3" ) + return false + end + if ( seq3.get_sequence_as_string() != "12345678901234567890123abcxy3" ) + return false + end + + if ( seq4.get_name() != "sequence4" ) + return false + end + if ( seq4.get_sequence_as_string() != "--------------------------xy4" ) + return false + end + + if ( seq5.get_name() != "sequence5" ) + return false + end + if ( seq5.get_sequence_as_string() != "a*c*ef****************wabcxy5" ) + return false + end + + if ( seq6.get_name() != "sequence6" ) + return false + end + if ( seq6.get_sequence_as_string() != "ururufhfghfgftgfhftgfttabcxy6" ) + return false + end + + rescue Exception => e + puts() + puts( e.to_s ) + puts() + return false + end + return true + end + + def test_fasta_parser( path_to_evoruby ) + begin + fasta = FastaParser.new() + f = MsaFactory.new() + sep = "" + if ( 
!Util::is_string_empty?( path_to_evoruby ) ) + sep = Constants::FILE_SEPARATOR + end + msa = f.create_msa_from_file( path_to_evoruby + + sep + + FASTA_FILE, fasta ) + + if ( msa.get_length() != 6 ) + return false + end + if ( msa.get_number_of_seqs() != 4 ) + return false + end + + seq0 = msa.get_sequence( 0 ) + seq1 = msa.get_sequence( 1 ) + seq2 = msa.get_sequence( 2 ) + seq3 = msa.get_sequence( 3 ) + + if ( seq0.get_name() != "sequence 0" ) + return false + end + if ( seq0.get_sequence_as_string() != "ABCDEF" ) + return false + end + + if ( seq1.get_name() != "sequence 1" ) + return false + end + if ( seq1.get_sequence_as_string() != "abcdef" ) + return false + end + + if ( seq2.get_name() != "sequence 2" ) + return false + end + if ( seq2.get_sequence_as_string() != "123456" ) + return false + end + if ( seq3.get_name() != "sequence 3" ) + return false + end + if ( seq3.get_sequence_as_string() != "a-c--f" ) + return false + end + + rescue Exception => e + puts() + puts( e.to_s ) + puts() + return false + end + return true + end + + def test_ncbi_tseq_parser( path_to_evoruby ) + begin + parser = NcbiTSeqParser.new + f = MsaFactory.new + sep = "" + if ( !Util::is_string_empty?( path_to_evoruby ) ) + sep = Constants::FILE_SEPARATOR + end + msa = f.create_msa_from_file( path_to_evoruby + + sep + + TSEQ_FILE, parser ) + + if ( msa.get_number_of_seqs() != 9 ) + return false + end + + seq0 = msa.get_sequence( 0 ) + seq1 = msa.get_sequence( 1 ) + seq8 = msa.get_sequence( 8 ) + + if ( seq0.get_name() != "SusD [Bacteroides thetaiotaomicron VPI-5482]" ) + return false + end + if ( seq0.get_sequence_as_string() != 
"MKTKYIKQLFSAALIAVLSSGVTSCINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQENQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSCIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK" ) + return false + end + if ( seq0.get_accession != "29341016" ) + return false + end + if ( seq0.get_accession_source != "gi" ) + return false + end + if ( seq0.get_taxonomy.get_name != "Bacteroides thetaiotaomicron VPI-5482" ) + return false + end + if ( seq0.get_taxonomy.get_id != "226186" ) + return false + end + if ( seq0.get_taxonomy.get_id_source != "ncbi" ) + return false + end + + + if ( seq1.get_name() != "SusD, outer membrane protein [Bacteroides thetaiotaomicron VPI-5482]" ) + return false + end + if ( seq1.get_accession != "29349109" ) + return false + end + if ( seq1.get_accession_source != "gi" ) + return false + end + if ( seq1.get_taxonomy.get_name != "Bacteroides thetaiotaomicron VPI-5482" ) + return false + end + if ( seq1.get_taxonomy.get_id != "226186" ) + return false + end + if ( seq1.get_taxonomy.get_id_source != "ncbi" ) + return false + end + + + if ( seq8.get_name() != "Chain A, B. 
Thetaiotaomicron Susd With Maltotriose" ) + return false + end + if ( seq8.get_accession != "pdb|3CKB|A" ) + return false + end + if ( seq8.get_accession_source != "ncbi" ) + return false + end + if ( seq8.get_taxonomy.get_name != "Bacteroides thetaiotaomicron" ) + return false + end + if ( seq8.get_taxonomy.get_id != "818" ) + return false + end + if ( seq8.get_taxonomy.get_id_source != "ncbi" ) + return false + end + + rescue Exception => e + puts() + puts( e.to_s ) + puts() + return false + end + return true + end + + def test_hmmsearch_domain_extractor() + begin + h = Evoruby::HmmsearchDomainExtractor.new() + rescue Exception => e + puts() + puts( e.to_s ) + puts() + return false + end + return true + end + + def test_domain_sequence_extractor() + begin + h = Evoruby::DomainSequenceExtractor.new() + rescue Exception => e + puts() + puts( e.to_s ) + puts() + return false + end + return true + end + + def test_hmmscan_parser() + begin + h = Evoruby::HmmscanParser.new() + rescue Exception => e + puts() + puts( e.to_s ) + puts() + return false + end + return true + end + + def test_domains_to_forester() + begin + d = Evoruby::DomainsToForester.new() + rescue Exception => e + puts() + puts( e.to_s ) + puts() + return false + end + return true + end + + + def test_basic_table_parser() + begin + b = Evoruby::BasicTableParser.new() + rescue Exception => e + puts() + puts( e.to_s ) + puts() + return false + end + return true + end + + + def test_cla() + begin + cla = CommandLineArguments.new( Array.new ) + rescue Exception => e + puts() + puts( e.to_s ) + puts() + return false + end + return true + end + + def test_tree_puzzle() + begin + tp = TreePuzzle.new() + tp.run( '/home/czmasek/scratch/small.aln', + :wag, + :uniform, + 200 ) + rescue Exception => e + puts() + puts( e.to_s ) + puts() + return false + end + return true + end + + def test_fastme() + begin + fastme = FastMe.new() + fastme.run( '/home/czmasek/scratch/outdist', 0, :GME ) + rescue Exception => e + 
# Runs every evoruby self-test in a fixed order, printing "ok"/"FAILED"
# per test and a summary (counts, elapsed time, overall verdict) at the end.
#
# Reads and updates the @successes and @failures counters; they are expected
# to be initialised before this method is called (not visible from here).
def run()

  t0 = Time.now

  print_version_banner()

  path_to_evoruby = Test.get_path_to_evoruby()

  if ( Util.is_string_empty?( path_to_evoruby ) )
    # The file-based parser tests receive this path; an empty string lets
    # them run (and possibly fail) instead of aborting the whole suite.
    path_to_evoruby = ""
    puts()
    puts( "Warning! Path to evoruby could not be established. Some tests might fail." )
    puts()
  end

  report_check( "--- Taxonomy: " ) { test_taxonomy() }
  report_check( "--- Sequence: " ) { test_sequence() }
  report_check( "--- Msa: " ) { test_msa() }
  report_check( "--- MsaFactory: " ) { test_msa_factory() }
  report_check( "--- DomainStructure: " ) { test_domain_structure() }
  report_check( "--- ProteinDomain: " ) { test_protein_domain() }
  report_check( "--- BasicTable: " ) { test_basic_table() }
  report_check( "--- MsaIO: " ) { test_msa_io }
  report_check( "--- PhylipSequentialWriter: " ) { test_phylip_sequentialwriter }
  report_check( "--- FastaWriter : " ) { test_fasta_writer }
  report_check( "--- NexusWriter: " ) { test_nexus_writer }
  report_check( "--- FastaParser: " ) { test_fasta_parser( path_to_evoruby ) }
  report_check( "--- NCBI Tseq parser: " ) { test_ncbi_tseq_parser( path_to_evoruby ) }
  report_check( "--- GeneralMsaParser: " ) { test_general_msa_parser( path_to_evoruby ) }
  report_check( "--- Hmmsearch domain extractor: " ) { test_hmmsearch_domain_extractor }
  report_check( "--- Domain sequence extractor: " ) { test_domain_sequence_extractor }
  report_check( "--- Hmmscan parser: " ) { test_hmmscan_parser }
  report_check( "--- Domains 2 forester: " ) { test_domains_to_forester }
  report_check( "--- BasicTableParser: " ) { test_basic_table_parser }
  report_check( "--- TreePuzzle (wrapper): " ) { test_tree_puzzle() }
  report_check( "--- FastMe (wrapper): " ) { test_fastme() }
  report_check( "--- CLA: " ) { test_cla() }

  print_version_banner()

  # Time - Time yields elapsed seconds as a Float. Formatting it directly
  # keeps the fractional part zero-padded and does not wrap after 60 s,
  # unlike building the text from Time#sec and Time#usec.
  elapsed = Time.now - t0
  puts( "Time : " + format( "%.6f", elapsed ) + "s" )
  puts()

  puts( "Successful tests: " + @successes.to_s )
  puts( "Failed tests : " + @failures.to_s )
  puts()
  if ( @failures < 1 )
    puts( "OK" )
  else
    puts( "NOT ok" )
  end

  puts()
end

private

# Prints the interpreter and evoruby versions, framed by blank lines.
def print_version_banner()
  puts
  puts "ruby version " + RUBY_VERSION
  puts Constants::EVORUBY + " version " + Constants::EVORUBY_VERSION
  puts
end

# Prints +label+, evaluates the given block and records the outcome:
# a truthy result prints "ok" and increments @successes, anything else
# prints "FAILED" and increments @failures.
def report_check( label )
  print( label )
  if ( yield )
    puts( "ok" )
    @successes += 1
  else
    puts( "FAILED" )
    @failures += 1
  end
end

# Returns true if n and m differ by less than 0.000001.
# Class-level method: the bare "private" above does not apply to it.
def self.same?( n, m )
  return ( ( n - m ).abs < 0.000001 )
end

# Returns the first entry of the RUBYLIB environment variable whose text
# matches /evoruby/, or nil if there is none. An unset RUBYLIB is treated
# like an empty one, so the caller's "path could not be established"
# warning is reached instead of a NoMethodError on nil.
def self.get_path_to_evoruby()
  rubylib = ENV.fetch( 'RUBYLIB', '' ).split( File::PATH_SEPARATOR )
  rubylib.find { | path | path =~ /evoruby/ }
end
DIALIGN_DIR=/home/czmasek/SOFTWARE/DIALIGNTX/DIALIGN-TX_1.0.1/conf +$ MAFFT=/home/czmasek/SOFTWARE/MAFFT/mafft-6.240/src/mafft +$ T_COFFEE=/home/czmasek/SOFTWARE/T_COFFEE/T-COFFEE_distribution_Version_6.78/bin_linux/t_coffee +$ MUSCLE=/home/czmasek/SOFTWARE/MUSCLE/muscle3.7/muscle +$ CLUSTALW=/home/czmasek/SOFTWARE/CLUSTALW/clustalw-2.0.9/src/clustalw2 +$ KALIGN=/home/czmasek/SOFTWARE/KALIGN/kalign203/kalign +$ HMMALIGN=/home/czmasek/SOFTWARE/HMMER/hmmer-2.3.2/src/hmmalign +$ MSA_PRO=/home/czmasek/SOFTWARE/FORESTER/DEV/forester-atv/ruby/evoruby/exe/msa_pro.rb +$ PHYLO_PL=/home/czmasek/SOFTWARE/FORESTER/DEV/forester-atv/perl/phylo_pl.pl + +# Default value is 40. +% RSL SRCR=50 +% RSL NACHT=50 +% RSL TIR=60 +% RSL Bcl2=100 +% RSL homeobox=40 + +% PHYLO_OPT=-WIB100q@1nxbwS21 + +% TMP_DIR = /home/czmasek/tmp/ + +# Need to give full path for HMM files. +% HMM NACHT=/home/czmasek/DATA/PFAM/NACHT_ls_cz.hmm +% HMM TIR=/home/czmasek/DATA/PFAM/PFAM_LS/TIR.ls.hmm +% HMM SRCR=/home/czmasek/DATA/PFAM/PFAM_LS/SRCR.ls.hmm +% HMM Homeobox=/home/czmasek/DATA/PFAM/PFAM_LS/Homeobox.ls.hmm +% HMM Cofilin_ADF=/home/czmasek/DATA/PFAM/PFAM_LS/Cofilin_ADF.ls.hmm + + +> KALIGN $ -o $_kalign +> MSA_PRO -o=p -n=10 -rr=0.5 -rsl=%[RSL]% $_kalign $_kalign_05_%[RSL]%.aln +> MSA_PRO -o=n -n=10 -rr=0.5 -rsl=%[RSL]% $_kalign $_kalign_05_%[RSL]%.nex +> PHYLO_PL %[PHYLO_OPT]% $_kalign_05_%[RSL]%.aln $_kalign_05_%[RSL]% %[TMP_DIR]% +- + +> HMMALIGN -q -m -o $_hmmalign_m %[HMM]% $ +> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_hmmalign_m $_hmmalign_m_05_%[RSL]%.aln +> MSA_PRO -o=n -n=10 -rr=0.5 -c -rsl=%[RSL]% $_hmmalign_m $_hmmalign_m_05_%[RSL]%.nex +> PHYLO_PL %[PHYLO_OPT]% $_hmmalign_m_05_%[RSL]%.aln $_hmmalign_m_05_%[RSL]% %[TMP_DIR]% +- + +> HMMALIGN -q -o $_hmmalign %[HMM]% $ +> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_hmmalign $_hmmalign_05_%[RSL]%.aln +> MSA_PRO -o=n -n=10 -rr=0.5 -c -rsl=%[RSL]% $_hmmalign $_hmmalign_05_%[RSL]%.nex +> PHYLO_PL %[PHYLO_OPT]% $_hmmalign_05_%[RSL]%.aln 
$_hmmalign_05_%[RSL]% %[TMP_DIR]% +- + +> PROBCONS $ > $_probcons +> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_probcons $_probcons_05_%[RSL]%.aln +> MSA_PRO -o=n -n=10 -rr=0.5 -c -rsl=%[RSL]% $_probcons $_probcons_05_%[RSL]%.nex +> PHYLO_PL %[PHYLO_OPT]% $_probcons_05_%[RSL]%.aln $_probcons_05_%[RSL]% %[TMP_DIR]% +- + +> DIALIGN_TX DIALIGN_DIR $ $_dialigntx +> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_dialigntx $_dialigntx_05_%[RSL]%.aln +> MSA_PRO -o=n -n=10 -rr=0.5 -c -rsl=%[RSL]% $_dialigntx $_dialigntx_05_%[RSL]%.nex +> PHYLO_PL %[PHYLO_OPT]% $_dialigntx_05_%[RSL]%.aln $_dialigntx_05_%[RSL]% %[TMP_DIR]% +- + +> MAFFT --maxiterate 1000 --localpair --quiet $ > $_mafft +> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_mafft $_mafft_05_%[RSL]%.aln +> MSA_PRO -o=n -n=10 -rr=0.5 -c -rsl=%[RSL]% $_mafft $_mafft_05_%[RSL]%.nex +> PHYLO_PL %[PHYLO_OPT]% $_mafft_05_%[RSL]%.aln $_mafft_05_%[RSL]% %[TMP_DIR]% +- + +#> T_COFFEE $ -outfile $_tcoffee +#> rm $.dnd +#> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_tcoffee $_tcoffee_05_%[RSL]%.aln +#> MSA_PRO -o=n -n=10 -rr=0.5 -c -rsl=%[RSL]% $_tcoffee $_tcoffee_05_%[RSL]%.nex +#> PHYLO_PL %[PHYLO_OPT]% $_tcoffee_05_%[RSL]%.aln $_tcoffee_05_%[RSL]% %[TMP_DIR]% +#> rm $.dnd +#> rm $.html +#- + +> MUSCLE -maxiters 1000 -maxtrees 100 -clw -in $ -out $_muscle +> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_muscle $_muscle_05_%[RSL]%.aln +> MSA_PRO -o=n -n=10 -rr=0.5 -c -rsl=%[RSL]% $_muscle $_muscle_05_%[RSL]%.nex +> PHYLO_PL %[PHYLO_OPT]% $_muscle_05_%[RSL]%.aln $_muscle_05_%[RSL]% %[TMP_DIR]% +- + +> CLUSTALW $ -outfile=$_clustalw +> rm $.dnd +> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_clustalw $_clustalw_05_%[RSL]%.aln +> MSA_PRO -o=n -n=10 -rr=0.5 -c -rsl=%[RSL]% $_clustalw $_clustalw_05_%[RSL]%.nex +> PHYLO_PL %[PHYLO_OPT]% $_clustalw_05_%[RSL]%.aln $_clustalw_05_%[RSL]% %[TMP_DIR]% +> rm $.dnd +- diff --git a/forester/ruby/evoruby/files/00_sample_tap_mapfile b/forester/ruby/evoruby/files/00_sample_tap_mapfile new file 
mode 100644 index 0000000..aa0606c --- /dev/null +++ b/forester/ruby/evoruby/files/00_sample_tap_mapfile @@ -0,0 +1,46 @@ +# $Id: 00_sample_tap_mapfile,v 1.2 2008/08/29 23:58:31 cmzmasek Exp $ + +RAT#RAT +Geodia cydonium#GEOCY +Lubomirskia baicalensis#LUBBA +Suberites domuncula#SUBDO +Hydra vulgaris#HYDAT +Apis mellifera#APIME +Drosophila pseudoobscura#DROPS +Aedes aegypti#AEDAE +Tribolium castaneum#TRICA +Caenorhabditis briggsae#CAEBR +HUMAN#HUMAN +Branchiostoma floridae#BRAFL +amphioxus#BRAFL +Brafl1#BRAFL +Ciona intestinalis#CIOIN +ciona#CIOIN +cow#BOVIN +dog#CANFA +fugu#FUGRU +mouse#MOUSE +MOUSE#MOUSE +Mus musculus#MOUSE +Rattus norvegicus#RAT +tetraodon#TETNG +Tetraodon nigroviridis#TETNG +urchin#STRPU +Xenopus laevis#XENTR +xenopus#XENTR +zebrafish#BRARE +Danio rerio#BRARE +chicken#CHICK +celegans#CAEEL +Caenorhabditis elegans#CAEEL +fruitfly#DROME +Drosophila melanogaster#DROME +Haemonchus contortus#HAECO +Nematostella vectensis#NEMVE +anemone#NEMVE +human#HUMAN +HUMAN#HUMAN +Human#HUMAN +Homo sapiens#HUMAN +Chlamydomonas reinhardtii#CHLRE +Monosiga brevicollis#MONBE diff --git a/forester/ruby/evoruby/files/test/fasta_file.txt b/forester/ruby/evoruby/files/test/fasta_file.txt new file mode 100644 index 0000000..d696196 --- /dev/null +++ b/forester/ruby/evoruby/files/test/fasta_file.txt @@ -0,0 +1,22 @@ +7 26 +# 7 26 is not needed and ignored +CLUSTAL +PROBCONS +>sequence 0 +ABCDEF +>sequence 1 +a +b +c +//comment +d +e +!!comment +f + + > sequence 2 +123456 + > sequence 3 + + a-c--f + diff --git a/forester/ruby/evoruby/files/test/general_msa_file.txt b/forester/ruby/evoruby/files/test/general_msa_file.txt new file mode 100644 index 0000000..5061707 --- /dev/null +++ b/forester/ruby/evoruby/files/test/general_msa_file.txt @@ -0,0 +1,53 @@ +7 26 +# 7 26 is not needed and ignored + + +sequence0 ABCDE GHIJ + KLMNOPQR TUVW +sequence1 abcdefghi + jklm + nopq + rstu + vw +sequence2 abcde + fghijkl---x_-*?_XX +sequence3 12345678901234567890123 + + # this 
is_a_comment + + +sequence4 ---------- + ------------- + +sequence5 a*c*ef****************w + + % this is_another_comment + +sequence6 ururufhfghfgftgfhftgftt + + // this is_yet_another_comment + !! this is_yet_another_comment + + * -- * + +sequence0 a + bc +sequence1 a + b + c +sequence2 abc + +sequence3 abc + *.. +sequence4 --- +sequence5 abc + ... +sequence6 abc + +xy0 +xy1 +xy2 +xy3 +xy4 +xy5 +xy6 diff --git a/forester/ruby/evoruby/files/test/ncbi_tseq.xml b/forester/ruby/evoruby/files/test/ncbi_tseq.xml new file mode 100644 index 0000000..2e3018f --- /dev/null +++ b/forester/ruby/evoruby/files/test/ncbi_tseq.xml @@ -0,0 +1,104 @@ + + + + + + 29341016 + AAO78806.1 + gnl|mbpwusl|BT3701 + 226186 + Bacteroides thetaiotaomicron VPI-5482 + SusD [Bacteroides thetaiotaomicron VPI-5482] + 551 + MKTKYIKQLFSAALIAVLSSGVTSCINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQENQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSCIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK + + + + + 29349109 + NP_812612.1 + gnl|REF_mbpwusl|BT3701 + 226186 + Bacteroides thetaiotaomicron VPI-5482 + SusD, outer membrane protein [Bacteroides thetaiotaomicron VPI-5482] + 551 + 
MKTKYIKQLFSAALIAVLSSGVTSCINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQENQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSCIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK + + + + + 1478026 + AAB42172.1 + 818 + Bacteroides thetaiotaomicron + outer membrane protein + 554 + MKTKYIKQLFSAALIAVLSSGVTSCINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQENQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSCIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPASHATYPDTDIPLFPFGRSIPDTCGSHFPPGRRRHRRHQLNFAKRAQLYKKGTEPLTEQETNRDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK + + + + + 188596440 + pdb|3CK9|B + 818 + Bacteroides thetaiotaomicron + Chain B, B. Thetaiotaomicron Susd With Maltoheptaose + 527 + GINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQKNQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSCIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK + + + + + 188596439 + pdb|3CK9|A + 818 + Bacteroides thetaiotaomicron + Chain A, B. 
Thetaiotaomicron Susd With Maltoheptaose + 527 + GINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQKNQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSCIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK + + + + + 171849127 + pdb|3CKC|B + 818 + Bacteroides thetaiotaomicron + Chain B, B. Thetaiotaomicron Susd + 527 + GINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQKNQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSXIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK + + + + + 171849126 + pdb|3CKC|A + 818 + Bacteroides thetaiotaomicron + Chain A, B. Thetaiotaomicron Susd + 527 + GINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQKNQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSXIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK + + + + + 171849125 + pdb|3CKB|B + 818 + Bacteroides thetaiotaomicron + Chain B, B. 
Thetaiotaomicron Susd With Maltotriose + 527 + GINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQKNQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSCIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK + + + + + pdb|3CKB|A + 818 + Bacteroides thetaiotaomicron + Chain A, B. Thetaiotaomicron Susd With Maltotriose + 527 + GINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQKNQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSCIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK + + + \ No newline at end of file diff --git a/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor.rb new file mode 100644 index 0000000..ac6199b --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor.rb @@ -0,0 +1,262 @@ +# +# = lib/evo/apps/domain_sequence_extractor.rb - DomainSequenceExtractor class +# +# Copyright:: Copyright (C) 2006-2008 Christian M. 
module Evoruby

  # Command line front end ("dsx"): reads the command line, echoes the
  # effective settings to STDOUT and to a log string, delegates the actual
  # extraction to HmmsearchDomainExtractor#parse and finally appends the
  # log to "<outfile>_domain_seq_extr.log".
  class DomainSequenceExtractor

    PRG_NAME       = "dsx"
    PRG_VERSION    = "1.1.0"
    PRG_DESC       = "extraction of domain sequences from hmmsearch output"
    PRG_DATE       = "2008.01.03"
    COPYRIGHT      = "2008-2009 Christian M Zmasek"
    CONTACT        = "phylosoft@gmail.com"
    WWW            = "www.phylosoft.org"

    E_VALUE_THRESHOLD_OPTION           = 'e'
    LENGTH_THRESHOLD_OPTION            = 'l'
    ADD_POSITION_OPTION                = 'p'
    ADD_DOMAIN_NUMBER_OPTION           = 'd'
    ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT  = 'dd'
    ADD_DOMAIN_NUMBER_OPTION_AS_LETTER = 'dl'
    TRIM_OPTION                        = 't'
    LOG_FILE_SUFFIX                    = '_domain_seq_extr.log'
    PASSED_SEQS_SUFFIX                 = '_domain_seq_extr_passed'
    FAILED_SEQS_SUFFIX                 = '_domain_seq_extr_failed'
    HELP_OPTION_1                      = 'help'
    HELP_OPTION_2                      = 'h'

    # Program entry point. Expects exactly three file names in ARGV
    # (hmmsearch output, fasta file with the complete sequences, outfile)
    # plus optional flags; prints the help text and exits otherwise.
    def run()

      Util.print_program_information( PRG_NAME,
        PRG_VERSION,
        PRG_DESC,
        PRG_DATE,
        COPYRIGHT,
        CONTACT,
        WWW,
        STDOUT )

      ld = Constants::LINE_DELIMITER

      begin
        cla = CommandLineArguments.new( ARGV )
      rescue ArgumentError => e
        # The exception must be converted explicitly: String + Exception
        # raises a TypeError and would hide the real message.
        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
      end

      if ( cla.is_option_set?( HELP_OPTION_1 ) ||
           cla.is_option_set?( HELP_OPTION_2 ) )
        print_help
        exit( 0 )
      end

      if ( cla.get_number_of_files != 3 )
        print_help
        exit( -1 )
      end

      allowed_opts = [ E_VALUE_THRESHOLD_OPTION,
        ADD_POSITION_OPTION,
        ADD_DOMAIN_NUMBER_OPTION,
        LENGTH_THRESHOLD_OPTION,
        ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT,
        ADD_DOMAIN_NUMBER_OPTION_AS_LETTER,
        TRIM_OPTION ]

      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
      if ( disallowed.length > 0 )
        Util.fatal_error( PRG_NAME,
          "unknown option(s): " + disallowed,
          STDOUT )
      end

      hmmsearch_output    = cla.get_file_name( 0 )
      fasta_sequence_file = cla.get_file_name( 1 )
      outfile             = cla.get_file_name( 2 )

      # Normalised to true/false because the values are printed with to_s
      # below and handed on to the parser.
      add_position                = cla.is_option_set?( ADD_POSITION_OPTION ) ? true : false
      trim                        = cla.is_option_set?( TRIM_OPTION ) ? true : false
      add_domain_number           = cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) ? true : false
      add_domain_number_as_letter = cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER ) ? true : false
      add_domain_number_as_digit  = cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT ) ? true : false

      if ( add_domain_number_as_letter && add_domain_number_as_digit )
        puts( "attempt to add domain number as letter and digit at the same time" )
        print_help
        exit( -1 )
      end

      # A negative threshold means "no threshold" from here on.
      e_value_threshold = -1.0
      if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
        begin
          e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
        rescue ArgumentError => e
          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
        end
        if ( e_value_threshold < 0.0 )
          Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
        end
      end

      length_threshold = -1
      if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) )
        begin
          length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION )
        rescue ArgumentError => e
          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
        end
        if ( length_threshold < 0 )
          Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT )
        end
      end

      log = String.new

      # Every settings line goes both to STDOUT and to the log.
      report = lambda do | message |
        puts( message )
        log << message + ld
      end

      puts()
      report.call( "Hmmsearch outputfile : " + hmmsearch_output )
      report.call( "Fasta sequencefile (complete sequences): " + fasta_sequence_file )
      report.call( "Outputfile : " + outfile )
      report.call( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX )
      report.call( "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX )
      report.call( "Logfile : " + outfile + LOG_FILE_SUFFIX )
      if ( e_value_threshold >= 0.0 )
        report.call( "E-value threshold : " + e_value_threshold.to_s )
      else
        report.call( "E-value threshold : no threshold" )
      end
      if ( length_threshold > 0 )
        report.call( "Length threshold : " + length_threshold.to_s )
      else
        report.call( "Length threshold : no threshold" )
      end

      if ( trim )
        report.call( "Trim last 2 chars : true" )
      else
        report.call( "Trim names : false" )
      end

      report.call( "Add positions (rel to complete seq) to extracted domains: " + add_position.to_s )

      add_any_number = add_domain_number || add_domain_number_as_digit || add_domain_number_as_letter
      report.call( "Add numbers to extracted domains (in case of more than one domain per complete seq): " +
                   add_any_number.to_s )

      puts

      domain_count = 0
      begin
        parser = HmmsearchDomainExtractor.new()
        domain_count = parser.parse( hmmsearch_output,
          fasta_sequence_file,
          outfile,
          outfile + PASSED_SEQS_SUFFIX,
          outfile + FAILED_SEQS_SUFFIX,
          e_value_threshold,
          length_threshold,
          add_position,
          add_domain_number,
          add_domain_number_as_digit,
          add_domain_number_as_letter,
          trim,
          log )
      rescue StandardError => e
        # StandardError covers ArgumentError and IOError. SystemExit and
        # Interrupt are deliberately not caught, so an exit or Ctrl-C is
        # not reported as a parse error.
        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
      end

      puts
      Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" )
      Util.print_message( PRG_NAME, "wrote: " + outfile )
      Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX )
      Util.print_message( PRG_NAME, "(wrote: " + outfile + PASSED_SEQS_SUFFIX + ")" )
      Util.print_message( PRG_NAME, "(wrote: " + outfile + FAILED_SEQS_SUFFIX + ")" )

      begin
        # Block form closes the file even if the write raises.
        File.open( outfile + LOG_FILE_SUFFIX, 'a' ) do | f |
          f.print( log )
        end
      rescue StandardError => e
        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
      end

      puts
      Util.print_message( PRG_NAME, "OK" )
      puts

    end

    # Prints the usage text: the three mandatory file arguments followed by
    # all supported options.
    def print_help()
      puts()
      puts( "Usage:" )
      puts()
      puts( " " + PRG_NAME + ".rb [options] <hmmsearch outputfile> <fasta sequencefile (complete sequences)> <outputfile>" )
      puts()
      puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=<f>: E-value threshold, default is no threshold" )
      puts( " -" + LENGTH_THRESHOLD_OPTION + "=<i>: length threshold, default is no threshold" )
      puts( " -" + ADD_POSITION_OPTION + ": to add positions (rel to complete seq) to extracted domains" )
      puts( " -" + ADD_DOMAIN_NUMBER_OPTION + ": to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" )
      puts( " -" + ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT + ": to add numbers to extracted domains as digit (example \"domain2\")" )
      puts( " -" + ADD_DOMAIN_NUMBER_OPTION_AS_LETTER + ": to add numbers to extracted domains as letter (example \"domaina\")" )
      puts( " -" + TRIM_OPTION + ": to remove the last 2 characters from sequence names" )
      puts()
    end

  end # class DomainSequenceExtractor

end # module Evoruby
module Evoruby

  # Command line front end ("d2f"): converts a delimited list of protein
  # domains (protein name, domain name, from, to[, E-value]) into one line
  # per protein of the form "<protein name>:<DomainStructure#to_NHX>".
  class DomainsToForester

    PRG_NAME       = "d2f"
    PRG_DESC       = "parsed hmmpfam output to forester format"
    PRG_VERSION    = "1.0.0"
    PRG_DATE       = "2007.12.18"
    COPYRIGHT      = "2007 Christian M Zmasek"
    CONTACT        = "phylosoft@gmail.com"
    WWW            = "www.phylosoft.org"

    E_VALUE_THRESHOLD_OPTION         = "e"
    OVERWRITE_IF_SAME_FROM_TO_OPTION = "o"
    HELP_OPTION_1                    = "help"
    HELP_OPTION_2                    = "h"

    # Reads +domains_list_file+, collects one DomainStructure per protein
    # and appends the result, sorted by protein name, to +outfile+.
    #
    # domains_list_file::         one domain per line, columns separated by
    #                             +column_delimiter+
    # original_seqs_file::        fasta file with the complete sequences
    #                             (each sequence's length is used as the
    #                             total length of its domain structure)
    # e_value_threshold::         domains with a larger E-value are
    #                             skipped; a negative value disables the
    #                             filter
    # overwrite_if_same_from_to:: passed on to DomainStructure#add_domain
    #
    # Raises ArgumentError if the fasta file contains no sequences and
    # IOError if a line has too few columns or a non-numeric from/to/E-value.
    def parse( domains_list_file,
               original_seqs_file,
               outfile,
               column_delimiter,
               e_value_threshold,
               overwrite_if_same_from_to )
      Util.check_file_for_readability( domains_list_file )
      Util.check_file_for_readability( original_seqs_file )
      Util.check_file_for_writability( outfile )

      domain_structures = Hash.new() # protein name is key, domain structure is value

      f = MsaFactory.new

      original_seqs = f.create_msa_from_file( original_seqs_file, FastaParser.new )
      if ( original_seqs.get_number_of_seqs < 1 )
        error_msg = "\"" + original_seqs_file + "\" appears devoid of sequences in fasta-format"
        raise ArgumentError, error_msg
      end

      File.open( domains_list_file ) do | file |
        file.each_line do | line |
          next if is_ignorable?( line )

          a = line.split( column_delimiter )
          l = a.length
          # The E-value column is only mandatory when a threshold is used.
          if ( ( l < 4 ) || ( e_value_threshold >= 0.0 && l < 5 ) )
            error_msg = "unexpected format at line: " + line
            raise IOError, error_msg
          end
          protein_name = a[ 0 ]
          domain_name  = a[ 1 ]
          seq_from     = parse_integer_field( a[ 2 ], "seq from", line )
          seq_to       = parse_integer_field( a[ 3 ], "seq to", line )

          e_value = -1
          if ( l > 4 )
            e_value = parse_float_field( a[ 4 ], "E-value", line )
          end

          seq = original_seqs.get_by_name( protein_name, true, false )

          total_length = seq.get_length

          if ( ( e_value_threshold < 0.0 ) || ( e_value <= e_value_threshold ) )
            pd = ProteinDomain.new( domain_name, seq_from, seq_to, "", e_value )
            ds = domain_structures[ protein_name ]
            if ( ds.nil? )
              ds = DomainStructure.new( total_length )
              domain_structures[ protein_name ] = ds
            end
            ds.add_domain( pd, overwrite_if_same_from_to )
          end
        end
      end

      # Block form flushes and closes the file even if writing raises.
      File.open( outfile, "a" ) do | out |
        domain_structures.sort.each do | protein_name, domain_structure |
          out.print( protein_name.to_s )
          out.print( ":" )
          out.print( domain_structure.to_NHX )
          out.print( Constants::LINE_DELIMITER )
        end
      end

    end # parse

    # Program entry point. Expects exactly three file names in ARGV
    # (domains list, fasta file with the complete sequences, outfile) plus
    # optional flags; prints the help text and exits otherwise.
    def run()

      Util.print_program_information( PRG_NAME,
        PRG_VERSION,
        PRG_DESC,
        PRG_DATE,
        COPYRIGHT,
        CONTACT,
        WWW,
        STDOUT )

      begin
        cla = CommandLineArguments.new( ARGV )
      rescue ArgumentError => e
        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
      end

      if ( cla.is_option_set?( HELP_OPTION_1 ) ||
           cla.is_option_set?( HELP_OPTION_2 ) )
        print_help
        exit( 0 )
      end

      if ( cla.get_number_of_files != 3 )
        print_help
        exit( -1 )
      end

      allowed_opts = [ E_VALUE_THRESHOLD_OPTION,
        OVERWRITE_IF_SAME_FROM_TO_OPTION ]

      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
      if ( disallowed.length > 0 )
        Util.fatal_error( PRG_NAME,
          "unknown option(s): " + disallowed,
          STDOUT )
      end

      domains_list_file       = cla.get_file_name( 0 )
      original_sequences_file = cla.get_file_name( 1 )
      outfile                 = cla.get_file_name( 2 )

      # A negative threshold means "no threshold" from here on.
      e_value_threshold = -1.0
      if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
        begin
          e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
        rescue ArgumentError => e
          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
        end
        if ( e_value_threshold < 0.0 )
          Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
        end
      end
      overwrite_if_same_from_to = cla.is_option_set?( OVERWRITE_IF_SAME_FROM_TO_OPTION ) ? true : false

      puts()
      puts( "Domains list file : " + domains_list_file )
      puts( "Fasta sequencefile (complete sequences): " + original_sequences_file )
      puts( "Outputfile : " + outfile )
      if ( e_value_threshold >= 0.0 )
        puts( "E-value threshold : " + e_value_threshold.to_s )
      else
        puts( "E-value threshold : no threshold" )
      end
      puts( "Overwrite if same from and to : " + overwrite_if_same_from_to.to_s )

      puts

      begin
        parse( domains_list_file,
          original_sequences_file,
          outfile,
          " ",
          e_value_threshold,
          overwrite_if_same_from_to )
      rescue StandardError => e
        # StandardError covers ArgumentError and IOError. SystemExit and
        # Interrupt are deliberately not caught.
        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
      end

      puts
      Util.print_message( PRG_NAME, 'OK' )
      puts

    end

    private

    # Prints the usage text: the three mandatory file arguments followed by
    # all supported options.
    def print_help()
      puts()
      puts( "Usage:" )
      puts()
      puts( " " + PRG_NAME + ".rb [options] <domains list file> <fasta sequencefile (complete sequences)> <outputfile>" )
      puts()
      puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=<f> : E-value threshold, default is no threshold" )
      puts( " -" + OVERWRITE_IF_SAME_FROM_TO_OPTION + " : overwrite domain with same start and end with domain with better E-value" )
      puts()
    end

    # Returns true for lines that carry no data: lines without any letter,
    # digit or '-' and lines whose first non-blank character is '#'.
    def is_ignorable?( line )
      return true if line !~ /[A-Za-z0-9-]/
      !( line =~ /^\s*#/ ).nil?
    end

    # Converts +field+ to an Integer. String#to_i never fails (it silently
    # yields 0 for garbage), so the text is validated first.
    # Raises IOError naming +what+ and the offending +line+.
    def parse_integer_field( field, what, line )
      unless ( field =~ /\A\s*[+-]?\d+\s*\z/ )
        raise IOError, "failed to parse " + what + " from \"" + field.to_s + "\" [line: " + line + "]"
      end
      field.to_i
    end

    # Converts +field+ to a Float using the strict Kernel#Float.
    # Raises IOError naming +what+ and the offending +line+.
    def parse_float_field( field, what, line )
      Float( field )
    rescue ArgumentError, TypeError
      raise IOError, "failed to parse " + what + " from \"" + field.to_s + "\" [line: " + line + "]"
    end

  end # class DomainsToForester

end # module Evoruby
# Zmasek
# License:: GNU Lesser General Public License (LGPL)
#
# $Id: evo_nursery.rb,v 1.11 2010/12/13 19:00:11 cmzmasek Exp $



require 'lib/evo/soft/fastme'
require 'lib/evo/soft/tree_puzzle'
require 'lib/evo/util/constants'
require 'lib/evo/util/util'
require 'lib/evo/util/command_line_arguments'
require 'lib/evo/msa/msa_factory'
require 'lib/evo/io/msa_io'
require 'lib/evo/io/writer/phylip_sequential_writer'
require 'lib/evo/io/parser/general_msa_parser'
require 'lib/evo/io/writer/msa_writer'

require 'iconv'

module Evoruby

  # Batch driver: for every regular file in the current working directory
  # (hidden files and "*.tre" files excluded) it parses the file as a
  # multiple sequence alignment, filters it, runs TreePuzzle and FastMe on
  # it and finally invokes the forester "decorator" Java application with
  # "<output_dir>/<file>.xml" as its last argument.
  #
  # Alignments that are too large, too short or contain too few/many
  # sequences are skipped and listed in the summary printed at the end.
  class EvoNursery
    GAP_RATIO           = 0.75
    GAP_RATIO_FOR_SEQS  = 0.75
    MIN_LENGTH          = 40
    MIN_SEQS            = 4
    MAX_SEQS            = 1600
    MAX_ALN_FILE_SIZE   = 4000000
    MODEL               = :auto
    RATES               = :uniform
    FASTME_INITIAL_TREE = :GME
    ALN_NAME            = '_align_'
    TREE_PUZZLE_OUTDIST = TreePuzzle::OUTDIST
    TREE_PUZZLE_OUTFILE = TreePuzzle::OUTFILE
    FASTME_OUTTREE      = FastMe::OUTTREE
    FASTME_OUTPUT_D     = FastMe::OUTPUT_D

    PRG_NAME       = "evo_nursery"
    PRG_DATE       = "2009.10.15"
    PRG_DESC       = "pfam alignments to evolutionary trees"
    PRG_VERSION    = "0.20"
    COPYRIGHT      = "2009-2010 Christian M Zmasek"
    CONTACT        = "phylosoft@gmail.com"
    WWW            = "www.phylosoft.org"

    HELP_OPTION_1 = "help"
    HELP_OPTION_2 = "h"

    # Program entry point. Expects exactly one command line argument, the
    # (existing) output directory. Terminates the process via exit or
    # Util.fatal_error on usage and processing errors.
    def run

      Util.print_program_information( PRG_NAME,
        PRG_VERSION,
        PRG_DESC,
        PRG_DATE,
        COPYRIGHT,
        CONTACT,
        WWW,
        STDOUT )

      # Anchored and with escaped dots: the former /1.9/ also accepted
      # versions such as "2.1.9" or "1x9".
      if RUBY_VERSION !~ /\A1\.9(\.|\z)/
        puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " )
        exit( -1 )
      end

      forester_home = Util.get_env_variable_value( Constants::FORESTER_HOME_ENV_VARIABLE )
      java_home     = Util.get_env_variable_value( Constants::JAVA_HOME_ENV_VARIABLE )
      decorator     = java_home + '/bin/java -cp ' + forester_home + '/java/forester.jar org.forester.application.decorator'

      if ( ARGV == nil || ARGV.length != 1 )
        help
        exit( -1 )
      end

      begin
        cla = CommandLineArguments.new( ARGV )
      rescue ArgumentError => e
        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
      end

      if ( cla.is_option_set?( HELP_OPTION_1 ) ||
           cla.is_option_set?( HELP_OPTION_2 ) )
        help
        exit( 0 )
      end

      output_dir = cla.get_file_name( 0 )
      output_dir = output_dir + '/' if output_dir !~ /\/$/

      if !File.exist?( output_dir )
        Util.fatal_error( PRG_NAME, output_dir.to_s + " does not exist", STDOUT )
      end

      ic       = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' )
      files    = Dir.entries( "." )
      skipped  = Array.new
      counter  = 1
      analyzed = 0
      begin
        files.each { |pfam_aln_file|
          next if File.directory?( pfam_aln_file ) ||
                  pfam_aln_file =~ /^\./ ||
                  pfam_aln_file =~ /.+\.tre$/

          base_name     = File.basename( pfam_aln_file )
          tree_out_file = output_dir + base_name + ".xml"

          if File.exist?( tree_out_file )
            puts
            puts
            puts "***** skipping " + base_name + ", already exists"
            puts
            skipped.push( base_name + " [already exists]" )
            next
          end

          puts
          puts counter.to_s + ": " + pfam_aln_file.to_str
          counter += 1
          file_size = File.size( pfam_aln_file )
          if file_size > MAX_ALN_FILE_SIZE
            puts "***** skipping, file size: " + file_size.to_s
            skipped.push( base_name + " [file size: " + file_size.to_s + "]" )
            next
          end

          msa = MsaFactory.new().create_msa_from_file( pfam_aln_file, GeneralMsaParser.new() )

          if msa.get_number_of_seqs < MIN_SEQS || msa.get_number_of_seqs > MAX_SEQS
            puts "***** skipping, seqs: " + msa.get_number_of_seqs.to_s
            skipped.push( base_name + " [seqs: " + msa.get_number_of_seqs.to_s + "]" )
            next
          end

          msa.remove_gap_columns_w_gap_ratio!( GAP_RATIO )

          length = msa.get_length
          if length < MIN_LENGTH
            puts "***** skipping, length: " + length.to_s
            skipped.push( base_name + " [length: " + length.to_s + "]" )
            next
          end

          msa.remove_sequences_by_gap_ratio!( GAP_RATIO_FOR_SEQS )

          # Removing sequences may have pushed the alignment below the minimum.
          if msa.get_number_of_seqs < MIN_SEQS
            puts "***** skipping, seqs: " + msa.get_number_of_seqs.to_s
            skipped.push( base_name + " [seqs: " + msa.get_number_of_seqs.to_s + "]" )
            next
          end

          map_file = output_dir + base_name + ".map"
          write_mapping_file( msa, map_file )

          writer = PhylipSequentialWriter.new()
          writer.clean( true )
          writer.set_max_name_length( 10 )
          aln_file = output_dir + ALN_NAME
          File.unlink( aln_file ) if File.exist?( aln_file )
          MsaIO.new().write_to_file( msa, aln_file, writer )

          TreePuzzle.new().run( aln_file,
            MODEL,
            RATES,
            msa.get_number_of_seqs )

          File.rename( aln_file, output_dir + base_name + ".aln" )

          FastMe.new().run( TREE_PUZZLE_OUTDIST, 0, FASTME_INITIAL_TREE )

          pfam_acc, pfam_de = read_pfam_annotation( pfam_aln_file, ic )
          if !pfam_acc || !pfam_de
            Util.fatal_error( PRG_NAME, "problem with " + pfam_aln_file.to_s, STDOUT )
          end

          puzzle_model = read_puzzle_model( ic )
          if !puzzle_model
            Util.fatal_error( PRG_NAME, "problem with puzzle outfile: " + TREE_PUZZLE_OUTFILE.to_s, STDOUT )
          end

          desc = pfam_de
          desc << ' | '
          desc << 'ML pwd estimation by TREE-PUZZLE version '
          desc << TreePuzzle::VERSION
          desc << ', model: '
          desc << puzzle_model
          desc << ', rates: '
          desc << RATES.to_s
          desc << '; tree estimation by FastME version '
          desc << FastMe::VERSION
          desc << ', initial tree: '
          desc << FASTME_INITIAL_TREE.to_s
          desc << '; aln length: '
          desc << msa.get_length.to_s

          # NOTE(review): the command is assembled as a single shell string;
          # file names or descriptions containing double quotes would break
          # the quoting -- confirm inputs are trusted.
          cmd = decorator + " -table -p -pn=\"" + pfam_aln_file +
            "\" -pi=pfam:" + pfam_acc +
            " -pd=\"" + desc + "\" " +
            FASTME_OUTTREE + ' ' +
            map_file + ' ' + tree_out_file

          IO.popen( cmd , 'r+' ) do | pipe |
            pipe.close_write
          end
          analyzed += 1

          # Remove the per-alignment intermediate files.
          File.unlink( map_file )
          File.unlink( TREE_PUZZLE_OUTDIST )
          File.unlink( TREE_PUZZLE_OUTFILE )
          File.unlink( FASTME_OUTPUT_D )
        }
      rescue StandardError => e
        # ArgumentError and IOError are StandardError subclasses, so a single
        # class covers everything the former three-class list did.
        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
      end

      puts()
      puts( 'Skipped:' )
      puts()
      skipped.each_with_index { |entry, i|
        puts i.to_s + ": " + entry
      }

      puts()
      puts( 'Skipped : ' + skipped.size.to_s + ' alignments' )
      puts( 'Analyzed: ' + analyzed.to_s + ' alignments' )

      puts( 'Min gap ratio for col del : ' + GAP_RATIO.to_s )
      puts( 'Min gap ratio for seq del : ' + GAP_RATIO_FOR_SEQS.to_s )
      puts( 'Minimal aln length : ' + MIN_LENGTH.to_s )
      puts( 'Minimal number of sequences: ' + MIN_SEQS.to_s )
      puts( 'Maximal number of sequences: ' + MAX_SEQS.to_s )
      puts( 'Maximal aln file size : ' + MAX_ALN_FILE_SIZE.to_s )
      puts( 'Model : ' + MODEL.to_s )
      puts( 'FastME initial tree: ' + FASTME_INITIAL_TREE.to_s )

      puts()
      puts( '[' + PRG_NAME + '] > OK' )
      puts()

    end # run

    private

    # Prints the usage message.
    def help
      puts( "Usage:" )
      puts()
      puts( " " + PRG_NAME + ".rb " )
      puts()
    end

    # Writes one tab-separated mapping row per sequence of +msa+ to
    # +map_file+ and afterwards renames each sequence to its index.
    # Sequence names must look like "<acc>_<tax code>/<anything>".
    #
    # The file is truncated ('w') rather than appended to: a map file left
    # behind by an aborted earlier run would otherwise contribute stale,
    # duplicated rows. The block form closes the handle on errors as well.
    #
    # Raises ArgumentError for a sequence name in an unexpected format.
    def write_mapping_file( msa, map_file )
      File.open( map_file, 'w' ) do |f|
        msa.get_number_of_seqs.times do |i|
          name = msa.get_sequence( i ).get_name()
          md = /(.+)_(.+)\/.+/.match( name )
          if md.nil?
            raise ArgumentError, "unexpected sequence name format: [" + name.to_s + "]"
          end
          acc      = md[ 1 ]
          tax_code = md[ 2 ]

          symbol = acc + '_' + tax_code
          # Accessions shorter than six characters get the taxonomy code
          # appended (same rule as before the refactoring).
          accession = acc.length < 6 ? symbol : acc

          row = [ i.to_s,
            'TAXONOMY_CODE:' + tax_code,
            'SEQ_SYMBOL:' + symbol,
            'SEQ_ACCESSION:' + accession,
            'SEQ_ACCESSION_SOURCE:UniProtKB',
            'NODE_NAME:' + name ].join( "\t" )
          f.print( row )
          f.print( "\n" )
          msa.get_sequence( i ).set_name( i.to_s )
        end
      end
    end

    # Scans +pfam_aln_file+ for the first "#=AC" and "#=DE" annotation lines
    # (each line is passed through +ic+ first) and returns
    # [ accession, description ]; either element is nil when not found.
    def read_pfam_annotation( pfam_aln_file, ic )
      pfam_acc = nil
      pfam_de  = nil
      File.open( pfam_aln_file ) do |file|
        while line = file.gets
          line = ic.iconv( line )
          if line =~ /^#=AC\s+(.+)/
            pfam_acc = $1
          end
          if line =~ /^#=DE\s+(.+)/
            pfam_de = $1
          end
          break if pfam_acc && pfam_de
        end
      end
      [ pfam_acc, pfam_de ]
    end

    # Returns the text following "Model of substitution:" in
    # TREE_PUZZLE_OUTFILE, or nil if no such line exists.
    def read_puzzle_model( ic )
      puzzle_model = nil
      File.open( TREE_PUZZLE_OUTFILE ) do |file|
        while line = file.gets
          line = ic.iconv( line )
          if line =~ /^Model\s+of\s+substitution:\s+(.+)/
            puzzle_model = $1
            break
          end
        end
      end
      puzzle_model
    end

  end # class EvoNursery

end # module Evoruby
# \ No newline at end of file
# diff --git a/forester/ruby/evoruby/lib/evo/apps/fasta_extractor.rb b/forester/ruby/evoruby/lib/evo/apps/fasta_extractor.rb
# new file mode 100644
# index 0000000..b2d0d5c
# --- /dev/null
# +++ b/forester/ruby/evoruby/lib/evo/apps/fasta_extractor.rb
# @@ -0,0 +1,146 @@
#
# = lib/evo/apps/fasta_extractor.rb - FastaExtractor class
#
# Copyright:: Copyright (C) 2006-2008 Christian M.
# Zmasek
# License:: GNU Lesser General Public License (LGPL)
#
# $Id: fasta_extractor.rb,v 1.2 2010/12/13 19:00:11 cmzmasek Exp $


require 'lib/evo/util/util'
require 'lib/evo/util/constants'
require 'lib/evo/util/command_line_arguments'


module Evoruby

  # Command line tool: copies those records of a fasta file whose header
  # name is listed in a names file into a new fasta file, annotating each
  # header with the frame and the third captured column of the names file.
  class FastaExtractor

    PRG_NAME       = "fae"
    PRG_VERSION    = "1.0.0"
    PRG_DESC       = "extraction of nucleotide sequences from a fasta file by names from wublast search"
    PRG_DATE       = "2008.08.09"
    COPYRIGHT      = "2008-2009 Christian M Zmasek"
    CONTACT        = "phylosoft@gmail.com"
    WWW            = "www.phylosoft.org"
    HELP_OPTION_1  = 'help'
    HELP_OPTION_2  = 'h'


    # Program entry point. Expects three file arguments: input fasta file,
    # names file and (not yet existing) output file.
    def run()

      Util.print_program_information( PRG_NAME,
        PRG_VERSION,
        PRG_DESC ,
        PRG_DATE,
        COPYRIGHT,
        CONTACT,
        WWW,
        STDOUT )

      begin
        cla = CommandLineArguments.new( ARGV )
      rescue ArgumentError => e
        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
      end

      if ( cla.is_option_set?( HELP_OPTION_1 ) ||
           cla.is_option_set?( HELP_OPTION_2 ) )
        print_help
        exit( 0 )
      end

      if ( cla.get_number_of_files != 3 )
        print_help
        exit( -1 )
      end

      # No options besides the help switches are supported.
      allowed_opts = Array.new

      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
      if ( disallowed.length > 0 )
        Util.fatal_error( PRG_NAME,
          "unknown option(s): " + disallowed,
          STDOUT )
      end

      input_file  = cla.get_file_name( 0 )
      names_file  = cla.get_file_name( 1 )
      output_file = cla.get_file_name( 2 )

      if !File.exist?( input_file )
        Util.fatal_error( PRG_NAME, "error: input file [#{input_file}] does not exist" )
      end
      if !File.exist?( names_file )
        Util.fatal_error( PRG_NAME, "error: names file [#{names_file}] does not exist" )
      end
      if File.exist?( output_file )
        Util.fatal_error( PRG_NAME, "error: [#{output_file }] already exists" )
      end

      names = extract_names_with_frames( names_file )

      extract_sequences( names, input_file, output_file )

      puts
      Util.print_message( PRG_NAME, "OK" )
      puts

    end


    # Reads +names_file+ and returns a Hash mapping each sequence name to
    # the annotation string "[<frame>] [<value>]".
    #
    # Data lines are expected as: <name> <frame> <integer> <value>, where
    # <frame> is a sign followed by one digit. Blank lines, comment lines
    # (first non-blank character is '#') and non-matching lines are ignored.
    def extract_names_with_frames( names_file )
      names = Hash.new()
      File.open( names_file ) do | file |
        while line = file.gets
          next if Util.is_string_empty?( line )
          # Anchored: the former /\s*#/ matched a '#' anywhere and thus
          # silently dropped data lines merely containing one.
          next if line =~ /^\s*#/
          # [+-] instead of [+|-]: inside a character class '|' is a
          # literal, not an alternation.
          if ( line =~ /(\S+)\s+([+-]\d)\s+\d+\s+(\S+)/ )
            name  = $1
            frame = $2
            e     = $3
            names[ name ] = "[" + frame + "] [" + e + "]"
          end
        end
      end
      names
    end

    # Copies every record of +fasta_file+ whose header name is a key of
    # +names+ to +output_file+ (opened for appending), replacing the header
    # by ">" + name + " " + annotation. Progress is echoed to STDOUT.
    # The block form of File.open closes the output even if writing fails.
    def extract_sequences( names, fasta_file, output_file )
      matching_state = false
      counter = 0
      File.open( output_file, "a" ) do | output |
        File.open( fasta_file ) do | file |
          while line = file.gets
            next if Util.is_string_empty?( line )
            if ( line =~ /\s*>\s*(.+)/ )
              name = $1
              if names.has_key?( name )
                matching_state = true
                counter += 1
                puts counter.to_s + ". " +name + " " + names[ name ]
                output.print( ">" + name + " " + names[ name ] )
                output.print( Evoruby::Constants::LINE_DELIMITER )
              else
                matching_state = false
              end
            elsif matching_state
              # Sequence line belonging to the most recent matching header.
              output.print( line )
            end
          end
        end
      end
    end

    # Prints the usage message.
    def print_help()
      puts( "Usage:" )
      puts()
      puts( " " + PRG_NAME + ".rb " )
      puts()
    end

  end # class FastaExtractor
end
# \ No newline at end of file
# diff --git a/forester/ruby/evoruby/lib/evo/apps/fasta_taxonomy_processor.rb b/forester/ruby/evoruby/lib/evo/apps/fasta_taxonomy_processor.rb
# new file mode 100644
# index 0000000..6ae3cf1
# --- /dev/null
# +++ b/forester/ruby/evoruby/lib/evo/apps/fasta_taxonomy_processor.rb
# @@ -0,0 +1,205 @@
#
# = lib/evo/apps/fasta_taxonomy_processor - FastaTaxonomyProcessor class
#
# Copyright:: Copyright (C) 2006-2007 Christian M.
# Zmasek
# License:: GNU Lesser General Public License (LGPL)
#
# $Id: fasta_taxonomy_processor.rb,v 1.4 2010/12/13 19:00:11 cmzmasek Exp $


require 'lib/evo/util/util'
require 'lib/evo/util/constants'
require 'lib/evo/msa/msa_factory'
require 'lib/evo/msa/msa'
require 'lib/evo/io/msa_io'
require 'lib/evo/io/parser/sp_taxonomy_parser'
require 'lib/evo/io/parser/fasta_parser'
require 'lib/evo/io/writer/fasta_writer'
require 'lib/evo/io/writer/phylip_sequential_writer'
require 'lib/evo/util/command_line_arguments'
require 'lib/evo/apps/tseq_taxonomy_processor'

module Evoruby

  # Command line tool: reads a fasta file whose headers carry a gi number
  # ("gi|<n>|") and a bracketed scientific name ("[Genus species]"),
  # renames every sequence to "<hex index>_<taxonomy code>" and writes a
  # tab-separated mapping file describing the original annotation.
  class FastaTaxonomyProcessor

    PRG_NAME       = "fasta_tap"
    PRG_DATE       = "2009.01.20"
    PRG_DESC       = "preprocessing of multiple sequence files in ncbi fasta format"
    PRG_VERSION    = "1.00"
    COPYRIGHT      = "2009 Christian M Zmasek"
    CONTACT        = "phylosoft@gmail.com"
    WWW            = "www.phylosoft.org"

    def initialize()
      @tax_ids_to_sp_taxonomies = Hash.new()
    end

    # Program entry point. Expects exactly four arguments: taxonomy
    # infile, sequences infile, sequences outfile and mapping outfile.
    def run()

      Util.print_program_information( PRG_NAME,
        PRG_VERSION,
        PRG_DESC,
        PRG_DATE,
        COPYRIGHT,
        CONTACT,
        WWW,
        STDOUT )

      if ARGV == nil || ARGV.length != 4
        puts( "Usage: #{PRG_NAME}.rb " )
        puts()
        exit( -1 )
      end

      begin
        cla = CommandLineArguments.new( ARGV )
      rescue ArgumentError => e
        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
      end
      allowed_opts = Array.new
      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
      if ( disallowed.length > 0 )
        Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
      end

      sp_taxonomy_infile = cla.get_file_name( 0 )
      sequences_infile   = cla.get_file_name( 1 )
      sequences_outfile  = cla.get_file_name( 2 )
      mapping_outfile    = cla.get_file_name( 3 )

      Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile )
      Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile )
      Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile )
      Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile )

      sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile )

      Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile )

      fasta_parser = FastaParser.new
      msa_fac      = MsaFactory.new

      seqs = msa_fac.create_msa_from_file( sequences_infile, fasta_parser )

      Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile )

      removed = seqs.remove_redundant_sequences!( true, true )

      if removed.size > 0
        Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
        removed.each { | seq_name |
          puts seq_name
        }
        Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" )
      end

      # Block form: the mapping file is flushed and closed even when
      # modify_name aborts (a fatal error or an exception).
      File.open( mapping_outfile, "a" ) do | mapping_out |
        seqs.get_number_of_seqs.times do | i |
          seq = seqs.get_sequence( i )
          seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) )
        end
      end

      io = MsaIO.new()

      w = FastaWriter.new()

      w.set_max_name_length( 10 )
      w.clean( true )
      begin
        io.write_to_file( seqs, sequences_outfile, w )
      rescue StandardError => e
        # StandardError rather than Exception, so that SystemExit and
        # Interrupt are not reported as write failures.
        Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
      end

      Util.print_message( PRG_NAME, "wrote: " + mapping_outfile )
      Util.print_message( PRG_NAME, "wrote: " + sequences_outfile )
      Util.print_message( PRG_NAME, "OK" )

    end

    private

    # Derives the new name for +seq+ (the i-th sequence) and prints its
    # mapping row to +mapping_outfile+, which despite its name is an open,
    # writable IO object.
    #
    # seq             - sequence whose name is an ncbi fasta description line
    # i               - index of the sequence; becomes the hex name prefix
    # sp_taxonomies   - collection of taxonomy objects responding to
    #                   scientific_name, code and id
    # mapping_outfile - IO the mapping row is printed to
    #
    # Returns the new name, "<i in hex>_<taxonomy code>". Calls
    # Util.fatal_error when the description lacks a "[...]" taxonomy, the
    # taxonomy is unknown, or no "gi|...|" number is present.
    def modify_name( seq, i, sp_taxonomies, mapping_outfile )

      #i = i + 1792

      seq_desc = seq.get_name

      taxonomy_sn = nil

      if seq_desc =~ /\[(.+)\]/
        taxonomy_sn = $1
      else
        Util.fatal_error( PRG_NAME, "no taxonomy in [" + seq_desc + "]" )
      end

      matching_sp_taxonomy = nil

      # Deliberately no early exit: with duplicate scientific names the
      # last entry wins, as it always has.
      sp_taxonomies.each { |sp_taxonomy|
        if ( sp_taxonomy.scientific_name == taxonomy_sn )
          matching_sp_taxonomy = sp_taxonomy
        end
      }

      if matching_sp_taxonomy == nil
        Util.fatal_error( PRG_NAME, "taxonomy [" + taxonomy_sn + "] for [" + seq_desc + "] not found" )
      end

      new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code

      gi = nil
      if seq_desc =~ /gi\|(.+?)\|/
        gi = $1
      else
        Util.fatal_error( PRG_NAME, "no gi in [" + seq_desc + "]" )
      end

      seq_name = ""

      if seq_desc =~ /\|\s*([^|]+?)\s*\[/
        seq_name = $1
      end

      if seq_name =~ /\[.+\]$/
        # Redundant taxonomy information hides here.
        seq_name = seq_name.sub(/\[.+\]$/, '')
      end
      if seq_name =~ /^\s*hypothetical\s+protein\s*/i
        # Pointless information.
        seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' )
      end
      if seq_name =~ /^\s*conserved\s+hypothetical\s+protein\s*/i
        # Pointless information.
        seq_name = seq_name.sub( /^\s*conserved\s+hypothetical\s+protein\s*/i, '' )
      end

      # One row builder for both cases; the accession columns are only
      # emitted when a gi number is available.
      fields = [ new_name,
        TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code,
        TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id,
        TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi",
        TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name ]
      if gi != nil
        fields.push( TseqTaxonomyProcessor::SEQ_ACCESSION + gi.to_s )
        fields.push( TseqTaxonomyProcessor::SEQ_ACCESSION_SOURCE + "gi" )
      end
      fields.push( TseqTaxonomyProcessor::SEQ_NAME + seq_name )
      fields.push( TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string )

      mapping_outfile.print( fields.join( "\t" ) + Constants::LINE_DELIMITER )

      new_name
    end

  end

end # module Evoruby
# diff --git a/forester/ruby/evoruby/lib/evo/apps/hmmscan_parser.rb
# b/forester/ruby/evoruby/lib/evo/apps/hmmscan_parser.rb
# new file mode 100644
# index 0000000..c2f8177
# --- /dev/null
# +++ b/forester/ruby/evoruby/lib/evo/apps/hmmscan_parser.rb
# @@ -0,0 +1,265 @@
#
# = lib/evo/apps/hmmscan_parser.rb - HmmscanParser class
#
# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
# License:: GNU Lesser General Public License (LGPL)
#
# $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
#
# last modified: 11/24/2009

require 'lib/evo/util/constants'
require 'lib/evo/util/util'
require 'lib/evo/util/command_line_arguments'

module Evoruby

  # Command line tool: converts whitespace-separated, 23-column hmmscan
  # table rows into a delimited table of query, model, envelope
  # coordinates and independent E-value, and tallies how often each model
  # was written.
  class HmmscanParser

    PRG_NAME       = "hsp"
    PRG_VERSION    = "1.0.1"
    PRG_DESC       = "hmmscan parser"
    PRG_DATE       = "2009.11.24"
    COPYRIGHT      = "2009 Christian M Zmasek"
    CONTACT        = "phylosoft@gmail.com"
    WWW            = "www.phylosoft.org"

    DELIMITER_OPTION              = "d"
    E_VALUE_THRESHOLD_OPTION      = "e"
    IGNORE_DUF_OPTION             = "i"
    PARSE_OUT_DESCRIPITION_OPTION = "a"
    HELP_OPTION_1                 = "help"
    HELP_OPTION_2                 = "h"

    def initialize
      @domain_counts = Hash.new
    end

    # Parses +inpath+ and appends one delimited row per accepted domain
    # hit to +outpath+.
    #
    # inpath            - hmmscan table file to read
    # outpath           - file the rows are appended to
    # column_delimiter  - String placed between output columns
    # e_value_threshold - hits with a larger i-E-value are dropped;
    #                     a negative value disables the filter
    # ignore_dufs       - when true, models named "DUF<digits>..." are dropped
    # get_descriptions  - when true, a description column follows the query
    #
    # Returns the number of distinct query names encountered in the input
    # (before filtering).
    #
    # raises ArgumentError, IOError
    def parse( inpath,
        outpath,
        column_delimiter,
        e_value_threshold,
        ignore_dufs,
        get_descriptions )
      Util.check_file_for_readability( inpath )
      Util.check_file_for_writability( outpath )

      # NOTE(review): desc is never populated, so the description column
      # is always empty. The table's last column (capture 23) is labelled
      # "desc", but whether that is the *query* description promised by
      # the help text needs to be confirmed before wiring it in.
      desc = String.new

      # Distinct query names seen so far; used as a set. Previously the
      # returned counter was initialised to 0 and never incremented.
      seen_queries = Hash.new

      nl = Constants::LINE_DELIMITER

      # Block form: the output file is flushed and closed on errors too.
      File.open( outpath, "a" ) do | outfile |
        File.open( inpath ) do | file |
          while line = file.gets
            next if HmmscanParser.is_ignorable?( line ) || line !~ /^\S+\s+\S/

            #         tn      acc     tlen    query   acc     qlen    Evalue  score   bias    #       of      c-E     i-E     score   bias    hf      ht      af      at      ef      et      acc     desc
            #         1       2       3       4       5       6       7       8       9       10      11      12      13      14      15      16      17      18      19      20      21      22      23
            md = /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/.match( line )

            # A malformed row used to surface as a NoMethodError on nil;
            # report it as the documented IOError instead.
            if md.nil?
              raise IOError, "unexpected line format in [" + inpath.to_s + "]: " + line.chomp
            end

            model     = md[ 1 ]
            query     = md[ 4 ]
            i_e_value = md[ 13 ].to_f
            env_from  = md[ 20 ].to_i
            env_to    = md[ 21 ].to_i

            seen_queries[ query ] = true

            if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
                 ( !ignore_dufs || ( model !~ /^DUF\d+/ ) ) )
              count_model( model )
              outfile.print( query +
                 column_delimiter )
              if ( get_descriptions )
                outfile.print( desc +
                   column_delimiter )
              end
              outfile.print( model +
                 column_delimiter +
                 env_from.to_s +
                 column_delimiter +
                 env_to.to_s +
                 column_delimiter +
                 i_e_value.to_s )
              outfile.print( nl )
            end
          end # while line = file.gets
        end
      end

      return seen_queries.size

    end # def parse

    # Increments the tally for +model+.
    def count_model( model )
      previous = @domain_counts.has_key?( model ) ? @domain_counts[ model ].to_i : 0
      @domain_counts[ model ] = previous + 1
    end


    # Returns the Hash of model name => number of rows written for it.
    def get_domain_counts()
      return @domain_counts
    end

    # Program entry point. Expects two file arguments (hmmscan table,
    # output file) plus the optional switches listed by print_help.
    def run()

      Util.print_program_information( PRG_NAME,
        PRG_VERSION,
        PRG_DESC,
        PRG_DATE,
        COPYRIGHT,
        CONTACT,
        WWW,
        STDOUT )

      begin
        cla = CommandLineArguments.new( ARGV )
      rescue ArgumentError => e
        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
      end

      if ( cla.is_option_set?( HELP_OPTION_1 ) ||
           cla.is_option_set?( HELP_OPTION_2 ) )
        print_help
        exit( 0 )
      end

      if ( cla.get_number_of_files != 2 )
        print_help
        exit( -1 )
      end

      allowed_opts = Array.new
      allowed_opts.push( DELIMITER_OPTION )
      allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
      allowed_opts.push( IGNORE_DUF_OPTION )
      allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )

      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
      if ( disallowed.length > 0 )
        Util.fatal_error( PRG_NAME,
          "unknown option(s): " + disallowed,
          STDOUT )
      end

      inpath  = cla.get_file_name( 0 )
      outpath = cla.get_file_name( 1 )

      column_delimiter = "\t"
      if ( cla.is_option_set?( DELIMITER_OPTION ) )
        begin
          column_delimiter = cla.get_option_value( DELIMITER_OPTION )
        rescue ArgumentError => e
          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
        end
      end

      # A negative threshold means "no threshold" inside parse; a user
      # supplied threshold must therefore be non-negative.
      e_value_threshold = -1.0
      if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
        begin
          e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
        rescue ArgumentError => e
          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
        end
        if ( e_value_threshold < 0.0 )
          Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
        end
      end

      ignore_dufs = false
      if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
        ignore_dufs = true
      end

      parse_descriptions = false
      if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
        parse_descriptions = true
      end

      puts()
      puts( "hmmpfam outputfile: " + inpath )
      puts( "outputfile : " + outpath )
      if ( e_value_threshold >= 0.0 )
        puts( "E-value threshold : " + e_value_threshold.to_s )
      else
        puts( "E-value threshold : no threshold" )
      end
      if ( parse_descriptions )
        puts( "parse descriptions: true" )
      else
        puts( "parse descriptions: false" )
      end
      if ( ignore_dufs )
        puts( "ignore DUFs : true" )
      else
        puts( "ignore DUFs : false" )
      end
      if ( column_delimiter == "\t" )
        puts( "column delimiter : TAB" )
      else
        puts( "column delimiter : " + column_delimiter )
      end
      puts()

      begin
        queries_count = parse( inpath,
          outpath,
          column_delimiter,
          e_value_threshold,
          ignore_dufs,
          parse_descriptions )
      rescue ArgumentError, IOError => e
        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
      end
      domain_counts = get_domain_counts()

      puts
      puts( "read output for a total of " + queries_count.to_s + " query sequences" )
      puts
      puts( "domain counts (considering potential E-value threshold and ignoring of DUFs):" )
      puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
      puts
      puts( Util.draw_histogram( domain_counts, "#" ) )
      puts
      Util.print_message( PRG_NAME, 'OK' )
      puts

    end # def run()

    # Prints the usage message.
    def print_help()
      puts( "Usage:" )
      puts()
      puts( " " + PRG_NAME + ".rb [options] " )
      puts()
      puts( " options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
      puts( " -" + E_VALUE_THRESHOLD_OPTION + ": E-value threshold, default is no threshold" )
      puts( " -" + PARSE_OUT_DESCRIPITION_OPTION + ": parse query description (in addition to query name)" )
      puts( " -" + IGNORE_DUF_OPTION + ": ignore DUFs" )
      puts()
    end


    # True for lines that carry no data: lines without any alphanumeric
    # character or '-', and lines starting with '#'.
    #
    # This is a public class method. The bare `private` that used to
    # precede it only affects instance methods and therefore never hid it;
    # it has been dropped to avoid suggesting otherwise (parse calls the
    # method with an explicit receiver, so it must stay public).
    def HmmscanParser.is_ignorable?( line )
      return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
    end

  end # class HmmscanParser

end # module Evoruby
# \ No newline at end of file
# diff --git a/forester/ruby/evoruby/lib/evo/apps/msa_processor.rb b/forester/ruby/evoruby/lib/evo/apps/msa_processor.rb
# new file mode 100644
# index 0000000..708a9b7
# --- /dev/null
# +++ b/forester/ruby/evoruby/lib/evo/apps/msa_processor.rb
# @@ -0,0 +1,839 @@
#
# = lib/evo/apps/msa_processor.rb - MsaProcessor class
#
# Copyright:: Copyright (C) 2006-2007 Christian M.
Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: msa_processor.rb,v 1.33 2010/12/13 19:00:10 cmzmasek Exp $ +# + +require 'date' +require 'set' + +require 'lib/evo/util/constants' +require 'lib/evo/util/util' +require 'lib/evo/util/command_line_arguments' +require 'lib/evo/msa/msa_factory' +require 'lib/evo/io/msa_io' +require 'lib/evo/io/writer/phylip_sequential_writer' +require 'lib/evo/io/writer/nexus_writer' +require 'lib/evo/io/writer/fasta_writer' +require 'lib/evo/io/parser/fasta_parser' +require 'lib/evo/io/parser/general_msa_parser' +require 'lib/evo/io/writer/msa_writer' + +module Evoruby + + class MsaProcessor + + PRG_NAME = "msa_pro" + PRG_DATE = "2010.03.19" + PRG_DESC = "processing of multiple sequence alignments" + PRG_VERSION = "1.05" + COPYRIGHT = "2008-2010 Christian M Zmasek" + CONTACT = "phylosoft@gmail.com" + WWW = "www.phylosoft.org" + + + NAME_LENGTH_DEFAULT = 10 + WIDTH_DEFAULT_FASTA = 60 + INPUT_TYPE_OPTION = "i" + OUTPUT_TYPE_OPTION = "o" + MAXIMAL_NAME_LENGTH_OPTION = "n" + WIDTH_OPTION = "w" + CLEAN_UP_SEQ_OPTION = "c" + REM_RED_OPTION = "rem_red" + REMOVE_GAP_COLUMNS_OPTION = "rgc" + REMOVE_GAP_ONLY_COLUMNS = "rgoc" + REMOVE_COLUMNS_GAP_RATIO_OPTION = "rr" + REMOVE_ALL_GAP_CHARACTERS_OPTION = "rg" + REMOVE_ALL_SEQUENCES_LISTED_OPTION = "r" + KEEP_ONLY_SEQUENCES_LISTED_OPTION = "k" + + KEEP_MATCHING_SEQUENCES_OPTION = "mk" + REMOVE_MATCHING_SEQUENCES_OPTION = "mr" + + TRIM_OPTION = "t" + REMOVE_SEQS_GAP_RATIO_OPTION = "rsgr" + REMOVE_SEQS_NON_GAP_LENGTH_OPTION = "rsl" + SPLIT = "split" + LOG_SUFFIX = "_msa_pro.log" + HELP_OPTION_1 = "help" + HELP_OPTION_2 = "h" + + + def initialize() + @input_format_set = false + @output_format_set = false + @fasta_input = false + @phylip_input = true + @name_length = NAME_LENGTH_DEFAULT + @name_length_set = false + @width = WIDTH_DEFAULT_FASTA # fasta only + @pi_output = true + @fasta_output = false + @nexus_output = false + @clean = false # phylip only + @rgc = false + @rgoc = 
false + @rg = false # fasta only + @rem_red = false + @rgr = -1 + @rsgr = -1 + @rsl = -1 + @remove_matching = nil + @keep_matching = nil + + @seqs_name_file = nil + @remove_seqs = false + @keep_seqs = false + @trim = false + @split = -1 + @first = -1 + @last = -1 + end + + + def run() + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC, + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + if ( ARGV == nil || ARGV.length < 1 ) + Util.print_message( PRG_NAME, "Illegal number of arguments" ) + print_help + exit( -1 ) + end + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "Error: " + e.to_s, STDOUT ) + end + + if ( cla.is_option_set?( HELP_OPTION_1 ) || + cla.is_option_set?( HELP_OPTION_2 ) ) + print_help + exit( 0 ) + end + + if ( cla.get_number_of_files != 2 || ARGV.length < 2 ) + Util.print_message( PRG_NAME, "Illegal number of arguments" ) + print_help + exit( -1 ) + end + + allowed_opts = Array.new + allowed_opts.push( INPUT_TYPE_OPTION ) + allowed_opts.push( OUTPUT_TYPE_OPTION ) + allowed_opts.push( MAXIMAL_NAME_LENGTH_OPTION ) + allowed_opts.push( WIDTH_OPTION ) + allowed_opts.push( CLEAN_UP_SEQ_OPTION ) + allowed_opts.push( REMOVE_GAP_COLUMNS_OPTION ) + allowed_opts.push( REMOVE_GAP_ONLY_COLUMNS ) + allowed_opts.push( REMOVE_COLUMNS_GAP_RATIO_OPTION ) + allowed_opts.push( REMOVE_ALL_GAP_CHARACTERS_OPTION ) + allowed_opts.push( REMOVE_ALL_SEQUENCES_LISTED_OPTION ) + allowed_opts.push( KEEP_ONLY_SEQUENCES_LISTED_OPTION ) + allowed_opts.push( TRIM_OPTION ) + allowed_opts.push( REMOVE_SEQS_GAP_RATIO_OPTION ) + allowed_opts.push( REMOVE_SEQS_NON_GAP_LENGTH_OPTION ) + allowed_opts.push( SPLIT ) + allowed_opts.push( REM_RED_OPTION ) + allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION ) + allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION ) + + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, + "unknown 
option(s): " + disallowed ) + end + + input = cla.get_file_name( 0 ) + output = cla.get_file_name( 1 ) + + analyze_command_line( cla ) + + begin + Util.check_file_for_readability( input ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + begin + Util.check_file_for_writability( output ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + if ( @rg ) + set_pi_output( false ) + set_fasta_output( true ) + set_nexus_output( false ) + end + + if ( !@input_format_set ) + fasta_like = false + begin + fasta_like = Util.looks_like_fasta?( input ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + @fasta_input = fasta_like + @phylip_input = !fasta_like + if ( !@output_format_set ) + @fasta_output = fasta_like + @pi_output = !fasta_like + @nexus_output = false + end + end + + ld = Constants::LINE_DELIMITER + log = PRG_NAME + " " + PRG_VERSION + " [" + PRG_DATE + "]" + " LOG" + ld + now = DateTime.now + log << "Date/time: " + now.to_s + ld + + puts() + puts( "Input alignment : " + input ) + log << "Input alignment : " + input + ld + puts( "Output alignment : " + output ) + log << "Output alignment : " + output + ld + if ( @phylip_input ) + puts( "Input is : Phylip, or something like it" ) + log << "Input is : Phylip, or something like it" + ld + elsif ( @fasta_input ) + puts( "Input is : Fasta" ) + log << "Input is : Fasta" + ld + end + if( @rgr >= 0 ) + puts( "Max col gap ratio: " + @rgr.to_s ) + log << "Max col gap ratio: " + @rgr.to_s + ld + elsif ( @rgc ) + puts( "Remove gap colums" ) + log << "Remove gap colums" + ld + elsif( @rgoc ) + puts( "Remove gap only colums" ) + log << "Remove gap only colums" + ld + end + if ( @clean ) + puts( "Clean up : true" ) + log << "Clean up : true" + ld + end + + if ( @pi_output ) + puts( "Output is : Phylip interleaved" ) + log << "Output is : Phylip interleaved" + ld + elsif ( @fasta_output ) + puts( "Output is : Fasta" ) + log 
<< "Output is : Fasta" + ld + if ( @width ) + puts( "Width : " + @width.to_s ) + log << "Width : " + @width.to_s + ld + end + if ( @rg ) + puts( "Remove all gap characters (alignment is destroyed)" ) + log << "Remove all gap characters (alignment is destroyed)" + ld + end + elsif ( @nexus_output ) + puts( "Output is : Nexus" ) + log << "Output is : Nexus" + ld + end + if ( @name_length_set || !@fasta_output ) + puts( "Max name length : " + @name_length.to_s ) + log << "Max name length : " + @name_length.to_s + ld + end + if( @rsgr >= 0 ) + puts( "Remove sequences for which the gap ratio > " + @rsgr.to_s ) + log << "Remove sequences for which the gap ratio > " + @rsgr.to_s + ld + end + if( @rsl >= 0 ) + puts( "Remove sequences with less than " + @rsl.to_s + " non-gap characters" ) + log << "Remove sequences with less than " + @rsl.to_s + " non-gap characters" + ld + end + if ( @remove_seqs ) + puts( "Remove sequences listed in: " + @seqs_name_file ) + log << "Remove sequences listed in: " + @seqs_name_file + ld + elsif ( @keep_seqs ) + puts( "Keep only sequences listed in: " + @seqs_name_file ) + log << "Keep only sequences listed in: " + @seqs_name_file + ld + end + if ( @trim ) + puts( "Keep only columns from: "+ @first.to_s + " to " + @last.to_s ) + log << "Keep only columns from: "+ @first.to_s + " to " + @last.to_s + ld + end + if ( @rem_red ) + puts( "Remove redundant sequences: true" ) + log << "Remove redundant sequences: true" + ld + end + if ( @split > 0 ) + puts( "Split : " + @split.to_s ) + log << "Split : " + @split.to_s + ld + end + puts() + + f = MsaFactory.new() + + msa = nil + + begin + if ( @phylip_input ) + msa = f.create_msa_from_file( input, GeneralMsaParser.new() ) + elsif ( @fasta_input ) + msa = f.create_msa_from_file( input, FastaParser.new() ) + end + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + + if ( msa.is_aligned() ) + Util.print_message( PRG_NAME, "Length of original alignment : " + 
msa.get_length.to_s ) + log << "Length of original alignment : " + msa.get_length.to_s + ld + else + Util.print_message( PRG_NAME, "the input is not aligned" ) + log << "The input is not aligned" + ld + end + + all_names = Set.new() + for i in 0 ... msa.get_number_of_seqs() + current_name = msa.get_sequence( i ).get_name + if all_names.include?( current_name ) + Util.print_warning_message( PRG_NAME, "sequence name [" + current_name + "] is not unique" ) + else + all_names.add( current_name ) + end + end + + begin + + if ( @remove_seqs || @keep_seqs ) + names = Util.file2array( @seqs_name_file, true ) + if ( names == nil || names.length() < 1 ) + error_msg = "file \"" + @seqs_name_file.to_s + "\" appears empty" + Util.fatal_error( PRG_NAME, error_msg ) + end + + if ( @remove_seqs ) + c = 0 + for i in 0 ... names.length() + to_delete = msa.find_by_name( names[ i ], true, false ) + if ( to_delete.length() < 1 ) + error_msg = "sequence name \"" + names[ i ] + "\" not found" + Util.fatal_error( PRG_NAME, error_msg ) + elsif ( to_delete.length() > 1 ) + error_msg = "sequence name \"" + names[ i ] + "\" is not unique" + Util.fatal_error( PRG_NAME, error_msg ) + else + msa.remove_sequence!( to_delete[ 0 ] ) + c += 1 + end + end + Util.print_message( PRG_NAME, "Removed " + c.to_s + " sequences" ) + log << "Removed " + c.to_s + " sequences" + ld + elsif ( @keep_seqs ) + msa_new = Msa.new() + r = 0 + k = 0 + for j in 0 ... 
msa.get_number_of_seqs() + if ( names.include?( msa.get_sequence( j ).get_name() ) ) + msa_new.add_sequence( msa.get_sequence( j ) ) + k += 1 + else + r += 1 + end + end + msa = msa_new + Util.print_message( PRG_NAME, "Kept " + k.to_s + " sequences" ) + log << "Kept " + k.to_s + " sequences" + ld + Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" ) + log << "removed " + r.to_s + " sequences" + ld + end + end + + if ( @trim ) + msa.trim!( @first, @last ) + end + if( @rgr >= 0 ) + msa.remove_gap_columns_w_gap_ratio!( @rgr ) + elsif ( @rgc ) + msa.remove_gap_columns!() + elsif( @rgoc ) + msa.remove_gap_only_columns!() + end + if( @rsgr >= 0 ) + n = msa.get_number_of_seqs() + removed = msa.remove_sequences_by_gap_ratio!( @rsgr ) + k = msa.get_number_of_seqs() + r = n - k + Util.print_message( PRG_NAME, "Kept " + k.to_s + " sequences" ) + log << "Kept " + k.to_s + " sequences" + ld + Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" ) + log << "Removed " + r.to_s + " sequences:" + ld + removed.each { | seq_name | + log << " " + seq_name + ld + } + end + if( @rsl >= 0 ) + n = msa.get_number_of_seqs() + removed = msa.remove_sequences_by_non_gap_length!( @rsl ) + k = msa.get_number_of_seqs() + r = n - k + Util.print_message( PRG_NAME, "Kept " + k.to_s + " sequences" ) + log << "Kept " + k.to_s + " sequences" + ld + Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" ) + log << "Removed " + r.to_s + " sequences:" + ld + removed.each { | seq_name | + log << " " + seq_name + ld + } + end + if ( @keep_matching ) + n = msa.get_number_of_seqs + to_be_removed = Set.new + for ii in 0 ... 
n + seq = msa.get_sequence( ii ) + if !seq.get_name.downcase.index( @keep_matching.downcase ) + to_be_removed.add( ii ) + end + end + to_be_removed_ary = to_be_removed.to_a.sort.reverse + to_be_removed_ary.each { | index | + msa.remove_sequence!( index ) + } + # msa = sort( msa ) + end + if ( @remove_matching ) + n = msa.get_number_of_seqs + to_be_removed = Set.new + for iii in 0 ... n + + seq = msa.get_sequence( iii ) + + if seq.get_name.downcase.index( @remove_matching.downcase ) + to_be_removed.add( iii ) + end + end + to_be_removed_ary = to_be_removed.to_a.sort.reverse + to_be_removed_ary.each { | index | + msa.remove_sequence!( index ) + } + msa = sort( msa ) + end + + + + if ( @split > 0 ) + begin + msas = msa.split( @split, true ) + io = MsaIO.new() + w = MsaWriter + if ( @pi_output ) + w = PhylipSequentialWriter.new() + w.clean( @clean ) + w.set_max_name_length( @name_length ) + elsif( @fasta_output ) + w = FastaWriter.new() + w.set_line_width( @width ) + if ( @rg ) + w.remove_gap_chars( true ) + Util.print_warning_message( PRG_NAME, "removing gap character, the output is likely to become unaligned" ) + log << "removing gap character, the output is likely to become unaligned" + ld + end + w.clean( @clean ) + if ( @name_length_set ) + w.set_max_name_length( @name_length ) + end + elsif( @nexus_output ) + w = NexusWriter.new() + w.clean( @clean ) + w.set_max_name_length( @name_length ) + end + i = 0 + for m in msas + i = i + 1 + io.write_to_file( m, output + "_" + i.to_s, w ) + end + Util.print_message( PRG_NAME, "wrote " + msas.length.to_s + " files" ) + log << "wrote " + msas.length.to_s + " files" + ld + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + + end + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + + if ( @split <= 0 ) + + unless ( @rg ) + if ( msa.is_aligned() ) + Util.print_message( PRG_NAME, "length of processed alignment: " + msa.get_length.to_s ) + log << "length 
of processed alignment: " + msa.get_length.to_s + ld + else + Util.print_warning_message( PRG_NAME, "output is not aligned" ) + log << "output is not aligned" + ld + end + end + + if @rem_red + removed = msa.remove_redundant_sequences!( true, true ) + if removed.size > 0 + Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" ) + log << "going to ignore the following " + removed.size.to_s + " redundant sequences:" + ld + removed.each { | seq_name | + puts seq_name + log << seq_name + ld + } + Util.print_message( PRG_NAME, "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" ) + log << "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" + ld + end + end + + io = MsaIO.new() + + w = MsaWriter + + if ( @pi_output ) + w = PhylipSequentialWriter.new() + w.clean( @clean ) + w.set_max_name_length( @name_length ) + elsif( @fasta_output ) + w = FastaWriter.new() + w.set_line_width( @width ) + if ( @rg ) + w.remove_gap_chars( true ) + Util.print_warning_message( PRG_NAME, "removing gap characters, the output is likely to become unaligned" ) + log << "removing gap character, the output is likely to become unaligned" + ld + end + w.clean( @clean ) + if ( @name_length_set ) + w.set_max_name_length( @name_length ) + end + elsif( @nexus_output ) + w = NexusWriter.new() + w.clean( @clean ) + w.set_max_name_length( @name_length ) + end + + + begin + io.write_to_file( msa, output, w ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + begin + f = File.open( output + LOG_SUFFIX, 'a' ) + f.print( log ) + f.close + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + + end + Util.print_message( PRG_NAME, "OK" ) + puts + end + + + private + + def sort( msa ) + names = Set.new + for i in 0 ... 
msa.get_number_of_seqs + name = msa.get_sequence( i ).get_name + names.add( name ) + end + sorted_ary = names.to_a.sort + new_msa = Msa.new + sorted_ary.each { | seq_name | + seq = msa.get_sequence( msa.find_by_name( seq_name, true, false )[ 0 ] ) + new_msa.add_sequence( seq ) + } + new_msa + end + + def set_fasta_input( fi = true ) + @fasta_input = fi + @input_format_set = true + end + def set_phylip_input( pi = true ) + @phylip_input = pi + @input_format_set = true + end + def set_name_length( i ) + @name_length = i + @name_length_set = true + end + def set_width( i ) + @width = i + end + def set_fasta_output( fo = true ) + @fasta_output = fo + @output_format_set = true + end + def set_pi_output( pso = true ) + @pi_output = pso + @output_format_set = true + end + def set_nexus_output( nexus = true ) + @nexus_output = nexus + @output_format_set = true + end + def set_clean( c = true ) + @clean = c + end + def set_remove_gap_columns( rgc = true ) + @rgc = rgc + end + def set_remove_gap_only_columns( rgoc = true ) + @rgoc = rgoc + end + def set_remove_gaps( rg = true ) + @rg = rg + end + def set_remove_gap_ratio( rgr ) + @rgr = rgr + end + def set_remove_seqs_gap_ratio( rsgr ) + @rsgr = rsgr + end + def set_remove_seqs_min_non_gap_length( rsl ) + @rsl = rsl + end + def set_remove_seqs( file ) + @seqs_name_file = file + @remove_seqs = true + @keep_seqs = false + end + def set_keep_seqs( file ) + @seqs_name_file = file + @keep_seqs = true + @remove_seqs = false + end + def set_trim( first, last ) + @trim = true + @first = first + @last = last + end + def set_remove_matching( remove ) + @remove_matching = remove + end + def set_keep_matching( keep ) + @keep_matching = keep + end + def set_rem_red( rr ) + @rem_red = rr + end + + + + def set_split( s ) + if ( s > 0 ) + @split = s + @clean = false # phylip only + @rgc = false + @rgoc = false + @rg = false # fasta only + @rgr = -1 + @rsgr = -1 + @rsl = -1 + @seqs_name_file = nil + @remove_seqs = false + @keep_seqs = false 
+ @trim = false + @first = -1 + @last = -1 + end + end + def analyze_command_line( cla ) + if ( cla.is_option_set?( INPUT_TYPE_OPTION ) ) + begin + type = cla.get_option_value( INPUT_TYPE_OPTION ) + if ( type == "p" ) + set_phylip_input( true ) + set_fasta_input( false ) + elsif ( type == "f" ) + set_fasta_input( true ) + set_phylip_input( false ) + end + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( OUTPUT_TYPE_OPTION ) ) + begin + type = cla.get_option_value( OUTPUT_TYPE_OPTION ) + if ( type == "p" ) + set_pi_output( true ) + set_fasta_output( false ) + set_nexus_output( false ) + elsif ( type == "f" ) + set_pi_output( false ) + set_fasta_output( true ) + set_nexus_output( false ) + elsif ( type == "n" ) + set_pi_output( false ) + set_fasta_output( false ) + set_nexus_output( true ) + end + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( MAXIMAL_NAME_LENGTH_OPTION ) ) + begin + l = cla.get_option_value_as_int( MAXIMAL_NAME_LENGTH_OPTION ) + set_name_length( l ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( WIDTH_OPTION ) ) + begin + w = cla.get_option_value_as_int( WIDTH_OPTION ) + set_width( w ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( CLEAN_UP_SEQ_OPTION ) ) + set_clean( true ) + end + if ( cla.is_option_set?( REMOVE_GAP_COLUMNS_OPTION ) ) + set_remove_gap_columns( true ) + end + if ( cla.is_option_set?( REM_RED_OPTION ) ) + set_rem_red( true ) + end + if ( cla.is_option_set?( REMOVE_GAP_ONLY_COLUMNS ) ) + set_remove_gap_only_columns( true ) + end + if ( cla.is_option_set?( REMOVE_ALL_GAP_CHARACTERS_OPTION ) ) + set_remove_gaps( true ) + end + if ( cla.is_option_set?( REMOVE_COLUMNS_GAP_RATIO_OPTION ) ) + begin + f = cla.get_option_value_as_float( 
REMOVE_COLUMNS_GAP_RATIO_OPTION ) + set_remove_gap_ratio( f ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( REMOVE_ALL_SEQUENCES_LISTED_OPTION ) ) + begin + s = cla.get_option_value( REMOVE_ALL_SEQUENCES_LISTED_OPTION ) + set_remove_seqs( s ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( KEEP_ONLY_SEQUENCES_LISTED_OPTION ) ) + begin + s = cla.get_option_value( KEEP_ONLY_SEQUENCES_LISTED_OPTION ) + set_keep_seqs( s ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( TRIM_OPTION ) ) + begin + s = cla.get_option_value( TRIM_OPTION ) + if ( s =~ /(\d+)-(\d+)/ ) + set_trim( $1.to_i(), $2.to_i() ) + else + puts( "illegal argument" ) + print_help + exit( -1 ) + end + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( REMOVE_SEQS_GAP_RATIO_OPTION ) ) + begin + f = cla.get_option_value_as_float( REMOVE_SEQS_GAP_RATIO_OPTION ) + set_remove_seqs_gap_ratio( f ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( REMOVE_SEQS_NON_GAP_LENGTH_OPTION ) ) + begin + f = cla.get_option_value_as_int( REMOVE_SEQS_NON_GAP_LENGTH_OPTION ) + set_remove_seqs_min_non_gap_length( f ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( SPLIT ) ) + begin + s = cla.get_option_value_as_int( SPLIT ) + set_split( s ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + + end + if ( cla.is_option_set?( REMOVE_MATCHING_SEQUENCES_OPTION ) ) + begin + s = cla.get_option_value( REMOVE_MATCHING_SEQUENCES_OPTION ) + set_remove_matching( s ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + 
e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( KEEP_MATCHING_SEQUENCES_OPTION ) ) + begin + s = cla.get_option_value( KEEP_MATCHING_SEQUENCES_OPTION ) + set_keep_matching( s ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + + + end + + def print_help() + puts() + puts( "Usage:" ) + puts() + puts( " " + PRG_NAME + ".rb [options] " ) + puts() + puts( " options: -" + INPUT_TYPE_OPTION + "=: f for fasta, p for phylip selex type" ) + puts( " -" + OUTPUT_TYPE_OPTION + "=: f for fasta, n for nexus, p for phylip sequential (default)" ) + puts( " -" + MAXIMAL_NAME_LENGTH_OPTION + "=: n=maximal name length (default for phylip 10, for fasta: unlimited )" ) + puts( " -" + WIDTH_OPTION + "=: n=width (fasta output only, default is 60)" ) + puts( " -" + CLEAN_UP_SEQ_OPTION + ": clean up sequences" ) + puts( " -" + REMOVE_GAP_COLUMNS_OPTION + ": remove gap columns" ) + puts( " -" + REMOVE_GAP_ONLY_COLUMNS + ": remove gap-only columns" ) + puts( " -" + REMOVE_COLUMNS_GAP_RATIO_OPTION + "=: remove columns for which ( seqs with gap / number of sequences > n )" ) + puts( " -" + REMOVE_ALL_GAP_CHARACTERS_OPTION + ": remove all gap characters (destroys alignment, fasta output only)" ) + puts( " -" + REMOVE_ALL_SEQUENCES_LISTED_OPTION + "=: remove all sequences listed in file" ) + puts( " -" + KEEP_ONLY_SEQUENCES_LISTED_OPTION + "=: keep only sequences listed in file" ) + puts( " -" + TRIM_OPTION + "=-: remove columns before first and after last" ) + puts( " -" + REMOVE_SEQS_GAP_RATIO_OPTION + "=: remove sequences for which the gap ratio > n (after column operations)" ) + puts( " -" + REMOVE_SEQS_NON_GAP_LENGTH_OPTION + "= remove sequences with less than n non-gap characters (after column operations)" ) + puts( " -" + REMOVE_MATCHING_SEQUENCES_OPTION + "= remove all sequences with names containing s" ) + puts( " -" + KEEP_MATCHING_SEQUENCES_OPTION + "= keep only sequences with names containing s" ) + puts( " -" + SPLIT + "= 
split a fasta file into n files of equal number of sequences (expect for " ) + puts( " last one), cannot be used with other options" ) + puts( " -" + REM_RED_OPTION + ": remove redundant sequences" ) + puts() + end + + + + + + end # class MsaProcessor + + +end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/apps/multi_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/apps/multi_sequence_extractor.rb new file mode 100644 index 0000000..7f9ffe8 --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/apps/multi_sequence_extractor.rb @@ -0,0 +1,395 @@ +# +# = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class +# +# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: multi_sequence_extractor.rb,v 1.10 2010/12/13 19:00:11 cmzmasek Exp $ + + +require 'lib/evo/util/constants' +require 'lib/evo/util/util' +require 'lib/evo/msa/msa' +require 'lib/evo/msa/msa_factory' +require 'lib/evo/io/msa_io' +require 'lib/evo/io/parser/fasta_parser' +require 'lib/evo/io/writer/fasta_writer' +require 'lib/evo/util/command_line_arguments' + + + +module Evoruby + + class MultiSequenceExtractor + + PRG_NAME = "mse" + PRG_VERSION = "1.0.0" + PRG_DESC = "extraction of sequences by name from multiple multi-sequence ('fasta') files" + PRG_DATE = "2008.08.13" + COPYRIGHT = "2008-2009 Christian M Zmasek" + CONTACT = "phylosoft@gmail.com" + WWW = "www.phylosoft.org" + HELP_OPTION_1 = 'help' + HELP_OPTION_2 = 'h' + + LOG_SUFFIX = ".mse_log" + FASTA_SUFFIX = ".fasta" + FASTA_WITH_NORMALIZED_IDS_SUFFIX = ".ni.fasta" + NORMALIZED_IDS_MAP_SUFFIX = ".nim" + PROTEINS_LIST_FILE_SEPARATOR = "\t" + CACHE_GENOMES = false + + def run() + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC , + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + ld = Constants::LINE_DELIMITER + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " 
+ e.to_s ) + end + + if ( cla.is_option_set?( HELP_OPTION_1 ) || + cla.is_option_set?( HELP_OPTION_2 ) ) + print_help + exit( 0 ) + end + + if ( cla.get_number_of_files != 3 && cla.get_number_of_files != 4 ) + print_help + exit( -1 ) + end + + allowed_opts = Array.new + + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, + "unknown option(s): " + disallowed, + STDOUT ) + end + + seq_names_files_suffix = cla.get_file_name( 0 ) + input_dir = cla.get_file_name( 1 ) + out_dir = cla.get_file_name( 2 ) + mapping_file = nil + + if ( cla.get_number_of_files == 4 ) + mapping_file = cla.get_file_name( 3 ) + begin + Util.check_file_for_readability( mapping_file ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + end + + if !File.exist?( input_dir ) + Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" ) + end + if !File.exist?( out_dir ) + Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" ) + end + if !File.directory?( input_dir ) + Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" ) + end + if !File.directory?( out_dir ) + Util.fatal_error( PRG_NAME, "error: [#{out_dir}] is not a directory" ) + end + + + log = String.new + + log << "Program : " + PRG_NAME + ld + log << "Version : " + PRG_VERSION + ld + log << "Program date : " + PRG_DATE + ld + + puts() + puts( "Sequence names files suffix: " + seq_names_files_suffix ) + log << "Sequence names files suffix: " + seq_names_files_suffix + ld + puts( "Input dir : " + input_dir ) + log << "Input dir : " + input_dir + ld + puts( "Output dir : " + out_dir ) + log << "Output dir : " + out_dir + ld + if ( mapping_file != nil ) + puts( "Mapping file : " + mapping_file ) + log << "Mapping file : " + mapping_file + ld + end + log << "Date : " + Time.now.to_s + ld + puts + + if ( mapping_file != nil ) + species_codes_to_paths = 
extract_mappings( mapping_file ) + end + + input_files = obtain_inputfiles( input_dir, seq_names_files_suffix ) + + counter = 0 + species_to_genomes = Hash.new() + + input_files.each { |input_file| + counter += 1 + puts + puts + puts counter.to_s + "/" + input_files.size.to_s + read_seq_family_file( input_file, + seq_names_files_suffix, + input_dir, + species_codes_to_paths, + species_to_genomes, + log, + out_dir, + mapping_file ) + } + puts + Util.print_message( PRG_NAME, "OK" ) + puts + + end + + + def read_seq_family_file( input_file, + seq_names_files_suffix, + input_dir, + species_codes_to_paths, + species_to_genomes, + log, + out_dir, + mapping_file ) + + begin + Util.check_file_for_readability( input_file ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + basename = File.basename( input_file, seq_names_files_suffix ) + out_file_path_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX + out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX + out_file_path_ids_map = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX + begin + Util.check_file_for_writability( out_file_path_fasta_file ) + Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file ) + Util.check_file_for_writability( out_file_path_ids_map ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + ids_map_writer = nil + begin + ids_map_writer = File.open( out_file_path_ids_map, 'a' ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + current_species = "" + current_msa = nil + new_msa = Msa.new + new_msa_normalized_ids = Msa.new + per_species_counter = 0 + + puts basename + + File.open( input_file ) do | file | + while line = file.gets + if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) ) + values = line.split( PROTEINS_LIST_FILE_SEPARATOR ) + if ( 
values.length < 2 ) + Util.fatal_error( PRG_NAME, "unexpected format: " + line ) + end + species = values[ 0 ] + seq_name = values[ 1 ] + if ( species != current_species ) + current_species = species + my_file = input_dir + Constants::FILE_SEPARATOR + current_species + + if ( !File.exist?( my_file ) ) + if species_codes_to_paths == nil + Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" ) + elsif ( !species_codes_to_paths.has_key?( current_species ) ) + Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" ) + end + my_file = species_codes_to_paths[ current_species ] + end + my_path = File.expand_path( my_file ) + my_readlink = my_path + if ( File.symlink?( my_path ) ) + my_readlink = File.readlink( my_path ) + end + current_msa = nil + if ( CACHE_GENOMES && species_to_genomes.has_key?( species ) ) + current_msa = species_to_genomes[ species ] + else + current_msa = read_fasta_file( my_file ) + if CACHE_GENOMES + species_to_genomes[ species ] = current_msa + end + end + + if ( per_species_counter > 0 ) + print_counts( per_species_counter, log, Constants::LINE_DELIMITER ) + per_species_counter = 0 + end + puts " " + current_species + " [" + my_readlink + "]" + log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER + end + puts " " + seq_name + log << " " + seq_name + Constants::LINE_DELIMITER + per_species_counter = per_species_counter + 1 + seq = nil + + if current_msa.find_by_name_start( seq_name, true ).size > 0 + begin + seq = current_msa.get_by_name_start( seq_name, true ).copy + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + elsif + # Not found, try finding by partial match. 
+ begin + seq = current_msa.get_by_name( seq_name, true, true ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + end + + normalized_id = per_species_counter.to_s( 16 ).upcase + + "_" + current_species + + per_species_counter.to_i + + ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER ) + + if ( seq != nil ) + seq.set_name( seq.get_name + " [" + current_species + "]" ) + new_msa.add_sequence( seq ) + else + Util.fatal_error( PRG_NAME, "unexected error: seq is nil" ) + end + + new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) ) + + end + end + + end + + ids_map_writer.close + + if ( per_species_counter > 0 ) + print_counts( per_species_counter, log, Constants::LINE_DELIMITER ) + end + + io = MsaIO.new() + + fasta_writer = FastaWriter.new() + fasta_writer.remove_gap_chars + fasta_writer.clean + + begin + io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + begin + io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + begin + f = File.open( out_dir + Constants::FILE_SEPARATOR + basename + LOG_SUFFIX , 'a' ) + f.print( log ) + f.close + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + end + + def obtain_inputfiles( input_dir, seq_names_files_suffix ) + input_files = Array.new() + Dir.foreach( input_dir ) { |file_name| + if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size ) + input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name ) + end + } + input_files + end + + def extract_mappings( mapping_file ) + species_code_to_path = Hash.new() + File.open( mapping_file ) do | file | + while line = file.gets + if ( !Util.is_string_empty?( line ) && 
!(line =~ /\s*#/ ) ) + if ( line =~ /(\S+)\s+(\S+)/ ) + species = $1 + path = $2 + if ( species_code_to_path.has_key?( species ) ) + Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" ) + end + if ( species_code_to_path.has_value?( path ) ) + Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" ) + end + if ( !File.exist?( path ) ) + Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" ) + end + if ( !File.file?( path ) ) + Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" ) + end + if ( !File.readable?( path ) ) + Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" ) + end + if ( File.size( path ) < 10000 ) + Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" ) + end + if ( !Util.looks_like_fasta?( path ) ) + Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" ) + end + species_code_to_path[ species ] = path + puts species + " -> " + path + end + end + end + end + species_code_to_path + end + + def print_counts( per_species_counter, log, ld ) + puts " [sum: " + per_species_counter.to_s + "]" + log << " [sum: " + per_species_counter.to_s + "]" + ld + end + + def read_fasta_file( input ) + f = MsaFactory.new() + msa = nil + begin + msa = f.create_msa_from_file( input, FastaParser.new() ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + msa + end + + def print_help() + puts( "Usage:" ) + puts() + puts( " " + PRG_NAME + ".rb [mapping file for " + + "genome multiple-sequence ('fasta') files not in input dir]" ) + puts() + puts( " " + "Example: \"mse.rb .prot . 
seqs ../genome_locations.txt\"" ) + puts() + end + + end # class MultiSequenceExtractor +end \ No newline at end of file diff --git a/forester/ruby/evoruby/lib/evo/apps/new_tap.rb b/forester/ruby/evoruby/lib/evo/apps/new_tap.rb new file mode 100644 index 0000000..1dc7431 --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/apps/new_tap.rb @@ -0,0 +1,167 @@ +# +# = lib/evo/apps/ - class +# +# Copyright:: Copyright (C) 2009 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: new_tap.rb,v 1.4 2010/12/13 19:00:11 cmzmasek Exp $ + + +require 'lib/evo/util/util' +require 'lib/evo/msa/msa_factory' +require 'lib/evo/msa/msa' +require 'lib/evo/io/msa_io' +require 'lib/evo/io/parser/fasta_parser' +require 'lib/evo/io/parser/general_msa_parser' +require 'lib/evo/io/writer/fasta_writer' +require 'lib/evo/io/writer/phylip_sequential_writer' +require 'lib/evo/util/command_line_arguments' + +module Evoruby + + class TaxonomyProcessor + + PRG_NAME = "" + PRG_DATE = "2009.10.09" + PRG_DESC = "replacement of labels in multiple sequence files" + PRG_VERSION = "1.00" + COPYRIGHT = "2009 Christian M Zmasek" + CONTACT = "phylosoft@gmail.com" + WWW = "www.phylosoft.org" + + REMOVE_REDUNDANT_SEQS_OPTION = "rr" + + def initialize() + @taxonomies = Hash.new() + end + + def run() + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC, + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + if ( ARGV == nil || ( ARGV.length != 3 && ARGV.length != 4 ) ) + puts( "Usage: #{PRG_NAME}.rb " ) + puts() + puts( " options: -" + REMOVE_REDUNDANT_SEQS_OPTION + ": to remove redundant sequences" ) + puts() + exit( -1 ) + end + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + input = cla.get_file_name( 0 ) + output = cla.get_file_name( 1 ) + map_file = cla.get_file_name( 2 ) + + allowed_opts = Array.new + allowed_opts.push( REMOVE_REDUNDANT_SEQS_OPTION ) + + disallowed = 
cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed ) + end + + + remove_redudant = false + if ( cla.is_option_set?( REMOVE_REDUNDANT_SEQS_OPTION ) ) + remove_redudant = true + end + + if ( File.exists?( output ) ) + Util.fatal_error( PRG_NAME, "outfile [" + output + "] already exists" ) + end + if ( File.exists?( map_file ) ) + Util.fatal_error( PRG_NAME, "map file [" + map_file + "] already exists" ) + end + if ( !File.exists?( input) ) + Util.fatal_error( PRG_NAME, "infile [" + input + "] does not exist" ) + end + + fasta_like = Util.looks_like_fasta?( input ) + + puts() + puts( "Input alignment : " + input ) + puts( "Output alignment: " + output ) + puts( "Output map : " + map_file ) + if ( fasta_like ) + puts( "Format : Fasta" ) + else + puts( "Format : Phylip like" ) + end + puts() + + species_map = Hash.new + + f = MsaFactory.new() + begin + if ( fasta_like ) + msa = f.create_msa_from_file( input, FastaParser.new() ) + else + msa = f.create_msa_from_file( input, GeneralMsaParser.new() ) + end + rescue Exception => e + Util.fatal_error( PRG_NAME, "failed to read file: " + e.to_s ) + end + + if ( msa == nil || msa.get_number_of_seqs() < 1 ) + Util.fatal_error( PRG_NAME, "failed to read MSA" ) + end + begin + Util.check_file_for_writability( map_file ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_, STDOUT ) + end + + if ( remove_redudant ) + removed = msa.remove_redundant_sequences!( true ) + if removed.size > 0 + Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" ) + removed.each { | seq_name | + puts seq_name + } + Util.print_message( PRG_NAME, "will process " + msa.get_number_of_seqs.to_s + " non redundant sequences" ) + end + end + + lf = File.open( map_file, "a" ) + for i in 0 ... 
msa.get_number_of_seqs + seq = msa.get_sequence( i ) + end + + io = MsaIO.new() + w = nil + if ( fasta_like ) + w = FastaWriter.new() + else + w = PhylipSequentialWriter.new() + end + w.set_max_name_length( 10 ) + w.clean( true ) + begin + io.write_to_file( msa, output, w ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s ) + end + lf.close() + if ( @taxonomies.length > 0 ) + Util.print_message( PRG_NAME, "number of unique taxonomies: " + @taxonomies.length.to_s ) + end + Util.print_message( PRG_NAME, "wrote: " + map_file ) + Util.print_message( PRG_NAME, "wrote: " + output ) + Util.print_message( PRG_NAME, "OK" ) + end + + end # class + +end # module Evoruby \ No newline at end of file diff --git a/forester/ruby/evoruby/lib/evo/apps/phylogenies_decorator.rb b/forester/ruby/evoruby/lib/evo/apps/phylogenies_decorator.rb new file mode 100644 index 0000000..69a6bc4 --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/apps/phylogenies_decorator.rb @@ -0,0 +1,299 @@ +#!/usr/local/bin/ruby -w +# +# = lib/evo/apps/phylogenies_decorator +# +# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# decoration of phylogenies with sequence/species names and domain architectures +# +# $Id: phylogenies_decorator.rb,v 1.34 2010/12/13 19:00:11 cmzmasek Exp $ +# +# Environment variable FORESTER_HOME needs to point to the appropriate +# directory (e.g. setenv FORESTER_HOME $HOME/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester-atv/) + +require 'lib/evo/util/constants' +require 'lib/evo/util/util' +require 'lib/evo/util/command_line_arguments' + +require 'date' + +module Evoruby + + class PhylogeniesDecorator + + DECORATOR_OPTIONS_SEQ_NAMES = '-r=1 -mdn' + # -mdn is a hidden expert option to rename e.g. 
# "6_ORYLA3" to "6_[3]_ORYLA"
    #DECORATOR_OPTIONS_SEQ_NAMES = '-sn -r=1'
    DECORATOR_OPTIONS_DOMAINS = '-r=1'
    IDS_MAPFILE_SUFFIX = '.nim'
    DOMAINS_MAPFILE_SUFFIX = '.dff'
    SLEEP_TIME = 0.1
    REMOVE_NI = true
    TMP_FILE = '___PD___'
    LOG_FILE = '00_phylogenies_decorator.log'
    FORESTER_HOME = ENV[Constants::FORESTER_HOME_ENV_VARIABLE]
    JAVA_HOME = ENV[Constants::JAVA_HOME_ENV_VARIABLE]

    PRG_NAME = "phylogenies_decorator"
    PRG_DATE = "2008.09.02"
    PRG_DESC = "decoration of phylogenies with sequence/species names and domain architectures"
    PRG_VERSION = "1.0.1"
    COPYRIGHT = "2008-2009 Christian M Zmasek"
    CONTACT = "phylosoft@gmail.com"
    WWW = "www.phylosoft.org"

    IDS_ONLY_OPTION = "n"
    DOMAINS_ONLY_OPTION = "d"
    HELP_OPTION_1 = "help"
    HELP_OPTION_2 = "h"

    NL = Constants::LINE_DELIMITER

    # Decorates every phylogeny file in the current directory whose name ends
    # with the input suffix (ARGV file argument 0) and writes the result under
    # the output suffix (file argument 1).
    #
    # For each phylogeny the matching id map (IDS_MAPFILE_SUFFIX) and/or domain
    # map (DOMAINS_MAPFILE_SUFFIX) is looked up by the phylogeny id (see
    # get_id) and the forester 'decorator' Java application is invoked once per
    # map. A log of all invocations is written to LOG_FILE at the very end.
    #
    # NOTE(review): both suffixes are interpolated into regular expressions
    # unescaped, so regexp metacharacters in them are interpreted as such.
    def run

      Util.print_program_information( PRG_NAME,
        PRG_VERSION,
        PRG_DESC,
        PRG_DATE,
        COPYRIGHT,
        CONTACT,
        WWW,
        STDOUT )

      if ( ARGV == nil || ARGV.length > 3 || ARGV.length < 2 )
        print_help
        exit( -1 )
      end

      if FORESTER_HOME == nil || FORESTER_HOME.length < 1
        Util.fatal_error( PRG_NAME, "apparently environment variable #{Constants::FORESTER_HOME_ENV_VARIABLE} has not been set" )
      end
      if JAVA_HOME == nil || JAVA_HOME.length < 1
        Util.fatal_error( PRG_NAME, "apparently environment variable #{Constants::JAVA_HOME_ENV_VARIABLE} has not been set" )
      end

      if !File.exist?( FORESTER_HOME )
        Util.fatal_error( PRG_NAME, '[' + FORESTER_HOME + '] does not exist' )
      end
      if !File.exist?( JAVA_HOME )
        Util.fatal_error( PRG_NAME, '[' + JAVA_HOME + '] does not exist' )
      end

      decorator = JAVA_HOME + '/bin/java -cp ' + FORESTER_HOME + '/java/forester.jar org.forester.application.decorator'

      begin
        cla = CommandLineArguments.new( ARGV )
      rescue ArgumentError => e
        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
      end

      if ( cla.is_option_set?( HELP_OPTION_1 ) ||
           cla.is_option_set?( HELP_OPTION_2 ) )
        print_help
        exit( 0 )
      end

      if File.exist?( LOG_FILE )
        Util.fatal_error( PRG_NAME, 'logfile [' + LOG_FILE + '] already exists' )
      end

      allowed_opts = Array.new
      allowed_opts.push( IDS_ONLY_OPTION )
      allowed_opts.push( DOMAINS_ONLY_OPTION )

      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
      if ( disallowed.length > 0 )
        Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
      end

      ids_only = false
      domains_only = false

      in_suffix = cla.get_file_name( 0 )
      out_suffix = cla.get_file_name( 1 )

      if cla.is_option_set?( IDS_ONLY_OPTION )
        ids_only = true
      end
      if cla.is_option_set?( DOMAINS_ONLY_OPTION )
        domains_only = true
      end

      if ( ids_only && domains_only )
        Util.fatal_error( PRG_NAME, 'attempt to use ids only and domains only at the same time' )
      end

      log = String.new

      now = DateTime.now
      log << "Program : " + PRG_NAME + NL
      log << "Version : " + PRG_VERSION + NL
      log << "Program date : " + PRG_DATE + NL
      log << "Options for seq names: " + DECORATOR_OPTIONS_SEQ_NAMES + NL
      log << "Options for domains : " + DECORATOR_OPTIONS_DOMAINS + NL
      log << "FORESTER_HOME : " + FORESTER_HOME + NL
      log << "JAVA_HOME : " + JAVA_HOME + NL + NL
      log << "Date/time: " + now.to_s + NL
      log << "Directory: " + Dir.getwd + NL + NL

      Util.print_message( PRG_NAME, 'input suffix : ' + in_suffix )
      Util.print_message( PRG_NAME, 'output suffix : ' + out_suffix )

      log << 'input suffix : ' + in_suffix + NL
      log << 'output suffix : ' + out_suffix + NL

      # A temp file left over from an aborted earlier run must not be picked up.
      if ( File.exist?( TMP_FILE ) )
        File.delete( TMP_FILE )
      end

      files = Dir.entries( "." )

      counter = 0

      files.each { | phylogeny_file |
        if ( !File.directory?( phylogeny_file ) &&
             phylogeny_file !~ /^\./ &&
             phylogeny_file !~ /^00/ &&
             phylogeny_file !~ /#{out_suffix}$/ &&
             phylogeny_file =~ /#{in_suffix}$/ )
          begin
            Util.check_file_for_readability( phylogeny_file )
          rescue ArgumentError => e
            # The exception must be converted explicitly: String + Exception
            # raises TypeError and would hide the real problem.
            Util.fatal_error( PRG_NAME, 'can not read from: ' + phylogeny_file + ': ' + e.to_s )
          end

          counter += 1

          outfile = phylogeny_file.sub( /#{in_suffix}$/, out_suffix )

          if REMOVE_NI
            outfile = outfile.sub( /_ni_/, '_' )
          end

          if File.exist?( outfile )
            msg = counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile +
              ' : already exists, skipping'
            Util.print_message( PRG_NAME, msg )
            log << msg + NL
            next
          end

          Util.print_message( PRG_NAME, counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile )
          log << counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile + NL

          phylogeny_id = get_id( phylogeny_file )

          ids_mapfile_name = nil
          domains_mapfile_name = nil

          if ids_only
            ids_mapfile_name = get_file( files, phylogeny_id, IDS_MAPFILE_SUFFIX )
          elsif domains_only
            domains_mapfile_name = get_file( files, phylogeny_id, DOMAINS_MAPFILE_SUFFIX )
          else
            ids_mapfile_name = get_file( files, phylogeny_id, IDS_MAPFILE_SUFFIX )
            domains_mapfile_name = get_file( files, phylogeny_id, DOMAINS_MAPFILE_SUFFIX )
          end

          if domains_mapfile_name != nil
            begin
              Util.check_file_for_readability( domains_mapfile_name )
            rescue ArgumentError => e
              # Double quotes are required for the file name to be interpolated.
              Util.fatal_error( PRG_NAME, "failed to read from [#{domains_mapfile_name}]: " + e.to_s )
            end
          end

          if ids_mapfile_name != nil
            begin
              Util.check_file_for_readability( ids_mapfile_name )
            rescue ArgumentError => e
              Util.fatal_error( PRG_NAME, "failed to read from [#{ids_mapfile_name}]: " + e.to_s )
            end
          end

          # Domain decoration runs first; when ids are decorated as well its
          # output goes to TMP_FILE, which then feeds the second pass.
          if domains_mapfile_name != nil
            if ids_mapfile_name != nil
              my_outfile = TMP_FILE
            else
              my_outfile = outfile
            end
            cmd = decorator + ' ' + DECORATOR_OPTIONS_DOMAINS + ' ' +
              '-f=d ' + phylogeny_file + ' ' +
              domains_mapfile_name + ' ' + my_outfile
            execute_cmd( cmd, log )
          end

          if ids_mapfile_name != nil
            if domains_mapfile_name != nil
              my_infile = TMP_FILE
            else
              my_infile = phylogeny_file
            end
            cmd = decorator + ' ' + DECORATOR_OPTIONS_SEQ_NAMES + ' ' +
              '-f=s ' + my_infile + ' ' +
              ids_mapfile_name + ' ' + outfile
            execute_cmd( cmd, log )
          end

          if ( File.exist?( TMP_FILE ) )
            File.delete( TMP_FILE )
          end
        end
      }
      open( LOG_FILE, 'w' ) do | f |
        f.write( log )
      end
      puts
      Util.print_message( PRG_NAME, 'OK' )
      puts
    end # def run

    # Runs +cmd+ through IO.popen, appends the command line and everything the
    # command printed to +log+, then sleeps for SLEEP_TIME seconds.
    def execute_cmd( cmd, log )
      log << 'executing ' + cmd + NL
      IO.popen( cmd , 'r+' ) do | pipe |
        pipe.close_write
        log << pipe.read + NL + NL
      end
      sleep( SLEEP_TIME )
    end


    # Returns the part of +phylogeny_file_name+ before the first underscore
    # (nil if the name starts with an underscore).
    def get_id( phylogeny_file_name )
      phylogeny_file_name =~ /^([^_]+)/
      $1
    end

    # Returns the single entry of +files_in_dir+ that is a regular, non-hidden
    # file not starting with "00", starts with +phylogeny_id+ and ends with
    # +suffix_pattern+. Reports a fatal error if there is no such file or more
    # than one.
    #
    # NOTE(review): id and suffix are interpolated into the regexp unescaped,
    # so the '.' of e.g. '.nim' matches any character.
    def get_file( files_in_dir, phylogeny_id, suffix_pattern )
      matching_files = Array.new
      files_in_dir.each { | file |

        if ( !File.directory?( file ) &&
             file !~ /^\./ &&
             file !~ /^00/ &&
             file =~ /^#{phylogeny_id}.*#{suffix_pattern}$/ )
          matching_files << file
        end
      }
      if matching_files.length < 1
        Util.fatal_error( PRG_NAME, 'no file matching [' + phylogeny_id +
            '_] [' + suffix_pattern + '] present in current directory' )
      elsif matching_files.length > 1
        Util.fatal_error( PRG_NAME, 'more than one file matching [' + phylogeny_id +
            '_] [' + suffix_pattern + '] present in current directory' )
      end
      matching_files[ 0 ]
    end

    # Prints the command line usage to standard output.
    def print_help()
      puts( "Usage:" )
      puts()
      puts( " " + PRG_NAME + ".rb [options] " )
      puts()
      puts( " options: -" + IDS_ONLY_OPTION + ": decorate with sequence/species names only" )
      puts( " -" + DOMAINS_ONLY_OPTION + ": decorate with domain structures" )
      puts()
    end
module Evoruby

  # Drives automated phylogeny reconstruction: reads command templates from
  # TEMPLATE_FILE, instantiates them for every alignment file found in the
  # current directory and either submits each command via 'qsub' (option -s)
  # or starts it in the background with 'nohup'.
  #
  # Template file syntax, as parsed by #run:
  #   # ...             comment
  #   $ NAME = PATH     path placeholder, replaced literally inside commands
  #   % HMM ID = NAME   hmm for the alignment with id ID
  #   % RSL ID = LEN    minimal length for the alignment with id ID
  #   % NAME = VALUE    value for the option placeholder %[NAME]%
  #   > COMMAND         one line of the current command
  #   -                 ends the current command
  class PhylogenyFactory

    PRG_NAME = "phylogeny_factory"
    PRG_DATE = "2010.05.26"
    PRG_DESC = "automated phylogeny reconstruction using queing system"
    PRG_VERSION = "1.1"
    COPYRIGHT = "2010 Christian M Zmasek"
    CONTACT = "phylosoft@gmail.com"
    WWW = "www.phylosoft.org"

    USE_JOB_SUBMISSION_SYSTEM_OPTION = 's'
    LOG_FILE = '00_phylogeny_factory.log'
    TEMPLATE_FILE = '00_phylogeny_factory.template'
    PBS_O_WORKDIR = '$PBS_O_WORKDIR/'
    MIN_LENGTH_DEFAULT = 40
    WALLTIME = '100:00:00'
    QUEUE = 'default'

    TMP_CMD_FILE_SUFFIX = '_QSUB'

    HMM = 'HMM'
    RSL = 'RSL'

    OPTION_OPEN = '%['
    OPTION_CLOSE = ']%'

    WAIT = 1.0

    NL = Constants::LINE_DELIMITER

    # Program entry point; see the class comment for the overall flow.
    # Exits with -1 if LOG_FILE already exists or TEMPLATE_FILE is missing.
    def run

      Util.print_program_information( PRG_NAME,
        PRG_VERSION,
        PRG_DESC,
        PRG_DATE,
        COPYRIGHT,
        CONTACT,
        WWW,
        STDOUT )

      begin
        cla = CommandLineArguments.new( ARGV )
      rescue ArgumentError => e
        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
      end

      allowed_opts = Array.new
      allowed_opts.push( USE_JOB_SUBMISSION_SYSTEM_OPTION )

      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
      if ( disallowed.length > 0 )
        Util.fatal_error( PRG_NAME,
          "unknown option(s): " + disallowed,
          STDOUT )
      end

      if File.exist?( LOG_FILE )
        puts( '[' + PRG_NAME + '] > log file [' + LOG_FILE + '] already exists' )
        exit( -1 )
      end

      if !File.exist?( TEMPLATE_FILE )
        puts( '[' + PRG_NAME + '] > template file [' + TEMPLATE_FILE + '] not found' )
        exit( -1 )
      end

      use_job_submission_system = false
      if cla.is_option_set?( USE_JOB_SUBMISSION_SYSTEM_OPTION )
        use_job_submission_system = true
      end

      log = String.new

      now = DateTime.now
      log << "Program : " + PRG_NAME + NL
      log << "Version : " + PRG_VERSION + NL
      log << "Program date: " + PRG_DATE + NL + NL
      log << "Date/time : " + now.to_s + NL
      log << "Directory : " + Dir.getwd + NL + NL

      puts( '[' + PRG_NAME + '] > reading ' + TEMPLATE_FILE )

      paths = Hash.new # path placeholder -> full path
      min_lengths = Hash.new # alignment id -> minimal length
      hmms = Hash.new # alignment id -> hmm
      options = Hash.new # option placeholder -> option
      ids = Set.new

      commands = Array.new

      log << "////////////////////////////////////////////////////////////////// #{NL}"
      log << "Template file [" + TEMPLATE_FILE + "]:#{NL}"

      command = String.new

      # Block form so that the template file handle is closed again.
      File.open( TEMPLATE_FILE ) do | template |
        template.each { | line |
          log << line
          if ( line =~ /^#/ )
            # comment line, nothing to do
          elsif ( line =~ /^\$\s*(\S+)\s*=\s*(\S+)/ )
            paths[ $1 ] = $2
            puts( '[' + PRG_NAME + '] > paths : ' + $1 + ' => ' + $2 )

          elsif ( line =~ /^%\s*#{HMM}\s*(\S+)\s*=\s*(\S+)/ )
            hmms[ $1 ] = $2
            puts( '[' + PRG_NAME + '] > hmms : ' + $1 + ' => ' + $2 )

          elsif ( line =~ /^%\s*#{RSL}\s*(\S+)\s*=\s*(\S+)/ )
            min_lengths[ $1 ] = $2
            puts( '[' + PRG_NAME + '] > min lengths: ' + $1 + ' => ' + $2 )

          elsif ( line =~ /^%\s*(\S+)\s*=\s*(\S+)/ )
            options[ $1 ] = $2
            puts( '[' + PRG_NAME + '] > options : ' + $1 + ' => ' + $2 )

          elsif ( line =~ /^>\s*(.+)/ )
            command = command + $1 + ";#{NL}"

          elsif ( line =~ /^-/ )
            # A '-' line terminates the command collected so far.
            commands << prepare( command, paths )
            command = String.new
          end
        }
      end
      log << "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ #{NL}#{NL}"

      files = Dir.entries( "." )

      files.each { | file |
        # Every regular file is an alignment unless it is hidden, a backup, a
        # log, nohup output or one of the "00" bookkeeping files.
        if ( !File.directory?( file ) &&
             file !~ /^\./ &&
             file !~ /#{TEMPLATE_FILE}/ &&
             file !~ /\.bck$/ &&
             file !~ /\.log$/ &&
             file !~ /nohup/ &&
             file !~ /^00/ )
          aln_name = file.to_str
          id = get_id( aln_name )
          ids.add( id )
          puts( '[' + PRG_NAME + '] > file [id] : ' + aln_name + ' [' + id + ']' )
          commands.each do | cmd |

            cmd = subst_hmm( cmd, aln_name, hmms )
            cmd = subst_min_length( cmd, aln_name, min_lengths )
            cmd = subst_options( cmd, options )
            if use_job_submission_system
              cmd = subst_aln_name( cmd, PBS_O_WORKDIR + aln_name )
            else
              cmd = subst_aln_name( cmd, aln_name )
            end

            if ( cmd =~ /%/ )
              # A lone '%' has no closing partner; fall back to '%' so that the
              # warning itself cannot fail on nil.
              problem = cmd[ /%.*?%/ ] || '%'
              puts( '[' + PRG_NAME + '] > WARNING : [' + id + '] command still contains placeholder: ' + problem )
              log << "WARNING: command still contains placeholder: " + cmd + NL
            else
              tmp_cmd_file = file.to_str[ 0..4 ] + TMP_CMD_FILE_SUFFIX
              if ( File.exist?( tmp_cmd_file ) )
                File.delete( tmp_cmd_file )
              end
              if use_job_submission_system
                open( tmp_cmd_file, 'w' ) do |f|
                  f.write( cmd )
                end
              end

              log << cmd + NL

              if use_job_submission_system
                IO.popen( 'qsub -q ' + QUEUE + ' -l walltime=' + WALLTIME + ' ' + tmp_cmd_file , 'r+' ) do | pipe |
                  pipe.close_write
                end
              else
                spawn( 'nohup ' + cmd + ' &', STDERR => "/dev/null" )
              end

              sleep( WAIT )
              if ( File.exist?( tmp_cmd_file ) )
                File.delete( tmp_cmd_file )
              end
            end
          end
        end
      }

      open( LOG_FILE, 'w' ) do | f |
        f.write( log )
      end

      puts()
      puts( '[' + PRG_NAME + '] > OK' )
      puts()

    end # def run

    # Returns +command+ with every path placeholder (key of +paths+) replaced
    # by its full path.
    def prepare( command, paths )
      paths.each_pair{ | name, full |
        command = command.gsub( name, full )
      }
      command
    end

    # Replaces each %[NAME]% placeholder in +command+ for which +options+
    # holds a non-empty value; other placeholders are left untouched.
    def subst_options( command, options )
      opt_placeholders = command.scan( /%\[\S+\]%/ )
      opt_placeholders.each { | opt_placeholder |
        opt_placeholder = opt_placeholder.gsub( OPTION_OPEN , '' )
        opt_placeholder = opt_placeholder.gsub( OPTION_CLOSE, '' )
        opt_value = options[ opt_placeholder ]
        if ( opt_value != nil && opt_value.size > 0 )
          command = command.gsub( OPTION_OPEN + opt_placeholder + OPTION_CLOSE, opt_value )
        end
      }
      command
    end

    # Replaces every '$' in +command+ by +aln_name+.
    def subst_aln_name( command, aln_name )
      command = command.gsub( '$', aln_name )
      command
    end

    # Replaces %[HMM]% by the hmm registered for the id of +aln_name+, if any.
    def subst_hmm( command, aln_name, hmms )
      id = get_id( aln_name )
      hmm = hmms[ id ]
      if ( hmm != nil && hmm.size > 0 )
        command = command.gsub( OPTION_OPEN + HMM + OPTION_CLOSE, hmm )
      end
      command
    end

    # Replaces %[RSL]% by the minimal length registered for the id of
    # +aln_name+, or by MIN_LENGTH_DEFAULT if none is registered.
    def subst_min_length( command, aln_name, min_lengths )
      id = get_id( aln_name )
      min_length = min_lengths[ id ]
      if ( min_length != nil && min_length.size > 0 )
        command = command.gsub( OPTION_OPEN + RSL + OPTION_CLOSE, min_length )
      else
        command = command.gsub( OPTION_OPEN + RSL + OPTION_CLOSE, MIN_LENGTH_DEFAULT.to_s )
      end
      command
    end

    # Returns the part of +aln_name+ before the first underscore.
    def get_id( aln_name )
      aln_name =~ /^([^_]+)/
      $1
    end

  end # class PhylogenyFactory

end # module Evoruby
module Evoruby

  # Replaces the sequence names of a multiple sequence file (Fasta or Phylip
  # like) by short names of the form "<hex counter>_<species code>" and writes
  # a list that maps every new name back to the original description.
  class TaxonomyProcessor

    PRG_NAME = "tap"
    PRG_DATE = "2010.02.24"
    PRG_DESC = "replacement of species names in multiple sequence files"
    PRG_VERSION = "1.01"
    COPYRIGHT = "2010 Christian M Zmasek"
    CONTACT = "phylosoft@gmail.com"
    WWW = "www.phylosoft.org"

    EXTRACT_TAXONOMY_OPTION = "t"

    def initialize()
      # code -> species, filled while extracting taxonomies from brackets
      @taxonomies = Hash.new()
    end

    # Program entry point. Expects either
    #   [options] <map file> <input> <output> <name list>   or
    #   [options] <input> <output> <name list>
    # as file arguments; output and name list must not exist yet.
    def run()

      Util.print_program_information( PRG_NAME,
        PRG_VERSION,
        PRG_DESC,
        PRG_DATE,
        COPYRIGHT,
        CONTACT,
        WWW,
        STDOUT )

      if ( ARGV == nil || ( ARGV.length != 3 && ARGV.length != 4 && ARGV.length != 5 && ARGV.length != 6 ) )
        puts( "Usage: #{PRG_NAME}.rb [options] [input map file] " )
        puts()
        puts( " options: -" + EXTRACT_TAXONOMY_OPTION + ": to extract taxonomy information from bracketed expression" )
        puts()
        exit( -1 )
      end

      begin
        cla = CommandLineArguments.new( ARGV )
      rescue ArgumentError => e
        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
      end

      mapfile = nil
      input = nil
      output = nil
      list_file = nil

      if cla.get_number_of_files == 4
        mapfile = cla.get_file_name( 0 )
        input = cla.get_file_name( 1 )
        output = cla.get_file_name( 2 )
        list_file = cla.get_file_name( 3 )
      elsif cla.get_number_of_files == 3
        input = cla.get_file_name( 0 )
        output = cla.get_file_name( 1 )
        list_file = cla.get_file_name( 2 )
      else
        # Without this the nil file names below would fail with a TypeError.
        Util.fatal_error( PRG_NAME, "expected 3 or 4 file arguments, got " + cla.get_number_of_files.to_s )
      end


      allowed_opts = Array.new
      allowed_opts.push( EXTRACT_TAXONOMY_OPTION )

      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
      if ( disallowed.length > 0 )
        Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
      end

      extract_taxonomy = false
      if ( cla.is_option_set?( EXTRACT_TAXONOMY_OPTION ) )
        extract_taxonomy = true
      end

      if ( File.exist?( output ) )
        Util.fatal_error( PRG_NAME, "outfile [" + output + "] already exists" )
      end
      if ( File.exist?( list_file ) )
        Util.fatal_error( PRG_NAME, "list file [" + list_file + "] already exists" )
      end
      if ( !File.exist?( input) )
        Util.fatal_error( PRG_NAME, "infile [" + input + "] does not exist" )
      end
      if ( mapfile != nil && !File.exist?( mapfile ) )
        Util.fatal_error( PRG_NAME, "mapfile [" + mapfile + "] does not exist" )
      end

      fasta_like = Util.looks_like_fasta?( input )

      puts()
      if mapfile != nil
        puts( "Map file : " + mapfile )
      end
      puts( "Input alignment : " + input )
      puts( "Output alignment: " + output )
      puts( "Name list : " + list_file )
      if ( fasta_like )
        puts( "Format : Fasta" )
      else
        puts( "Format : Phylip like" )
      end
      if ( extract_taxonomy )
        puts( "Extract taxonomy: true" )
      end
      puts()

      # Map file lines are "key#value" or "key<whitespace>value".
      species_map = Hash.new
      if mapfile != nil
        File.open( mapfile ) do | file |
          while line = file.gets
            if ( line =~/(.+)#(.+)/ || line =~/(.+)\s+(.+)/ )
              species_map[ $1 ] = $2
              Util.print_message( PRG_NAME, "mapping: " + $1 + ' => ' + $2 )
            end
          end
        end
      end

      f = MsaFactory.new()
      begin
        if ( fasta_like )
          msa = f.create_msa_from_file( input, FastaParser.new() )
        else
          msa = f.create_msa_from_file( input, GeneralMsaParser.new() )
        end
      rescue StandardError => e
        Util.fatal_error( PRG_NAME, "failed to read file: " + e.to_s )
      end

      if ( msa == nil || msa.get_number_of_seqs() < 1 )
        Util.fatal_error( PRG_NAME, "failed to read MSA" )
      end
      begin
        Util.check_file_for_writability( list_file )
      rescue StandardError => e
        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
      end

      #removed = msa.remove_redundant_sequences!( true )
      #if removed.size > 0
      #  Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
      #  removed.each { | seq_name |
      #    puts seq_name
      #  }
      #  Util.print_message( PRG_NAME, "will process " + msa.get_number_of_seqs.to_s + " non redundant sequences" )
      #end

      # Block form: the name list is closed even if renaming fails.
      File.open( list_file, "a" ) do | lf |
        ( 0 ... msa.get_number_of_seqs ).each do | i |
          seq = msa.get_sequence( i )
          seq.set_name( Util::normalize_seq_name( modify_name( seq.get_name(), i, lf, species_map, extract_taxonomy ), 10 ) )
        end
      end

      io = MsaIO.new()
      w = nil
      if ( fasta_like )
        w = FastaWriter.new()
      else
        w = PhylipSequentialWriter.new()
      end
      w.set_max_name_length( 10 )
      w.clean( true )
      begin
        io.write_to_file( msa, output, w )
      rescue StandardError => e
        Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
      end
      if ( @taxonomies.length > 0 )
        Util.print_message( PRG_NAME, "number of unique taxonomies: " + @taxonomies.length.to_s )
      end
      Util.print_message( PRG_NAME, "wrote: " + list_file )
      Util.print_message( PRG_NAME, "wrote: " + output )
      Util.print_message( PRG_NAME, "OK" )
    end

    private

    # Builds the new name "<counter in hex>_<code>" for the sequence described
    # by +desc+, prints "<new name>: <desc>" to +file+ and returns the new name.
    #
    # The code is taken from, in this order:
    # 1. an "XXXX_CODE" style name at the start of +desc+;
    # 2. if +extract_taxonomy+: the value of the first key of +species_map+
    #    that occurs in +desc+ as a whole word (or directly after '_'),
    #    otherwise the upper-cased content of a bracketed expression "[...]";
    # 3. otherwise: the value of the first key of +species_map+ matching
    #    +desc+, abbreviated to at most five upper-case characters.
    # Anything else is reported through Util.fatal_error.
    #
    # NOTE(review): keys of +species_map+ are interpolated into regular
    # expressions unescaped.
    def modify_name( desc, counter, file, species_map, extract_taxonomy )
      new_desc = nil
      my_species = nil
      if desc =~ /^>?\s*\S{1,10}_([0-9A-Z]{3,5})/
        new_desc = counter.to_s( 16 ) + "_" + $1
      elsif extract_taxonomy
        if ( desc.count( "[" ) != desc.count( "]" ) )
          Util.fatal_error( PRG_NAME, "illegal bracket count in: " + desc )
        end
        species = nil
        species_map.each_key do | key |
          # Boundaries prevent e.g. RAT matching ARATH. This must be a group:
          # inside a character class "\b" means backspace, not word boundary.
          if desc =~ /(?:\b|_)#{key}\b/
            species = species_map[ key ]
            new_desc = counter.to_s( 16 ) + "_" + species
            break
          end
        end
        if species == nil
          if desc =~/.*\[(\S{3,}?)\]/
            species = $1
            species.strip!
            species.upcase!
            species.gsub!( /\s+/, " " )
            species.gsub!( /-/, "" )
            species.gsub!( /\)/, "" )
            species.gsub!( /\(/, "" )
            species.gsub!( /\'/, "" )
            if species =~ /\S+\s\S+/ || species =~ /\S{3,5}/
              # One of the two patterns above matched, so exactly one of the
              # branches below applies.
              if species =~ /(\S+)\s(\S+)/
                code = $1[ 0..2 ] + $2[ 0..1 ]
              else
                code = species
              end
              new_desc = counter.to_s( 16 ) + "_" + code
              if @taxonomies.has_key?( code )
                if ( !@taxonomies.has_value?( species ) )
                  Util.fatal_error( PRG_NAME, "code [#{code}] is not unique in [#{desc}]" )
                end
              else
                if ( @taxonomies.has_value?( species ) )
                  Util.fatal_error( PRG_NAME, "genome [#{species}] is not unique in [#{desc}]" )
                else
                  @taxonomies[ code ] = species
                end
              end
            else
              Util.fatal_error( PRG_NAME, "illegal format [#{species}] in: " + desc )
            end
          else
            Util.fatal_error( PRG_NAME, "illegal format in: " + desc )
          end
        end
      else
        species = nil
        my_species = nil
        species_map.each_key do | key |
          if desc =~ /#{key}/
            species = species_map[ key ]
            species = species.gsub( /\s+/, "" )
            species = species.gsub( /_/, " " )
            my_species = species
            # "Genus species" is abbreviated to 3 + 2 characters.
            if species =~ /(\S+)\s+(\S+)/
              species = $1[0..2] + $2[0..1]
            end
            species = species.gsub( /\s+/, "" )
            species = species.slice(0, 5)
            species.upcase!
            break
          end
        end
        if species == nil
          Util.fatal_error( PRG_NAME, "species not found in: " + desc )
        else
          new_desc = counter.to_s( 16 ) + "_" + species
        end
      end
      if new_desc == nil
        Util.fatal_error( PRG_NAME, "failed to extract species from: " + desc )
      end
      if my_species != nil
        file.print( new_desc + ": " + desc + " [" + my_species + "]" + "\n" )
      else
        file.print( new_desc + ": " + desc + "\n" )
      end
      new_desc
    end

  end # class TaxonomyProcessor

end # module Evoruby
module Evoruby

  # Preprocesses a sequence file in NCBI TSeq XML format: removes redundant
  # sequences, renames every sequence to "<hex index>_<taxonomy code>", writes
  # the result as Fasta and writes a tab separated mapping file with the
  # original annotation of every sequence.
  class TseqTaxonomyProcessor

    PRG_NAME = "tseq_tap"
    PRG_DATE = "2009.01.06"
    PRG_DESC = "preprocessing of multiple sequence files in ncbi tseq xml format"
    PRG_VERSION = "1.02"
    COPYRIGHT = "2009 Christian M Zmasek"
    CONTACT = "phylosoft@gmail.com"
    WWW = "www.phylosoft.org"

    TAXONOMY_CODE = "TAXONOMY_CODE:"
    TAXONOMY_ID = "TAXONOMY_ID:"
    TAXONOMY_ID_TYPE = "TAXONOMY_ID_TYPE:"
    TAXONOMY_SN = "TAXONOMY_SN:"
    TAXONOMY_CN = "TAXONOMY_CN:"
    SEQ_ACCESSION = "SEQ_ACCESSION:"
    SEQ_ACCESSION_SOURCE = "SEQ_ACCESSION_SOURCE:"
    SEQ_SECONDARY_ACCESSION = "SEQ_SECONDARY_ACCESSION:"
    SEQ_SYMBOL = "SEQ_SYMBOL:"
    SEQ_NAME = "SEQ_NAME:"
    SEQ_MOL_SEQ = "SEQ_MOL_SEQ:"

    def initialize()
      # taxonomy id -> taxonomy entry, cache for modify_name
      @tax_ids_to_sp_taxonomies = Hash.new()
    end

    # Program entry point. Expects exactly four file arguments: taxonomy
    # input, sequences input, sequences output, mapping output.
    def run()

      Util.print_program_information( PRG_NAME,
        PRG_VERSION,
        PRG_DESC,
        PRG_DATE,
        COPYRIGHT,
        CONTACT,
        WWW,
        STDOUT )

      if ARGV == nil || ARGV.length != 4
        puts( "Usage: #{PRG_NAME}.rb " )
        puts()

        exit( -1 )
      end

      begin
        cla = CommandLineArguments.new( ARGV )
      rescue ArgumentError => e
        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
      end
      allowed_opts = Array.new
      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
      if ( disallowed.length > 0 )
        Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
      end

      sp_taxonomy_infile = cla.get_file_name( 0 )
      sequences_infile = cla.get_file_name( 1 )
      sequences_outfile = cla.get_file_name( 2 )
      mapping_outfile = cla.get_file_name( 3 )

      Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile )
      Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile )
      Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile )
      Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile )

      sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile )

      Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile )

      tseq_parser = NcbiTSeqParser.new
      msa_fac = MsaFactory.new

      seqs = msa_fac.create_msa_from_file( sequences_infile, tseq_parser )

      Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile )

      removed = seqs.remove_redundant_sequences!( true, true )

      if removed.size > 0
        Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
        removed.each { | seq_name |
          puts seq_name
        }
        Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" )
      end

      # Block form: the mapping file is closed even if renaming fails.
      File.open( mapping_outfile, "a" ) do | mapping_out |
        ( 0 ... seqs.get_number_of_seqs ).each do | i |
          seq = seqs.get_sequence( i )
          if seq.get_taxonomy == nil
            Util.fatal_error( PRG_NAME, "sequence [" + seq.get_name + "] has no taxonomy information" )
          end
          seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) )
        end
      end

      io = MsaIO.new()

      w = FastaWriter.new()

      w.set_max_name_length( 10 )
      w.clean( true )
      begin
        io.write_to_file( seqs, sequences_outfile, w )
      rescue StandardError => e
        Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
      end

      Util.print_message( PRG_NAME, "wrote: " + mapping_outfile )
      Util.print_message( PRG_NAME, "wrote: " + sequences_outfile )
      Util.print_message( PRG_NAME, "OK" )

    end

    private

    # Returns the new name "<i in hex>_<taxonomy code>" for +seq+ and prints
    # one tab separated line with the original annotation of +seq+ to
    # +mapping_outfile+ (an open, writable IO).
    #
    # The taxonomy entry is the element of +sp_taxonomies+ whose id equals the
    # taxonomy id of +seq+; a missing or ambiguous id is a fatal error.
    def modify_name( seq, i, sp_taxonomies, mapping_outfile )

      tax_id = seq.get_taxonomy.get_id
      matching_sp_taxonomy = nil

      if @tax_ids_to_sp_taxonomies.has_key?( tax_id )
        # This is so that a second lookup will be much faster.
        matching_sp_taxonomy = @tax_ids_to_sp_taxonomies[ tax_id ]
      else
        # The whole list is scanned on purpose, to detect duplicate ids.
        sp_taxonomies.each { |sp_taxonomy|
          if ( sp_taxonomy.id == tax_id )
            if matching_sp_taxonomy != nil
              Util.fatal_error( PRG_NAME, "taxonomy id [" + tax_id.to_s + "] is not unique" )
            end
            matching_sp_taxonomy = sp_taxonomy
            @tax_ids_to_sp_taxonomies[ tax_id ] = sp_taxonomy
          end
        }
      end
      if matching_sp_taxonomy == nil
        Util.fatal_error( PRG_NAME, "taxonomy id [" + tax_id.to_s + "] for [" + seq.get_taxonomy.get_name.to_s + "] not found" )
      end

      new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code.to_s

      seq_name = seq.get_name.to_s
      if seq_name =~ /\[.+\]$/
        # Redundant taxonomy information hides here.
        seq_name = seq_name.sub(/\[.+\]$/, '')
      end
      if seq_name =~ /^\s*hypothetical\s+protein\s*/i
        # Pointless information.
        seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' )
      end

      # Every field goes through to_s: ids may be numeric and optional fields
      # may be nil, and String + non-String raises TypeError.
      mapping_outfile.print( new_name + "\t" +
        TAXONOMY_CODE + matching_sp_taxonomy.code.to_s + "\t" +
        TAXONOMY_ID + tax_id.to_s + "\t" +
        TAXONOMY_ID_TYPE + seq.get_taxonomy.get_id_source.to_s + "\t" +
        TAXONOMY_SN + matching_sp_taxonomy.scientific_name.to_s + "\t" +
        SEQ_ACCESSION + seq.get_accession.to_s + "\t" +
        SEQ_ACCESSION_SOURCE + seq.get_accession_source.to_s + "\t" +
        SEQ_SYMBOL + seq.get_symbol.to_s + "\t" +
        SEQ_NAME + seq_name + "\t" +
        SEQ_MOL_SEQ + seq.get_sequence_as_string.to_s +
        Constants::LINE_DELIMITER )
      new_name
    end

  end # class TseqTaxonomyProcessor

end # module Evoruby
module Evoruby

  # Writes multiple sequence alignments by delegating to a writer object.
  class MsaIO

    def initialize()
    end

    # Writes +msa+ to the file at +path+ using +msa_writer+, which has to
    # respond to write( msa, path ). Returns whatever the writer returns.
    def write_to_file( msa, path, msa_writer )
      msa_writer.write( msa, path )
    end

  end # class MsaIO

end # module Evoruby
module Evoruby

  # Parses a delimiter separated text file into a BasicTable.
  class BasicTableParser

    START_OF_COMMENT_LINE_CHAR = "#"

    # Reads the file at +path+ and returns a BasicTable with one row per
    # line and one column per +column_delimiter+ separated value; values are
    # stored with surrounding whitespace removed. Empty lines and lines
    # starting with START_OF_COMMENT_LINE_CHAR are skipped and do not count
    # as rows.
    #
    # raises ArgumentError
    def BasicTableParser.parse( path, column_delimiter )
      Util.check_file_for_readability( path )
      table = BasicTable.new
      row = 0
      File.open( path ) do | file |
        while line = file.gets
          if ( !Util.is_string_empty?( line ) &&
               !line.slice( 0, 1 ).eql?( START_OF_COMMENT_LINE_CHAR ) )
            values = line.split( column_delimiter )
            values.each_with_index { | value, col |
              # Non-destructive strip: "strip!" returns nil when there is
              # nothing to strip, which stored nil instead of the value.
              table.set_value( row, col, value.strip )
            }
            row += 1
          end
        end
      end
      return table
    end

  end # class BasicTableParser

end # module Evoruby
module Evoruby

  # MsaParser for (multi-)Fasta files: every '>' line starts a new sequence,
  # the following lines (all whitespace removed) form its residues.
  class FastaParser < MsaParser

    def initialize
    end

    # Parses the file at +path+ and returns the sequences as a Msa.
    # Lines before the first '>' line, blank lines and comment-like lines
    # (see can_ignore?) are skipped; entries with an empty name or an empty
    # sequence are dropped. Raises IOError for residue lines that follow an
    # empty header.
    def parse( path )
      Util.check_file_for_readability( path )
      alignment = Msa.new
      residues = String.new()
      header = String.new()
      header_seen = false
      # Drops byte sequences that are not valid UTF-8.
      converter = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' )
      File.open( path ) do | input |
        while raw = input.gets
          text = converter.iconv( raw )
          next if can_ignore?( text, header_seen )

          if text =~ /^\s*>\s*(.+)/
            title = $1
            header_seen = true
            # A new header closes the entry collected so far.
            add_seq( header, residues, alignment )
            header = title
            residues = String.new()
          elsif text =~ /^\s*(.+)/
            chunk = $1
            if header.length < 1
              raise IOError, "format error at: " + text
            end
            # was: seq = $1.rstrip
            residues += chunk.gsub(/\s+/, '')
          else
            raise IOError, "Unexpected line: " + text
          end
        end
      end
      # The last entry has no following header to close it.
      add_seq( header, residues, alignment )
      alignment
    end

    private

    # Adds the entry to +msa+ unless name or sequence is empty.
    def add_seq( name, seq, msa )
      return if name.length <= 0 || seq.length <= 0
      msa.add( name, seq )
    end

    # True-ish for blank lines, lines starting with '#', '%', '//' or '!!',
    # and for any line before the first header line.
    def can_ignore?( line, saw_first_seq )
      line !~ /\S/ ||
        line =~ /^\s*#/ ||
        line =~ /^\s*%/ ||
        line =~ /^\s*\/\// ||
        line =~ /^\s*!!/ ||
        ( !saw_first_seq && line =~/^\s*[^>]/ )
    end

  end # class FastaParser

end # module Evoruby
module Evoruby

  # MsaParser for block based alignment formats (Phylip like, Clustal like):
  # sequence lines are grouped in blocks separated by ignorable lines, and
  # lines of later blocks are appended to the sequences of the first block.
  class GeneralMsaParser < MsaParser

    def initialize
    end

    # Parses the file at +path+ and returns the alignment as a Msa.
    #
    # Three kinds of sequence lines are recognised:
    # * "name residues" - adds a new sequence or extends the one named so;
    # * "   residues"   - extends the most recently named sequence;
    # * "residues"      - extends the sequence at the same position within
    #                     the block (not allowed in the first block).
    # Whitespace inside residues is replaced by '.'. Raises IOError on lines
    # that fit none of these.
    def parse( path )
      Util.check_file_for_readability( path )
      block_no = -1
      index_in_block = -1
      last_name = nil
      gap_seen = true
      first_line = true
      alignment = Msa.new
      # Drops byte sequences that are not valid UTF-8.
      converter = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' )
      File.open( path ) do | input |
        while raw = input.gets
          text = converter.iconv( raw )
          if can_ignore?( text )
            gap_seen = true
          elsif ( first_line && is_program_name_line?( text ) )
            # program banner on the very first line: nothing to do
          elsif ( text =~ /^\S+\s+.+\s*$/ || text =~ /^\s+.+\s*$/ || text =~ /^\S+\s*$/ )
            # First sequence line after ignorable lines opens a new block.
            if gap_seen
              block_no += 1
              index_in_block = -1
              gap_seen = false
            end
            index_in_block += 1
            if ( text =~ /^(\S+)\s+(.+?)\s*$/ )
              label = $1
              residues = $2.gsub( /\s/, '.' )
              hits = alignment.find_by_name( label, false, false )
              if hits.length > 1
                raise IOError, "Unexpected error at line: " + text
              elsif hits.length == 1
                alignment.get_sequence( hits[ 0 ] ).append!( residues )
              else
                alignment.add( label, residues )
              end
              last_name = label
            elsif ( text =~ /^\s+(.+?)\s*$/ )
              residues = $1.gsub( /\s/, '.' )
              hits = alignment.find_by_name( last_name, false, false )
              unless hits.length == 1
                raise IOError, "Unexpected error at line: " + text
              end
              alignment.get_sequence( hits[ 0 ] ).append!( residues )
            elsif ( text =~ /^(\S+)\s*$/ )
              residues = $1
              if block_no == 0
                raise IOError, "First block cannot contain unnamed sequences"
              end
              alignment.get_sequence( index_in_block ).append!( residues )
              last_name = nil
            end
          else
            raise IOError, "Unexpected line: " + text
          end
          first_line = false
        end
      end
      alignment
    end # def parse( path )

    private

    # True-ish for lines without any sequence-like character, consensus lines
    # (leading whitespace followed by '*', '.' or ':') and lines starting
    # with '#', '%', '//' or '!!'.
    def can_ignore?( line )
      line !~ /[A-Za-z\-?\*_\.]/ ||
        line =~ /^\s+[*\.:]/ ||
        line =~ /^\s*#/ ||
        line =~ /^\s*%/ ||
        line =~ /^\s*\/\// ||
        line =~ /^\s*!!/
    end

    # True-ish for the banner lines written by Clustal, Muscle and Probcons.
    def is_program_name_line?( line )
      line =~ /^CLUSTAL\s/ ||
        line =~ /^MUSCLE\s\(/ ||
        line =~ /^PROBCONS\s/
    end
  end # class GeneralMsaParser

end # module Evoruby
# Zmasek
# License:: GNU Lesser General Public License (LGPL)
#
# $Id: hmmsearch_domain_extractor.rb,v 1.24 2009/11/25 06:30:24 cmzmasek Exp $


require 'lib/evo/util/constants'
require 'lib/evo/msa/msa_factory'
require 'lib/evo/io/msa_io'
require 'lib/evo/io/writer/fasta_writer'
require 'lib/evo/io/parser/fasta_parser'


module Evoruby

  # Extracts domain sub-sequences from a FASTA file, guided by the rows of a
  # tabular hmmsearch output file, and writes them (plus the full sequences
  # of proteins with passing / failing domains) as FASTA.
  class HmmsearchDomainExtractor

    # Number of characters removed from the end of a name when trim_name is set.
    TRIM_BY = 2

    # One data row: 22 whitespace separated columns followed by a free-text
    # description.
    #         tn      acc     tlen    query   acc     qlen    Evalue  score   bias    #       of      c-E     i-E     score   bias    hf      ht      af      at      ef      et      acc     desc
    #         1       2       3       4       5       6       7       8       9       10      11      12      13      14      15      16      17      18      19      20      21      22      23
    # previously used (one column less):
    # /^(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)/
    DOMAIN_LINE_PATTERN = /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/

    def initialize
    end

    # Processes +hmmsearch_output+ row by row.
    #
    # For every data row the target name (column 1), domain number and count
    # (columns 10, 11), the independent E-value (column 13) and the envelope
    # coordinates (columns 20, 21) are read. A domain passes if
    # * +e_value_threshold+ is negative or the E-value is <= the threshold, and
    # * +length_threshold+ is <= 0 or the envelope length is >= the threshold.
    #
    # Passing domains are cut out of the sequences in +fasta_sequence_file+
    # and written to +outfile+; complete sequences of proteins with at least
    # one passing / failing domain go to +passed_seqs_outfile+ /
    # +failed_seqs_outfile+. The naming flags are forwarded to extract_domain.
    # Progress and summary text is appended to +log+ (using <<).
    #
    # Returns the number of passing domains.
    #
    # raises ArgumentError, IOError, StandardError
    # (IOError: no input sequences, unparsable row, output not writable;
    # StandardError: no domain passed, problems reported by extract_domain)
    def parse( hmmsearch_output,
               fasta_sequence_file,
               outfile,
               passed_seqs_outfile,
               failed_seqs_outfile,
               e_value_threshold,
               length_threshold,
               add_position,
               add_domain_number,
               add_domain_number_as_digit,
               add_domain_number_as_letter,
               trim_name,
               log )

      Util.check_file_for_readability( hmmsearch_output )
      Util.check_file_for_readability( fasta_sequence_file )
      Util.check_file_for_writability( outfile )
      Util.check_file_for_writability( passed_seqs_outfile )
      Util.check_file_for_writability( failed_seqs_outfile )

      factory = MsaFactory.new()
      in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )

      if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
        error_msg = "could not find fasta sequences in " + fasta_sequence_file
        raise IOError, error_msg
      end

      out_msa = Msa.new
      failed_seqs = Msa.new
      passed_seqs = Msa.new

      ld = Constants::LINE_DELIMITER

      domain_pass_counter = 0
      domain_fail_counter = 0
      proteins_with_passing_domains = 0
      proteins_with_failing_domains = 0
      max_domain_copy_number_per_protein = -1
      max_domain_copy_number_sequence = ''
      failed_species_counts = Hash.new
      passed_species_counts = Hash.new

      File.open( hmmsearch_output ) do | file |
        file.each_line do | line |
          next if is_ignorable?( line ) || line !~ /^\S+\s+/

          row = DOMAIN_LINE_PATTERN.match( line )
          if row.nil?
            # A row that does not have the expected columns cannot be
            # interpreted; fail with a clear message instead of continuing
            # with empty fields.
            error_msg = "unexpected format in hmmsearch output: " + line
            raise IOError, error_msg
          end

          sequence = row[ 1 ]
          number = row[ 10 ].to_i
          out_of = row[ 11 ].to_i
          env_from = row[ 20 ].to_i
          env_to = row[ 21 ].to_i
          i_e_value = row[ 13 ].to_f
          domain_length = env_to - env_from + 1

          if ( number > max_domain_copy_number_per_protein )
            max_domain_copy_number_sequence = sequence
            max_domain_copy_number_per_protein = number
          end

          if ( ( ( e_value_threshold.to_f < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
               ( ( length_threshold.to_f <= 0 ) || ( domain_length >= length_threshold.to_f ) ) )
            HmmsearchDomainExtractor.extract_domain( sequence,
                                                     number,
                                                     out_of,
                                                     env_from,
                                                     env_to,
                                                     in_msa,
                                                     out_msa,
                                                     add_position,
                                                     add_domain_number,
                                                     add_domain_number_as_digit,
                                                     add_domain_number_as_letter,
                                                     trim_name )
            domain_pass_counter += 1
            count_species( sequence, passed_species_counts )
            if !passed_seqs.has?( sequence, true, false )
              HmmsearchDomainExtractor.add_sequence( sequence, in_msa, passed_seqs )
              proteins_with_passing_domains += 1
            end
          else
            # report the rejected domain on stdout and in the log
            msg = domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)"
            print( msg )
            log << msg
            if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) )
              print( " iE=" + i_e_value.to_s )
              log << " iE=" + i_e_value.to_s
            end
            if ( ( length_threshold.to_f > 0 ) && ( domain_length < length_threshold.to_f ) )
              print( " l=" + domain_length.to_s )
              log << " l=" + domain_length.to_s
            end
            print( Constants::LINE_DELIMITER )
            log << Constants::LINE_DELIMITER
            domain_fail_counter += 1
            count_species( sequence, failed_species_counts )
            if !failed_seqs.has?( sequence, true, false )
              HmmsearchDomainExtractor.add_sequence( sequence, in_msa, failed_seqs )
              proteins_with_failing_domains += 1
            end
          end
        end
      end

      if domain_pass_counter < 1
        error_msg = "no domain sequences were extracted"
        raise StandardError, error_msg
      end

      log << Constants::LINE_DELIMITER
      puts( "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s )
      log << "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s
      log << Constants::LINE_DELIMITER

      if ( max_domain_copy_number_per_protein > 1 )
        puts( "First protein with this copy number: " + max_domain_copy_number_sequence )
        log << "First protein with this copy number: " + max_domain_copy_number_sequence
        log << Constants::LINE_DELIMITER
      end

      io = MsaIO.new()
      w = FastaWriter.new()
      w.set_line_width( 60 )
      w.clean( true )

      write_msa( io, out_msa, outfile, w )
      write_msa( io, passed_seqs, passed_seqs_outfile, w )
      write_msa( io, failed_seqs, failed_seqs_outfile, w )

      log << ld
      log << "passing domains : " + domain_pass_counter.to_s + ld
      log << "failing domains : " + domain_fail_counter.to_s + ld
      log << "proteins with passing domains: " + proteins_with_passing_domains.to_s + ld
      log << "proteins with failing domains: " + proteins_with_failing_domains.to_s + ld
      log << ld
      log << 'passing domains counts per species: ' << ld
      passed_species_counts.each_pair { | species, count | log << "#{species}: #{count}" << ld }
      log << ld
      log << 'failing domains counts per species: ' << ld
      failed_species_counts.each_pair { | species, count | log << "#{species}: #{count}" << ld }
      log << ld
      return domain_pass_counter

    end # parse

    # Copies the sequence named exactly +sequence_name+ (case sensitive) from
    # +in_msa+ into +add_to_msa+.
    # Raises StandardError if the name is missing or not unique in +in_msa+.
    def HmmsearchDomainExtractor.add_sequence( sequence_name, in_msa, add_to_msa )
      seqs = in_msa.find_by_name( sequence_name, true, false )
      if ( seqs.length < 1 )
        error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
        raise StandardError, error_msg
      end
      if ( seqs.length > 1 )
        error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
        raise StandardError, error_msg
      end
      seq = in_msa.get_sequence( seqs[ 0 ] )
      add_to_msa.add_sequence( seq )
    end

    # Cuts residues +seq_from+..+seq_to+ (1-based, as reported by hmmsearch)
    # out of the sequence named +sequence+ in +in_msa+, derives a name for the
    # fragment and appends it to +out_msa+.
    #
    # Name construction, in this order:
    # * +add_position+: append "_<from>-<to>"
    # * +trim_name+: drop the last TRIM_BY characters
    # * only if +out_of+ is not 1, the first that applies of
    #   +add_domain_number_as_digit+ (append the number),
    #   +add_domain_number_as_letter+ (append 'a'..'z' for 1..26),
    #   +add_domain_number+ (append "~<number>-<out_of>")
    #
    # raises ArgumentError, StandardError
    # (ArgumentError: inconsistent numbers / coordinates; StandardError:
    # sequence missing or not unique, more than 26 domains with letter
    # numbering, resulting name longer than 10 characters)
    def HmmsearchDomainExtractor.extract_domain( sequence,
                                                 number,
                                                 out_of,
                                                 seq_from,
                                                 seq_to,
                                                 in_msa,
                                                 out_msa,
                                                 add_position,
                                                 add_domain_number,
                                                 add_domain_number_as_digit,
                                                 add_domain_number_as_letter,
                                                 trim_name )
      if ( number < 1 || out_of < 1 || number > out_of )
        error_msg = "impossible: number=" + number.to_s + ", out of=" + out_of.to_s
        raise ArgumentError, error_msg
      end
      if ( seq_from < 1 || seq_to < 1 || seq_from >= seq_to )
        error_msg = "impossible: seq-f=" + seq_from.to_s + ", seq-t=" + seq_to.to_s
        raise ArgumentError, error_msg
      end
      seqs = in_msa.find_by_name( sequence, true, false )
      if ( seqs.length < 1 )
        error_msg = "sequence \"" + sequence + "\" not found in sequence file"
        raise StandardError, error_msg
      end
      if ( seqs.length > 1 )
        error_msg = "sequence \"" + sequence + "\" not unique in sequence file"
        raise StandardError, error_msg
      end
      # hmmsearch is 1 based, whereas sequences are 0 based in this package.
      seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
      if ( add_position )
        seq.set_name( seq.get_name + "_" + seq_from.to_s + "-" + seq_to.to_s )
      end

      if ( trim_name )
        seq.set_name( seq.get_name[ 0, seq.get_name.length - TRIM_BY ] )
      end

      if ( out_of != 1 )
        if ( add_domain_number_as_digit )
          seq.set_name( seq.get_name + number.to_s )
        elsif ( add_domain_number_as_letter )
          # 1 -> 'a' (97) ... 26 -> 'z' (122); anything above has no letter
          if number > 26
            error_msg = 'too many identical domains per sequence, cannot use letters to distinguish them'
            raise StandardError, error_msg
          end
          seq.set_name( seq.get_name + ( number + 96 ).chr )
        elsif ( add_domain_number )
          seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
        end
      end

      if ( seq.get_name.length > 10 )
        error_msg = "sequence name [" + seq.get_name + "] is longer than 10 characters"
        raise StandardError, error_msg
      end

      out_msa.add_sequence( seq )
    end

    private

    # Writes +msa+ to +path+ through +io+ using +writer+; any ordinary error
    # is reported as IOError naming the file.
    def write_msa( io, msa, path, writer )
      io.write_to_file( msa, path, writer )
    rescue StandardError
      error_msg = "could not write to \"" + path + "\""
      raise IOError, error_msg
    end

    # Increments the counter of the species encoded in +sequence+ (see
    # get_species) in +species_counts_map+; names without a species part are
    # not counted.
    def count_species( sequence, species_counts_map )
      species = get_species( sequence )
      return if species.nil?
      species_counts_map[ species ] = species_counts_map.fetch( species, 0 ) + 1
    end

    # Returns the part of +sequence_name+ after its last underscore, or nil
    # if the name does not have the form "<something>_<species>".
    def get_species( sequence_name )
      if sequence_name =~ /^.+_(.+)$/
        return $1
      else
        return nil
      end
    end

    # Truthy for lines without any letter, digit or '-', and for lines
    # starting with '#'.
    def is_ignorable?( line )
      return ( line !~ /[A-Za-z0-9-]/ || line =~ /^#/ )
    end

  end # class HmmsearchDomainExtractor

end # module Evoruby

# diff --git a/forester/ruby/evoruby/lib/evo/io/parser/msa_parser.rb b/forester/ruby/evoruby/lib/evo/io/parser/msa_parser.rb
# new file mode 100644
# index 0000000..e1d1b56
# --- /dev/null
# +++ b/forester/ruby/evoruby/lib/evo/io/parser/msa_parser.rb
# @@ -0,0 +1,22 @@
#
# = lib/evo/io/parser/msa_parser - MsaParser class
#
# Copyright:: Copyright (C) 2006-2007 Christian M.
# Zmasek
# License:: GNU Lesser General Public License (LGPL)
#
# $Id: msa_parser.rb,v 1.2 2007/06/12 04:51:34 cmzmasek Exp $
#
# last modified: 05/16/2007

module Evoruby

  # Abstract base class of the alignment / sequence file parsers.
  # Subclasses define their own constructor and implement parse.
  class MsaParser
    # Always raises TypeError: this class is not meant to be instantiated.
    def initialize()
      raise TypeError, "Cannot instanciate abstract class MsaParser"
    end

    # To be implemented by subclasses; does nothing here.
    def parse( path )
    end
  end

end # module Evoruby

# diff --git a/forester/ruby/evoruby/lib/evo/io/parser/ncbi_tseq_parser.rb b/forester/ruby/evoruby/lib/evo/io/parser/ncbi_tseq_parser.rb
# new file mode 100644
# index 0000000..994755a
# --- /dev/null
# +++ b/forester/ruby/evoruby/lib/evo/io/parser/ncbi_tseq_parser.rb
# @@ -0,0 +1,153 @@
#
# = lib/evo/io/parser/ncbi_tseq_parser - NcbiTSeqParser class
#
# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
# License:: GNU Lesser General Public License (LGPL)
#
# $Id: ncbi_tseq_parser.rb,v 1.5 2009/01/07 02:48:20 cmzmasek Exp $


require 'lib/evo/io/parser/msa_parser'
require 'lib/evo/taxonomy/taxonomy'
require 'lib/evo/msa/msa'

require 'iconv'

module Evoruby

  # Line-oriented parser for NCBI "TSeq" XML: every element is expected on a
  # line of its own.
  class NcbiTSeqParser < MsaParser

    TSEQ_SEQ     = "TSeq_sequence"
    TSEQ_DEFLINE = "TSeq_defline"
    TSEQ_ORGNAME = "TSeq_orgname"
    TSEQ_TAXID   = "TSeq_taxid"
    TSEQ_SID     = "TSeq_sid"
    TSEQ_ACCVER  = "TSeq_accver"
    TSEQ_GI      = "TSeq_gi"
    TSEQ_TYPE    = "TSeq_seqtype"
    TSEQ_LENGTH  = "TSeq_length"

    # Intentionally empty: defines a constructor of its own so that
    # NcbiTSeqParser.new does not run the constructor of the parent class.
    def initialize
    end


    # Example record (element names as in the TSEQ_* constants above; the
    # outer <TSeqSet> wrapper is presumably what NCBI emits - lines outside
    # <TSeq> ... </TSeq> are skipped by parse):
    #
    # <TSeqSet>
    # <TSeq>
    #   <TSeq_seqtype value="protein"/>
    #   <TSeq_gi>29341016</TSeq_gi>
    #   <TSeq_accver>AAO78806.1</TSeq_accver>
    #   <TSeq_sid>gnl|mbpwusl|BT3701</TSeq_sid>
    #   <TSeq_taxid>226186</TSeq_taxid>
    #   <TSeq_orgname>Bacteroides thetaiotaomicron VPI-5482</TSeq_orgname>
    #   <TSeq_defline>SusD [Bacteroides thetaiotaomicron VPI-5482]</TSeq_defline>
    #   <TSeq_length>551</TSeq_length>
    #   <TSeq_sequence>MKTKYIKQLFSAALIAVLSSGVTSCINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQENQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSCIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK</TSeq_sequence>
    # </TSeq>

    # Reads the TSeq XML file at +path+ and returns an Msa with one Sequence
    # per <TSeq> record.
    #
    # Per record the Sequence is created from the defline (as name), the
    # sequence string, an identifier, the identifier source, a Taxonomy
    # (only if taxid or orgname is present) and a symbol:
    # * gi present:      id = gi,     source "gi",   symbol = sid or else accver
    # * else sid present: id = sid,    source "ncbi", symbol = accver
    # * else accver:      id = accver, source "ncbi"
    #
    # Raises IOError for a line inside a record that is none of the known
    # elements; the message contains the 1-based line number.
    def parse( path )
      Util.check_file_for_readability( path )
      seqs = Msa.new

      in_seq = false
      gi = nil
      accver = nil
      sid = nil
      taxid = nil
      orgname = nil
      defline = nil
      seq_str = nil
      # incremented before a line is examined, so start at 0 to report
      # 1-based line numbers
      line_counter = 0
      ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' )
      File.open( path ) do | file |
        file.each_line do | raw_line |
          line = ic.iconv( raw_line )
          line_counter += 1
          if can_ignore?( line )
            # blank line
          elsif line =~ /^\s*<TSeq>/
            # start of a record (counterpart of the </TSeq> test below)
            in_seq = true
          elsif in_seq
            if line =~ /^\s*<\/TSeq>/
              in_seq = false
              taxonomy = nil
              if taxid != nil || orgname != nil
                tax_id_source = taxid != nil ? "ncbi" : nil
                taxonomy = Taxonomy.new( orgname, taxid, tax_id_source )
              end
              id = nil
              id_source = nil
              symbol = nil
              if gi != nil
                id = gi
                id_source = "gi"
                symbol = sid != nil ? sid : accver
              elsif sid != nil
                id = sid
                id_source = "ncbi"
                symbol = accver
              elsif accver != nil
                id = accver
                id_source = "ncbi"
              end

              sequence = Sequence.new( defline,
                                       seq_str,
                                       id,
                                       id_source,
                                       taxonomy,
                                       symbol )

              seqs.add_sequence( sequence )
              # forget the fields of the finished record
              gi = nil
              accver = nil
              sid = nil
              taxid = nil
              orgname = nil
              defline = nil
              seq_str = nil
            elsif line =~ /^\s*<#{TSEQ_GI}>(\d+)<\/#{TSEQ_GI}>/
              gi = $1
            elsif line =~ /^\s*<#{TSEQ_ACCVER}>(.+)<\/#{TSEQ_ACCVER}>/
              accver = $1
            elsif line =~ /^\s*<#{TSEQ_SID}>(.+)<\/#{TSEQ_SID}>/
              sid = $1
            elsif line =~ /^\s*<#{TSEQ_TAXID}>(\d+)<\/#{TSEQ_TAXID}>/
              taxid = $1
            elsif line =~ /^\s*<#{TSEQ_ORGNAME}>(.+)<\/#{TSEQ_ORGNAME}>/
              orgname = $1
            elsif line =~ /^\s*<#{TSEQ_DEFLINE}>(.+)<\/#{TSEQ_DEFLINE}>/
              defline = $1
            elsif line =~ /^\s*<#{TSEQ_SEQ}>(.+)<\/#{TSEQ_SEQ}>/
              seq_str = $1
            elsif line =~ /^\s*<#{TSEQ_TYPE}/
              # sequence type is not used
            elsif line =~ /^\s*<#{TSEQ_LENGTH}/
              # length is not used
            else
              error_msg = "unexpected line format at line #{line_counter}: " + line
              raise IOError, error_msg
            end
          end
        end
      end
      return seqs
    end

    private

    # True for lines that contain only whitespace.
    def can_ignore?( line )
      return ( line !~ /\S/ )
    end

  end # class NcbiTSeqParser

end # module Evoruby
# \ No newline at end of file

# diff --git a/forester/ruby/evoruby/lib/evo/io/parser/sp_taxonomy_parser.rb b/forester/ruby/evoruby/lib/evo/io/parser/sp_taxonomy_parser.rb
# new file mode 100644
# index 0000000..cdc8af8
# --- /dev/null
# +++ b/forester/ruby/evoruby/lib/evo/io/parser/sp_taxonomy_parser.rb
# @@ -0,0 +1,42 @@
#
# = lib/evo/io/parser/sp_taxonomy_parser - SpTaxonomyParser class
#
# Copyright:: Copyright (C) 2006-2007 Christian M.
# Zmasek
# License:: GNU Lesser General Public License (LGPL)
#
# $Id: sp_taxonomy_parser.rb,v 1.2 2008/12/31 03:21:45 cmzmasek Exp $


module Evoruby

  require 'lib/evo/taxonomy/sp_taxonomy'

  # Reads a species list in which an entry line has the form
  # "<CODE> <K> <taxon id>: N=<scientific name>" (CODE: 3-5 upper case
  # letters or digits, K: one upper case letter).
  class SpTaxonomyParser

    START_OF_COMMENT_LINE_CHAR = "#"

    # Returns an Array with one SpTaxonomy (code, id, scientific name) for
    # every line of the file at +path+ that contains an entry of the form
    # described above; all other lines are skipped.
    #
    # raises ArgumentError (as noted by the original author; presumably from
    # the readability check - verify against Util)
    def self.parse( path )
      Util.check_file_for_readability( path )
      entries = []
      File.open( path ) do | file |
        file.each_line do | line |
          next if Util.is_string_empty?( line )
          next unless line =~ /([A-Z0-9]{3,5})\s+[A-Z]\s+(\d+):\s+N=(.+)/
          #puts tax.to_str
          entries << SpTaxonomy.new( $1, $2, $3 )
        end
      end
      entries
    end
  end # class SpTaxonomyParser

end # module Evoruby
# \ No newline at end of file

# diff --git a/forester/ruby/evoruby/lib/evo/io/writer/fasta_writer.rb b/forester/ruby/evoruby/lib/evo/io/writer/fasta_writer.rb
# new file mode 100644
# index 0000000..26f7461
# --- /dev/null
# +++ b/forester/ruby/evoruby/lib/evo/io/writer/fasta_writer.rb
# @@ -0,0 +1,86 @@
#
# = lib/evo/io/writer/fasta_writer.rb - FastaWriter class
#
# Copyright:: Copyright (C) 2006-2007 Christian M.
# Zmasek
# License:: GNU Lesser General Public License (LGPL)
#
# $Id: fasta_writer.rb,v 1.6 2008/09/12 23:52:11 cmzmasek Exp $
#
# last modified: 05/16/2007

require 'lib/evo/io/writer/msa_writer'

module Evoruby

  # Writes the sequences of an Msa in FASTA format.
  class FastaWriter < MsaWriter

    LINE_WIDTH_DEFAULT      = 60
    # 0 means: names are written as they are
    MAX_NAME_LENGTH_DEFAULT = 0

    def initialize()
      @line_width = LINE_WIDTH_DEFAULT
      @max_name_length = MAX_NAME_LENGTH_DEFAULT
      @remove_gap_chars = false
      @clean = false
    end


    # Sets the number of residues per output line; values below 1 select
    # LINE_WIDTH_DEFAULT.
    def set_line_width( line_width = LINE_WIDTH_DEFAULT )
      if ( line_width < 1 )
        line_width = LINE_WIDTH_DEFAULT
      end
      @line_width = line_width
    end

    # Sets the length names are normalized to; values below 1 select
    # MAX_NAME_LENGTH_DEFAULT (no normalization).
    def set_max_name_length( length = MAX_NAME_LENGTH_DEFAULT )
      if ( length < 1 )
        length = MAX_NAME_LENGTH_DEFAULT
      end
      @max_name_length = length
    end

    # If set, gap characters are left out of the output.
    def remove_gap_chars( remove_gap_chars = true )
      @remove_gap_chars = remove_gap_chars
    end

    # If set, every residue is passed through Util.clean_seq_str and left out
    # if nothing remains.
    def clean( clean = true )
      @clean = clean
    end

    # Appends all sequences of +msa+ to the file at +path+ (opened in append
    # mode): a ">name" line followed by the residues, wrapped after
    # @line_width written residues. Returns nil.
    def write( msa, path )
      Util.check_file_for_writability( path )
      # block form: the file is closed even if writing raises
      File.open( path, "a" ) do | f |
        msa.get_number_of_seqs().times do | i |
          seq_obj = msa.get_sequence( i )
          name = seq_obj.get_name()
          f.print( ">" )
          if ( @max_name_length != MAX_NAME_LENGTH_DEFAULT )
            name = Util.normalize_seq_name( name, @max_name_length )
          end
          f.print( name )
          counter = 0
          seq_obj.get_length().times do | j |
            next if @remove_gap_chars && Util.is_aa_gap_character?( seq_obj.get_character_code( j ) )
            char = seq_obj.get_residue( j )
            if ( @clean )
              char = Util.clean_seq_str( char )
              next if char.length < 1
            end
            # the line break is written before the first residue of each
            # line; this also terminates the name line
            if counter % @line_width == 0
              f.print( Evoruby::Constants::LINE_DELIMITER )
            end
            f.print( char )
            counter += 1
          end
          f.print( Evoruby::Constants::LINE_DELIMITER )
        end
      end
      nil
    end

  end # class FastaWriter

end # module Evoruby


# diff --git a/forester/ruby/evoruby/lib/evo/io/writer/msa_writer.rb b/forester/ruby/evoruby/lib/evo/io/writer/msa_writer.rb
# new file mode 100644
# index 0000000..36eec15
# --- /dev/null
# +++ b/forester/ruby/evoruby/lib/evo/io/writer/msa_writer.rb
# @@ -0,0 +1,30 @@
#
# = lib/evo/io/writer/msa_writer.rb - MsaWriter class
#
# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
# License:: GNU Lesser General Public License (LGPL)
#
# $Id: msa_writer.rb,v 1.2 2007/06/12 04:51:35 cmzmasek Exp $
#
# last modified: 05/16/2007

require 'lib/evo/util/constants'
require 'lib/evo/util/util'

module Evoruby

  # Abstract base class of the alignment writers.
  # Subclasses define their own constructor and implement write.
  class MsaWriter

    # Always raises TypeError: this class is not meant to be instantiated.
    def initialize()
      raise TypeError, "Cannot instanciate abstract class MsaWriter"
    end

    # To be implemented by subclasses; does nothing here.
    def set_max_name_length( length )
    end

    # To be implemented by subclasses; does nothing here.
    def write( msa, path )
    end

  end # class MsaWriter

end # module Evoruby

# diff --git a/forester/ruby/evoruby/lib/evo/io/writer/nexus_writer.rb b/forester/ruby/evoruby/lib/evo/io/writer/nexus_writer.rb
# new file mode 100644
# index 0000000..17a0c77
# --- /dev/null
# +++ b/forester/ruby/evoruby/lib/evo/io/writer/nexus_writer.rb
# @@ -0,0 +1,82 @@
#
# = lib/evo/io/writer/nexus_writer.rb - NexusWriter class
#
# Copyright:: Copyright (C) 2006-2007 Christian M.
# Zmasek
# License:: GNU Lesser General Public License (LGPL)
#
# $Id: nexus_writer.rb,v 1.4 2009/11/04 01:50:59 cmzmasek Exp $
#
# last modified: 05/16/2007

require 'lib/evo/io/writer/msa_writer'

module Evoruby

  # Writes an aligned Msa as a non-interleaved protein "Data" block in Nexus
  # format.
  class NexusWriter < MsaWriter

    MAX_NAME_LENGTH_DEFAULT = 10

    def initialize()
      @max_name_length = MAX_NAME_LENGTH_DEFAULT
      @clean = false
    end

    # Sets the length names are normalized to; values below 1 select
    # MAX_NAME_LENGTH_DEFAULT.
    def set_max_name_length( length = MAX_NAME_LENGTH_DEFAULT )
      if length < 1
        length = MAX_NAME_LENGTH_DEFAULT
      end
      @max_name_length = length
    end

    # If set, sequences are passed through Util.clean_seq_str before writing.
    def clean( clean = true )
      @clean = clean
    end

    # Appends +msa+ to the file at +path+ (opened in append mode).
    # Names lose trailing whitespace, get remaining whitespace runs replaced
    # by '_' and are normalized to @max_name_length. Returns nil.
    #
    # Raises StandardError if +msa+ is not aligned.
    def write( msa, path )
      if ( !msa.is_aligned() )
        error_msg = "attempt to write unaligned msa in nexus format"
        raise StandardError, error_msg, caller
      end

      Util.check_file_for_writability( path )

      ld = Evoruby::Constants::LINE_DELIMITER

      # block form: the file is closed even if writing raises
      File.open( path, "a" ) do | f |
        f.print( "Begin Data;" )
        f.print( ld )
        f.print( " Dimensions NTax=" )
        f.print( msa.get_number_of_seqs().to_s() )
        f.print( " NChar=" )
        f.print( msa.get_length().to_s() )
        f.print( ";" )
        f.print( ld )
        f.print( " Format DataType=Protein Interleave=No gap=-;" )
        f.print( ld )
        f.print( " Matrix" )
        f.print( ld )
        msa.get_number_of_seqs().times do | i |
          seq_obj = msa.get_sequence( i )
          name = seq_obj.get_name()
          seq = seq_obj.get_sequence_as_string()
          name = name.gsub( /\s+$/, '' )
          name = name.gsub( /\s+/, '_' )
          name = Util.normalize_seq_name( name, @max_name_length )
          f.print( " " )
          f.print( name )
          f.print( " " )
          if ( @clean )
            seq = Util.clean_seq_str( seq )
          end
          f.print( seq )
          f.print( ld )
        end
        f.print( " ;" )
        f.print( ld )
        f.print( "End;" )
        f.print( ld )
      end
      nil
    end

  end # class NexusWriter

end # module Evoruby

# diff --git a/forester/ruby/evoruby/lib/evo/io/writer/phylip_sequential_writer.rb b/forester/ruby/evoruby/lib/evo/io/writer/phylip_sequential_writer.rb
# new file mode 100644
# index 0000000..15f4c5c
# --- /dev/null
# +++ b/forester/ruby/evoruby/lib/evo/io/writer/phylip_sequential_writer.rb
# @@ -0,0 +1,70 @@
#
# = lib/evo/io/writer/phylip_sequential_writer.rb - PhylipSequentialWriter class
#
# Copyright:: Copyright (C) 2006-2007 Christian M.
# Zmasek
# License:: GNU Lesser General Public License (LGPL)
#
# $Id: phylip_sequential_writer.rb,v 1.4 2008/09/03 00:31:38 cmzmasek Exp $
#
# last modified: 05/16/2007

require 'lib/evo/io/writer/msa_writer'

module Evoruby

  # Writes an aligned Msa in sequential Phylip format: a line with the number
  # of sequences and the alignment length, then one "name sequence" line per
  # sequence.
  class PhylipSequentialWriter < MsaWriter

    MAX_NAME_LENGTH_DEFAULT = 10

    def initialize()
      @max_name_length = MAX_NAME_LENGTH_DEFAULT
      @clean = false
    end

    # Sets the length names are normalized to; values below 1 select
    # MAX_NAME_LENGTH_DEFAULT.
    def set_max_name_length( length = MAX_NAME_LENGTH_DEFAULT )
      if length < 1
        length = MAX_NAME_LENGTH_DEFAULT
      end
      @max_name_length = length
    end

    # If set, sequences are passed through Util.clean_seq_str before writing.
    def clean( clean = true )
      @clean = clean
    end

    # Appends +msa+ to the file at +path+ (opened in append mode).
    # Names lose trailing whitespace, get remaining whitespace runs replaced
    # by '_' and are normalized to @max_name_length. Returns nil.
    #
    # Raises StandardError if +msa+ is not aligned.
    def write( msa, path )
      if ( !msa.is_aligned() )
        error_msg = "attempt to write unaligned msa in phylip sequential format"
        raise StandardError, error_msg, caller
      end


      Util.check_file_for_writability( path )

      ld = Evoruby::Constants::LINE_DELIMITER

      # block form: the file is closed even if writing raises
      File.open( path, "a" ) do | f |
        f.print( msa.get_number_of_seqs().to_s() )
        f.print( " " )
        f.print( msa.get_length().to_s() )
        f.print( ld )
        msa.get_number_of_seqs().times do | i |
          seq_obj = msa.get_sequence( i )
          name = seq_obj.get_name()
          seq = seq_obj.get_sequence_as_string()
          name = name.gsub( /\s+$/, '' )
          name = name.gsub( /\s+/, '_' )
          name = Util.normalize_seq_name( name, @max_name_length )
          f.print( name )
          f.print( " " )
          if ( @clean )
            seq = Util.clean_seq_str( seq )
          end
          f.print( seq )
          f.print( ld )
        end
      end
      nil
    end

  end # class PhylipSequentialWriter

end # module Evoruby

# diff --git a/forester/ruby/evoruby/lib/evo/msa/msa.rb b/forester/ruby/evoruby/lib/evo/msa/msa.rb
# new file mode 100644
# index 0000000..2924646
# --- /dev/null
# +++ b/forester/ruby/evoruby/lib/evo/msa/msa.rb
# @@ -0,0 +1,513 @@
#
# = lib/evo/msa/msa.rb - Msa class
#
# Copyright:: Copyright (C) 2006-2007 Christian M.
Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: msa.rb,v 1.11 2009/01/03 00:42:08 cmzmasek Exp $ +# + + +require 'lib/evo/util/constants' +require 'lib/evo/util/util' +require 'lib/evo/sequence/sequence' + +module Evoruby + + class Msa + + def initialize() + @sequences = Array.new() + end + + + def add_sequence( sequence ) + @sequences.push( sequence ) + end + + def add( name, molecular_sequence_str ) + add_sequence( Sequence.new( name, molecular_sequence_str ) ) + end + + def get_sequence( index ) + if ( index < 0 || index > get_number_of_seqs() - 1 ) + error_msg = "attempt to get sequence " << + index.to_s << " in alignment of " << get_number_of_seqs().to_s << + " sequences" + raise ArgumentError, error_msg + end + return @sequences[ index ] + end + + def remove_sequence!( index ) + if ( index < 0 || index > get_number_of_seqs() - 1 ) + error_msg = "attempt to remove sequence " << + index.to_s << " in alignment of " << get_number_of_seqs().to_s << + " sequences" + raise ArgumentError, error_msg + end + @sequences.delete_at( index ) + end + + def is_aligned() + if ( get_number_of_seqs < 1 ) + return false + else + l = @sequences[ 0 ].get_length() + for i in 0 ... get_number_of_seqs() + if ( get_sequence( i ).get_length() != l ) + return false + end + end + end + return true + end + + def find_by_name( name, case_sensitive, partial_match ) + indices = Array.new() + for i in 0 ... get_number_of_seqs() + current_name = get_sequence( i ).get_name() + if !case_sensitive + current_name = current_name.downcase + name = name.downcase + end + if current_name == name || + ( partial_match && current_name.include?( name ) ) + indices.push( i ) + end + end + indices + end + + def find_by_name_start( name, case_sensitive ) + indices = Array.new() + for i in 0 ... 
get_number_of_seqs() + get_sequence( i ).get_name() =~ /^\s*(\S+)/ + current_name = $1 + if !case_sensitive + current_name = current_name.downcase + name = name.downcase + end + if ( current_name == name ) + indices.push( i ) + end + end + indices + end + + def has?( name, case_sensitive = true, partial_match = false ) + for i in 0 ... get_number_of_seqs() + current_name = get_sequence( i ).get_name() + if !case_sensitive + current_name = current_name.downcase + name = name.downcase + end + if current_name == name || + ( partial_match && current_name.include?( name ) ) + return true + end + end + false + end + + # throws ArgumentError + def get_by_name( name, case_sensitive = true, partial_match = false ) + indices = find_by_name( name, case_sensitive, partial_match ) + if ( indices.length > 1 ) + error_msg = "\"" + name + "\" not unique" + raise ArgumentError, error_msg + elsif ( indices.length < 1 ) + error_msg = "\"" + name + "\" not found" + raise ArgumentError, error_msg + end + get_sequence( indices[ 0 ] ) + end + + # throws ArgumentError + def get_by_name_start( name, case_sensitive = true ) + indices = find_by_name_start( name, case_sensitive ) + if ( indices.length > 1 ) + error_msg = "\"" + name + "\" not unique" + raise ArgumentError, error_msg + elsif ( indices.length < 1 ) + error_msg = "\"" + name + "\" not found" + raise ArgumentError, error_msg + end + get_sequence( indices[ 0 ] ) + end + + + def get_sub_alignment( seq_numbers ) + msa = Msa.new() + for i in 0 ... 
seq_numbers.length() + msa.add_sequence( get_sequence( seq_numbers[ i ] ).copy() ) + end + return msa + end + + def get_number_of_seqs() + @sequences.length + end + + def get_length() + if ( !is_aligned() ) + error_msg = "attempt to get length of unaligned msa" + raise StandardError, error_msg, caller + end + if ( get_number_of_seqs() < 1 ) + -1 + else + @sequences[ 0 ].get_length() + end + end + + def to_str() + s = String.new() + for i in 0...get_number_of_seqs() + s += @sequences[ i ].to_str + Constants::LINE_DELIMITER + end + s + end + + def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 ) + if ( !is_aligned() ) + error_msg = "attempt to get overlap diagram of unaligned msa" + raise StandardError, error_msg, caller + end + for i in 0 ... get_number_of_seqs() + io.print( Util.normalize_seq_name( get_sequence( i ).get_name(), max_name_length ) ) + for j in 0 ... get_number_of_seqs() + if i == j + io.print( " " ) + else + if overlap?( i, j, min_overlap ) + io.print( "+" ) + else + io.print( "-" ) + end + end + end + io.print( Evoruby::Constants::LINE_DELIMITER ) + end + end + + #returns array of Msa with an overlap of min_overlap + def split_into_overlapping_msa( min_overlap = 1 ) + if ( !is_aligned() ) + error_msg = "attempt to split into overlapping msas of unaligned msa" + raise StandardError, error_msg, caller + end + msas = Array.new() + bins = get_overlaps( min_overlap ) + for i in 0 ... 
bins.length + msas.push( get_sub_alignment( bins[ i ] ) ) + end + msas + end + + def overlap?( index_1, index_2, min_overlap = 1 ) + seq_1 = get_sequence( index_1 ) + seq_2 = get_sequence( index_2 ) + overlap_count = 0 + for i in 0...seq_1.get_length() + if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) && + !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) + overlap_count += 1 + if overlap_count >= min_overlap + return true + end + end + end + return false + end + + def calculate_overlap( index_1, index_2 ) + seq_1 = get_sequence( index_1 ) + seq_2 = get_sequence( index_2 ) + overlap_count = 0 + for i in 0...seq_1.get_length() + if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) && + !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) + overlap_count += 1 + end + end + overlap_count + end + + def calculate_identities( index_1, index_2 ) + seq_1 = get_sequence( index_1 ) + seq_2 = get_sequence( index_2 ) + identities_count = 0 + for i in 0...seq_1.get_length() + if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) && + !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) && + seq_1.get_character_code( i ) != 63 && + ( seq_1.get_residue( i ).downcase() == + seq_2.get_residue( i ).downcase() ) + identities_count += 1 + end + end + identities_count + end + + def remove_gap_only_columns!() + remove_columns!( get_gap_only_columns() ) + end + + def remove_gap_columns!() + remove_columns!( get_gap_columns() ) + end + + # removes columns for which seqs with gap / number of sequences > gap_ratio + def remove_gap_columns_w_gap_ratio!( gap_ratio ) + remove_columns!( get_gap_columns_w_gap_ratio( gap_ratio ) ) + end + + + def remove_sequences_by_gap_ratio!( gap_ratio ) + if ( !is_aligned() ) + error_msg = "attempt to remove sequences by gap ratio on unaligned msa" + raise StandardError, error_msg, caller + end + n = get_number_of_seqs + removed = Array.new + for s in 0 ... 
n + if ( get_sequence( ( n - 1 ) - s ).get_gap_ratio() > gap_ratio ) + if ( Evoruby::Constants::VERBOSE ) + puts( "removed: " + get_sequence( ( n - 1 ) - s ).get_name ) + end + removed << get_sequence( ( n - 1 ) - s ).get_name + remove_sequence!( ( n - 1 ) - s ) + end + end + removed + end + + + def remove_redundant_sequences!( consider_taxonomy = false, verbose = false ) + n = get_number_of_seqs + removed = Array.new + to_be_removed = Set.new + for i in 0 ... ( n - 1 ) + for j in ( i + 1 ) ... n + if !to_be_removed.include?( i ) && !to_be_removed.include?( j ) + if !consider_taxonomy || + ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) || + ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) ) + if Util.clean_seq_str( get_sequence( i ).get_sequence_as_string ) == + Util.clean_seq_str( get_sequence( j ).get_sequence_as_string ) + to_be_removed.add( j ) + if verbose + tax_i = "" + tax_j = "" + if get_sequence( i ).get_taxonomy != nil + tax_i = get_sequence( i ).get_taxonomy.get_name + end + if get_sequence( j ).get_taxonomy != nil + tax_j = get_sequence( j ).get_taxonomy.get_name + end + puts get_sequence( i ).get_name + " [#{tax_i}] == " + get_sequence( j ).get_name + " [#{tax_j}]" + end + end + end + end + end + end + to_be_removed_ary = to_be_removed.to_a.sort.reverse + + to_be_removed_ary.each { | index | + removed.push( get_sequence( index ).get_name ) + remove_sequence!( index ) + } + removed + end + + + def remove_sequences_by_non_gap_length!( min_non_gap_length ) + if ( !is_aligned() ) + error_msg = "attempt to remove sequences by non gap length on unaligned msa" + raise StandardError, error_msg, caller + end + n = get_number_of_seqs + l = get_length + removed = Array.new + for s in 0 ... 
n + if ( ( l - get_sequence( ( n - 1 ) - s ).get_gap_length ) < min_non_gap_length ) + if ( Evoruby::Constants::VERBOSE ) + puts( "removed: " + get_sequence( ( n - 1 ) - s ).get_name ) + end + removed << get_sequence( ( n - 1 ) - s ).get_name + remove_sequence!( ( n - 1 ) - s ) + end + end + removed + end + + def trim!( first, last ) + cols = Array.new() + for i in 0 ... get_length() + if ( i < first || i > last ) + cols.push( i ) + end + end + remove_columns!( cols ) + end + + def get_gap_only_columns() + if ( !is_aligned() ) + error_msg = "attempt to get gap only columns of unaligned msa" + raise StandardError, error_msg, caller + end + cols = Array.new() + for c in 0 ... get_length + nogap_char_found = false + for s in 0 ... get_number_of_seqs + unless Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) ) + nogap_char_found = true + break + end + end + unless nogap_char_found + cols.push( c ) + end + end + return cols + end + + def get_gap_columns() + if ( !is_aligned() ) + error_msg = "attempt to get gap columns of unaligned msa" + raise StandardError, error_msg, caller + end + cols = Array.new() + for c in 0 ... get_length + gap_char_found = false + for s in 0 ... get_number_of_seqs + if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) ) + gap_char_found = true + break + end + end + if gap_char_found + cols.push( c ) + end + end + return cols + end + + # gap_ratio = seqs with gap / number of sequences + # returns column indices for which seqs with gap / number of sequences > gap_ratio + def get_gap_columns_w_gap_ratio( gap_ratio ) + if ( !is_aligned() ) + error_msg = "attempt to get gap columns with gap_ratio of unaligned msa" + raise StandardError, error_msg, caller + end + if ( gap_ratio < 0 || gap_ratio > 1 ) + error_msg = "gap ratio must be between 0 and 1 inclusive" + raise ArgumentError, error_msg, caller + end + cols = Array.new() + for c in 0 ... get_length + gap_chars_found = 0 + for s in 0 ... 
get_number_of_seqs + if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) ) + gap_chars_found += 1 + end + end + if ( ( gap_chars_found.to_f / get_number_of_seqs ) > gap_ratio ) + cols.push( c ) + end + end + return cols + end + + + # Split an alignment into n alignemnts of equal size, except last one + def split( n, verbose = false ) + if ( n < 2 || n > get_number_of_seqs ) + error_msg = "attempt to split into less than two or more than the number of sequences" + raise StandardError, error_msg, caller + end + msas = Array.new() + r = get_number_of_seqs % n + x = get_number_of_seqs / n + for i in 0 ... n + msa = Msa.new() + s = 0 + + if ( ( r > 0 ) && ( i == ( n - 1 ) ) ) + y = x + r + if ( verbose ) + puts( i.to_s + ": " + y.to_s ) + end + for j in 0 ... y + msa.add_sequence( get_sequence( ( i * x ) + j ) ) + end + else + if ( verbose ) + puts( i.to_s + ": " + x.to_s ) + end + for j in 0 ... x + msa.add_sequence( get_sequence( ( i * x ) + j ) ) + end + end + msas.push( msa ) + end + msas + end + + + private + + def get_overlaps( min_overlap = 1 ) + if ( !is_aligned() ) + error_msg = "attempt to get overlaps of unaligned msa" + raise StandardError, error_msg, caller + end + bins = Array.new() + for i in 0 ... get_number_of_seqs() + found_bin = false + for j in 0 ... bins.length + current_bin = bins[ j ] + # does seq i overlap with all seqs in current_bin? + all_overlap = true + for z in 0 ... current_bin.length + unless overlap?( i, current_bin[ z ], min_overlap ) + all_overlap = false + break + end + end + if all_overlap + current_bin.push( i ) + found_bin = true + end + end + unless found_bin + new_bin = Array.new() + new_bin.push( i ) + bins.push( new_bin ) + end + end + return bins + end + + def remove_columns!( cols ) + if ( !is_aligned() ) + error_msg = "attempt to remove columns of unaligned msa" + raise StandardError, error_msg, caller + end + cols.reverse!() + for c in 0 ... cols.length() + col = cols[ c ] + for s in 0 ... 
get_number_of_seqs() + get_sequence( s ).delete_residue!( col ) + end + end + return self + end + + + end # class Msa + +end # module Evoruby + diff --git a/forester/ruby/evoruby/lib/evo/msa/msa_factory.rb b/forester/ruby/evoruby/lib/evo/msa/msa_factory.rb new file mode 100644 index 0000000..551161e --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/msa/msa_factory.rb @@ -0,0 +1,24 @@ +# +# = lib/evo/msa/msa_factory.rb - MsaFactory class +# +# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: msa_factory.rb,v 1.2 2007/06/12 04:51:34 cmzmasek Exp $ +# +# last modified: 05/16/2007 + +module Evoruby + + class MsaFactory + + def initialize + end + + def create_msa_from_file( path, msa_parser ) + msa_parser.parse( path ) + end + + end # class MsaFactory + +end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/sequence/domain_structure.rb b/forester/ruby/evoruby/lib/evo/sequence/domain_structure.rb new file mode 100644 index 0000000..8189218 --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/sequence/domain_structure.rb @@ -0,0 +1,73 @@ +# +# = lib/evo/sequence/domain_structure.rb - DomainStructure class +# +# Copyright:: Copyright (C) 2006-2007 Christian M. 
module Evoruby

  # Holds the protein domains located on one sequence, keyed by their start
  # position, and renders them as one separator-delimited string (to_NHX).
  class DomainStructure

    # total_length: value reported first by to_NHX and by get_total_length
    def initialize( total_length )
      @total_length = total_length
      @domains      = {}
    end

    # Stores domain under its start position (domain.get_from).
    #
    # If the start position is already taken by a domain with the same end
    # position, a warning is printed; when overwrite_if_same_from_to is true
    # the stored domain is then kept only if its confidence value is strictly
    # lower than the new one, otherwise it is replaced.
    # In every other collision the key is nudged upwards in steps of 0.0001
    # until a free one is found, so both domains are kept.
    def add_domain( domain, overwrite_if_same_from_to )
      slot = domain.get_from
      return @domains[ slot ] = domain unless @domains.has_key?( slot )

      occupant = @domains[ slot ]
      if occupant.get_to == domain.get_to
        puts( "WARNING: more than one domain at the same location [" +
              slot.to_s + "-" + domain.get_to.to_s + "]: " + occupant.get_name + " and " + domain.get_name)
        if overwrite_if_same_from_to
          puts( " ignored the one with higher E-value [" +
                occupant.get_confidence().to_s + " vs " + domain.get_confidence().to_s + "]" )
          @domains[ slot ] = domain unless occupant.get_confidence() < domain.get_confidence()
          return
        end
      end

      slot += 0.0001 while @domains.has_key?( slot )
      @domains[ slot ] = domain
    end

    # Returns the total length followed, for every domain in key order, by
    # separator + from + separator + to + separator + confidence + separator + name.
    def to_NHX
      nhx = String.new
      nhx << get_total_length.to_s
      @domains.sort.each do | _slot, located |
        [ located.get_from.to_s,
          located.get_to.to_s,
          located.get_confidence.to_s,
          located.get_name ].each do | field |
          nhx << Evoruby::Constants::DOMAIN_STRUCTURE_NHX_SEPARATOR << field
        end
      end
      nhx
    end

    def get_total_length
      @total_length
    end

  end # class DomainStructure

end # module Evoruby
module Evoruby

  # Plain value holder for one protein domain hit: a name, a from/to pair,
  # an id and a confidence value. Name and id are stored as private copies
  # of the strings handed to the constructor.
  class ProteinDomain

    def initialize( name, from, to, id, confidence )
      @name       = String.new( name )
      @from, @to  = from, to
      @id         = String.new( id )
      @confidence = confidence
    end

    def get_name
      @name
    end

    def get_from
      @from
    end

    def get_to
      @to
    end

    def get_id
      @id
    end

    def get_confidence
      @confidence
    end

  end # class ProteinDomain

end # module Evoruby
require 'set'

module Evoruby

  # A named molecular sequence with optional accession, taxonomy and symbol
  # annotation. All string attributes are stored as private copies; name,
  # accessions and symbol are stripped of surrounding whitespace, and the
  # optional ones default to an empty string when nil is given.
  class Sequence

    def initialize( name,
        molecular_sequence_str,
        accession = nil,
        accession_source = nil,
        taxonomy = nil,
        symbol = nil,
        secondary_accession = nil,
        secondary_accession_source = nil )
      @name                       = String.new( name.strip )
      @molecular_sequence         = String.new( molecular_sequence_str )
      @accession                  = stripped_copy( accession )
      @accession_source           = stripped_copy( accession_source )
      @taxonomy                   = taxonomy
      @symbol                     = stripped_copy( symbol )
      @secondary_accession        = stripped_copy( secondary_accession )
      @secondary_accession_source = stripped_copy( secondary_accession_source )
    end

    # Returns a new Sequence with the same attribute values; the taxonomy
    # (when present) is duplicated through its own copy method.
    def copy
      taxonomy_copy = get_taxonomy.nil? ? nil : get_taxonomy.copy
      Sequence.new( get_name,
                    get_sequence_as_string,
                    get_accession,
                    get_accession_source,
                    taxonomy_copy,
                    get_symbol,
                    get_secondary_accession,
                    get_secondary_accession_source )
    end

    def get_name()
      @name
    end

    def set_name( name )
      @name = name
    end

    def get_sequence_as_string()
      @molecular_sequence
    end

    def get_accession()
      @accession
    end

    def get_accession_source()
      @accession_source
    end

    def get_secondary_accession()
      @secondary_accession
    end

    def get_secondary_accession_source()
      @secondary_accession_source
    end

    def get_symbol()
      @symbol
    end

    def get_taxonomy()
      @taxonomy
    end

    def get_length()
      @molecular_sequence.length
    end

    # Returns the one-character string at position.
    # raises ArgumentError if position is out of range
    def get_residue( position )
      get_slice( position, 1 )
    end

    # Returns the byte value at position (nil when position is out of range).
    def get_character_code( position )
      @molecular_sequence.getbyte( position )
    end

    # Fraction of positions that are gap characters.
    # NOTE: for a zero-length sequence this is 0.0 / 0, i.e. NaN.
    def get_gap_ratio()
      get_gap_length().to_f / get_length()
    end

    # Number of positions for which Util.is_aa_gap_character? is true.
    def get_gap_length()
      ( 0 ... get_length() ).count do | i |
        Util.is_aa_gap_character?( get_character_code( i ) )
      end
    end

    # Removes (and returns) the residue at position.
    # raises ArgumentError if position is out of range
    def delete_residue!( position )
      if ( position < 0 || position >= get_length() )
        raise ArgumentError, "attempt to delete residue at postion out of range"
      end
      @molecular_sequence.slice!( position )
    end

    # Returns length residues starting at start without changing the sequence.
    # raises ArgumentError if the requested range is not inside the sequence
    def get_slice( start, length )
      check_slice_range( start, length )
      @molecular_sequence.slice( start, length )
    end

    # Like get_slice, but also removes the returned residues from the sequence.
    # raises ArgumentError if the requested range is not inside the sequence
    def get_slice!( start, length )
      check_slice_range( start, length )
      @molecular_sequence.slice!( start, length )
    end

    # Returns a new Sequence with this sequence's name and the residues
    # first .. last (both inclusive); no other annotation is carried over.
    # raises ArgumentError if last < first
    def get_subsequence( first, last )
      if ( last < first )
        # interpolation: first/last are numbers, "+" on them raised TypeError
        raise ArgumentError, "attempt to get subsequence from #{first} to #{last}"
      end
      Sequence.new( get_name, @molecular_sequence.slice( first, last - first + 1 ) )
    end

    def append!( molecular_sequence_str )
      @molecular_sequence.concat( molecular_sequence_str )
    end

    def to_str()
      "[" + @name + "] " + @molecular_sequence
    end

    private

    # Empty string for nil, otherwise a stripped private copy of text.
    def stripped_copy( text )
      text.nil? ? String.new : String.new( text.strip )
    end

    # Shared range check of get_slice and get_slice!.
    def check_slice_range( start, length )
      if ( start < 0 || start + length > get_length() )
        raise ArgumentError, "attempt to get sequence residue(s) at postion out of range"
      end
    end

  end # class Sequence

end # module Evoruby
module Evoruby

  # Runs the external FastME executable whose path is read from the
  # environment variable named by ResourceLocations::FASTME_HOME_ENV_VARIABLE.
  class FastMe

    VERBOSE = false

    OUTTREE  = 'output.tre'
    OUTPUT_D = 'output.d'
    VERSION  = '2.0'

    # raises StandardError / ArgumentError (from Util) if the environment
    # variable is unset or does not point to a readable file
    def initialize
      @fast_me_home = Util.get_env_variable_value( ResourceLocations::FASTME_HOME_ENV_VARIABLE )
      Util.check_file_for_readability( @fast_me_home )
    end

    # Invokes the executable on pwd_file and returns everything it wrote to
    # standard output.
    #
    # pwd_file         - path of the input file (must be readable)
    # bootstrap_number - non-negative Integer; "-n <number>" is only passed
    #                    when it is greater than 1
    # initial_tree     - one of :BME, :GME, :NJ
    #
    # raises ArgumentError for a nil/negative bootstrap_number or an unknown
    # initial_tree
    def run( pwd_file, bootstrap_number, initial_tree )
      Util.check_file_for_readability( pwd_file )
      if bootstrap_number.nil? || bootstrap_number < 0
        # inspect + interpolation: "+" on nil or an Integer raised TypeError
        # here and hid the intended ArgumentError
        raise ArgumentError, "illegal bootstrap number: #{bootstrap_number.inspect}"
      end
      init_tree_option = determine_initial_tree( initial_tree )
      input = if bootstrap_number > 1
        "-b #{init_tree_option} -i #{pwd_file} -n #{bootstrap_number} -s b"
      else
        "-b #{init_tree_option} -i #{pwd_file} -s b"
      end
      command = @fast_me_home + " " + input
      puts command if VERBOSE
      IO.popen( command, 'r+' ) do | io |
        io.close_write
        io.read
      end
    end

    private

    # Maps the initial tree symbol to the option text passed after "-b".
    # raises ArgumentError for anything but :BME, :GME, :NJ
    def determine_initial_tree( initial_tree )
      case initial_tree
      when :BME then "BME"
      when :GME then "GME"
      when :NJ  then "NJ"
      else
        raise ArgumentError, "unknown initial tree"
      end
    end

  end # class FastMe

end # module Evoruby
module Evoruby

  # Wrapper that starts an external program and feeds it an option line on
  # standard input.
  # NOTE(review): despite its name this class reads the executable path from
  # ResourceLocations::FASTME_HOME_ENV_VARIABLE and builds FastME-style
  # options; it looks like an unfinished copy of FastMe -- confirm before use.
  class Raxml

    VERBOSE = true

    def initialize
      @fast_me_home = Util.get_env_variable_value( ResourceLocations::FASTME_HOME_ENV_VARIABLE )
      Util.check_file_for_readability( @fast_me_home )
    end

    # Starts the executable, writes the option line to its standard input and
    # returns everything it wrote to standard output.
    #
    # pwd_file         - path of the input file (must be readable)
    # bootstrap_number - non-negative Integer; "-n <number>" is only added
    #                    when it is greater than 1
    # initial_tree     - one of the strings "BME", "GME", "NJ"
    #
    # raises ArgumentError for a nil/negative bootstrap_number or any other
    # initial_tree
    def run( pwd_file, bootstrap_number, initial_tree )
      Util.check_file_for_readability( pwd_file )
      if bootstrap_number.nil? || bootstrap_number < 0
        # interpolation instead of "+": concatenating nil/Integer raised TypeError
        raise ArgumentError, "illegal bootstrap number: #{bootstrap_number.inspect}"
      end
      unless [ "BME", "GME", "NJ" ].any? { | allowed | allowed.eql?( initial_tree ) }
        raise ArgumentError, "illegal initial tree: #{initial_tree.inspect}"
      end
      # double quotes are required here: the single-quoted originals sent the
      # literal text '#{initial_tree}' etc. to the program
      input = if bootstrap_number > 1
        "-b #{initial_tree} -i #{pwd_file} -n #{bootstrap_number} -s b"
      else
        "-b #{initial_tree} -i #{pwd_file} -s b"
      end
      puts @fast_me_home + " " + input if VERBOSE
      IO.popen( @fast_me_home, 'r+' ) do | io |
        io.puts input
        io.close_write
        io.read
      end
    end

  end # class Raxml

end # module Evoruby
module Evoruby

  # Names of the environment variables that hold the paths of the external
  # programs used by the wrappers in this directory.
  class ResourceLocations

    FASTME_HOME_ENV_VARIABLE     = 'FASTME_HOME'
    TREEPUZZLE_HOME_ENV_VARIABLE = 'TREEPUZZLE_HOME'

  end # ResourceLocations

end # module Evoruby

module Evoruby

  # Drives the external TREE-PUZZLE executable by writing a sequence of
  # menu key strokes to its standard input.
  class TreePuzzle

    VERBOSE = false

    OUTDIST = 'outdist'
    OUTFILE = 'outfile'
    VERSION = '5.2'

    def initialize
      @tree_puzzle_home = Util.get_env_variable_value( ResourceLocations::TREEPUZZLE_HOME_ENV_VARIABLE )
      Util.check_file_for_readability( @tree_puzzle_home )
    end

    # Starts the executable, sends the alignment file name followed by the
    # menu key strokes for the requested settings, and returns everything the
    # program wrote to standard output.
    #
    # alignment_file     - path of the alignment (must be readable); the
    #                      string itself is not modified
    # model              - :pam, :jtt, :mtrev24, :blosum62, :vt, :wag or :auto
    # rate_heterogeneity - :gamma8, :inv1_var1, :inv1_gamma8 or :uniform
    # number_of_seqs     - an extra "k" is sent when this is <= 257
    #
    # raises ArgumentError for an unknown model or rate heterogeneity option
    def run( alignment_file, model, rate_heterogeneity, number_of_seqs )
      Util.check_file_for_readability( alignment_file )

      # work on a copy: appending to alignment_file itself changed the
      # caller's string (and failed outright on a frozen one)
      input = String.new( alignment_file )
      input << "\nk\nk"
      if number_of_seqs <= 257
        input << "\nk"
      end
      input << determine_model_option( model )
      input << determine_rate_heterogeneity_option( rate_heterogeneity )
      input << "\ny\n"

      if VERBOSE
        puts @tree_puzzle_home + " " + input
      end
      IO.popen( @tree_puzzle_home, 'r+' ) do | io |
        io.puts input
        io.close_write
        io.read
      end
    end

    private

    # "Model of substitution" order for DQO TREE-PUZZLE 5.0:
    # Auto
    # m -> Dayhoff (Dayhoff et al. 1978)
    # m -> JTT (Jones et al. 1992)
    # m -> mtREV24 (Adachi-Hasegawa 1996)
    # m -> BLOSUM62 (Henikoff-Henikoff 92)
    # m -> VT (Mueller-Vingron 2000)
    # m -> WAG (Whelan-Goldman 2000)
    # m -> Auto
    #
    # Returns one "\nm" per menu step needed to reach model.
    # raises ArgumentError for an unknown model
    def determine_model_option( model )
      steps = case model
      when :auto     then 0
      when :pam      then 1
      when :jtt      then 2
      when :mtrev24  then 3
      when :blosum62 then 4
      when :vt       then 5
      when :wag      then 6
      else
        raise ArgumentError, "unknown model"
      end
      "\nm" * steps
    end

    # Model of rate heterogeneity:
    # "8 Gamma distributed rates"
    # "Two rates (1 invariable + 1 variable)"
    # "Mixed (1 invariable + 8 Gamma rates)"
    # otherwise: Uniform rate
    #
    # Returns one "\nw" per menu step needed to reach rates.
    # raises ArgumentError for an unknown option
    def determine_rate_heterogeneity_option( rates )
      steps = case rates
      when :uniform     then 0
      when :gamma8      then 1
      when :inv1_var1   then 2
      when :inv1_gamma8 then 3
      else
        raise ArgumentError, "unknown rate heterogeneity option"
      end
      "\nw" * steps
    end

  end # class TreePuzzle

end # module Evoruby
module Evoruby

  # Sparse two-dimensional table addressed by non-negative row and column
  # indices. Cells that were never set read as nil.
  class BasicTable

    def initialize()
      @rows    = Hash.new
      @max_row = 0
      @max_col = 0
    end

    # Stores value at (row, col), growing the table bounds as needed.
    # raises ArgumentError if row or col is negative
    def set_value( row, col, value )
      if ( ( row < 0 ) || ( col < 0 ) )
        raise( ArgumentError, "attempt to use negative values for row or column" )
      end
      set_max_row( row ) if row > get_max_row()
      set_max_col( col ) if col > get_max_col()
      row_map = ( @rows[ row ] ||= Hash.new )
      row_map[ col ] = value
    end

    # raises ArgumentError
    def get_value_as_string( row, col )
      get_value( row, col ).to_s
    end

    # Returns the value at (row, col), or nil if the cell was never set.
    # raises ArgumentError if row or col lies outside 0 .. max
    def get_value( row, col )
      if ( ( row > get_max_row() ) || ( row < 0 ) )
        raise( ArgumentError, "value for row (#{row}) is out of range [max row: #{get_max_row()}]" )
      elsif ( ( col > get_max_col() ) || ( col < 0 ) ) # was "row < 0": negative columns slipped through
        raise( ArgumentError, "value for column (#{col}) is out of range [max column: #{get_max_col()}]" )
      end
      row_map = @rows[ row ]
      return nil if row_map.nil? || row_map.length < 1
      row_map[ col ]
    end

    def get_max_col()
      @max_col
    end

    def get_max_row()
      @max_row
    end

    # Builds a Hash from two columns: for every row in which both cells are
    # set, the key_col value maps to the value_col value.
    # raises ArgumentError if a key occurs more than once (or a column index
    # is out of range)
    def get_columns_as_map( key_col, value_col )
      map = Hash.new
      ( 0 .. get_max_row ).each do | row |
        key   = get_value( row, key_col )
        value = get_value( row, value_col )
        next if key.nil? || value.nil?
        if map.has_key?( key )
          # interpolation so that non-String keys can be reported as well
          raise( ArgumentError, "attempt to use non-unique table value as key [#{key}]" )
        end
        map[ key ] = value
      end
      map
    end

    # For every row: one line with the column indices, then one line with
    # "<row>: <value> " for each column (unset cells print as empty).
    def to_s
      str = String.new
      ( 0 .. get_max_row ).each do | row |
        ( 0 .. get_max_col ).each do | col |
          str << col.to_s << " "
        end
        str << Evoruby::Constants::LINE_DELIMITER
        ( 0 .. get_max_col ).each do | col |
          str << row.to_s << ": "
          # to_s: "<<" with nil raised TypeError and with an Integer appended
          # a code point instead of the number
          str << get_value( row, col ).to_s << " "
        end
        str << Evoruby::Constants::LINE_DELIMITER
      end
      str
    end


    private

    def get_row( row )
      @rows[ row ]
    end

    def set_max_col( max_col )
      @max_col = max_col
    end

    def set_max_row( max_row )
      @max_row = max_row
    end

  end # class BasicTable

end # module Evoruby

module Evoruby

  # Taxonomy record with a code, an id, a scientific name and an optional
  # common name (empty string when not given). All values are stored as
  # stripped private copies.
  class SpTaxonomy

    attr_accessor :code, :id, :scientific_name, :common_name

    def initialize( code, id, scientific_name, common_name = nil )
      @code            = String.new( code.strip() )
      @id              = String.new( id.strip() )
      @scientific_name = String.new( scientific_name.strip() )
      @common_name     = common_name.nil? ? String.new() : String.new( common_name.strip() )
    end

    # Returns a new SpTaxonomy with the same four values.
    # (The previous version called Taxonomy.new with four arguments, which
    # is a different class and fails with ArgumentError.)
    def copy
      SpTaxonomy.new( code, id, scientific_name, common_name )
    end

    def to_str()
      code + " " + id + ": N=" + scientific_name
    end

  end # class SpTaxonomy

end # module Evoruby
module Evoruby

  # Taxonomy record consisting of a name plus an optional id and id source.
  # All three are stored as stripped private copies; id and id source default
  # to an empty string.
  class Taxonomy

    def initialize( name, id = nil, id_source = nil )
      @name      = String.new( name.strip() )
      @id        = id.nil? ? String.new() : String.new( id.strip() )
      @id_source = id_source.nil? ? String.new() : String.new( id_source.strip() )
    end

    # Two taxonomies are equal when name, id and id source are all equal.
    # nil and any object that does not offer the three getters compare as
    # not equal (previously such objects raised NoMethodError).
    def == ( taxonomy )
      comparable = [ :get_name, :get_id, :get_id_source ].all? do | getter |
        taxonomy.respond_to?( getter )
      end
      return false unless comparable
      ( get_name == taxonomy.get_name ) &&
        ( get_id == taxonomy.get_id ) &&
        ( get_id_source == taxonomy.get_id_source )
    end

    # Returns a new Taxonomy with the same name, id and id source.
    def copy
      Taxonomy.new( get_name, get_id, get_id_source )
    end

    def get_name()
      @name
    end

    def get_id()
      @id
    end

    def get_id_source()
      @id_source
    end

    # "<name>" when no id is set, otherwise "[<id>] <name>".
    def to_str()
      # @id is always a String (see initialize), so empty? is sufficient and
      # avoids depending on Util, which this file does not require
      if get_id.empty?
        @name
      else
        "[" + get_id + "] " + @name
      end
    end

  end # class Taxonomy

end # module Evoruby
module Evoruby

  # Splits a list of command line arguments into short options ("-name" or
  # "-name=value"), extended options ("--name" or "--name=value") and file
  # names (everything else). Short and extended options share one name
  # space: a name may be given only once across both.
  class CommandLineArguments

    OPTIONS_PREFIX          = "-"
    EXTENDED_OPTIONS_PREFIX = "--"
    OPTIONS_SEPARATOR       = "="

    # raises ArgumentError
    def initialize( args )
      @file_names       = []
      @options          = {}
      @extended_options = {}
      parse_arguments( args )
    end

    def get_file_names
      @file_names
    end

    def get_file_name( i )
      @file_names[ i ]
    end

    def get_number_of_files()
      @file_names.length
    end

    # true if option_name was given, with or without a value
    def is_option_set?( option_name )
      get_all_options.has_key?( option_name )
    end

    # Returns the value given for option_name.
    # raises ArgumentError if the option is missing or was given without value
    def get_option_value( option_name )
      known = get_all_options
      unless known.has_key?( option_name )
        raise( ArgumentError, "option \"" + option_name +
            "\" is not set", caller )
      end
      value = known[ option_name ]
      if Util.is_string_empty?( value )
        raise( ArgumentError, "value for option \"" +
            option_name + "\" is not set", caller )
      end
      value
    end

    def get_option_value_as_int( option_name )
      get_option_value( option_name ).to_i
    end

    def get_option_value_as_float( option_name )
      get_option_value( option_name ).to_f
    end

    # mandatory_options (Array)
    # Returns those mandatory option names that were not given.
    def validate_mandatory_options( mandatory_options )
      given = get_all_options
      absent = []
      mandatory_options.each do | name |
        absent << name unless given.has_key?( name )
      end
      absent
    end

    # mandatory_options (Array)
    # Same as validate_mandatory_options, joined with ", ".
    def validate_mandatory_options_as_str( mandatory_options )
      validate_mandatory_options( mandatory_options ).join( ", " )
    end

    # allowed_options (Array)
    # Returns those given option names that are not in allowed_options.
    def validate_allowed_options( allowed_options )
      get_all_options.keys.reject do | name |
        allowed_options.include?( name )
      end
    end

    # allowed_options (Array)
    # Same as validate_allowed_options, joined with ", ".
    def validate_allowed_options_as_str( allowed_options )
      validate_allowed_options( allowed_options ).join( ", " )
    end

    private

    # Fresh Hash holding the short options followed by the extended ones.
    def get_all_options
      get_options_list.merge( get_extended_options_list )
    end

    # The extended prefix is tested first because it starts with the short one.
    def parse_arguments( args )
      args.each do | arg |
        if arg.index( EXTENDED_OPTIONS_PREFIX ) == 0
          parse_option( arg.slice( EXTENDED_OPTIONS_PREFIX.length .. -1 ),
            get_extended_options_list )
        elsif arg.index( OPTIONS_PREFIX ) == 0
          parse_option( arg.slice( OPTIONS_PREFIX.length .. -1 ),
            get_options_list )
        else
          get_file_names.push( arg )
        end
      end
    end

    # Stores one option (prefix already removed) in options_map; an option
    # without separator gets the empty string as value.
    # raises ArgumentError
    def parse_option( option, options_map )
      sep_index = option.index( OPTIONS_SEPARATOR )
      if sep_index
        key   = option[ 0, sep_index ]
        value = option.slice( ( sep_index + 1 ) .. -1 )
        if Util.is_string_empty?( key )
          raise( ArgumentError, "attempt to set option with an empty name" )
        end
        if Util.is_string_empty?( value )
          raise( ArgumentError, "attempt to set option with an empty value" )
        end
        if get_all_options.has_key?( key )
          raise( ArgumentError, "attempt to set option \"" +
              key + "\" mutiple times [" + option + "]" )
        end
        options_map[ key ] = value
      else
        if Util.is_string_empty?( option )
          raise( ArgumentError, "attempt to set option with an empty name" )
        end
        if get_all_options.has_key?( option )
          raise( ArgumentError, "attempt to set option \"" +
              option + "\" mutiple times" )
        end
        options_map[ option ] = ""
      end
    end

    def get_file_names_list
      @file_names
    end

    def get_options_list
      @options
    end

    def get_extended_options_list
      @extended_options
    end

  end # class CommandLineArguments

end # module Evoruby
module Evoruby

  # Constants shared by the evoruby classes and programs.
  class Constants

    VERBOSE = true

    EVORUBY_VERSION = '1.0'

    FORESTER_HOME_ENV_VARIABLE = 'FORESTER_HOME'
    JAVA_HOME_ENV_VARIABLE = 'JAVA_HOME'

    EVORUBY = 'evoruby'

    LINE_DELIMITER = "\n"

    FILE_SEPARATOR = File::SEPARATOR

    DOMAIN_STRUCTURE_NHX_SEPARATOR = '>'


  end # class Constants

end # module Evoruby

module Evoruby

  # Collection of stateless helper methods (all are class methods).
  class Util

    # Returns name cut or right-padded with spaces to exactly length characters.
    def Util.normalize_seq_name( name, length )
      name[ 0, length ].ljust( length )
    end

    #    def Util.normalize_mol_sequence( seq )
    #      new_seq = String.new()
    #      for i in 0 ... seq.length
    #        c = seq.get_slice( i )
    #        if is_aa_gap_character?( c )
    #          new_seq = new_seq + "-"
    #        else
    #          new_seq = new_seq + c
    #        end
    #      end
    #      new_seq
    #    end


    # Returns true if char_code corresponds to: space * - . _
    # (any code <= 32, i.e. space and the control characters below it, counts)
    def Util.is_aa_gap_character?( char_code )
      return ( char_code <= 32 || char_code == 42 || char_code == 45 || char_code == 46 ||char_code == 95 )
    end

    # Deletes *, digits, and whitespace, replaces BJOUZ? with X, and replaces non-(letters, -) with -
    # The result is upper case; seq_str itself is not modified.
    def Util.clean_seq_str( seq_str )
      seq_str.upcase.
        gsub( /\s+/, '' ).
        gsub( /\d+/, '' ).
        gsub( '*', '' ).
        gsub( /[BJOUZ?]/, 'X' ).
        gsub( /[^A-Z\-]/, '-' )
    end

    # Checks that path is an existing, regular, readable, non-empty file.
    # raises ArgumentError
    def Util.check_file_for_readability( path )
      unless ( File.exist?( path ) )
        error_msg = "file [#{path}] does not exist"
        raise ArgumentError, error_msg
      end
      unless ( File.file?( path ) )
        error_msg = "file [#{path}] is not a regular file"
        raise ArgumentError, error_msg
      end
      unless ( File.readable?( path ) )
        error_msg = "file [#{path}] is not a readable file"
        raise ArgumentError, error_msg
      end
      if ( File.zero?( path ) )
        error_msg = "file [#{path}] is empty"
        raise ArgumentError, error_msg
      end
    end

    # Checks that path can be created as a new file: it must not exist yet
    # (neither as directory nor as file) and its parent directory must be
    # writable.
    # raises ArgumentError
    def Util.check_file_for_writability( path )
      if File.directory?( path )
        error_msg = "file [#{path}] is an existing directory"
        raise ArgumentError, error_msg
      elsif File.exist?( path )
        error_msg = "file [#{path}] already exists"
        raise ArgumentError, error_msg
      elsif !File.writable?( File.dirname( path ) )
        # The old test, File.writable?( path ), could never be true here
        # because path does not exist at this point; what decides whether the
        # file can be created is the directory it would be created in.
        error_msg = "file [#{path}] is not writeable"
        raise ArgumentError, error_msg
      end
    end

    # Like check_file_for_writability, but reports problems via fatal_error
    # (which terminates the program).
    def Util.fatal_error_if_not_writable( prg_name, path )
      begin
        Util.check_file_for_writability( path )
      rescue ArgumentError => e
        Util.fatal_error( prg_name, e.to_s )
      end
    end

    # Like check_file_for_readability, but reports problems via fatal_error
    # (which terminates the program).
    def Util.fatal_error_if_not_readable( prg_name, path )
      begin
        Util.check_file_for_readability( path )
      rescue ArgumentError => e
        Util.fatal_error( prg_name, e.to_s )
      end
    end

    # Returns the value of the environment variable env_variable.
    # raises StandardError if it is unset or empty
    def Util.get_env_variable_value( env_variable )
      value = ENV[env_variable]
      if value == nil || value.empty?
        error_msg = "apparently environment variable #{env_variable} has not been set"
        raise StandardError, error_msg
      end
      value
    end


    # Reads path into an Array with one entry per non-blank line, stripped of
    # surrounding whitespace. If split_by_semicolon is true, a line that
    # contains ";" contributes one stripped entry per semicolon-separated part.
    # raises ArgumentError
    def Util.file2array( path, split_by_semicolon )
      Util.check_file_for_readability( path )
      a = Array.new()
      File.open( path ) do | file |
        while line = file.gets
          next unless line =~ /^\s*(\S.*?)\s*$/
          s = $1
          if ( split_by_semicolon && s =~ /;/ )
            # strip (not strip!, which yields nil when nothing is removed) and
            # append every part instead of overwriting one slot
            s.split( /;/ ).each { | part | a.push( part.strip ) }
          else
            a.push( s )
          end
        end
      end
      return a
    end

    # Prints the program banner to io. Terminates the program with exit
    # status -1 if the running Ruby is older than 1.9.
    def Util.print_program_information( prg_name,
        prg_version,
        prg_desc,
        date,
        copyright,
        contact,
        www,
        io = STDOUT )

      # numeric comparison: the old pattern test (!~ /1.9/) rejected every
      # Ruby release other than 1.9.x, including all newer ones
      if ( RUBY_VERSION.split( '.' ).map { | part | part.to_i } <=> [ 1, 9 ] ) < 0
        io.puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9 or later" )
        exit( -1 )
      end

      banner = prg_name + " " + prg_version + " [" + date + "] [ruby " + RUBY_VERSION + "]"
      io.print( Constants::LINE_DELIMITER )
      io.print( banner )
      io.print( Constants::LINE_DELIMITER )
      io.print( "_" * banner.length )
      io.print( Constants::LINE_DELIMITER )
      io.print( Constants::LINE_DELIMITER )
      io.print( prg_desc )
      io.print( Constants::LINE_DELIMITER )
      io.print( Constants::LINE_DELIMITER )
      io.print( "Copyright (C) " + copyright )
      io.print( Constants::LINE_DELIMITER )
      io.print( "Contact: " + contact )
      io.print( Constants::LINE_DELIMITER )
      io.print( " " + www )
      io.print( Constants::LINE_DELIMITER )
      io.print( Constants::LINE_DELIMITER )
    end

    # Prints message to io and terminates the program with exit status -1.
    def Util.fatal_error( prg_name, message, io = STDOUT )
      io.print( Constants::LINE_DELIMITER )
      if ( !Util.is_string_empty?( prg_name ) )
        io.print( "[" + prg_name + "] > " + message )
      else
        io.print( " > " + message )
      end
      io.print( Constants::LINE_DELIMITER )
      io.print( Constants::LINE_DELIMITER )
      exit( -1 )
    end

    # Prints "[prg_name] > message" (or " > message" without program name).
    def Util.print_message( prg_name, message, io = STDOUT )
      if ( !Util.is_string_empty?( prg_name ) )
        io.print( "[" + prg_name + "] > " + message )
      else
        io.print( " > " + message )
      end
      io.print( Constants::LINE_DELIMITER )
    end

    # Like print_message, with "WARNING: " in front of the message.
    def Util.print_warning_message( prg_name, message, io = STDOUT )
      if ( !Util.is_string_empty?( prg_name ) )
        io.print( "[" + prg_name + "] > WARNING: " + message )
      else
        # the marker was missing in this branch, which made warnings without
        # a program name look like ordinary messages
        io.print( " > WARNING: " + message )
      end
      io.print( Constants::LINE_DELIMITER )
    end

    # true for nil and for anything whose length is 0
    def Util.is_string_empty?( s )
      return ( s == nil || s.length < 1 )
    end

    # From "Ruby Cookbook"
    # counts_hash: key is a "name", value is the count (integer)
    # Returns one line per key (sorted by name): the name padded to the
    # longest name, " | ", and char repeated count times. An empty hash
    # gives an empty string.
    def Util.draw_histogram( counts_hash, char = "#" )
      pairs = counts_hash.keys.collect { |x| [ x.to_s, counts_hash[ x ] ] }.sort
      return "" if pairs.empty?
      largest_key_size = pairs.map { | pair | pair[ 0 ].size }.max
      pairs.inject( "" ) do | s, kv |
        s << "#{ kv[ 0 ].ljust( largest_key_size ) } | #{ char*kv[ 1 ] }" + Constants::LINE_DELIMITER
      end
    end

    # Looks at the first line of path that is neither blank nor starts with
    # "#": returns true if it begins with ">" followed by text, else false.
    # raises ArgumentError if path is not a readable file
    # raises IOError if the file has no such line
    def Util.looks_like_fasta?( path )
      Util.check_file_for_readability( path )
      File.open( path ) do | file |
        while line = file.gets
          next if ( line !~ /\S/ || line =~ /^\s*#/ )
          return !( line =~ /^\s*>\s*(.+)/ ).nil?
        end
      end
      error_msg = "unexpected format"
      raise IOError, error_msg
    end

  end # class Util

end # module Evoruby
+"Urochordata", +"Protostomia", +"Ecdysozoa", +"Arthropoda", +"Insecta", +"Lepidoptera_Diptera_Hymenoptera", +"Diptera", +"Culicoidea", +"Hymenoptera", +"Nematoda", +"Annelida_Mollusca", +"Annelida" ] + +if infile == nil + puts "no infile" + exit +end + +File.open( infile ) do | file | + while line = file.gets + if line =~ /^[0-9A-Z]{3,5}\s/ + elsif line =~ /^\t/ + elsif line =~ /^{/ + elsif line =~ /^f_\d/ + else + line =~ /(\S+)/ + first = $1 + if metazoa_choanoflagellata.include?( first ) + puts( line ) + end + end + end +end diff --git a/forester/ruby/scripts/hmm_split.rb b/forester/ruby/scripts/hmm_split.rb new file mode 100755 index 0000000..8ae000d --- /dev/null +++ b/forester/ruby/scripts/hmm_split.rb @@ -0,0 +1,80 @@ +#!/usr/local/bin/ruby -w +# +# = hmm_split +# +# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: hmm_split.rb,v 1.5 2008/11/17 22:32:43 cmzmasek Exp $ +# +# To split a Pfam HMM file into one file for each HMM. 
+# + + +module ForesterScripts + + if RUBY_VERSION !~ /1.9/ + puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " ) + exit( -1 ) + end + + + if ( ARGV == nil || ARGV.length != 3 ) + puts( "usage: hmm_split.rb " ) + exit( -1 ) + end + + hmmfile = ARGV[ 0 ] + suffix = ARGV[ 1 ] + outdir = ARGV[ 2 ] + + if ( !File.exists?( outdir ) ) + puts( "outdir [" + outdir + "] does not exist" ) + exit( -1 ) + end + if ( !File.exists?( hmmfile ) ) + puts( "Pfam HMM file [" + hmmfile + "] does not exist" ) + exit( -1 ) + end + + data = String.new + name = String.new + line_count = 0 + count = 0 + + File.open( hmmfile ) do | file | + while line = file.gets + data = data + line + line_count += 1 + if ( line =~ /NAME\s+(.+)/ ) + if name.length > 0 + puts( "Pfam HMM file [" + hmmfile + "] format error [line: " + line + "]" ) + exit( -1 ) + end + name = $1 + elsif ( line =~ /\/\// ) + if name.length < 1 + puts( "Pfam HMM file [" + hmmfile + "] format error [line: " + line + "]" ) + exit( -1 ) + end + + outfile = outdir + '/' + name + suffix + if ( File.exists?( outfile ) ) + puts( "file [" + outfile + "] already exists" ) + exit( -1 ) + end + open( outfile, 'w' ) do | out | + out.write( data ) + end + count += 1 + puts( count.to_s + ": " + name ) + data = String.new + name = String.new + end + end + end + + puts() + puts( "wrote " + count.to_s + " individual HMM files to " + outdir ) + +end \ No newline at end of file diff --git a/forester/ruby/scripts/parameters.rb_dir_qsub b/forester/ruby/scripts/parameters.rb_dir_qsub new file mode 100644 index 0000000..c06697d --- /dev/null +++ b/forester/ruby/scripts/parameters.rb_dir_qsub @@ -0,0 +1,5 @@ +# $Id: parameters.rb_dir_qsub,v 1.3 2007/12/20 04:07:13 cmzmasek Exp $ + +PRG: /home/czmasek/SOFTWARE/HMMER/hmmer-2.3.2/src/hmmpfam +OPT: -E 20 -A 0 /home/czmasek/DATA/PFAM/Pfam_ls +SUFFIX: _hmmpfam_22_20_ls \ No newline at end of file diff --git a/forester/ruby/scripts/pfam2go_reformat.rb b/forester/ruby/scripts/pfam2go_reformat.rb 
new file mode 100755 index 0000000..7d2bbb5 --- /dev/null +++ b/forester/ruby/scripts/pfam2go_reformat.rb @@ -0,0 +1,90 @@ +#!/usr/local/bin/ruby -w +# +# = pfam2go_reformat +# +# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: pfam2go_reformat.rb,v 1.4 2008/11/27 01:41:36 cmzmasek Exp $ +# +# Reformat pfam2go to a "association" file suitable as input +# for microarray GO enrichment/overrepresentation-type analyses, +# and create a file listing all mapped Pfams as well. + + +module ForesterScripts + + require 'set' + + if RUBY_VERSION !~ /1.9/ + puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " ) + exit( -1 ) + end + + if ( ARGV == nil || ARGV.length != 2 ) + puts( "usage: pfam2go_reformat.rb " ) + exit( -1 ) + end + + infile = ARGV[ 0 ] + outfilebase = ARGV[ 1 ] + outfile_sgd_style = outfilebase + "_sgd_style_associations" + outfile_simple_map = outfilebase + "_basic_associations" + outfile_all_pfams = outfilebase + "_all_associated_pfams" + + pfams = SortedSet.new + + if ( File.exists?( outfile_sgd_style ) ) + puts( "outfile [" + outfile_sgd_style + "] already exists" ) + exit( -1 ) + end + if ( File.exists?( outfile_simple_map ) ) + puts( "outfile [" + outfile_simple_map + "] already exists" ) + exit( -1 ) + end + if ( File.exists?( outfile_all_pfams ) ) + puts( "outfile [" + outfile_all_pfams + "] already exists" ) + exit( -1 ) + end + if ( !File.exists?( infile) ) + puts( "infile [" + infile + "] does not exist" ) + exit( -1 ) + end + + out_str_sgd = String.new + out_str_basic = String.new + + File.open( infile ) do | file | + while line = file.gets + if line =~ /^\s*Pfam:PF(\d+)\s+(\S+)\s.+(GO:\d+)\s*$/ + pfam_id = $1 + pfam_name = $2 + go_id = $3 + new_line = "PFAM" + "\t" + pfam_name + "\t" + pfam_name + "\t\t" + go_id + "\t" + "PF:" + pfam_id + "\t\t\t\t\t\t\t\t\t" + out_str_sgd = out_str_sgd + new_line + "\n" + out_str_basic = out_str_basic + pfam_name + "\t" + go_id + 
"\n" + pfams.add( pfam_name ) + end + end + end + + open( outfile_sgd_style, 'w' ) do |file| + file.write( out_str_sgd ) + end + open( outfile_simple_map, 'w' ) do |file| + file.write( out_str_basic ) + end + open( outfile_all_pfams, 'w' ) do |file| + pfams.each { |pfam| + file.write( pfam ) + file.write( "\n" ) + } + end + puts( "number of associated pfams : " + pfams.size.to_s ) + puts( "wrote assocations in sgd style to : " + outfile_sgd_style ) + puts( "wrote assocations in basic style to: " + outfile_simple_map ) + puts( "wrote all associated pfams to : " + outfile_all_pfams ) + puts( "OK") + +end + diff --git a/forester/ruby/scripts/pfam_summarize.rb b/forester/ruby/scripts/pfam_summarize.rb new file mode 100755 index 0000000..52228fa --- /dev/null +++ b/forester/ruby/scripts/pfam_summarize.rb @@ -0,0 +1,116 @@ +#!/usr/local/bin/ruby -w +# +# = pfam_summarize +# +# Copyright:: Copyright (C) 2008-2009 Christian M. Zmasek. All rights reserved. +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: pfam_summarize.rb,v 1.2 2008/08/28 17:09:07 cmzmasek Exp $ +# +# This extracts ID, AC, DE, TP, and DR values from Pfam data files. 
+# +# Created 2008-06-25 in San Diego, CA, USA by CMZ +# +# Usage: pfam_summarize.rb + +require 'iconv' + +module ForesterScripts + if RUBY_VERSION !~ /1.9/ + puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " ) + exit( -1 ) + end + SEP = "\t" + LINE_DELIMITER = "\n" + + if ( ARGV == nil || ARGV.length != 2 ) + puts( "usage: pfam_summarize.rb " ) + exit( -1 ) + end + + pfamfile = ARGV[ 0 ] + outfile = ARGV[ 1 ] + + if ( !File.exists?( pfamfile ) ) + puts( "Pfam data file [" + pfamfile + "] does not exist" ) + exit( -1 ) + end + if ( File.exists?( outfile ) ) + puts( "outfile [" + outfile + "] already exists" ) + exit( -1 ) + end + + ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' ) + + id = nil + ac = nil + de = nil + tp = nil + dr = Array.new() + line_count = 0 + count = 0 + + out = File.open( outfile, 'w' ) + + File.open( pfamfile ) do | file | + while line = file.gets + line_count += 1 + + line = ic.iconv( line ) + + if ( line =~ /#=GF ID\s+(.+)/ ) + if ( id != nil ) + puts( "Pfam data file [" + pfamfile + "] format error [line: " + line + "]" ) + exit( -1 ) + end + id = $1 + elsif ( line =~ /#=GF AC\s+(.+)/ ) + ac = $1 + elsif ( line =~ /#=GF DE\s+(.+)/ ) + de = $1 + elsif ( line =~ /#=GF TP\s+(.+)/ ) + tp = $1 + elsif ( line =~ /#=GF DR\s+(.+)/ ) + dr.push( $1 ) + elsif ( line =~ /^\/\// ) + if ( id == nil || ac == nil ) + puts( "Pfam data file [" + pfamfile + "] format error [line: " + line + "]" ) + exit( -1 ) + end + out.write( id ) + out.write( SEP ) + out.write( ac ) + out.write( SEP ) + out.write( tp ) + out.write( SEP ) + out.write( '[' ) + out.write( de ) + out.write( ']' ) + out.write( SEP ) + out.write( '[' ) + dr.each { |d| + out.write( d ) + out.write( ' ' ) + } + out.write( ']' ) + out.write( LINE_DELIMITER ) + + id = nil + ac = nil + de = nil + tp = nil + dr = Array.new() + count += 1 + end + end + end + + out.close + + puts() + puts( "Summarized data for " + count.to_s + " individual Pfams to " + outfile ) + puts( "OK" ) + puts() + +end # 
module ForesterScripts + diff --git a/forester/ruby/scripts/pfam_to_scop.rb b/forester/ruby/scripts/pfam_to_scop.rb new file mode 100755 index 0000000..b8cdef2 --- /dev/null +++ b/forester/ruby/scripts/pfam_to_scop.rb @@ -0,0 +1,103 @@ +#!/usr/local/bin/ruby -w +# +# = pfam_to_scop +# +# Copyright:: Copyright (C) 2008-2009 Christian M. Zmasek. All rights reserved. +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: pfam_to_scop.rb,v 1.2 2008/08/28 17:09:07 cmzmasek Exp $ +# +# This extracts ID and SCOP fa (or fa and sf) from Pfam data files. +# +# Created 2008-06-25 in San Diego, CA, USA by CMZ +# +# Usage: pfam_to_scop.rb + +require 'iconv' + +module ForesterScripts + + if RUBY_VERSION !~ /1.9/ + puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " ) + exit( -1 ) + end + + SF = true + + SEP = "\t" + LINE_DELIMITER = "\n" + + if ( ARGV == nil || ARGV.length != 2 ) + puts( "usage: pfam_to_scop.rb " ) + exit( -1 ) + end + + pfamfile = ARGV[ 0 ] + outfile = ARGV[ 1 ] + + if ( !File.exists?( pfamfile ) ) + puts( "Pfam data file [" + pfamfile + "] does not exist" ) + exit( -1 ) + end + if ( File.exists?( outfile ) ) + puts( "outfile [" + outfile + "] already exists" ) + exit( -1 ) + end + + ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' ) + + id = nil + scops = Array.new() + line_count = 0 + count = 0 + scop_count = 0 + + out = File.open( outfile, 'w' ) + + File.open( pfamfile ) do | file | + while line = file.gets + line_count += 1 + + line = ic.iconv( line ) + + if ( line =~ /#=GF ID\s+(.+)/ ) + if ( id != nil ) + puts( "Pfam data file [" + pfamfile + "] format error [line: " + line + "]" ) + exit( -1 ) + end + id = $1 + elsif ( line =~ /#=GF\s+DR\s+SCOP;\s+(\w+);\s+fa/ ) + scops.push( $1 ) + elsif ( SF && line =~ /#=GF\s+DR\s+SCOP;\s+(\w+);\s+sf/ ) + scops.push( $1 ) + elsif ( line =~ /^\/\// ) + if ( id == nil ) + puts( "Pfam data file [" + pfamfile + "] format error [line: " + line + "]" ) + exit( -1 ) + end + scops.each { |s| + out.write( id ) + 
out.write( SEP ) + out.write( s ) + out.write( LINE_DELIMITER ) + scop_count += 1 + } + id = nil + scops = Array.new() + count += 1 + end + end + end + + out.close + + puts() + if ( SF ) + puts( "Extracted #{scop_count} scop fa and sf identifiers for #{count.to_s} individual Pfams to " + outfile ) + else + puts( "Extracted #{scop_count} scop fa identifiers for #{count.to_s} individual Pfams to " + outfile ) + end + puts( "OK" ) + puts() + +end # module ForesterScripts \ No newline at end of file diff --git a/forester/ruby/scripts/rb_dir_qsub.rb b/forester/ruby/scripts/rb_dir_qsub.rb new file mode 100644 index 0000000..c2749bb --- /dev/null +++ b/forester/ruby/scripts/rb_dir_qsub.rb @@ -0,0 +1,159 @@ +#!/usr/local/bin/ruby -w +# +# = rb_dir_qsub +# +# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: rb_dir_qsub.rb,v 1.15 2009/11/07 02:06:59 cmzmasek Exp $ +# +# To execute qsub commands. +# Submits PRG for every file in the current directory. 
+# +# Examples for PARAMETER_FILE: +# +# PRG: /home/user/SOFTWARE/HMMER/hmmer-2.3.2/src/hmmpfam +# OPT: -E 20 -A 0 /home/user/DATA/PFAM/Pfam_ls +# SUFFIX: _hmmpfam_22_20_ls +# +# PRG: /home/user/SOFTWARE/WUBLAST/tblastn +# OPT: +# VOPT: AMPQU +# VOPT: HYDMA +# SUFFIX: _blast +# +# PRG: /home/czmasek/SOFTWARE/HMMER/hmmer-3.0b2/binaries/intel-linux-x86_64/hmmscan +# OPT: -E 2 --notextw --qformat fasta /home/czmasek/DATA/PFAM/PFAM240/Pfam-A.hmm +# SUFFIX: .hmmscan30b2_240 +# OUTPUT: --domtblout + + +module ForesterScripts + + if RUBY_VERSION !~ /1.9/ + puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " ) + exit( -1 ) + end + + PARAMETER_FILE = 'parameters.rb_dir_qsub' + SLEEP = 1.0 + REMOVE_SUFFIX = true + + PRG = 'PRG:' + OPT = 'OPT:' + VOPT = 'VOPT:' + OUTPUT_OPT = 'OUTPUT:' + SUFFIX = 'SUFFIX:' + INPUT_PART = 'INPUT_PART:' + + + PBS_O_WORKDIR = '$PBS_O_WORKDIR/' + TMP_CMD_FILE_SUFFIX = '__QSUB' + NAME = 'rb_dir_qsub' + + if ( !File.exists?( PARAMETER_FILE ) ) + puts( '[' + NAME + '] > parameters file "' + PARAMETER_FILE + '" not found' ) + Process.exit! + end + puts( '[' + NAME + '] > reading ' + PARAMETER_FILE ) + + prg = '' + opt = '' + vopts = Array.new + suffix = '' + input_part = '' + output_opt = '' + open( PARAMETER_FILE ).each { |line| + if ( line.length > 1 && line =~ /^[^#]\S+/ ) + if line =~ /^#{PRG}\s+(\S+)/ + prg = $1 + end + if line =~ /^\s*#{OPT}\s+(\S+.+)/ + opt = $1 + end + if line =~ /^\s*#{VOPT}\s+(\S+.+)/ + vopts.push( $1 ) + end + if line =~ /^\s*#{SUFFIX}\s+(\S+)/ + suffix = $1 + end + if line =~ /^\s*#{INPUT_PART}\s+(\S+)/ + input_part = $1 + end + if line =~ /^\s*#{OUTPUT_OPT}\s+(\S+.+)/ + output_opt = $1 + end + end + } + if ( prg.length < 1 ) + puts( '[' + NAME + '] > no program name found in parameters file "' + PARAMETER_FILE + '"' ) + Process.exit! 
+ end + puts( '[' + NAME + '] > program: ' + prg ) + puts( '[' + NAME + '] > option : ' + opt ) + vopts.each { |vopt| + puts( '[' + NAME + '] > voption: ' + vopt ) + } + puts( '[' + NAME + '] > suffix : ' + suffix ) + if ( input_part.length > 0 ) + puts( '[' + NAME + '] > input: ' + input_part ) + end + if ( output_opt.length > 0 ) + puts( '[' + NAME + '] > output opt : ' + output_opt ) + end + if vopts.empty? + vopts.push( "" ) + end + + files = Dir.entries( "." ) + + files.each { |file| + if ( !File.directory?( file ) && file !~ /^\./ && file !~ /#{PARAMETER_FILE}/ ) + + if ( input_part.length > 0 && file !~ /#{input_part}/ ) + next + end + vopts.each { |vopt| + cmd = "" + outputfile = file.to_str + if REMOVE_SUFFIX + if outputfile =~ /(.+)\..{1,5}/ + outputfile = $1 + end + end + if output_opt.length > 0 + cmd = prg + ' ' + + output_opt + ' ' + PBS_O_WORKDIR + outputfile + suffix + ' ' + + opt + ' ' + + PBS_O_WORKDIR + file.to_str + + ' > /dev/null' + elsif vopt.length > 0 + cmd = prg + ' ' + opt + ' ' + vopt + ' ' + PBS_O_WORKDIR + file.to_str + + ' > ' + PBS_O_WORKDIR + vopt + "_" + outputfile + suffix + else + cmd = prg + ' ' + opt + ' ' + PBS_O_WORKDIR + file.to_str + + ' > ' + PBS_O_WORKDIR + outputfile + suffix + end + tmp_cmd_file = file.to_str + TMP_CMD_FILE_SUFFIX + if File.exists?( tmp_cmd_file ) + File.delete( tmp_cmd_file ) + end + open( tmp_cmd_file, 'w' ) do |f| + f.write( cmd ) + end + puts( '[' + NAME + '] > excuting ' + cmd ) + IO.popen( 'qsub ' + tmp_cmd_file , 'r+' ) do |pipe| + pipe.close_write + puts pipe.read + end + sleep( SLEEP ) + if File.exists?( tmp_cmd_file ) + File.delete( tmp_cmd_file ) + end + } + end + } + puts( '[' + NAME + '] > OK.' 
) + puts + +end diff --git a/forester/ruby/scripts/rb_dir_x.rb b/forester/ruby/scripts/rb_dir_x.rb new file mode 100644 index 0000000..b0e9ae1 --- /dev/null +++ b/forester/ruby/scripts/rb_dir_x.rb @@ -0,0 +1,128 @@ +#!/usr/local/bin/ruby -w +# +# = rb_x_qsub +# +# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: rb_dir_x.rb,v 1.8 2008/09/16 23:31:39 cmzmasek Exp $ +# +# To execute qsub commands. +# Submits PRG for every file in the current directory. +# +# Examples for PARAMETER_FILE: +# +# PRG: /home/user/SOFTWARE/HMMER/hmmer-2.3.2/src/hmmpfam +# OPT: -E 20 -A 0 /home/user/DATA/PFAM/Pfam_ls +# SUFFIX: _hmmpfam_22_20_ls +# +# PRG: /home/user/SOFTWARE/WUBLAST/tblastn +# OPT: +# VOPT: AMPQU +# VOPT: HYDMA +# SUFFIX: _blast + + +module ForesterScripts + + if RUBY_VERSION !~ /1.9/ + puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " ) + exit( -1 ) + end + + PARAMETER_FILE = 'parameters.rb_dir_x' + SLEEP = 1.0 + SPAWN = true + + PRG = 'PRG:' + OPT = 'OPT:' + VOPT = 'VOPT:' + OUTPUT_OPT = 'OUTPUT:' # TODO e.g. > or -o + SUFFIX = 'SUFFIX:' + INPUT_PART = 'INPUT_PART:' + + NAME = 'rb_dir_x' + + if ( !File.exists?( PARAMETER_FILE ) ) + puts( '[' + NAME + '] > parameters file "' + PARAMETER_FILE + '" not found' ) + Process.exit! + end + puts( '[' + NAME + '] > reading ' + PARAMETER_FILE ) + + prg = '' + opt = '' + vopts = Array.new + suffix = '' + input_part = '' + open( PARAMETER_FILE ).each { |line| + if ( line.length > 1 && line =~ /^[^#]\S+/ ) + if line =~ /^#{PRG}\s+(\S+)/ + prg = $1 + end + if line =~ /^\s*#{OPT}\s+(\S+.+)/ + opt = $1 + end + if line =~ /^\s*#{VOPT}\s+(\S+.+)/ + vopts.push( $1 ) + end + if line =~ /^\s*#{SUFFIX}\s+(\S+)/ + suffix = $1 + end + if line =~ /^\s*#{INPUT_PART}\s+(\S+)/ + input_part = $1 + end + end + } + if ( prg.length < 1 ) + puts( '[' + NAME + '] > no program name found in parameters file "' + PARAMETER_FILE + '"' ) + Process.exit! 
+ end + puts( '[' + NAME + '] > program: ' + prg ) + puts( '[' + NAME + '] > option : ' + opt ) + vopts.each { |vopt| + puts( '[' + NAME + '] > voption: ' + vopt ) + } + puts( '[' + NAME + '] > suffix : ' + suffix ) + if ( input_part.length > 0 ) + puts( '[' + NAME + '] > input : ' + input_part ) + end + if vopts.empty? + vopts.push( "" ) + end + + files = Dir.entries( "." ) + + files.each { |file| + if ( !File.directory?( file ) && file !~ /^\./ && file !~ /#{PARAMETER_FILE}/ ) + + if ( input_part.length > 0 && file !~ /#{input_part}/ ) + next + end + vopts.each { |vopt| + cmd = "" + if vopt.length > 0 + cmd = 'nohup ' + prg + ' ' + opt + ' ' + vopt + ' ' + file.to_str + + ' > ' + vopt + "_" + file.to_str + suffix + ' &' + else + cmd = 'nohup ' + prg + ' ' + opt + ' ' + file.to_str + + ' > ' + file.to_str + suffix + ' &' + end + + puts( '[' + NAME + '] > excuting ' + cmd ) + if SPAWN + spawn( cmd, STDERR => "/dev/null" ) + else + IO.popen( cmd , 'r+' ) do |pipe| + pipe.close_write + puts pipe.read + end + end + sleep( SLEEP ) + + } + end + } + puts( '[' + NAME + '] > OK.' ) + puts + +end diff --git a/forester/ruby/scripts/rb_qsub.rb b/forester/ruby/scripts/rb_qsub.rb new file mode 100755 index 0000000..829c3c6 --- /dev/null +++ b/forester/ruby/scripts/rb_qsub.rb @@ -0,0 +1,59 @@ +#!/usr/local/bin/ruby -w +# +# = rb_qsub +# +# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: rb_qsub.rb,v 1.6 2008/08/30 19:57:59 cmzmasek Exp $ +# +# last modified: 11/13/2007 +# +# +# To execute qsub commands. 
+# Each line l (unless precded by a # or space) in file +# 'commands.qsub' is executed as 'qsub l' + + +module ForesterScripts + + if RUBY_VERSION !~ /1.9/ + puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " ) + exit( -1 ) + end + + CMDS_FILE = 'commands.qsub' + TMP_CMD_FILE = '__QSUB_RB_CMD__' + PRG_NAME = 'rb_qsub' + + if ( !File.exists?( CMDS_FILE ) ) + puts( '[' +PRG_NAME + '] > commands file "' + CMDS_FILE + '" not found' ) + Process.exit! + end + + puts( '[' +PRG_NAME + '] > reading ' + CMDS_FILE ) + + open( CMDS_FILE ).each { |line| + if ( line.length > 1 && line =~ /^[^#]\S+/ ) + if ( File.exists?( TMP_CMD_FILE ) ) + File.delete( TMP_CMD_FILE ) + end + open( TMP_CMD_FILE, 'w' ) do |f| + f.write( line ) + end + puts( '[' +PRG_NAME + '] > excuting ' + line ) + IO.popen( 'qsub ' + TMP_CMD_FILE , 'r+' ) do |pipe| + pipe.close_write + puts pipe.read + end + if ( File.exists?( TMP_CMD_FILE ) ) + File.delete( TMP_CMD_FILE ) + end + sleep( 10.0 ) + end + } + puts( '[' +PRG_NAME + '] > OK.' ) + puts + +end + diff --git a/forester/ruby/scripts/replace.rb b/forester/ruby/scripts/replace.rb new file mode 100755 index 0000000..6672c62 --- /dev/null +++ b/forester/ruby/scripts/replace.rb @@ -0,0 +1,81 @@ +#!/usr/local/bin/ruby -w +# +# = replace +# +# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: replace.rb,v 1.5 2008/08/28 17:09:07 cmzmasek Exp $ +# +# To replace multiple strings in file. 
+# Map file contains intructions for replacement (one on each line) +# in the following format (by example): old#new +# + + +module ForesterScripts + + if RUBY_VERSION !~ /1.9/ + puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " ) + exit( -1 ) + end + + if ( ARGV == nil || ARGV.length != 3 ) + puts( "usage: replace.rb " ) + exit( -1 ) + end + mapfile = ARGV[ 0 ] + infile = ARGV[ 1 ] + outfile = ARGV[ 2 ] + + + if ( File.exists?( outfile ) ) + puts( "outfile [" + outfile + "] already exists" ) + exit( -1 ) + end + if ( !File.exists?( infile) ) + puts( "infile [" + infile + "] does not exist" ) + exit( -1 ) + end + if ( !File.exists?( mapfile ) ) + puts( "mapfile [" + mapfile + "] does not exist" ) + exit( -1 ) + end + + old_new_map = Hash.new + + File.open( mapfile ) do | file | + while line = file.gets + if ( line =~/(\S+)\s*#\s*(\S+)/ ) + old_new_map[ $1 ] = $2 + puts( $1 + ' => ' + $2 ) + end + end + end + + if ( old_new_map.size < 1 ) + puts( "mapping file was empty" ) + exit( -1 ) + end + + data_str = String.new + + File.open( infile ) do | file | + while line = file.gets + data_str = data_str + line.chomp + end + end + + old_new_map.each_pair{ |old, new| + data_str = data_str.gsub( old, new ) + } + + open( outfile, 'w' ) do |file| + file.write( data_str ) + end + + puts( "wrote " + outfile ) + +end + + \ No newline at end of file diff --git a/forester/ruby/scripts/replace_id.rb b/forester/ruby/scripts/replace_id.rb new file mode 100644 index 0000000..df2c2ed --- /dev/null +++ b/forester/ruby/scripts/replace_id.rb @@ -0,0 +1,88 @@ +#!/usr/local/bin/ruby -w +# +# = replace_id +# +# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: replace_id.rb,v 1.8 2008/08/28 17:09:07 cmzmasek Exp $ +# +# To replace ()by way of example '123_CHI5' with '123_CHICK5' +# given a mapping file containing '123_CHICKEN' +# (in the form '123_CHICKEN: some description which is ignored'). +# +# Note. 
This will break if the species id ends with a number (as is +# in the case for many bacteria). + + +module ForesterScripts + +if RUBY_VERSION !~ /1.9/ + puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " ) + exit( -1 ) + end + + NUMBER_OF_LETTERS = 3 + + if ( ARGV == nil || ARGV.length != 3 ) + puts( "usage: replace_id.rb " ) + exit( -1 ) + end + mapfile = ARGV[ 0 ] + infile = ARGV[ 1 ] + outfile = ARGV[ 2 ] + + + if ( File.exists?( outfile ) ) + puts( "outfile [" + outfile + "] already exists" ) + exit( -1 ) + end + if ( !File.exists?( infile) ) + puts( "infile [" + infile + "] does not exist" ) + exit( -1 ) + end + if ( !File.exists?( mapfile ) ) + puts( "mapfile [" + mapfile + "] does not exist" ) + exit( -1 ) + end + + number_to_complete_id_map = Hash.new + + File.open( mapfile ) do | file | + while line = file.gets + if ( line =~ /(\d+_\S+)\s*:/ ) + complete_id = $1 + complete_id =~ /(\d+)_\S+/ + number_to_complete_id_map[ $1 ] = complete_id + puts( $1 + ' => ' + complete_id ) + end + end + end + + if ( number_to_complete_id_map.size < 1 ) + puts( "mapping file was empty" ) + exit( -1 ) + end + + data_str = String.new + + File.open( infile ) do | file | + while line = file.gets + data_str = data_str + line.chomp + end + end + + replacements = 0 + number_to_complete_id_map.each_pair{ |number, id| + data_str.gsub!( /\b#{number}_[A-Z]{#{NUMBER_OF_LETTERS}}/, id ) + } + + open( outfile, 'w' ) do |file| + file.write( data_str ) + end + + puts( "wrote " + outfile ) + puts( "OK" ) + +end + diff --git a/forester/ruby/scripts/scoptastic.rb b/forester/ruby/scripts/scoptastic.rb new file mode 100755 index 0000000..52e2ab5 --- /dev/null +++ b/forester/ruby/scripts/scoptastic.rb @@ -0,0 +1,163 @@ +#!/usr/local/bin/ruby -w +# +# = scoptastic +# +# Copyright:: Copyright (C) 2008-2009 Christian M. Zmasek. +# All rights reserved. 
+# License:: GNU Lesser General Public License (LGPL) +# +# $Id: scoptastic.rb,v 1.3 2008/08/28 17:09:07 cmzmasek Exp $ +# +# To create Pfam id to SCOP mappings, one for each of four levels of SCOP +# classification. +# +# Created 2008-06-25 in San Diego, CA, USA by CMZ +# +# Usage: scoptastic.rb scoptastic.rb + + +require 'iconv' + +module ForesterScripts + + if RUBY_VERSION !~ /1.9/ + puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " ) + exit( -1 ) + end + + CLASS_LEVEL_SUFFIX = "_SCOP_2_CLASS" + FOLD_LEVEL_SUFFIX = "_SCOP_3_FOLD" + SUPERFAMILY_LEVEL_SUFFIX = "_SCOP_4_SUPERFAMILY" + FAMILY_LEVEL_SUFFIX = "_SCOP_5_FAMILY" + + SEP = "\t" + LINE_DELIMITER = "\n" + + if ( ARGV == nil || ARGV.length != 3 ) + puts( "usage: scoptastic.rb " ) + exit( -1 ) + end + + pfam_id_to_ac = ARGV[ 0 ] + pfam_ac_to_scop = ARGV[ 1 ] + outfile = ARGV[ 2 ] + + if ( !File.exists?( pfam_id_to_ac ) ) + puts( "Pfam id to ac map file [" + pfam_id_to_ac + "] does not exist" ) + exit( -1 ) + end + if ( !File.exists?( pfam_ac_to_scop ) ) + puts( "Pfam ac to SCOP classification map file [" + pfam_ac_to_scop + "] does not exist" ) + exit( -1 ) + end + if ( File.exists?( outfile + CLASS_LEVEL_SUFFIX ) ) + puts( "Outfile [" + outfile + CLASS_LEVEL_SUFFIX + "] already exists" ) + exit( -1 ) + end + if ( File.exists?( outfile + FOLD_LEVEL_SUFFIX ) ) + puts( "Outfile [" + outfile + FOLD_LEVEL_SUFFIX + "] already exists" ) + exit( -1 ) + end + if ( File.exists?( outfile + SUPERFAMILY_LEVEL_SUFFIX ) ) + puts( "Outfile [" + outfile + SUPERFAMILY_LEVEL_SUFFIX + "] already exists" ) + exit( -1 ) + end + if ( File.exists?( outfile + FAMILY_LEVEL_SUFFIX ) ) + puts( "Outfile [" + outfile + FAMILY_LEVEL_SUFFIX + "] already exists" ) + exit( -1 ) + end + + ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' ) + + pfam_ac_to_id_map = Hash.new + + pfam_ac_to_scop_map = Hash.new + + count = 0 + + File.open( pfam_id_to_ac ) do | file | + while line = file.gets + line = ic.iconv( line ) + if ( line !~ /^#/ && line 
=~ /\S/ ) + if ( line =~ /^(\S+)\s+(PF\d+)/ ) + pfam_ac_to_id_map[ $2 ] = $1 + count += 1 + else + puts( "Pfam id to ac map file [" + pfam_id_to_ac + "] format error [line: " + line + "]" ) + exit( -1 ) + end + end + end + end + puts() + puts( "Extracted #{count} Pfam id to ac mappings from file [#{pfam_id_to_ac}]" ) + + count = 0 + File.open( pfam_ac_to_scop ) do | file | + while line = file.gets + line = ic.iconv( line ) + if ( line !~ /^#/ && line =~ /\S/ ) + if ( line =~ /^(PF\d+)\.?\d*\s+([a-z]\.\d+\.\d+\.\d+)/ ) + pfam_ac_to_scop_map[ $1 ] = $2 + count += 1 + else + puts( "Pfam ac to SCOP classification map file [" + pfam_ac_to_scop + "] format error [line: " + line + "]" ) + exit( -1 ) + end + end + end + end + + puts( "Extracted #{count} Pfam ac to SCOP classification mappings from file [#{pfam_ac_to_scop}]" ) + + out_class_level = File.open( outfile + CLASS_LEVEL_SUFFIX, 'w' ) + out_fold_level = File.open( outfile + FOLD_LEVEL_SUFFIX , 'w' ) + out_superfamily_level = File.open( outfile + SUPERFAMILY_LEVEL_SUFFIX, 'w' ) + out_family_level = File.open( outfile + FAMILY_LEVEL_SUFFIX, 'w' ) + + count = 0 + pfam_ac_to_scop_map.each { | pfam_ac,scop | + if ( pfam_ac_to_id_map.has_key?( pfam_ac ) ) + pfam_id = pfam_ac_to_id_map[ pfam_ac ] + scop_split = scop.split( "\." ) + + out_class_level.write( pfam_id ) + out_fold_level.write( pfam_id ) + out_superfamily_level.write( pfam_id ) + out_family_level.write( pfam_id ) + + out_class_level.write( SEP ) + out_fold_level.write( SEP ) + out_superfamily_level.write( SEP ) + out_family_level.write( SEP ) + + out_class_level.write( scop_split[ 0 ] ) + out_fold_level.write( scop_split[ 0 ] + "." + scop_split[ 1 ] ) + out_superfamily_level.write( scop_split[ 0 ] + "." + scop_split[ 1 ] + "." 
+ scop_split[ 2 ] ) + out_family_level.write( scop ) + + out_class_level.write( LINE_DELIMITER ) + out_fold_level.write( LINE_DELIMITER ) + out_superfamily_level.write( LINE_DELIMITER ) + out_family_level.write( LINE_DELIMITER ) + count += 1 + else + puts( "Pfam ac #{pfam_ac} not found in Pfam id to ac map file [" + pfam_id_to_ac + "]" ) + exit( -1 ) + end + } + + out_class_level.close + out_fold_level.close + out_superfamily_level.close + out_family_level.close + + puts() + puts( "Wrote #{count} Pfam id to SCOP mappings to files '#{outfile + CLASS_LEVEL_SUFFIX}', '#{outfile + FOLD_LEVEL_SUFFIX}', '#{outfile + SUPERFAMILY_LEVEL_SUFFIX}', and '#{ outfile + FAMILY_LEVEL_SUFFIX}'" ) + puts( "OK" ) + puts() + +end # module ForesterScripts +