From dbf5b588d65d1c62094dd5d339eca5056a5ade5f Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Tue, 2 Oct 2012 18:53:34 +0000 Subject: [PATCH] refactored --- .../lib/evo/tool/domain_sequence_extractor.rb | 268 +++++++ .../evoruby/lib/evo/tool/domains_to_forester.rb | 261 ++++++ forester/ruby/evoruby/lib/evo/tool/evo_nursery.rb | 317 ++++++++ .../ruby/evoruby/lib/evo/tool/fasta_extractor.rb | 146 ++++ .../lib/evo/tool/fasta_taxonomy_processor.rb | 205 +++++ .../ruby/evoruby/lib/evo/tool/hmmscan_parser.rb | 265 ++++++ .../ruby/evoruby/lib/evo/tool/msa_processor.rb | 848 ++++++++++++++++++++ .../lib/evo/tool/multi_sequence_extractor.rb | 551 +++++++++++++ forester/ruby/evoruby/lib/evo/tool/new_tap.rb | 167 ++++ .../evoruby/lib/evo/tool/phylogenies_decorator.rb | 299 +++++++ .../ruby/evoruby/lib/evo/tool/phylogeny_factory.rb | 267 ++++++ .../evoruby/lib/evo/tool/taxonomy_processor.rb | 317 ++++++++ .../lib/evo/tool/tseq_taxonomy_processor.rb | 190 +++++ 13 files changed, 4101 insertions(+) create mode 100644 forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb create mode 100644 forester/ruby/evoruby/lib/evo/tool/domains_to_forester.rb create mode 100755 forester/ruby/evoruby/lib/evo/tool/evo_nursery.rb create mode 100644 forester/ruby/evoruby/lib/evo/tool/fasta_extractor.rb create mode 100644 forester/ruby/evoruby/lib/evo/tool/fasta_taxonomy_processor.rb create mode 100644 forester/ruby/evoruby/lib/evo/tool/hmmscan_parser.rb create mode 100644 forester/ruby/evoruby/lib/evo/tool/msa_processor.rb create mode 100644 forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb create mode 100644 forester/ruby/evoruby/lib/evo/tool/new_tap.rb create mode 100644 forester/ruby/evoruby/lib/evo/tool/phylogenies_decorator.rb create mode 100644 forester/ruby/evoruby/lib/evo/tool/phylogeny_factory.rb create mode 100644 forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb create mode 100644 forester/ruby/evoruby/lib/evo/tool/tseq_taxonomy_processor.rb diff --git a/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb new file mode 100644 index 0000000..efe6f6a --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb @@ -0,0 +1,268 @@ +# +# = lib/evo/apps/domain_sequence_extractor.rb - DomainSequenceExtractor class +# +# Copyright:: Copyright (C) 2012 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id:Exp $ + + +require 'lib/evo/util/constants' +require 'lib/evo/util/util' +require 'lib/evo/util/command_line_arguments' +require 'lib/evo/io/parser/hmmscan_domain_extractor' + +module Evoruby + + class DomainSequenceExtractor + + PRG_NAME = "dsx" + PRG_VERSION = "2.000" + PRG_DESC = "extraction of domain sequences from hmmscan output" + PRG_DATE = "20121001" + COPYRIGHT = "2012 Christian M Zmasek" + CONTACT = "phylosoft@gmail.com" + WWW = "www.phylosoft.org" + + E_VALUE_THRESHOLD_OPTION = 'e' + LENGTH_THRESHOLD_OPTION = 'l' + ADD_POSITION_OPTION = 'p' + ADD_DOMAIN_NUMBER_OPTION = 'd' + ADD_SPECIES = 's' + MIN_LINKER_OPT = 'ml' + LOG_FILE_SUFFIX = '_domain_seq_extr.log' + PASSED_SEQS_SUFFIX = '_with_passing_domains.fasta' + FAILED_SEQS_SUFFIX = '_with_no_passing_domains.fasta' + HELP_OPTION_1 = 'help' + HELP_OPTION_2 = 'h' + + def run() + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC , + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + ld = Constants::LINE_DELIMITER + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError + Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT ) + end + + if ( cla.is_option_set?( HELP_OPTION_1 ) || + cla.is_option_set?( HELP_OPTION_2 ) ) + print_help + exit( 0 ) + end + + if ( cla.get_number_of_files != 4 ) + print_help + exit( -1 ) + end + + allowed_opts = Array.new + allowed_opts.push( E_VALUE_THRESHOLD_OPTION ) + allowed_opts.push( ADD_POSITION_OPTION ) + allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION ) + allowed_opts.push( LENGTH_THRESHOLD_OPTION ) + allowed_opts.push( ADD_SPECIES ) + allowed_opts.push( MIN_LINKER_OPT ) + + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, + "unknown option(s): " + disallowed, + STDOUT ) + end + + domain_id = cla.get_file_name( 0 ) + hmmsearch_output = cla.get_file_name( 1 ) + fasta_sequence_file = cla.get_file_name( 2 ) + outfile = cla.get_file_name( 3 ) + + if outfile.downcase.end_with?( ".fasta" ) + outfile = outfile[ 0 .. outfile.length - 7 ] + elsif outfile.downcase.end_with?( ".fsa" ) + outfile = outfile[ 0 .. outfile.length - 5 ] + end + + + add_position = false + if ( cla.is_option_set?( ADD_POSITION_OPTION ) ) + add_position = true + end + + add_domain_number = false + if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) ) + add_domain_number = true + end + + add_species = false + if cla.is_option_set? ADD_SPECIES + add_species = true + end + + e_value_threshold = -1.0 + if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) ) + begin + e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION ) + rescue ArgumentError => e + Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + if ( e_value_threshold < 0.0 ) + Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT ) + end + end + + length_threshold = -1 + if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) ) + begin + length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION ) + rescue ArgumentError => e + Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + if ( length_threshold < 0) + Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT ) + end + end + + + min_linker = nil + if ( cla.is_option_set?( MIN_LINKER_OPT ) ) + begin + min_linker = cla.get_option_value_as_int( MIN_LINKER_OPT ) + rescue ArgumentError => e + Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + if ( !min_linker || min_linker > 100 || min_linker < -100 ) + Forester::Util.fatal_error( PRG_NAME, "unexpected value for min linker " + min_linker.to_s, STDOUT ) + end + end + + + log = String.new + + puts() + puts( "Domain : " + domain_id ) + log << "Domain : " + domain_id + ld + puts( "Hmmscan outputfile : " + hmmsearch_output ) + log << "Hmmscan outputfile : " + hmmsearch_output + ld + puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file ) + log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld + puts( "Outputfile : " + outfile + ".fasta" ) + log << "Outputfile : " + outfile + ld + puts( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX ) + log << "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX + ld + puts( "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX ) + log << "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX + ld + puts( "Logfile : " + outfile + LOG_FILE_SUFFIX ) + log << "Logfile : " + outfile + LOG_FILE_SUFFIX + ld + if ( e_value_threshold >= 0.0 ) + puts( "E-value threshold : " + e_value_threshold.to_s ) + log << "E-value threshold : " + e_value_threshold.to_s + ld + else + puts( "E-value threshold : no threshold" ) + log << "E-value threshold : no threshold" + ld + end + if ( length_threshold > 0 ) + puts( "Length threshold : " + length_threshold.to_s ) + log << "Length threshold : " + length_threshold.to_s + ld + else + puts( "Length threshold : no threshold" ) + log << "Length threshold : no threshold" + ld + end + if ( min_linker ) + puts( "Min linker : " + min_linker.to_s ) + log << "Min linker : " + min_linker.to_s + ld + + end + + + if ( add_position ) + puts( "Add positions (rel to complete seq) to extracted domains: true" ) + log << "Add positions (rel to complete seq) to extracted domains: true" + ld + else + puts( "Add positions (rel to complete seq) to extracted domains: false" ) + log << "Add positions (rel to complete seq) to extracted domains: false" + ld + end + + if ( add_domain_number ) + puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" ) + log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld + else + puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" ) + log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld + end + + puts + + domain_count = 0 + begin + parser = HmmscanDomainExtractor.new() + domain_count = parser.parse( domain_id, + hmmsearch_output, + fasta_sequence_file, + outfile, + outfile + PASSED_SEQS_SUFFIX, + outfile + FAILED_SEQS_SUFFIX, + e_value_threshold, + length_threshold, + add_position, + add_domain_number, + add_species, + min_linker, + log ) + rescue ArgumentError, IOError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + + rescue Exception => e + puts e.backtrace + Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT ) + + end + + puts + Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" ) + Util.print_message( PRG_NAME, "wrote; " + outfile + ".fasta") + Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX ) + Util.print_message( PRG_NAME, "wrote: " + outfile + PASSED_SEQS_SUFFIX ) + Util.print_message( PRG_NAME, "wrote: " + outfile + FAILED_SEQS_SUFFIX ) + + begin + f = File.open( outfile + LOG_FILE_SUFFIX, 'a' ) + f.print( log ) + f.close + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + puts + Util.print_message( PRG_NAME, "OK" ) + puts + + end + + def print_help() + puts() + puts( "Usage:" ) + puts() + puts( " " + PRG_NAME + ".rb [options] " ) + puts() + puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=: E-value threshold, default is no threshold" ) + puts( " -" + LENGTH_THRESHOLD_OPTION + "=: length threshold, default is no threshold" ) + puts( " -" + ADD_POSITION_OPTION + ": to add positions (rel to complete seq) to extracted domains" ) + puts( " -" + ADD_DOMAIN_NUMBER_OPTION + ": to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" ) + puts( " -" + ADD_SPECIES + ": to add species [in brackets]" ) + puts( " -" + MIN_LINKER_OPT + "=: to extract pairs of same domains with a distance inbetween shorter than a given value" ) + puts() + end + + end # class DomainSequenceExtractor + +end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/tool/domains_to_forester.rb b/forester/ruby/evoruby/lib/evo/tool/domains_to_forester.rb new file mode 100644 index 0000000..85af666 --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/tool/domains_to_forester.rb @@ -0,0 +1,261 @@ +# +# = lib/evo/apps/domains_to_forester - DomainsToForester class +# +# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: Exp $ +# +# last modified: 06/11/2007 + +require 'lib/evo/util/constants' +require 'lib/evo/util/util' +require 'lib/evo/util/command_line_arguments' +require 'lib/evo/msa/msa_factory' +require 'lib/evo/io/parser/fasta_parser' +require 'lib/evo/sequence/protein_domain' +require 'lib/evo/sequence/domain_structure' + +module Evoruby + + class DomainsToForester + + PRG_NAME = "d2f" + PRG_DESC = "parsed hmmpfam output to forester format" + PRG_VERSION = "1.001" + PRG_DATE = "20120807" + COPYRIGHT = "2012 Christian M Zmasek" + CONTACT = "phylosoft@gmail.com" + WWW = "www.phylosoft.org" + + E_VALUE_THRESHOLD_OPTION = "e" + OVERWRITE_IF_SAME_FROM_TO_OPTION = "o" + HELP_OPTION_1 = "help" + HELP_OPTION_2 = "h" + + def parse( domains_list_file, + original_seqs_file, + outfile, + column_delimiter, + e_value_threshold, + overwrite_if_same_from_to ) + Util.check_file_for_readability( domains_list_file ) + Util.check_file_for_readability( original_seqs_file ) + Util.check_file_for_writability( outfile ) + + domain_structures = Hash.new() # protein name is key, domain structure is value + + f = MsaFactory.new + + original_seqs = f.create_msa_from_file( original_seqs_file, FastaParser.new ) + if ( original_seqs.get_number_of_seqs < 1 ) + error_msg = "\"" + original_seqs_file + "\" appears devoid of sequences in fasta-format" + raise ArgumentError, error_msg + end + + File.open( domains_list_file ) do | file | + while line = file.gets + if !is_ignorable?( line ) + + a = line.split( column_delimiter ) + l = a.length + if ( ( l < 4 ) || ( e_value_threshold >= 0.0 && l < 5 ) ) + error_msg = "unexpected format at line: " + line + raise IOError, error_msg + end + protein_name = a[ 0 ] + domain_name = a[ 1 ] + seq_from = -1 + seq_to = -1 + ########################################## + if domain_name =~ /RRM_\d/ + puts "ignoring " + line + next + end + ########################################## + + + begin + seq_from = a[ 2 ].to_i + rescue Exception + error_msg = "failed to parse seq from from \"" + a[ 2 ] + "\" [line: " + line + "]" + raise IOError, error_msg + end + begin + seq_to = a[ 3 ].to_i + rescue Exception + error_msg = "failed to parse seq to from \"" + a[ 3 ] + "\" [line: " + line + "]" + raise IOError, error_msg + end + + e_value = -1 + if l > 4 + begin + e_value = a[ 4 ].to_f + rescue Exception + error_msg = "failed to parse E-value from \"" + a[ 4 ] + "\" [line: " + line + "]" + raise IOError, error_msg + end + end + + seq = original_seqs.get_by_name_start( protein_name ) + + total_length = seq.get_length + + if ( ( ( e_value_threshold < 0.0 ) || ( e_value <= e_value_threshold ) ) ) + pd = ProteinDomain.new( domain_name, seq_from, seq_to, "", e_value ) + ds = nil + if ( domain_structures.has_key?( protein_name ) ) + ds = domain_structures[ protein_name ] + else + ds = DomainStructure.new( total_length ) + domain_structures[ protein_name ] = ds + end + ds.add_domain( pd, overwrite_if_same_from_to ) + end + + end + end + end + + out = File.open( outfile, "a" ) + ds = domain_structures.sort + for d in ds + protein_name = d[ 0 ] + domain_structure = d[ 1 ] + out.print( protein_name.to_s ) + out.print( ": " ) + out.print( domain_structure.to_NHX ) + out.print( Constants::LINE_DELIMITER ) + end + + out.flush() + out.close() + + end # parse + + + + + def run() + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC, + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + + if ( cla.is_option_set?( HELP_OPTION_1 ) || + cla.is_option_set?( HELP_OPTION_2 ) ) + print_help + exit( 0 ) + end + + if cla.get_number_of_files != 3 + print_help + exit( -1 ) + end + + allowed_opts = Array.new + allowed_opts.push( E_VALUE_THRESHOLD_OPTION ) + allowed_opts.push( OVERWRITE_IF_SAME_FROM_TO_OPTION ) + + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, + "unknown option(s): " + disallowed, + STDOUT ) + end + + domains_list_file = cla.get_file_name( 0 ) + original_sequences_file = cla.get_file_name( 1 ) + outfile = cla.get_file_name( 2 ) + + + e_value_threshold = -1.0 + if cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) + begin + e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + if ( e_value_threshold < 0.0 ) + Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT ) + end + end + overwrite_if_same_from_to = false + if ( cla.is_option_set?( OVERWRITE_IF_SAME_FROM_TO_OPTION ) ) + overwrite_if_same_from_to = true + end + + puts + puts( "Domains list file : " + domains_list_file ) + puts( "Fasta sequencefile (complete sequences): " + original_sequences_file ) + puts( "Outputfile : " + outfile ) + if ( e_value_threshold >= 0.0 ) + puts( "E-value threshold : " + e_value_threshold.to_s ) + else + puts( "E-value threshold : no threshold" ) + end + if ( overwrite_if_same_from_to ) + puts( "Overwrite if same from and to : true" ) + else + puts( "Overwrite if same from and to : false" ) + end + + puts + + begin + parse( domains_list_file, + original_sequences_file, + outfile, + " ", + e_value_threshold, + overwrite_if_same_from_to ) + + rescue ArgumentError, IOError, StandardError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT ) + end + + + puts + Util.print_message( PRG_NAME, 'OK' ) + puts + + end + + private + + def print_help() + puts + puts( "Usage:" ) + puts + puts( " " + PRG_NAME + ".rb [options] " ) + puts() + puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "= : E-value threshold, default is no threshold" ) + puts( " -" + OVERWRITE_IF_SAME_FROM_TO_OPTION + " : overwrite domain with same start and end with domain with better E-value" ) + puts + end + + + + def is_ignorable?( line ) + return ( line !~ /[A-Za-z0-9-]/ || line =~ /^\s*#/) + end + + + end # class DomainsToForester + + +end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/tool/evo_nursery.rb b/forester/ruby/evoruby/lib/evo/tool/evo_nursery.rb new file mode 100755 index 0000000..e6f653f --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/tool/evo_nursery.rb @@ -0,0 +1,317 @@ +# +# = lib/evo/apps/evo_nursery.rb - EvoNursery class +# +# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: evo_nursery.rb,v 1.11 2010/12/13 19:00:11 cmzmasek Exp $ + + + +require 'lib/evo/soft/fastme' +require 'lib/evo/soft/tree_puzzle' +require 'lib/evo/util/constants' +require 'lib/evo/util/util' +require 'lib/evo/util/command_line_arguments' +require 'lib/evo/msa/msa_factory' +require 'lib/evo/io/msa_io' +require 'lib/evo/io/writer/phylip_sequential_writer' +require 'lib/evo/io/parser/general_msa_parser' +require 'lib/evo/io/writer/msa_writer' + +require 'iconv' + +module Evoruby + + class EvoNursery + GAP_RATIO = 0.50 + GAP_RATIO_FOR_SEQS = 0.75 + MIN_LENGTH = 30 + MIN_SEQS = 4 + MAX_SEQS = 3000 + MAX_ALN_FILE_SIZE = 5000000 + MODEL = :auto + RATES = :uniform + FASTME_INITIAL_TREE = :GME + ALN_NAME = '_align_' + TREE_PUZZLE_OUTDIST = TreePuzzle::OUTDIST + TREE_PUZZLE_OUTFILE = TreePuzzle::OUTFILE + FASTME_OUTTREE = FastMe::OUTTREE + FASTME_OUTPUT_D = FastMe::OUTPUT_D + + PRG_NAME = "evo_nursery" + PRG_DATE = "2012.03.21" + PRG_DESC = "pfam alignments to evolutionary trees" + PRG_VERSION = "0.20" + COPYRIGHT = "2009-2012 Christian M Zmasek" + CONTACT = "phylosoft@gmail.com" + WWW = "www.phylosoft.org" + + HELP_OPTION_1 = "help" + HELP_OPTION_2 = "h" + + def run + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC, + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + if RUBY_VERSION !~ /1.9/ + puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " ) + exit( -1 ) + end + + forester_home = Util.get_env_variable_value( Constants::FORESTER_HOME_ENV_VARIABLE ) + java_home = Util.get_env_variable_value( Constants::JAVA_HOME_ENV_VARIABLE ) + decorator = java_home + '/bin/java -cp ' + forester_home + '/java/forester.jar org.forester.application.decorator' + + if ( ARGV == nil || ARGV.length != 1 ) + help + exit( -1 ) + end + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + if ( cla.is_option_set?( HELP_OPTION_1 ) || + cla.is_option_set?( HELP_OPTION_2 ) ) + help + exit( 0 ) + end + + output_dir = cla.get_file_name( 0 ) + + if output_dir !~ /\/$/ + output_dir = output_dir + '/' + end + + if !File.exists?( output_dir ) + Util.fatal_error( PRG_NAME, output_dir.to_s + " does not exist", STDOUT ) + end + ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' ) + files = Dir.entries( "." ) + skipped = Array.new + counter = 1 + analyzed = 0; + begin + files.each { |pfam_aln_file| + if ( !File.directory?( pfam_aln_file ) && + pfam_aln_file !~ /^\./ && + pfam_aln_file !~ /.+\.tre$/ ) + + tree_out_file = output_dir + File.basename( pfam_aln_file ) + ".xml" + + if File.exists?( tree_out_file ) + puts + puts + puts "***** skipping " + File.basename( pfam_aln_file ) + ", already exists" + puts + skipped.push( File.basename( pfam_aln_file ) + " [already exists]" ) + next + end + + puts + puts counter.to_s + ": " + pfam_aln_file.to_str + counter += 1 + if File.size( pfam_aln_file ) > MAX_ALN_FILE_SIZE + puts "***** skipping, file size: " + File.size( pfam_aln_file ).to_s + skipped.push( File.basename( pfam_aln_file ) + " [file size: " + File.size( pfam_aln_file ).to_s + "]" ) + next + end + + f = MsaFactory.new() + msa = f.create_msa_from_file( pfam_aln_file, GeneralMsaParser.new() ) + + if msa.get_number_of_seqs < MIN_SEQS || msa.get_number_of_seqs > MAX_SEQS + puts "***** skipping, seqs: " + msa.get_number_of_seqs.to_s + skipped.push( File.basename( pfam_aln_file ) + " [seqs: " + msa.get_number_of_seqs.to_s + "]" ) + next + end + + msa.remove_gap_columns_w_gap_ratio!( GAP_RATIO ) + + length = msa.get_length + if length < MIN_LENGTH + puts "***** skipping, length: " + length.to_s + skipped.push( File.basename( pfam_aln_file ) + " [length: " + length.to_s + "]" ) + next + end + + msa.remove_sequences_by_gap_ratio!( GAP_RATIO_FOR_SEQS ) + + if msa.get_number_of_seqs < MIN_SEQS + puts "***** skipping, seqs: " + msa.get_number_of_seqs.to_s + skipped.push( File.basename( pfam_aln_file ) + " [seqs: " + msa.get_number_of_seqs.to_s + "]" ) + next + end + + map_file = output_dir + File.basename( pfam_aln_file ) + ".map" + f = File.open( map_file, 'a' ) + for i in 0 ... msa.get_number_of_seqs + name = msa.get_sequence( i ).get_name() + name =~ /(.+)_(.+)\/.+/ + acc = $1 + tax_code = $2 + + mapping_str = i.to_s + mapping_str << "\t" + mapping_str << 'TAXONOMY_CODE:' + mapping_str << tax_code + mapping_str << "\t" + mapping_str << 'SEQ_SYMBOL:' + mapping_str << ( acc + '_' + tax_code ) + mapping_str << "\t" + if ( acc.length < 6 ) + acc = acc + '_' + tax_code + end + mapping_str << 'SEQ_ACCESSION:' + mapping_str << acc + mapping_str << "\t" + mapping_str << 'SEQ_ACCESSION_SOURCE:UniProtKB' + mapping_str << "\t" + mapping_str << 'NODE_NAME:' + mapping_str << name + f.print( mapping_str ) + f.print( "\n" ) + name = msa.get_sequence( i ).set_name( i.to_s ) + end + f.close + + io = MsaIO.new() + w = MsaWriter + w = PhylipSequentialWriter.new() + w.clean( true ) + w.set_max_name_length( 10 ) + if File.exists?( output_dir + ALN_NAME ) + File.unlink( output_dir + ALN_NAME ) + end + io.write_to_file( msa, output_dir + ALN_NAME, w ) + + tp = TreePuzzle.new() + tp.run( output_dir + ALN_NAME, + MODEL, + RATES, + msa.get_number_of_seqs ) + + File.rename( output_dir + ALN_NAME, output_dir + File.basename( pfam_aln_file ) + ".aln" ) + + fastme = FastMe.new() + fastme.run( TREE_PUZZLE_OUTDIST, 0, FASTME_INITIAL_TREE ) + + pfam_acc = nil + pfam_de = nil + File.open( pfam_aln_file ) do |file| + while line = file.gets + line = ic.iconv( line ) + if line =~ /^#=AC\s+(.+)/ + pfam_acc = $1 + end + if line =~ /^#=DE\s+(.+)/ + pfam_de = $1 + end + if pfam_acc && pfam_de + break + end + end + end + if !pfam_acc || !pfam_de + Util.fatal_error( PRG_NAME, "problem with " + pfam_aln_file.to_s, STDOUT ) + end + + puzzle_model = nil + File.open( TREE_PUZZLE_OUTFILE ) do |file| + while line = file.gets + line = ic.iconv( line ) + if line =~ /^Model\s+of\s+substitution:\s+(.+)/ + puzzle_model = $1 + break + end + end + end + if !puzzle_model + Util.fatal_error( PRG_NAME, "problem with puzzle outfile: " + TREE_PUZZLE_OUTFILE.to_s, STDOUT ) + end + + desc = pfam_de + desc << ' | ' + desc << 'ML pwd estimation by TREE-PUZZLE version ' + desc << TreePuzzle::VERSION + desc << ', model: ' + desc << puzzle_model + desc << ', rates: ' + desc << RATES.to_s + desc << '; tree estimation by FastME version ' + desc << FastMe::VERSION + desc << ', initial tree: ' + desc << FASTME_INITIAL_TREE.to_s + desc << '; aln length: ' + desc << msa.get_length.to_s + + cmd = decorator + " -table -p -pn=\"" + pfam_aln_file + + "\" -pi=pfam:" + pfam_acc + + " -pd=\"" + desc + "\" " + + FASTME_OUTTREE + ' ' + + map_file + ' ' + tree_out_file + + IO.popen( cmd , 'r+' ) do | pipe | + pipe.close_write + end + analyzed += 1 + + File.unlink( map_file ) + File.unlink(TREE_PUZZLE_OUTDIST) + File.unlink( TREE_PUZZLE_OUTFILE ) + File.unlink( FASTME_OUTPUT_D ) + end + } + rescue ArgumentError, IOError, StandardError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + + puts() + puts( 'Skipped:' ) + puts() + for i in 0 ... skipped.size + puts i.to_s + ": " + skipped[ i ] + end + + puts() + puts( 'Skipped : ' + skipped.size.to_s + ' alignments' ) + puts( 'Analyzed: ' + analyzed.to_s + ' alignments' ) + + puts( 'Min gap ratio for col del : ' + GAP_RATIO.to_s ) + puts( 'Min gap ratio for seq del : ' + GAP_RATIO_FOR_SEQS.to_s ) + puts( 'Minimal aln length : ' + MIN_LENGTH.to_s ) + puts( 'Minimal number of sequences: ' + MIN_SEQS.to_s ) + puts( 'Maximal number of sequences: ' + MAX_SEQS.to_s ) + puts( 'Maximal aln file size : ' + MAX_ALN_FILE_SIZE.to_s ) + puts( 'Model : ' + MODEL.to_s ) + puts( 'FastME initial tree: ' + FASTME_INITIAL_TREE.to_s ) + + puts() + puts( '[' + PRG_NAME + '] > OK' ) + puts() + + end # run + + private + + def help + puts( "Usage:" ) + puts() + puts( " " + PRG_NAME + ".rb " ) + puts() + end + + + end # class EvoNursery + +end # module Evoruby \ No newline at end of file diff --git a/forester/ruby/evoruby/lib/evo/tool/fasta_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/fasta_extractor.rb new file mode 100644 index 0000000..b2d0d5c --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/tool/fasta_extractor.rb @@ -0,0 +1,146 @@ +# +# = lib/evo/apps/fasta_extractor.rb - FastaExtractor class +# +# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: fasta_extractor.rb,v 1.2 2010/12/13 19:00:11 cmzmasek Exp $ + + +require 'lib/evo/util/util' +require 'lib/evo/util/constants' +require 'lib/evo/util/command_line_arguments' + + +module Evoruby + + class FastaExtractor + + PRG_NAME = "fae" + PRG_VERSION = "1.0.0" + PRG_DESC = "extraction of nucleotide sequences from a fasta file by names from wublast search" + PRG_DATE = "2008.08.09" + COPYRIGHT = "2008-2009 Christian M Zmasek" + CONTACT = "phylosoft@gmail.com" + WWW = "www.phylosoft.org" + HELP_OPTION_1 = 'help' + HELP_OPTION_2 = 'h' + + + def run() + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC , + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + ld = Constants::LINE_DELIMITER + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + if ( cla.is_option_set?( HELP_OPTION_1 ) || + cla.is_option_set?( HELP_OPTION_2 ) ) + print_help + exit( 0 ) + end + + if ( cla.get_number_of_files != 3 ) + print_help + exit( -1 ) + end + + allowed_opts = Array.new + + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, + "unknown option(s): " + disallowed, + STDOUT ) + end + + input_file = cla.get_file_name( 0 ) + names_file = cla.get_file_name( 1 ) + output_file = cla.get_file_name( 2 ) + + if !File.exist?( input_file ) + Util.fatal_error( PRG_NAME, "error: input file [#{input_file}] does not exist" ) + end + if !File.exist?( names_file ) + Util.fatal_error( PRG_NAME, "error: names file [#{names_file}] does not exist" ) + end + if File.exist?( output_file ) + Util.fatal_error( PRG_NAME, "error: [#{output_file }] already exists" ) + end + + names = extract_names_with_frames( names_file ) + + extract_sequences( names, input_file, output_file ) + + puts + Util.print_message( PRG_NAME, "OK" ) + puts + + end + + + def extract_names_with_frames( names_file ) + names = Hash.new() + File.open( names_file ) do | file | + while line = file.gets + if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) ) + if ( line =~ /(\S+)\s+([+|-]\d)\s+\d+\s+(\S+)/ ) + name = $1 + frame = $2 + e = $3 + names[ name ] = "[" + frame + "] [" + e + "]" + end + end + end + end + names + end + + def extract_sequences( names, fasta_file, output_file ) + output = File.open( output_file, "a" ) + matching_state = false + counter = 0 + File.open( fasta_file ) do | file | + while line = file.gets + if !Util.is_string_empty?( line ) + if ( line =~ /\s*>\s*(.+)/ ) + name = $1 + if names.has_key?( name ) + matching_state = true + counter += 1 + puts counter.to_s + ". " +name + " " + names[ name ] + output.print( ">" + name + " " + names[ name ] ) + output.print( Evoruby::Constants::LINE_DELIMITER ) + else + matching_state = false + end + elsif matching_state + output.print( line ) + end + end + end + end + output.close() + end + + def print_help() + puts( "Usage:" ) + puts() + puts( " " + PRG_NAME + ".rb " ) + puts() + end + + end # class FastaExtractor +end \ No newline at end of file diff --git a/forester/ruby/evoruby/lib/evo/tool/fasta_taxonomy_processor.rb b/forester/ruby/evoruby/lib/evo/tool/fasta_taxonomy_processor.rb new file mode 100644 index 0000000..6ae3cf1 --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/tool/fasta_taxonomy_processor.rb @@ -0,0 +1,205 @@ +# +# = lib/evo/apps/fasta_taxonomy_processor - FastaTaxonomyProcessor class +# +# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: fasta_taxonomy_processor.rb,v 1.4 2010/12/13 19:00:11 cmzmasek Exp $ + + +require 'lib/evo/util/util' +require 'lib/evo/msa/msa_factory' +require 'lib/evo/msa/msa' +require 'lib/evo/io/msa_io' +require 'lib/evo/io/parser/sp_taxonomy_parser' +require 'lib/evo/io/parser/fasta_parser' +require 'lib/evo/io/writer/fasta_writer' +require 'lib/evo/io/writer/phylip_sequential_writer' +require 'lib/evo/util/command_line_arguments' +require 'lib/evo/apps/tseq_taxonomy_processor' + +module Evoruby + + class FastaTaxonomyProcessor + + PRG_NAME = "fasta_tap" + PRG_DATE = "2009.01.20" + PRG_DESC = "preprocessing of multiple sequence files in ncbi fasta format" + PRG_VERSION = "1.00" + COPYRIGHT = "2009 Christian M Zmasek" + CONTACT = "phylosoft@gmail.com" + WWW = "www.phylosoft.org" + + def initialize() + @tax_ids_to_sp_taxonomies = Hash.new() + end + + def run() + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC, + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + if ARGV == nil || ARGV.length != 4 + puts( "Usage: #{PRG_NAME}.rb " ) + puts() + exit( -1 ) + end + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + allowed_opts = Array.new + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed ) + end + + sp_taxonomy_infile = cla.get_file_name( 0 ) + sequences_infile = cla.get_file_name( 1 ) + sequences_outfile = cla.get_file_name( 2 ) + mapping_outfile = cla.get_file_name( 3 ) + + Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile ) + Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile ) + Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile ) + Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile ) + + sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile ) + + Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile ) + + fasta_parser = FastaParser.new + msa_fac = MsaFactory.new + + seqs = msa_fac.create_msa_from_file( sequences_infile, fasta_parser ) + + Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile ) + + removed = seqs.remove_redundant_sequences!( true, true ) + + if removed.size > 0 + Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" ) + removed.each { | seq_name | + puts seq_name + } + Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" ) + end + + mapping_out = File.open( mapping_outfile, "a" ) + + for i in 0 ... seqs.get_number_of_seqs + seq = seqs.get_sequence( i ) + seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) ) + end + + io = MsaIO.new() + + w = FastaWriter.new() + + w.set_max_name_length( 10 ) + w.clean( true ) + begin + io.write_to_file( seqs, sequences_outfile, w ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s ) + end + mapping_out.close() + + Util.print_message( PRG_NAME, "wrote: " + mapping_outfile ) + Util.print_message( PRG_NAME, "wrote: " + sequences_outfile ) + Util.print_message( PRG_NAME, "OK" ) + + end + + private + + def modify_name( seq, i, sp_taxonomies, mapping_outfile ) + + #i = i + 1792 + + seq_desc = seq.get_name + + taxonomy_sn = nil + + if seq_desc =~ /\[(.+)\]/ + taxonomy_sn = $1 + else + Util.fatal_error( PRG_NAME, "no taxonomy in [" + seq_desc + "]" ) + end + + matching_sp_taxonomy = nil + + sp_taxonomies.each { |sp_taxonomy| + if ( sp_taxonomy.scientific_name == taxonomy_sn ) + matching_sp_taxonomy = sp_taxonomy + end + } + + if matching_sp_taxonomy == nil + Util.fatal_error( PRG_NAME, "taxonomy [" + taxonomy_sn + "] for [" + seq_desc + "] not found" ) + end + + new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code + + gi = nil + if seq_desc =~ /gi\|(.+?)\|/ + gi = $1 + else + Util.fatal_error( PRG_NAME, "no gi in [" + seq_desc + "]" ) + end + + seq_name = "" + + if seq_desc =~ /\|\s*([^|]+?)\s*\[/ + seq_name = $1 + end + + if seq_name =~ /\[.+\]$/ + # Redundant taxonomy information hides here. + seq_name = seq_name.sub(/\[.+\]$/, '') + end + if seq_name =~ /^\s*hypothetical\s+protein\s*/i + # Pointless information. + seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' ) + end + if seq_name =~ /^\s*conserved\s+hypothetical\s+protein\s*/i + # Pointless information. + seq_name = seq_name.sub( /^\s*conserved\s+hypothetical\s+protein\s*/i, '' ) + end + + if gi != nil + mapping_outfile.print( new_name + "\t" + + TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" + + TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" + + TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" + + TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" + + TseqTaxonomyProcessor::SEQ_ACCESSION + gi.to_s + "\t" + + TseqTaxonomyProcessor::SEQ_ACCESSION_SOURCE + "gi" + "\t" + + TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" + + TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string + + Constants::LINE_DELIMITER ) + else + mapping_outfile.print( new_name + "\t" + + TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" + + TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" + + TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" + + TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" + + TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" + + TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string + + Constants::LINE_DELIMITER ) + + end + new_name + end + + end + +end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/tool/hmmscan_parser.rb b/forester/ruby/evoruby/lib/evo/tool/hmmscan_parser.rb new file mode 100644 index 0000000..c2f8177 --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/tool/hmmscan_parser.rb @@ -0,0 +1,265 @@ +# +# = lib/evo/apps/hmmscan_parser.rb - HmmscanParser class +# +# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $ +# +# last modified: 11/24/2009 + +require 'lib/evo/util/constants' +require 'lib/evo/util/util' +require 'lib/evo/util/command_line_arguments' + +module Evoruby + + class HmmscanParser + + PRG_NAME = "hsp" + PRG_VERSION = "1.0.1" + PRG_DESC = "hmmscan parser" + PRG_DATE = "2009.11.24" + COPYRIGHT = "2009 Christian M Zmasek" + CONTACT = "phylosoft@gmail.com" + WWW = "www.phylosoft.org" + + DELIMITER_OPTION = "d" + E_VALUE_THRESHOLD_OPTION = "e" + IGNORE_DUF_OPTION = "i" + PARSE_OUT_DESCRIPITION_OPTION = "a" + HELP_OPTION_1 = "help" + HELP_OPTION_2 = "h" + + def initialize + @domain_counts = Hash.new + end + + # raises ArgumentError, IOError + def parse( inpath, + outpath, + column_delimiter, + e_value_threshold, + ignore_dufs, + get_descriptions ) + Util.check_file_for_readability( inpath ) + Util.check_file_for_writability( outpath ) + + outfile = File.open( outpath, "a" ) + + query = String.new + desc = String.new + model = String.new + env_from = String.new + env_to = String.new + i_e_value = String.new + + queries_count = 0 + + nl = Constants::LINE_DELIMITER + + File.open( inpath ) do | file | + while line = file.gets + if !HmmscanParser.is_ignorable?( line ) && line =~ /^\S+\s+\S/ + + # tn acc tlen query acc qlen Evalue score bias # of c-E i-E score bias hf ht af at ef et acc desc + # 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 + line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/ + + model = $1 + query = $4 + i_e_value = $13.to_f + env_from = $20.to_i + env_to = $21.to_i + + if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) && + ( !ignore_dufs || ( model !~ /^DUF\d+/ ) ) ) + count_model( model ) + outfile.print( query + + column_delimiter ) + if ( get_descriptions ) + outfile.print( desc + + column_delimiter ) + end + outfile.print( model + + column_delimiter + + env_from.to_s + + column_delimiter + + env_to.to_s + + column_delimiter + + i_e_value.to_s ) + outfile.print( nl ) + end + end + end # while line = file.gets + end + outfile.flush() + outfile.close() + + return queries_count + + end # def parse + + def count_model( model ) + if ( @domain_counts.has_key?( model ) ) + count = @domain_counts[ model ].to_i + count += 1 + @domain_counts[ model ] = count + else + @domain_counts[ model ] = 1 + end + end + + + def get_domain_counts() + return @domain_counts + end + + def run() + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC, + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + + if ( cla.is_option_set?( HELP_OPTION_1 ) || + cla.is_option_set?( HELP_OPTION_2 ) ) + print_help + exit( 0 ) + end + + if ( cla.get_number_of_files != 2 ) + print_help + exit( -1 ) + end + + allowed_opts = Array.new + allowed_opts.push( DELIMITER_OPTION ) + allowed_opts.push( E_VALUE_THRESHOLD_OPTION ) + allowed_opts.push( IGNORE_DUF_OPTION ) + allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION ) + + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, + "unknown option(s): " + disallowed, + STDOUT ) + end + + inpath = cla.get_file_name( 0 ) + outpath = cla.get_file_name( 1 ) + + column_delimiter = "\t" + if ( cla.is_option_set?( DELIMITER_OPTION ) ) + begin + column_delimiter = cla.get_option_value( DELIMITER_OPTION ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + + e_value_threshold = -1.0 + if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) ) + begin + e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + if ( e_value_threshold < 0.0 ) + Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT ) + end + end + + ignore_dufs = false + if ( cla.is_option_set?( IGNORE_DUF_OPTION ) ) + ignore_dufs = true + end + + parse_descriptions = false + if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) ) + parse_descriptions = true + end + + puts() + puts( "hmmpfam outputfile: " + inpath ) + puts( "outputfile : " + outpath ) + if ( e_value_threshold >= 0.0 ) + puts( "E-value threshold : " + e_value_threshold.to_s ) + else + puts( "E-value threshold : no threshold" ) + end + if ( parse_descriptions ) + puts( "parse descriptions: true" ) + else + puts( "parse descriptions: false" ) + end + if ( ignore_dufs ) + puts( "ignore DUFs : true" ) + else + puts( "ignore DUFs : false" ) + end + if ( column_delimiter == "\t" ) + puts( "column delimiter : TAB" ) + else + puts( "column delimiter : " + column_delimiter ) + end + puts() + + begin + queries_count = parse( inpath, + outpath, + column_delimiter, + e_value_threshold, + ignore_dufs, + parse_descriptions ) + rescue ArgumentError, IOError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + domain_counts = get_domain_counts() + + puts + puts( "read output for a total of " + queries_count.to_s + " query sequences" ) + puts + puts( "domain counts (considering potential E-value threshold and ignoring of DUFs):" ) + puts( "(number of different domains: " + domain_counts.length.to_s + ")" ) + puts + puts( Util.draw_histogram( domain_counts, "#" ) ) + puts + Util.print_message( PRG_NAME, 'OK' ) + puts + + end # def run() + + def print_help() + puts( "Usage:" ) + puts() + puts( " " + PRG_NAME + ".rb [options] " ) + puts() + puts( " options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" ) + puts( " -" + E_VALUE_THRESHOLD_OPTION + ": E-value threshold, default is no threshold" ) + puts( " -" + PARSE_OUT_DESCRIPITION_OPTION + ": parse query description (in addition to query name)" ) + puts( " -" + IGNORE_DUF_OPTION + ": ignore DUFs" ) + puts() + end + + + private + + + def HmmscanParser.is_ignorable?( line ) + return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ ) + end + + end # class HmmscanParser + +end # module Evoruby \ No newline at end of file diff --git a/forester/ruby/evoruby/lib/evo/tool/msa_processor.rb b/forester/ruby/evoruby/lib/evo/tool/msa_processor.rb new file mode 100644 index 0000000..6299417 --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/tool/msa_processor.rb @@ -0,0 +1,848 @@ +# +# = lib/evo/apps/msa_processor.rb - MsaProcessor class +# +# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: msa_processor.rb,v 1.33 2010/12/13 19:00:10 cmzmasek Exp $ +# + +require 'date' +require 'set' + +require 'lib/evo/util/constants' +require 'lib/evo/util/util' +require 'lib/evo/util/command_line_arguments' +require 'lib/evo/msa/msa_factory' +require 'lib/evo/io/msa_io' +require 'lib/evo/io/writer/phylip_sequential_writer' +require 'lib/evo/io/writer/nexus_writer' +require 'lib/evo/io/writer/fasta_writer' +require 'lib/evo/io/parser/fasta_parser' +require 'lib/evo/io/parser/general_msa_parser' +require 'lib/evo/io/writer/msa_writer' + +module Evoruby + + class MsaProcessor + + PRG_NAME = "msa_pro" + PRG_DATE = "2012.05.11" + PRG_DESC = "processing of multiple sequence alignments" + PRG_VERSION = "1.06" + COPYRIGHT = "2008-2010 Christian M Zmasek" + CONTACT = "phylosoft@gmail.com" + WWW = "www.phylosoft.org" + + + NAME_LENGTH_DEFAULT = 10 + WIDTH_DEFAULT_FASTA = 60 + INPUT_TYPE_OPTION = "i" + OUTPUT_TYPE_OPTION = "o" + MAXIMAL_NAME_LENGTH_OPTION = "n" + WIDTH_OPTION = "w" + CLEAN_UP_SEQ_OPTION = "c" + REM_RED_OPTION = "rem_red" + REMOVE_GAP_COLUMNS_OPTION = "rgc" + REMOVE_GAP_ONLY_COLUMNS = "rgoc" + REMOVE_COLUMNS_GAP_RATIO_OPTION = "rr" + REMOVE_ALL_GAP_CHARACTERS_OPTION = "rg" + REMOVE_ALL_SEQUENCES_LISTED_OPTION = "r" + KEEP_ONLY_SEQUENCES_LISTED_OPTION = "k" + + KEEP_MATCHING_SEQUENCES_OPTION = "mk" + REMOVE_MATCHING_SEQUENCES_OPTION = "mr" + + TRIM_OPTION = "t" + REMOVE_SEQS_GAP_RATIO_OPTION = "rsgr" + REMOVE_SEQS_NON_GAP_LENGTH_OPTION = "rsl" + SPLIT = "split" + LOG_SUFFIX = "_msa_pro.log" + HELP_OPTION_1 = "help" + HELP_OPTION_2 = "h" + + + def initialize() + @input_format_set = false + @output_format_set = false + @fasta_input = false + @phylip_input = true + @name_length = NAME_LENGTH_DEFAULT + @name_length_set = false + @width = WIDTH_DEFAULT_FASTA # fasta only + @pi_output = true + @fasta_output = false + @nexus_output = false + @clean = false # phylip only + @rgc = false + @rgoc = false + @rg = false # fasta only + @rem_red = false + @rgr = -1 + @rsgr = -1 + @rsl = -1 + @remove_matching = nil + @keep_matching = nil + + @seqs_name_file = nil + @remove_seqs = false + @keep_seqs = false + @trim = false + @split = -1 + @first = -1 + @last = -1 + end + + + def run() + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC, + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + if ( ARGV == nil || ARGV.length < 1 ) + Util.print_message( PRG_NAME, "Illegal number of arguments" ) + print_help + exit( -1 ) + end + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "Error: " + e.to_s, STDOUT ) + end + + if ( cla.is_option_set?( HELP_OPTION_1 ) || + cla.is_option_set?( HELP_OPTION_2 ) ) + print_help + exit( 0 ) + end + + if ( cla.get_number_of_files != 2 || ARGV.length < 2 ) + Util.print_message( PRG_NAME, "Illegal number of arguments" ) + print_help + exit( -1 ) + end + + allowed_opts = Array.new + allowed_opts.push( INPUT_TYPE_OPTION ) + allowed_opts.push( OUTPUT_TYPE_OPTION ) + allowed_opts.push( MAXIMAL_NAME_LENGTH_OPTION ) + allowed_opts.push( WIDTH_OPTION ) + allowed_opts.push( CLEAN_UP_SEQ_OPTION ) + allowed_opts.push( REMOVE_GAP_COLUMNS_OPTION ) + allowed_opts.push( REMOVE_GAP_ONLY_COLUMNS ) + allowed_opts.push( REMOVE_COLUMNS_GAP_RATIO_OPTION ) + allowed_opts.push( REMOVE_ALL_GAP_CHARACTERS_OPTION ) + allowed_opts.push( REMOVE_ALL_SEQUENCES_LISTED_OPTION ) + allowed_opts.push( KEEP_ONLY_SEQUENCES_LISTED_OPTION ) + allowed_opts.push( TRIM_OPTION ) + allowed_opts.push( REMOVE_SEQS_GAP_RATIO_OPTION ) + allowed_opts.push( REMOVE_SEQS_NON_GAP_LENGTH_OPTION ) + allowed_opts.push( SPLIT ) + allowed_opts.push( REM_RED_OPTION ) + allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION ) + allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION ) + + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, + "unknown option(s): " + disallowed ) + end + + input = cla.get_file_name( 0 ) + output = cla.get_file_name( 1 ) + + analyze_command_line( cla ) + + begin + Util.check_file_for_readability( input ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + begin + Util.check_file_for_writability( output ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + if ( @rg ) + set_pi_output( false ) + set_fasta_output( true ) + set_nexus_output( false ) + end + + if ( !@input_format_set ) + fasta_like = false + begin + fasta_like = Util.looks_like_fasta?( input ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + @fasta_input = fasta_like + @phylip_input = !fasta_like + if ( !@output_format_set ) + @fasta_output = fasta_like + @pi_output = !fasta_like + @nexus_output = false + end + end + + ld = Constants::LINE_DELIMITER + log = PRG_NAME + " " + PRG_VERSION + " [" + PRG_DATE + "]" + " LOG" + ld + now = DateTime.now + log << "Date/time: " + now.to_s + ld + + puts() + puts( "Input alignment : " + input ) + log << "Input alignment : " + input + ld + puts( "Output alignment : " + output ) + log << "Output alignment : " + output + ld + if ( @phylip_input ) + puts( "Input is : Phylip, or something like it" ) + log << "Input is : Phylip, or something like it" + ld + elsif ( @fasta_input ) + puts( "Input is : Fasta" ) + log << "Input is : Fasta" + ld + end + if( @rgr >= 0 ) + puts( "Max col gap ratio: " + @rgr.to_s ) + log << "Max col gap ratio: " + @rgr.to_s + ld + elsif ( @rgc ) + puts( "Remove gap colums" ) + log << "Remove gap colums" + ld + elsif( @rgoc ) + puts( "Remove gap only colums" ) + log << "Remove gap only colums" + ld + end + if ( @clean ) + puts( "Clean up : true" ) + log << "Clean up : true" + ld + end + + if ( @pi_output ) + puts( "Output is : Phylip interleaved" ) + log << "Output is : Phylip interleaved" + ld + elsif ( @fasta_output ) + puts( "Output is : Fasta" ) + log << "Output is : Fasta" + ld + if ( @width ) + puts( "Width : " + @width.to_s ) + log << "Width : " + @width.to_s + ld + end + if ( @rg ) + puts( "Remove all gap characters (alignment is destroyed)" ) + log << "Remove all gap characters (alignment is destroyed)" + ld + end + elsif ( @nexus_output ) + puts( "Output is : Nexus" ) + log << "Output is : Nexus" + ld + end + if ( @name_length_set || !@fasta_output ) + puts( "Max name length : " + @name_length.to_s ) + log << "Max name length : " + @name_length.to_s + ld + end + if( @rsgr >= 0 ) + puts( "Remove sequences for which the gap ratio > " + @rsgr.to_s ) + log << "Remove sequences for which the gap ratio > " + @rsgr.to_s + ld + end + if( @rsl >= 0 ) + puts( "Remove sequences with less than " + @rsl.to_s + " non-gap characters" ) + log << "Remove sequences with less than " + @rsl.to_s + " non-gap characters" + ld + end + if ( @remove_seqs ) + puts( "Remove sequences listed in: " + @seqs_name_file ) + log << "Remove sequences listed in: " + @seqs_name_file + ld + elsif ( @keep_seqs ) + puts( "Keep only sequences listed in: " + @seqs_name_file ) + log << "Keep only sequences listed in: " + @seqs_name_file + ld + end + if ( @trim ) + puts( "Keep only columns from: "+ @first.to_s + " to " + @last.to_s ) + log << "Keep only columns from: "+ @first.to_s + " to " + @last.to_s + ld + end + if ( @rem_red ) + puts( "Remove redundant sequences: true" ) + log << "Remove redundant sequences: true" + ld + end + if ( @split > 0 ) + puts( "Split : " + @split.to_s ) + log << "Split : " + @split.to_s + ld + end + puts() + + f = MsaFactory.new() + + msa = nil + + begin + if ( @phylip_input ) + msa = f.create_msa_from_file( input, GeneralMsaParser.new() ) + elsif ( @fasta_input ) + msa = f.create_msa_from_file( input, FastaParser.new() ) + end + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + + if ( msa.is_aligned() ) + Util.print_message( PRG_NAME, "Length of original alignment : " + msa.get_length.to_s ) + log << "Length of original alignment : " + msa.get_length.to_s + ld + gp = msa.calculate_gap_proportion + Util.print_message( PRG_NAME, "Gap-proportion of original alignment : " + gp.to_s ) + log << "Gap-proportion of original alignment : " + gp.to_s + ld + else + Util.print_message( PRG_NAME, "the input is not aligned" ) + log << "The input is not aligned" + ld + end + + all_names = Set.new() + for i in 0 ... msa.get_number_of_seqs() + current_name = msa.get_sequence( i ).get_name + if all_names.include?( current_name ) + Util.print_warning_message( PRG_NAME, "sequence name [" + current_name + "] is not unique" ) + else + all_names.add( current_name ) + end + end + + begin + + if ( @remove_seqs || @keep_seqs ) + names = Util.file2array( @seqs_name_file, true ) + if ( names == nil || names.length() < 1 ) + error_msg = "file \"" + @seqs_name_file.to_s + "\" appears empty" + Util.fatal_error( PRG_NAME, error_msg ) + end + + if ( @remove_seqs ) + c = 0 + for i in 0 ... names.length() + to_delete = msa.find_by_name( names[ i ], true, false ) + if ( to_delete.length() < 1 ) + error_msg = "sequence name \"" + names[ i ] + "\" not found" + Util.fatal_error( PRG_NAME, error_msg ) + elsif ( to_delete.length() > 1 ) + error_msg = "sequence name \"" + names[ i ] + "\" is not unique" + Util.fatal_error( PRG_NAME, error_msg ) + else + msa.remove_sequence!( to_delete[ 0 ] ) + c += 1 + end + end + Util.print_message( PRG_NAME, "Removed " + c.to_s + " sequences" ) + log << "Removed " + c.to_s + " sequences" + ld + elsif ( @keep_seqs ) + msa_new = Msa.new() + r = 0 + k = 0 + for j in 0 ... msa.get_number_of_seqs() + if ( names.include?( msa.get_sequence( j ).get_name() ) ) + msa_new.add_sequence( msa.get_sequence( j ) ) + k += 1 + else + r += 1 + end + end + msa = msa_new + Util.print_message( PRG_NAME, "Kept " + k.to_s + " sequences" ) + log << "Kept " + k.to_s + " sequences" + ld + Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" ) + log << "removed " + r.to_s + " sequences" + ld + end + end + + if ( @trim ) + msa.trim!( @first, @last ) + end + if( @rgr >= 0 ) + msa.remove_gap_columns_w_gap_ratio!( @rgr ) + elsif ( @rgc ) + msa.remove_gap_columns!() + elsif( @rgoc ) + msa.remove_gap_only_columns!() + end + if( @rsgr >= 0 ) + n = msa.get_number_of_seqs() + removed = msa.remove_sequences_by_gap_ratio!( @rsgr ) + k = msa.get_number_of_seqs() + r = n - k + Util.print_message( PRG_NAME, "Kept " + k.to_s + " sequences" ) + log << "Kept " + k.to_s + " sequences" + ld + Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" ) + log << "Removed " + r.to_s + " sequences:" + ld + removed.each { | seq_name | + log << " " + seq_name + ld + } + end + if( @rsl >= 0 ) + n = msa.get_number_of_seqs() + removed = msa.remove_sequences_by_non_gap_length!( @rsl ) + k = msa.get_number_of_seqs() + r = n - k + Util.print_message( PRG_NAME, "Kept " + k.to_s + " sequences" ) + log << "Kept " + k.to_s + " sequences" + ld + Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" ) + log << "Removed " + r.to_s + " sequences:" + ld + removed.each { | seq_name | + log << " " + seq_name + ld + } + end + if ( @keep_matching ) + n = msa.get_number_of_seqs + to_be_removed = Set.new + for ii in 0 ... n + seq = msa.get_sequence( ii ) + if !seq.get_name.downcase.index( @keep_matching.downcase ) + to_be_removed.add( ii ) + end + end + to_be_removed_ary = to_be_removed.to_a.sort.reverse + to_be_removed_ary.each { | index | + msa.remove_sequence!( index ) + } + # msa = sort( msa ) + end + if ( @remove_matching ) + n = msa.get_number_of_seqs + to_be_removed = Set.new + for iii in 0 ... n + + seq = msa.get_sequence( iii ) + + if seq.get_name.downcase.index( @remove_matching.downcase ) + to_be_removed.add( iii ) + end + end + to_be_removed_ary = to_be_removed.to_a.sort.reverse + to_be_removed_ary.each { | index | + msa.remove_sequence!( index ) + } + msa = sort( msa ) + end + + + + if ( @split > 0 ) + begin + msas = msa.split( @split, true ) + io = MsaIO.new() + w = MsaWriter + if ( @pi_output ) + w = PhylipSequentialWriter.new() + w.clean( @clean ) + w.set_max_name_length( @name_length ) + elsif( @fasta_output ) + w = FastaWriter.new() + w.set_line_width( @width ) + if ( @rg ) + w.remove_gap_chars( true ) + Util.print_warning_message( PRG_NAME, "removing gap character, the output is likely to become unaligned" ) + log << "removing gap character, the output is likely to become unaligned" + ld + end + w.clean( @clean ) + if ( @name_length_set ) + w.set_max_name_length( @name_length ) + end + elsif( @nexus_output ) + w = NexusWriter.new() + w.clean( @clean ) + w.set_max_name_length( @name_length ) + end + i = 0 + for m in msas + i = i + 1 + io.write_to_file( m, output + "_" + i.to_s, w ) + end + Util.print_message( PRG_NAME, "wrote " + msas.length.to_s + " files" ) + log << "wrote " + msas.length.to_s + " files" + ld + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + + end + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + + if ( @split <= 0 ) + + unless ( @rg ) + if ( msa.is_aligned() ) + Util.print_message( PRG_NAME, "Length of processed alignment : " + msa.get_length.to_s ) + log << "Length of processed alignment : " + msa.get_length.to_s + ld + gp = msa.calculate_gap_proportion + Util.print_message( PRG_NAME, "Gap-proportion of processed alignment: " + gp.to_s ) + log << "Gap-proportion of processed alignment: " + gp.to_s + ld + else + Util.print_warning_message( PRG_NAME, "output is not aligned" ) + log << "output is not aligned" + ld + end + end + + if @rem_red + removed = msa.remove_redundant_sequences!( true, false ) + if removed.size > 0 + identicals = msa.get_identical_seqs_detected + log << "the following " + identicals.size.to_s + " sequences are identical:" + ld + identicals.each { | s | + log << s + ld + } + log << "ignoring the following " + removed.size.to_s + " redundant sequences:" + ld + removed.each { | seq_name | + log << seq_name + ld + } + Util.print_message( PRG_NAME, "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" ) + log << "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" + ld + end + end + + io = MsaIO.new() + + w = MsaWriter + + if ( @pi_output ) + w = PhylipSequentialWriter.new() + w.clean( @clean ) + w.set_max_name_length( @name_length ) + elsif( @fasta_output ) + w = FastaWriter.new() + w.set_line_width( @width ) + if ( @rg ) + w.remove_gap_chars( true ) + Util.print_warning_message( PRG_NAME, "removing gap characters, the output is likely to become unaligned" ) + log << "removing gap character, the output is likely to become unaligned" + ld + end + w.clean( @clean ) + if ( @name_length_set ) + w.set_max_name_length( @name_length ) + end + elsif( @nexus_output ) + w = NexusWriter.new() + w.clean( @clean ) + w.set_max_name_length( @name_length ) + end + + + begin + io.write_to_file( msa, output, w ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + begin + f = File.open( output + LOG_SUFFIX, 'a' ) + f.print( log ) + f.close + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + + end + Util.print_message( PRG_NAME, "OK" ) + puts + end + + + private + + def sort( msa ) + names = Set.new + for i in 0 ... msa.get_number_of_seqs + name = msa.get_sequence( i ).get_name + names.add( name ) + end + sorted_ary = names.to_a.sort + new_msa = Msa.new + sorted_ary.each { | seq_name | + seq = msa.get_sequence( msa.find_by_name( seq_name, true, false )[ 0 ] ) + new_msa.add_sequence( seq ) + } + new_msa + end + + def set_fasta_input( fi = true ) + @fasta_input = fi + @input_format_set = true + end + def set_phylip_input( pi = true ) + @phylip_input = pi + @input_format_set = true + end + def set_name_length( i ) + @name_length = i + @name_length_set = true + end + def set_width( i ) + @width = i + end + def set_fasta_output( fo = true ) + @fasta_output = fo + @output_format_set = true + end + def set_pi_output( pso = true ) + @pi_output = pso + @output_format_set = true + end + def set_nexus_output( nexus = true ) + @nexus_output = nexus + @output_format_set = true + end + def set_clean( c = true ) + @clean = c + end + def set_remove_gap_columns( rgc = true ) + @rgc = rgc + end + def set_remove_gap_only_columns( rgoc = true ) + @rgoc = rgoc + end + def set_remove_gaps( rg = true ) + @rg = rg + end + def set_remove_gap_ratio( rgr ) + @rgr = rgr + end + def set_remove_seqs_gap_ratio( rsgr ) + @rsgr = rsgr + end + def set_remove_seqs_min_non_gap_length( rsl ) + @rsl = rsl + end + def set_remove_seqs( file ) + @seqs_name_file = file + @remove_seqs = true + @keep_seqs = false + end + def set_keep_seqs( file ) + @seqs_name_file = file + @keep_seqs = true + @remove_seqs = false + end + def set_trim( first, last ) + @trim = true + @first = first + @last = last + end + def set_remove_matching( remove ) + @remove_matching = remove + end + def set_keep_matching( keep ) + @keep_matching = keep + end + def set_rem_red( rr ) + @rem_red = rr + end + + + + def set_split( s ) + if ( s > 0 ) + @split = s + @clean = false # phylip only + @rgc = false + @rgoc = false + @rg = false # fasta only + @rgr = -1 + @rsgr = -1 + @rsl = -1 + @seqs_name_file = nil + @remove_seqs = false + @keep_seqs = false + @trim = false + @first = -1 + @last = -1 + end + end + def analyze_command_line( cla ) + if ( cla.is_option_set?( INPUT_TYPE_OPTION ) ) + begin + type = cla.get_option_value( INPUT_TYPE_OPTION ) + if ( type == "p" ) + set_phylip_input( true ) + set_fasta_input( false ) + elsif ( type == "f" ) + set_fasta_input( true ) + set_phylip_input( false ) + end + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( OUTPUT_TYPE_OPTION ) ) + begin + type = cla.get_option_value( OUTPUT_TYPE_OPTION ) + if ( type == "p" ) + set_pi_output( true ) + set_fasta_output( false ) + set_nexus_output( false ) + elsif ( type == "f" ) + set_pi_output( false ) + set_fasta_output( true ) + set_nexus_output( false ) + elsif ( type == "n" ) + set_pi_output( false ) + set_fasta_output( false ) + set_nexus_output( true ) + end + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( MAXIMAL_NAME_LENGTH_OPTION ) ) + begin + l = cla.get_option_value_as_int( MAXIMAL_NAME_LENGTH_OPTION ) + set_name_length( l ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( WIDTH_OPTION ) ) + begin + w = cla.get_option_value_as_int( WIDTH_OPTION ) + set_width( w ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( CLEAN_UP_SEQ_OPTION ) ) + set_clean( true ) + end + if ( cla.is_option_set?( REMOVE_GAP_COLUMNS_OPTION ) ) + set_remove_gap_columns( true ) + end + if ( cla.is_option_set?( REM_RED_OPTION ) ) + set_rem_red( true ) + end + if ( cla.is_option_set?( REMOVE_GAP_ONLY_COLUMNS ) ) + set_remove_gap_only_columns( true ) + end + if ( cla.is_option_set?( REMOVE_ALL_GAP_CHARACTERS_OPTION ) ) + set_remove_gaps( true ) + end + if ( cla.is_option_set?( REMOVE_COLUMNS_GAP_RATIO_OPTION ) ) + begin + f = cla.get_option_value_as_float( REMOVE_COLUMNS_GAP_RATIO_OPTION ) + set_remove_gap_ratio( f ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( REMOVE_ALL_SEQUENCES_LISTED_OPTION ) ) + begin + s = cla.get_option_value( REMOVE_ALL_SEQUENCES_LISTED_OPTION ) + set_remove_seqs( s ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( KEEP_ONLY_SEQUENCES_LISTED_OPTION ) ) + begin + s = cla.get_option_value( KEEP_ONLY_SEQUENCES_LISTED_OPTION ) + set_keep_seqs( s ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( TRIM_OPTION ) ) + begin + s = cla.get_option_value( TRIM_OPTION ) + if ( s =~ /(\d+)-(\d+)/ ) + set_trim( $1.to_i(), $2.to_i() ) + else + puts( "illegal argument" ) + print_help + exit( -1 ) + end + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( REMOVE_SEQS_GAP_RATIO_OPTION ) ) + begin + f = cla.get_option_value_as_float( REMOVE_SEQS_GAP_RATIO_OPTION ) + set_remove_seqs_gap_ratio( f ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( REMOVE_SEQS_NON_GAP_LENGTH_OPTION ) ) + begin + f = cla.get_option_value_as_int( REMOVE_SEQS_NON_GAP_LENGTH_OPTION ) + set_remove_seqs_min_non_gap_length( f ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( SPLIT ) ) + begin + s = cla.get_option_value_as_int( SPLIT ) + set_split( s ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + + end + if ( cla.is_option_set?( REMOVE_MATCHING_SEQUENCES_OPTION ) ) + begin + s = cla.get_option_value( REMOVE_MATCHING_SEQUENCES_OPTION ) + set_remove_matching( s ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( KEEP_MATCHING_SEQUENCES_OPTION ) ) + begin + s = cla.get_option_value( KEEP_MATCHING_SEQUENCES_OPTION ) + set_keep_matching( s ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + + + end + + def print_help() + puts() + puts( "Usage:" ) + puts() + puts( " " + PRG_NAME + ".rb [options] " ) + puts() + puts( " options: -" + INPUT_TYPE_OPTION + "=: f for fasta, p for phylip selex type" ) + puts( " -" + OUTPUT_TYPE_OPTION + "=: f for fasta, n for nexus, p for phylip sequential (default)" ) + puts( " -" + MAXIMAL_NAME_LENGTH_OPTION + "=: n=maximal name length (default for phylip 10, for fasta: unlimited )" ) + puts( " -" + WIDTH_OPTION + "=: n=width (fasta output only, default is 60)" ) + puts( " -" + CLEAN_UP_SEQ_OPTION + ": clean up sequences" ) + puts( " -" + REMOVE_GAP_COLUMNS_OPTION + ": remove gap columns" ) + puts( " -" + REMOVE_GAP_ONLY_COLUMNS + ": remove gap-only columns" ) + puts( " -" + REMOVE_COLUMNS_GAP_RATIO_OPTION + "=: remove columns for which ( seqs with gap / number of sequences > n )" ) + puts( " -" + REMOVE_ALL_GAP_CHARACTERS_OPTION + ": remove all gap characters (destroys alignment, fasta output only)" ) + puts( " -" + REMOVE_ALL_SEQUENCES_LISTED_OPTION + "=: remove all sequences listed in file" ) + puts( " -" + KEEP_ONLY_SEQUENCES_LISTED_OPTION + "=: keep only sequences listed in file" ) + puts( " -" + TRIM_OPTION + "=-: remove columns before first and after last" ) + puts( " -" + REMOVE_SEQS_GAP_RATIO_OPTION + "=: remove sequences for which the gap ratio > n (after column operations)" ) + puts( " -" + REMOVE_SEQS_NON_GAP_LENGTH_OPTION + "= remove sequences with less than n non-gap characters (after column operations)" ) + puts( " -" + REMOVE_MATCHING_SEQUENCES_OPTION + "= remove all sequences with names containing s" ) + puts( " -" + KEEP_MATCHING_SEQUENCES_OPTION + "= keep only sequences with names containing s" ) + puts( " -" + SPLIT + "= split a fasta file into n files of equal number of sequences (expect for " ) + puts( " last one), cannot be used with other options" ) + puts( " -" + REM_RED_OPTION + ": remove redundant sequences" ) + puts() + end + + + + + + end # class MsaProcessor + + +end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb new file mode 100644 index 0000000..b499c72 --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb @@ -0,0 +1,551 @@ +# +# = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class +# +# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: multi_sequence_extractor.rb,v 1.10 2010/12/13 19:00:11 cmzmasek Exp $ + + +require 'lib/evo/util/constants' +require 'lib/evo/util/util' +require 'lib/evo/msa/msa' +require 'lib/evo/msa/msa_factory' +require 'lib/evo/io/msa_io' +require 'lib/evo/io/parser/fasta_parser' +require 'lib/evo/io/writer/fasta_writer' +require 'lib/evo/util/command_line_arguments' + + + +module Evoruby + + class MultiSequenceExtractor + + PRG_NAME = "mse" + PRG_VERSION = "1.02" + PRG_DESC = "extraction of sequences by name from multiple multi-sequence ('fasta') files" + PRG_DATE = "2012.07.20" + COPYRIGHT = "2008-2012 Christian M Zmasek" + CONTACT = "phylosoft@gmail.com" + WWW = "www.phylosoft.org" + HELP_OPTION_1 = 'help' + HELP_OPTION_2 = 'h' + + EXT_OPTION = 'e' + EXTRACT_LINKERS_OPTION = 'l' + LOG_SUFFIX = ".mse_log" + LINKERS_SUFFIX = ".linkers" + FASTA_SUFFIX = ".fasta" + FASTA_WITH_NORMALIZED_IDS_SUFFIX = ".ni.fasta" + NORMALIZED_IDS_MAP_SUFFIX = ".nim" + PROTEINS_LIST_FILE_SEPARATOR = "\t" + + + def run() + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC , + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + ld = Constants::LINE_DELIMITER + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + if ( cla.is_option_set?( HELP_OPTION_1 ) || + cla.is_option_set?( HELP_OPTION_2 ) ) + print_help + exit( 0 ) + end + + if ( cla.get_number_of_files != 4 && cla.get_number_of_files != 5 ) + print_help + exit( -1 ) + end + + allowed_opts = Array.new + allowed_opts.push(EXT_OPTION) + allowed_opts.push(EXTRACT_LINKERS_OPTION) + + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, + "unknown option(s): " + disallowed, + STDOUT ) + end + + seq_names_files_suffix = cla.get_file_name( 0 ) + input_dir = cla.get_file_name( 1 ) + out_dir = cla.get_file_name( 2 ) + out_dir_doms = cla.get_file_name( 3 ) + mapping_file = nil + + if ( cla.get_number_of_files == 5 ) + mapping_file = cla.get_file_name( 4 ) + begin + Util.check_file_for_readability( mapping_file ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + end + + extension = 0 + if cla.is_option_set?(EXT_OPTION) + extension = cla.get_option_value_as_int(EXT_OPTION) + if extension < 0 + extension = 0 + end + end + + extract_linkers = false + if cla.is_option_set?(EXTRACT_LINKERS_OPTION) + extract_linkers = true + end + + if !File.exist?( input_dir ) + Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" ) + end + if !File.exist?( out_dir ) + Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" ) + end + if !File.exist?( out_dir_doms ) + Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" ) + end + if !File.directory?( input_dir ) + Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" ) + end + if !File.directory?( out_dir ) + Util.fatal_error( PRG_NAME, "error: [#{out_dir}] is not a directory" ) + end + if !File.directory?( out_dir_doms ) + Util.fatal_error( PRG_NAME, "error: [#{out_dir_doms}] is not a directory" ) + end + + + log = String.new + + log << "Program : " + PRG_NAME + ld + log << "Version : " + PRG_VERSION + ld + log << "Program date : " + PRG_DATE + ld + + puts() + puts( "Sequence names files suffix: " + seq_names_files_suffix ) + log << "Sequence names files suffix: " + seq_names_files_suffix + ld + puts( "Input dir : " + input_dir ) + log << "Input dir : " + input_dir + ld + puts( "Output dir : " + out_dir ) + log << "Output dir : " + out_dir + ld + puts( "Output dir domains : " + out_dir_doms ) + log << "Output dir domains : " + out_dir_doms + ld + if ( mapping_file != nil ) + puts( "Mapping file : " + mapping_file ) + log << "Mapping file : " + mapping_file + ld + end + if extension > 0 + puts( "Extension : " + extension.to_s ) + log << "Extension : " + extension.to_s + ld + end + if extract_linkers + puts( "Extract linkers : true" ) + log << "Extract linkers : true" + ld + end + log << "Date : " + Time.now.to_s + ld + puts + + if ( mapping_file != nil ) + species_codes_to_paths = extract_mappings( mapping_file ) + end + + input_files = obtain_inputfiles( input_dir, seq_names_files_suffix ) + + counter = 0 + + input_files.each { |input_file| + counter += 1 + puts + puts + puts counter.to_s + "/" + input_files.size.to_s + read_seq_family_file( input_file, + seq_names_files_suffix, + input_dir, + species_codes_to_paths, + log, + out_dir, + out_dir_doms, + mapping_file, + extension, + extract_linkers ) + } + puts + Util.print_message( PRG_NAME, "OK" ) + puts + + end + + + def read_seq_family_file( input_file, + seq_names_files_suffix, + input_dir, + species_codes_to_paths, + log, + out_dir, + out_dir_doms, + mapping_file, + extension, + extract_linkers ) + + begin + Util.check_file_for_readability( input_file ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + basename = File.basename( input_file, seq_names_files_suffix ) + out_file_path_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX + out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX + out_file_path_ids_map = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX + doms_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains" + FASTA_SUFFIX + doms_ext_out_file_path_fasta_file = nil + if extension > 0 + doms_ext_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains_ext_" + extension.to_s + FASTA_SUFFIX + end + begin + Util.check_file_for_writability( out_file_path_fasta_file ) + Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file ) + Util.check_file_for_writability( out_file_path_ids_map ) + Util.check_file_for_writability( doms_out_file_path_fasta_file ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + ids_map_writer = nil + begin + ids_map_writer = File.open( out_file_path_ids_map, 'a' ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + current_species = "" + current_msa = nil + new_msa = Msa.new + new_msa_normalized_ids = Msa.new + new_msa_domains = Msa.new + new_msa_domains_extended = Msa.new + per_species_counter = 0 + linkers = "" + + puts basename + + File.open( input_file ) do | file | + while line = file.gets + line.strip! + if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) + values = line.split( PROTEINS_LIST_FILE_SEPARATOR ) + mod_line = nil + if ( values.length < 2 ) + Util.fatal_error( PRG_NAME, "unexpected format: " + line ) + end + species = values[ 0 ] + if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA" || species == "EIMTE" + next + end + seq_name = values[ 1 ] + domain_ranges = nil + if ( values.length > 3 ) + domain_ranges_block = values[ 3 ] + domain_ranges = domain_ranges_block.split( "/" ) + end + if ( species != current_species ) + current_species = species + my_file = input_dir + Constants::FILE_SEPARATOR + current_species + + if ( !File.exist?( my_file ) ) + if species_codes_to_paths == nil + Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" ) + elsif ( !species_codes_to_paths.has_key?( current_species ) ) + Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" ) + end + my_file = species_codes_to_paths[ current_species ] + end + my_path = File.expand_path( my_file ) + my_readlink = my_path + if ( File.symlink?( my_path ) ) + my_readlink = File.readlink( my_path ) + end + current_msa = read_fasta_file( my_file ) + + if ( per_species_counter > 0 ) + print_counts( per_species_counter, log, Constants::LINE_DELIMITER ) + per_species_counter = 0 + end + puts " " + current_species + " [" + my_readlink + "]" + log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER + end + puts " " + seq_name + log << " " + seq_name + Constants::LINE_DELIMITER + per_species_counter = per_species_counter + 1 + seq = nil + + if current_msa.find_by_name_start( seq_name, true ).size > 0 + begin + seq = current_msa.get_by_name_start( seq_name, true ).copy + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + else + # Not found, try finding by partial match. + begin + seq = current_msa.get_by_name( seq_name, true, true ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + end + + normalized_id = per_species_counter.to_s( 16 ).upcase + + "_" + current_species + + per_species_counter.to_i + + ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER ) + + orig_name = nil + if seq != nil + orig_name = seq.get_name + seq.set_name( seq.get_name + " [" + current_species + "]" ) + new_msa.add_sequence( seq ) + else + Util.fatal_error( PRG_NAME, "unexected error: seq is nil" ) + end + + if domain_ranges != nil + first = true + prev_to = -1 + + domain_ranges.each { |range| + if range != nil && range.length > 0 + s = range.split("-") + from = s[ 0 ].to_i + to = s[ 1 ].to_i + new_msa_domains.add_sequence( Sequence.new( orig_name + + " [" + from.to_s + + "-" + to.to_s + + "] [" + basename + "] [" + + current_species + "]", + seq.get_sequence_as_string[from..to] ) ) + if extension > 0 + from_e = from - extension + if from_e < 0 + from_e = 0 + end + to_e = to + extension + if to_e > seq.get_sequence_as_string.length - 1 + to_e = seq.get_sequence_as_string.length - 1 + end + new_msa_domains_extended.add_sequence( Sequence.new( orig_name + + " [" + from.to_s + + "-" + to.to_s + + "] [extended by " + + extension.to_s + + "] [" + basename + "] [" + + current_species + "]", + seq.get_sequence_as_string[ from_e..to_e ] ) ) + end # extension > 0 + if extract_linkers + if first + first = false + f = 0 + t = from - 1 + if extension > 0 + f = from - extension + end + mod_line = line + "\t[" + get_linker_sequence( f, t, seq ) + "|" + else + mod_line += get_linker_sequence( prev_to + 1, from - 1, seq ) + "|" + end + prev_to = to + end + end # range != nil && range.length > 0 + } + if extract_linkers && prev_to > 0 + f = prev_to + 1 + t = seq.get_sequence_as_string.length - 1 + if extension > 0 + t = prev_to + extension + end + mod_line += get_linker_sequence( f, t, seq ) + "]" + end + end + + new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) ) + if mod_line + linkers << mod_line + Constants::LINE_DELIMITER + end + end # !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) + end # while line = file.gets + + end + + ids_map_writer.close + + if ( per_species_counter > 0 ) + print_counts( per_species_counter, log, Constants::LINE_DELIMITER ) + end + + io = MsaIO.new() + + fasta_writer = FastaWriter.new() + fasta_writer.remove_gap_chars + fasta_writer.clean + + begin + io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + if new_msa_domains != nil + begin + io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + end + + if extension > 0 && new_msa_domains_extended != nil + begin + io.write_to_file( new_msa_domains_extended, doms_ext_out_file_path_fasta_file, fasta_writer ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + end + + begin + io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + begin + f = File.open( out_dir + Constants::FILE_SEPARATOR + basename + LOG_SUFFIX , 'a' ) + f.print( log ) + f.close + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + if extract_linkers && linkers != nil + begin + f = File.open( out_dir + Constants::FILE_SEPARATOR + basename + LINKERS_SUFFIX , 'a' ) + f.print( linkers ) + f.close + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + end + + + end + + + def get_linker_sequence( from, to, seq ) + if from < 0 + from = 0 + end + if to > seq.get_sequence_as_string.length - 1 + to = seq.get_sequence_as_string.length - 1 + end + if from > to + return "" + else + return from.to_s + "-" + to.to_s + ":" + seq.get_sequence_as_string[ from..to ] + end + end + + def obtain_inputfiles( input_dir, seq_names_files_suffix ) + input_files = Array.new() + Dir.foreach( input_dir ) { |file_name| + if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size ) + input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name ) + end + } + input_files + end + + def extract_mappings( mapping_file ) + species_code_to_path = Hash.new() + File.open( mapping_file ) do | file | + while line = file.gets + if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) ) + if ( line =~ /(\S+)\s+(\S+)/ ) + species = $1 + path = $2 + if ( species_code_to_path.has_key?( species ) ) + Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" ) + end + if ( species_code_to_path.has_value?( path ) ) + Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" ) + end + if ( !File.exist?( path ) ) + Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" ) + end + if ( !File.file?( path ) ) + Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" ) + end + if ( !File.readable?( path ) ) + Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" ) + end + if ( File.size( path ) < 10000 ) + Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" ) + end + if ( !Util.looks_like_fasta?( path ) ) + Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" ) + end + species_code_to_path[ species ] = path + puts species + " -> " + path + end + end + end + end + species_code_to_path + end + + def print_counts( per_species_counter, log, ld ) + puts " [sum: " + per_species_counter.to_s + "]" + log << " [sum: " + per_species_counter.to_s + "]" + ld + end + + def read_fasta_file( input ) + f = MsaFactory.new() + msa = nil + begin + msa = f.create_msa_from_file( input, FastaParser.new() ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + msa + end + + def print_help() + puts( "Usage:" ) + puts() + puts( " " + PRG_NAME + ".rb [mapping file for " + + "genome multiple-sequence ('fasta') files not in input dir]" ) + puts() + puts( " option: -" + EXT_OPTION + "=: to extend extracted domains" ) + puts( " -" + EXTRACT_LINKERS_OPTION + " : to extract linkers" ) + puts() + puts( " " + "Example: \"mse.rb .prot . protein_seqs domain_seqs ../genome_locations.txt\"" ) + puts() + end + + end # class MultiSequenceExtractor +end \ No newline at end of file diff --git a/forester/ruby/evoruby/lib/evo/tool/new_tap.rb b/forester/ruby/evoruby/lib/evo/tool/new_tap.rb new file mode 100644 index 0000000..1dc7431 --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/tool/new_tap.rb @@ -0,0 +1,167 @@ +# +# = lib/evo/apps/ - class +# +# Copyright:: Copyright (C) 2009 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: new_tap.rb,v 1.4 2010/12/13 19:00:11 cmzmasek Exp $ + + +require 'lib/evo/util/util' +require 'lib/evo/msa/msa_factory' +require 'lib/evo/msa/msa' +require 'lib/evo/io/msa_io' +require 'lib/evo/io/parser/fasta_parser' +require 'lib/evo/io/parser/general_msa_parser' +require 'lib/evo/io/writer/fasta_writer' +require 'lib/evo/io/writer/phylip_sequential_writer' +require 'lib/evo/util/command_line_arguments' + +module Evoruby + + class TaxonomyProcessor + + PRG_NAME = "" + PRG_DATE = "2009.10.09" + PRG_DESC = "replacement of labels in multiple sequence files" + PRG_VERSION = "1.00" + COPYRIGHT = "2009 Christian M Zmasek" + CONTACT = "phylosoft@gmail.com" + WWW = "www.phylosoft.org" + + REMOVE_REDUNDANT_SEQS_OPTION = "rr" + + def initialize() + @taxonomies = Hash.new() + end + + def run() + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC, + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + if ( ARGV == nil || ( ARGV.length != 3 && ARGV.length != 4 ) ) + puts( "Usage: #{PRG_NAME}.rb " ) + puts() + puts( " options: -" + REMOVE_REDUNDANT_SEQS_OPTION + ": to remove redundant sequences" ) + puts() + exit( -1 ) + end + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + input = cla.get_file_name( 0 ) + output = cla.get_file_name( 1 ) + map_file = cla.get_file_name( 2 ) + + allowed_opts = Array.new + allowed_opts.push( REMOVE_REDUNDANT_SEQS_OPTION ) + + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed ) + end + + + remove_redudant = false + if ( cla.is_option_set?( REMOVE_REDUNDANT_SEQS_OPTION ) ) + remove_redudant = true + end + + if ( File.exists?( output ) ) + Util.fatal_error( PRG_NAME, "outfile [" + output + "] already exists" ) + end + if ( File.exists?( map_file ) ) + Util.fatal_error( PRG_NAME, "map file [" + map_file + "] already exists" ) + end + if ( !File.exists?( input) ) + Util.fatal_error( PRG_NAME, "infile [" + input + "] does not exist" ) + end + + fasta_like = Util.looks_like_fasta?( input ) + + puts() + puts( "Input alignment : " + input ) + puts( "Output alignment: " + output ) + puts( "Output map : " + map_file ) + if ( fasta_like ) + puts( "Format : Fasta" ) + else + puts( "Format : Phylip like" ) + end + puts() + + species_map = Hash.new + + f = MsaFactory.new() + begin + if ( fasta_like ) + msa = f.create_msa_from_file( input, FastaParser.new() ) + else + msa = f.create_msa_from_file( input, GeneralMsaParser.new() ) + end + rescue Exception => e + Util.fatal_error( PRG_NAME, "failed to read file: " + e.to_s ) + end + + if ( msa == nil || msa.get_number_of_seqs() < 1 ) + Util.fatal_error( PRG_NAME, "failed to read MSA" ) + end + begin + Util.check_file_for_writability( map_file ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_, STDOUT ) + end + + if ( remove_redudant ) + removed = msa.remove_redundant_sequences!( true ) + if removed.size > 0 + Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" ) + removed.each { | seq_name | + puts seq_name + } + Util.print_message( PRG_NAME, "will process " + msa.get_number_of_seqs.to_s + " non redundant sequences" ) + end + end + + lf = File.open( map_file, "a" ) + for i in 0 ... msa.get_number_of_seqs + seq = msa.get_sequence( i ) + end + + io = MsaIO.new() + w = nil + if ( fasta_like ) + w = FastaWriter.new() + else + w = PhylipSequentialWriter.new() + end + w.set_max_name_length( 10 ) + w.clean( true ) + begin + io.write_to_file( msa, output, w ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s ) + end + lf.close() + if ( @taxonomies.length > 0 ) + Util.print_message( PRG_NAME, "number of unique taxonomies: " + @taxonomies.length.to_s ) + end + Util.print_message( PRG_NAME, "wrote: " + map_file ) + Util.print_message( PRG_NAME, "wrote: " + output ) + Util.print_message( PRG_NAME, "OK" ) + end + + end # class + +end # module Evoruby \ No newline at end of file diff --git a/forester/ruby/evoruby/lib/evo/tool/phylogenies_decorator.rb b/forester/ruby/evoruby/lib/evo/tool/phylogenies_decorator.rb new file mode 100644 index 0000000..69a6bc4 --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/tool/phylogenies_decorator.rb @@ -0,0 +1,299 @@ +#!/usr/local/bin/ruby -w +# +# = lib/evo/apps/phylogenies_decorator +# +# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# decoration of phylogenies with sequence/species names and domain architectures +# +# $Id: phylogenies_decorator.rb,v 1.34 2010/12/13 19:00:11 cmzmasek Exp $ +# +# Environment variable FORESTER_HOME needs to point to the appropriate +# directory (e.g. setenv FORESTER_HOME $HOME/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester-atv/) + +require 'lib/evo/util/constants' +require 'lib/evo/util/util' +require 'lib/evo/util/command_line_arguments' + +require 'date' + +module Evoruby + + class PhylogeniesDecorator + + DECORATOR_OPTIONS_SEQ_NAMES = '-r=1 -mdn' + # -mdn is a hidden expert option to rename e.g. "6_ORYLA3" to "6_[3]_ORYLA" + #DECORATOR_OPTIONS_SEQ_NAMES = '-sn -r=1' + DECORATOR_OPTIONS_DOMAINS = '-r=1' + IDS_MAPFILE_SUFFIX = '.nim' + DOMAINS_MAPFILE_SUFFIX = '.dff' + SLEEP_TIME = 0.1 + REMOVE_NI = true + TMP_FILE = '___PD___' + LOG_FILE = '00_phylogenies_decorator.log' + FORESTER_HOME = ENV[Constants::FORESTER_HOME_ENV_VARIABLE] + JAVA_HOME = ENV[Constants::JAVA_HOME_ENV_VARIABLE] + + PRG_NAME = "phylogenies_decorator" + PRG_DATE = "2008.09.02" + PRG_DESC = "decoration of phylogenies with sequence/species names and domain architectures" + PRG_VERSION = "1.0.1" + COPYRIGHT = "2008-2009 Christian M Zmasek" + CONTACT = "phylosoft@gmail.com" + WWW = "www.phylosoft.org" + + IDS_ONLY_OPTION = "n" + DOMAINS_ONLY_OPTION = "d" + HELP_OPTION_1 = "help" + HELP_OPTION_2 = "h" + + NL = Constants::LINE_DELIMITER + + def run + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC, + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + if ( ARGV == nil || ARGV.length > 3 || ARGV.length < 2 ) + print_help + exit( -1 ) + end + + if FORESTER_HOME == nil || FORESTER_HOME.length < 1 + Util.fatal_error( PRG_NAME, "apparently environment variable #{Constants::FORESTER_HOME_ENV_VARIABLE} has not been set" ) + end + if JAVA_HOME == nil || JAVA_HOME.length < 1 + Util.fatal_error( PRG_NAME, "apparently environment variable #{Constants::JAVA_HOME_ENV_VARIABLE} has not been set" ) + end + + if !File.exist?( FORESTER_HOME ) + Util.fatal_error( PRG_NAME, '[' + FORESTER_HOME + '] does not exist' ) + end + if !File.exist?( JAVA_HOME ) + Util.fatal_error( PRG_NAME, '[' + JAVA_HOME + '] does not exist' ) + end + + decorator = JAVA_HOME + '/bin/java -cp ' + FORESTER_HOME + '/java/forester.jar org.forester.application.decorator' + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + if ( cla.is_option_set?( HELP_OPTION_1 ) || + cla.is_option_set?( HELP_OPTION_2 ) ) + print_help + exit( 0 ) + end + + if File.exist?( LOG_FILE ) + Util.fatal_error( PRG_NAME, 'logfile [' + LOG_FILE + '] already exists' ) + end + + allowed_opts = Array.new + allowed_opts.push( IDS_ONLY_OPTION ) + allowed_opts.push( DOMAINS_ONLY_OPTION ) + + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed ) + end + + ids_only = false + domains_only = false + + in_suffix = cla.get_file_name( 0 ) + out_suffix = cla.get_file_name( 1 ) + + if cla.is_option_set?( IDS_ONLY_OPTION ) + ids_only = true + end + if cla.is_option_set?( DOMAINS_ONLY_OPTION ) + domains_only = true + end + + if ( ids_only && domains_only ) + Util.fatal_error( PRG_NAME, 'attempt to use ids only and domains only at the same time' ) + end + + log = String.new + + now = DateTime.now + log << "Program : " + PRG_NAME + NL + log << "Version : " + PRG_VERSION + NL + log << "Program date : " + PRG_DATE + NL + log << "Options for seq names: " + DECORATOR_OPTIONS_SEQ_NAMES + NL + log << "Options for domains : " + DECORATOR_OPTIONS_DOMAINS + NL + log << "FORESTER_HOME : " + FORESTER_HOME + NL + log << "JAVA_HOME : " + JAVA_HOME + NL + NL + log << "Date/time: " + now.to_s + NL + log << "Directory: " + Dir.getwd + NL + NL + + Util.print_message( PRG_NAME, 'input suffix : ' + in_suffix ) + Util.print_message( PRG_NAME, 'output suffix : ' + out_suffix ) + + log << 'input suffix : ' + in_suffix + NL + log << 'output suffix : ' + out_suffix + NL + + if ( File.exists?( TMP_FILE ) ) + File.delete( TMP_FILE ) + end + + files = Dir.entries( "." ) + + counter = 0 + + files.each { | phylogeny_file | + if ( !File.directory?( phylogeny_file ) && + phylogeny_file !~ /^\./ && + phylogeny_file !~ /^00/ && + phylogeny_file !~ /#{out_suffix}$/ && + phylogeny_file =~ /#{in_suffix}$/ ) + begin + Util.check_file_for_readability( phylogeny_file ) + rescue ArgumentError + Util.fatal_error( PRG_NAME, 'can not read from: ' + phylogeny_file + ': '+ $! ) + end + + counter += 1 + + outfile = phylogeny_file.sub( /#{in_suffix}$/, out_suffix ) + + if REMOVE_NI + outfile = outfile.sub( /_ni_/, '_' ) + end + + if File.exists?( outfile ) + msg = counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile + + ' : already exists, skipping' + Util.print_message( PRG_NAME, msg ) + log << msg + NL + next + end + + Util.print_message( PRG_NAME, counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile ) + log << counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile + NL + + phylogeny_id = get_id( phylogeny_file ) + + ids_mapfile_name = nil + domains_mapfile_name = nil + + if ids_only + ids_mapfile_name = get_file( files, phylogeny_id, IDS_MAPFILE_SUFFIX ) + elsif domains_only + domains_mapfile_name = get_file( files, phylogeny_id, DOMAINS_MAPFILE_SUFFIX ) + else + ids_mapfile_name = get_file( files, phylogeny_id, IDS_MAPFILE_SUFFIX ) + domains_mapfile_name = get_file( files, phylogeny_id, DOMAINS_MAPFILE_SUFFIX ) + end + + if domains_mapfile_name != nil + begin + Util.check_file_for_readability( domains_mapfile_name ) + rescue ArgumentError + Util.fatal_error( PRG_NAME, 'failed to read from [#{domains_mapfile_name}]: ' + $! ) + end + end + + if ids_mapfile_name != nil + begin + Util.check_file_for_readability( ids_mapfile_name ) + rescue ArgumentError + Util.fatal_error( PRG_NAME, 'failed to read from [#{ids_mapfile_name}]: ' + $! ) + end + end + + if domains_mapfile_name != nil + if ids_mapfile_name != nil + my_outfile = TMP_FILE + else + my_outfile = outfile + end + cmd = decorator + ' ' + DECORATOR_OPTIONS_DOMAINS + ' ' + + '-f=d ' + phylogeny_file + ' ' + + domains_mapfile_name + ' ' + my_outfile + execute_cmd( cmd, log ) + end + + if ids_mapfile_name != nil + if domains_mapfile_name != nil + my_infile = TMP_FILE + else + my_infile = phylogeny_file + end + cmd = decorator + ' ' + DECORATOR_OPTIONS_SEQ_NAMES + ' ' + + '-f=s ' + my_infile + ' ' + + ids_mapfile_name + ' ' + outfile + execute_cmd( cmd, log ) + end + + if ( File.exists?( TMP_FILE ) ) + File.delete( TMP_FILE ) + end + end + } + open( LOG_FILE, 'w' ) do | f | + f.write( log ) + end + puts + Util.print_message( PRG_NAME, 'OK' ) + puts + end # def run + + def execute_cmd( cmd, log ) + log << 'excuting ' + cmd + NL + IO.popen( cmd , 'r+' ) do | pipe | + pipe.close_write + log << pipe.read + NL + NL + end + sleep( SLEEP_TIME ) + end + + + def get_id( phylogeny_file_name ) + phylogeny_file_name =~ /^([^_]+)/ + $1 + end + + def get_file( files_in_dir, phylogeny_id, suffix_pattern ) + matching_files = Array.new + files_in_dir.each { | file | + + if ( !File.directory?( file ) && + file !~ /^\./ && + file !~ /^00/ && + file =~ /^#{phylogeny_id}.*#{suffix_pattern}$/ ) + matching_files << file + end + } + if matching_files.length < 1 + Util.fatal_error( PRG_NAME, 'no file matching [' + phylogeny_id + + '_] [' + suffix_pattern + '] present in current directory' ) + elsif matching_files.length > 1 + Util.fatal_error( PRG_NAME, 'more than one file matching [' + phylogeny_id + + '_] [' + suffix_pattern + '] present in current directory' ) + end + matching_files[ 0 ] + end + + def print_help() + puts( "Usage:" ) + puts() + puts( " " + PRG_NAME + ".rb [options] " ) + puts() + puts( " options: -" + IDS_ONLY_OPTION + ": decorate with sequence/species names only" ) + puts( " -" + DOMAINS_ONLY_OPTION + ": decorate with domain structures" ) + puts() + end + end # class PhylogenyiesDecorator + +end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/tool/phylogeny_factory.rb b/forester/ruby/evoruby/lib/evo/tool/phylogeny_factory.rb new file mode 100644 index 0000000..999541e --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/tool/phylogeny_factory.rb @@ -0,0 +1,267 @@ +# +# = lib/evo/apps/phylogeny_factory - PhylogenyFactory class +# +# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: phylogeny_factory.rb,v 1.32 2010/12/13 19:00:11 cmzmasek Exp $ + +require 'lib/evo/util/constants' +require 'lib/evo/util/util' +require 'lib/evo/util/command_line_arguments' + +require 'set' +require 'date' + +module Evoruby + + class PhylogenyFactory + + PRG_NAME = "phylogeny_factory" + PRG_DATE = "2010.05.26" + PRG_DESC = "automated phylogeny reconstruction using queing system" + PRG_VERSION = "1.1" + COPYRIGHT = "2010 Christian M Zmasek" + CONTACT = "phylosoft@gmail.com" + WWW = "www.phylosoft.org" + + USE_JOB_SUBMISSION_SYSTEM_OPTION = 's' + LOG_FILE = '00_phylogeny_factory.log' + TEMPLATE_FILE = '00_phylogeny_factory.template' + PBS_O_WORKDIR = '$PBS_O_WORKDIR/' + MIN_LENGTH_DEFAULT = 40 + WALLTIME = '100:00:00' + QUEUE = 'default' + + TMP_CMD_FILE_SUFFIX = '_QSUB' + + HMM = 'HMM' + RSL = 'RSL' + + OPTION_OPEN = '%[' + OPTION_CLOSE = ']%' + + WAIT = 1.0 + + NL = Constants::LINE_DELIMITER + + def run + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC, + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + allowed_opts = Array.new + allowed_opts.push( USE_JOB_SUBMISSION_SYSTEM_OPTION ) + + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, + "unknown option(s): " + disallowed, + STDOUT ) + end + + if File.exists?( LOG_FILE ) + puts( '[' + PRG_NAME + '] > log file [' + LOG_FILE + '] already exists' ) + exit( -1 ) + end + + if !File.exists?( TEMPLATE_FILE ) + puts( '[' + PRG_NAME + '] > template file [' + TEMPLATE_FILE + '] not found' ) + exit( -1 ) + end + + use_job_submission_system = false + if cla.is_option_set?( USE_JOB_SUBMISSION_SYSTEM_OPTION ) + use_job_submission_system = true + end + + log = String.new + + now = DateTime.now + log << "Program : " + PRG_NAME + NL + log << "Version : " + PRG_VERSION + NL + log << "Program date: " + PRG_DATE + NL + NL + log << "Date/time : " + now.to_s + NL + log << "Directory : " + Dir.getwd + NL + NL + + puts( '[' + PRG_NAME + '] > reading ' + TEMPLATE_FILE ) + + paths = Hash.new # path placeholder -> full path + min_lengths = Hash.new # alignment id -> minimal length + hmms = Hash.new # alignment id -> hmm + options = Hash.new # option placeholder -> option + ids = Set.new + + commands = Array.new + + log << "////////////////////////////////////////////////////////////////// #{NL}" + log << "Template file [" + TEMPLATE_FILE + "]:#{NL}" + + command = String.new + + open( TEMPLATE_FILE ).each { | line | + log << line + if ( line =~ /^#/ ) + elsif ( line =~ /^\$\s*(\S+)\s*=\s*(\S+)/ ) + paths[ $1 ] = $2 + puts( '[' + PRG_NAME + '] > paths : ' + $1 + ' => ' + $2 ) + + elsif ( line =~ /^%\s*#{HMM}\s*(\S+)\s*=\s*(\S+)/ ) + hmms[ $1 ] = $2 + puts( '[' + PRG_NAME + '] > hmms : ' + $1 + ' => ' + $2 ) + + elsif ( line =~ /^%\s*#{RSL}\s*(\S+)\s*=\s*(\S+)/ ) + min_lengths[ $1 ] = $2 + puts( '[' + PRG_NAME + '] > min lengths: ' + $1 + ' => ' + $2 ) + + elsif ( line =~ /^%\s*(\S+)\s*=\s*(\S+)/ ) + options[ $1 ] = $2 + puts( '[' + PRG_NAME + '] > options : ' + $1 + ' => ' + $2 ) + + elsif ( line =~ /^>\s*(.+)/ ) + command = command + $1 + ";#{NL}" + + elsif ( line =~ /^-/ ) + commands << prepare( command, paths ) + command = String.new + end + } + log << "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ #{NL}#{NL}" + + files = Dir.entries( "." ) + + files.each { | file | + if ( !File.directory?( file ) && + file !~ /^\./ && + file !~ /#{TEMPLATE_FILE}/ && + file !~ /.bck$/ && + file !~ /.log$/ && + file !~ /nohup/ && + file !~ /^00/ ) + aln_name = file.to_str + id = get_id( aln_name ) + if !ids.include?( id ) + ids.add( id ) + end + puts( '[' + PRG_NAME + '] > file [id] : ' + aln_name + ' [' + id + ']' ) + commands.each do | cmd | + + cmd = subst_hmm( cmd, aln_name, hmms ) + cmd = subst_min_length( cmd, aln_name, min_lengths ) + cmd = subst_options( cmd, options ) + if use_job_submission_system + cmd = subst_aln_name( cmd, PBS_O_WORKDIR + aln_name ) + else + cmd = subst_aln_name( cmd, aln_name ) + end + + if ( cmd =~ /%/ ) + cmd =~ /(%.*?%)/ + problem = $1 + puts( '[' + PRG_NAME + '] > WARNING : [' + id + '] command still contains placeholder: ' + problem ) + log << "WARNING: command still contains placeholder: " + cmd + NL + else + tmp_cmd_file = file.to_str[ 0..4 ] + TMP_CMD_FILE_SUFFIX + if ( File.exists?( tmp_cmd_file ) ) + File.delete( tmp_cmd_file ) + end + if use_job_submission_system + open( tmp_cmd_file, 'w' ) do |f| + f.write( cmd ) + end + end + + log << cmd + NL + + if use_job_submission_system + IO.popen( 'qsub -q ' + QUEUE + ' -l walltime=' + WALLTIME + ' ' + tmp_cmd_file , 'r+' ) do | pipe | + pipe.close_write + end + else + spawn( 'nohup ' + cmd + ' &', STDERR => "/dev/null" ) + end + + sleep( WAIT ) + if ( File.exists?( tmp_cmd_file ) ) + File.delete( tmp_cmd_file ) + end + end + end + end + } + + open( LOG_FILE, 'w' ) do | f | + f.write( log ) + end + + puts() + puts( '[' + PRG_NAME + '] > OK' ) + puts() + + end # def run + + def prepare( command, paths ) + paths.each_pair{ | name, full | + command = command.gsub( name, full ) + } + command + end + + def subst_options( command, options ) + opt_placeholders = command.scan( /%\[\S+\]%/ ) + opt_placeholders.each { | opt_placeholder | + opt_placeholder = opt_placeholder.gsub( OPTION_OPEN , '' ) + opt_placeholder = opt_placeholder.gsub( OPTION_CLOSE, '' ) + opt_value = options[ opt_placeholder ] + if ( opt_value != nil && opt_value.size > 0 ) + command = command.gsub( OPTION_OPEN + opt_placeholder + OPTION_CLOSE, opt_value ) + end + } + command + end + + def subst_aln_name( command, aln_name ) + command = command.gsub( '$', aln_name ) + command + end + + def subst_hmm( command, aln_name, hmms ) + id = get_id( aln_name ) + hmm = hmms[ id ] + if ( hmm != nil && hmm.size > 0 ) + command = command.gsub( OPTION_OPEN + HMM + OPTION_CLOSE, hmm ) + end + command + end + + def subst_min_length( command, aln_name, min_lengths ) + id = get_id( aln_name ) + min_length = min_lengths[ id ] + if ( min_length != nil && min_length.size > 0 ) + command = command.gsub( OPTION_OPEN + RSL + OPTION_CLOSE, min_length ) + else + command = command.gsub( OPTION_OPEN + RSL + OPTION_CLOSE, MIN_LENGTH_DEFAULT.to_s ) + end + command + end + + def get_id( aln_name ) + aln_name =~ /^([^_]+)/ + $1 + end + + end # class PhylogenyFactory + +end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb b/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb new file mode 100644 index 0000000..d316f85 --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb @@ -0,0 +1,317 @@ +# +# = lib/evo/apps/taxonomy_processor - TaxonomyProcessor class +# +# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: taxonomy_processor.rb,v 1.26 2010/12/13 19:00:11 cmzmasek Exp $ + + +require 'lib/evo/util/util' +require 'lib/evo/msa/msa_factory' +require 'lib/evo/msa/msa' +require 'lib/evo/io/msa_io' +require 'lib/evo/io/parser/fasta_parser' +require 'lib/evo/io/parser/general_msa_parser' +require 'lib/evo/io/writer/fasta_writer' +require 'lib/evo/io/writer/phylip_sequential_writer' +require 'lib/evo/util/command_line_arguments' + +module Evoruby + + class TaxonomyProcessor + + PRG_NAME = "tap" + PRG_DATE = "2012.09.27" + PRG_DESC = "replacement of species names in multiple sequence files" + PRG_VERSION = "1.02" + COPYRIGHT = "2012 Christian M Zmasek" + CONTACT = "phylosoft@gmail.com" + WWW = "www.phylosoft.org" + + SIMPLE = true + + EXTRACT_TAXONOMY_OPTION = "t" + + def initialize() + @taxonomies = Hash.new() + end + + def run() + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC, + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + if ( ARGV == nil || ( ARGV.length != 1 && ARGV.length != 3 && ARGV.length != 4 && ARGV.length != 5 && ARGV.length != 6 ) ) + puts( "Usage: #{PRG_NAME}.rb [options] [input map file] [output sequences] [output id list]" ) + puts() + puts( " options: -" + EXTRACT_TAXONOMY_OPTION + ": to extract taxonomy information from bracketed expression" ) + puts() + exit( -1 ) + end + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + + mapfile = nil + input = nil + output = nil + list_file = nil + + + + if cla.get_number_of_files == 4 + mapfile = cla.get_file_name( 0 ) + input = cla.get_file_name( 1 ) + output = cla.get_file_name( 2 ) + list_file = cla.get_file_name( 3 ) + elsif cla.get_number_of_files == 3 + input = cla.get_file_name( 0 ) + output = cla.get_file_name( 1 ) + list_file = cla.get_file_name( 2 ) + elsif cla.get_number_of_files == 1 + input = cla.get_file_name( 0 ) + i = nil + if input.downcase.end_with?( ".fasta" ) + i = input[ 0 .. input.length - 7 ] + elsif input.downcase.end_with?( ".fsa" ) + i = input[ 0 .. input.length - 5 ] + else + i = input + end + output = i + "_ni.fasta" + list_file = i + ".nim" + end + + + allowed_opts = Array.new + allowed_opts.push( EXTRACT_TAXONOMY_OPTION ) + + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed ) + end + + extract_taxonomy = false + if ( cla.is_option_set?( EXTRACT_TAXONOMY_OPTION ) ) + extract_taxonomy = true + end + + if ( File.exists?( output ) ) + Util.fatal_error( PRG_NAME, "outfile [" + output + "] already exists" ) + end + if ( File.exists?( list_file ) ) + Util.fatal_error( PRG_NAME, "list file [" + list_file + "] already exists" ) + end + if ( !File.exists?( input) ) + Util.fatal_error( PRG_NAME, "infile [" + input + "] does not exist" ) + end + if ( mapfile != nil && !File.exists?( mapfile ) ) + Util.fatal_error( PRG_NAME, "mapfile [" + mapfile + "] does not exist" ) + end + + fasta_like = Util.looks_like_fasta?( input ) + + puts() + if mapfile != nil + puts( "Map file : " + mapfile ) + end + puts( "Input alignment : " + input ) + puts( "Output alignment: " + output ) + puts( "Name list : " + list_file ) + if ( fasta_like ) + puts( "Format : Fasta" ) + else + puts( "Format : Phylip like" ) + end + if ( extract_taxonomy ) + puts( "Extract taxonomy: true" ) + end + puts() + + species_map = Hash.new + if mapfile != nil + File.open( mapfile ) do | file | + while line = file.gets + if ( line =~/(.+)#(.+)/ || line =~/(.+)\s+(.+)/ ) + species_map[ $1 ] = $2 + Util.print_message( PRG_NAME, "mapping: " + $1 + ' => ' + $2 ) + end + end + end + end + + f = MsaFactory.new() + begin + if ( fasta_like ) + msa = f.create_msa_from_file( input, FastaParser.new() ) + else + msa = f.create_msa_from_file( input, GeneralMsaParser.new() ) + end + rescue Exception => e + Util.fatal_error( PRG_NAME, "failed to read file: " + e.to_s ) + end + + if ( msa == nil || msa.get_number_of_seqs() < 1 ) + Util.fatal_error( PRG_NAME, "failed to read MSA" ) + end + begin + Util.check_file_for_writability( list_file ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_, STDOUT ) + end + + #removed = msa.remove_redundant_sequences!( true ) + #if removed.size > 0 + # Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" ) + # removed.each { | seq_name | + # puts seq_name + # } + # Util.print_message( PRG_NAME, "will process " + msa.get_number_of_seqs.to_s + " non redundant sequences" ) + #end + + lf = File.open( list_file, "a" ) + for i in 0 ... msa.get_number_of_seqs + seq = msa.get_sequence( i ) + seq.set_name( Util::normalize_seq_name( modify_name( seq.get_name(), i, lf, species_map, extract_taxonomy ), 10 ) ) + end + + io = MsaIO.new() + w = nil + if ( fasta_like ) + w = FastaWriter.new() + else + w = PhylipSequentialWriter.new() + end + w.set_max_name_length( 10 ) + w.clean( true ) + begin + io.write_to_file( msa, output, w ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s ) + end + lf.close() + if ( @taxonomies.length > 0 ) + Util.print_message( PRG_NAME, "number of unique taxonomies: " + @taxonomies.length.to_s ) + end + Util.print_message( PRG_NAME, "wrote: " + list_file ) + Util.print_message( PRG_NAME, "wrote: " + output ) + Util.print_message( PRG_NAME, "OK" ) + end + + private + + def modify_name( desc, counter, file, species_map, extract_taxonomy ) + new_desc = nil + my_species = nil + # if desc =~ /^>?\s*\S{1,10}_([0-9A-Z]{3,5})/ + if desc =~ /^>?\s*\S{1,10}_([A-Z]{3,5})/ + new_desc = counter.to_s( 16 ) + "_" + $1 + elsif SIMPLE + new_desc = counter.to_s( 16 ) + elsif extract_taxonomy + if ( desc.count( "[" ) != desc.count( "]" ) ) + Util.fatal_error( PRG_NAME, "illegal bracket count in: " + desc ) + end + species = nil + species_map.each_key do | key | + if desc =~ /[\b|_]#{key}\b/ # Added boundaries to prevent e.g. RAT matching ARATH. + species = species_map[ key ] + new_desc = counter.to_s( 16 ) + "_" + species + break + end + end + if species == nil + if desc =~/.*\[(\S{3,}?)\]/ + species = $1 + species.strip! + species.upcase! + species.gsub!( /\s+/, " " ) + species.gsub!( /-/, "" ) + species.gsub!( /\)/, "" ) + species.gsub!( /\(/, "" ) + species.gsub!( /\'/, "" ) + if species =~ /\S+\s\S+/ || species =~ /\S{3,5}/ + if species =~ /(\S+)\s(\S+)/ + code = $1[ 0..2 ] + $2[ 0..1 ] + elsif species =~ /\S{3,5}/ + code = species + elsif species.count( " " ) > 2 + species =~ /(\S+)\s+(\S+)\s+(\S+)$/ + third_last = $1 + second_last = $2 + last = $3 + code = code[ 0 ] + third_last[ 0 ] + second_last[ 0 ] + last[ 0 ] + last[ last.size - 1 ] + elsif species.count( " " ) > 1 + species =~ /(\S+)\s+(\S+)$/ + second_last = $1 + last = $2 + code = code[ 0..1 ] + second_last[ 0 ] + last[ 0 ] + last[ last.size - 1 ] + end + new_desc = counter.to_s( 16 ) + "_" + code + if @taxonomies.has_key?( code ) + if ( !@taxonomies.has_value?( species ) ) + Util.fatal_error( PRG_NAME, "code [#{code}] is not unique in [#{desc}]" ) + end + else + if ( @taxonomies.has_value?( species ) ) + Util.fatal_error( PRG_NAME, "genome [#{species}] is not unique in [#{desc}]" ) + else + @taxonomies[ code ] = species + end + end + else + Util.fatal_error( PRG_NAME, "illegal format [#{species}] in: " + desc ) + end + else + Util.fatal_error( PRG_NAME, "illegal format in: " + desc ) + end + end + else + species = nil + my_species = nil + species_map.each_key do | key | + if desc =~ /#{key}/ + species = species_map[ key ] + species = species.gsub( /\s+/, "" ) + species = species.gsub( /_/, " " ) + my_species = species + if species =~ /(\S+)\s+(\S+)/ + species = $1[0..2] + $2[0..1] + end + species = species.gsub( /\s+/, "" ) + species = species.slice(0, 5) + species.upcase! + break + end + end + if species == nil + Util.fatal_error( PRG_NAME, "species not found in: " + desc ) + else + new_desc = counter.to_s( 16 ) + "_" + species + end + end + if new_desc == nil + Util.fatal_error( PRG_NAME, "failed to extract species from: " + desc ) + end + if my_species != nil + file.print( new_desc + ": " + desc + " [" + my_species + "]" + "\n" ) + else + file.print( new_desc + ": " + desc + "\n" ) + end + new_desc + end + + end # class TaxonomyProcessor + +end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/tool/tseq_taxonomy_processor.rb b/forester/ruby/evoruby/lib/evo/tool/tseq_taxonomy_processor.rb new file mode 100644 index 0000000..f708247 --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/tool/tseq_taxonomy_processor.rb @@ -0,0 +1,190 @@ +# +# = lib/evo/apps/tseq_taxonomy_processor - TseqTaxonomyProcessor class +# +# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: tseq_taxonomy_processor.rb,v 1.6 2010/12/13 19:00:11 cmzmasek Exp $ + + +require 'lib/evo/util/util' +require 'lib/evo/msa/msa_factory' +require 'lib/evo/msa/msa' +require 'lib/evo/io/msa_io' +require 'lib/evo/io/parser/sp_taxonomy_parser' +require 'lib/evo/io/parser/ncbi_tseq_parser' +require 'lib/evo/io/writer/fasta_writer' +require 'lib/evo/io/writer/phylip_sequential_writer' +require 'lib/evo/util/command_line_arguments' + +module Evoruby + + class TseqTaxonomyProcessor + + PRG_NAME = "tseq_tap" + PRG_DATE = "2009.01.06" + PRG_DESC = "preprocessing of multiple sequence files in ncbi tseq xml format" + PRG_VERSION = "1.02" + COPYRIGHT = "2009 Christian M Zmasek" + CONTACT = "phylosoft@gmail.com" + WWW = "www.phylosoft.org" + + TAXONOMY_CODE = "TAXONOMY_CODE:" + TAXONOMY_ID = "TAXONOMY_ID:" + TAXONOMY_ID_TYPE = "TAXONOMY_ID_TYPE:" + TAXONOMY_SN = "TAXONOMY_SN:" + TAXONOMY_CN = "TAXONOMY_CN:" + SEQ_ACCESSION = "SEQ_ACCESSION:" + SEQ_ACCESSION_SOURCE = "SEQ_ACCESSION_SOURCE:" + SEQ_SECONDARY_ACCESSION = "SEQ_SECONDARY_ACCESSION:" + SEQ_SYMBOL = "SEQ_SYMBOL:" + SEQ_NAME = "SEQ_NAME:" + SEQ_MOL_SEQ = "SEQ_MOL_SEQ:" + + def initialize() + @tax_ids_to_sp_taxonomies = Hash.new() + end + + def run() + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC, + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + if ARGV == nil || ARGV.length != 4 + puts( "Usage: #{PRG_NAME}.rb " ) + puts() + + exit( -1 ) + end + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + allowed_opts = Array.new + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed ) + end + + sp_taxonomy_infile = cla.get_file_name( 0 ) + sequences_infile = cla.get_file_name( 1 ) + sequences_outfile = cla.get_file_name( 2 ) + mapping_outfile = cla.get_file_name( 3 ) + + Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile ) + Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile ) + Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile ) + Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile ) + + sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile ) + + Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile ) + + tseq_parser = NcbiTSeqParser.new + msa_fac = MsaFactory.new + + seqs = msa_fac.create_msa_from_file( sequences_infile, tseq_parser ) + + Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile ) + + removed = seqs.remove_redundant_sequences!( true, true ) + + if removed.size > 0 + Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" ) + removed.each { | seq_name | + puts seq_name + } + Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" ) + end + + mapping_out = File.open( mapping_outfile, "a" ) + + for i in 0 ... seqs.get_number_of_seqs + seq = seqs.get_sequence( i ) + if seq.get_taxonomy == nil + Util.fatal_error( PRG_NAME, "sequence [" + seq.get_name + "] has no taxonomy information" ) + end + seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) ) + end + + io = MsaIO.new() + + w = FastaWriter.new() + + w.set_max_name_length( 10 ) + w.clean( true ) + begin + io.write_to_file( seqs, sequences_outfile, w ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s ) + end + mapping_out.close() + + Util.print_message( PRG_NAME, "wrote: " + mapping_outfile ) + Util.print_message( PRG_NAME, "wrote: " + sequences_outfile ) + Util.print_message( PRG_NAME, "OK" ) + + end + + private + + def modify_name( seq, i, sp_taxonomies, mapping_outfile ) + + tax_id = seq.get_taxonomy.get_id + matching_sp_taxonomy = nil + + if @tax_ids_to_sp_taxonomies.has_key?( tax_id ) + # This is so that a second lookup will be much faster. + matching_sp_taxonomy = @tax_ids_to_sp_taxonomies[ tax_id ] + else + sp_taxonomies.each { |sp_taxonomy| + if ( sp_taxonomy.id == tax_id ) + if matching_sp_taxonomy != nil + Util.fatal_error( PRG_NAME, "taxonomy id [" + tax_id.to_s + "] is not unique" ) + end + matching_sp_taxonomy = sp_taxonomy + @tax_ids_to_sp_taxonomies[ tax_id ] = sp_taxonomy + end + } + end + if matching_sp_taxonomy == nil + Util.fatal_error( PRG_NAME, "taxonomy id [" + tax_id.to_s + "] for [" + seq.get_taxonomy.get_name + "] not found" ) + end + + new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code + + seq_name = seq.get_name + if seq_name =~ /\[.+\]$/ + # Redundant taxonomy information hides here. + seq_name = seq_name.sub(/\[.+\]$/, '') + end + if seq_name =~ /^\s*hypothetical\s+protein\s*/i + # Pointless information. + seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' ) + end + + mapping_outfile.print( new_name + "\t" + + TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" + + TAXONOMY_ID + tax_id + "\t" + + TAXONOMY_ID_TYPE + seq.get_taxonomy.get_id_source + "\t" + + TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" + + SEQ_ACCESSION + seq.get_accession + "\t" + + SEQ_ACCESSION_SOURCE + seq.get_accession_source + "\t" + + SEQ_SYMBOL + seq.get_symbol + "\t" + + SEQ_NAME + seq_name + "\t" + + SEQ_MOL_SEQ + seq.get_sequence_as_string + + Constants::LINE_DELIMITER ) + new_name + end + + end + +end # module Evoruby \ No newline at end of file -- 1.7.10.2