From: cmzmasek@gmail.com Date: Tue, 2 Oct 2012 18:39:44 +0000 (+0000) Subject: refactored X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=46341b10de4041b7b78d8de8356fac112ed18f96;p=jalview.git refactored --- diff --git a/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor.rb deleted file mode 100644 index efe6f6a..0000000 --- a/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor.rb +++ /dev/null @@ -1,268 +0,0 @@ -# -# = lib/evo/apps/domain_sequence_extractor.rb - DomainSequenceExtractor class -# -# Copyright:: Copyright (C) 2012 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id:Exp $ - - -require 'lib/evo/util/constants' -require 'lib/evo/util/util' -require 'lib/evo/util/command_line_arguments' -require 'lib/evo/io/parser/hmmscan_domain_extractor' - -module Evoruby - - class DomainSequenceExtractor - - PRG_NAME = "dsx" - PRG_VERSION = "2.000" - PRG_DESC = "extraction of domain sequences from hmmscan output" - PRG_DATE = "20121001" - COPYRIGHT = "2012 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" - - E_VALUE_THRESHOLD_OPTION = 'e' - LENGTH_THRESHOLD_OPTION = 'l' - ADD_POSITION_OPTION = 'p' - ADD_DOMAIN_NUMBER_OPTION = 'd' - ADD_SPECIES = 's' - MIN_LINKER_OPT = 'ml' - LOG_FILE_SUFFIX = '_domain_seq_extr.log' - PASSED_SEQS_SUFFIX = '_with_passing_domains.fasta' - FAILED_SEQS_SUFFIX = '_with_no_passing_domains.fasta' - HELP_OPTION_1 = 'help' - HELP_OPTION_2 = 'h' - - def run() - - Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC , - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) - - ld = Constants::LINE_DELIMITER - - begin - cla = CommandLineArguments.new( ARGV ) - rescue ArgumentError - Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT ) - end - - if ( cla.is_option_set?( HELP_OPTION_1 ) || - cla.is_option_set?( HELP_OPTION_2 ) ) - print_help - exit( 0 ) - end - - if ( cla.get_number_of_files != 4 ) - print_help - exit( -1 ) - end - - allowed_opts = Array.new - allowed_opts.push( E_VALUE_THRESHOLD_OPTION ) - allowed_opts.push( ADD_POSITION_OPTION ) - allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION ) - allowed_opts.push( LENGTH_THRESHOLD_OPTION ) - allowed_opts.push( ADD_SPECIES ) - allowed_opts.push( MIN_LINKER_OPT ) - - disallowed = cla.validate_allowed_options_as_str( allowed_opts ) - if ( disallowed.length > 0 ) - Util.fatal_error( PRG_NAME, - "unknown option(s): " + disallowed, - STDOUT ) - end - - domain_id = cla.get_file_name( 0 ) - hmmsearch_output = cla.get_file_name( 1 ) - fasta_sequence_file = cla.get_file_name( 2 ) - outfile = cla.get_file_name( 3 ) - - if outfile.downcase.end_with?( ".fasta" ) - outfile = outfile[ 0 .. outfile.length - 7 ] - elsif outfile.downcase.end_with?( ".fsa" ) - outfile = outfile[ 0 .. outfile.length - 5 ] - end - - - add_position = false - if ( cla.is_option_set?( ADD_POSITION_OPTION ) ) - add_position = true - end - - add_domain_number = false - if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) ) - add_domain_number = true - end - - add_species = false - if cla.is_option_set? ADD_SPECIES - add_species = true - end - - e_value_threshold = -1.0 - if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) ) - begin - e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION ) - rescue ArgumentError => e - Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - if ( e_value_threshold < 0.0 ) - Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT ) - end - end - - length_threshold = -1 - if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) ) - begin - length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION ) - rescue ArgumentError => e - Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - if ( length_threshold < 0) - Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT ) - end - end - - - min_linker = nil - if ( cla.is_option_set?( MIN_LINKER_OPT ) ) - begin - min_linker = cla.get_option_value_as_int( MIN_LINKER_OPT ) - rescue ArgumentError => e - Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - if ( !min_linker || min_linker > 100 || min_linker < -100 ) - Forester::Util.fatal_error( PRG_NAME, "unexpected value for min linker " + min_linker.to_s, STDOUT ) - end - end - - - log = String.new - - puts() - puts( "Domain : " + domain_id ) - log << "Domain : " + domain_id + ld - puts( "Hmmscan outputfile : " + hmmsearch_output ) - log << "Hmmscan outputfile : " + hmmsearch_output + ld - puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file ) - log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld - puts( "Outputfile : " + outfile + ".fasta" ) - log << "Outputfile : " + outfile + ld - puts( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX ) - log << "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX + ld - puts( "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX ) - log << "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX + ld - puts( "Logfile : " + outfile + LOG_FILE_SUFFIX ) - log << "Logfile : " + outfile + LOG_FILE_SUFFIX + ld - if ( e_value_threshold >= 0.0 ) - puts( "E-value threshold : " + e_value_threshold.to_s ) - log << "E-value threshold : " + e_value_threshold.to_s + ld - else - puts( "E-value threshold : no threshold" ) - log << "E-value threshold : no threshold" + ld - end - if ( length_threshold > 0 ) - puts( "Length threshold : " + length_threshold.to_s ) - log << "Length threshold : " + length_threshold.to_s + ld - else - puts( "Length threshold : no threshold" ) - log << "Length threshold : no threshold" + ld - end - if ( min_linker ) - puts( "Min linker : " + min_linker.to_s ) - log << "Min linker : " + min_linker.to_s + ld - - end - - - if ( add_position ) - puts( "Add positions (rel to complete seq) to extracted domains: true" ) - log << "Add positions (rel to complete seq) to extracted domains: true" + ld - else - puts( "Add positions (rel to complete seq) to extracted domains: false" ) - log << "Add positions (rel to complete seq) to extracted domains: false" + ld - end - - if ( add_domain_number ) - puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" ) - log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld - else - puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" ) - log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld - end - - puts - - domain_count = 0 - begin - parser = HmmscanDomainExtractor.new() - domain_count = parser.parse( domain_id, - hmmsearch_output, - fasta_sequence_file, - outfile, - outfile + PASSED_SEQS_SUFFIX, - outfile + FAILED_SEQS_SUFFIX, - e_value_threshold, - length_threshold, - add_position, - add_domain_number, - add_species, - min_linker, - log ) - rescue ArgumentError, IOError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - - rescue Exception => e - puts e.backtrace - Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT ) - - end - - puts - Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" ) - Util.print_message( PRG_NAME, "wrote; " + outfile + ".fasta") - Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX ) - Util.print_message( PRG_NAME, "wrote: " + outfile + PASSED_SEQS_SUFFIX ) - Util.print_message( PRG_NAME, "wrote: " + outfile + FAILED_SEQS_SUFFIX ) - - begin - f = File.open( outfile + LOG_FILE_SUFFIX, 'a' ) - f.print( log ) - f.close - rescue Exception => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - - puts - Util.print_message( PRG_NAME, "OK" ) - puts - - end - - def print_help() - puts() - puts( "Usage:" ) - puts() - puts( " " + PRG_NAME + ".rb [options] " ) - puts() - puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=: E-value threshold, default is no threshold" ) - puts( " -" + LENGTH_THRESHOLD_OPTION + "=: length threshold, default is no threshold" ) - puts( " -" + ADD_POSITION_OPTION + ": to add positions (rel to complete seq) to extracted domains" ) - puts( " -" + ADD_DOMAIN_NUMBER_OPTION + ": to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" ) - puts( " -" + ADD_SPECIES + ": to add species [in brackets]" ) - puts( " -" + MIN_LINKER_OPT + "=: to extract pairs of same domains with a distance inbetween shorter than a given value" ) - puts() - end - - end # class DomainSequenceExtractor - -end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/apps/domains_to_forester.rb b/forester/ruby/evoruby/lib/evo/apps/domains_to_forester.rb deleted file mode 100644 index 85af666..0000000 --- a/forester/ruby/evoruby/lib/evo/apps/domains_to_forester.rb +++ /dev/null @@ -1,261 +0,0 @@ -# -# = lib/evo/apps/domains_to_forester - DomainsToForester class -# -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: Exp $ -# -# last modified: 06/11/2007 - -require 'lib/evo/util/constants' -require 'lib/evo/util/util' -require 'lib/evo/util/command_line_arguments' -require 'lib/evo/msa/msa_factory' -require 'lib/evo/io/parser/fasta_parser' -require 'lib/evo/sequence/protein_domain' -require 'lib/evo/sequence/domain_structure' - -module Evoruby - - class DomainsToForester - - PRG_NAME = "d2f" - PRG_DESC = "parsed hmmpfam output to forester format" - PRG_VERSION = "1.001" - PRG_DATE = "20120807" - COPYRIGHT = "2012 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" - - E_VALUE_THRESHOLD_OPTION = "e" - OVERWRITE_IF_SAME_FROM_TO_OPTION = "o" - HELP_OPTION_1 = "help" - HELP_OPTION_2 = "h" - - def parse( domains_list_file, - original_seqs_file, - outfile, - column_delimiter, - e_value_threshold, - overwrite_if_same_from_to ) - Util.check_file_for_readability( domains_list_file ) - Util.check_file_for_readability( original_seqs_file ) - Util.check_file_for_writability( outfile ) - - domain_structures = Hash.new() # protein name is key, domain structure is value - - f = MsaFactory.new - - original_seqs = f.create_msa_from_file( original_seqs_file, FastaParser.new ) - if ( original_seqs.get_number_of_seqs < 1 ) - error_msg = "\"" + original_seqs_file + "\" appears devoid of sequences in fasta-format" - raise ArgumentError, error_msg - end - - File.open( domains_list_file ) do | file | - while line = file.gets - if !is_ignorable?( line ) - - a = line.split( column_delimiter ) - l = a.length - if ( ( l < 4 ) || ( e_value_threshold >= 0.0 && l < 5 ) ) - error_msg = "unexpected format at line: " + line - raise IOError, error_msg - end - protein_name = a[ 0 ] - domain_name = a[ 1 ] - seq_from = -1 - seq_to = -1 - ########################################## - if domain_name =~ /RRM_\d/ - puts "ignoring " + line - next - end - ########################################## - - - begin - seq_from = a[ 2 ].to_i - rescue Exception - error_msg = "failed to parse seq from from \"" + a[ 2 ] + "\" [line: " + line + "]" - raise IOError, error_msg - end - begin - seq_to = a[ 3 ].to_i - rescue Exception - error_msg = "failed to parse seq to from \"" + a[ 3 ] + "\" [line: " + line + "]" - raise IOError, error_msg - end - - e_value = -1 - if l > 4 - begin - e_value = a[ 4 ].to_f - rescue Exception - error_msg = "failed to parse E-value from \"" + a[ 4 ] + "\" [line: " + line + "]" - raise IOError, error_msg - end - end - - seq = original_seqs.get_by_name_start( protein_name ) - - total_length = seq.get_length - - if ( ( ( e_value_threshold < 0.0 ) || ( e_value <= e_value_threshold ) ) ) - pd = ProteinDomain.new( domain_name, seq_from, seq_to, "", e_value ) - ds = nil - if ( domain_structures.has_key?( protein_name ) ) - ds = domain_structures[ protein_name ] - else - ds = DomainStructure.new( total_length ) - domain_structures[ protein_name ] = ds - end - ds.add_domain( pd, overwrite_if_same_from_to ) - end - - end - end - end - - out = File.open( outfile, "a" ) - ds = domain_structures.sort - for d in ds - protein_name = d[ 0 ] - domain_structure = d[ 1 ] - out.print( protein_name.to_s ) - out.print( ": " ) - out.print( domain_structure.to_NHX ) - out.print( Constants::LINE_DELIMITER ) - end - - out.flush() - out.close() - - end # parse - - - - - def run() - - Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC, - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) - - begin - cla = CommandLineArguments.new( ARGV ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - - if ( cla.is_option_set?( HELP_OPTION_1 ) || - cla.is_option_set?( HELP_OPTION_2 ) ) - print_help - exit( 0 ) - end - - if cla.get_number_of_files != 3 - print_help - exit( -1 ) - end - - allowed_opts = Array.new - allowed_opts.push( E_VALUE_THRESHOLD_OPTION ) - allowed_opts.push( OVERWRITE_IF_SAME_FROM_TO_OPTION ) - - disallowed = cla.validate_allowed_options_as_str( allowed_opts ) - if ( disallowed.length > 0 ) - Util.fatal_error( PRG_NAME, - "unknown option(s): " + disallowed, - STDOUT ) - end - - domains_list_file = cla.get_file_name( 0 ) - original_sequences_file = cla.get_file_name( 1 ) - outfile = cla.get_file_name( 2 ) - - - e_value_threshold = -1.0 - if cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) - begin - e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - if ( e_value_threshold < 0.0 ) - Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT ) - end - end - overwrite_if_same_from_to = false - if ( cla.is_option_set?( OVERWRITE_IF_SAME_FROM_TO_OPTION ) ) - overwrite_if_same_from_to = true - end - - puts - puts( "Domains list file : " + domains_list_file ) - puts( "Fasta sequencefile (complete sequences): " + original_sequences_file ) - puts( "Outputfile : " + outfile ) - if ( e_value_threshold >= 0.0 ) - puts( "E-value threshold : " + e_value_threshold.to_s ) - else - puts( "E-value threshold : no threshold" ) - end - if ( overwrite_if_same_from_to ) - puts( "Overwrite if same from and to : true" ) - else - puts( "Overwrite if same from and to : false" ) - end - - puts - - begin - parse( domains_list_file, - original_sequences_file, - outfile, - " ", - e_value_threshold, - overwrite_if_same_from_to ) - - rescue ArgumentError, IOError, StandardError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - rescue Exception => e - Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT ) - end - - - puts - Util.print_message( PRG_NAME, 'OK' ) - puts - - end - - private - - def print_help() - puts - puts( "Usage:" ) - puts - puts( " " + PRG_NAME + ".rb [options] " ) - puts() - puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "= : E-value threshold, default is no threshold" ) - puts( " -" + OVERWRITE_IF_SAME_FROM_TO_OPTION + " : overwrite domain with same start and end with domain with better E-value" ) - puts - end - - - - def is_ignorable?( line ) - return ( line !~ /[A-Za-z0-9-]/ || line =~ /^\s*#/) - end - - - end # class DomainsToForester - - -end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/apps/evo_nursery.rb b/forester/ruby/evoruby/lib/evo/apps/evo_nursery.rb deleted file mode 100755 index e6f653f..0000000 --- a/forester/ruby/evoruby/lib/evo/apps/evo_nursery.rb +++ /dev/null @@ -1,317 +0,0 @@ -# -# = lib/evo/apps/evo_nursery.rb - EvoNursery class -# -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: evo_nursery.rb,v 1.11 2010/12/13 19:00:11 cmzmasek Exp $ - - - -require 'lib/evo/soft/fastme' -require 'lib/evo/soft/tree_puzzle' -require 'lib/evo/util/constants' -require 'lib/evo/util/util' -require 'lib/evo/util/command_line_arguments' -require 'lib/evo/msa/msa_factory' -require 'lib/evo/io/msa_io' -require 'lib/evo/io/writer/phylip_sequential_writer' -require 'lib/evo/io/parser/general_msa_parser' -require 'lib/evo/io/writer/msa_writer' - -require 'iconv' - -module Evoruby - - class EvoNursery - GAP_RATIO = 0.50 - GAP_RATIO_FOR_SEQS = 0.75 - MIN_LENGTH = 30 - MIN_SEQS = 4 - MAX_SEQS = 3000 - MAX_ALN_FILE_SIZE = 5000000 - MODEL = :auto - RATES = :uniform - FASTME_INITIAL_TREE = :GME - ALN_NAME = '_align_' - TREE_PUZZLE_OUTDIST = TreePuzzle::OUTDIST - TREE_PUZZLE_OUTFILE = TreePuzzle::OUTFILE - FASTME_OUTTREE = FastMe::OUTTREE - FASTME_OUTPUT_D = FastMe::OUTPUT_D - - PRG_NAME = "evo_nursery" - PRG_DATE = "2012.03.21" - PRG_DESC = "pfam alignments to evolutionary trees" - PRG_VERSION = "0.20" - COPYRIGHT = "2009-2012 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" - - HELP_OPTION_1 = "help" - HELP_OPTION_2 = "h" - - def run - - Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC, - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) - - if RUBY_VERSION !~ /1.9/ - puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " ) - exit( -1 ) - end - - forester_home = Util.get_env_variable_value( Constants::FORESTER_HOME_ENV_VARIABLE ) - java_home = Util.get_env_variable_value( Constants::JAVA_HOME_ENV_VARIABLE ) - decorator = java_home + '/bin/java -cp ' + forester_home + '/java/forester.jar org.forester.application.decorator' - - if ( ARGV == nil || ARGV.length != 1 ) - help - exit( -1 ) - end - - begin - cla = CommandLineArguments.new( ARGV ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - - if ( cla.is_option_set?( HELP_OPTION_1 ) || - cla.is_option_set?( HELP_OPTION_2 ) ) - help - exit( 0 ) - end - - output_dir = cla.get_file_name( 0 ) - - if output_dir !~ /\/$/ - output_dir = output_dir + '/' - end - - if !File.exists?( output_dir ) - Util.fatal_error( PRG_NAME, output_dir.to_s + " does not exist", STDOUT ) - end - ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' ) - files = Dir.entries( "." ) - skipped = Array.new - counter = 1 - analyzed = 0; - begin - files.each { |pfam_aln_file| - if ( !File.directory?( pfam_aln_file ) && - pfam_aln_file !~ /^\./ && - pfam_aln_file !~ /.+\.tre$/ ) - - tree_out_file = output_dir + File.basename( pfam_aln_file ) + ".xml" - - if File.exists?( tree_out_file ) - puts - puts - puts "***** skipping " + File.basename( pfam_aln_file ) + ", already exists" - puts - skipped.push( File.basename( pfam_aln_file ) + " [already exists]" ) - next - end - - puts - puts counter.to_s + ": " + pfam_aln_file.to_str - counter += 1 - if File.size( pfam_aln_file ) > MAX_ALN_FILE_SIZE - puts "***** skipping, file size: " + File.size( pfam_aln_file ).to_s - skipped.push( File.basename( pfam_aln_file ) + " [file size: " + File.size( pfam_aln_file ).to_s + "]" ) - next - end - - f = MsaFactory.new() - msa = f.create_msa_from_file( pfam_aln_file, GeneralMsaParser.new() ) - - if msa.get_number_of_seqs < MIN_SEQS || msa.get_number_of_seqs > MAX_SEQS - puts "***** skipping, seqs: " + msa.get_number_of_seqs.to_s - skipped.push( File.basename( pfam_aln_file ) + " [seqs: " + msa.get_number_of_seqs.to_s + "]" ) - next - end - - msa.remove_gap_columns_w_gap_ratio!( GAP_RATIO ) - - length = msa.get_length - if length < MIN_LENGTH - puts "***** skipping, length: " + length.to_s - skipped.push( File.basename( pfam_aln_file ) + " [length: " + length.to_s + "]" ) - next - end - - msa.remove_sequences_by_gap_ratio!( GAP_RATIO_FOR_SEQS ) - - if msa.get_number_of_seqs < MIN_SEQS - puts "***** skipping, seqs: " + msa.get_number_of_seqs.to_s - skipped.push( File.basename( pfam_aln_file ) + " [seqs: " + msa.get_number_of_seqs.to_s + "]" ) - next - end - - map_file = output_dir + File.basename( pfam_aln_file ) + ".map" - f = File.open( map_file, 'a' ) - for i in 0 ... msa.get_number_of_seqs - name = msa.get_sequence( i ).get_name() - name =~ /(.+)_(.+)\/.+/ - acc = $1 - tax_code = $2 - - mapping_str = i.to_s - mapping_str << "\t" - mapping_str << 'TAXONOMY_CODE:' - mapping_str << tax_code - mapping_str << "\t" - mapping_str << 'SEQ_SYMBOL:' - mapping_str << ( acc + '_' + tax_code ) - mapping_str << "\t" - if ( acc.length < 6 ) - acc = acc + '_' + tax_code - end - mapping_str << 'SEQ_ACCESSION:' - mapping_str << acc - mapping_str << "\t" - mapping_str << 'SEQ_ACCESSION_SOURCE:UniProtKB' - mapping_str << "\t" - mapping_str << 'NODE_NAME:' - mapping_str << name - f.print( mapping_str ) - f.print( "\n" ) - name = msa.get_sequence( i ).set_name( i.to_s ) - end - f.close - - io = MsaIO.new() - w = MsaWriter - w = PhylipSequentialWriter.new() - w.clean( true ) - w.set_max_name_length( 10 ) - if File.exists?( output_dir + ALN_NAME ) - File.unlink( output_dir + ALN_NAME ) - end - io.write_to_file( msa, output_dir + ALN_NAME, w ) - - tp = TreePuzzle.new() - tp.run( output_dir + ALN_NAME, - MODEL, - RATES, - msa.get_number_of_seqs ) - - File.rename( output_dir + ALN_NAME, output_dir + File.basename( pfam_aln_file ) + ".aln" ) - - fastme = FastMe.new() - fastme.run( TREE_PUZZLE_OUTDIST, 0, FASTME_INITIAL_TREE ) - - pfam_acc = nil - pfam_de = nil - File.open( pfam_aln_file ) do |file| - while line = file.gets - line = ic.iconv( line ) - if line =~ /^#=AC\s+(.+)/ - pfam_acc = $1 - end - if line =~ /^#=DE\s+(.+)/ - pfam_de = $1 - end - if pfam_acc && pfam_de - break - end - end - end - if !pfam_acc || !pfam_de - Util.fatal_error( PRG_NAME, "problem with " + pfam_aln_file.to_s, STDOUT ) - end - - puzzle_model = nil - File.open( TREE_PUZZLE_OUTFILE ) do |file| - while line = file.gets - line = ic.iconv( line ) - if line =~ /^Model\s+of\s+substitution:\s+(.+)/ - puzzle_model = $1 - break - end - end - end - if !puzzle_model - Util.fatal_error( PRG_NAME, "problem with puzzle outfile: " + TREE_PUZZLE_OUTFILE.to_s, STDOUT ) - end - - desc = pfam_de - desc << ' | ' - desc << 'ML pwd estimation by TREE-PUZZLE version ' - desc << TreePuzzle::VERSION - desc << ', model: ' - desc << puzzle_model - desc << ', rates: ' - desc << RATES.to_s - desc << '; tree estimation by FastME version ' - desc << FastMe::VERSION - desc << ', initial tree: ' - desc << FASTME_INITIAL_TREE.to_s - desc << '; aln length: ' - desc << msa.get_length.to_s - - cmd = decorator + " -table -p -pn=\"" + pfam_aln_file + - "\" -pi=pfam:" + pfam_acc + - " -pd=\"" + desc + "\" " + - FASTME_OUTTREE + ' ' + - map_file + ' ' + tree_out_file - - IO.popen( cmd , 'r+' ) do | pipe | - pipe.close_write - end - analyzed += 1 - - File.unlink( map_file ) - File.unlink(TREE_PUZZLE_OUTDIST) - File.unlink( TREE_PUZZLE_OUTFILE ) - File.unlink( FASTME_OUTPUT_D ) - end - } - rescue ArgumentError, IOError, StandardError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - - puts() - puts( 'Skipped:' ) - puts() - for i in 0 ... skipped.size - puts i.to_s + ": " + skipped[ i ] - end - - puts() - puts( 'Skipped : ' + skipped.size.to_s + ' alignments' ) - puts( 'Analyzed: ' + analyzed.to_s + ' alignments' ) - - puts( 'Min gap ratio for col del : ' + GAP_RATIO.to_s ) - puts( 'Min gap ratio for seq del : ' + GAP_RATIO_FOR_SEQS.to_s ) - puts( 'Minimal aln length : ' + MIN_LENGTH.to_s ) - puts( 'Minimal number of sequences: ' + MIN_SEQS.to_s ) - puts( 'Maximal number of sequences: ' + MAX_SEQS.to_s ) - puts( 'Maximal aln file size : ' + MAX_ALN_FILE_SIZE.to_s ) - puts( 'Model : ' + MODEL.to_s ) - puts( 'FastME initial tree: ' + FASTME_INITIAL_TREE.to_s ) - - puts() - puts( '[' + PRG_NAME + '] > OK' ) - puts() - - end # run - - private - - def help - puts( "Usage:" ) - puts() - puts( " " + PRG_NAME + ".rb " ) - puts() - end - - - end # class EvoNursery - -end # module Evoruby \ No newline at end of file diff --git a/forester/ruby/evoruby/lib/evo/apps/fasta_extractor.rb b/forester/ruby/evoruby/lib/evo/apps/fasta_extractor.rb deleted file mode 100644 index b2d0d5c..0000000 --- a/forester/ruby/evoruby/lib/evo/apps/fasta_extractor.rb +++ /dev/null @@ -1,146 +0,0 @@ -# -# = lib/evo/apps/fasta_extractor.rb - FastaExtractor class -# -# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: fasta_extractor.rb,v 1.2 2010/12/13 19:00:11 cmzmasek Exp $ - - -require 'lib/evo/util/util' -require 'lib/evo/util/constants' -require 'lib/evo/util/command_line_arguments' - - -module Evoruby - - class FastaExtractor - - PRG_NAME = "fae" - PRG_VERSION = "1.0.0" - PRG_DESC = "extraction of nucleotide sequences from a fasta file by names from wublast search" - PRG_DATE = "2008.08.09" - COPYRIGHT = "2008-2009 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" - HELP_OPTION_1 = 'help' - HELP_OPTION_2 = 'h' - - - def run() - - Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC , - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) - - ld = Constants::LINE_DELIMITER - - begin - cla = CommandLineArguments.new( ARGV ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - - if ( cla.is_option_set?( HELP_OPTION_1 ) || - cla.is_option_set?( HELP_OPTION_2 ) ) - print_help - exit( 0 ) - end - - if ( cla.get_number_of_files != 3 ) - print_help - exit( -1 ) - end - - allowed_opts = Array.new - - disallowed = cla.validate_allowed_options_as_str( allowed_opts ) - if ( disallowed.length > 0 ) - Util.fatal_error( PRG_NAME, - "unknown option(s): " + disallowed, - STDOUT ) - end - - input_file = cla.get_file_name( 0 ) - names_file = cla.get_file_name( 1 ) - output_file = cla.get_file_name( 2 ) - - if !File.exist?( input_file ) - Util.fatal_error( PRG_NAME, "error: input file [#{input_file}] does not exist" ) - end - if !File.exist?( names_file ) - Util.fatal_error( PRG_NAME, "error: names file [#{names_file}] does not exist" ) - end - if File.exist?( output_file ) - Util.fatal_error( PRG_NAME, "error: [#{output_file }] already exists" ) - end - - names = extract_names_with_frames( names_file ) - - extract_sequences( names, input_file, output_file ) - - puts - Util.print_message( PRG_NAME, "OK" ) - puts - - end - - - def extract_names_with_frames( names_file ) - names = Hash.new() - File.open( names_file ) do | file | - while line = file.gets - if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) ) - if ( line =~ /(\S+)\s+([+|-]\d)\s+\d+\s+(\S+)/ ) - name = $1 - frame = $2 - e = $3 - names[ name ] = "[" + frame + "] [" + e + "]" - end - end - end - end - names - end - - def extract_sequences( names, fasta_file, output_file ) - output = File.open( output_file, "a" ) - matching_state = false - counter = 0 - File.open( fasta_file ) do | file | - while line = file.gets - if !Util.is_string_empty?( line ) - if ( line =~ /\s*>\s*(.+)/ ) - name = $1 - if names.has_key?( name ) - matching_state = true - counter += 1 - puts counter.to_s + ". " +name + " " + names[ name ] - output.print( ">" + name + " " + names[ name ] ) - output.print( Evoruby::Constants::LINE_DELIMITER ) - else - matching_state = false - end - elsif matching_state - output.print( line ) - end - end - end - end - output.close() - end - - def print_help() - puts( "Usage:" ) - puts() - puts( " " + PRG_NAME + ".rb " ) - puts() - end - - end # class FastaExtractor -end \ No newline at end of file diff --git a/forester/ruby/evoruby/lib/evo/apps/fasta_taxonomy_processor.rb b/forester/ruby/evoruby/lib/evo/apps/fasta_taxonomy_processor.rb deleted file mode 100644 index 6ae3cf1..0000000 --- a/forester/ruby/evoruby/lib/evo/apps/fasta_taxonomy_processor.rb +++ /dev/null @@ -1,205 +0,0 @@ -# -# = lib/evo/apps/fasta_taxonomy_processor - FastaTaxonomyProcessor class -# -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: fasta_taxonomy_processor.rb,v 1.4 2010/12/13 19:00:11 cmzmasek Exp $ - - -require 'lib/evo/util/util' -require 'lib/evo/msa/msa_factory' -require 'lib/evo/msa/msa' -require 'lib/evo/io/msa_io' -require 'lib/evo/io/parser/sp_taxonomy_parser' -require 'lib/evo/io/parser/fasta_parser' -require 'lib/evo/io/writer/fasta_writer' -require 'lib/evo/io/writer/phylip_sequential_writer' -require 'lib/evo/util/command_line_arguments' -require 'lib/evo/apps/tseq_taxonomy_processor' - -module Evoruby - - class FastaTaxonomyProcessor - - PRG_NAME = "fasta_tap" - PRG_DATE = "2009.01.20" - PRG_DESC = "preprocessing of multiple sequence files in ncbi fasta format" - PRG_VERSION = "1.00" - COPYRIGHT = "2009 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" - - def initialize() - @tax_ids_to_sp_taxonomies = Hash.new() - end - - def run() - - Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC, - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) - - if ARGV == nil || ARGV.length != 4 - puts( "Usage: #{PRG_NAME}.rb " ) - puts() - exit( -1 ) - end - - begin - cla = CommandLineArguments.new( ARGV ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - allowed_opts = Array.new - disallowed = cla.validate_allowed_options_as_str( allowed_opts ) - if ( disallowed.length > 0 ) - Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed ) - end - - sp_taxonomy_infile = cla.get_file_name( 0 ) - sequences_infile = cla.get_file_name( 1 ) - sequences_outfile = cla.get_file_name( 2 ) - mapping_outfile = cla.get_file_name( 3 ) - - Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile ) - Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile ) - Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile ) - Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile ) - - sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile ) - - Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile ) - - fasta_parser = FastaParser.new - msa_fac = MsaFactory.new - - seqs = msa_fac.create_msa_from_file( sequences_infile, fasta_parser ) - - Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile ) - - removed = seqs.remove_redundant_sequences!( true, true ) - - if removed.size > 0 - Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" ) - removed.each { | seq_name | - puts seq_name - } - Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" ) - end - - mapping_out = File.open( mapping_outfile, "a" ) - - for i in 0 ... seqs.get_number_of_seqs - seq = seqs.get_sequence( i ) - seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) ) - end - - io = MsaIO.new() - - w = FastaWriter.new() - - w.set_max_name_length( 10 ) - w.clean( true ) - begin - io.write_to_file( seqs, sequences_outfile, w ) - rescue Exception => e - Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s ) - end - mapping_out.close() - - Util.print_message( PRG_NAME, "wrote: " + mapping_outfile ) - Util.print_message( PRG_NAME, "wrote: " + sequences_outfile ) - Util.print_message( PRG_NAME, "OK" ) - - end - - private - - def modify_name( seq, i, sp_taxonomies, mapping_outfile ) - - #i = i + 1792 - - seq_desc = seq.get_name - - taxonomy_sn = nil - - if seq_desc =~ /\[(.+)\]/ - taxonomy_sn = $1 - else - Util.fatal_error( PRG_NAME, "no taxonomy in [" + seq_desc + "]" ) - end - - matching_sp_taxonomy = nil - - sp_taxonomies.each { |sp_taxonomy| - if ( sp_taxonomy.scientific_name == taxonomy_sn ) - matching_sp_taxonomy = sp_taxonomy - end - } - - if matching_sp_taxonomy == nil - Util.fatal_error( PRG_NAME, "taxonomy [" + taxonomy_sn + "] for [" + seq_desc + "] not found" ) - end - - new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code - - gi = nil - if seq_desc =~ /gi\|(.+?)\|/ - gi = $1 - else - Util.fatal_error( PRG_NAME, "no gi in [" + seq_desc + "]" ) - end - - seq_name = "" - - if seq_desc =~ /\|\s*([^|]+?)\s*\[/ - seq_name = $1 - end - - if seq_name =~ /\[.+\]$/ - # Redundant taxonomy information hides here. - seq_name = seq_name.sub(/\[.+\]$/, '') - end - if seq_name =~ /^\s*hypothetical\s+protein\s*/i - # Pointless information. - seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' ) - end - if seq_name =~ /^\s*conserved\s+hypothetical\s+protein\s*/i - # Pointless information. - seq_name = seq_name.sub( /^\s*conserved\s+hypothetical\s+protein\s*/i, '' ) - end - - if gi != nil - mapping_outfile.print( new_name + "\t" + - TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" + - TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" + - TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" + - TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" + - TseqTaxonomyProcessor::SEQ_ACCESSION + gi.to_s + "\t" + - TseqTaxonomyProcessor::SEQ_ACCESSION_SOURCE + "gi" + "\t" + - TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" + - TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string + - Constants::LINE_DELIMITER ) - else - mapping_outfile.print( new_name + "\t" + - TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" + - TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" + - TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" + - TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" + - TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" + - TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string + - Constants::LINE_DELIMITER ) - - end - new_name - end - - end - -end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/apps/hmmscan_parser.rb b/forester/ruby/evoruby/lib/evo/apps/hmmscan_parser.rb deleted file mode 100644 index c2f8177..0000000 --- a/forester/ruby/evoruby/lib/evo/apps/hmmscan_parser.rb +++ /dev/null @@ -1,265 +0,0 @@ -# -# = lib/evo/apps/hmmscan_parser.rb - HmmscanParser class -# -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $ -# -# last modified: 11/24/2009 - -require 'lib/evo/util/constants' -require 'lib/evo/util/util' -require 'lib/evo/util/command_line_arguments' - -module Evoruby - - class HmmscanParser - - PRG_NAME = "hsp" - PRG_VERSION = "1.0.1" - PRG_DESC = "hmmscan parser" - PRG_DATE = "2009.11.24" - COPYRIGHT = "2009 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" - - DELIMITER_OPTION = "d" - E_VALUE_THRESHOLD_OPTION = "e" - IGNORE_DUF_OPTION = "i" - PARSE_OUT_DESCRIPITION_OPTION = "a" - HELP_OPTION_1 = "help" - HELP_OPTION_2 = "h" - - def initialize - @domain_counts = Hash.new - end - - # raises ArgumentError, IOError - def parse( inpath, - outpath, - column_delimiter, - e_value_threshold, - ignore_dufs, - get_descriptions ) - Util.check_file_for_readability( inpath ) - Util.check_file_for_writability( outpath ) - - outfile = File.open( outpath, "a" ) - - query = String.new - desc = String.new - model = String.new - env_from = String.new - env_to = String.new - i_e_value = String.new - - queries_count = 0 - - nl = Constants::LINE_DELIMITER - - File.open( inpath ) do | file | - while line = file.gets - if !HmmscanParser.is_ignorable?( line ) && line =~ /^\S+\s+\S/ - - # tn acc tlen query acc qlen Evalue score bias # of c-E i-E score bias hf ht af at ef et acc desc - # 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 - line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/ - - model = $1 - query = $4 - i_e_value = $13.to_f - env_from = $20.to_i - env_to = $21.to_i - - if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) && - ( !ignore_dufs || ( model !~ /^DUF\d+/ ) ) ) - count_model( model ) - outfile.print( query + - column_delimiter ) - if ( get_descriptions ) - outfile.print( desc + - column_delimiter ) - end - outfile.print( model + - column_delimiter + - env_from.to_s + - column_delimiter + - env_to.to_s + - column_delimiter + - i_e_value.to_s ) - outfile.print( nl ) - end - end - end # while line = file.gets - end - outfile.flush() - outfile.close() - - return queries_count - - end # def parse - - def count_model( model ) - if ( @domain_counts.has_key?( model ) ) - count = @domain_counts[ model ].to_i - count += 1 - @domain_counts[ model ] = count - else - @domain_counts[ model ] = 1 - end - end - - - def get_domain_counts() - return @domain_counts - end - - def run() - - Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC, - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) - - begin - cla = CommandLineArguments.new( ARGV ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - - if ( cla.is_option_set?( HELP_OPTION_1 ) || - cla.is_option_set?( HELP_OPTION_2 ) ) - print_help - exit( 0 ) - end - - if ( cla.get_number_of_files != 2 ) - print_help - exit( -1 ) - end - - allowed_opts = Array.new - allowed_opts.push( DELIMITER_OPTION ) - allowed_opts.push( E_VALUE_THRESHOLD_OPTION ) - allowed_opts.push( IGNORE_DUF_OPTION ) - allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION ) - - disallowed = cla.validate_allowed_options_as_str( allowed_opts ) - if ( disallowed.length > 0 ) - Util.fatal_error( PRG_NAME, - "unknown option(s): " + disallowed, - STDOUT ) - end - - inpath = cla.get_file_name( 0 ) - outpath = cla.get_file_name( 1 ) - - column_delimiter = "\t" - if ( cla.is_option_set?( DELIMITER_OPTION ) ) - begin - column_delimiter = cla.get_option_value( DELIMITER_OPTION ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - end - - e_value_threshold = -1.0 - if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) ) - begin - e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - if ( e_value_threshold < 0.0 ) - Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT ) - end - end - - ignore_dufs = false - if ( cla.is_option_set?( IGNORE_DUF_OPTION ) ) - ignore_dufs = true - end - - parse_descriptions = false - if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) ) - parse_descriptions = true - end - - puts() - puts( "hmmpfam outputfile: " + inpath ) - puts( "outputfile : " + outpath ) - if ( e_value_threshold >= 0.0 ) - puts( "E-value threshold : " + e_value_threshold.to_s ) - else - puts( "E-value threshold : no threshold" ) - end - if ( parse_descriptions ) - puts( "parse descriptions: true" ) - else - puts( "parse descriptions: false" ) - end - if ( ignore_dufs ) - puts( "ignore DUFs : true" ) - else - puts( "ignore DUFs : false" ) - end - if ( column_delimiter == "\t" ) - puts( "column delimiter : TAB" ) - else - puts( "column delimiter : " + column_delimiter ) - end - puts() - - begin - queries_count = parse( inpath, - outpath, - column_delimiter, - e_value_threshold, - ignore_dufs, - parse_descriptions ) - rescue ArgumentError, IOError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - domain_counts = get_domain_counts() - - puts - puts( "read output for a total of " + queries_count.to_s + " query sequences" ) - puts - puts( "domain counts (considering potential E-value threshold and ignoring of DUFs):" ) - puts( "(number of different domains: " + domain_counts.length.to_s + ")" ) - puts - puts( Util.draw_histogram( domain_counts, "#" ) ) - puts - Util.print_message( PRG_NAME, 'OK' ) - puts - - end # def run() - - def print_help() - puts( "Usage:" ) - puts() - puts( " " + PRG_NAME + ".rb [options] " ) - puts() - puts( " options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" ) - puts( " -" + E_VALUE_THRESHOLD_OPTION + ": E-value threshold, default is no threshold" ) - puts( " -" + PARSE_OUT_DESCRIPITION_OPTION + ": parse query description (in addition to query name)" ) - puts( " -" + IGNORE_DUF_OPTION + ": ignore DUFs" ) - puts() - end - - - private - - - def HmmscanParser.is_ignorable?( line ) - return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ ) - end - - end # class HmmscanParser - -end # module Evoruby \ No newline at end of file diff --git a/forester/ruby/evoruby/lib/evo/apps/msa_processor.rb b/forester/ruby/evoruby/lib/evo/apps/msa_processor.rb deleted file mode 100644 index 6299417..0000000 --- a/forester/ruby/evoruby/lib/evo/apps/msa_processor.rb +++ /dev/null @@ -1,848 +0,0 @@ -# -# = lib/evo/apps/msa_processor.rb - MsaProcessor class -# -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: msa_processor.rb,v 1.33 2010/12/13 19:00:10 cmzmasek Exp $ -# - -require 'date' -require 'set' - -require 'lib/evo/util/constants' -require 'lib/evo/util/util' -require 'lib/evo/util/command_line_arguments' -require 'lib/evo/msa/msa_factory' -require 'lib/evo/io/msa_io' -require 'lib/evo/io/writer/phylip_sequential_writer' -require 'lib/evo/io/writer/nexus_writer' -require 'lib/evo/io/writer/fasta_writer' -require 'lib/evo/io/parser/fasta_parser' -require 'lib/evo/io/parser/general_msa_parser' -require 'lib/evo/io/writer/msa_writer' - -module Evoruby - - class MsaProcessor - - PRG_NAME = "msa_pro" - PRG_DATE = "2012.05.11" - PRG_DESC = "processing of multiple sequence alignments" - PRG_VERSION = "1.06" - COPYRIGHT = "2008-2010 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" - - - NAME_LENGTH_DEFAULT = 10 - WIDTH_DEFAULT_FASTA = 60 - INPUT_TYPE_OPTION = "i" - OUTPUT_TYPE_OPTION = "o" - MAXIMAL_NAME_LENGTH_OPTION = "n" - WIDTH_OPTION = "w" - CLEAN_UP_SEQ_OPTION = "c" - REM_RED_OPTION = "rem_red" - REMOVE_GAP_COLUMNS_OPTION = "rgc" - REMOVE_GAP_ONLY_COLUMNS = "rgoc" - REMOVE_COLUMNS_GAP_RATIO_OPTION = "rr" - REMOVE_ALL_GAP_CHARACTERS_OPTION = "rg" - REMOVE_ALL_SEQUENCES_LISTED_OPTION = "r" - KEEP_ONLY_SEQUENCES_LISTED_OPTION = "k" - - KEEP_MATCHING_SEQUENCES_OPTION = "mk" - REMOVE_MATCHING_SEQUENCES_OPTION = "mr" - - TRIM_OPTION = "t" - REMOVE_SEQS_GAP_RATIO_OPTION = "rsgr" - REMOVE_SEQS_NON_GAP_LENGTH_OPTION = "rsl" - SPLIT = "split" - LOG_SUFFIX = "_msa_pro.log" - HELP_OPTION_1 = "help" - HELP_OPTION_2 = "h" - - - def initialize() - @input_format_set = false - @output_format_set = false - @fasta_input = false - @phylip_input = true - @name_length = NAME_LENGTH_DEFAULT - @name_length_set = false - @width = WIDTH_DEFAULT_FASTA # fasta only - @pi_output = true - @fasta_output = false - @nexus_output = false - @clean = false # phylip only - @rgc = false - @rgoc = false - @rg = false # fasta only - @rem_red = false - @rgr = -1 - @rsgr = -1 - @rsl = -1 - @remove_matching = nil - @keep_matching = nil - - @seqs_name_file = nil - @remove_seqs = false - @keep_seqs = false - @trim = false - @split = -1 - @first = -1 - @last = -1 - end - - - def run() - - Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC, - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) - - if ( ARGV == nil || ARGV.length < 1 ) - Util.print_message( PRG_NAME, "Illegal number of arguments" ) - print_help - exit( -1 ) - end - - begin - cla = CommandLineArguments.new( ARGV ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "Error: " + e.to_s, STDOUT ) - end - - if ( cla.is_option_set?( HELP_OPTION_1 ) || - cla.is_option_set?( HELP_OPTION_2 ) ) - print_help - exit( 0 ) - end - - if ( cla.get_number_of_files != 2 || ARGV.length < 2 ) - Util.print_message( PRG_NAME, "Illegal number of arguments" ) - print_help - exit( -1 ) - end - - allowed_opts = Array.new - allowed_opts.push( INPUT_TYPE_OPTION ) - allowed_opts.push( OUTPUT_TYPE_OPTION ) - allowed_opts.push( MAXIMAL_NAME_LENGTH_OPTION ) - allowed_opts.push( WIDTH_OPTION ) - allowed_opts.push( CLEAN_UP_SEQ_OPTION ) - allowed_opts.push( REMOVE_GAP_COLUMNS_OPTION ) - allowed_opts.push( REMOVE_GAP_ONLY_COLUMNS ) - allowed_opts.push( REMOVE_COLUMNS_GAP_RATIO_OPTION ) - allowed_opts.push( REMOVE_ALL_GAP_CHARACTERS_OPTION ) - allowed_opts.push( REMOVE_ALL_SEQUENCES_LISTED_OPTION ) - allowed_opts.push( KEEP_ONLY_SEQUENCES_LISTED_OPTION ) - allowed_opts.push( TRIM_OPTION ) - allowed_opts.push( REMOVE_SEQS_GAP_RATIO_OPTION ) - allowed_opts.push( REMOVE_SEQS_NON_GAP_LENGTH_OPTION ) - allowed_opts.push( SPLIT ) - allowed_opts.push( REM_RED_OPTION ) - allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION ) - allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION ) - - disallowed = cla.validate_allowed_options_as_str( allowed_opts ) - if ( disallowed.length > 0 ) - Util.fatal_error( PRG_NAME, - "unknown option(s): " + disallowed ) - end - - input = cla.get_file_name( 0 ) - output = cla.get_file_name( 1 ) - - analyze_command_line( cla ) - - begin - Util.check_file_for_readability( input ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - - begin - Util.check_file_for_writability( output ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - - if ( @rg ) - set_pi_output( false ) - set_fasta_output( true ) - set_nexus_output( false ) - end - - if ( !@input_format_set ) - fasta_like = false - begin - fasta_like = Util.looks_like_fasta?( input ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - @fasta_input = fasta_like - @phylip_input = !fasta_like - if ( !@output_format_set ) - @fasta_output = fasta_like - @pi_output = !fasta_like - @nexus_output = false - end - end - - ld = Constants::LINE_DELIMITER - log = PRG_NAME + " " + PRG_VERSION + " [" + PRG_DATE + "]" + " LOG" + ld - now = DateTime.now - log << "Date/time: " + now.to_s + ld - - puts() - puts( "Input alignment : " + input ) - log << "Input alignment : " + input + ld - puts( "Output alignment : " + output ) - log << "Output alignment : " + output + ld - if ( @phylip_input ) - puts( "Input is : Phylip, or something like it" ) - log << "Input is : Phylip, or something like it" + ld - elsif ( @fasta_input ) - puts( "Input is : Fasta" ) - log << "Input is : Fasta" + ld - end - if( @rgr >= 0 ) - puts( "Max col gap ratio: " + @rgr.to_s ) - log << "Max col gap ratio: " + @rgr.to_s + ld - elsif ( @rgc ) - puts( "Remove gap colums" ) - log << "Remove gap colums" + ld - elsif( @rgoc ) - puts( "Remove gap only colums" ) - log << "Remove gap only colums" + ld - end - if ( @clean ) - puts( "Clean up : true" ) - log << "Clean up : true" + ld - end - - if ( @pi_output ) - puts( "Output is : Phylip interleaved" ) - log << "Output is : Phylip interleaved" + ld - elsif ( @fasta_output ) - puts( "Output is : Fasta" ) - log << "Output is : Fasta" + ld - if ( @width ) - puts( "Width : " + @width.to_s ) - log << "Width : " + @width.to_s + ld - end - if ( @rg ) - puts( "Remove all gap characters (alignment is destroyed)" ) - log << "Remove all gap characters (alignment is destroyed)" + ld - end - elsif ( @nexus_output ) - puts( "Output is : Nexus" ) - log << "Output is : Nexus" + ld - end - if ( @name_length_set || !@fasta_output ) - puts( "Max name length : " + @name_length.to_s ) - log << "Max name length : " + @name_length.to_s + ld - end - if( @rsgr >= 0 ) - puts( "Remove sequences for which the gap ratio > " + @rsgr.to_s ) - log << "Remove sequences for which the gap ratio > " + @rsgr.to_s + ld - end - if( @rsl >= 0 ) - puts( "Remove sequences with less than " + @rsl.to_s + " non-gap characters" ) - log << "Remove sequences with less than " + @rsl.to_s + " non-gap characters" + ld - end - if ( @remove_seqs ) - puts( "Remove sequences listed in: " + @seqs_name_file ) - log << "Remove sequences listed in: " + @seqs_name_file + ld - elsif ( @keep_seqs ) - puts( "Keep only sequences listed in: " + @seqs_name_file ) - log << "Keep only sequences listed in: " + @seqs_name_file + ld - end - if ( @trim ) - puts( "Keep only columns from: "+ @first.to_s + " to " + @last.to_s ) - log << "Keep only columns from: "+ @first.to_s + " to " + @last.to_s + ld - end - if ( @rem_red ) - puts( "Remove redundant sequences: true" ) - log << "Remove redundant sequences: true" + ld - end - if ( @split > 0 ) - puts( "Split : " + @split.to_s ) - log << "Split : " + @split.to_s + ld - end - puts() - - f = MsaFactory.new() - - msa = nil - - begin - if ( @phylip_input ) - msa = f.create_msa_from_file( input, GeneralMsaParser.new() ) - elsif ( @fasta_input ) - msa = f.create_msa_from_file( input, FastaParser.new() ) - end - rescue Exception => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - - if ( msa.is_aligned() ) - Util.print_message( PRG_NAME, "Length of original alignment : " + msa.get_length.to_s ) - log << "Length of original alignment : " + msa.get_length.to_s + ld - gp = msa.calculate_gap_proportion - Util.print_message( PRG_NAME, "Gap-proportion of original alignment : " + gp.to_s ) - log << "Gap-proportion of original alignment : " + gp.to_s + ld - else - Util.print_message( PRG_NAME, "the input is not aligned" ) - log << "The input is not aligned" + ld - end - - all_names = Set.new() - for i in 0 ... msa.get_number_of_seqs() - current_name = msa.get_sequence( i ).get_name - if all_names.include?( current_name ) - Util.print_warning_message( PRG_NAME, "sequence name [" + current_name + "] is not unique" ) - else - all_names.add( current_name ) - end - end - - begin - - if ( @remove_seqs || @keep_seqs ) - names = Util.file2array( @seqs_name_file, true ) - if ( names == nil || names.length() < 1 ) - error_msg = "file \"" + @seqs_name_file.to_s + "\" appears empty" - Util.fatal_error( PRG_NAME, error_msg ) - end - - if ( @remove_seqs ) - c = 0 - for i in 0 ... names.length() - to_delete = msa.find_by_name( names[ i ], true, false ) - if ( to_delete.length() < 1 ) - error_msg = "sequence name \"" + names[ i ] + "\" not found" - Util.fatal_error( PRG_NAME, error_msg ) - elsif ( to_delete.length() > 1 ) - error_msg = "sequence name \"" + names[ i ] + "\" is not unique" - Util.fatal_error( PRG_NAME, error_msg ) - else - msa.remove_sequence!( to_delete[ 0 ] ) - c += 1 - end - end - Util.print_message( PRG_NAME, "Removed " + c.to_s + " sequences" ) - log << "Removed " + c.to_s + " sequences" + ld - elsif ( @keep_seqs ) - msa_new = Msa.new() - r = 0 - k = 0 - for j in 0 ... msa.get_number_of_seqs() - if ( names.include?( msa.get_sequence( j ).get_name() ) ) - msa_new.add_sequence( msa.get_sequence( j ) ) - k += 1 - else - r += 1 - end - end - msa = msa_new - Util.print_message( PRG_NAME, "Kept " + k.to_s + " sequences" ) - log << "Kept " + k.to_s + " sequences" + ld - Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" ) - log << "removed " + r.to_s + " sequences" + ld - end - end - - if ( @trim ) - msa.trim!( @first, @last ) - end - if( @rgr >= 0 ) - msa.remove_gap_columns_w_gap_ratio!( @rgr ) - elsif ( @rgc ) - msa.remove_gap_columns!() - elsif( @rgoc ) - msa.remove_gap_only_columns!() - end - if( @rsgr >= 0 ) - n = msa.get_number_of_seqs() - removed = msa.remove_sequences_by_gap_ratio!( @rsgr ) - k = msa.get_number_of_seqs() - r = n - k - Util.print_message( PRG_NAME, "Kept " + k.to_s + " sequences" ) - log << "Kept " + k.to_s + " sequences" + ld - Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" ) - log << "Removed " + r.to_s + " sequences:" + ld - removed.each { | seq_name | - log << " " + seq_name + ld - } - end - if( @rsl >= 0 ) - n = msa.get_number_of_seqs() - removed = msa.remove_sequences_by_non_gap_length!( @rsl ) - k = msa.get_number_of_seqs() - r = n - k - Util.print_message( PRG_NAME, "Kept " + k.to_s + " sequences" ) - log << "Kept " + k.to_s + " sequences" + ld - Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" ) - log << "Removed " + r.to_s + " sequences:" + ld - removed.each { | seq_name | - log << " " + seq_name + ld - } - end - if ( @keep_matching ) - n = msa.get_number_of_seqs - to_be_removed = Set.new - for ii in 0 ... n - seq = msa.get_sequence( ii ) - if !seq.get_name.downcase.index( @keep_matching.downcase ) - to_be_removed.add( ii ) - end - end - to_be_removed_ary = to_be_removed.to_a.sort.reverse - to_be_removed_ary.each { | index | - msa.remove_sequence!( index ) - } - # msa = sort( msa ) - end - if ( @remove_matching ) - n = msa.get_number_of_seqs - to_be_removed = Set.new - for iii in 0 ... n - - seq = msa.get_sequence( iii ) - - if seq.get_name.downcase.index( @remove_matching.downcase ) - to_be_removed.add( iii ) - end - end - to_be_removed_ary = to_be_removed.to_a.sort.reverse - to_be_removed_ary.each { | index | - msa.remove_sequence!( index ) - } - msa = sort( msa ) - end - - - - if ( @split > 0 ) - begin - msas = msa.split( @split, true ) - io = MsaIO.new() - w = MsaWriter - if ( @pi_output ) - w = PhylipSequentialWriter.new() - w.clean( @clean ) - w.set_max_name_length( @name_length ) - elsif( @fasta_output ) - w = FastaWriter.new() - w.set_line_width( @width ) - if ( @rg ) - w.remove_gap_chars( true ) - Util.print_warning_message( PRG_NAME, "removing gap character, the output is likely to become unaligned" ) - log << "removing gap character, the output is likely to become unaligned" + ld - end - w.clean( @clean ) - if ( @name_length_set ) - w.set_max_name_length( @name_length ) - end - elsif( @nexus_output ) - w = NexusWriter.new() - w.clean( @clean ) - w.set_max_name_length( @name_length ) - end - i = 0 - for m in msas - i = i + 1 - io.write_to_file( m, output + "_" + i.to_s, w ) - end - Util.print_message( PRG_NAME, "wrote " + msas.length.to_s + " files" ) - log << "wrote " + msas.length.to_s + " files" + ld - rescue Exception => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - - end - rescue Exception => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - - if ( @split <= 0 ) - - unless ( @rg ) - if ( msa.is_aligned() ) - Util.print_message( PRG_NAME, "Length of processed alignment : " + msa.get_length.to_s ) - log << "Length of processed alignment : " + msa.get_length.to_s + ld - gp = msa.calculate_gap_proportion - Util.print_message( PRG_NAME, "Gap-proportion of processed alignment: " + gp.to_s ) - log << "Gap-proportion of processed alignment: " + gp.to_s + ld - else - Util.print_warning_message( PRG_NAME, "output is not aligned" ) - log << "output is not aligned" + ld - end - end - - if @rem_red - removed = msa.remove_redundant_sequences!( true, false ) - if removed.size > 0 - identicals = msa.get_identical_seqs_detected - log << "the following " + identicals.size.to_s + " sequences are identical:" + ld - identicals.each { | s | - log << s + ld - } - log << "ignoring the following " + removed.size.to_s + " redundant sequences:" + ld - removed.each { | seq_name | - log << seq_name + ld - } - Util.print_message( PRG_NAME, "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" ) - log << "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" + ld - end - end - - io = MsaIO.new() - - w = MsaWriter - - if ( @pi_output ) - w = PhylipSequentialWriter.new() - w.clean( @clean ) - w.set_max_name_length( @name_length ) - elsif( @fasta_output ) - w = FastaWriter.new() - w.set_line_width( @width ) - if ( @rg ) - w.remove_gap_chars( true ) - Util.print_warning_message( PRG_NAME, "removing gap characters, the output is likely to become unaligned" ) - log << "removing gap character, the output is likely to become unaligned" + ld - end - w.clean( @clean ) - if ( @name_length_set ) - w.set_max_name_length( @name_length ) - end - elsif( @nexus_output ) - w = NexusWriter.new() - w.clean( @clean ) - w.set_max_name_length( @name_length ) - end - - - begin - io.write_to_file( msa, output, w ) - rescue Exception => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - - begin - f = File.open( output + LOG_SUFFIX, 'a' ) - f.print( log ) - f.close - rescue Exception => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - - - end - Util.print_message( PRG_NAME, "OK" ) - puts - end - - - private - - def sort( msa ) - names = Set.new - for i in 0 ... msa.get_number_of_seqs - name = msa.get_sequence( i ).get_name - names.add( name ) - end - sorted_ary = names.to_a.sort - new_msa = Msa.new - sorted_ary.each { | seq_name | - seq = msa.get_sequence( msa.find_by_name( seq_name, true, false )[ 0 ] ) - new_msa.add_sequence( seq ) - } - new_msa - end - - def set_fasta_input( fi = true ) - @fasta_input = fi - @input_format_set = true - end - def set_phylip_input( pi = true ) - @phylip_input = pi - @input_format_set = true - end - def set_name_length( i ) - @name_length = i - @name_length_set = true - end - def set_width( i ) - @width = i - end - def set_fasta_output( fo = true ) - @fasta_output = fo - @output_format_set = true - end - def set_pi_output( pso = true ) - @pi_output = pso - @output_format_set = true - end - def set_nexus_output( nexus = true ) - @nexus_output = nexus - @output_format_set = true - end - def set_clean( c = true ) - @clean = c - end - def set_remove_gap_columns( rgc = true ) - @rgc = rgc - end - def set_remove_gap_only_columns( rgoc = true ) - @rgoc = rgoc - end - def set_remove_gaps( rg = true ) - @rg = rg - end - def set_remove_gap_ratio( rgr ) - @rgr = rgr - end - def set_remove_seqs_gap_ratio( rsgr ) - @rsgr = rsgr - end - def set_remove_seqs_min_non_gap_length( rsl ) - @rsl = rsl - end - def set_remove_seqs( file ) - @seqs_name_file = file - @remove_seqs = true - @keep_seqs = false - end - def set_keep_seqs( file ) - @seqs_name_file = file - @keep_seqs = true - @remove_seqs = false - end - def set_trim( first, last ) - @trim = true - @first = first - @last = last - end - def set_remove_matching( remove ) - @remove_matching = remove - end - def set_keep_matching( keep ) - @keep_matching = keep - end - def set_rem_red( rr ) - @rem_red = rr - end - - - - def set_split( s ) - if ( s > 0 ) - @split = s - @clean = false # phylip only - @rgc = false - @rgoc = false - @rg = false # fasta only - @rgr = -1 - @rsgr = -1 - @rsl = -1 - @seqs_name_file = nil - @remove_seqs = false - @keep_seqs = false - @trim = false - @first = -1 - @last = -1 - end - end - def analyze_command_line( cla ) - if ( cla.is_option_set?( INPUT_TYPE_OPTION ) ) - begin - type = cla.get_option_value( INPUT_TYPE_OPTION ) - if ( type == "p" ) - set_phylip_input( true ) - set_fasta_input( false ) - elsif ( type == "f" ) - set_fasta_input( true ) - set_phylip_input( false ) - end - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - end - if ( cla.is_option_set?( OUTPUT_TYPE_OPTION ) ) - begin - type = cla.get_option_value( OUTPUT_TYPE_OPTION ) - if ( type == "p" ) - set_pi_output( true ) - set_fasta_output( false ) - set_nexus_output( false ) - elsif ( type == "f" ) - set_pi_output( false ) - set_fasta_output( true ) - set_nexus_output( false ) - elsif ( type == "n" ) - set_pi_output( false ) - set_fasta_output( false ) - set_nexus_output( true ) - end - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - end - if ( cla.is_option_set?( MAXIMAL_NAME_LENGTH_OPTION ) ) - begin - l = cla.get_option_value_as_int( MAXIMAL_NAME_LENGTH_OPTION ) - set_name_length( l ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - end - if ( cla.is_option_set?( WIDTH_OPTION ) ) - begin - w = cla.get_option_value_as_int( WIDTH_OPTION ) - set_width( w ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - end - if ( cla.is_option_set?( CLEAN_UP_SEQ_OPTION ) ) - set_clean( true ) - end - if ( cla.is_option_set?( REMOVE_GAP_COLUMNS_OPTION ) ) - set_remove_gap_columns( true ) - end - if ( cla.is_option_set?( REM_RED_OPTION ) ) - set_rem_red( true ) - end - if ( cla.is_option_set?( REMOVE_GAP_ONLY_COLUMNS ) ) - set_remove_gap_only_columns( true ) - end - if ( cla.is_option_set?( REMOVE_ALL_GAP_CHARACTERS_OPTION ) ) - set_remove_gaps( true ) - end - if ( cla.is_option_set?( REMOVE_COLUMNS_GAP_RATIO_OPTION ) ) - begin - f = cla.get_option_value_as_float( REMOVE_COLUMNS_GAP_RATIO_OPTION ) - set_remove_gap_ratio( f ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - end - if ( cla.is_option_set?( REMOVE_ALL_SEQUENCES_LISTED_OPTION ) ) - begin - s = cla.get_option_value( REMOVE_ALL_SEQUENCES_LISTED_OPTION ) - set_remove_seqs( s ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - end - if ( cla.is_option_set?( KEEP_ONLY_SEQUENCES_LISTED_OPTION ) ) - begin - s = cla.get_option_value( KEEP_ONLY_SEQUENCES_LISTED_OPTION ) - set_keep_seqs( s ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - end - if ( cla.is_option_set?( TRIM_OPTION ) ) - begin - s = cla.get_option_value( TRIM_OPTION ) - if ( s =~ /(\d+)-(\d+)/ ) - set_trim( $1.to_i(), $2.to_i() ) - else - puts( "illegal argument" ) - print_help - exit( -1 ) - end - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - end - if ( cla.is_option_set?( REMOVE_SEQS_GAP_RATIO_OPTION ) ) - begin - f = cla.get_option_value_as_float( REMOVE_SEQS_GAP_RATIO_OPTION ) - set_remove_seqs_gap_ratio( f ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - end - if ( cla.is_option_set?( REMOVE_SEQS_NON_GAP_LENGTH_OPTION ) ) - begin - f = cla.get_option_value_as_int( REMOVE_SEQS_NON_GAP_LENGTH_OPTION ) - set_remove_seqs_min_non_gap_length( f ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - end - if ( cla.is_option_set?( SPLIT ) ) - begin - s = cla.get_option_value_as_int( SPLIT ) - set_split( s ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - - end - if ( cla.is_option_set?( REMOVE_MATCHING_SEQUENCES_OPTION ) ) - begin - s = cla.get_option_value( REMOVE_MATCHING_SEQUENCES_OPTION ) - set_remove_matching( s ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - end - if ( cla.is_option_set?( KEEP_MATCHING_SEQUENCES_OPTION ) ) - begin - s = cla.get_option_value( KEEP_MATCHING_SEQUENCES_OPTION ) - set_keep_matching( s ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - end - - - end - - def print_help() - puts() - puts( "Usage:" ) - puts() - puts( " " + PRG_NAME + ".rb [options] " ) - puts() - puts( " options: -" + INPUT_TYPE_OPTION + "=: f for fasta, p for phylip selex type" ) - puts( " -" + OUTPUT_TYPE_OPTION + "=: f for fasta, n for nexus, p for phylip sequential (default)" ) - puts( " -" + MAXIMAL_NAME_LENGTH_OPTION + "=: n=maximal name length (default for phylip 10, for fasta: unlimited )" ) - puts( " -" + WIDTH_OPTION + "=: n=width (fasta output only, default is 60)" ) - puts( " -" + CLEAN_UP_SEQ_OPTION + ": clean up sequences" ) - puts( " -" + REMOVE_GAP_COLUMNS_OPTION + ": remove gap columns" ) - puts( " -" + REMOVE_GAP_ONLY_COLUMNS + ": remove gap-only columns" ) - puts( " -" + REMOVE_COLUMNS_GAP_RATIO_OPTION + "=: remove columns for which ( seqs with gap / number of sequences > n )" ) - puts( " -" + REMOVE_ALL_GAP_CHARACTERS_OPTION + ": remove all gap characters (destroys alignment, fasta output only)" ) - puts( " -" + REMOVE_ALL_SEQUENCES_LISTED_OPTION + "=: remove all sequences listed in file" ) - puts( " -" + KEEP_ONLY_SEQUENCES_LISTED_OPTION + "=: keep only sequences listed in file" ) - puts( " -" + TRIM_OPTION + "=-: remove columns before first and after last" ) - puts( " -" + REMOVE_SEQS_GAP_RATIO_OPTION + "=: remove sequences for which the gap ratio > n (after column operations)" ) - puts( " -" + REMOVE_SEQS_NON_GAP_LENGTH_OPTION + "= remove sequences with less than n non-gap characters (after column operations)" ) - puts( " -" + REMOVE_MATCHING_SEQUENCES_OPTION + "= remove all sequences with names containing s" ) - puts( " -" + KEEP_MATCHING_SEQUENCES_OPTION + "= keep only sequences with names containing s" ) - puts( " -" + SPLIT + "= split a fasta file into n files of equal number of sequences (expect for " ) - puts( " last one), cannot be used with other options" ) - puts( " -" + REM_RED_OPTION + ": remove redundant sequences" ) - puts() - end - - - - - - end # class MsaProcessor - - -end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/apps/multi_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/apps/multi_sequence_extractor.rb deleted file mode 100644 index b499c72..0000000 --- a/forester/ruby/evoruby/lib/evo/apps/multi_sequence_extractor.rb +++ /dev/null @@ -1,551 +0,0 @@ -# -# = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class -# -# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: multi_sequence_extractor.rb,v 1.10 2010/12/13 19:00:11 cmzmasek Exp $ - - -require 'lib/evo/util/constants' -require 'lib/evo/util/util' -require 'lib/evo/msa/msa' -require 'lib/evo/msa/msa_factory' -require 'lib/evo/io/msa_io' -require 'lib/evo/io/parser/fasta_parser' -require 'lib/evo/io/writer/fasta_writer' -require 'lib/evo/util/command_line_arguments' - - - -module Evoruby - - class MultiSequenceExtractor - - PRG_NAME = "mse" - PRG_VERSION = "1.02" - PRG_DESC = "extraction of sequences by name from multiple multi-sequence ('fasta') files" - PRG_DATE = "2012.07.20" - COPYRIGHT = "2008-2012 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" - HELP_OPTION_1 = 'help' - HELP_OPTION_2 = 'h' - - EXT_OPTION = 'e' - EXTRACT_LINKERS_OPTION = 'l' - LOG_SUFFIX = ".mse_log" - LINKERS_SUFFIX = ".linkers" - FASTA_SUFFIX = ".fasta" - FASTA_WITH_NORMALIZED_IDS_SUFFIX = ".ni.fasta" - NORMALIZED_IDS_MAP_SUFFIX = ".nim" - PROTEINS_LIST_FILE_SEPARATOR = "\t" - - - def run() - - Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC , - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) - - ld = Constants::LINE_DELIMITER - - begin - cla = CommandLineArguments.new( ARGV ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - - if ( cla.is_option_set?( HELP_OPTION_1 ) || - cla.is_option_set?( HELP_OPTION_2 ) ) - print_help - exit( 0 ) - end - - if ( cla.get_number_of_files != 4 && cla.get_number_of_files != 5 ) - print_help - exit( -1 ) - end - - allowed_opts = Array.new - allowed_opts.push(EXT_OPTION) - allowed_opts.push(EXTRACT_LINKERS_OPTION) - - disallowed = cla.validate_allowed_options_as_str( allowed_opts ) - if ( disallowed.length > 0 ) - Util.fatal_error( PRG_NAME, - "unknown option(s): " + disallowed, - STDOUT ) - end - - seq_names_files_suffix = cla.get_file_name( 0 ) - input_dir = cla.get_file_name( 1 ) - out_dir = cla.get_file_name( 2 ) - out_dir_doms = cla.get_file_name( 3 ) - mapping_file = nil - - if ( cla.get_number_of_files == 5 ) - mapping_file = cla.get_file_name( 4 ) - begin - Util.check_file_for_readability( mapping_file ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - end - - extension = 0 - if cla.is_option_set?(EXT_OPTION) - extension = cla.get_option_value_as_int(EXT_OPTION) - if extension < 0 - extension = 0 - end - end - - extract_linkers = false - if cla.is_option_set?(EXTRACT_LINKERS_OPTION) - extract_linkers = true - end - - if !File.exist?( input_dir ) - Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" ) - end - if !File.exist?( out_dir ) - Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" ) - end - if !File.exist?( out_dir_doms ) - Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" ) - end - if !File.directory?( input_dir ) - Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" ) - end - if !File.directory?( out_dir ) - Util.fatal_error( PRG_NAME, "error: [#{out_dir}] is not a directory" ) - end - if !File.directory?( out_dir_doms ) - Util.fatal_error( PRG_NAME, "error: [#{out_dir_doms}] is not a directory" ) - end - - - log = String.new - - log << "Program : " + PRG_NAME + ld - log << "Version : " + PRG_VERSION + ld - log << "Program date : " + PRG_DATE + ld - - puts() - puts( "Sequence names files suffix: " + seq_names_files_suffix ) - log << "Sequence names files suffix: " + seq_names_files_suffix + ld - puts( "Input dir : " + input_dir ) - log << "Input dir : " + input_dir + ld - puts( "Output dir : " + out_dir ) - log << "Output dir : " + out_dir + ld - puts( "Output dir domains : " + out_dir_doms ) - log << "Output dir domains : " + out_dir_doms + ld - if ( mapping_file != nil ) - puts( "Mapping file : " + mapping_file ) - log << "Mapping file : " + mapping_file + ld - end - if extension > 0 - puts( "Extension : " + extension.to_s ) - log << "Extension : " + extension.to_s + ld - end - if extract_linkers - puts( "Extract linkers : true" ) - log << "Extract linkers : true" + ld - end - log << "Date : " + Time.now.to_s + ld - puts - - if ( mapping_file != nil ) - species_codes_to_paths = extract_mappings( mapping_file ) - end - - input_files = obtain_inputfiles( input_dir, seq_names_files_suffix ) - - counter = 0 - - input_files.each { |input_file| - counter += 1 - puts - puts - puts counter.to_s + "/" + input_files.size.to_s - read_seq_family_file( input_file, - seq_names_files_suffix, - input_dir, - species_codes_to_paths, - log, - out_dir, - out_dir_doms, - mapping_file, - extension, - extract_linkers ) - } - puts - Util.print_message( PRG_NAME, "OK" ) - puts - - end - - - def read_seq_family_file( input_file, - seq_names_files_suffix, - input_dir, - species_codes_to_paths, - log, - out_dir, - out_dir_doms, - mapping_file, - extension, - extract_linkers ) - - begin - Util.check_file_for_readability( input_file ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - basename = File.basename( input_file, seq_names_files_suffix ) - out_file_path_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX - out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX - out_file_path_ids_map = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX - doms_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains" + FASTA_SUFFIX - doms_ext_out_file_path_fasta_file = nil - if extension > 0 - doms_ext_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains_ext_" + extension.to_s + FASTA_SUFFIX - end - begin - Util.check_file_for_writability( out_file_path_fasta_file ) - Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file ) - Util.check_file_for_writability( out_file_path_ids_map ) - Util.check_file_for_writability( doms_out_file_path_fasta_file ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - - ids_map_writer = nil - begin - ids_map_writer = File.open( out_file_path_ids_map, 'a' ) - rescue Exception => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - - current_species = "" - current_msa = nil - new_msa = Msa.new - new_msa_normalized_ids = Msa.new - new_msa_domains = Msa.new - new_msa_domains_extended = Msa.new - per_species_counter = 0 - linkers = "" - - puts basename - - File.open( input_file ) do | file | - while line = file.gets - line.strip! - if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) - values = line.split( PROTEINS_LIST_FILE_SEPARATOR ) - mod_line = nil - if ( values.length < 2 ) - Util.fatal_error( PRG_NAME, "unexpected format: " + line ) - end - species = values[ 0 ] - if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA" || species == "EIMTE" - next - end - seq_name = values[ 1 ] - domain_ranges = nil - if ( values.length > 3 ) - domain_ranges_block = values[ 3 ] - domain_ranges = domain_ranges_block.split( "/" ) - end - if ( species != current_species ) - current_species = species - my_file = input_dir + Constants::FILE_SEPARATOR + current_species - - if ( !File.exist?( my_file ) ) - if species_codes_to_paths == nil - Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" ) - elsif ( !species_codes_to_paths.has_key?( current_species ) ) - Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" ) - end - my_file = species_codes_to_paths[ current_species ] - end - my_path = File.expand_path( my_file ) - my_readlink = my_path - if ( File.symlink?( my_path ) ) - my_readlink = File.readlink( my_path ) - end - current_msa = read_fasta_file( my_file ) - - if ( per_species_counter > 0 ) - print_counts( per_species_counter, log, Constants::LINE_DELIMITER ) - per_species_counter = 0 - end - puts " " + current_species + " [" + my_readlink + "]" - log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER - end - puts " " + seq_name - log << " " + seq_name + Constants::LINE_DELIMITER - per_species_counter = per_species_counter + 1 - seq = nil - - if current_msa.find_by_name_start( seq_name, true ).size > 0 - begin - seq = current_msa.get_by_name_start( seq_name, true ).copy - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - else - # Not found, try finding by partial match. - begin - seq = current_msa.get_by_name( seq_name, true, true ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - end - - normalized_id = per_species_counter.to_s( 16 ).upcase + - "_" + current_species - - per_species_counter.to_i - - ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER ) - - orig_name = nil - if seq != nil - orig_name = seq.get_name - seq.set_name( seq.get_name + " [" + current_species + "]" ) - new_msa.add_sequence( seq ) - else - Util.fatal_error( PRG_NAME, "unexected error: seq is nil" ) - end - - if domain_ranges != nil - first = true - prev_to = -1 - - domain_ranges.each { |range| - if range != nil && range.length > 0 - s = range.split("-") - from = s[ 0 ].to_i - to = s[ 1 ].to_i - new_msa_domains.add_sequence( Sequence.new( orig_name + - " [" + from.to_s + - "-" + to.to_s + - "] [" + basename + "] [" + - current_species + "]", - seq.get_sequence_as_string[from..to] ) ) - if extension > 0 - from_e = from - extension - if from_e < 0 - from_e = 0 - end - to_e = to + extension - if to_e > seq.get_sequence_as_string.length - 1 - to_e = seq.get_sequence_as_string.length - 1 - end - new_msa_domains_extended.add_sequence( Sequence.new( orig_name + - " [" + from.to_s + - "-" + to.to_s + - "] [extended by " + - extension.to_s + - "] [" + basename + "] [" + - current_species + "]", - seq.get_sequence_as_string[ from_e..to_e ] ) ) - end # extension > 0 - if extract_linkers - if first - first = false - f = 0 - t = from - 1 - if extension > 0 - f = from - extension - end - mod_line = line + "\t[" + get_linker_sequence( f, t, seq ) + "|" - else - mod_line += get_linker_sequence( prev_to + 1, from - 1, seq ) + "|" - end - prev_to = to - end - end # range != nil && range.length > 0 - } - if extract_linkers && prev_to > 0 - f = prev_to + 1 - t = seq.get_sequence_as_string.length - 1 - if extension > 0 - t = prev_to + extension - end - mod_line += get_linker_sequence( f, t, seq ) + "]" - end - end - - new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) ) - if mod_line - linkers << mod_line + Constants::LINE_DELIMITER - end - end # !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) - end # while line = file.gets - - end - - ids_map_writer.close - - if ( per_species_counter > 0 ) - print_counts( per_species_counter, log, Constants::LINE_DELIMITER ) - end - - io = MsaIO.new() - - fasta_writer = FastaWriter.new() - fasta_writer.remove_gap_chars - fasta_writer.clean - - begin - io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer ) - rescue Exception => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - - if new_msa_domains != nil - begin - io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer ) - rescue Exception => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - end - - if extension > 0 && new_msa_domains_extended != nil - begin - io.write_to_file( new_msa_domains_extended, doms_ext_out_file_path_fasta_file, fasta_writer ) - rescue Exception => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - end - - begin - io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer ) - rescue Exception => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - - begin - f = File.open( out_dir + Constants::FILE_SEPARATOR + basename + LOG_SUFFIX , 'a' ) - f.print( log ) - f.close - rescue Exception => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - - if extract_linkers && linkers != nil - begin - f = File.open( out_dir + Constants::FILE_SEPARATOR + basename + LINKERS_SUFFIX , 'a' ) - f.print( linkers ) - f.close - rescue Exception => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - end - - - end - - - def get_linker_sequence( from, to, seq ) - if from < 0 - from = 0 - end - if to > seq.get_sequence_as_string.length - 1 - to = seq.get_sequence_as_string.length - 1 - end - if from > to - return "" - else - return from.to_s + "-" + to.to_s + ":" + seq.get_sequence_as_string[ from..to ] - end - end - - def obtain_inputfiles( input_dir, seq_names_files_suffix ) - input_files = Array.new() - Dir.foreach( input_dir ) { |file_name| - if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size ) - input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name ) - end - } - input_files - end - - def extract_mappings( mapping_file ) - species_code_to_path = Hash.new() - File.open( mapping_file ) do | file | - while line = file.gets - if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) ) - if ( line =~ /(\S+)\s+(\S+)/ ) - species = $1 - path = $2 - if ( species_code_to_path.has_key?( species ) ) - Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" ) - end - if ( species_code_to_path.has_value?( path ) ) - Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" ) - end - if ( !File.exist?( path ) ) - Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" ) - end - if ( !File.file?( path ) ) - Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" ) - end - if ( !File.readable?( path ) ) - Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" ) - end - if ( File.size( path ) < 10000 ) - Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" ) - end - if ( !Util.looks_like_fasta?( path ) ) - Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" ) - end - species_code_to_path[ species ] = path - puts species + " -> " + path - end - end - end - end - species_code_to_path - end - - def print_counts( per_species_counter, log, ld ) - puts " [sum: " + per_species_counter.to_s + "]" - log << " [sum: " + per_species_counter.to_s + "]" + ld - end - - def read_fasta_file( input ) - f = MsaFactory.new() - msa = nil - begin - msa = f.create_msa_from_file( input, FastaParser.new() ) - rescue Exception => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - msa - end - - def print_help() - puts( "Usage:" ) - puts() - puts( " " + PRG_NAME + ".rb [mapping file for " + - "genome multiple-sequence ('fasta') files not in input dir]" ) - puts() - puts( " option: -" + EXT_OPTION + "=: to extend extracted domains" ) - puts( " -" + EXTRACT_LINKERS_OPTION + " : to extract linkers" ) - puts() - puts( " " + "Example: \"mse.rb .prot . protein_seqs domain_seqs ../genome_locations.txt\"" ) - puts() - end - - end # class MultiSequenceExtractor -end \ No newline at end of file diff --git a/forester/ruby/evoruby/lib/evo/apps/new_tap.rb b/forester/ruby/evoruby/lib/evo/apps/new_tap.rb deleted file mode 100644 index 1dc7431..0000000 --- a/forester/ruby/evoruby/lib/evo/apps/new_tap.rb +++ /dev/null @@ -1,167 +0,0 @@ -# -# = lib/evo/apps/ - class -# -# Copyright:: Copyright (C) 2009 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: new_tap.rb,v 1.4 2010/12/13 19:00:11 cmzmasek Exp $ - - -require 'lib/evo/util/util' -require 'lib/evo/msa/msa_factory' -require 'lib/evo/msa/msa' -require 'lib/evo/io/msa_io' -require 'lib/evo/io/parser/fasta_parser' -require 'lib/evo/io/parser/general_msa_parser' -require 'lib/evo/io/writer/fasta_writer' -require 'lib/evo/io/writer/phylip_sequential_writer' -require 'lib/evo/util/command_line_arguments' - -module Evoruby - - class TaxonomyProcessor - - PRG_NAME = "" - PRG_DATE = "2009.10.09" - PRG_DESC = "replacement of labels in multiple sequence files" - PRG_VERSION = "1.00" - COPYRIGHT = "2009 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" - - REMOVE_REDUNDANT_SEQS_OPTION = "rr" - - def initialize() - @taxonomies = Hash.new() - end - - def run() - - Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC, - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) - - if ( ARGV == nil || ( ARGV.length != 3 && ARGV.length != 4 ) ) - puts( "Usage: #{PRG_NAME}.rb " ) - puts() - puts( " options: -" + REMOVE_REDUNDANT_SEQS_OPTION + ": to remove redundant sequences" ) - puts() - exit( -1 ) - end - - begin - cla = CommandLineArguments.new( ARGV ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - - input = cla.get_file_name( 0 ) - output = cla.get_file_name( 1 ) - map_file = cla.get_file_name( 2 ) - - allowed_opts = Array.new - allowed_opts.push( REMOVE_REDUNDANT_SEQS_OPTION ) - - disallowed = cla.validate_allowed_options_as_str( allowed_opts ) - if ( disallowed.length > 0 ) - Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed ) - end - - - remove_redudant = false - if ( cla.is_option_set?( REMOVE_REDUNDANT_SEQS_OPTION ) ) - remove_redudant = true - end - - if ( File.exists?( output ) ) - Util.fatal_error( PRG_NAME, "outfile [" + output + "] already exists" ) - end - if ( File.exists?( map_file ) ) - Util.fatal_error( PRG_NAME, "map file [" + map_file + "] already exists" ) - end - if ( !File.exists?( input) ) - Util.fatal_error( PRG_NAME, "infile [" + input + "] does not exist" ) - end - - fasta_like = Util.looks_like_fasta?( input ) - - puts() - puts( "Input alignment : " + input ) - puts( "Output alignment: " + output ) - puts( "Output map : " + map_file ) - if ( fasta_like ) - puts( "Format : Fasta" ) - else - puts( "Format : Phylip like" ) - end - puts() - - species_map = Hash.new - - f = MsaFactory.new() - begin - if ( fasta_like ) - msa = f.create_msa_from_file( input, FastaParser.new() ) - else - msa = f.create_msa_from_file( input, GeneralMsaParser.new() ) - end - rescue Exception => e - Util.fatal_error( PRG_NAME, "failed to read file: " + e.to_s ) - end - - if ( msa == nil || msa.get_number_of_seqs() < 1 ) - Util.fatal_error( PRG_NAME, "failed to read MSA" ) - end - begin - Util.check_file_for_writability( map_file ) - rescue Exception => e - Util.fatal_error( PRG_NAME, "error: " + e.to_, STDOUT ) - end - - if ( remove_redudant ) - removed = msa.remove_redundant_sequences!( true ) - if removed.size > 0 - Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" ) - removed.each { | seq_name | - puts seq_name - } - Util.print_message( PRG_NAME, "will process " + msa.get_number_of_seqs.to_s + " non redundant sequences" ) - end - end - - lf = File.open( map_file, "a" ) - for i in 0 ... msa.get_number_of_seqs - seq = msa.get_sequence( i ) - end - - io = MsaIO.new() - w = nil - if ( fasta_like ) - w = FastaWriter.new() - else - w = PhylipSequentialWriter.new() - end - w.set_max_name_length( 10 ) - w.clean( true ) - begin - io.write_to_file( msa, output, w ) - rescue Exception => e - Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s ) - end - lf.close() - if ( @taxonomies.length > 0 ) - Util.print_message( PRG_NAME, "number of unique taxonomies: " + @taxonomies.length.to_s ) - end - Util.print_message( PRG_NAME, "wrote: " + map_file ) - Util.print_message( PRG_NAME, "wrote: " + output ) - Util.print_message( PRG_NAME, "OK" ) - end - - end # class - -end # module Evoruby \ No newline at end of file diff --git a/forester/ruby/evoruby/lib/evo/apps/phylogenies_decorator.rb b/forester/ruby/evoruby/lib/evo/apps/phylogenies_decorator.rb deleted file mode 100644 index 69a6bc4..0000000 --- a/forester/ruby/evoruby/lib/evo/apps/phylogenies_decorator.rb +++ /dev/null @@ -1,299 +0,0 @@ -#!/usr/local/bin/ruby -w -# -# = lib/evo/apps/phylogenies_decorator -# -# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# decoration of phylogenies with sequence/species names and domain architectures -# -# $Id: phylogenies_decorator.rb,v 1.34 2010/12/13 19:00:11 cmzmasek Exp $ -# -# Environment variable FORESTER_HOME needs to point to the appropriate -# directory (e.g. setenv FORESTER_HOME $HOME/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester-atv/) - -require 'lib/evo/util/constants' -require 'lib/evo/util/util' -require 'lib/evo/util/command_line_arguments' - -require 'date' - -module Evoruby - - class PhylogeniesDecorator - - DECORATOR_OPTIONS_SEQ_NAMES = '-r=1 -mdn' - # -mdn is a hidden expert option to rename e.g. "6_ORYLA3" to "6_[3]_ORYLA" - #DECORATOR_OPTIONS_SEQ_NAMES = '-sn -r=1' - DECORATOR_OPTIONS_DOMAINS = '-r=1' - IDS_MAPFILE_SUFFIX = '.nim' - DOMAINS_MAPFILE_SUFFIX = '.dff' - SLEEP_TIME = 0.1 - REMOVE_NI = true - TMP_FILE = '___PD___' - LOG_FILE = '00_phylogenies_decorator.log' - FORESTER_HOME = ENV[Constants::FORESTER_HOME_ENV_VARIABLE] - JAVA_HOME = ENV[Constants::JAVA_HOME_ENV_VARIABLE] - - PRG_NAME = "phylogenies_decorator" - PRG_DATE = "2008.09.02" - PRG_DESC = "decoration of phylogenies with sequence/species names and domain architectures" - PRG_VERSION = "1.0.1" - COPYRIGHT = "2008-2009 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" - - IDS_ONLY_OPTION = "n" - DOMAINS_ONLY_OPTION = "d" - HELP_OPTION_1 = "help" - HELP_OPTION_2 = "h" - - NL = Constants::LINE_DELIMITER - - def run - - Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC, - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) - - if ( ARGV == nil || ARGV.length > 3 || ARGV.length < 2 ) - print_help - exit( -1 ) - end - - if FORESTER_HOME == nil || FORESTER_HOME.length < 1 - Util.fatal_error( PRG_NAME, "apparently environment variable #{Constants::FORESTER_HOME_ENV_VARIABLE} has not been set" ) - end - if JAVA_HOME == nil || JAVA_HOME.length < 1 - Util.fatal_error( PRG_NAME, "apparently environment variable #{Constants::JAVA_HOME_ENV_VARIABLE} has not been set" ) - end - - if !File.exist?( FORESTER_HOME ) - Util.fatal_error( PRG_NAME, '[' + FORESTER_HOME + '] does not exist' ) - end - if !File.exist?( JAVA_HOME ) - Util.fatal_error( PRG_NAME, '[' + JAVA_HOME + '] does not exist' ) - end - - decorator = JAVA_HOME + '/bin/java -cp ' + FORESTER_HOME + '/java/forester.jar org.forester.application.decorator' - - begin - cla = CommandLineArguments.new( ARGV ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - - if ( cla.is_option_set?( HELP_OPTION_1 ) || - cla.is_option_set?( HELP_OPTION_2 ) ) - print_help - exit( 0 ) - end - - if File.exist?( LOG_FILE ) - Util.fatal_error( PRG_NAME, 'logfile [' + LOG_FILE + '] already exists' ) - end - - allowed_opts = Array.new - allowed_opts.push( IDS_ONLY_OPTION ) - allowed_opts.push( DOMAINS_ONLY_OPTION ) - - disallowed = cla.validate_allowed_options_as_str( allowed_opts ) - if ( disallowed.length > 0 ) - Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed ) - end - - ids_only = false - domains_only = false - - in_suffix = cla.get_file_name( 0 ) - out_suffix = cla.get_file_name( 1 ) - - if cla.is_option_set?( IDS_ONLY_OPTION ) - ids_only = true - end - if cla.is_option_set?( DOMAINS_ONLY_OPTION ) - domains_only = true - end - - if ( ids_only && domains_only ) - Util.fatal_error( PRG_NAME, 'attempt to use ids only and domains only at the same time' ) - end - - log = String.new - - now = DateTime.now - log << "Program : " + PRG_NAME + NL - log << "Version : " + PRG_VERSION + NL - log << "Program date : " + PRG_DATE + NL - log << "Options for seq names: " + DECORATOR_OPTIONS_SEQ_NAMES + NL - log << "Options for domains : " + DECORATOR_OPTIONS_DOMAINS + NL - log << "FORESTER_HOME : " + FORESTER_HOME + NL - log << "JAVA_HOME : " + JAVA_HOME + NL + NL - log << "Date/time: " + now.to_s + NL - log << "Directory: " + Dir.getwd + NL + NL - - Util.print_message( PRG_NAME, 'input suffix : ' + in_suffix ) - Util.print_message( PRG_NAME, 'output suffix : ' + out_suffix ) - - log << 'input suffix : ' + in_suffix + NL - log << 'output suffix : ' + out_suffix + NL - - if ( File.exists?( TMP_FILE ) ) - File.delete( TMP_FILE ) - end - - files = Dir.entries( "." ) - - counter = 0 - - files.each { | phylogeny_file | - if ( !File.directory?( phylogeny_file ) && - phylogeny_file !~ /^\./ && - phylogeny_file !~ /^00/ && - phylogeny_file !~ /#{out_suffix}$/ && - phylogeny_file =~ /#{in_suffix}$/ ) - begin - Util.check_file_for_readability( phylogeny_file ) - rescue ArgumentError - Util.fatal_error( PRG_NAME, 'can not read from: ' + phylogeny_file + ': '+ $! ) - end - - counter += 1 - - outfile = phylogeny_file.sub( /#{in_suffix}$/, out_suffix ) - - if REMOVE_NI - outfile = outfile.sub( /_ni_/, '_' ) - end - - if File.exists?( outfile ) - msg = counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile + - ' : already exists, skipping' - Util.print_message( PRG_NAME, msg ) - log << msg + NL - next - end - - Util.print_message( PRG_NAME, counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile ) - log << counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile + NL - - phylogeny_id = get_id( phylogeny_file ) - - ids_mapfile_name = nil - domains_mapfile_name = nil - - if ids_only - ids_mapfile_name = get_file( files, phylogeny_id, IDS_MAPFILE_SUFFIX ) - elsif domains_only - domains_mapfile_name = get_file( files, phylogeny_id, DOMAINS_MAPFILE_SUFFIX ) - else - ids_mapfile_name = get_file( files, phylogeny_id, IDS_MAPFILE_SUFFIX ) - domains_mapfile_name = get_file( files, phylogeny_id, DOMAINS_MAPFILE_SUFFIX ) - end - - if domains_mapfile_name != nil - begin - Util.check_file_for_readability( domains_mapfile_name ) - rescue ArgumentError - Util.fatal_error( PRG_NAME, 'failed to read from [#{domains_mapfile_name}]: ' + $! ) - end - end - - if ids_mapfile_name != nil - begin - Util.check_file_for_readability( ids_mapfile_name ) - rescue ArgumentError - Util.fatal_error( PRG_NAME, 'failed to read from [#{ids_mapfile_name}]: ' + $! ) - end - end - - if domains_mapfile_name != nil - if ids_mapfile_name != nil - my_outfile = TMP_FILE - else - my_outfile = outfile - end - cmd = decorator + ' ' + DECORATOR_OPTIONS_DOMAINS + ' ' + - '-f=d ' + phylogeny_file + ' ' + - domains_mapfile_name + ' ' + my_outfile - execute_cmd( cmd, log ) - end - - if ids_mapfile_name != nil - if domains_mapfile_name != nil - my_infile = TMP_FILE - else - my_infile = phylogeny_file - end - cmd = decorator + ' ' + DECORATOR_OPTIONS_SEQ_NAMES + ' ' + - '-f=s ' + my_infile + ' ' + - ids_mapfile_name + ' ' + outfile - execute_cmd( cmd, log ) - end - - if ( File.exists?( TMP_FILE ) ) - File.delete( TMP_FILE ) - end - end - } - open( LOG_FILE, 'w' ) do | f | - f.write( log ) - end - puts - Util.print_message( PRG_NAME, 'OK' ) - puts - end # def run - - def execute_cmd( cmd, log ) - log << 'excuting ' + cmd + NL - IO.popen( cmd , 'r+' ) do | pipe | - pipe.close_write - log << pipe.read + NL + NL - end - sleep( SLEEP_TIME ) - end - - - def get_id( phylogeny_file_name ) - phylogeny_file_name =~ /^([^_]+)/ - $1 - end - - def get_file( files_in_dir, phylogeny_id, suffix_pattern ) - matching_files = Array.new - files_in_dir.each { | file | - - if ( !File.directory?( file ) && - file !~ /^\./ && - file !~ /^00/ && - file =~ /^#{phylogeny_id}.*#{suffix_pattern}$/ ) - matching_files << file - end - } - if matching_files.length < 1 - Util.fatal_error( PRG_NAME, 'no file matching [' + phylogeny_id + - '_] [' + suffix_pattern + '] present in current directory' ) - elsif matching_files.length > 1 - Util.fatal_error( PRG_NAME, 'more than one file matching [' + phylogeny_id + - '_] [' + suffix_pattern + '] present in current directory' ) - end - matching_files[ 0 ] - end - - def print_help() - puts( "Usage:" ) - puts() - puts( " " + PRG_NAME + ".rb [options] " ) - puts() - puts( " options: -" + IDS_ONLY_OPTION + ": decorate with sequence/species names only" ) - puts( " -" + DOMAINS_ONLY_OPTION + ": decorate with domain structures" ) - puts() - end - end # class PhylogenyiesDecorator - -end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/apps/phylogeny_factory.rb b/forester/ruby/evoruby/lib/evo/apps/phylogeny_factory.rb deleted file mode 100644 index 999541e..0000000 --- a/forester/ruby/evoruby/lib/evo/apps/phylogeny_factory.rb +++ /dev/null @@ -1,267 +0,0 @@ -# -# = lib/evo/apps/phylogeny_factory - PhylogenyFactory class -# -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: phylogeny_factory.rb,v 1.32 2010/12/13 19:00:11 cmzmasek Exp $ - -require 'lib/evo/util/constants' -require 'lib/evo/util/util' -require 'lib/evo/util/command_line_arguments' - -require 'set' -require 'date' - -module Evoruby - - class PhylogenyFactory - - PRG_NAME = "phylogeny_factory" - PRG_DATE = "2010.05.26" - PRG_DESC = "automated phylogeny reconstruction using queing system" - PRG_VERSION = "1.1" - COPYRIGHT = "2010 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" - - USE_JOB_SUBMISSION_SYSTEM_OPTION = 's' - LOG_FILE = '00_phylogeny_factory.log' - TEMPLATE_FILE = '00_phylogeny_factory.template' - PBS_O_WORKDIR = '$PBS_O_WORKDIR/' - MIN_LENGTH_DEFAULT = 40 - WALLTIME = '100:00:00' - QUEUE = 'default' - - TMP_CMD_FILE_SUFFIX = '_QSUB' - - HMM = 'HMM' - RSL = 'RSL' - - OPTION_OPEN = '%[' - OPTION_CLOSE = ']%' - - WAIT = 1.0 - - NL = Constants::LINE_DELIMITER - - def run - - Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC, - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) - - begin - cla = CommandLineArguments.new( ARGV ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - - allowed_opts = Array.new - allowed_opts.push( USE_JOB_SUBMISSION_SYSTEM_OPTION ) - - disallowed = cla.validate_allowed_options_as_str( allowed_opts ) - if ( disallowed.length > 0 ) - Util.fatal_error( PRG_NAME, - "unknown option(s): " + disallowed, - STDOUT ) - end - - if File.exists?( LOG_FILE ) - puts( '[' + PRG_NAME + '] > log file [' + LOG_FILE + '] already exists' ) - exit( -1 ) - end - - if !File.exists?( TEMPLATE_FILE ) - puts( '[' + PRG_NAME + '] > template file [' + TEMPLATE_FILE + '] not found' ) - exit( -1 ) - end - - use_job_submission_system = false - if cla.is_option_set?( USE_JOB_SUBMISSION_SYSTEM_OPTION ) - use_job_submission_system = true - end - - log = String.new - - now = DateTime.now - log << "Program : " + PRG_NAME + NL - log << "Version : " + PRG_VERSION + NL - log << "Program date: " + PRG_DATE + NL + NL - log << "Date/time : " + now.to_s + NL - log << "Directory : " + Dir.getwd + NL + NL - - puts( '[' + PRG_NAME + '] > reading ' + TEMPLATE_FILE ) - - paths = Hash.new # path placeholder -> full path - min_lengths = Hash.new # alignment id -> minimal length - hmms = Hash.new # alignment id -> hmm - options = Hash.new # option placeholder -> option - ids = Set.new - - commands = Array.new - - log << "////////////////////////////////////////////////////////////////// #{NL}" - log << "Template file [" + TEMPLATE_FILE + "]:#{NL}" - - command = String.new - - open( TEMPLATE_FILE ).each { | line | - log << line - if ( line =~ /^#/ ) - elsif ( line =~ /^\$\s*(\S+)\s*=\s*(\S+)/ ) - paths[ $1 ] = $2 - puts( '[' + PRG_NAME + '] > paths : ' + $1 + ' => ' + $2 ) - - elsif ( line =~ /^%\s*#{HMM}\s*(\S+)\s*=\s*(\S+)/ ) - hmms[ $1 ] = $2 - puts( '[' + PRG_NAME + '] > hmms : ' + $1 + ' => ' + $2 ) - - elsif ( line =~ /^%\s*#{RSL}\s*(\S+)\s*=\s*(\S+)/ ) - min_lengths[ $1 ] = $2 - puts( '[' + PRG_NAME + '] > min lengths: ' + $1 + ' => ' + $2 ) - - elsif ( line =~ /^%\s*(\S+)\s*=\s*(\S+)/ ) - options[ $1 ] = $2 - puts( '[' + PRG_NAME + '] > options : ' + $1 + ' => ' + $2 ) - - elsif ( line =~ /^>\s*(.+)/ ) - command = command + $1 + ";#{NL}" - - elsif ( line =~ /^-/ ) - commands << prepare( command, paths ) - command = String.new - end - } - log << "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ #{NL}#{NL}" - - files = Dir.entries( "." ) - - files.each { | file | - if ( !File.directory?( file ) && - file !~ /^\./ && - file !~ /#{TEMPLATE_FILE}/ && - file !~ /.bck$/ && - file !~ /.log$/ && - file !~ /nohup/ && - file !~ /^00/ ) - aln_name = file.to_str - id = get_id( aln_name ) - if !ids.include?( id ) - ids.add( id ) - end - puts( '[' + PRG_NAME + '] > file [id] : ' + aln_name + ' [' + id + ']' ) - commands.each do | cmd | - - cmd = subst_hmm( cmd, aln_name, hmms ) - cmd = subst_min_length( cmd, aln_name, min_lengths ) - cmd = subst_options( cmd, options ) - if use_job_submission_system - cmd = subst_aln_name( cmd, PBS_O_WORKDIR + aln_name ) - else - cmd = subst_aln_name( cmd, aln_name ) - end - - if ( cmd =~ /%/ ) - cmd =~ /(%.*?%)/ - problem = $1 - puts( '[' + PRG_NAME + '] > WARNING : [' + id + '] command still contains placeholder: ' + problem ) - log << "WARNING: command still contains placeholder: " + cmd + NL - else - tmp_cmd_file = file.to_str[ 0..4 ] + TMP_CMD_FILE_SUFFIX - if ( File.exists?( tmp_cmd_file ) ) - File.delete( tmp_cmd_file ) - end - if use_job_submission_system - open( tmp_cmd_file, 'w' ) do |f| - f.write( cmd ) - end - end - - log << cmd + NL - - if use_job_submission_system - IO.popen( 'qsub -q ' + QUEUE + ' -l walltime=' + WALLTIME + ' ' + tmp_cmd_file , 'r+' ) do | pipe | - pipe.close_write - end - else - spawn( 'nohup ' + cmd + ' &', STDERR => "/dev/null" ) - end - - sleep( WAIT ) - if ( File.exists?( tmp_cmd_file ) ) - File.delete( tmp_cmd_file ) - end - end - end - end - } - - open( LOG_FILE, 'w' ) do | f | - f.write( log ) - end - - puts() - puts( '[' + PRG_NAME + '] > OK' ) - puts() - - end # def run - - def prepare( command, paths ) - paths.each_pair{ | name, full | - command = command.gsub( name, full ) - } - command - end - - def subst_options( command, options ) - opt_placeholders = command.scan( /%\[\S+\]%/ ) - opt_placeholders.each { | opt_placeholder | - opt_placeholder = opt_placeholder.gsub( OPTION_OPEN , '' ) - opt_placeholder = opt_placeholder.gsub( OPTION_CLOSE, '' ) - opt_value = options[ opt_placeholder ] - if ( opt_value != nil && opt_value.size > 0 ) - command = command.gsub( OPTION_OPEN + opt_placeholder + OPTION_CLOSE, opt_value ) - end - } - command - end - - def subst_aln_name( command, aln_name ) - command = command.gsub( '$', aln_name ) - command - end - - def subst_hmm( command, aln_name, hmms ) - id = get_id( aln_name ) - hmm = hmms[ id ] - if ( hmm != nil && hmm.size > 0 ) - command = command.gsub( OPTION_OPEN + HMM + OPTION_CLOSE, hmm ) - end - command - end - - def subst_min_length( command, aln_name, min_lengths ) - id = get_id( aln_name ) - min_length = min_lengths[ id ] - if ( min_length != nil && min_length.size > 0 ) - command = command.gsub( OPTION_OPEN + RSL + OPTION_CLOSE, min_length ) - else - command = command.gsub( OPTION_OPEN + RSL + OPTION_CLOSE, MIN_LENGTH_DEFAULT.to_s ) - end - command - end - - def get_id( aln_name ) - aln_name =~ /^([^_]+)/ - $1 - end - - end # class PhylogenyFactory - -end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/apps/taxonomy_processor.rb b/forester/ruby/evoruby/lib/evo/apps/taxonomy_processor.rb deleted file mode 100644 index d316f85..0000000 --- a/forester/ruby/evoruby/lib/evo/apps/taxonomy_processor.rb +++ /dev/null @@ -1,317 +0,0 @@ -# -# = lib/evo/apps/taxonomy_processor - TaxonomyProcessor class -# -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: taxonomy_processor.rb,v 1.26 2010/12/13 19:00:11 cmzmasek Exp $ - - -require 'lib/evo/util/util' -require 'lib/evo/msa/msa_factory' -require 'lib/evo/msa/msa' -require 'lib/evo/io/msa_io' -require 'lib/evo/io/parser/fasta_parser' -require 'lib/evo/io/parser/general_msa_parser' -require 'lib/evo/io/writer/fasta_writer' -require 'lib/evo/io/writer/phylip_sequential_writer' -require 'lib/evo/util/command_line_arguments' - -module Evoruby - - class TaxonomyProcessor - - PRG_NAME = "tap" - PRG_DATE = "2012.09.27" - PRG_DESC = "replacement of species names in multiple sequence files" - PRG_VERSION = "1.02" - COPYRIGHT = "2012 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" - - SIMPLE = true - - EXTRACT_TAXONOMY_OPTION = "t" - - def initialize() - @taxonomies = Hash.new() - end - - def run() - - Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC, - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) - - if ( ARGV == nil || ( ARGV.length != 1 && ARGV.length != 3 && ARGV.length != 4 && ARGV.length != 5 && ARGV.length != 6 ) ) - puts( "Usage: #{PRG_NAME}.rb [options] [input map file] [output sequences] [output id list]" ) - puts() - puts( " options: -" + EXTRACT_TAXONOMY_OPTION + ": to extract taxonomy information from bracketed expression" ) - puts() - exit( -1 ) - end - - begin - cla = CommandLineArguments.new( ARGV ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - - - mapfile = nil - input = nil - output = nil - list_file = nil - - - - if cla.get_number_of_files == 4 - mapfile = cla.get_file_name( 0 ) - input = cla.get_file_name( 1 ) - output = cla.get_file_name( 2 ) - list_file = cla.get_file_name( 3 ) - elsif cla.get_number_of_files == 3 - input = cla.get_file_name( 0 ) - output = cla.get_file_name( 1 ) - list_file = cla.get_file_name( 2 ) - elsif cla.get_number_of_files == 1 - input = cla.get_file_name( 0 ) - i = nil - if input.downcase.end_with?( ".fasta" ) - i = input[ 0 .. input.length - 7 ] - elsif input.downcase.end_with?( ".fsa" ) - i = input[ 0 .. input.length - 5 ] - else - i = input - end - output = i + "_ni.fasta" - list_file = i + ".nim" - end - - - allowed_opts = Array.new - allowed_opts.push( EXTRACT_TAXONOMY_OPTION ) - - disallowed = cla.validate_allowed_options_as_str( allowed_opts ) - if ( disallowed.length > 0 ) - Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed ) - end - - extract_taxonomy = false - if ( cla.is_option_set?( EXTRACT_TAXONOMY_OPTION ) ) - extract_taxonomy = true - end - - if ( File.exists?( output ) ) - Util.fatal_error( PRG_NAME, "outfile [" + output + "] already exists" ) - end - if ( File.exists?( list_file ) ) - Util.fatal_error( PRG_NAME, "list file [" + list_file + "] already exists" ) - end - if ( !File.exists?( input) ) - Util.fatal_error( PRG_NAME, "infile [" + input + "] does not exist" ) - end - if ( mapfile != nil && !File.exists?( mapfile ) ) - Util.fatal_error( PRG_NAME, "mapfile [" + mapfile + "] does not exist" ) - end - - fasta_like = Util.looks_like_fasta?( input ) - - puts() - if mapfile != nil - puts( "Map file : " + mapfile ) - end - puts( "Input alignment : " + input ) - puts( "Output alignment: " + output ) - puts( "Name list : " + list_file ) - if ( fasta_like ) - puts( "Format : Fasta" ) - else - puts( "Format : Phylip like" ) - end - if ( extract_taxonomy ) - puts( "Extract taxonomy: true" ) - end - puts() - - species_map = Hash.new - if mapfile != nil - File.open( mapfile ) do | file | - while line = file.gets - if ( line =~/(.+)#(.+)/ || line =~/(.+)\s+(.+)/ ) - species_map[ $1 ] = $2 - Util.print_message( PRG_NAME, "mapping: " + $1 + ' => ' + $2 ) - end - end - end - end - - f = MsaFactory.new() - begin - if ( fasta_like ) - msa = f.create_msa_from_file( input, FastaParser.new() ) - else - msa = f.create_msa_from_file( input, GeneralMsaParser.new() ) - end - rescue Exception => e - Util.fatal_error( PRG_NAME, "failed to read file: " + e.to_s ) - end - - if ( msa == nil || msa.get_number_of_seqs() < 1 ) - Util.fatal_error( PRG_NAME, "failed to read MSA" ) - end - begin - Util.check_file_for_writability( list_file ) - rescue Exception => e - Util.fatal_error( PRG_NAME, "error: " + e.to_, STDOUT ) - end - - #removed = msa.remove_redundant_sequences!( true ) - #if removed.size > 0 - # Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" ) - # removed.each { | seq_name | - # puts seq_name - # } - # Util.print_message( PRG_NAME, "will process " + msa.get_number_of_seqs.to_s + " non redundant sequences" ) - #end - - lf = File.open( list_file, "a" ) - for i in 0 ... msa.get_number_of_seqs - seq = msa.get_sequence( i ) - seq.set_name( Util::normalize_seq_name( modify_name( seq.get_name(), i, lf, species_map, extract_taxonomy ), 10 ) ) - end - - io = MsaIO.new() - w = nil - if ( fasta_like ) - w = FastaWriter.new() - else - w = PhylipSequentialWriter.new() - end - w.set_max_name_length( 10 ) - w.clean( true ) - begin - io.write_to_file( msa, output, w ) - rescue Exception => e - Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s ) - end - lf.close() - if ( @taxonomies.length > 0 ) - Util.print_message( PRG_NAME, "number of unique taxonomies: " + @taxonomies.length.to_s ) - end - Util.print_message( PRG_NAME, "wrote: " + list_file ) - Util.print_message( PRG_NAME, "wrote: " + output ) - Util.print_message( PRG_NAME, "OK" ) - end - - private - - def modify_name( desc, counter, file, species_map, extract_taxonomy ) - new_desc = nil - my_species = nil - # if desc =~ /^>?\s*\S{1,10}_([0-9A-Z]{3,5})/ - if desc =~ /^>?\s*\S{1,10}_([A-Z]{3,5})/ - new_desc = counter.to_s( 16 ) + "_" + $1 - elsif SIMPLE - new_desc = counter.to_s( 16 ) - elsif extract_taxonomy - if ( desc.count( "[" ) != desc.count( "]" ) ) - Util.fatal_error( PRG_NAME, "illegal bracket count in: " + desc ) - end - species = nil - species_map.each_key do | key | - if desc =~ /[\b|_]#{key}\b/ # Added boundaries to prevent e.g. RAT matching ARATH. - species = species_map[ key ] - new_desc = counter.to_s( 16 ) + "_" + species - break - end - end - if species == nil - if desc =~/.*\[(\S{3,}?)\]/ - species = $1 - species.strip! - species.upcase! - species.gsub!( /\s+/, " " ) - species.gsub!( /-/, "" ) - species.gsub!( /\)/, "" ) - species.gsub!( /\(/, "" ) - species.gsub!( /\'/, "" ) - if species =~ /\S+\s\S+/ || species =~ /\S{3,5}/ - if species =~ /(\S+)\s(\S+)/ - code = $1[ 0..2 ] + $2[ 0..1 ] - elsif species =~ /\S{3,5}/ - code = species - elsif species.count( " " ) > 2 - species =~ /(\S+)\s+(\S+)\s+(\S+)$/ - third_last = $1 - second_last = $2 - last = $3 - code = code[ 0 ] + third_last[ 0 ] + second_last[ 0 ] + last[ 0 ] + last[ last.size - 1 ] - elsif species.count( " " ) > 1 - species =~ /(\S+)\s+(\S+)$/ - second_last = $1 - last = $2 - code = code[ 0..1 ] + second_last[ 0 ] + last[ 0 ] + last[ last.size - 1 ] - end - new_desc = counter.to_s( 16 ) + "_" + code - if @taxonomies.has_key?( code ) - if ( !@taxonomies.has_value?( species ) ) - Util.fatal_error( PRG_NAME, "code [#{code}] is not unique in [#{desc}]" ) - end - else - if ( @taxonomies.has_value?( species ) ) - Util.fatal_error( PRG_NAME, "genome [#{species}] is not unique in [#{desc}]" ) - else - @taxonomies[ code ] = species - end - end - else - Util.fatal_error( PRG_NAME, "illegal format [#{species}] in: " + desc ) - end - else - Util.fatal_error( PRG_NAME, "illegal format in: " + desc ) - end - end - else - species = nil - my_species = nil - species_map.each_key do | key | - if desc =~ /#{key}/ - species = species_map[ key ] - species = species.gsub( /\s+/, "" ) - species = species.gsub( /_/, " " ) - my_species = species - if species =~ /(\S+)\s+(\S+)/ - species = $1[0..2] + $2[0..1] - end - species = species.gsub( /\s+/, "" ) - species = species.slice(0, 5) - species.upcase! - break - end - end - if species == nil - Util.fatal_error( PRG_NAME, "species not found in: " + desc ) - else - new_desc = counter.to_s( 16 ) + "_" + species - end - end - if new_desc == nil - Util.fatal_error( PRG_NAME, "failed to extract species from: " + desc ) - end - if my_species != nil - file.print( new_desc + ": " + desc + " [" + my_species + "]" + "\n" ) - else - file.print( new_desc + ": " + desc + "\n" ) - end - new_desc - end - - end # class TaxonomyProcessor - -end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/apps/tseq_taxonomy_processor.rb b/forester/ruby/evoruby/lib/evo/apps/tseq_taxonomy_processor.rb deleted file mode 100644 index f708247..0000000 --- a/forester/ruby/evoruby/lib/evo/apps/tseq_taxonomy_processor.rb +++ /dev/null @@ -1,190 +0,0 @@ -# -# = lib/evo/apps/tseq_taxonomy_processor - TseqTaxonomyProcessor class -# -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: tseq_taxonomy_processor.rb,v 1.6 2010/12/13 19:00:11 cmzmasek Exp $ - - -require 'lib/evo/util/util' -require 'lib/evo/msa/msa_factory' -require 'lib/evo/msa/msa' -require 'lib/evo/io/msa_io' -require 'lib/evo/io/parser/sp_taxonomy_parser' -require 'lib/evo/io/parser/ncbi_tseq_parser' -require 'lib/evo/io/writer/fasta_writer' -require 'lib/evo/io/writer/phylip_sequential_writer' -require 'lib/evo/util/command_line_arguments' - -module Evoruby - - class TseqTaxonomyProcessor - - PRG_NAME = "tseq_tap" - PRG_DATE = "2009.01.06" - PRG_DESC = "preprocessing of multiple sequence files in ncbi tseq xml format" - PRG_VERSION = "1.02" - COPYRIGHT = "2009 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" - - TAXONOMY_CODE = "TAXONOMY_CODE:" - TAXONOMY_ID = "TAXONOMY_ID:" - TAXONOMY_ID_TYPE = "TAXONOMY_ID_TYPE:" - TAXONOMY_SN = "TAXONOMY_SN:" - TAXONOMY_CN = "TAXONOMY_CN:" - SEQ_ACCESSION = "SEQ_ACCESSION:" - SEQ_ACCESSION_SOURCE = "SEQ_ACCESSION_SOURCE:" - SEQ_SECONDARY_ACCESSION = "SEQ_SECONDARY_ACCESSION:" - SEQ_SYMBOL = "SEQ_SYMBOL:" - SEQ_NAME = "SEQ_NAME:" - SEQ_MOL_SEQ = "SEQ_MOL_SEQ:" - - def initialize() - @tax_ids_to_sp_taxonomies = Hash.new() - end - - def run() - - Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC, - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) - - if ARGV == nil || ARGV.length != 4 - puts( "Usage: #{PRG_NAME}.rb " ) - puts() - - exit( -1 ) - end - - begin - cla = CommandLineArguments.new( ARGV ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - allowed_opts = Array.new - disallowed = cla.validate_allowed_options_as_str( allowed_opts ) - if ( disallowed.length > 0 ) - Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed ) - end - - sp_taxonomy_infile = cla.get_file_name( 0 ) - sequences_infile = cla.get_file_name( 1 ) - sequences_outfile = cla.get_file_name( 2 ) - mapping_outfile = cla.get_file_name( 3 ) - - Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile ) - Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile ) - Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile ) - Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile ) - - sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile ) - - Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile ) - - tseq_parser = NcbiTSeqParser.new - msa_fac = MsaFactory.new - - seqs = msa_fac.create_msa_from_file( sequences_infile, tseq_parser ) - - Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile ) - - removed = seqs.remove_redundant_sequences!( true, true ) - - if removed.size > 0 - Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" ) - removed.each { | seq_name | - puts seq_name - } - Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" ) - end - - mapping_out = File.open( mapping_outfile, "a" ) - - for i in 0 ... seqs.get_number_of_seqs - seq = seqs.get_sequence( i ) - if seq.get_taxonomy == nil - Util.fatal_error( PRG_NAME, "sequence [" + seq.get_name + "] has no taxonomy information" ) - end - seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) ) - end - - io = MsaIO.new() - - w = FastaWriter.new() - - w.set_max_name_length( 10 ) - w.clean( true ) - begin - io.write_to_file( seqs, sequences_outfile, w ) - rescue Exception => e - Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s ) - end - mapping_out.close() - - Util.print_message( PRG_NAME, "wrote: " + mapping_outfile ) - Util.print_message( PRG_NAME, "wrote: " + sequences_outfile ) - Util.print_message( PRG_NAME, "OK" ) - - end - - private - - def modify_name( seq, i, sp_taxonomies, mapping_outfile ) - - tax_id = seq.get_taxonomy.get_id - matching_sp_taxonomy = nil - - if @tax_ids_to_sp_taxonomies.has_key?( tax_id ) - # This is so that a second lookup will be much faster. - matching_sp_taxonomy = @tax_ids_to_sp_taxonomies[ tax_id ] - else - sp_taxonomies.each { |sp_taxonomy| - if ( sp_taxonomy.id == tax_id ) - if matching_sp_taxonomy != nil - Util.fatal_error( PRG_NAME, "taxonomy id [" + tax_id.to_s + "] is not unique" ) - end - matching_sp_taxonomy = sp_taxonomy - @tax_ids_to_sp_taxonomies[ tax_id ] = sp_taxonomy - end - } - end - if matching_sp_taxonomy == nil - Util.fatal_error( PRG_NAME, "taxonomy id [" + tax_id.to_s + "] for [" + seq.get_taxonomy.get_name + "] not found" ) - end - - new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code - - seq_name = seq.get_name - if seq_name =~ /\[.+\]$/ - # Redundant taxonomy information hides here. - seq_name = seq_name.sub(/\[.+\]$/, '') - end - if seq_name =~ /^\s*hypothetical\s+protein\s*/i - # Pointless information. - seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' ) - end - - mapping_outfile.print( new_name + "\t" + - TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" + - TAXONOMY_ID + tax_id + "\t" + - TAXONOMY_ID_TYPE + seq.get_taxonomy.get_id_source + "\t" + - TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" + - SEQ_ACCESSION + seq.get_accession + "\t" + - SEQ_ACCESSION_SOURCE + seq.get_accession_source + "\t" + - SEQ_SYMBOL + seq.get_symbol + "\t" + - SEQ_NAME + seq_name + "\t" + - SEQ_MOL_SEQ + seq.get_sequence_as_string + - Constants::LINE_DELIMITER ) - new_name - end - - end - -end # module Evoruby \ No newline at end of file