From 8edcad512a3039c1940c03b72dd3d5c2cb8509ac Mon Sep 17 00:00:00 2001 From: cmzmasek Date: Wed, 15 Feb 2017 17:10:56 -0800 Subject: [PATCH] in progress --- forester/ruby/evoruby/exe/fae.rb | 19 -- forester/ruby/evoruby/exe/fastx.rb | 17 ++ forester/ruby/evoruby/exe/hsa.rb | 8 +- forester/ruby/evoruby/exe/hsp.rb | 14 +- .../lib/evo/io/parser/hmmscan_domain_extractor.rb | 207 ++++++++-------- .../lib/evo/tool/domain_sequence_extractor.rb | 155 +++++++----- .../evoruby/lib/evo/tool/domains_to_forester.rb | 22 +- .../ruby/evoruby/lib/evo/tool/fasta_extractor.rb | 248 +++++++++----------- .../ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb | 7 +- .../ruby/evoruby/lib/evo/tool/hmmscan_summary.rb | 8 +- .../evoruby/lib/evo/tool/taxonomy_processor.rb | 4 +- forester/ruby/evoruby/lib/evo/util/constants.rb | 1 + forester/ruby/scripts/hmm_split.rb | 136 +++++------ 13 files changed, 415 insertions(+), 431 deletions(-) delete mode 100755 forester/ruby/evoruby/exe/fae.rb create mode 100755 forester/ruby/evoruby/exe/fastx.rb mode change 100755 => 100644 forester/ruby/evoruby/exe/hsp.rb diff --git a/forester/ruby/evoruby/exe/fae.rb b/forester/ruby/evoruby/exe/fae.rb deleted file mode 100755 index f75af89..0000000 --- a/forester/ruby/evoruby/exe/fae.rb +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/local/bin/ruby -w -# -# = exe/fae -# -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: fae.rb,v 1.1 2008/09/10 02:16:34 cmzmasek Exp $ - - -require 'lib/evo/tools/fasta_extractor' - -module Evoruby - - mse = FastaExtractor.new() - - mse.run() - -end # module Evoruby \ No newline at end of file diff --git a/forester/ruby/evoruby/exe/fastx.rb b/forester/ruby/evoruby/exe/fastx.rb new file mode 100755 index 0000000..62c720a --- /dev/null +++ b/forester/ruby/evoruby/exe/fastx.rb @@ -0,0 +1,17 @@ +#!/usr/local/bin/ruby -w +# +# = exe/fastx +# +# +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) + +require 'lib/evo/tool/fasta_extractor' + +module Evoruby + + mse = FastaExtractor.new() + + mse.run() + +end # module Evoruby \ No newline at end of file diff --git a/forester/ruby/evoruby/exe/hsa.rb b/forester/ruby/evoruby/exe/hsa.rb index a79da24..b5987e4 100755 --- a/forester/ruby/evoruby/exe/hsa.rb +++ b/forester/ruby/evoruby/exe/hsa.rb @@ -2,12 +2,8 @@ # # = exe/hsp # -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: hsp.rb,v 1.1 2009/11/25 05:42:04 cmzmasek Exp $ -# -# last modified: 11/24/2009 +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) require 'lib/evo/tool/hmmscan_analysis' diff --git a/forester/ruby/evoruby/exe/hsp.rb b/forester/ruby/evoruby/exe/hsp.rb old mode 100755 new mode 100644 index 7b09d13..105d3dc --- a/forester/ruby/evoruby/exe/hsp.rb +++ b/forester/ruby/evoruby/exe/hsp.rb @@ -2,19 +2,15 @@ # # = exe/hsp # -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: hsp.rb,v 1.1 2009/11/25 05:42:04 cmzmasek Exp $ -# -# last modified: 11/24/2009 +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) require 'lib/evo/tool/hmmscan_summary' module Evoruby - hsp = HmmscanSummary.new() + hsp = HmmscanSummary.new() - hsp.run() + hsp.run() -end # module Evoruby +end # module Evoruby \ No newline at end of file diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb index 39a0088..5186bf5 100644 --- a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb @@ -1,11 +1,8 @@ # # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanDomainExtractor class # -# Copyright:: Copyright (C) 2012 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: $ - +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) require 'lib/evo/util/constants' require 'lib/evo/msa/msa_factory' @@ -14,31 +11,27 @@ require 'lib/evo/io/writer/fasta_writer' require 'lib/evo/io/parser/fasta_parser' require 'lib/evo/io/parser/hmmscan_parser' - - module Evoruby - class HmmscanDomainExtractor ADD_TO_CLOSE_PAIRS = 0 - def initialize end # raises ArgumentError, IOError, StandardError def parse( domain_id, - hmmscan_output, - fasta_sequence_file, - outfile, - passed_seqs_outfile, - failed_seqs_outfile, - e_value_threshold, - length_threshold, - add_position, - add_domain_number, - add_species, - min_linker, - log ) + hmmscan_output, + fasta_sequence_file, + outfile, + passed_seqs_outfile, + failed_seqs_outfile, + e_value_threshold, + length_threshold, + add_position, + add_domain_number, + add_species, + min_linker, + log ) Util.check_file_for_readability( hmmscan_output ) Util.check_file_for_readability( fasta_sequence_file ) @@ -106,7 +99,7 @@ module Evoruby i_e_value = r.i_e_value if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) && - ( ( length_threshold <= 0 ) || ( env_to - env_from + 1 ) >= length_threshold.to_f ) ) + ( ( length_threshold <= 0 ) || ( env_to - env_from + 1 ) >= length_threshold.to_f ) ) hmmscan_datas << HmmsearchData.new( sequence, number, out_of, env_from, env_to, i_e_value ) if ( number > max_domain_copy_number_per_protein ) max_domain_copy_number_sequence = sequence @@ -137,21 +130,21 @@ module Evoruby if number == out_of if !hmmscan_datas.empty? process_hmmscan_datas( hmmscan_datas, - in_msa, - add_position, - add_domain_number, - add_species, - out_msa, - out_msa_singles, - out_msa_pairs, - out_msa_isolated, - min_linker, - out_msa_single_domains_protein_seqs, - out_msa_close_pairs_protein_seqs, - out_msa_close_pairs_only_protein_seqs, - out_msa_isolated_protein_seqs, - out_msa_isolated_only_protein_seqs, - out_msa_isolated_and_close_pair_protein_seqs ) + in_msa, + add_position, + add_domain_number, + add_species, + out_msa, + out_msa_singles, + out_msa_pairs, + out_msa_isolated, + min_linker, + out_msa_single_domains_protein_seqs, + out_msa_close_pairs_protein_seqs, + out_msa_close_pairs_only_protein_seqs, + out_msa_isolated_protein_seqs, + out_msa_isolated_only_protein_seqs, + out_msa_isolated_and_close_pair_protein_seqs ) domain_pass_counter += hmmscan_datas.length if passed_seqs.find_by_name_start( sequence, true ).length < 1 add_sequence( sequence, in_msa, passed_seqs ) @@ -193,11 +186,11 @@ module Evoruby write_msa( failed_seqs, failed_seqs_outfile ) if out_msa_pairs - write_msa( out_msa_pairs, outfile +"_" + min_linker.to_s + ".fasta") + write_msa( out_msa_pairs, outfile + "_" + min_linker.to_s + ".fasta") end if out_msa_singles - write_msa( out_msa_singles, outfile +"_singles.fasta") + write_msa( out_msa_singles, outfile + "_singles.fasta") end if out_msa_isolated @@ -205,7 +198,7 @@ module Evoruby end if out_msa_single_domains_protein_seqs - write_msa( out_msa_single_domains_protein_seqs, outfile +"_proteins_with_singles.fasta" ) + write_msa( out_msa_single_domains_protein_seqs, outfile + "_proteins_with_singles.fasta" ) end if out_msa_close_pairs_protein_seqs @@ -230,9 +223,9 @@ module Evoruby if min_linker if ( out_msa_single_domains_protein_seqs.get_number_of_seqs + - out_msa_close_pairs_only_protein_seqs.get_number_of_seqs + - out_msa_isolated_only_protein_seqs.get_number_of_seqs + - out_msa_isolated_and_close_pair_protein_seqs.get_number_of_seqs ) != passed_seqs.get_number_of_seqs + out_msa_close_pairs_only_protein_seqs.get_number_of_seqs + + out_msa_isolated_only_protein_seqs.get_number_of_seqs + + out_msa_isolated_and_close_pair_protein_seqs.get_number_of_seqs ) != passed_seqs.get_number_of_seqs error_msg = "this should not have happened" raise StandardError, error_msg end @@ -263,7 +256,6 @@ module Evoruby end # parse - private def write_msa( msa, filename ) @@ -279,7 +271,6 @@ module Evoruby end end - def add_sequence( sequence_name, in_msa, add_to_msa ) seqs = in_msa.find_by_name_start( sequence_name, true ) if ( seqs.length < 1 ) @@ -295,21 +286,21 @@ module Evoruby end def process_hmmscan_datas( hmmscan_datas, - in_msa, - add_position, - add_domain_number, - add_species, - out_msa, - out_msa_singles, - out_msa_pairs, - out_msa_isolated, - min_linker, - out_msa_single_domains_protein_seqs, - out_msa_close_pairs_protein_seqs, - out_msa_close_pairs_only_protein_seqs, - out_msa_isolated_protein_seqs, - out_msa_isolated_only_protein_seqs, - out_msa_isolated_and_close_pair_protein_seqs ) + in_msa, + add_position, + add_domain_number, + add_species, + out_msa, + out_msa_singles, + out_msa_pairs, + out_msa_isolated, + min_linker, + out_msa_single_domains_protein_seqs, + out_msa_close_pairs_protein_seqs, + out_msa_close_pairs_only_protein_seqs, + out_msa_isolated_protein_seqs, + out_msa_isolated_only_protein_seqs, + out_msa_isolated_and_close_pair_protein_seqs ) actual_out_of = hmmscan_datas.size saw_close_pair = false @@ -327,28 +318,28 @@ module Evoruby seq_name = hmmscan_data.seq_name extract_domain( seq_name, - index + 1, - actual_out_of, - hmmscan_data.env_from, - hmmscan_data.env_to, - in_msa, - out_msa, - add_position, - add_domain_number, - add_species ) + index + 1, + actual_out_of, + hmmscan_data.env_from, + hmmscan_data.env_to, + in_msa, + out_msa, + add_position, + add_domain_number, + add_species ) if min_linker if actual_out_of == 1 extract_domain( seq_name, - 1, - 1, - hmmscan_data.env_from, - hmmscan_data.env_to, - in_msa, - out_msa_singles, - add_position, - add_domain_number, - add_species ) + 1, + 1, + hmmscan_data.env_from, + hmmscan_data.env_to, + in_msa, + out_msa_singles, + add_position, + add_domain_number, + add_species ) if out_msa_single_domains_protein_seqs.find_by_name_start( seq_name, true ).length < 1 add_sequence( seq_name, in_msa, out_msa_single_domains_protein_seqs ) else @@ -361,20 +352,20 @@ module Evoruby last = index == hmmscan_datas.length - 1 if ( ( first && ( ( hmmscan_datas[ index + 1 ].env_from - hmmscan_data.env_to ) > min_linker) ) || - ( last && ( ( hmmscan_data.env_from - hmmscan_datas[ index - 1 ].env_to ) > min_linker ) ) || - ( !first && !last && ( ( hmmscan_datas[ index + 1 ].env_from - hmmscan_data.env_to ) > min_linker ) && - ( ( hmmscan_data.env_from - hmmscan_datas[ index - 1 ].env_to ) > min_linker ) ) ) + ( last && ( ( hmmscan_data.env_from - hmmscan_datas[ index - 1 ].env_to ) > min_linker ) ) || + ( !first && !last && ( ( hmmscan_datas[ index + 1 ].env_from - hmmscan_data.env_to ) > min_linker ) && + ( ( hmmscan_data.env_from - hmmscan_datas[ index - 1 ].env_to ) > min_linker ) ) ) extract_domain( seq_name, - index + 1, - actual_out_of, - hmmscan_data.env_from, - hmmscan_data.env_to, - in_msa, - out_msa_isolated, - add_position, - add_domain_number, - add_species ) + index + 1, + actual_out_of, + hmmscan_data.env_from, + hmmscan_data.env_to, + in_msa, + out_msa_isolated, + add_position, + add_domain_number, + add_species ) saw_isolated = true elsif !first @@ -396,15 +387,15 @@ module Evoruby end extract_domain( seq_name, - index.to_s + "+" + ( index + 1 ).to_s, - actual_out_of, - from, - to, - in_msa, - out_msa_pairs, - add_position, - add_domain_number, - add_species ) + index.to_s + "+" + ( index + 1 ).to_s, + actual_out_of, + from, + to, + in_msa, + out_msa_pairs, + add_position, + add_domain_number, + add_species ) saw_close_pair = true end end @@ -456,15 +447,15 @@ module Evoruby end # def process_hmmscan_data def extract_domain( sequence, - number, - out_of, - seq_from, - seq_to, - in_msa, - out_msa, - add_position, - add_domain_number, - add_species ) + number, + out_of, + seq_from, + seq_to, + in_msa, + out_msa, + add_position, + add_domain_number, + add_species ) if number.is_a?( Fixnum ) && ( number < 1 || out_of < 1 || number > out_of ) error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s raise StandardError, error_msg diff --git a/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb index a8144ec..ac03d43 100644 --- a/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb @@ -13,9 +13,9 @@ module Evoruby class DomainSequenceExtractor PRG_NAME = "dsx" - PRG_VERSION = "2.001" - PRG_DESC = "extraction of domain sequences from hmmscan output" - PRG_DATE = "20170213" + PRG_VERSION = "2.002" + PRG_DESC = "Extraction of domain sequences from hmmscan output" + PRG_DATE = "20170214" WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" E_VALUE_THRESHOLD_OPTION = 'e' @@ -23,7 +23,6 @@ module Evoruby ADD_POSITION_OPTION = 'p' ADD_DOMAIN_NUMBER_OPTION = 'd' ADD_SPECIES = 's' - MIN_LINKER_OPT = 'ml' LOG_FILE_SUFFIX = '_domain_seq_extr.log' PASSED_SEQS_SUFFIX = '_with_passing_domains.fasta' FAILED_SEQS_SUFFIX = '_with_no_passing_domains.fasta' @@ -38,7 +37,10 @@ module Evoruby WWW, STDOUT ) - ld = Constants::LINE_DELIMITER + if ( ARGV == nil || ( ARGV.length < 1 ) ) + print_help + exit( -1 ) + end begin cla = CommandLineArguments.new( ARGV ) @@ -52,7 +54,7 @@ module Evoruby exit( 0 ) end - if ( cla.get_number_of_files != 4 ) + unless ( cla.get_number_of_files == 2 || cla.get_number_of_files == 3 || cla.get_number_of_files == 4 ) print_help exit( -1 ) end @@ -63,7 +65,6 @@ module Evoruby allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION ) allowed_opts.push( LENGTH_THRESHOLD_OPTION ) allowed_opts.push( ADD_SPECIES ) - allowed_opts.push( MIN_LINKER_OPT ) disallowed = cla.validate_allowed_options_as_str( allowed_opts ) if ( disallowed.length > 0 ) @@ -72,75 +73,101 @@ module Evoruby STDOUT ) end - domain_id = cla.get_file_name( 0 ) - hmmsearch_output = cla.get_file_name( 1 ) - fasta_sequence_file = cla.get_file_name( 2 ) - outfile = cla.get_file_name( 3 ) - - if outfile.downcase.end_with?( ".fasta" ) - outfile = outfile[ 0 .. outfile.length - 7 ] - elsif outfile.downcase.end_with?( ".fsa" ) - outfile = outfile[ 0 .. outfile.length - 5 ] - end - - add_position = false - if ( cla.is_option_set?( ADD_POSITION_OPTION ) ) - add_position = true - end - - add_domain_number = false - if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) ) - add_domain_number = true - end - - add_species = false - if cla.is_option_set? ADD_SPECIES - add_species = true - end - - e_value_threshold = -1.0 + e_value_threshold = 0.1 if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) ) begin e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION ) rescue ArgumentError => e - Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end if ( e_value_threshold < 0.0 ) - Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT ) + Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT ) end end - length_threshold = -1 + length_threshold = 50 if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) ) begin length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION ) rescue ArgumentError => e - Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end if ( length_threshold < 0) - Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT ) + Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT ) end end - min_linker = nil - if ( cla.is_option_set?( MIN_LINKER_OPT ) ) - begin - min_linker = cla.get_option_value_as_int( MIN_LINKER_OPT ) - rescue ArgumentError => e - Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + domain_id = cla.get_file_name( 0 ) + hmmscan_output = cla.get_file_name( 1 ) + fasta_sequence_file = "" + outfile = "" + + if (cla.get_number_of_files == 4) + fasta_sequence_file = cla.get_file_name( 2 ) + outfile = cla.get_file_name( 3 ) + elsif (cla.get_number_of_files == 2 || cla.get_number_of_files == 3) + if (cla.get_number_of_files == 3) + fasta_sequence_file = cla.get_file_name( 2 ) + else + hmmscan_index = hmmscan_output.index(Constants::HMMSCAN) + if ( hmmscan_index != nil ) + prefix = hmmscan_output[0 .. hmmscan_index-1 ] + suffix = Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX + files = Dir.entries( "." ) + matching_files = Util.get_matching_files( files, prefix, suffix) + if matching_files.length < 1 + Util.fatal_error( PRG_NAME, 'no file matching [' + prefix + + '...' + suffix + '] present in current directory: need to indicate as second argument' ) + end + if matching_files.length > 1 + Util.fatal_error( PRG_NAME, 'more than one file matching [' + + prefix + '...' + suffix + '] present in current directory: need to indicate as second argument' ) + end + fasta_sequence_file = matching_files[ 0 ] + else + Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' ) + end end - if ( !min_linker || min_linker > 100 || min_linker < -100 ) - Forester::Util.fatal_error( PRG_NAME, "unexpected value for min linker " + min_linker.to_s, STDOUT ) + hmmscan_index = hmmscan_output.index(Constants::HMMSCAN) + if ( hmmscan_index != nil ) + outfile = hmmscan_output.sub(Constants::HMMSCAN, "_") + "_" + domain_id + e = e_value_threshold >= 1 ? e_value_threshold.to_i : e_value_threshold.to_s.sub!('.','_') + outfile += "_E" + e.to_s + outfile += "_L" + length_threshold.to_i.to_s + else + Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' ) end end + if outfile.downcase.end_with?( ".fasta" ) + outfile = outfile[ 0 .. outfile.length - 7 ] + elsif outfile.downcase.end_with?( ".fsa" ) + outfile = outfile[ 0 .. outfile.length - 5 ] + end + + add_position = false + if ( cla.is_option_set?( ADD_POSITION_OPTION ) ) + add_position = true + end + + add_domain_number = false + if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) ) + add_domain_number = true + end + + add_species = false + if cla.is_option_set?( ADD_SPECIES ) + add_species = true + end + log = String.new + ld = Constants::LINE_DELIMITER puts() puts( "Domain : " + domain_id ) log << "Domain : " + domain_id + ld - puts( "Hmmscan outputfile : " + hmmsearch_output ) - log << "Hmmscan outputfile : " + hmmsearch_output + ld + puts( "Hmmscan outputfile : " + hmmscan_output ) + log << "Hmmscan outputfile : " + hmmscan_output + ld puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file ) log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld puts( "Outputfile : " + outfile + ".fasta" ) @@ -165,11 +192,6 @@ module Evoruby puts( "Length threshold : no threshold" ) log << "Length threshold : no threshold" + ld end - if ( min_linker ) - puts( "Min linker : " + min_linker.to_s ) - log << "Min linker : " + min_linker.to_s + ld - - end if ( add_position ) puts( "Add positions (rel to complete seq) to extracted domains: true" ) @@ -193,7 +215,7 @@ module Evoruby begin parser = HmmscanDomainExtractor.new() domain_count = parser.parse( domain_id, - hmmsearch_output, + hmmscan_output, fasta_sequence_file, outfile, outfile + PASSED_SEQS_SUFFIX, @@ -203,7 +225,7 @@ module Evoruby add_position, add_domain_number, add_species, - min_linker, + false, log ) rescue ArgumentError, IOError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) @@ -239,14 +261,21 @@ module Evoruby puts() puts( "Usage:" ) puts() - puts( " " + PRG_NAME + ".rb [options] " ) + puts( " " + PRG_NAME + ".rb [options] [file containing complete sequences in fasta format] [outputfile]" ) + puts() + puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "= : iE-value threshold, default is 0.1" ) + puts( " -" + LENGTH_THRESHOLD_OPTION + "= : length threshold, default is 50" ) + puts( " -" + ADD_DOMAIN_NUMBER_OPTION + " : to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" ) + puts( " -" + ADD_POSITION_OPTION + " : to add positions (rel to complete seq) to extracted domains" ) + puts( " -" + ADD_SPECIES + " : to add species [in brackets]" ) puts() - puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=: iE-value threshold, default is no threshold" ) - puts( " -" + LENGTH_THRESHOLD_OPTION + "=: length threshold, default is no threshold" ) - puts( " -" + ADD_POSITION_OPTION + ": to add positions (rel to complete seq) to extracted domains" ) - puts( " -" + ADD_DOMAIN_NUMBER_OPTION + ": to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" ) - puts( " -" + ADD_SPECIES + ": to add species [in brackets]" ) - puts( " -" + MIN_LINKER_OPT + "=: to extract pairs of same domains with a distance inbetween shorter than a given value" ) + puts( "Examples:" ) + puts + puts( " " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 P53_ni.fasta P53_#{Constants::PFAM_V_FOR_EX}_10_Pkinase_E1_0e-06_L50" ) + puts + puts( " " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 P53_ni.fasta" ) + puts + puts( " " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10" ) puts() end diff --git a/forester/ruby/evoruby/lib/evo/tool/domains_to_forester.rb b/forester/ruby/evoruby/lib/evo/tool/domains_to_forester.rb index b928468..48459ae 100644 --- a/forester/ruby/evoruby/lib/evo/tool/domains_to_forester.rb +++ b/forester/ruby/evoruby/lib/evo/tool/domains_to_forester.rb @@ -183,7 +183,7 @@ module Evoruby if ( cla.get_number_of_files == 2 ) original_sequences_file = cla.get_file_name( 1 ) else - hmmscan_index = domains_list_file.index("hmmscan") + hmmscan_index = domains_list_file.index(Constants::HMMSCAN) if ( hmmscan_index != nil ) prefix = domains_list_file[0 .. hmmscan_index-1 ] suffix = Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX @@ -198,14 +198,18 @@ module Evoruby prefix + '...' + suffix + '] present in current directory: need to indicate as second argument' ) end original_sequences_file = matching_files[ 0 ] + else + Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' ) end + end outfile = domains_list_file if (outfile.end_with?(Constants::DOMAIN_TABLE_SUFFIX) ) outfile = outfile.chomp(Constants::DOMAIN_TABLE_SUFFIX) end if ( e_value_threshold >= 0.0 ) - outfile = outfile + Constants::DOMAINS_TO_FORESTER_EVALUE_CUTOFF_SUFFIX + e_value_threshold.to_s + e = e_value_threshold >= 1 ? e_value_threshold.to_i : e_value_threshold.to_s.sub!('.','_') + outfile = outfile + Constants::DOMAINS_TO_FORESTER_EVALUE_CUTOFF_SUFFIX + e.to_s end outfile = outfile + Constants::DOMAINS_TO_FORESTER_OUTFILE_SUFFIX end @@ -248,7 +252,7 @@ module Evoruby puts Util.print_message( PRG_NAME, "wrote: " + outfile ) - Util.print_message( PRG_NAME, "next steps in standard analysis pipeline: hmmsearch followed by dsx.rb") + Util.print_message( PRG_NAME, "next step in standard analysis pipeline: dsx.rb") Util.print_message( PRG_NAME, 'OK' ) puts @@ -262,16 +266,18 @@ module Evoruby puts puts( " " + PRG_NAME + ".rb [options] [file containing complete sequences in fasta format] [outputfile]" ) puts() - puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "= : E-value threshold, default is no threshold" ) - puts( " -" + OVERWRITE_IF_SAME_FROM_TO_OPTION + " : overwrite domain with same start and end with domain with better E-value" ) + puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=: E-value threshold, default is no threshold" ) + puts( " -" + OVERWRITE_IF_SAME_FROM_TO_OPTION + " : overwrite domain with same start and end with domain with better E-value" ) puts + puts( " [next step in standard analysis pipeline: dsx.rb]") + puts() puts( "Examples:" ) puts - puts( " " + PRG_NAME + ".rb P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table P53_ni.fasta P53_hmmscan_300_10.dff" ) + puts( " " + PRG_NAME + ".rb -o P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table P53_ni.fasta P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10.dff" ) puts - puts( " " + PRG_NAME + ".rb P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table P53_ni.fasta" ) + puts( " " + PRG_NAME + ".rb -o P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table P53_ni.fasta" ) puts - puts( " " + PRG_NAME + ".rb P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table" ) + puts( " " + PRG_NAME + ".rb -o P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table" ) puts() end diff --git a/forester/ruby/evoruby/lib/evo/tool/fasta_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/fasta_extractor.rb index b2d0d5c..d9ad2a8 100644 --- a/forester/ruby/evoruby/lib/evo/tool/fasta_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/tool/fasta_extractor.rb @@ -1,146 +1,124 @@ # # = lib/evo/apps/fasta_extractor.rb - FastaExtractor class # -# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: fasta_extractor.rb,v 1.2 2010/12/13 19:00:11 cmzmasek Exp $ - +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) -require 'lib/evo/util/util' require 'lib/evo/util/constants' +require 'lib/evo/util/util' require 'lib/evo/util/command_line_arguments' - module Evoruby - - class FastaExtractor - - PRG_NAME = "fae" - PRG_VERSION = "1.0.0" - PRG_DESC = "extraction of nucleotide sequences from a fasta file by names from wublast search" - PRG_DATE = "2008.08.09" - COPYRIGHT = "2008-2009 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" - HELP_OPTION_1 = 'help' - HELP_OPTION_2 = 'h' - - - def run() - - Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC , - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) - - ld = Constants::LINE_DELIMITER - - begin - cla = CommandLineArguments.new( ARGV ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - - if ( cla.is_option_set?( HELP_OPTION_1 ) || - cla.is_option_set?( HELP_OPTION_2 ) ) - print_help - exit( 0 ) - end - - if ( cla.get_number_of_files != 3 ) - print_help - exit( -1 ) - end - - allowed_opts = Array.new - - disallowed = cla.validate_allowed_options_as_str( allowed_opts ) - if ( disallowed.length > 0 ) - Util.fatal_error( PRG_NAME, - "unknown option(s): " + disallowed, - STDOUT ) + class FastaExtractor + + PRG_NAME = "fastx" + PRG_VERSION = "1.000" + PRG_DESC = "extraction of molecular sequences from a fasta file" + PRG_DATE = "170215" + WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" + + HELP_OPTION_1 = 'help' + HELP_OPTION_2 = 'h' + def run() + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC , + PRG_DATE, + WWW, + STDOUT ) + + if ( ARGV == nil || ( ARGV.length < 1 ) ) + print_help + exit( -1 ) + end + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + if ( cla.is_option_set?( HELP_OPTION_1 ) || + cla.is_option_set?( HELP_OPTION_2 ) ) + print_help + exit( 0 ) + end + + if ( cla.get_number_of_files != 3 ) + print_help + exit( -1 ) + end + + allowed_opts = Array.new + + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, + "unknown option(s): " + disallowed, + STDOUT ) + end + + input_file = cla.get_file_name( 0 ) + query = cla.get_file_name( 1 ) + output_file = cla.get_file_name( 2 ) + + if !File.exist?( input_file ) + Util.fatal_error( PRG_NAME, "error: input file [#{input_file}] does not exist" ) + end + if File.exist?( output_file ) + Util.fatal_error( PRG_NAME, "error: [#{output_file}] already exists" ) + end + + results = extract_sequences( query, input_file, output_file ) + + Util.print_message( PRG_NAME, "matched: " + results ) + Util.print_message( PRG_NAME, "wrote: " + output_file ) + Util.print_message( PRG_NAME, "OK" ) + + end + + def extract_sequences( query, fasta_file, output_file ) + output = File.open( output_file, "a" ) + matching_state = false + matches = 0 + total = 0 + File.open( fasta_file ) do | file | + while line = file.gets + if !Util.is_string_empty?( line ) + if line =~ /^\s*>/ + total += 1 + if total % 10000 == 0 + STDOUT.write "\r#{matches}/#{total}" + STDOUT.flush + end + if line =~ /#{query}/ + matching_state = true + matches += 1 + output.print( line ) + else + matching_state = false + end + elsif matching_state + output.print( line ) end - - input_file = cla.get_file_name( 0 ) - names_file = cla.get_file_name( 1 ) - output_file = cla.get_file_name( 2 ) - - if !File.exist?( input_file ) - Util.fatal_error( PRG_NAME, "error: input file [#{input_file}] does not exist" ) - end - if !File.exist?( names_file ) - Util.fatal_error( PRG_NAME, "error: names file [#{names_file}] does not exist" ) - end - if File.exist?( output_file ) - Util.fatal_error( PRG_NAME, "error: [#{output_file }] already exists" ) - end - - names = extract_names_with_frames( names_file ) - - extract_sequences( names, input_file, output_file ) - - puts - Util.print_message( PRG_NAME, "OK" ) - puts - + end end - - - def extract_names_with_frames( names_file ) - names = Hash.new() - File.open( names_file ) do | file | - while line = file.gets - if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) ) - if ( line =~ /(\S+)\s+([+|-]\d)\s+\d+\s+(\S+)/ ) - name = $1 - frame = $2 - e = $3 - names[ name ] = "[" + frame + "] [" + e + "]" - end - end - end - end - names - end - - def extract_sequences( names, fasta_file, output_file ) - output = File.open( output_file, "a" ) - matching_state = false - counter = 0 - File.open( fasta_file ) do | file | - while line = file.gets - if !Util.is_string_empty?( line ) - if ( line =~ /\s*>\s*(.+)/ ) - name = $1 - if names.has_key?( name ) - matching_state = true - counter += 1 - puts counter.to_s + ". " +name + " " + names[ name ] - output.print( ">" + name + " " + names[ name ] ) - output.print( Evoruby::Constants::LINE_DELIMITER ) - else - matching_state = false - end - elsif matching_state - output.print( line ) - end - end - end - end - output.close() - end - - def print_help() - puts( "Usage:" ) - puts() - puts( " " + PRG_NAME + ".rb " ) - puts() - end - - end # class FastaExtractor + end + output.close() + matches.to_s + "/" + total.to_s + end + + def print_help() + puts( "Usage:" ) + puts() + puts( " " + PRG_NAME + ".rb " ) + puts() + puts( "Examples:" ) + puts + puts( " " + PRG_NAME + ".rb Pfam-A.fasta kinase kinases" ) + puts() + end + + end # class FastaExtractor end \ No newline at end of file diff --git a/forester/ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb b/forester/ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb index f702b55..203d110 100644 --- a/forester/ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb +++ b/forester/ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb @@ -1,11 +1,8 @@ # # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class # -# Copyright:: Copyright (C) 2012 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $ -# +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) require 'lib/evo/util/constants' require 'lib/evo/util/util' diff --git a/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb index caff316..6e9afb6 100644 --- a/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb +++ b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb @@ -153,7 +153,7 @@ module Evoruby end puts() - puts( "hmmpfam outputfile : " + inpath ) + puts( "hmmscan outputfile : " + inpath ) puts( "outputfile : " + outpath ) if ( i_e_value_threshold >= 0.0 ) @@ -469,9 +469,11 @@ module Evoruby puts( " -" + FS_E_VALUE_THRESHOLD_OPTION + "=: E-value threshold for full protein sequences, only for protein architectures summary" ) puts( " -" + SPECIES_OPTION + "= : species for protein architectures summary" ) puts() - puts( "Example:" ) + puts( " [next step in standard analysis pipeline: d2f.rb]") puts() - puts( " " + "hmmscan --nobias --domtblout P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 -E 10 Pfam-A.hmm P53_ni.fasta" ) + puts( "Examples:" ) + puts() + puts( " " + "hmmscan --max --domtblout P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 -E 10 Pfam-A.hmm P53_ni.fasta" ) puts() puts( " " + PRG_NAME + ".rb P53_hmmscan_300_10" ) puts() diff --git a/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb b/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb index 6a4bb5f..1d8a25b 100644 --- a/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb +++ b/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb @@ -19,7 +19,7 @@ module Evoruby class TaxonomyProcessor PRG_NAME = "tap" - PRG_DATE = "170213" + PRG_DATE = "170214" PRG_DESC = "Replacement of labels in multiple sequence files" PRG_VERSION = "2.004" WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" @@ -208,6 +208,8 @@ module Evoruby puts( " options: -" + EXTRACT_TAXONOMY_OPTION + " : to extract taxonomy information from bracketed expressions" ) puts( " -" + ANNOTATION_OPTION + "=: to add an annotation to all entries" ) puts() + puts( " [next steps in standard analysis pipeline: hmmscan followed by hsp.rb]") + puts() puts( "Example:" ) puts() puts( " " + PRG_NAME + ".rb P53.fasta" ) diff --git a/forester/ruby/evoruby/lib/evo/util/constants.rb b/forester/ruby/evoruby/lib/evo/util/constants.rb index b54801b..c9e15f9 100644 --- a/forester/ruby/evoruby/lib/evo/util/constants.rb +++ b/forester/ruby/evoruby/lib/evo/util/constants.rb @@ -14,6 +14,7 @@ module Evoruby ID_NORMALIZED_FASTA_FILE_SUFFIX = "_ni.fasta" ID_MAP_FILE_SUFFIX = ".nim" DOMAIN_TABLE_SUFFIX = "_domain_table" + HMMSCAN = "_hmmscan_" DOMAINS_TO_FORESTER_OUTFILE_SUFFIX = ".dff" DOMAINS_TO_FORESTER_EVALUE_CUTOFF_SUFFIX = "_dtfE" diff --git a/forester/ruby/scripts/hmm_split.rb b/forester/ruby/scripts/hmm_split.rb index 8ae000d..69edd90 100755 --- a/forester/ruby/scripts/hmm_split.rb +++ b/forester/ruby/scripts/hmm_split.rb @@ -1,80 +1,68 @@ #!/usr/local/bin/ruby -w # -# = hmm_split -# -# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: hmm_split.rb,v 1.5 2008/11/17 22:32:43 cmzmasek Exp $ -# -# To split a Pfam HMM file into one file for each HMM. +# = hmm_split # - +# Copyright:: Copyright (C) 2017 Christian M Zmasek +# License:: GNU Lesser General Public License (LGPL) module ForesterScripts - - if RUBY_VERSION !~ /1.9/ - puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " ) - exit( -1 ) - end - - - if ( ARGV == nil || ARGV.length != 3 ) - puts( "usage: hmm_split.rb " ) - exit( -1 ) - end - - hmmfile = ARGV[ 0 ] - suffix = ARGV[ 1 ] - outdir = ARGV[ 2 ] - - if ( !File.exists?( outdir ) ) - puts( "outdir [" + outdir + "] does not exist" ) - exit( -1 ) - end - if ( !File.exists?( hmmfile ) ) - puts( "Pfam HMM file [" + hmmfile + "] does not exist" ) - exit( -1 ) - end - - data = String.new - name = String.new - line_count = 0 - count = 0 - - File.open( hmmfile ) do | file | - while line = file.gets - data = data + line - line_count += 1 - if ( line =~ /NAME\s+(.+)/ ) - if name.length > 0 - puts( "Pfam HMM file [" + hmmfile + "] format error [line: " + line + "]" ) - exit( -1 ) - end - name = $1 - elsif ( line =~ /\/\// ) - if name.length < 1 - puts( "Pfam HMM file [" + hmmfile + "] format error [line: " + line + "]" ) - exit( -1 ) - end - - outfile = outdir + '/' + name + suffix - if ( File.exists?( outfile ) ) - puts( "file [" + outfile + "] already exists" ) - exit( -1 ) - end - open( outfile, 'w' ) do | out | - out.write( data ) - end - count += 1 - puts( count.to_s + ": " + name ) - data = String.new - name = String.new - end - end - end - - puts() - puts( "wrote " + count.to_s + " individual HMM files to " + outdir ) - + + if ( ARGV == nil || ARGV.length != 3 ) + puts( "usage: hmm_split.rb " ) + exit( -1 ) + end + + hmmfile = ARGV[ 0 ] + suffix = ARGV[ 1 ] + outdir = ARGV[ 2 ] + + if ( !File.exist?( outdir ) ) + puts( "outdir [" + outdir + "] does not exist" ) + exit( -1 ) + end + if ( !File.exist?( hmmfile ) ) + puts( "Pfam HMM file [" + hmmfile + "] does not exist" ) + exit( -1 ) + end + + data = String.new + name = String.new + line_count = 0 + count = 0 + + File.open( hmmfile ) do | file | + while line = file.gets + data = data + line + line_count += 1 + if ( line =~ /NAME\s+(.+)/ ) + if name.length > 0 + puts( "Pfam HMM file [" + hmmfile + "] format error [line: " + line + "]" ) + exit( -1 ) + end + name = $1 + elsif ( line =~ /\/\// ) + if name.length < 1 + puts( "Pfam HMM file [" + hmmfile + "] format error [line: " + line + "]" ) + exit( -1 ) + end + + outfile = outdir + '/' + name + suffix + if ( File.exist?( outfile ) ) + puts( "file [" + outfile + "] already exists" ) + exit( -1 ) + end + open( outfile, 'w' ) do | out | + out.write( data ) + end + count += 1 + puts( count.to_s + ": " + name ) + data = String.new + name = String.new + end + end + end + + puts() + puts( "wrote " + count.to_s + " individual HMM files to " + outdir ) + end \ No newline at end of file -- 1.7.10.2