From: cmzmasek@gmail.com Date: Wed, 26 Sep 2012 03:16:38 +0000 (+0000) Subject: in progress X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=01427be863c6ad9f5c174c9c708d261596caa820;p=jalview.git in progress --- diff --git a/forester/ruby/evoruby/exe/dsx_old.rb b/forester/ruby/evoruby/exe/dsx_old.rb new file mode 100755 index 0000000..1ff35f7 --- /dev/null +++ b/forester/ruby/evoruby/exe/dsx_old.rb @@ -0,0 +1,20 @@ +#!/usr/local/bin/ruby -w +# +# = exe/dsx +# +# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: dsx.rb,v 1.3 2008/08/28 17:09:06 cmzmasek Exp $ +# +# last modified: 06/11/2007 + +require 'lib/evo/apps/domain_sequence_extractor' + +module Evoruby + + dsx = DomainSequenceExtractor.new() + + dsx.run() + +end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor.rb index ac6199b..517042a 100644 --- a/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor.rb @@ -1,262 +1,299 @@ # # = lib/evo/apps/domain_sequence_extractor.rb - DomainSequenceExtractor class # -# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek +# Copyright:: Copyright (C) 2012 Christian M. Zmasek # License:: GNU Lesser General Public License (LGPL) # -# $Id: domain_sequence_extractor.rb,v 1.19 2010/12/13 19:00:11 cmzmasek Exp $ +# $Id:Exp $ require 'lib/evo/util/constants' require 'lib/evo/util/util' require 'lib/evo/util/command_line_arguments' -require 'lib/evo/io/parser/hmmsearch_domain_extractor' +require 'lib/evo/io/parser/hmmscan_domain_extractor' module Evoruby - class DomainSequenceExtractor - - PRG_NAME = "dsx" - PRG_VERSION = "1.1.0" - PRG_DESC = "extraction of domain sequences from hmmsearch output" - PRG_DATE = "2008.01.03" - COPYRIGHT = "2008-2009 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" - - E_VALUE_THRESHOLD_OPTION = 'e' - LENGTH_THRESHOLD_OPTION = 'l' - ADD_POSITION_OPTION = 'p' - ADD_DOMAIN_NUMBER_OPTION = 'd' - ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT = 'dd' - ADD_DOMAIN_NUMBER_OPTION_AS_LETTER = 'dl' - TRIM_OPTION = 't' - LOG_FILE_SUFFIX = '_domain_seq_extr.log' - PASSED_SEQS_SUFFIX = '_domain_seq_extr_passed' - FAILED_SEQS_SUFFIX = '_domain_seq_extr_failed' - HELP_OPTION_1 = 'help' - HELP_OPTION_2 = 'h' - - def run() - - Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC , - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) - - ld = Constants::LINE_DELIMITER - - begin - cla = CommandLineArguments.new( ARGV ) - rescue ArgumentError - Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT ) - end - - if ( cla.is_option_set?( HELP_OPTION_1 ) || - cla.is_option_set?( HELP_OPTION_2 ) ) - print_help - exit( 0 ) - end - - if ( cla.get_number_of_files != 3 ) - print_help - exit( -1 ) - end - - allowed_opts = Array.new - allowed_opts.push( E_VALUE_THRESHOLD_OPTION ) - allowed_opts.push( ADD_POSITION_OPTION ) - allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION ) - allowed_opts.push( LENGTH_THRESHOLD_OPTION ) - allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT ) - allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER ) - allowed_opts.push( TRIM_OPTION ) - - disallowed = cla.validate_allowed_options_as_str( allowed_opts ) - if ( disallowed.length > 0 ) - Util.fatal_error( PRG_NAME, - "unknown option(s): " + disallowed, - STDOUT ) - end - - hmmsearch_output = cla.get_file_name( 0 ) - fasta_sequence_file = cla.get_file_name( 1 ) - outfile = cla.get_file_name( 2 ) - - add_position = false - if ( cla.is_option_set?( ADD_POSITION_OPTION ) ) - add_position = true - end - - trim = false - if ( cla.is_option_set?( TRIM_OPTION ) ) - trim = true - end - - add_domain_number = false - add_domain_number_as_letter = false - add_domain_number_as_digit = false - - if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) ) - add_domain_number = true - end - if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER ) ) - add_domain_number_as_letter = true - end - if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT ) ) - add_domain_number_as_digit = true - end - - if ( add_domain_number_as_letter && add_domain_number_as_digit ) - puts( "attempt to add domain number as letter and digit at the same time" ) - print_help - exit( -1 ) - end - - e_value_threshold = -1.0 - if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) ) - begin - e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION ) - rescue ArgumentError => e - Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - if ( e_value_threshold < 0.0 ) - Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT ) - end - end - - length_threshold = -1 - if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) ) - begin - length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION ) - rescue ArgumentError => e - Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - if ( length_threshold < 0) - Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT ) - end - end - - log = String.new - - puts() - puts( "Hmmsearch outputfile : " + hmmsearch_output ) - log << "Hmmsearch outputfile : " + hmmsearch_output + ld - puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file ) - log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld - puts( "Outputfile : " + outfile ) - log << "Outputfile : " + outfile + ld - puts( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX ) - log << "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX + ld - puts( "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX ) - log << "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX + ld - puts( "Logfile : " + outfile + LOG_FILE_SUFFIX ) - log << "Logfile : " + outfile + LOG_FILE_SUFFIX + ld - if ( e_value_threshold >= 0.0 ) - puts( "E-value threshold : " + e_value_threshold.to_s ) - log << "E-value threshold : " + e_value_threshold.to_s + ld - else - puts( "E-value threshold : no threshold" ) - log << "E-value threshold : no threshold" + ld - end - if ( length_threshold > 0 ) - puts( "Length threshold : " + length_threshold.to_s ) - log << "Length threshold : " + length_threshold.to_s + ld - else - puts( "Length threshold : no threshold" ) - log << "Length threshold : no threshold" + ld - end - - if ( trim ) - puts( "Trim last 2 chars : true" ) - log << "Trim last 2 chars : true" + ld - else - puts( "Trim names : false" ) - log << "Trim names : false" + ld - end - - - if ( add_position ) - puts( "Add positions (rel to complete seq) to extracted domains: true" ) - log << "Add positions (rel to complete seq) to extracted domains: true" + ld - else - puts( "Add positions (rel to complete seq) to extracted domains: false" ) - log << "Add positions (rel to complete seq) to extracted domains: false" + ld - end - - if ( add_domain_number || add_domain_number_as_digit || add_domain_number_as_letter ) - puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" ) - log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld - else - puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" ) - log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld - end - - puts - - domain_count = 0 - begin - parser = HmmsearchDomainExtractor.new() - domain_count = parser.parse( hmmsearch_output, - fasta_sequence_file, - outfile, - outfile + PASSED_SEQS_SUFFIX, - outfile + FAILED_SEQS_SUFFIX, - e_value_threshold, - length_threshold, - add_position, - add_domain_number, - add_domain_number_as_digit, - add_domain_number_as_letter, - trim, - log ) - rescue ArgumentError, IOError, StandardError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - rescue Exception => e - Util.fatal_error( PRG_NAME, "unexpected exception!: " + e.to_s, STDOUT ) - end - - puts - Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" ) - Util.print_message( PRG_NAME, "wrote; " + outfile ) - Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX ) - Util.print_message( PRG_NAME, "(wrote: " + outfile + PASSED_SEQS_SUFFIX + ")" ) - Util.print_message( PRG_NAME, "(wrote: " + outfile + FAILED_SEQS_SUFFIX + ")" ) - - begin - f = File.open( outfile + LOG_FILE_SUFFIX, 'a' ) - f.print( log ) - f.close - rescue Exception => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - - puts - Util.print_message( PRG_NAME, "OK" ) - puts - + class DomainSequenceExtractor + + PRG_NAME = "dsx" + PRG_VERSION = "1.000" + PRG_DESC = "extraction of domain sequences from hmmscan output" + PRG_DATE = "20120906" + COPYRIGHT = "2012 Christian M Zmasek" + CONTACT = "phylosoft@gmail.com" + WWW = "www.phylosoft.org" + + E_VALUE_THRESHOLD_OPTION = 'e' + LENGTH_THRESHOLD_OPTION = 'l' + ADD_POSITION_OPTION = 'p' + ADD_DOMAIN_NUMBER_OPTION = 'd' + ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT = 'dd' + ADD_DOMAIN_NUMBER_OPTION_AS_LETTER = 'dl' + ADD_SPECIES = 's' + MIN_LINKER_OPT = 'ml' + TRIM_OPTION = 't' + LOG_FILE_SUFFIX = '_domain_seq_extr.log' + PASSED_SEQS_SUFFIX = '_domain_seq_extr_passed' + FAILED_SEQS_SUFFIX = '_domain_seq_extr_failed' + HELP_OPTION_1 = 'help' + HELP_OPTION_2 = 'h' + + def run() + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC , + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + ld = Constants::LINE_DELIMITER + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError + Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT ) + end + + if ( cla.is_option_set?( HELP_OPTION_1 ) || + cla.is_option_set?( HELP_OPTION_2 ) ) + print_help + exit( 0 ) + end + + if ( cla.get_number_of_files != 4 ) + print_help + exit( -1 ) + end + + allowed_opts = Array.new + allowed_opts.push( E_VALUE_THRESHOLD_OPTION ) + allowed_opts.push( ADD_POSITION_OPTION ) + allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION ) + allowed_opts.push( LENGTH_THRESHOLD_OPTION ) + allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT ) + allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER ) + allowed_opts.push( TRIM_OPTION ) + allowed_opts.push( ADD_SPECIES ) + allowed_opts.push( MIN_LINKER_OPT ) + + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, + "unknown option(s): " + disallowed, + STDOUT ) + end + + domain_id = cla.get_file_name( 0 ) + hmmsearch_output = cla.get_file_name( 1 ) + fasta_sequence_file = cla.get_file_name( 2 ) + outfile = cla.get_file_name( 3 ) + + add_position = false + if ( cla.is_option_set?( ADD_POSITION_OPTION ) ) + add_position = true + end + + trim = false + if ( cla.is_option_set?( TRIM_OPTION ) ) + trim = true + end + + add_domain_number = false + add_domain_number_as_letter = false + add_domain_number_as_digit = false + + if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) ) + add_domain_number = true + end + if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER ) ) + add_domain_number_as_letter = true + end + if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT ) ) + add_domain_number_as_digit = true + end + + add_species = false + if cla.is_option_set? ADD_SPECIES + add_species = true + end + + if ( add_domain_number_as_letter && add_domain_number_as_digit ) + puts( "attempt to add domain number as letter and digit at the same time" ) + print_help + exit( -1 ) + end + + e_value_threshold = -1.0 + if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) ) + begin + e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION ) + rescue ArgumentError => e + Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end - - def print_help() - puts() - puts( "Usage:" ) - puts() - puts( " " + PRG_NAME + ".rb [options] " ) - puts() - puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=: E-value threshold, default is no threshold" ) - puts( " -" + LENGTH_THRESHOLD_OPTION + "=: length threshold, default is no threshold" ) - puts( " -" + ADD_POSITION_OPTION + ": to add positions (rel to complete seq) to extracted domains" ) - puts( " -" + ADD_DOMAIN_NUMBER_OPTION + ": to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" ) - puts( " -" + ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT + ": to add numbers to extracted domains as digit (example \"domain2\")" ) - puts( " -" + ADD_DOMAIN_NUMBER_OPTION_AS_LETTER + ": to add numbers to extracted domains as letter (example \"domaina\")" ) - puts( " -" + TRIM_OPTION + ": to remove the last 2 characters from sequence names" ) - puts() + if ( e_value_threshold < 0.0 ) + Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT ) end - - end # class DomainSequenceExtractor + end + + length_threshold = -1 + if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) ) + begin + length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION ) + rescue ArgumentError => e + Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + if ( length_threshold < 0) + Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT ) + end + end + + + min_linker = nil + if ( cla.is_option_set?( MIN_LINKER_OPT ) ) + begin + min_linker = cla.get_option_value_as_int( MIN_LINKER_OPT ) + rescue ArgumentError => e + Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + if ( !min_linker || min_linker > 100 || min_linker < -100 ) + Forester::Util.fatal_error( PRG_NAME, "unexpected value for min linker " + min_linker.to_s, STDOUT ) + end + end + + + log = String.new + + puts() + puts( "Domain : " + domain_id ) + log << "Domain : " + domain_id + ld + puts( "Hmmscan outputfile : " + hmmsearch_output ) + log << "Hmmscan outputfile : " + hmmsearch_output + ld + puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file ) + log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld + puts( "Outputfile : " + outfile ) + log << "Outputfile : " + outfile + ld + puts( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX ) + log << "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX + ld + puts( "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX ) + log << "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX + ld + puts( "Logfile : " + outfile + LOG_FILE_SUFFIX ) + log << "Logfile : " + outfile + LOG_FILE_SUFFIX + ld + if ( e_value_threshold >= 0.0 ) + puts( "E-value threshold : " + e_value_threshold.to_s ) + log << "E-value threshold : " + e_value_threshold.to_s + ld + else + puts( "E-value threshold : no threshold" ) + log << "E-value threshold : no threshold" + ld + end + if ( length_threshold > 0 ) + puts( "Length threshold : " + length_threshold.to_s ) + log << "Length threshold : " + length_threshold.to_s + ld + else + puts( "Length threshold : no threshold" ) + log << "Length threshold : no threshold" + ld + end + + if ( trim ) + puts( "Trim last 2 chars : true" ) + log << "Trim last 2 chars : true" + ld + else + puts( "Trim names : false" ) + log << "Trim names : false" + ld + end + + if ( min_linker ) + puts( "Min linker : " + min_linker.to_s ) + log << "Min linker : " + min_linker.to_s + ld + + end + + + if ( add_position ) + puts( "Add positions (rel to complete seq) to extracted domains: true" ) + log << "Add positions (rel to complete seq) to extracted domains: true" + ld + else + puts( "Add positions (rel to complete seq) to extracted domains: false" ) + log << "Add positions (rel to complete seq) to extracted domains: false" + ld + end + + if ( add_domain_number || add_domain_number_as_digit || add_domain_number_as_letter ) + puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" ) + log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld + else + puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" ) + log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld + end + + puts + + domain_count = 0 + begin + parser = HmmscanDomainExtractor.new() + domain_count = parser.parse( domain_id, + hmmsearch_output, + fasta_sequence_file, + outfile, + outfile + PASSED_SEQS_SUFFIX, + outfile + FAILED_SEQS_SUFFIX, + e_value_threshold, + length_threshold, + add_position, + add_domain_number, + add_domain_number_as_digit, + add_domain_number_as_letter, + trim, + add_species, + min_linker, + log ) + rescue ArgumentError, IOError, StandardError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "unexpected exception!: " + e.to_s, STDOUT ) + end + + puts + Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" ) + Util.print_message( PRG_NAME, "wrote; " + outfile ) + Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX ) + Util.print_message( PRG_NAME, "(wrote: " + outfile + PASSED_SEQS_SUFFIX + ")" ) + Util.print_message( PRG_NAME, "(wrote: " + outfile + FAILED_SEQS_SUFFIX + ")" ) + + begin + f = File.open( outfile + LOG_FILE_SUFFIX, 'a' ) + f.print( log ) + f.close + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + puts + Util.print_message( PRG_NAME, "OK" ) + puts + + end + + def print_help() + puts() + puts( "Usage:" ) + puts() + puts( " " + PRG_NAME + ".rb [options] " ) + puts() + puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=: E-value threshold, default is no threshold" ) + puts( " -" + LENGTH_THRESHOLD_OPTION + "=: length threshold, default is no threshold" ) + puts( " -" + ADD_POSITION_OPTION + ": to add positions (rel to complete seq) to extracted domains" ) + puts( " -" + ADD_DOMAIN_NUMBER_OPTION + ": to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" ) + puts( " -" + ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT + ": to add numbers to extracted domains as digit (example \"domain2\")" ) + puts( " -" + ADD_DOMAIN_NUMBER_OPTION_AS_LETTER + ": to add numbers to extracted domains as letter (example \"domaina\")" ) + puts( " -" + TRIM_OPTION + ": to remove the last 2 characters from sequence names" ) + puts( " -" + ADD_SPECIES + ": to add species [in brackets]" ) + puts( " -" + MIN_LINKER_OPT + "=: to extract pairs of same domains with a distance inbetween shorter than a given value" ) + puts() + end + + end # class DomainSequenceExtractor end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor_old.rb b/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor_old.rb new file mode 100644 index 0000000..ac6199b --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor_old.rb @@ -0,0 +1,262 @@ +# +# = lib/evo/apps/domain_sequence_extractor.rb - DomainSequenceExtractor class +# +# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: domain_sequence_extractor.rb,v 1.19 2010/12/13 19:00:11 cmzmasek Exp $ + + +require 'lib/evo/util/constants' +require 'lib/evo/util/util' +require 'lib/evo/util/command_line_arguments' +require 'lib/evo/io/parser/hmmsearch_domain_extractor' + +module Evoruby + + class DomainSequenceExtractor + + PRG_NAME = "dsx" + PRG_VERSION = "1.1.0" + PRG_DESC = "extraction of domain sequences from hmmsearch output" + PRG_DATE = "2008.01.03" + COPYRIGHT = "2008-2009 Christian M Zmasek" + CONTACT = "phylosoft@gmail.com" + WWW = "www.phylosoft.org" + + E_VALUE_THRESHOLD_OPTION = 'e' + LENGTH_THRESHOLD_OPTION = 'l' + ADD_POSITION_OPTION = 'p' + ADD_DOMAIN_NUMBER_OPTION = 'd' + ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT = 'dd' + ADD_DOMAIN_NUMBER_OPTION_AS_LETTER = 'dl' + TRIM_OPTION = 't' + LOG_FILE_SUFFIX = '_domain_seq_extr.log' + PASSED_SEQS_SUFFIX = '_domain_seq_extr_passed' + FAILED_SEQS_SUFFIX = '_domain_seq_extr_failed' + HELP_OPTION_1 = 'help' + HELP_OPTION_2 = 'h' + + def run() + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC , + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + ld = Constants::LINE_DELIMITER + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError + Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT ) + end + + if ( cla.is_option_set?( HELP_OPTION_1 ) || + cla.is_option_set?( HELP_OPTION_2 ) ) + print_help + exit( 0 ) + end + + if ( cla.get_number_of_files != 3 ) + print_help + exit( -1 ) + end + + allowed_opts = Array.new + allowed_opts.push( E_VALUE_THRESHOLD_OPTION ) + allowed_opts.push( ADD_POSITION_OPTION ) + allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION ) + allowed_opts.push( LENGTH_THRESHOLD_OPTION ) + allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT ) + allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER ) + allowed_opts.push( TRIM_OPTION ) + + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, + "unknown option(s): " + disallowed, + STDOUT ) + end + + hmmsearch_output = cla.get_file_name( 0 ) + fasta_sequence_file = cla.get_file_name( 1 ) + outfile = cla.get_file_name( 2 ) + + add_position = false + if ( cla.is_option_set?( ADD_POSITION_OPTION ) ) + add_position = true + end + + trim = false + if ( cla.is_option_set?( TRIM_OPTION ) ) + trim = true + end + + add_domain_number = false + add_domain_number_as_letter = false + add_domain_number_as_digit = false + + if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) ) + add_domain_number = true + end + if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER ) ) + add_domain_number_as_letter = true + end + if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT ) ) + add_domain_number_as_digit = true + end + + if ( add_domain_number_as_letter && add_domain_number_as_digit ) + puts( "attempt to add domain number as letter and digit at the same time" ) + print_help + exit( -1 ) + end + + e_value_threshold = -1.0 + if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) ) + begin + e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION ) + rescue ArgumentError => e + Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + if ( e_value_threshold < 0.0 ) + Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT ) + end + end + + length_threshold = -1 + if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) ) + begin + length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION ) + rescue ArgumentError => e + Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + if ( length_threshold < 0) + Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT ) + end + end + + log = String.new + + puts() + puts( "Hmmsearch outputfile : " + hmmsearch_output ) + log << "Hmmsearch outputfile : " + hmmsearch_output + ld + puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file ) + log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld + puts( "Outputfile : " + outfile ) + log << "Outputfile : " + outfile + ld + puts( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX ) + log << "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX + ld + puts( "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX ) + log << "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX + ld + puts( "Logfile : " + outfile + LOG_FILE_SUFFIX ) + log << "Logfile : " + outfile + LOG_FILE_SUFFIX + ld + if ( e_value_threshold >= 0.0 ) + puts( "E-value threshold : " + e_value_threshold.to_s ) + log << "E-value threshold : " + e_value_threshold.to_s + ld + else + puts( "E-value threshold : no threshold" ) + log << "E-value threshold : no threshold" + ld + end + if ( length_threshold > 0 ) + puts( "Length threshold : " + length_threshold.to_s ) + log << "Length threshold : " + length_threshold.to_s + ld + else + puts( "Length threshold : no threshold" ) + log << "Length threshold : no threshold" + ld + end + + if ( trim ) + puts( "Trim last 2 chars : true" ) + log << "Trim last 2 chars : true" + ld + else + puts( "Trim names : false" ) + log << "Trim names : false" + ld + end + + + if ( add_position ) + puts( "Add positions (rel to complete seq) to extracted domains: true" ) + log << "Add positions (rel to complete seq) to extracted domains: true" + ld + else + puts( "Add positions (rel to complete seq) to extracted domains: false" ) + log << "Add positions (rel to complete seq) to extracted domains: false" + ld + end + + if ( add_domain_number || add_domain_number_as_digit || add_domain_number_as_letter ) + puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" ) + log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld + else + puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" ) + log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld + end + + puts + + domain_count = 0 + begin + parser = HmmsearchDomainExtractor.new() + domain_count = parser.parse( hmmsearch_output, + fasta_sequence_file, + outfile, + outfile + PASSED_SEQS_SUFFIX, + outfile + FAILED_SEQS_SUFFIX, + e_value_threshold, + length_threshold, + add_position, + add_domain_number, + add_domain_number_as_digit, + add_domain_number_as_letter, + trim, + log ) + rescue ArgumentError, IOError, StandardError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "unexpected exception!: " + e.to_s, STDOUT ) + end + + puts + Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" ) + Util.print_message( PRG_NAME, "wrote; " + outfile ) + Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX ) + Util.print_message( PRG_NAME, "(wrote: " + outfile + PASSED_SEQS_SUFFIX + ")" ) + Util.print_message( PRG_NAME, "(wrote: " + outfile + FAILED_SEQS_SUFFIX + ")" ) + + begin + f = File.open( outfile + LOG_FILE_SUFFIX, 'a' ) + f.print( log ) + f.close + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + puts + Util.print_message( PRG_NAME, "OK" ) + puts + + end + + def print_help() + puts() + puts( "Usage:" ) + puts() + puts( " " + PRG_NAME + ".rb [options] " ) + puts() + puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=: E-value threshold, default is no threshold" ) + puts( " -" + LENGTH_THRESHOLD_OPTION + "=: length threshold, default is no threshold" ) + puts( " -" + ADD_POSITION_OPTION + ": to add positions (rel to complete seq) to extracted domains" ) + puts( " -" + ADD_DOMAIN_NUMBER_OPTION + ": to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" ) + puts( " -" + ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT + ": to add numbers to extracted domains as digit (example \"domain2\")" ) + puts( " -" + ADD_DOMAIN_NUMBER_OPTION_AS_LETTER + ": to add numbers to extracted domains as letter (example \"domaina\")" ) + puts( " -" + TRIM_OPTION + ": to remove the last 2 characters from sequence names" ) + puts() + end + + end # class DomainSequenceExtractor + +end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb index 417820c..bf7cb71 100644 --- a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb @@ -38,6 +38,7 @@ module Evoruby add_domain_number_as_letter, trim_name, add_species, + min_linker, log ) Util.check_file_for_readability( hmmsearch_output ) @@ -56,8 +57,13 @@ module Evoruby end out_msa = Msa.new + failed_seqs = Msa.new passed_seqs = Msa.new + out_msa_pairs = nil + if min_linker + out_msa_pairs = Msa.new + end ld = Constants::LINE_DELIMITER @@ -70,6 +76,12 @@ module Evoruby failed_species_counts = Hash.new passed_species_counts = Hash.new + prev_sequence = nil + prev_number = nil + prev_env_from = nil + prev_env_to = nil + prev_i_e_value = nil + File.open( hmmsearch_output ) do | file | while line = file.gets if !is_ignorable?( line ) && line =~ /^\S+\s+/ @@ -115,6 +127,34 @@ module Evoruby add_sequence( sequence, in_msa, passed_seqs ) proteins_with_passing_domains += 1 end + + if min_linker + if sequence == prev_sequence && ( ( ( e_value_threshold.to_f < 0.0 ) || ( prev_i_e_value <= e_value_threshold ) ) && + ( ( length_threshold.to_f <= 0 ) || ( ( prev_env_to - prev_env_from + 1 ) >= length_threshold.to_f ) ) ) + diff = env_from - prev_env_to + if diff <= min_linker + extract_domain( sequence, + prev_number.to_s + "/" + number.to_s, + out_of, + prev_env_from, + env_to, + in_msa, + out_msa_pairs, + false, + true, + false, + false, + trim_name , + add_species ) + end + end + prev_sequence = sequence + prev_number = number + prev_env_from = env_from + prev_env_to = env_to + prev_i_e_value = i_e_value + end + else print( domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" ) log << domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" @@ -168,6 +208,15 @@ module Evoruby raise IOError, error_msg end + if out_msa_pairs + begin + io.write_to_file( out_msa_pairs, outfile+"_" + min_linker.to_s, w ) + rescue Exception + error_msg = "could not write to \"" + outfile+"_" + min_linker.to_s + "\"" + raise IOError, error_msg + end + end + begin io.write_to_file( passed_seqs, passed_seqs_outfile, w ) rescue Exception @@ -200,7 +249,6 @@ module Evoruby private - def add_sequence( sequence_name, in_msa, add_to_msa ) seqs = in_msa.find_by_name_start( sequence_name, true ) if ( seqs.length < 1 ) @@ -229,11 +277,11 @@ module Evoruby add_domain_number_as_letter, trim_name, add_species ) - if ( number < 1 || out_of < 1 || number > out_of ) + if number.is_a?( Fixnum ) && ( number < 1 || out_of < 1 || number > out_of ) error_msg = "impossible: number=" + number.to_s + ", out of=" + out_of.to_s raise ArgumentError, error_msg end - if ( seq_from < 1 || seq_to < 1 || seq_from >= seq_to ) + if seq_from < 1 || seq_to < 1 || seq_from >= seq_to error_msg = "impossible: seq-f=" + seq_from.to_s + ", seq-t=" + seq_to.to_s raise ArgumentError, error_msg end @@ -262,15 +310,15 @@ module Evoruby end if out_of != 1 - if ( add_domain_number_as_digit ) + if add_domain_number_as_digit seq.set_name( seq.get_name + number.to_s ) - elsif ( add_domain_number_as_letter ) + elsif add_domain_number_as_letter if number > 25 error_msg = 'too many identical domains per sequence, cannot use letters to distinguish them' raise StandardError, error_msg end seq.set_name( seq.get_name + ( number + 96 ).chr ) - elsif ( add_domain_number ) + elsif add_domain_number seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s ) end end @@ -292,7 +340,7 @@ module Evoruby end out_msa.add_sequence( seq ) - + end def count_species( sequence, species_counts_map )