From: cmzmasek Date: Wed, 22 Feb 2017 01:42:47 +0000 (-0800) Subject: in progress... multi-domain extractor started X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=0a0ba26ef67a1f770b8866777e6ccec9402a455f;p=jalview.git in progress... multi-domain extractor started --- diff --git a/forester/ruby/evoruby/exe/dsx.rb b/forester/ruby/evoruby/exe/dsx.rb index 02091bf..0ae40d0 100755 --- a/forester/ruby/evoruby/exe/dsx.rb +++ b/forester/ruby/evoruby/exe/dsx.rb @@ -2,12 +2,9 @@ # # = exe/dsx # -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: dsx.rb,v 1.3 2008/08/28 17:09:06 cmzmasek Exp $ -# -# last modified: 06/11/2007 +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) + require 'lib/evo/tool/domain_sequence_extractor' diff --git a/forester/ruby/evoruby/exe/mdsx.rb b/forester/ruby/evoruby/exe/mdsx.rb new file mode 100755 index 0000000..0078a40 --- /dev/null +++ b/forester/ruby/evoruby/exe/mdsx.rb @@ -0,0 +1,17 @@ +#!/usr/local/bin/ruby -w +# +# = exe/dsx +# +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) + + +require 'lib/evo/tool/multi_domain_seq_extractor' + +module Evoruby + + mdsx = MultiDomainSeqExtractor.new() + + mdsx.run() + +end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb index fc70c0d..0a7f2b1 100644 --- a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb @@ -133,7 +133,7 @@ module Evoruby i_e_value = r.i_e_value prev_query = r.query - length = env_to - env_from + 1 + length = 1 + env_to - env_from overall_target_length_sum += length if length > overall_target_length_max diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb new file mode 100644 index 0000000..0ed3811 --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb @@ -0,0 +1,507 @@ +# +# = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanMultiDomainExtractor class +# +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# Last modified: 2017/02/20 + +require 'lib/evo/util/constants' +require 'lib/evo/msa/msa_factory' +require 'lib/evo/io/msa_io' +require 'lib/evo/io/writer/fasta_writer' +require 'lib/evo/io/parser/fasta_parser' +require 'lib/evo/io/parser/hmmscan_parser' + +module Evoruby + class HmmscanMultiDomainExtractor + def initialize + end + + # raises ArgumentError, IOError, StandardError + def parse( domain_id, + hmmscan_output, + fasta_sequence_file, + outfile, + passed_seqs_outfile, + failed_seqs_outfile, + e_value_threshold, + length_threshold, + add_position, + add_domain_number, + add_species, + log ) + + Util.check_file_for_readability( hmmscan_output ) + Util.check_file_for_readability( fasta_sequence_file ) + Util.check_file_for_writability( outfile + ".fasta" ) + Util.check_file_for_writability( passed_seqs_outfile ) + Util.check_file_for_writability( failed_seqs_outfile ) + + in_msa = nil + factory = MsaFactory.new() + in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() ) + + if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 ) + error_msg = "could not find fasta sequences in " + fasta_sequence_file + raise IOError, error_msg + end + + out_msa = Msa.new + + failed_seqs = Msa.new + passed_seqs = Msa.new + + ld = Constants::LINE_DELIMITER + + domain_pass_counter = 0 + domain_fail_counter = 0 + passing_domains_per_protein = 0 + proteins_with_failing_domains = 0 + domain_not_present_counter = 0 + protein_counter = 1 + max_domain_copy_number_per_protein = -1 + max_domain_copy_number_sequence = "" + passing_target_length_sum = 0 + overall_target_length_sum = 0 + overall_target_length_min = 10000000 + overall_target_length_max = -1 + passing_target_length_min = 10000000 + passing_target_length_max = -1 + + overall_target_ie_min = 10000000 + overall_target_ie_max = -1 + passing_target_ie_min = 10000000 + passing_target_ie_max = -1 + + hmmscan_datas = [] + + hmmscan_parser = HmmscanParser.new( hmmscan_output ) + results = hmmscan_parser.parse + + #### + + pc0 = PassCondition.new('Bcl-2', 0.01, -1, 0.5 ) + + pcs = Hash.new + pcs["Bcl-2"] = pc0 + + prev_query = nil + domains_in_query_ary = Array.new + queries_ary = Array.new + results.each do |hmmscan_result| + if ( prev_query != nil ) && ( hmmscan_result.query != prev_query ) + ###-- + pass = true + domains_in_query_ary.each do |domain_in_query| + if pcs.has_key?(domain_in_query.model) + pc = pcs[domain_in_query.model] + # @abs_len = abs_len + # @percent_len = rel_len + if (pc.i_e_value != nil) && (pc.i_e_value >= 0) + if domain_in_query.i_e_value > pc.i_e_value + pass = false + #break + end + end + if (pc.abs_len != nil) && (pc.abs_len > 0) + length = 1 + domain_in_query.env_to - domain_in_query.env_from + if length < pc.abs_len + pass = false + #break + end + end + if (pc.rel_len != nil) && (pc.rel_len > 0) + length = 1 + domain_in_query.env_to - domain_in_query.env_from + if length < (pc.rel_len * domain_in_query.tlen) + pass = false + #break + end + end + end + end + if pass == true + queries_ary.push(domains_in_query_ary) + end + domains_in_query_ary = Array.new + ###-- + end + prev_query = hmmscan_result.query + domains_in_query_ary.push(hmmscan_result) + end + if prev_query != nil + queries_ary.push(domains_in_query_ary) + end + puts queries_ary[0][0].model + puts queries_ary[0][0].i_e_value + puts queries_ary[0][1].model + puts queries_ary[0][2].model + puts queries_ary[1][0].model + puts queries_ary[1][0].i_e_value + puts queries_ary[1][1].model + puts queries_ary[2][2].model + queries_ary.each do | query_ary | + query_ary.each do | domain | + # puts domain.model + end + #puts + end + + #### + + prev_query = nil + saw_target = false + + results.each do | r | + + if ( prev_query != nil ) && ( r.query != prev_query ) + protein_counter += 1 + passing_domains_per_protein = 0 + if !saw_target + log << domain_not_present_counter.to_s + ": " + prev_query.to_s + " lacks target domain" + ld + domain_not_present_counter += 1 + end + saw_target = false + end + + prev_query = r.query + + if domain_id != r.model + next + end + + saw_target = true + + # target = r.model + sequence = r.query + # sequence_len = r.qlen + number = r.number + out_of = r.out_of + env_from = r.env_from + env_to = r.env_to + i_e_value = r.i_e_value + prev_query = r.query + + length = 1 + env_to - env_from + + overall_target_length_sum += length + if length > overall_target_length_max + overall_target_length_max = length + end + if length < overall_target_length_min + overall_target_length_min = length + end + + if i_e_value > overall_target_ie_max + overall_target_ie_max = i_e_value + end + if i_e_value < overall_target_ie_min + overall_target_ie_min = i_e_value + end + + if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) && + ( ( length_threshold <= 0 ) || ( length >= length_threshold.to_f ) ) ) + hmmscan_datas << HmmscanData.new( sequence, number, out_of, env_from, env_to, i_e_value ) + passing_target_length_sum += length + passing_domains_per_protein += 1 + if length > passing_target_length_max + passing_target_length_max = length + end + if length < passing_target_length_min + passing_target_length_min = length + end + if i_e_value > passing_target_ie_max + passing_target_ie_max = i_e_value + end + if i_e_value < passing_target_ie_min + passing_target_ie_min = i_e_value + end + if ( passing_domains_per_protein > max_domain_copy_number_per_protein ) + max_domain_copy_number_sequence = sequence + max_domain_copy_number_per_protein = passing_domains_per_protein + end + else # no pass + log << domain_fail_counter.to_s + ": " + sequence.to_s + " fails threshold(s)" + if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) ) + log << " iE=" + i_e_value.to_s + end + if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f ) + le = env_to - env_from + 1 + log << " l=" + le.to_s + end + log << ld + domain_fail_counter += 1 + end + + if number > out_of + error_msg = "number > out_of (this should not have happened)" + raise StandardError, error_msg + end + + if number == out_of + if !hmmscan_datas.empty? + process_hmmscan_datas( hmmscan_datas, + in_msa, + add_position, + add_domain_number, + add_species, + out_msa ) + domain_pass_counter += hmmscan_datas.length + if passed_seqs.find_by_name_start( sequence, true ).length < 1 + add_sequence( sequence, in_msa, passed_seqs ) + else + error_msg = "this should not have happened" + raise StandardError, error_msg + end + else # no pass + if failed_seqs.find_by_name_start( sequence, true ).length < 1 + add_sequence( sequence, in_msa, failed_seqs ) + proteins_with_failing_domains += 1 + else + error_msg = "this should not have happened" + raise StandardError, error_msg + end + end + hmmscan_datas.clear + end + + end # results.each do | r | + + if (prev_query != nil) && (!saw_target) + log << domain_not_present_counter.to_s + ": " + prev_query.to_s + " lacks target domain" + ld + domain_not_present_counter += 1 + end + + if domain_pass_counter < 1 + error_msg = "no domain sequences were extracted" + raise IOError, error_msg + end + + if ( domain_not_present_counter + passed_seqs.get_number_of_seqs + proteins_with_failing_domains ) != protein_counter + error_msg = "not present + passing + not passing != proteins in sequence (fasta) file (this should not have happened)" + raise StandardError, error_msg + end + + puts + log << ld + + log << ld + avg_pass = ( passing_target_length_sum / domain_pass_counter ) + puts( "Passing target domain lengths: average: " + avg_pass.to_s ) + log << "Passing target domain lengths: average: " + avg_pass.to_s + log << ld + puts( "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + " - " + passing_target_length_max.to_s) + log << "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + " - " + passing_target_length_max.to_s + log << ld + puts( "Passing target domain iE: min-max: " + passing_target_ie_min.to_s + " - " + passing_target_ie_max.to_s) + log << "Passing target domain iE: min-max: " + passing_target_ie_min.to_s + " - " + passing_target_ie_max.to_s + log << ld + puts( "Passing target domains: sum: " + domain_pass_counter.to_s ) + log << "Passing target domains: sum: " + domain_pass_counter.to_s + log << ld + log << ld + puts + sum = domain_pass_counter + domain_fail_counter + avg_all = overall_target_length_sum / sum + puts( "All target domain lengths: average: " + avg_all.to_s ) + log << "All target domain lengths: average: " + avg_all.to_s + log << ld + puts( "All target domain lengths: min-max: " + overall_target_length_min.to_s + " - " + overall_target_length_max.to_s) + log << "All target domain lengths: min-max: " + overall_target_length_min.to_s + " - " + overall_target_length_max.to_s + log << ld + puts( "All target target domain iE: min-max: " + overall_target_ie_min.to_s + " - " + overall_target_ie_max.to_s) + log << "All target target domain iE: min-max: " + overall_target_ie_min.to_s + " - " + overall_target_ie_max.to_s + log << ld + puts( "All target domains: sum: " + sum.to_s ) + log << "All target domains: sum: " + sum.to_s + + puts + puts( "Proteins with passing target domain(s): " + passed_seqs.get_number_of_seqs.to_s ) + puts( "Proteins with no passing target domain: " + proteins_with_failing_domains.to_s ) + puts( "Proteins with no target domain : " + domain_not_present_counter.to_s ) + + log << ld + log << ld + puts + puts( "Max target domain copy number per protein: " + max_domain_copy_number_per_protein.to_s ) + log << "Max target domain copy number per protein: " + max_domain_copy_number_per_protein.to_s + log << ld + + if ( max_domain_copy_number_per_protein > 1 ) + puts( "First target protein with this copy number: " + max_domain_copy_number_sequence ) + log << "First target protein with this copy number: " + max_domain_copy_number_sequence + log << ld + end + + write_msa( out_msa, outfile + ".fasta" ) + write_msa( passed_seqs, passed_seqs_outfile ) + write_msa( failed_seqs, failed_seqs_outfile ) + + log << ld + log << "passing target domains : " + domain_pass_counter.to_s + ld + log << "failing target domains : " + domain_fail_counter.to_s + ld + log << "proteins in sequence (fasta) file : " + in_msa.get_number_of_seqs.to_s + ld + log << "proteins in hmmscan outputfile : " + protein_counter.to_s + ld + log << "proteins with passing target domain(s) : " + passed_seqs.get_number_of_seqs.to_s + ld + log << "proteins with no passing target domain : " + proteins_with_failing_domains.to_s + ld + log << "proteins with no target domain : " + domain_not_present_counter.to_s + ld + + log << ld + + return domain_pass_counter + + end # parse + + private + + def write_msa( msa, filename ) + io = MsaIO.new() + w = FastaWriter.new() + w.set_line_width( 60 ) + w.clean( true ) + begin + io.write_to_file( msa, filename, w ) + rescue Exception + error_msg = "could not write to \"" + filename + "\"" + raise IOError, error_msg + end + end + + def add_sequence( sequence_name, in_msa, add_to_msa ) + seqs = in_msa.find_by_name_start( sequence_name, true ) + if ( seqs.length < 1 ) + error_msg = "sequence \"" + sequence_name + "\" not found in sequence file" + raise StandardError, error_msg + end + if ( seqs.length > 1 ) + error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file" + raise StandardError, error_msg + end + seq = in_msa.get_sequence( seqs[ 0 ] ) + add_to_msa.add_sequence( seq ) + end + + def process_hmmscan_datas( hmmscan_datas, + in_msa, + add_position, + add_domain_number, + add_species, + out_msa ) + + actual_out_of = hmmscan_datas.size + + seq_name = "" + prev_seq_name = nil + + hmmscan_datas.each_with_index do |hmmscan_data, index| + if hmmscan_data.number < ( index + 1 ) + error_msg = "hmmscan_data.number < ( index + 1 ) (this should not have happened)" + raise StandardError, error_msg + end + + seq_name = hmmscan_data.seq_name + + extract_domain( seq_name, + index + 1, + actual_out_of, + hmmscan_data.env_from, + hmmscan_data.env_to, + in_msa, + out_msa, + add_position, + add_domain_number, + add_species ) + + if prev_seq_name && prev_seq_name != seq_name + error_msg = "this should not have happened" + raise StandardError, error_msg + end + prev_seq_name = seq_name + end # each + end # def process_hmmscan_data + + def extract_domain( sequence, + number, + out_of, + seq_from, + seq_to, + in_msa, + out_msa, + add_position, + add_domain_number, + add_species ) + if number.is_a?( Fixnum ) && ( number < 1 || out_of < 1 || number > out_of ) + error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s + raise StandardError, error_msg + end + if seq_from < 1 || seq_to < 1 || seq_from >= seq_to + error_msg = "impossible: seq-from=" + seq_from.to_s + ", seq-to=" + seq_to.to_s + raise StandardError, error_msg + end + seqs = in_msa.find_by_name_start( sequence, true ) + if seqs.length < 1 + error_msg = "sequence \"" + sequence + "\" not found in sequence file" + raise IOError, error_msg + end + if seqs.length > 1 + error_msg = "sequence \"" + sequence + "\" not unique in sequence file" + raise IOError, error_msg + end + # hmmscan is 1 based, whereas sequences are 0 bases in this package. + seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 ) + + orig_name = seq.get_name + + seq.set_name( orig_name.split[ 0 ] ) + + if add_position + seq.set_name( seq.get_name + "_" + seq_from.to_s + "-" + seq_to.to_s ) + end + + if out_of != 1 && add_domain_number + seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s ) + end + + if add_species + a = orig_name.rindex "[" + b = orig_name.rindex "]" + unless a && b + error_msg = "species not found in " + orig_name + raise StandardError, error_msg + end + species = orig_name[ a .. b ] + seq.set_name( seq.get_name + " " + species ) + end + out_msa.add_sequence( seq ) + end + + def is_ignorable?( line ) + return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ ) + end + + end # class HmmscanDomainExtractor + + class HmmscanData + def initialize( seq_name, number, out_of, env_from, env_to, i_e_value ) + @seq_name = seq_name + @number = number + @out_of = out_of + @env_from = env_from + @env_to = env_to + @i_e_value = i_e_value + end + attr_reader :seq_name, :number, :out_of, :env_from, :env_to, :i_e_value + end + + class PassCondition + def initialize( hmm, i_e_value, abs_len, rel_len ) + @hmm = hmm + @i_e_value = i_e_value + @abs_len = abs_len + @percent_len = rel_len + end + attr_reader :hmm, :i_e_value, :abs_len, :rel_len + end + +end # module Evoruby + diff --git a/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb index 5448f4a..c1e23b0 100644 --- a/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb @@ -1,5 +1,5 @@ # -# = lib/evo/apps/taxonomy_processor - TaxonomyProcessor class +# = lib/evo/tool/domain_sequence_extractor - DomainSequenceExtractor class # # Copyright:: Copyright (C) 2017 Christian M. Zmasek # License:: GNU Lesser General Public License (LGPL) diff --git a/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb new file mode 100644 index 0000000..6c74e8a --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb @@ -0,0 +1,286 @@ +# +# = lib/evo/tool/multi_domain_seq_extractor - MultiDomainSeqExtractor class +# +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) + +require 'lib/evo/util/constants' +require 'lib/evo/util/util' +require 'lib/evo/util/command_line_arguments' +require 'lib/evo/io/parser/hmmscan_multi_domain_extractor' + +module Evoruby + class MultiDomainSeqExtractor + + PRG_NAME = "mdsx" + PRG_VERSION = "1.000" + PRG_DESC = "Extraction of multi domain sequences from hmmscan output" + PRG_DATE = "20170220" + WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" + + E_VALUE_THRESHOLD_DEFAULT = 0.1 + LENGTH_THRESHOLD_DEFAULT = 50 + + E_VALUE_THRESHOLD_OPTION = 'e' + LENGTH_THRESHOLD_OPTION = 'l' + ADD_POSITION_OPTION = 'p' + ADD_DOMAIN_NUMBER_OPTION = 'd' + ADD_SPECIES = 's' + LOG_FILE_SUFFIX = '_multi_domain_seq_extr.log' + PASSED_SEQS_SUFFIX = '_with_passing_domains.fasta' + FAILED_SEQS_SUFFIX = '_with_no_passing_domains.fasta' + HELP_OPTION_1 = 'help' + HELP_OPTION_2 = 'h' + def run() + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC , + PRG_DATE, + WWW, + STDOUT ) + + if ( ARGV == nil || ( ARGV.length < 1 ) ) + print_help + exit( -1 ) + end + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError + Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT ) + end + + if ( cla.is_option_set?( HELP_OPTION_1 ) || + cla.is_option_set?( HELP_OPTION_2 ) ) + print_help + exit( 0 ) + end + + unless ( cla.get_number_of_files == 2 || cla.get_number_of_files == 3 || cla.get_number_of_files == 4 ) + print_help + exit( -1 ) + end + + allowed_opts = Array.new + allowed_opts.push( E_VALUE_THRESHOLD_OPTION ) + allowed_opts.push( ADD_POSITION_OPTION ) + allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION ) + allowed_opts.push( LENGTH_THRESHOLD_OPTION ) + allowed_opts.push( ADD_SPECIES ) + + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, + "unknown option(s): " + disallowed, + STDOUT ) + end + + e_value_threshold = E_VALUE_THRESHOLD_DEFAULT + if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) ) + begin + e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + if ( e_value_threshold < 0.0 ) + Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT ) + end + end + + length_threshold = LENGTH_THRESHOLD_DEFAULT + if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) ) + begin + length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + if ( length_threshold < 0) + Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT ) + end + end + + domain_id = cla.get_file_name( 0 ) + hmmscan_output = cla.get_file_name( 1 ) + fasta_sequence_file = "" + outfile = "" + + if (cla.get_number_of_files == 4) + fasta_sequence_file = cla.get_file_name( 2 ) + outfile = cla.get_file_name( 3 ) + elsif (cla.get_number_of_files == 2 || cla.get_number_of_files == 3) + if (cla.get_number_of_files == 3) + fasta_sequence_file = cla.get_file_name( 2 ) + else + hmmscan_index = hmmscan_output.index(Constants::HMMSCAN) + if ( hmmscan_index != nil ) + prefix = hmmscan_output[0 .. hmmscan_index-1 ] + suffix = Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX + files = Dir.entries( "." ) + matching_files = Util.get_matching_files( files, prefix, suffix) + if matching_files.length < 1 + Util.fatal_error( PRG_NAME, 'no file matching [' + prefix + + '...' + suffix + '] present in current directory: need to indicate as second argument' ) + end + if matching_files.length > 1 + Util.fatal_error( PRG_NAME, 'more than one file matching [' + + prefix + '...' + suffix + '] present in current directory: need to indicate as second argument' ) + end + fasta_sequence_file = matching_files[ 0 ] + else + Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' ) + end + end + hmmscan_index = hmmscan_output.index(Constants::HMMSCAN) + if ( hmmscan_index != nil ) + outfile = hmmscan_output.sub(Constants::HMMSCAN, "_") + "_" + domain_id + e = e_value_threshold >= 1 ? e_value_threshold.to_i : e_value_threshold.to_s.sub!('.','_') + outfile += "_E" + e.to_s + outfile += "_L" + length_threshold.to_i.to_s + else + Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' ) + end + end + + if outfile.downcase.end_with?( ".fasta" ) + outfile = outfile[ 0 .. outfile.length - 7 ] + elsif outfile.downcase.end_with?( ".fsa" ) + outfile = outfile[ 0 .. outfile.length - 5 ] + end + + add_position = false + if ( cla.is_option_set?( ADD_POSITION_OPTION ) ) + add_position = true + end + + add_domain_number = false + if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) ) + add_domain_number = true + end + + add_species = false + if cla.is_option_set?( ADD_SPECIES ) + add_species = true + end + + log = String.new + ld = Constants::LINE_DELIMITER + + puts() + puts( "Domain : " + domain_id ) + log << "Domain : " + domain_id + ld + puts( "Hmmscan outputfile : " + hmmscan_output ) + log << "Hmmscan outputfile : " + hmmscan_output + ld + puts( "Fasta sequencefile (complete sequences) : " + fasta_sequence_file ) + log << "Fasta sequencefile (complete sequences) : " + fasta_sequence_file + ld + puts( "Outputfile : " + outfile + ".fasta" ) + log << "Outputfile : " + outfile + ld + puts( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX ) + log << "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX + ld + puts( "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX ) + log << "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX + ld + puts( "Logfile : " + outfile + LOG_FILE_SUFFIX ) + log << "Logfile : " + outfile + LOG_FILE_SUFFIX + ld + if ( e_value_threshold >= 0.0 ) + puts( "iE-value threshold : " + e_value_threshold.to_s ) + log << "iE-value threshold : " + e_value_threshold.to_s + ld + else + puts( "iE-value threshold : no threshold" ) + log << "iE-value threshold : no threshold" + ld + end + if ( length_threshold > 0 ) + puts( "Length threshold (env) : " + length_threshold.to_s ) + log << "Length threshold (env) : " + length_threshold.to_s + ld + else + puts( "Length threshold (env) : no threshold" ) + log << "Length threshold (env) : no threshold" + ld + end + if ( add_position ) + puts( "Add positions (rel to complete seq) to extracted domains : true" ) + log << "Add positions (rel to complete seq) to extracted domains : true" + ld + else + puts( "Add positions (rel to complete seq) to extracted domains : false" ) + log << "Add positions (rel to complete seq) to extracted domains : false" + ld + end + + if ( add_domain_number ) + puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" ) + log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld + else + puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" ) + log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld + end + + puts + log << ld + + domain_count = 0 + begin + parser = HmmscanMultiDomainExtractor.new() + domain_count = parser.parse( domain_id, + hmmscan_output, + fasta_sequence_file, + outfile, + outfile + PASSED_SEQS_SUFFIX, + outfile + FAILED_SEQS_SUFFIX, + e_value_threshold, + length_threshold, + add_position, + add_domain_number, + add_species, + log ) + rescue ArgumentError, IOError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + + rescue Exception => e + puts e.backtrace + Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT ) + + end + + puts + Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" ) + Util.print_message( PRG_NAME, "wrote: " + outfile + ".fasta") + Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX ) + Util.print_message( PRG_NAME, "wrote: " + outfile + PASSED_SEQS_SUFFIX ) + Util.print_message( PRG_NAME, "wrote: " + outfile + FAILED_SEQS_SUFFIX ) + + begin + f = File.open( outfile + LOG_FILE_SUFFIX, 'a' ) + f.print( log ) + f.close + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + puts + Util.print_message( PRG_NAME, "OK" ) + puts + + end + + def print_help() + puts() + puts( "Usage:" ) + puts() + puts( " " + PRG_NAME + ".rb [options] [file containing complete sequences in fasta format] [outputfile]" ) + puts() + puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "= : iE-value threshold for target domain, default is " + E_VALUE_THRESHOLD_DEFAULT.to_s ) + puts( " -" + LENGTH_THRESHOLD_OPTION + "= : length threshold target domain (env), default is " + LENGTH_THRESHOLD_DEFAULT.to_s ) + puts( " -" + ADD_DOMAIN_NUMBER_OPTION + " : to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" ) + puts( " -" + ADD_POSITION_OPTION + " : to add positions (rel to complete seq) to extracted domains" ) + puts( " -" + ADD_SPECIES + " : to add species [in brackets]" ) + puts() + puts( "Examples:" ) + puts + puts( " " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 P53_ni.fasta P53_#{Constants::PFAM_V_FOR_EX}_10_Pkinase_E1_0e-06_L50" ) + puts + puts( " " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 P53_ni.fasta" ) + puts + puts( " " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10" ) + puts() + end + + end # class DomainSequenceExtractor + +end # module Evoruby