From: cmzmasek@gmail.com Date: Mon, 1 Oct 2012 23:29:12 +0000 (+0000) Subject: in progress X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=9a0520d4b055dc6e100534ee3c8953ecae716ff2;p=jalview.git in progress --- diff --git a/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor.rb index 37c6d78..c9311eb 100644 --- a/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor.rb @@ -17,9 +17,9 @@ module Evoruby class DomainSequenceExtractor PRG_NAME = "dsx" - PRG_VERSION = "1.000" + PRG_VERSION = "2.000" PRG_DESC = "extraction of domain sequences from hmmscan output" - PRG_DATE = "20120906" + PRG_DATE = "20121001" COPYRIGHT = "2012 Christian M Zmasek" CONTACT = "phylosoft@gmail.com" WWW = "www.phylosoft.org" @@ -31,8 +31,8 @@ module Evoruby ADD_SPECIES = 's' MIN_LINKER_OPT = 'ml' LOG_FILE_SUFFIX = '_domain_seq_extr.log' - PASSED_SEQS_SUFFIX = '_domain_seq_extr_passed' - FAILED_SEQS_SUFFIX = '_domain_seq_extr_failed' + PASSED_SEQS_SUFFIX = '_with_passing_domains.fasta' + FAILED_SEQS_SUFFIX = '_with_no_passing_domains.fasta' HELP_OPTION_1 = 'help' HELP_OPTION_2 = 'h' @@ -86,6 +86,13 @@ module Evoruby fasta_sequence_file = cla.get_file_name( 2 ) outfile = cla.get_file_name( 3 ) + if outfile.downcase.end_with?( ".fasta" ) + outfile = outfile[ 0 .. outfile.length - 7 ] + elsif outfile.downcase.end_with?( ".fsa" ) + outfile = outfile[ 0 .. outfile.length - 5 ] + end + + add_position = false if ( cla.is_option_set?( ADD_POSITION_OPTION ) ) add_position = true @@ -148,7 +155,7 @@ module Evoruby log << "Hmmscan outputfile : " + hmmsearch_output + ld puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file ) log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld - puts( "Outputfile : " + outfile ) + puts( "Outputfile : " + outfile + ".fasta" ) log << "Outputfile : " + outfile + ld puts( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX ) log << "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX + ld @@ -223,10 +230,10 @@ module Evoruby puts Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" ) - Util.print_message( PRG_NAME, "wrote; " + outfile ) + Util.print_message( PRG_NAME, "wrote; " + outfile + ".fasta") Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX ) - Util.print_message( PRG_NAME, "(wrote: " + outfile + PASSED_SEQS_SUFFIX + ")" ) - Util.print_message( PRG_NAME, "(wrote: " + outfile + FAILED_SEQS_SUFFIX + ")" ) + Util.print_message( PRG_NAME, "wrote: " + outfile + PASSED_SEQS_SUFFIX ) + Util.print_message( PRG_NAME, "wrote: " + outfile + FAILED_SEQS_SUFFIX ) begin f = File.open( outfile + LOG_FILE_SUFFIX, 'a' ) diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb index e7a928f..b7092e0 100644 --- a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb @@ -38,7 +38,7 @@ module Evoruby Util.check_file_for_readability( hmmscan_output ) Util.check_file_for_readability( fasta_sequence_file ) - Util.check_file_for_writability( outfile ) + Util.check_file_for_writability( outfile + ".fasta" ) Util.check_file_for_writability( passed_seqs_outfile ) Util.check_file_for_writability( failed_seqs_outfile ) @@ -56,7 +56,7 @@ module Evoruby failed_seqs = Msa.new passed_seqs = Msa.new out_msa_pairs = nil - out_msa_distant_partners = nil + out_msa_isolated = nil out_msa_singles = nil if min_linker out_msa_pairs = Msa.new @@ -68,17 +68,12 @@ module Evoruby domain_pass_counter = 0 domain_fail_counter = 0 - singlets_counter = 0 - distant_pairs_counter = 0 - close_pairs_counter = 0 proteins_with_failing_domains = 0 max_domain_copy_number_per_protein = -1 max_domain_copy_number_sequence = "" - hmmscan_datas = Array.new - File.open( hmmscan_output ) do | file | while line = file.gets if !is_ignorable?( line ) && line =~ /^\S+\s+/ @@ -120,11 +115,6 @@ module Evoruby print( ld ) log << ld domain_fail_counter += 1 - - if failed_seqs.find_by_name_start( sequence, true ).length < 1 - add_sequence( sequence, in_msa, failed_seqs ) - proteins_with_failing_domains += 1 - end end if number > out_of @@ -132,19 +122,34 @@ module Evoruby raise StandardError, error_msg end - if number == out_of && !hmmscan_datas.empty? - domain_pass_counter += process_hmmscan_datas( hmmscan_datas, - in_msa, - add_position, - add_domain_number, - add_species, - out_msa, - out_msa_singles, - out_msa_pairs, - out_msa_isolated, - passed_seqs, - min_linker ) - + if number == out_of + if !hmmscan_datas.empty? + process_hmmscan_datas( hmmscan_datas, + in_msa, + add_position, + add_domain_number, + add_species, + out_msa, + out_msa_singles, + out_msa_pairs, + out_msa_isolated, + min_linker ) + domain_pass_counter += hmmscan_datas.length + if passed_seqs.find_by_name_start( sequence, true ).length < 1 + add_sequence( sequence, in_msa, passed_seqs ) + else + error_msg = "this should not have happened" + raise StandardError, error_msg + end + else + if failed_seqs.find_by_name_start( sequence, true ).length < 1 + add_sequence( sequence, in_msa, failed_seqs ) + proteins_with_failing_domains += 1 + else + error_msg = "this should not have happened" + raise StandardError, error_msg + end + end hmmscan_datas.clear end @@ -170,33 +175,33 @@ module Evoruby log << ld end - write_msa( out_msa, outfile ) + write_msa( out_msa, outfile + ".fasta" ) write_msa( passed_seqs, passed_seqs_outfile ) write_msa( failed_seqs, failed_seqs_outfile ) if out_msa_pairs - write_msa( out_msa_pairs, outfile +"_" + min_linker.to_s ) + write_msa( out_msa_pairs, outfile +"_" + min_linker.to_s + ".fasta") end if out_msa_singles - write_msa( out_msa_singles, outfile +"_singles" ) + write_msa( out_msa_singles, outfile +"_singles" + ".fasta") end if out_msa_isolated - write_msa( out_msa_isolated, outfile + "_" + min_linker.to_s + "_isolated" ); + write_msa( out_msa_isolated, outfile + "_" + min_linker.to_s + "_isolated" + ".fasta"); end log << ld - log << "passing domains : " + domain_pass_counter.to_s + ld + log << "passing domains : " + domain_pass_counter.to_s + ld if ( min_linker ) - log << "single domains : " + singlets_counter.to_s + ld - log << "domains in close pairs : " + close_pairs_counter.to_s + ld - log << "isolated domains : " + distant_pairs_counter.to_s + ld + log << "single domains : " + out_msa_singles.get_number_of_seqs.to_s + ld + log << "domains in close pairs : " + out_msa_pairs.get_number_of_seqs.to_s + ld + log << "isolated domains : " + out_msa_isolated.get_number_of_seqs.to_s + ld end - log << "failing domains : " + domain_fail_counter.to_s + ld - log << "proteins with passing domains: " + passed_seqs.get_number_of_seqs.to_s + ld - log << "proteins with failing domains: " + proteins_with_failing_domains.to_s + ld + log << "failing domains : " + domain_fail_counter.to_s + ld + log << "proteins with passing domains : " + passed_seqs.get_number_of_seqs.to_s + ld + log << "proteins with no passing domains: " + proteins_with_failing_domains.to_s + ld log << ld return domain_pass_counter @@ -243,13 +248,10 @@ module Evoruby out_msa_singles, out_msa_pairs, out_msa_isolated, - passed_seqs, min_linker ) actual_out_of = hmmscan_datas.size - domain_pass_counter = 0; - hmmscan_datas.each_with_index do |hmmscan_data, index| if hmmscan_data.number < ( index + 1 ) error_msg = "hmmscan_data.number < ( index + 1 ) (this should not have happened)" @@ -266,15 +268,8 @@ module Evoruby add_position, add_domain_number, add_species ) - domain_pass_counter += 1 - - if passed_seqs.find_by_name_start( hmmscan_data.seq_name, true ).length < 1 - add_sequence( hmmscan_data.seq_name, in_msa, passed_seqs ) - end - if min_linker - if actual_out_of == 1 extract_domain( hmmscan_data.seq_name, 1, @@ -289,15 +284,11 @@ module Evoruby else first = index == 0 last = index == hmmscan_datas.length - 1 - # next_env_from = hmmscan_datas[ index + 1 ].env_from - # env_to = hmmscan_data.env_to - # env_from = hmmscan_data.env_from - # prev_env_to = hmmscan_datas[ index - 1 ].env_to - - if (( first && ( ( hmmscan_datas[ index + 1 ].env_from - hmmscan_data.env_to ) > min_linker) ) || - ( last && ( ( hmmscan_data.env_from - hmmscan_datas[ index - 1 ].env_to ) > min_linker ) ) || - ( !first && !last && (hmmscan_datas[ index + 1 ].env_from- hmmscan_data.env_to > min_linker ) && ( last && hmmscan_data.env_from - hmmscan_datas[ index - 1 ].env_to > min_linker ) )) + if ( ( first && ( ( hmmscan_datas[ index + 1 ].env_from - hmmscan_data.env_to ) > min_linker) ) || + ( last && ( ( hmmscan_data.env_from - hmmscan_datas[ index - 1 ].env_to ) > min_linker ) ) || + ( !first && !last && ( ( hmmscan_datas[ index + 1 ].env_from - hmmscan_data.env_to ) > min_linker ) && + ( ( hmmscan_data.env_from - hmmscan_datas[ index - 1 ].env_to ) > min_linker ) ) ) extract_domain( hmmscan_data.seq_name, index + 1, @@ -325,7 +316,6 @@ module Evoruby end end end # each - domain_pass_counter end # def process_hmmscan_data def extract_domain( sequence,