From 17bfbc7d478a52e9752fbd1aa8d3abf46bf1063a Mon Sep 17 00:00:00 2001 From: cmzmasek Date: Tue, 7 Mar 2017 18:55:59 -0800 Subject: [PATCH] in progress... --- .../io/parser/hmmscan_multi_domain_extractor.rb | 205 +++++++++++++------- forester/ruby/evoruby/lib/evo/msa/msa.rb | 59 +++--- .../lib/evo/tool/multi_domain_seq_extractor.rb | 78 +++----- 3 files changed, 179 insertions(+), 163 deletions(-) diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb index 33e48a4..35b32df 100644 --- a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb @@ -16,14 +16,16 @@ require 'lib/evo/io/parser/hmmscan_parser' module Evoruby class HmmscanMultiDomainExtractor + TESTING = false OUTPUT_ID = 'mdsx' DOMAIN_DELIMITER = ' -- ' PASSING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_passing_full_length_seqs.fasta" FAILING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_failing_full_length_seqs.fasta" TARGET_DA_SUFFIX = "_#{OUTPUT_ID}_target_da.fasta" - CONCAT_TARGET_DOM_SUFFIX = "_#{OUTPUT_ID}_concat_target_dom.fasta" + CONCAT_TARGET_DOM_SUFFIX = "_#{OUTPUT_ID}_concat_target_doms.fasta" TARGET_DOM_OUTPUT_MIDPART = "_#{OUTPUT_ID}_target_dom_" + LOG_FILE_SUFFIX = "_#{OUTPUT_ID}.log" def initialize @passing_domains_data = nil @failing_domains_data = nil @@ -31,6 +33,7 @@ module Evoruby @passsing_domain_architectures = nil @failing_proteins_bc_not_all_target_doms_present = nil @failing_proteins_bc_missing_cutoffs = nil + @log = nil end # raises ArgumentError, IOError, StandardError @@ -44,6 +47,7 @@ module Evoruby failing_fl_seqs_outfile = outfile_base + FAILING_FL_SEQS_SUFFIX target_da_outfile = outfile_base + TARGET_DA_SUFFIX concat_target_dom_outfile = outfile_base + CONCAT_TARGET_DOM_SUFFIX + logfile = outfile_base + LOG_FILE_SUFFIX Util.check_file_for_readability( hmmscan_output ) Util.check_file_for_readability( fasta_sequence_file ) @@ -51,6 +55,9 @@ module Evoruby Util.check_file_for_writability( failing_fl_seqs_outfile ) Util.check_file_for_writability( target_da_outfile ) Util.check_file_for_writability( concat_target_dom_outfile ) + Util.check_file_for_writability( logfile ) + + @log = log_str in_msa = nil factory = MsaFactory.new() @@ -64,14 +71,11 @@ module Evoruby failed_seqs_msa = Msa.new passed_seqs_msa = Msa.new - ld = Constants::LINE_DELIMITER - hmmscan_parser = HmmscanParser.new( hmmscan_output ) results = hmmscan_parser.parse #### # Import: if multiple copies of same domain, thresholds need to be same! - target_domain_ary = Array.new #target_domain_ary.push(TargetDomain.new('DNA_pol_B_exo1', 1e-6, -1, 0.6, 0 )) #target_domain_ary.push(TargetDomain.new('DNA_pol_B', 1e-6, -1, 0.6, 1 )) @@ -81,9 +85,11 @@ module Evoruby # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 )) # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 )) - target_domain_ary.push(TargetDomain.new('BH4', 1, -1, 0.2, 0 )) - target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 1 )) - target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 2 )) + target_domain_ary = parse_da target_da + + # target_domain_ary.push(TargetDomain.new('BH4', 1, -1, 0.2, 0 )) + # target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 1 )) + # target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 2 )) # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 )) # target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 )) @@ -105,7 +111,12 @@ module Evoruby target_domain_architecure.freeze - puts 'Target domain architecture: ' + target_domain_architecure + log 'Hmmscan outputfile : ' + hmmscan_output + log 'Full length fasta sequence file: ' + fasta_sequence_file + log 'Target domain architecture : ' + target_domain_architecure + target_domain_ary.each do |x| + log x.to_str + end target_domain_names = Set.new @@ -128,7 +139,7 @@ module Evoruby out_concatenated_domains_msa = Msa.new results.each do |hmmscan_result| if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name ) - if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure) + if compare(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure) passing_sequences.push(domains_in_query_seq) else failing_sequences.push(domains_in_query_seq) @@ -141,44 +152,46 @@ module Evoruby end # each result if prev_query_seq_name != nil - total_sequences += 1 - if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure) + if compare(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure) passing_sequences.push(domains_in_query_seq) else failing_sequences.push(domains_in_query_seq) end + total_sequences += 1 end - puts - - puts 'Failing domain architectures containing one or more target domain(s):' - @failing_domain_architectures = @failing_domain_architectures .sort{|a, b|a<=>b}.to_h - failing_da_sum = 0 - @failing_domain_architectures .each do |da, count| - failing_da_sum += count - puts count.to_s.rjust(4) + ': ' + da + if in_msa.get_number_of_seqs < total_sequences + error_msg = "hmmscan output contains more protein sequences than fasta sequence file" + raise IOError, error_msg end - puts - - puts 'Passing domain architectures containing target domain(s):' + log + log 'Passing domain architectures containing target domain(s):' @passsing_domain_architectures = @passsing_domain_architectures.sort{|a, b|a<=>b}.to_h passing_da_sum = 0 @passsing_domain_architectures.each do |da, count| passing_da_sum += count - puts count.to_s.rjust(4) + ': ' + da + log count.to_s.rjust(4) + ': ' + da end - - puts 'Passing domain(s):' + log + log 'Failing domain architectures containing one or more target domain(s):' + @failing_domain_architectures = @failing_domain_architectures .sort{|a, b|a<=>b}.to_h + failing_da_sum = 0 + @failing_domain_architectures .each do |da, count| + failing_da_sum += count + log count.to_s.rjust(4) + ': ' + da + end + log + log 'Passing target domain(s):' @passing_domains_data = @passing_domains_data.sort{|a, b|a<=>b}.to_h @passing_domains_data.each do |n, d| - puts d.to_str + log d.to_str end - - puts 'Failing domain(s):' + log + log'Failing target domain(s):' @failing_domains_data = @failing_domains_data.sort{|a, b|a<=>b}.to_h @failing_domains_data.each do |n, d| - puts d.to_str + log d.to_str end unless total_sequences == (passing_sequences.size + failing_sequences.size) @@ -201,60 +214,60 @@ module Evoruby raise StandardError, error_msg end - puts - - puts "Protein sequences in sequence (fasta) file: " + in_msa.get_number_of_seqs.to_s.rjust(5) - puts "Protein sequences in hmmscan output file : " + total_sequences.to_s.rjust(5) - puts " Passing protein sequences : " + passing_sequences.size.to_s.rjust(5) - puts " Failing protein sequences : " + failing_sequences.size.to_s.rjust(5) - puts " Not all target domain present : " + @failing_proteins_bc_not_all_target_doms_present.to_s.rjust(5) - puts " Target domain(s) failing cutoffs : " + @failing_proteins_bc_missing_cutoffs.to_s.rjust(5) - - puts + log + log "Protein sequences in sequence (fasta) file: " + in_msa.get_number_of_seqs.to_s.rjust(5) + log "Protein sequences in hmmscan output file : " + total_sequences.to_s.rjust(5) + log " Passing protein sequences : " + passing_sequences.size.to_s.rjust(5) + log " Failing protein sequences : " + failing_sequences.size.to_s.rjust(5) + log " Not all target domain present : " + @failing_proteins_bc_not_all_target_doms_present.to_s.rjust(5) + log " Target domain(s) failing cutoffs : " + @failing_proteins_bc_missing_cutoffs.to_s.rjust(5) + log out_domain_msas.keys.sort.each do |domain_name| file_name = outfile_base + TARGET_DOM_OUTPUT_MIDPART + domain_name + '.fasta' - - write_msa( out_domain_msas[domain_name], file_name ) - puts "wrote #{domain_name}" + write_msa out_domain_msas[domain_name], file_name + log "Wrote passing target domain sequence for " + domain_name.ljust(16) + ': ' + file_name end - write_msa( out_domain_architecture_msa, target_da_outfile ) - puts "wrote target_domain_architecure" + write_msa out_domain_architecture_msa, target_da_outfile + log 'Wrote target domain architecture : ' + target_da_outfile - write_msa( out_concatenated_domains_msa, concat_target_dom_outfile ) - puts "wrote concatenated_domains_msa" + write_msa out_concatenated_domains_msa, concat_target_dom_outfile + log 'Wrote concatenated target domain(s) : ' + concat_target_dom_outfile passing_sequences.each do | domains | query_name = domains[0].query - if passed_seqs_msa.find_by_name_start( query_name, true ).length < 1 + if (!TESTING) || (passed_seqs_msa.find_by_name_start( query_name, true ).length < 1) add_sequence( query_name, in_msa, passed_seqs_msa ) else - error_msg = "this should not have happened" + error_msg = 'this should not have happened' raise StandardError, error_msg end end failing_sequences.each do | domains | query_name = domains[0].query - if failed_seqs_msa.find_by_name_start( query_name, true ).length < 1 + if (!TESTING) || (failed_seqs_msa.find_by_name_start( query_name, true ).length < 1) add_sequence( query_name, in_msa, failed_seqs_msa ) else - error_msg = "this should not have happened" + error_msg = 'this should not have happened' raise StandardError, error_msg end end - write_msa( passed_seqs_msa, passing_fl_seqs_outfile ) - puts "wrote ..." - write_msa( failed_seqs_msa, failing_fl_seqs_outfile ) - puts "wrote ..." - - log_str << ld + write_msa passed_seqs_msa, passing_fl_seqs_outfile + log 'Wrote passing full length protein sequences : ' + passing_fl_seqs_outfile + write_msa failed_seqs_msa, failing_fl_seqs_outfile + log 'Wrote failing full length protein sequences : ' + failing_fl_seqs_outfile - log_str << "proteins in sequence (fasta) file : " + in_msa.get_number_of_seqs.to_s + ld - - log_str << ld + begin + f = File.open( logfile, 'w' ) + f.print( @log ) + f.close + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + log 'Wrote log file : ' + logfile end # parse @@ -264,7 +277,7 @@ module Evoruby # target_domain_names: Set of String # target_domains: Hash String->TargetDomain # target_domain_architecture: String - def checkit2(domains_in_query_seq, + def compare(domains_in_query_seq, target_domain_names, target_domains, in_msa, @@ -442,7 +455,7 @@ module Evoruby w = FastaWriter.new() w.set_line_width( 60 ) w.clean( true ) - File.delete(filename) if File.exist?(filename) #TODO remove me + File.delete(filename) if File.exist?(filename) begin io.write_to_file( msa, filename, w ) rescue Exception @@ -505,13 +518,42 @@ module Evoruby seq.get_sequence_as_string end + def parse_da( target_da_str ) + target_domain_hash = Hash.new + target_domain_ary = Array.new + target_das = target_da_str.split '--' + target_das.each do |x| + inds = x.split '=' + unless inds.size == 4 + raise IOError, 'domain architecture is ill formatted: ' + x + end + target_domain_name = inds[0] + ie_cutoff = inds[1].to_f + abs_len_cutoff = inds[2].to_i + rel_len_cutoff = inds[3].to_f + if target_domain_hash.has_key? target_domain_name + target_domain_ary.push target_domain_hash[target_domain_name] + else + td = TargetDomain.new(target_domain_name, ie_cutoff, abs_len_cutoff, rel_len_cutoff) + target_domain_hash[target_domain_name] = td + target_domain_ary.push td + end + end + target_domain_ary + end + + def log(str = '') + puts str + @log << str << Constants::LINE_DELIMITER + end + end # class HmmscanMultiDomainExtractor class DomainData def initialize( name ) if (name == nil) || name.size < 1 error_msg = "domain name nil or empty" - raise StandardError, error_msg + raise IOError, error_msg end @name = name @count = 0 @@ -526,16 +568,16 @@ module Evoruby def add( name, length, i_e_value) if name != @name error_msg = "domain names do not match" - raise StandardError, error_msg + raise IOError, error_msg end if length < 0 error_msg = "length cannot me negative" - raise StandardError, error_msg + raise IOError, error_msg end if i_e_value < 0 error_msg = "iE-value cannot me negative" - raise StandardError, error_msg + raise IOError, error_msg end @count += 1 @i_e_value_sum += i_e_value @@ -570,7 +612,7 @@ module Evoruby def to_str s = '' - s << @name.rjust(24) + ': ' + s << @name.rjust(16) + ': ' s << @count.to_s.rjust(4) + ' ' s << avg_length.round(1).to_s.rjust(6) + ' ' s << @len_min.to_s.rjust(4) + ' -' @@ -586,22 +628,41 @@ module Evoruby end class TargetDomain - def initialize( name, i_e_value, abs_len, rel_len, position ) + def initialize(name, i_e_value, abs_len, rel_len) if (name == nil) || name.size < 1 error_msg = "target domain name nil or empty" - raise StandardError, error_msg + raise IOError, error_msg end if rel_len > 1 - error_msg = "target domain relative length is greater than 1" - raise StandardError, error_msg + error_msg = name + ": target domain relative length is greater than 1" + raise IOError, error_msg + end + if (abs_len <= 0) && (rel_len <= 0) + error_msg = name + ": need to have either absolute length or relative length cutoff" + raise IOError, error_msg + end + if (abs_len > 0) && (rel_len > 0) + error_msg = name + ": cannot have both absolute length and relative length cutoff" + raise IOError, error_msg end @name = name @i_e_value = i_e_value @abs_len = abs_len @rel_len = rel_len - @position = position end - attr_reader :name, :i_e_value, :abs_len, :rel_len, :position + + def to_str + s = @name.rjust(16) + ':' + s << ' iE-cutoff: ' + ("%.2E" % @i_e_value).rjust(9) + if @abs_len > 0 + s << ', abs len-cutoff: ' + @abs_len.to_s.rjust(4) + end + if @rel_len > 0 + s << ', rel len-cutoff: ' + @rel_len.to_s.rjust(4) + end + s + end + attr_reader :name, :i_e_value, :abs_len, :rel_len end end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/msa/msa.rb b/forester/ruby/evoruby/lib/evo/msa/msa.rb index 549ae2c..0dcad71 100644 --- a/forester/ruby/evoruby/lib/evo/msa/msa.rb +++ b/forester/ruby/evoruby/lib/evo/msa/msa.rb @@ -6,15 +6,12 @@ # # Last modified: 2017/02/07 - require 'lib/evo/util/constants' require 'lib/evo/util/util' require 'lib/evo/sequence/sequence' module Evoruby - class Msa - def initialize() @sequences = Array.new @identical_seqs_detected = Array.new @@ -22,7 +19,6 @@ module Evoruby @namestart_to_seq_indices = Hash.new end - def add_sequence( sequence ) @sequences.push( sequence ) end @@ -34,8 +30,8 @@ module Evoruby def get_sequence( index ) if ( index < 0 || index > get_number_of_seqs() - 1 ) error_msg = "attempt to get sequence " << - index.to_s << " in alignment of " << get_number_of_seqs().to_s << - " sequences" + index.to_s << " in alignment of " << get_number_of_seqs().to_s << + " sequences" raise ArgumentError, error_msg end return @sequences[ index ] @@ -44,8 +40,8 @@ module Evoruby def remove_sequence!( index ) if ( index < 0 || index > get_number_of_seqs() - 1 ) error_msg = "attempt to remove sequence " << - index.to_s << " in alignment of " << get_number_of_seqs().to_s << - " sequences" + index.to_s << " in alignment of " << get_number_of_seqs().to_s << + " sequences" raise ArgumentError, error_msg end @name_to_seq_indices.clear @@ -57,7 +53,6 @@ module Evoruby @identical_seqs_detected end - def is_aligned() if ( get_number_of_seqs < 1 ) return false @@ -90,7 +85,7 @@ module Evoruby name = name.downcase end if current_name == name || - ( partial_match && current_name.include?( name ) ) + ( partial_match && current_name.include?( name ) ) indices.push( i ) end end @@ -127,26 +122,26 @@ module Evoruby get_sequence( indices[ 0 ] ) end - def find_by_name_start( name, case_sensitive ) + def find_by_name_start(name, case_sensitive = true) if case_sensitive && @namestart_to_seq_indices.has_key?( name ) return @namestart_to_seq_indices[ name ] end indices = [] - for i in 0 ... get_number_of_seqs() - get_sequence( i ).get_name() =~ /^\s*(\S+)/ + for i in 0 ... get_number_of_seqs + get_sequence(i).get_name =~ /^\s*(\S+)/ current_name = $1 if case_sensitive - if !@namestart_to_seq_indices.has_key?( current_name ) - @namestart_to_seq_indices[ current_name ] = [] + unless @namestart_to_seq_indices.has_key?(current_name) + @namestart_to_seq_indices[current_name] = [] end - @namestart_to_seq_indices[ current_name ].push( i ) + @namestart_to_seq_indices[current_name].push( i ) end if !case_sensitive current_name = current_name.downcase name = name.downcase end - if current_name == name - indices.push( i ) + if current_name == name + indices.push(i) end end indices @@ -160,7 +155,7 @@ module Evoruby name = name.downcase end if current_name == name || - ( partial_match && current_name.include?( name ) ) + ( partial_match && current_name.include?( name ) ) return true end end @@ -193,7 +188,6 @@ module Evoruby get_sequence( indices[ 0 ] ) end - def get_sub_alignment( seq_numbers ) msa = Msa.new() for i in 0 ... seq_numbers.length() @@ -230,7 +224,6 @@ module Evoruby s end - def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 ) if ( !is_aligned() ) error_msg = "attempt to get overlap diagram of unaligned msa" @@ -273,7 +266,7 @@ module Evoruby overlap_count = 0 for i in 0...seq_1.get_length() if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) && - !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) + !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) overlap_count += 1 if overlap_count >= min_overlap return true @@ -289,7 +282,7 @@ module Evoruby overlap_count = 0 for i in 0...seq_1.get_length if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) && - !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) + !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) overlap_count += 1 end end @@ -302,10 +295,10 @@ module Evoruby identities_count = 0 for i in 0...seq_1.get_length if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) && - !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) && - seq_1.get_character_code( i ) != 63 && - ( seq_1.get_residue( i ).downcase() == - seq_2.get_residue( i ).downcase() ) + !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) && + seq_1.get_character_code( i ) != 63 && + ( seq_1.get_residue( i ).downcase() == + seq_2.get_residue( i ).downcase() ) identities_count += 1 end end @@ -325,7 +318,6 @@ module Evoruby remove_columns!( get_gap_columns_w_gap_ratio( gap_ratio ) ) end - def remove_sequences_by_gap_ratio!( gap_ratio ) if ( !is_aligned() ) error_msg = "attempt to remove sequences by gap ratio on unaligned msa" @@ -345,7 +337,6 @@ module Evoruby removed end - def remove_redundant_sequences!( consider_taxonomy = false, verbose = false ) n = get_number_of_seqs removed = Array.new @@ -355,10 +346,10 @@ module Evoruby for j in ( i + 1 ) ... n if !to_be_removed.include?( i ) && !to_be_removed.include?( j ) if !consider_taxonomy || - ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) || - ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) ) + ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) || + ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) ) if Util.clean_seq_str( get_sequence( i ).get_sequence_as_string ) == - Util.clean_seq_str( get_sequence( j ).get_sequence_as_string ) + Util.clean_seq_str( get_sequence( j ).get_sequence_as_string ) to_be_removed.add( j ) tax_i = "" @@ -389,7 +380,6 @@ module Evoruby removed end - def remove_sequences_by_non_gap_length!( min_non_gap_length ) n = get_number_of_seqs removed = Array.new @@ -504,7 +494,6 @@ module Evoruby return cols end - # Split an alignment into n alignemnts of equal size, except last one def split( n, verbose = false ) if ( n < 2 || n > get_number_of_seqs ) @@ -538,7 +527,6 @@ module Evoruby msas end - private def get_overlaps( min_overlap = 1 ) @@ -588,7 +576,6 @@ module Evoruby return self end - end # class Msa end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb index c1a35ab..762556b 100644 --- a/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb @@ -15,10 +15,9 @@ module Evoruby PRG_NAME = "mdsx" PRG_VERSION = "1.000" PRG_DESC = "Extraction of multi domain sequences from hmmscan output" - PRG_DATE = "20170220" + PRG_DATE = "20170307" WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" - LOG_FILE_SUFFIX = '_MDSX.log' HELP_OPTION_1 = 'help' HELP_OPTION_2 = 'h' def run() @@ -64,7 +63,7 @@ module Evoruby domain_id = cla.get_file_name( 0 ) hmmscan_output = cla.get_file_name( 1 ) fasta_sequence_file = "" - outfile = "" + outfile_base = "" if cla.get_number_of_files == 3 fasta_sequence_file = cla.get_file_name( 2 ) @@ -90,41 +89,26 @@ module Evoruby end hmmscan_index = hmmscan_output.index(Constants::HMMSCAN) if hmmscan_index != nil - outfile = hmmscan_output.sub(Constants::HMMSCAN, '_') + outfile_base = hmmscan_output.sub(Constants::HMMSCAN, '_') else Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' ) end - log = String.new - ld = Constants::LINE_DELIMITER - - # puts() - # puts( "Domain : " + domain_id ) - # log << "Domain : " + domain_id + ld - # puts( "Hmmscan outputfile : " + hmmscan_output ) - # log << "Hmmscan outputfile : " + hmmscan_output + ld - # puts( "Fasta sequencefile (complete sequences) : " + fasta_sequence_file ) - # log << "Fasta sequencefile (complete sequences) : " + fasta_sequence_file + ld - # puts( "Outputfile : " + outfile + ".fasta" ) - # log << "Outputfile : " + outfile + ld - # puts( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX ) - # log << "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX + ld - # puts( "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX ) - # log << "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX + ld - # puts( "Logfile : " + outfile + LOG_FILE_SUFFIX ) - # log << "Logfile : " + outfile + LOG_FILE_SUFFIX + ld + log_str = '' - puts - log << ld + log_str << PRG_NAME << Constants::LINE_DELIMITER + log_str << PRG_VERSION << Constants::LINE_DELIMITER + log_str << PRG_DESC << Constants::LINE_DELIMITER + log_str << PRG_DATE << Constants::LINE_DELIMITER + log_str << Constants::LINE_DELIMITER - domain_count = 0 begin parser = HmmscanMultiDomainExtractor.new() - domain_count = parser.parse( domain_id, + parser.parse( domain_id, hmmscan_output, fasta_sequence_file, - outfile, - log ) + outfile_base, + log_str) rescue ArgumentError, IOError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) @@ -134,42 +118,26 @@ module Evoruby end puts - - # Util.print_message( PRG_NAME, "wrote: " + outfile + ".fasta") - # Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX ) - # Util.print_message( PRG_NAME, "wrote: " + outfile + PASSED_SEQS_SUFFIX ) - # Util.print_message( PRG_NAME, "wrote: " + outfile + FAILED_SEQS_SUFFIX ) - - begin - f = File.open( outfile + LOG_FILE_SUFFIX, 'a' ) - f.print( log ) - f.close - rescue Exception => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - - puts Util.print_message( PRG_NAME, "OK" ) puts end def print_help() - puts() - puts( "Usage:" ) - puts() - puts( " " + PRG_NAME + ".rb [file containing complete sequences in fasta format]" ) - puts() - puts( " options: -" ) - puts() - puts( "Examples:" ) puts - puts( " " + PRG_NAME + ".rb " ) + puts "Usage:" + puts + puts " " + PRG_NAME + ".rb [file containing complete sequences in fasta format]" + puts + puts " options: -" + puts + puts "Examples:" + puts + puts " " + PRG_NAME + ".rb " + puts puts - - puts() end - end # class DomainSequenceExtractor + end # class MultiDomainSeqExtractor end # module Evoruby -- 1.7.10.2