module Evoruby
class HmmscanMultiDomainExtractor
+
+ OUTPUT_ID = 'MDSX'
+
+ PASSING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_passing_full_length_seqs.fasta"
+ FAILING_FL_SEQS_SUFFIX = "_#{OUTPUT_ID}_failing_full_length_seqs.fasta"
+ TARGET_DA_SUFFIX = "_#{OUTPUT_ID}_target_da.fasta"
+ CONCAT_TARGET_DOM_SUFFIX = "_#{OUTPUT_ID}_concat_target_dom.fasta"
def initialize
@passed = Hash.new
@non_passsing_domain_architectures = Hash.new
end
# raises ArgumentError, IOError, StandardError
- def parse( domain_id,
+ def parse( target_da,
hmmscan_output,
fasta_sequence_file,
- outfile,
- passed_seqs_outfile,
- failed_seqs_outfile,
- e_value_threshold,
- length_threshold,
- add_position,
- add_domain_number,
- add_species,
- log )
+ outfile_base,
+ log_str )
+
+ passing_fl_seqs_outfile = outfile_base + PASSING_FL_SEQS_SUFFIX
+ failing_fl_seqs_outfile = outfile_base + FAILING_FL_SEQS_SUFFIX
+ target_da_outfile = outfile_base + TARGET_DA_SUFFIX
+ concat_target_dom_outfile = outfile_base + CONCAT_TARGET_DOM_SUFFIX
Util.check_file_for_readability( hmmscan_output )
Util.check_file_for_readability( fasta_sequence_file )
- Util.check_file_for_writability( outfile + ".fasta" )
- Util.check_file_for_writability( passed_seqs_outfile )
- Util.check_file_for_writability( failed_seqs_outfile )
+ Util.check_file_for_writability( passing_fl_seqs_outfile )
+ Util.check_file_for_writability( failing_fl_seqs_outfile )
+ Util.check_file_for_writability( target_da_outfile )
+ Util.check_file_for_writability( concat_target_dom_outfile )
in_msa = nil
factory = MsaFactory.new()
out_msa = Msa.new
- failed_seqs = Msa.new
- passed_seqs = Msa.new
- passed_multi_seqs = Msa.new
+ failed_seqs_msa = Msa.new
+ passed_seqs_msa = Msa.new
+ # passed_multi_seqs_msa = Msa.new
ld = Constants::LINE_DELIMITER
####
# Import: if multiple copies of same domain, threshold need to be same!
target_domain_ary = Array.new
- target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.6, 1 ))
- target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.6, 1 ))
+
+ target_domain_ary.push(TargetDomain.new('DNA_pol_B_exo1', 1e-6, -1, 0.6, 0 ))
+ target_domain_ary.push(TargetDomain.new('DNA_pol_B', 1e-6, -1, 0.6, 1 ))
+
+ # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.6, 0 ))
+ # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.6, 1 ))
# target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 ))
# target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 ))
prev_query_seq_name = nil
domains_in_query_seq = Array.new
- passing_sequences = Array.new
+ passing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq.
+ failing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq.
total_sequences = 0
@passed = Hash.new
out_domain_msas = Hash.new
out_domain_architecture_msa = Msa.new
+ out_concatenated_domains_msa = Msa.new
results.each do |hmmscan_result|
if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name )
- if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, target_domain_architecure)
+ if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_contactenated_domains_msa, target_domain_architecure)
passing_sequences.push(domains_in_query_seq)
+ else
+ failing_sequences.push(domains_in_query_seq)
end
+
domains_in_query_seq = Array.new
total_sequences += 1
end
if prev_query_seq_name != nil
total_sequences += 1
- if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, target_domain_architecure)
+ if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_contactenated_domains_msa, target_domain_architecure)
passing_sequences.push(domains_in_query_seq)
+ else
+ failing_sequences.push(domains_in_query_seq)
end
end
end
puts "writing target_domain_architecure"
- write_msa( out_domain_architecture_msa, "target_domain_architecure" + ".fasta" )
-
- passing_sequences.each do | domains |
+ write_msa( out_domain_architecture_msa, target_da_outfile )
- seq = domains[0].query
- # puts seq + "::"
+ puts "writing contactenated_domains_msa"
+ write_msa( out_concatenated_domains_msa, concat_target_dom_outfile )
- if passed_seqs.find_by_name_start( seq, true ).length < 1
- add_sequence( seq, in_msa, passed_multi_seqs )
+ passing_sequences.each do | domains |
+ query_name = domains[0].query
+ if passed_seqs_msa.find_by_name_start( query_name, true ).length < 1
+ add_sequence( query_name, in_msa, passed_seqs_msa )
else
error_msg = "this should not have happened"
raise StandardError, error_msg
end
+ end
- domains.each do | domain |
- #puts domain.query + ": " + domain.model
+ failing_sequences.each do | domains |
+ query_name = domains[0].query
+ if failed_seqs_msa.find_by_name_start( query_name, true ).length < 1
+ add_sequence( query_name, in_msa, failed_seqs_msa )
+ else
+ error_msg = "this should not have happened"
+ raise StandardError, error_msg
end
- #puts
end
puts
- puts 'Non passing domain architectures:'
+ puts 'Non passing domain architectures containing target domain(s):'
@non_passsing_domain_architectures = @non_passsing_domain_architectures.sort{|a, b|a<=>b}.to_h
@non_passsing_domain_architectures.each do |da, count|
puts da + ': ' + count.to_s
puts
- puts 'Passing domain counts:'
+ puts 'Passing target domain counts:'
@passed = @passed.sort{|a, b|a<=>b}.to_h
@passed.each do |dom, count|
puts dom + ': ' + count.to_s
puts "total sequences : " + total_sequences.to_s
puts "passing sequences: " + passing_sequences.size.to_s
- # write_msa( out_msa, outfile + ".fasta" )
- # write_msa( passed_multi_seqs, passed_seqs_outfile )
- # write_msa( failed_seqs, failed_seqs_outfile )
+ write_msa( passed_seqs_msa, passing_fl_seqs_outfile )
+ write_msa( failed_seqs_msa, failing_fl_seqs_outfile )
- log << ld
- log << "passing target domains : " + domain_pass_counter.to_s + ld
- log << "failing target domains : " + domain_fail_counter.to_s + ld
- log << "proteins in sequence (fasta) file : " + in_msa.get_number_of_seqs.to_s + ld
- log << "proteins in hmmscan outputfile : " + protein_counter.to_s + ld
- log << "proteins with passing target domain(s) : " + passed_seqs.get_number_of_seqs.to_s + ld
- log << "proteins with no passing target domain : " + proteins_with_failing_domains.to_s + ld
- log << "proteins with no target domain : " + domain_not_present_counter.to_s + ld
+ log_str << ld
+ log_str << "passing target domains : " + domain_pass_counter.to_s + ld
+ log_str << "failing target domains : " + domain_fail_counter.to_s + ld
+ log_str << "proteins in sequence (fasta) file : " + in_msa.get_number_of_seqs.to_s + ld
+ log_str << "proteins in hmmscan outputfile : " + protein_counter.to_s + ld
+ log_str << "proteins with passing target domain(s) : " + passed_seqs_msa.get_number_of_seqs.to_s + ld
+ log_str << "proteins with no passing target domain : " + proteins_with_failing_domains.to_s + ld
+ log_str << "proteins with no target domain : " + domain_not_present_counter.to_s + ld
- log << ld
+ log_str << ld
return domain_pass_counter
in_msa,
out_domain_msas,
out_domain_architecture_msa,
+ out_contactenated_domains_msa,
target_domain_architecture)
domain_names_in_query_seq = Set.new
end
if (target_domain.rel_len != nil) && (target_domain.rel_len > 0)
length = 1 + domain.env_to - domain.env_from
- puts (target_domain.rel_len * domain.tlen)
+ # puts (target_domain.rel_len * domain.tlen)
if length < (target_domain.rel_len * domain.tlen)
next
end
error_msg = "seq names do not match: this should not have happened"
raise StandardError, error_msg
end
- puts "query: " + query_seq
extracted_domain_seq = extract_domain( query_seq,
domain_counter[domain_name],
end
end
- puts "env from: #{min_env_from} - #{max_env_from}"
- puts "env to : #{min_env_to} - #{max_env_to}"
+ #puts "env from: #{min_env_from} - #{max_env_from}"
+ #puts "env to : #{min_env_to} - #{max_env_to}"
extract_domain( query_seq,
1,
out_domain_architecture_msa )
puts passed_domains_counts
- puts concatenated_domains
+
+ out_contactenated_domains_msa.add( query_seq, concatenated_domains)
+
true
end
PRG_DATE = "20170220"
WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
- E_VALUE_THRESHOLD_DEFAULT = 0.1
- LENGTH_THRESHOLD_DEFAULT = 50
-
- E_VALUE_THRESHOLD_OPTION = 'e'
- LENGTH_THRESHOLD_OPTION = 'l'
- ADD_POSITION_OPTION = 'p'
- ADD_DOMAIN_NUMBER_OPTION = 'd'
- ADD_SPECIES = 's'
- LOG_FILE_SUFFIX = '_multi_domain_seq_extr.log'
- PASSED_SEQS_SUFFIX = '_with_passing_domains.fasta'
- FAILED_SEQS_SUFFIX = '_with_no_passing_domains.fasta'
+ LOG_FILE_SUFFIX = '_MDSX.log'
HELP_OPTION_1 = 'help'
HELP_OPTION_2 = 'h'
def run()
exit( 0 )
end
- unless ( cla.get_number_of_files == 2 || cla.get_number_of_files == 3 || cla.get_number_of_files == 4 )
+ unless ( cla.get_number_of_files == 2 || cla.get_number_of_files == 3 )
print_help
exit( -1 )
end
allowed_opts = Array.new
- allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
- allowed_opts.push( ADD_POSITION_OPTION )
- allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION )
- allowed_opts.push( LENGTH_THRESHOLD_OPTION )
- allowed_opts.push( ADD_SPECIES )
disallowed = cla.validate_allowed_options_as_str( allowed_opts )
if ( disallowed.length > 0 )
STDOUT )
end
- e_value_threshold = E_VALUE_THRESHOLD_DEFAULT
- if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
- begin
- e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- if ( e_value_threshold < 0.0 )
- Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
- end
- end
-
- length_threshold = LENGTH_THRESHOLD_DEFAULT
- if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) )
- begin
- length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- if ( length_threshold < 0)
- Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT )
- end
- end
-
domain_id = cla.get_file_name( 0 )
hmmscan_output = cla.get_file_name( 1 )
fasta_sequence_file = ""
outfile = ""
- if (cla.get_number_of_files == 4)
+ if (cla.get_number_of_files == 3)
fasta_sequence_file = cla.get_file_name( 2 )
- outfile = cla.get_file_name( 3 )
- elsif (cla.get_number_of_files == 2 || cla.get_number_of_files == 3)
- if (cla.get_number_of_files == 3)
- fasta_sequence_file = cla.get_file_name( 2 )
- else
- hmmscan_index = hmmscan_output.index(Constants::HMMSCAN)
- if ( hmmscan_index != nil )
- prefix = hmmscan_output[0 .. hmmscan_index-1 ]
- suffix = Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX
- files = Dir.entries( "." )
- matching_files = Util.get_matching_files( files, prefix, suffix)
- if matching_files.length < 1
- Util.fatal_error( PRG_NAME, 'no file matching [' + prefix +
- '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
- end
- if matching_files.length > 1
- Util.fatal_error( PRG_NAME, 'more than one file matching [' +
- prefix + '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
- end
- fasta_sequence_file = matching_files[ 0 ]
- else
- Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' )
- end
- end
+ else
hmmscan_index = hmmscan_output.index(Constants::HMMSCAN)
if ( hmmscan_index != nil )
- outfile = hmmscan_output.sub(Constants::HMMSCAN, "_") + "_" + domain_id
- e = e_value_threshold >= 1 ? e_value_threshold.to_i : e_value_threshold.to_s.sub!('.','_')
- outfile += "_E" + e.to_s
- outfile += "_L" + length_threshold.to_i.to_s
+ prefix = hmmscan_output[0 .. hmmscan_index-1 ]
+ suffix = Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX
+ files = Dir.entries( "." )
+ matching_files = Util.get_matching_files( files, prefix, suffix)
+ if matching_files.length < 1
+ Util.fatal_error( PRG_NAME, 'no file matching [' + prefix +
+ '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
+ end
+ if matching_files.length > 1
+ Util.fatal_error( PRG_NAME, 'more than one file matching [' +
+ prefix + '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
+ end
+ fasta_sequence_file = matching_files[ 0 ]
else
Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' )
end
end
-
- if outfile.downcase.end_with?( ".fasta" )
- outfile = outfile[ 0 .. outfile.length - 7 ]
- elsif outfile.downcase.end_with?( ".fsa" )
- outfile = outfile[ 0 .. outfile.length - 5 ]
- end
-
- add_position = false
- if ( cla.is_option_set?( ADD_POSITION_OPTION ) )
- add_position = true
- end
-
- add_domain_number = false
- if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) )
- add_domain_number = true
- end
-
- add_species = false
- if cla.is_option_set?( ADD_SPECIES )
- add_species = true
+ hmmscan_index = hmmscan_output.index(Constants::HMMSCAN)
+ if ( hmmscan_index != nil )
+ outfile = hmmscan_output.sub(Constants::HMMSCAN, "_") + "_MDSX"
+ else
+ Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' )
end
log = String.new
ld = Constants::LINE_DELIMITER
- puts()
- puts( "Domain : " + domain_id )
- log << "Domain : " + domain_id + ld
- puts( "Hmmscan outputfile : " + hmmscan_output )
- log << "Hmmscan outputfile : " + hmmscan_output + ld
- puts( "Fasta sequencefile (complete sequences) : " + fasta_sequence_file )
- log << "Fasta sequencefile (complete sequences) : " + fasta_sequence_file + ld
- puts( "Outputfile : " + outfile + ".fasta" )
- log << "Outputfile : " + outfile + ld
- puts( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX )
- log << "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX + ld
- puts( "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX )
- log << "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX + ld
- puts( "Logfile : " + outfile + LOG_FILE_SUFFIX )
- log << "Logfile : " + outfile + LOG_FILE_SUFFIX + ld
- if ( e_value_threshold >= 0.0 )
- puts( "iE-value threshold : " + e_value_threshold.to_s )
- log << "iE-value threshold : " + e_value_threshold.to_s + ld
- else
- puts( "iE-value threshold : no threshold" )
- log << "iE-value threshold : no threshold" + ld
- end
- if ( length_threshold > 0 )
- puts( "Length threshold (env) : " + length_threshold.to_s )
- log << "Length threshold (env) : " + length_threshold.to_s + ld
- else
- puts( "Length threshold (env) : no threshold" )
- log << "Length threshold (env) : no threshold" + ld
- end
- if ( add_position )
- puts( "Add positions (rel to complete seq) to extracted domains : true" )
- log << "Add positions (rel to complete seq) to extracted domains : true" + ld
- else
- puts( "Add positions (rel to complete seq) to extracted domains : false" )
- log << "Add positions (rel to complete seq) to extracted domains : false" + ld
- end
-
- if ( add_domain_number )
- puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" )
- log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld
- else
- puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" )
- log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld
- end
+ # puts()
+ # puts( "Domain : " + domain_id )
+ # log << "Domain : " + domain_id + ld
+ # puts( "Hmmscan outputfile : " + hmmscan_output )
+ # log << "Hmmscan outputfile : " + hmmscan_output + ld
+ # puts( "Fasta sequencefile (complete sequences) : " + fasta_sequence_file )
+ # log << "Fasta sequencefile (complete sequences) : " + fasta_sequence_file + ld
+ # puts( "Outputfile : " + outfile + ".fasta" )
+ # log << "Outputfile : " + outfile + ld
+ # puts( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX )
+ # log << "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX + ld
+ # puts( "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX )
+ # log << "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX + ld
+ # puts( "Logfile : " + outfile + LOG_FILE_SUFFIX )
+ # log << "Logfile : " + outfile + LOG_FILE_SUFFIX + ld
puts
log << ld
hmmscan_output,
fasta_sequence_file,
outfile,
- outfile + PASSED_SEQS_SUFFIX,
- outfile + FAILED_SEQS_SUFFIX,
- e_value_threshold,
- length_threshold,
- add_position,
- add_domain_number,
- add_species,
log )
rescue ArgumentError, IOError => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
puts
Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" )
- Util.print_message( PRG_NAME, "wrote: " + outfile + ".fasta")
- Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX )
- Util.print_message( PRG_NAME, "wrote: " + outfile + PASSED_SEQS_SUFFIX )
- Util.print_message( PRG_NAME, "wrote: " + outfile + FAILED_SEQS_SUFFIX )
+ # Util.print_message( PRG_NAME, "wrote: " + outfile + ".fasta")
+ # Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX )
+ # Util.print_message( PRG_NAME, "wrote: " + outfile + PASSED_SEQS_SUFFIX )
+ # Util.print_message( PRG_NAME, "wrote: " + outfile + FAILED_SEQS_SUFFIX )
begin
f = File.open( outfile + LOG_FILE_SUFFIX, 'a' )
puts()
puts( "Usage:" )
puts()
- puts( " " + PRG_NAME + ".rb [options] <target domain> <hmmscan outputfile> [file containing complete sequences in fasta format] [outputfile]" )
+ puts( " " + PRG_NAME + ".rb <da> <hmmscan outputfile> [file containing complete sequences in fasta format]" )
puts()
- puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=<f> : iE-value threshold for target domain, default is " + E_VALUE_THRESHOLD_DEFAULT.to_s )
- puts( " -" + LENGTH_THRESHOLD_OPTION + "=<i> : length threshold target domain (env), default is " + LENGTH_THRESHOLD_DEFAULT.to_s )
- puts( " -" + ADD_DOMAIN_NUMBER_OPTION + " : to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" )
- puts( " -" + ADD_POSITION_OPTION + " : to add positions (rel to complete seq) to extracted domains" )
- puts( " -" + ADD_SPECIES + " : to add species [in brackets]" )
+ puts( " options: -" )
puts()
puts( "Examples:" )
puts
- puts( " " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 P53_ni.fasta P53_#{Constants::PFAM_V_FOR_EX}_10_Pkinase_E1_0e-06_L50" )
+ puts( " " + PRG_NAME + ".rb " )
puts
- puts( " " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 P53_ni.fasta" )
- puts
- puts( " " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10" )
+
puts()
end