X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fruby%2Fevoruby%2Flib%2Fevo%2Fio%2Fparser%2Fhmmscan_domain_extractor.rb;h=8c66f429da7c54b1fca2e240d213ceb4c176e723;hb=b8e92404ed8c15340596fdae0474272cd29eaed4;hp=417820c74b188bd58c60a05d3d99d629525681f8;hpb=0d669bffc29d659c8542e3353af09b9bf573a5b5;p=jalview.git diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb index 417820c..8c66f42 100644 --- a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb @@ -38,6 +38,7 @@ module Evoruby add_domain_number_as_letter, trim_name, add_species, + min_linker, log ) Util.check_file_for_readability( hmmsearch_output ) @@ -56,8 +57,17 @@ module Evoruby end out_msa = Msa.new + failed_seqs = Msa.new passed_seqs = Msa.new + out_msa_pairs = nil + out_msa_distant_partners = nil + out_msa_singlets = nil + if min_linker + out_msa_pairs = Msa.new + out_msa_distant_partners = Msa.new + out_msa_singlets = Msa.new + end ld = Constants::LINE_DELIMITER @@ -66,9 +76,14 @@ module Evoruby proteins_with_passing_domains = 0 proteins_with_failing_domains = 0 max_domain_copy_number_per_protein = -1 - max_domain_copy_number_sequence = '' - failed_species_counts = Hash.new - passed_species_counts = Hash.new + max_domain_copy_number_sequence = "" + + prev_sequence = nil + prev_number = nil + prev_env_from = nil + prev_env_to = nil + prev_i_e_value = nil + prev_is_pair = false File.open( hmmsearch_output ) do | file | while line = file.gets @@ -83,7 +98,6 @@ module Evoruby next end - sequence = $4 number = $10.to_i out_of = $11.to_i @@ -94,8 +108,9 @@ module Evoruby max_domain_copy_number_sequence = sequence max_domain_copy_number_per_protein = number end - if ( ( ( e_value_threshold.to_f < 0.0 ) || ( i_e_value <= e_value_threshold ) ) && - ( ( length_threshold.to_f <= 0 ) || ( env_to - env_from + 1 ) >= length_threshold.to_f ) ) + if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) && + ( ( length_threshold <= 0 ) || ( env_to - env_from + 1 ) >= length_threshold.to_f ) ) + extract_domain( sequence, number, out_of, @@ -110,35 +125,124 @@ module Evoruby trim_name , add_species ) domain_pass_counter += 1 - count_species( sequence, passed_species_counts ) + if passed_seqs.find_by_name_start( sequence, true ).length < 1 add_sequence( sequence, in_msa, passed_seqs ) proteins_with_passing_domains += 1 end - else - print( domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" ) - log << domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" - if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) ) - print( " iE=" + i_e_value.to_s ) - log << " iE=" + i_e_value.to_s - end - if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f ) - le = env_to - env_from + 1 - print( " l=" + le.to_s ) - log << " l=" + le.to_s - end - print( Constants::LINE_DELIMITER ) - log << Constants::LINE_DELIMITER - domain_fail_counter += 1 - count_species( sequence, failed_species_counts ) - if failed_seqs.find_by_name_start( sequence, true ).length < 1 - add_sequence( sequence, in_msa, failed_seqs ) - proteins_with_failing_domains += 1 + + if min_linker + if ( ( e_value_threshold < 0.0 ) || ( prev_i_e_value <= e_value_threshold ) ) && + ( ( length_threshold <= 0 ) || ( ( prev_env_to - prev_env_from + 1 ) >= length_threshold.to_f ) ) + + if prev_sequence && sequence != prev_sequence + prev_is_pair = false + end + + if out_of == 1 + + if rev_sequence && sequence == prev_sequence + puts "sequence == prev_sequence && out_of == 1" + exit + end + extract_domain( sequence, + number, + out_of, + env_from, + env_to, + in_msa, + out_msa_singlets, + false, + true, + false, + false, + trim_name , + add_species ) + + elsif prev_sequence && sequence == prev_sequence + + if ( env_from - prev_env_to ) <= min_linker ####### + extract_domain( sequence, + prev_number.to_s + "+" + number.to_s, + out_of, + prev_env_from, + env_to, + in_msa, + out_msa_pairs, + false, + true, + false, + false, + trim_name , + add_species ) + prev_is_pair = true + else ####### + if !prev_is_pair + extract_domain( sequence, + prev_number, + out_of, + prev_env_from, + prev_env_to, + in_msa, + out_msa_distant_partners, + false, + true, + false, + false, + trim_name , + add_species ) + end + if number == out_of + extract_domain( sequence, + number, + out_of, + env_from, + env_to, + in_msa, + out_msa_distant_partners, + false, + true, + false, + false, + trim_name , + add_species ) + end + prev_is_pair = false + end ####### + + end + prev_sequence = sequence + prev_number = number + prev_env_from = env_from + prev_env_to = env_to + prev_i_e_value = i_e_value + end + + else + print( domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" ) + log << domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" + if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) ) + print( " iE=" + i_e_value.to_s ) + log << " iE=" + i_e_value.to_s + end + if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f ) + le = env_to - env_from + 1 + print( " l=" + le.to_s ) + log << " l=" + le.to_s + end + print( Constants::LINE_DELIMITER ) + log << Constants::LINE_DELIMITER + domain_fail_counter += 1 + + if failed_seqs.find_by_name_start( sequence, true ).length < 1 + add_sequence( sequence, in_msa, failed_seqs ) + proteins_with_failing_domains += 1 + end end end - end - end - end + end # if !is_ignorable?( line ) && line =~ /^\S+\s+/ + end # while line = file.gets + end # File.open( hmmsearch_output ) do | file | if domain_pass_counter < 1 error_msg = "no domain sequences were extracted" @@ -156,50 +260,50 @@ module Evoruby log << Constants::LINE_DELIMITER end - io = MsaIO.new() - w = FastaWriter.new() - w.set_line_width( 60 ) - w.clean( true ) + write_msa( out_msa, outfile ) + write_msa( passed_seqs, passed_seqs_outfile ) + write_msa( failed_seqs, failed_seqs_outfile ) - begin - io.write_to_file( out_msa, outfile, w ) - rescue Exception - error_msg = "could not write to \"" + outfile + "\"" - raise IOError, error_msg + if out_msa_pairs + write_msa( out_msa_pairs, outfile +"_" + min_linker.to_s ) end - begin - io.write_to_file( passed_seqs, passed_seqs_outfile, w ) - rescue Exception - error_msg = "could not write to \"" + passed_seqs_outfile + "\"" - raise IOError, error_msg + if out_msa_singlets + write_msa( out_msa_singlets, outfile +"_singles" ) end - begin - io.write_to_file( failed_seqs, failed_seqs_outfile, w ) - rescue Exception - error_msg = "could not write to \"" + failed_seqs_outfile + "\"" - raise IOError, error_msg + if out_msa_distant_partners + write_msa( out_msa_distant_partners, outfile +"_dist" ) end + log << ld log << "passing domains : " + domain_pass_counter.to_s + ld log << "failing domains : " + domain_fail_counter.to_s + ld log << "proteins with passing domains: " + proteins_with_passing_domains.to_s + ld log << "proteins with failing domains: " + proteins_with_failing_domains.to_s + ld log << ld - log << 'passing domains counts per species: ' << ld - passed_species_counts.each_pair { | species, count | log << "#{species}: #{count}" << ld } - log << ld - log << 'failing domains counts per species: ' << ld - failed_species_counts.each_pair { | species, count | log << "#{species}: #{count}" << ld } - log << ld + return domain_pass_counter end # parse + private + def write_msa( msa, filename ) + io = MsaIO.new() + w = FastaWriter.new() + w.set_line_width( 60 ) + w.clean( true ) + begin + io.write_to_file( msa, filename, w ) + rescue Exception + error_msg = "could not write to \"" + filename + "\"" + raise IOError, error_msg + end + end + def add_sequence( sequence_name, in_msa, add_to_msa ) seqs = in_msa.find_by_name_start( sequence_name, true ) @@ -229,11 +333,11 @@ module Evoruby add_domain_number_as_letter, trim_name, add_species ) - if ( number < 1 || out_of < 1 || number > out_of ) + if number.is_a?( Fixnum ) && ( number < 1 || out_of < 1 || number > out_of ) error_msg = "impossible: number=" + number.to_s + ", out of=" + out_of.to_s raise ArgumentError, error_msg end - if ( seq_from < 1 || seq_to < 1 || seq_from >= seq_to ) + if seq_from < 1 || seq_to < 1 || seq_from >= seq_to error_msg = "impossible: seq-f=" + seq_from.to_s + ", seq-t=" + seq_to.to_s raise ArgumentError, error_msg end @@ -262,15 +366,15 @@ module Evoruby end if out_of != 1 - if ( add_domain_number_as_digit ) + if add_domain_number_as_digit seq.set_name( seq.get_name + number.to_s ) - elsif ( add_domain_number_as_letter ) + elsif add_domain_number_as_letter if number > 25 error_msg = 'too many identical domains per sequence, cannot use letters to distinguish them' raise StandardError, error_msg end seq.set_name( seq.get_name + ( number + 96 ).chr ) - elsif ( add_domain_number ) + elsif add_domain_number seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s ) end end @@ -290,28 +394,7 @@ module Evoruby species = orig_name[ a .. b ] seq.set_name( seq.get_name + " " + species ) end - out_msa.add_sequence( seq ) - - end - - def count_species( sequence, species_counts_map ) - species = get_species( sequence ) - if species != nil - if !species_counts_map.has_key?( species ) - species_counts_map[ species ] = 1 - else - species_counts_map[ species ] = species_counts_map[ species ] + 1 - end - end - end - - def get_species( sequence_name ) - if sequence_name =~ /^.+_(.+)$/ - return $1 - else - return nil - end end def is_ignorable?( line )