X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fruby%2Fevoruby%2Flib%2Fevo%2Fio%2Fparser%2Fhmmsearch_domain_extractor.rb;h=e6d67d9706c7d83c1202e7366c2f3945eda5efb5;hb=8f603946524000820503376e9d63e77815b2d949;hp=d2280240b506cad9d8ca3deac62cce5610a900e3;hpb=253a714c2440558d7c70fe30cac5d7b093798968;p=jalview.git diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmsearch_domain_extractor.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmsearch_domain_extractor.rb index d228024..e6d67d9 100644 --- a/forester/ruby/evoruby/lib/evo/io/parser/hmmsearch_domain_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/io/parser/hmmsearch_domain_extractor.rb @@ -64,9 +64,8 @@ module Evoruby proteins_with_passing_domains = 0 proteins_with_failing_domains = 0 max_domain_copy_number_per_protein = -1 - max_domain_copy_number_sequence = '' - failed_species_counts = Hash.new - passed_species_counts = Hash.new + max_domain_copy_number_sequence = "" + File.open( hmmsearch_output ) do | file | while line = file.gets @@ -76,7 +75,6 @@ module Evoruby # 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/ - # line =~ /^(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)/ sequence = $1 number = $10.to_i out_of = $11.to_i @@ -89,6 +87,7 @@ module Evoruby end if ( ( ( e_value_threshold.to_f < 0.0 ) || ( i_e_value <= e_value_threshold ) ) && ( ( length_threshold.to_f <= 0 ) || ( env_to - env_from + 1 ) >= length_threshold.to_f ) ) + HmmsearchDomainExtractor.extract_domain( sequence, number, out_of, @@ -101,8 +100,9 @@ module Evoruby add_domain_number_as_digit, add_domain_number_as_letter, trim_name ) + domain_pass_counter += 1 - count_species( sequence, passed_species_counts ) + if !passed_seqs.has?( sequence, true, false ) HmmsearchDomainExtractor.add_sequence( sequence, in_msa, passed_seqs ) proteins_with_passing_domains += 1 @@ -122,7 +122,7 @@ module Evoruby print( Constants::LINE_DELIMITER ) log << Constants::LINE_DELIMITER domain_fail_counter += 1 - count_species( sequence, failed_species_counts ) + if !failed_seqs.has?( sequence, true, false ) HmmsearchDomainExtractor.add_sequence( sequence, in_msa, failed_seqs ) proteins_with_failing_domains += 1 @@ -180,19 +180,13 @@ module Evoruby log << "proteins with passing domains: " + proteins_with_passing_domains.to_s + ld log << "proteins with failing domains: " + proteins_with_failing_domains.to_s + ld log << ld - log << 'passing domains counts per species: ' << ld - passed_species_counts.each_pair { | species, count | log << "#{species}: #{count}" << ld } - log << ld - log << 'failing domains counts per species: ' << ld - failed_species_counts.each_pair { | species, count | log << "#{species}: #{count}" << ld } - log << ld + return domain_pass_counter end # parse private - def HmmsearchDomainExtractor.add_sequence( sequence_name, in_msa, add_to_msa ) seqs = in_msa.find_by_name( sequence_name, true, false ) if ( seqs.length < 1 ) @@ -269,24 +263,9 @@ module Evoruby out_msa.add_sequence( seq ) end - def count_species( sequence, species_counts_map ) - species = get_species( sequence ) - if species != nil - if !species_counts_map.has_key?( species ) - species_counts_map[ species ] = 1 - else - species_counts_map[ species ] = species_counts_map[ species ] + 1 - end - end - end + - def get_species( sequence_name ) - if sequence_name =~ /^.+_(.+)$/ - return $1 - else - return nil - end - end + def is_ignorable?( line ) return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )