X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fruby%2Fevoruby%2Fexe%2Fselect_same_gn.rb;h=66d3540802f9066b46e911ab863c97ab30f69a88;hb=f1b56e9c41f3e7369b934e86175cea9c9c0d19b2;hp=1ecd373789e535d0f9c69b05c1e246e3f47b64fc;hpb=2795611c1d3ebe07e0d4b70cecc3f4fdafafdfe5;p=jalview.git diff --git a/forester/ruby/evoruby/exe/select_same_gn.rb b/forester/ruby/evoruby/exe/select_same_gn.rb index 1ecd373..66d3540 100755 --- a/forester/ruby/evoruby/exe/select_same_gn.rb +++ b/forester/ruby/evoruby/exe/select_same_gn.rb @@ -12,7 +12,7 @@ module Evoruby input = ARGV[ 0 ] f = MsaFactory.new() - + IGNORE_SEQS_LACKING_GN = false msa = nil @@ -25,31 +25,54 @@ module Evoruby end all_names = Set.new + all_seqs_per_species = Hash.new gn_to_seqs = Hash.new unique_genes_msa = Msa.new longest_non_unique_genes_msa = Msa.new gn_re = /GN=(\S+)/ fragment_re = /fragment/i + species_re = /\[([A-Z0-9]{3,6})\]$/ frag_counter = 0 no_gn_counter = 0 + same_seq_counter = 0 for i in 0 ... msa.get_number_of_seqs() seq = msa.get_sequence( i ) name = seq.get_name - if all_names.has?( name ) - puts "error: " + name + " is not unique" - else - all_names.put( name ) + if all_names.include?( name ) + puts "error: sequence name \"" + name + "\" is not unique (#" + i.to_s + ")" + exit + else + all_names << name end - - + if fragment_re.match( name ) puts "ignored because fragment: " + name frag_counter += 1 next end - + + if species_re.match( name ) + s_match = species_re.match( name ) + species = s_match[1] + + unless all_seqs_per_species.include?( species ) + all_seqs_per_species[ species ] = Set.new + end + all_seqs = all_seqs_per_species[ species ] + mol_seq = seq.get_sequence_as_string.upcase + if all_seqs.include?( mol_seq ) + puts "ignored because identical sequence in same species: " + name + same_seq_counter += 1 + next + else + all_seqs << mol_seq + end + else + puts "no species for: " + name + end + gn_match = gn_re.match( name ) if IGNORE_SEQS_LACKING_GN unless gn_match @@ -62,7 +85,7 @@ module Evoruby puts "no GN=: " + name end end - + gn =nil if gn_match gn = gn_match[1] @@ -72,16 +95,19 @@ module Evoruby exit end gn = name - end - + end + unless gn_to_seqs.has_key?(gn) gn_to_seqs[gn] = Msa.new end gn_to_seqs[gn].add_sequence(seq) end - puts "Sequences ignored because \"fragment\" in desc: " + frag_counter.to_s - puts "Sequences ignored because no \"GN=\" in desc : " + no_gn_counter.to_s + puts "Sequences ignored because \"fragment\" in desc : " + frag_counter.to_s + if IGNORE_SEQS_LACKING_GN + puts "Sequences ignored because no \"GN=\" in desc : " + no_gn_counter.to_s + end + puts "Sequences ignored because identical sequence in same species: " + same_seq_counter.to_s puts puts