From 89a0bf45be7d28991571b0aa08af686e8b4652cf Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Wed, 16 Jan 2013 22:55:12 +0000 Subject: [PATCH] inprogress --- forester/ruby/evoruby/exe/select_same_gn.rb | 30 +++++++++++++++++++-------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/forester/ruby/evoruby/exe/select_same_gn.rb b/forester/ruby/evoruby/exe/select_same_gn.rb index 083a6a7..272595c 100755 --- a/forester/ruby/evoruby/exe/select_same_gn.rb +++ b/forester/ruby/evoruby/exe/select_same_gn.rb @@ -25,12 +25,13 @@ module Evoruby end all_names = Set.new - all_seqs = Set.new + all_seqs_per_species = Hash.new gn_to_seqs = Hash.new unique_genes_msa = Msa.new longest_non_unique_genes_msa = Msa.new gn_re = /GN=(\S+)/ fragment_re = /fragment/i + species_re = /\[([A-Z]{3,5})\]$/ frag_counter = 0 no_gn_counter = 0 @@ -44,19 +45,30 @@ module Evoruby else all_names << name end - mol_seq = seq.get_sequence_as_string.upcase - if all_seqs.include?( mol_seq ) - puts "error: sequence of \"" + name + "\" is not unique (#" + i.to_s + ")" - exit - else - all_seqs << mol_seq - end - + #mol_seq = seq.get_sequence_as_string.upcase + #if all_seqs.include?( mol_seq ) + # puts "error: sequence of \"" + name + "\" is not unique (#" + i.to_s + ")" + # exit + #else + # all_seqs << mol_seq + #end + + + if fragment_re.match( name ) puts "ignored because fragment: " + name frag_counter += 1 next end + + species = nil + if species_re.match( name ) + species = species_re[1] + puts species + else + puts "no species for: " + name + exit + end gn_match = gn_re.match( name ) if IGNORE_SEQS_LACKING_GN -- 1.7.10.2