X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fruby%2Fevoruby%2Fexe%2Fselect_same_gn.rb;h=b2c2410c109e16d94d83fdfba9a693f662d61841;hb=4fe471178831e3cc58206e4f937a6a0013edd00e;hp=97887a9d536757008693e8fcf7b9de3dddffd400;hpb=4a28a6795c9170f35eb1bf698055d5a730d7839a;p=jalview.git diff --git a/forester/ruby/evoruby/exe/select_same_gn.rb b/forester/ruby/evoruby/exe/select_same_gn.rb index 97887a9..b2c2410 100755 --- a/forester/ruby/evoruby/exe/select_same_gn.rb +++ b/forester/ruby/evoruby/exe/select_same_gn.rb @@ -25,32 +25,28 @@ module Evoruby end all_names = Set.new - all_seqs = Set.new + all_seqs_per_species = Hash.new + all_msa_per_species = Hash.new gn_to_seqs = Hash.new unique_genes_msa = Msa.new longest_non_unique_genes_msa = Msa.new gn_re = /GN=(\S+)/ fragment_re = /fragment/i + species_re = /\[([A-Z0-9]{3,6})\]$/ frag_counter = 0 no_gn_counter = 0 + same_seq_counter = 0 for i in 0 ... msa.get_number_of_seqs() seq = msa.get_sequence( i ) name = seq.get_name if all_names.include?( name ) - puts "error: " + name + " is not unique (#" + i + ")" + puts "error: sequence name \"" + name + "\" is not unique (#" + i.to_s + ")" exit else all_names << name end - mol_seq = seq.get_sequence_as_string.upcase - if all_seqs.include?( mol_seq ) - puts "error: sequence of " + name + " is not unique (#" + i + ")" - exit - else - all_seqs << mol_seq - end if fragment_re.match( name ) puts "ignored because fragment: " + name @@ -58,6 +54,28 @@ module Evoruby next end + species = nil + if species_re.match( name ) + s_match = species_re.match( name ) + species = s_match[1] + + unless all_seqs_per_species.has_key?( species ) + all_seqs_per_species[ species ] = Set.new + end + all_seqs = all_seqs_per_species[ species ] + mol_seq = seq.get_sequence_as_string.upcase + if all_seqs.include?( mol_seq ) + puts "ignored because identical sequence in same species: " + name + same_seq_counter += 1 + next + else + all_seqs << mol_seq + end + else + puts "error: no species for: " + name + exit + end + gn_match = gn_re.match( name ) if IGNORE_SEQS_LACKING_GN unless gn_match @@ -73,7 +91,7 @@ module Evoruby gn =nil if gn_match - gn = gn_match[1] + gn = gn_match[1] + "_" + species else if IGNORE_SEQS_LACKING_GN puts "cannot be" @@ -88,13 +106,17 @@ module Evoruby gn_to_seqs[gn].add_sequence(seq) end - puts "Sequences ignored because \"fragment\" in desc: " + frag_counter.to_s - puts "Sequences ignored because no \"GN=\" in desc : " + no_gn_counter.to_s + puts "Sequences ignored because \"fragment\" in desc : " + frag_counter.to_s + if IGNORE_SEQS_LACKING_GN + puts "Sequences ignored because no \"GN=\" in desc : " + no_gn_counter.to_s + end + puts "Sequences ignored because identical sequence in same species: " + same_seq_counter.to_s puts puts counter = 1 gn_to_seqs.each_pair do |gene,seqs| + seq = nil if seqs.get_number_of_seqs > 1 puts counter.to_s + ": " + gene puts seqs.to_fasta @@ -110,12 +132,28 @@ module Evoruby longest_seq = current end end - longest_non_unique_genes_msa.add_sequence(longest_seq) + seq = longest_seq + longest_non_unique_genes_msa.add_sequence( seq ) else - unique_genes_msa.add_sequence( seqs.get_sequence( 0 ) ) + seq = seqs.get_sequence( 0 ) + unique_genes_msa.add_sequence( seq ) end + + species = species_re.match( seq.get_name )[ 1 ] + unless all_msa_per_species.has_key?(species) + all_msa_per_species[species] = Msa.new + end + all_msa_per_species[species].add_sequence(seq) + end + w = FastaWriter.new w.write(unique_genes_msa, "seqs_from_unique_genes.fasta") w.write(longest_non_unique_genes_msa, "longest_seqs_from_nonunique_genes.fasta") + + all_msa_per_species.each_pair do |s,m| + w = FastaWriter.new + w.write(m, s +".fasta") + end + end