X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fruby%2Fevoruby%2Fexe%2Fselect_same_gn.rb;h=7dbe9f10017de23937b82744ba2295233f776497;hb=b35ac7de74589875baabdd2f7b5fe8d5cd43ab33;hp=275a1492364a52e47b78e69981abc1fabbcdb2cd;hpb=6f3385fbe81811e9daea8cfe02dce9b1cf61c9dc;p=jalview.git diff --git a/forester/ruby/evoruby/exe/select_same_gn.rb b/forester/ruby/evoruby/exe/select_same_gn.rb index 275a149..7dbe9f1 100755 --- a/forester/ruby/evoruby/exe/select_same_gn.rb +++ b/forester/ruby/evoruby/exe/select_same_gn.rb @@ -10,9 +10,18 @@ require 'lib/evo/io/parser/fasta_parser' module Evoruby + if ARGV.length != 1 + puts "usage: select_same_gn.rb " + exit + end + input = ARGV[ 0 ] f = MsaFactory.new() + IGNORE_SEQS_LACKING_GN = false + IGNORE_FRAGMENTS = false + IGNORE_SPECIES = true + msa = nil begin @@ -22,48 +31,115 @@ module Evoruby exit end + outbase = input.sub( /\..+/, "" ) + + outfile = File.open(outbase + "_log.txt" , "w") + + all_names = Set.new + all_seqs_per_species = Hash.new + all_msa_per_species = Hash.new gn_to_seqs = Hash.new unique_genes_msa = Msa.new longest_non_unique_genes_msa = Msa.new gn_re = /GN=(\S+)/ fragment_re = /fragment/i + species_re = /\[([A-Z0-9]{3,5})\]$/ frag_counter = 0 no_gn_counter = 0 + same_seq_counter = 0 for i in 0 ... msa.get_number_of_seqs() seq = msa.get_sequence( i ) name = seq.get_name - if fragment_re.match( name ) - puts "ignored because fragment: " + name + if all_names.include?( name ) + puts "error: sequence name \"" + name + "\" is not unique (#" + i.to_s + ")" + exit + else + all_names << name + end + + if IGNORE_FRAGMENTS && fragment_re.match( name ) + outfile.puts("ignored because fragment: " + name) frag_counter += 1 next end + + species = nil + if IGNORE_SPECIES || species_re.match( name ) + unless IGNORE_SPECIES + species = species_re.match( name )[ 1 ] + else + species = "XXXXX" + end + + unless all_seqs_per_species.has_key?( species ) + all_seqs_per_species[ species ] = Set.new + end + all_seqs = all_seqs_per_species[ species ] + mol_seq = seq.get_sequence_as_string.upcase + if all_seqs.include?( mol_seq ) + outfile.puts("ignored because identical sequence in same species: " + name ) + + same_seq_counter += 1 + next + else + all_seqs << mol_seq + end + else + puts "error: no species for: " + name + exit + end + gn_match = gn_re.match( name ) - unless gn_match - puts "ignored because no GN=: " + name - no_gn_counter += 1 - next + if IGNORE_SEQS_LACKING_GN + unless gn_match + outfile.puts( "ignored because no GN=: " + name ) + no_gn_counter += 1 + next + end + else + unless gn_match + outfile.puts( "no GN=: " + name ) + end + end + + gn =nil + if gn_match + gn = gn_match[1] + "_" + species + else + if IGNORE_SEQS_LACKING_GN + puts "cannot be" + exit + end + gn = name end - gn = gn_match[1] + unless gn_to_seqs.has_key?(gn) gn_to_seqs[gn] = Msa.new end gn_to_seqs[gn].add_sequence(seq) end - puts "Sequeunces ignored because \"fragment\" in desc: " + frag_counter.to_s - puts "Sequeunces ignored because no \"GN=\" in desc : " + no_gn_counter.to_s - puts - puts + if IGNORE_FRAGMENTS + outfile.puts( "Sequences ignored because \"fragment\" in desc : " + frag_counter.to_s ) + end + + if IGNORE_SEQS_LACKING_GN + outfile.puts( "Sequences ignored because no \"GN=\" in desc : " + no_gn_counter.to_s ) + end + outfile.puts( "Sequences ignored because identical sequence in same species: " + same_seq_counter.to_s ) + outfile.puts + outfile.puts counter = 1 gn_to_seqs.each_pair do |gene,seqs| + seq = nil if seqs.get_number_of_seqs > 1 - puts counter.to_s + ": " + gene - puts seqs.to_fasta - puts - puts + outfile.puts( counter.to_s + ": " + gene ) + outfile.puts( seqs.to_fasta ) + outfile.puts + outfile.puts counter += 1 longest = 0 longest_seq = nil @@ -74,12 +150,35 @@ module Evoruby longest_seq = current end end - longest_non_unique_genes_msa.add_sequence(longest_seq) + seq = longest_seq + longest_non_unique_genes_msa.add_sequence( seq ) + else + seq = seqs.get_sequence( 0 ) + unique_genes_msa.add_sequence( seq ) + end + + + unless IGNORE_SPECIES + species = species_re.match( seq.get_name )[ 1 ] else - unique_genes_msa.add_sequence( seqs.get_sequence( 0 ) ) + species = "XXXXX" end + unless all_msa_per_species.has_key?( species ) + all_msa_per_species[ species ] = Msa.new + end + all_msa_per_species[ species ].add_sequence( seq ) + end + + outfile.close + w = FastaWriter.new - w.write(unique_genes_msa, "seqs_from_unique_genes.fasta") - w.write(longest_non_unique_genes_msa, "longest_seqs_from_nonunique_genes.fasta") + w.write( unique_genes_msa, outbase + "_seqs_from_unique_genes.fasta" ) + w.write( longest_non_unique_genes_msa, outbase + "_longest_seqs_from_nonunique_genes.fasta" ) + + all_msa_per_species.each_pair do |s,m| + w = FastaWriter.new + w.write( m, outbase + "_" + s + ".fasta" ) + end + end