1 #!/usr/local/bin/ruby -w
4 require 'lib/evo/sequence/sequence'
5 require 'lib/evo/msa/msa'
6 require 'lib/evo/msa/msa_factory'
7 require 'lib/evo/io/writer/fasta_writer'
8 require 'lib/evo/io/parser/fasta_parser'
14 puts "usage: select_same_gn.rb <fasta formatted multiple sequence file>"
21 IGNORE_SEQS_LACKING_GN = false
22 IGNORE_FRAGMENTS = false
28 msa = f.create_msa_from_file( input, FastaParser.new() )
30 puts "error: " + e.to_s
34 outbase = input.sub( /\..+/, "" )
36 outfile = File.open(outbase + "_log.txt" , "w")
39 all_seqs_per_species = Hash.new
40 all_msa_per_species = Hash.new
42 unique_genes_msa = Msa.new
43 longest_non_unique_genes_msa = Msa.new
45 fragment_re = /fragment/i
46 species_re = /\[([A-Z0-9]{3,5})\]$/
52 for i in 0 ... msa.get_number_of_seqs()
53 seq = msa.get_sequence( i )
55 if all_names.include?( name )
56 puts "error: sequence name \"" + name + "\" is not unique (#" + i.to_s + ")"
62 if IGNORE_FRAGMENTS && fragment_re.match( name )
63 outfile.puts("ignored because fragment: " + name)
69 if IGNORE_SPECIES || species_re.match( name )
71 species = species_re.match( name )[ 1 ]
76 unless all_seqs_per_species.has_key?( species )
77 all_seqs_per_species[ species ] = Set.new
79 all_seqs = all_seqs_per_species[ species ]
80 mol_seq = seq.get_sequence_as_string.upcase
81 if all_seqs.include?( mol_seq )
82 outfile.puts("ignored because identical sequence in same species: " + name )
90 puts "error: no species for: " + name
94 gn_match = gn_re.match( name )
95 if IGNORE_SEQS_LACKING_GN
97 outfile.puts( "ignored because no GN=: " + name )
103 outfile.puts( "no GN=: " + name )
109 gn = gn_match[1] + "_" + species
111 if IGNORE_SEQS_LACKING_GN
118 unless gn_to_seqs.has_key?(gn)
119 gn_to_seqs[gn] = Msa.new
121 gn_to_seqs[gn].add_sequence(seq)
125 outfile.puts( "Sequences ignored because \"fragment\" in desc : " + frag_counter.to_s )
128 if IGNORE_SEQS_LACKING_GN
129 outfile.puts( "Sequences ignored because no \"GN=\" in desc : " + no_gn_counter.to_s )
131 outfile.puts( "Sequences ignored because identical sequence in same species: " + same_seq_counter.to_s )
136 gn_to_seqs.each_pair do |gene,seqs|
138 if seqs.get_number_of_seqs > 1
139 outfile.puts( counter.to_s + ": " + gene )
140 outfile.puts( seqs.to_fasta )
146 for j in 0 ... seqs.get_number_of_seqs()
147 current = seqs.get_sequence( j )
148 if current.get_length > longest
149 longest = current.get_length
150 longest_seq = current
154 longest_non_unique_genes_msa.add_sequence( seq )
156 seq = seqs.get_sequence( 0 )
157 unique_genes_msa.add_sequence( seq )
161 unless IGNORE_SPECIES
162 species = species_re.match( seq.get_name )[ 1 ]
166 unless all_msa_per_species.has_key?( species )
167 all_msa_per_species[ species ] = Msa.new
169 all_msa_per_species[ species ].add_sequence( seq )
176 w.write( unique_genes_msa, outbase + "_seqs_from_unique_genes.fasta" )
177 w.write( longest_non_unique_genes_msa, outbase + "_longest_seqs_from_nonunique_genes.fasta" )
179 all_msa_per_species.each_pair do |s,m|
181 w.write( m, outbase + "_" + s + ".fasta" )