1 #!/usr/local/bin/ruby -w
4 require 'lib/evo/sequence/sequence'
5 require 'lib/evo/msa/msa'
6 require 'lib/evo/msa/msa_factory'
7 require 'lib/evo/io/writer/fasta_writer'
8 require 'lib/evo/io/parser/fasta_parser'
16 IGNORE_SEQS_LACKING_GN = false
21 msa = f.create_msa_from_file( input, FastaParser.new() )
23 puts "error: " + e.to_s
28 all_seqs_per_species = Hash.new
29 all_msa_per_species = Hash.new
31 unique_genes_msa = Msa.new
32 longest_non_unique_genes_msa = Msa.new
34 fragment_re = /fragment/i
35 species_re = /\[([A-Z0-9]{3,6})\]$/
41 for i in 0 ... msa.get_number_of_seqs()
42 seq = msa.get_sequence( i )
44 if all_names.include?( name )
45 puts "error: sequence name \"" + name + "\" is not unique (#" + i.to_s + ")"
51 if fragment_re.match( name )
52 puts "ignored because fragment: " + name
58 if species_re.match( name )
59 s_match = species_re.match( name )
62 unless all_seqs_per_species.has_key?( species )
63 all_seqs_per_species[ species ] = Set.new
65 all_seqs = all_seqs_per_species[ species ]
66 mol_seq = seq.get_sequence_as_string.upcase
67 if all_seqs.include?( mol_seq )
68 puts "ignored because identical sequence in same species: " + name
75 puts "error: no species for: " + name
79 gn_match = gn_re.match( name )
80 if IGNORE_SEQS_LACKING_GN
82 puts "ignored because no GN=: " + name
88 puts "no GN=: " + name
94 gn = gn_match[1] + "_" + species
96 if IGNORE_SEQS_LACKING_GN
103 unless gn_to_seqs.has_key?(gn)
104 gn_to_seqs[gn] = Msa.new
106 gn_to_seqs[gn].add_sequence(seq)
109 puts "Sequences ignored because \"fragment\" in desc : " + frag_counter.to_s
110 if IGNORE_SEQS_LACKING_GN
111 puts "Sequences ignored because no \"GN=\" in desc : " + no_gn_counter.to_s
113 puts "Sequences ignored because identical sequence in same species: " + same_seq_counter.to_s
118 gn_to_seqs.each_pair do |gene,seqs|
120 if seqs.get_number_of_seqs > 1
121 puts counter.to_s + ": " + gene
128 for j in 0 ... seqs.get_number_of_seqs()
129 current = seqs.get_sequence( j )
130 if current.get_length > longest
131 longest = current.get_length
132 longest_seq = current
136 longest_non_unique_genes_msa.add_sequence( seq )
138 seq = seqs.get_sequence( 0 )
139 unique_genes_msa.add_sequence( seq )
142 species = species_re.match( seq.get_name )[ 1 ]
143 unless all_msa_per_species.has_key?(species)
144 all_msa_per_species[species] = Msa.new
146 all_msa_per_species[species].add_sequence(seq)
151 w.write(unique_genes_msa, "seqs_from_unique_genes.fasta")
152 w.write(longest_non_unique_genes_msa, "longest_seqs_from_nonunique_genes.fasta")
154 all_msa_per_species.each_pair do |s,m|
156 w.write(m, s +".fasta")