2 # = lib/evo/apps/taxonomy_processor - TaxonomyProcessor class
4 # Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: taxonomy_processor.rb,v 1.26 2010/12/13 19:00:11 cmzmasek Exp $
10 require 'lib/evo/util/util'
11 require 'lib/evo/msa/msa_factory'
12 require 'lib/evo/msa/msa'
13 require 'lib/evo/io/msa_io'
14 require 'lib/evo/io/parser/fasta_parser'
15 require 'lib/evo/io/parser/general_msa_parser'
16 require 'lib/evo/io/writer/fasta_writer'
17 require 'lib/evo/io/writer/phylip_sequential_writer'
18 require 'lib/evo/util/command_line_arguments'
22 class TaxonomyProcessor
25 PRG_DATE = "2012.09.27"
26 PRG_DESC = "replacement of species names in multiple sequence files"
28 COPYRIGHT = "2012 Christian M Zmasek"
29 CONTACT = "phylosoft@gmail.com"
30 WWW = "www.phylosoft.org"
34 EXTRACT_TAXONOMY_OPTION = "t"
37 @taxonomies = Hash.new()
42 Util.print_program_information( PRG_NAME,
51 if ( ARGV == nil || ( ARGV.length != 1 && ARGV.length != 3 && ARGV.length != 4 && ARGV.length != 5 && ARGV.length != 6 ) )
52 puts( "Usage: #{PRG_NAME}.rb [options] [input map file] <input sequences> [output sequences] [output id list]" )
54 puts( " options: -" + EXTRACT_TAXONOMY_OPTION + ": to extract taxonomy information from bracketed expression" )
60 cla = CommandLineArguments.new( ARGV )
61 rescue ArgumentError => e
62 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
73 if cla.get_number_of_files == 4
74 mapfile = cla.get_file_name( 0 )
75 input = cla.get_file_name( 1 )
76 output = cla.get_file_name( 2 )
77 list_file = cla.get_file_name( 3 )
78 elsif cla.get_number_of_files == 3
79 input = cla.get_file_name( 0 )
80 output = cla.get_file_name( 1 )
81 list_file = cla.get_file_name( 2 )
82 elsif cla.get_number_of_files == 1
83 input = cla.get_file_name( 0 )
85 if input.downcase.end_with?( ".fasta" )
86 i = input[ 0 .. input.length - 7 ]
87 elsif input.downcase.end_with?( ".fsa" )
88 i = input[ 0 .. input.length - 5 ]
92 output = i + "_ni.fasta"
93 list_file = i + ".nim"
97 allowed_opts = Array.new
98 allowed_opts.push( EXTRACT_TAXONOMY_OPTION )
100 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
101 if ( disallowed.length > 0 )
102 Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
105 extract_taxonomy = false
106 if ( cla.is_option_set?( EXTRACT_TAXONOMY_OPTION ) )
107 extract_taxonomy = true
110 if ( File.exists?( output ) )
111 Util.fatal_error( PRG_NAME, "outfile [" + output + "] already exists" )
113 if ( File.exists?( list_file ) )
114 Util.fatal_error( PRG_NAME, "list file [" + list_file + "] already exists" )
116 if ( !File.exists?( input) )
117 Util.fatal_error( PRG_NAME, "infile [" + input + "] does not exist" )
119 if ( mapfile != nil && !File.exists?( mapfile ) )
120 Util.fatal_error( PRG_NAME, "mapfile [" + mapfile + "] does not exist" )
123 fasta_like = Util.looks_like_fasta?( input )
127 puts( "Map file : " + mapfile )
129 puts( "Input alignment : " + input )
130 puts( "Output alignment: " + output )
131 puts( "Name list : " + list_file )
133 puts( "Format : Fasta" )
135 puts( "Format : Phylip like" )
137 if ( extract_taxonomy )
138 puts( "Extract taxonomy: true" )
142 species_map = Hash.new
144 File.open( mapfile ) do | file |
145 while line = file.gets
146 if ( line =~/(.+)#(.+)/ || line =~/(.+)\s+(.+)/ )
147 species_map[ $1 ] = $2
148 Util.print_message( PRG_NAME, "mapping: " + $1 + ' => ' + $2 )
157 msa = f.create_msa_from_file( input, FastaParser.new() )
159 msa = f.create_msa_from_file( input, GeneralMsaParser.new() )
161 rescue Exception => e
162 Util.fatal_error( PRG_NAME, "failed to read file: " + e.to_s )
165 if ( msa == nil || msa.get_number_of_seqs() < 1 )
166 Util.fatal_error( PRG_NAME, "failed to read MSA" )
169 Util.check_file_for_writability( list_file )
170 rescue Exception => e
171 Util.fatal_error( PRG_NAME, "error: " + e.to_, STDOUT )
174 #removed = msa.remove_redundant_sequences!( true )
176 # Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
177 # removed.each { | seq_name |
180 # Util.print_message( PRG_NAME, "will process " + msa.get_number_of_seqs.to_s + " non redundant sequences" )
183 lf = File.open( list_file, "a" )
184 for i in 0 ... msa.get_number_of_seqs
185 seq = msa.get_sequence( i )
186 seq.set_name( Util::normalize_seq_name( modify_name( seq.get_name(), i, lf, species_map, extract_taxonomy ), 10 ) )
192 w = FastaWriter.new()
194 w = PhylipSequentialWriter.new()
196 w.set_max_name_length( 10 )
199 io.write_to_file( msa, output, w )
200 rescue Exception => e
201 Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
204 if ( @taxonomies.length > 0 )
205 Util.print_message( PRG_NAME, "number of unique taxonomies: " + @taxonomies.length.to_s )
207 Util.print_message( PRG_NAME, "wrote: " + list_file )
208 Util.print_message( PRG_NAME, "wrote: " + output )
209 Util.print_message( PRG_NAME, "OK" )
214 def modify_name( desc, counter, file, species_map, extract_taxonomy )
217 # if desc =~ /^>?\s*\S{1,10}_([0-9A-Z]{3,5})/
218 if desc =~ /^>?\s*\S{1,10}_([A-Z]{3,5})/
219 new_desc = counter.to_s( 16 ) + "_" + $1
221 new_desc = counter.to_s( 16 )
222 elsif extract_taxonomy
223 if ( desc.count( "[" ) != desc.count( "]" ) )
224 Util.fatal_error( PRG_NAME, "illegal bracket count in: " + desc )
227 species_map.each_key do | key |
228 if desc =~ /[\b|_]#{key}\b/ # Added boundaries to prevent e.g. RAT matching ARATH.
229 species = species_map[ key ]
230 new_desc = counter.to_s( 16 ) + "_" + species
235 if desc =~/.*\[(\S{3,}?)\]/
239 species.gsub!( /\s+/, " " )
240 species.gsub!( /-/, "" )
241 species.gsub!( /\)/, "" )
242 species.gsub!( /\(/, "" )
243 species.gsub!( /\'/, "" )
244 if species =~ /\S+\s\S+/ || species =~ /\S{3,5}/
245 if species =~ /(\S+)\s(\S+)/
246 code = $1[ 0..2 ] + $2[ 0..1 ]
247 elsif species =~ /\S{3,5}/
249 elsif species.count( " " ) > 2
250 species =~ /(\S+)\s+(\S+)\s+(\S+)$/
254 code = code[ 0 ] + third_last[ 0 ] + second_last[ 0 ] + last[ 0 ] + last[ last.size - 1 ]
255 elsif species.count( " " ) > 1
256 species =~ /(\S+)\s+(\S+)$/
259 code = code[ 0..1 ] + second_last[ 0 ] + last[ 0 ] + last[ last.size - 1 ]
261 new_desc = counter.to_s( 16 ) + "_" + code
262 if @taxonomies.has_key?( code )
263 if ( !@taxonomies.has_value?( species ) )
264 Util.fatal_error( PRG_NAME, "code [#{code}] is not unique in [#{desc}]" )
267 if ( @taxonomies.has_value?( species ) )
268 Util.fatal_error( PRG_NAME, "genome [#{species}] is not unique in [#{desc}]" )
270 @taxonomies[ code ] = species
274 Util.fatal_error( PRG_NAME, "illegal format [#{species}] in: " + desc )
277 Util.fatal_error( PRG_NAME, "illegal format in: " + desc )
283 species_map.each_key do | key |
285 species = species_map[ key ]
286 species = species.gsub( /\s+/, "" )
287 species = species.gsub( /_/, " " )
289 if species =~ /(\S+)\s+(\S+)/
290 species = $1[0..2] + $2[0..1]
292 species = species.gsub( /\s+/, "" )
293 species = species.slice(0, 5)
299 Util.fatal_error( PRG_NAME, "species not found in: " + desc )
301 new_desc = counter.to_s( 16 ) + "_" + species
305 Util.fatal_error( PRG_NAME, "failed to extract species from: " + desc )
308 file.print( new_desc + ": " + desc + " [" + my_species + "]" + "\n" )
310 file.print( new_desc + ": " + desc + "\n" )
315 end # class TaxonomyProcessor