2 # = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class
4 # Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: multi_sequence_extractor.rb,v 1.10 2010/12/13 19:00:11 cmzmasek Exp $
10 require 'lib/evo/util/constants'
11 require 'lib/evo/util/util'
12 require 'lib/evo/msa/msa'
13 require 'lib/evo/msa/msa_factory'
14 require 'lib/evo/io/msa_io'
15 require 'lib/evo/io/parser/fasta_parser'
16 require 'lib/evo/io/writer/fasta_writer'
17 require 'lib/evo/util/command_line_arguments'
23 class MultiSequenceExtractor
27 PRG_DESC = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
28 PRG_DATE = "2008.08.13"
29 COPYRIGHT = "2008-2009 Christian M Zmasek"
30 CONTACT = "phylosoft@gmail.com"
31 WWW = "www.phylosoft.org"
32 HELP_OPTION_1 = 'help'
35 LOG_SUFFIX = ".mse_log"
36 FASTA_SUFFIX = ".fasta"
37 FASTA_WITH_NORMALIZED_IDS_SUFFIX = ".ni.fasta"
38 NORMALIZED_IDS_MAP_SUFFIX = ".nim"
39 PROTEINS_LIST_FILE_SEPARATOR = "\t"
44 Util.print_program_information( PRG_NAME,
53 ld = Constants::LINE_DELIMITER
56 cla = CommandLineArguments.new( ARGV )
57 rescue ArgumentError => e
58 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
61 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
62 cla.is_option_set?( HELP_OPTION_2 ) )
67 if ( cla.get_number_of_files != 3 && cla.get_number_of_files != 4 )
72 allowed_opts = Array.new
74 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
75 if ( disallowed.length > 0 )
76 Util.fatal_error( PRG_NAME,
77 "unknown option(s): " + disallowed,
81 seq_names_files_suffix = cla.get_file_name( 0 )
82 input_dir = cla.get_file_name( 1 )
83 out_dir = cla.get_file_name( 2 )
86 if ( cla.get_number_of_files == 4 )
87 mapping_file = cla.get_file_name( 3 )
89 Util.check_file_for_readability( mapping_file )
90 rescue ArgumentError => e
91 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
95 if !File.exist?( input_dir )
96 Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
98 if !File.exist?( out_dir )
99 Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
101 if !File.directory?( input_dir )
102 Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
104 if !File.directory?( out_dir )
105 Util.fatal_error( PRG_NAME, "error: [#{out_dir}] is not a directory" )
111 log << "Program : " + PRG_NAME + ld
112 log << "Version : " + PRG_VERSION + ld
113 log << "Program date : " + PRG_DATE + ld
116 puts( "Sequence names files suffix: " + seq_names_files_suffix )
117 log << "Sequence names files suffix: " + seq_names_files_suffix + ld
118 puts( "Input dir : " + input_dir )
119 log << "Input dir : " + input_dir + ld
120 puts( "Output dir : " + out_dir )
121 log << "Output dir : " + out_dir + ld
122 if ( mapping_file != nil )
123 puts( "Mapping file : " + mapping_file )
124 log << "Mapping file : " + mapping_file + ld
126 log << "Date : " + Time.now.to_s + ld
129 if ( mapping_file != nil )
130 species_codes_to_paths = extract_mappings( mapping_file )
133 input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
136 species_to_genomes = Hash.new()
138 input_files.each { |input_file|
142 puts counter.to_s + "/" + input_files.size.to_s
143 read_seq_family_file( input_file,
144 seq_names_files_suffix,
146 species_codes_to_paths,
153 Util.print_message( PRG_NAME, "OK" )
159 def read_seq_family_file( input_file,
160 seq_names_files_suffix,
162 species_codes_to_paths,
169 Util.check_file_for_readability( input_file )
170 rescue ArgumentError => e
171 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
174 basename = File.basename( input_file, seq_names_files_suffix )
175 out_file_path_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
176 out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
177 out_file_path_ids_map = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX
179 Util.check_file_for_writability( out_file_path_fasta_file )
180 Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
181 Util.check_file_for_writability( out_file_path_ids_map )
182 rescue ArgumentError => e
183 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
188 ids_map_writer = File.open( out_file_path_ids_map, 'a' )
189 rescue Exception => e
190 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
196 new_msa_normalized_ids = Msa.new
197 per_species_counter = 0
201 File.open( input_file ) do | file |
202 while line = file.gets
203 if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
204 values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
205 if ( values.length < 2 )
206 Util.fatal_error( PRG_NAME, "unexpected format: " + line )
208 species = values[ 0 ]
209 if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA"
212 seq_name = values[ 1 ]
213 if ( species != current_species )
214 current_species = species
215 my_file = input_dir + Constants::FILE_SEPARATOR + current_species
217 if ( !File.exist?( my_file ) )
218 if species_codes_to_paths == nil
219 Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
220 elsif ( !species_codes_to_paths.has_key?( current_species ) )
221 Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
223 my_file = species_codes_to_paths[ current_species ]
225 my_path = File.expand_path( my_file )
226 my_readlink = my_path
227 if ( File.symlink?( my_path ) )
228 my_readlink = File.readlink( my_path )
231 if ( CACHE_GENOMES && species_to_genomes.has_key?( species ) )
232 current_msa = species_to_genomes[ species ]
234 current_msa = read_fasta_file( my_file )
236 species_to_genomes[ species ] = current_msa
240 if ( per_species_counter > 0 )
241 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
242 per_species_counter = 0
244 puts " " + current_species + " [" + my_readlink + "]"
245 log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER
248 log << " " + seq_name + Constants::LINE_DELIMITER
249 per_species_counter = per_species_counter + 1
252 if current_msa.find_by_name_start( seq_name, true ).size > 0
254 seq = current_msa.get_by_name_start( seq_name, true ).copy
255 rescue ArgumentError => e
256 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
259 # Not found, try finding by partial match.
261 seq = current_msa.get_by_name( seq_name, true, true )
262 rescue ArgumentError => e
263 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
267 normalized_id = per_species_counter.to_s( 16 ).upcase +
268 "_" + current_species
270 per_species_counter.to_i
272 ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER )
275 seq.set_name( seq.get_name + " [" + current_species + "]" )
276 new_msa.add_sequence( seq )
278 Util.fatal_error( PRG_NAME, "unexected error: seq is nil" )
281 new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
290 if ( per_species_counter > 0 )
291 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
296 fasta_writer = FastaWriter.new()
297 fasta_writer.remove_gap_chars
301 io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
302 rescue Exception => e
303 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
307 io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
308 rescue Exception => e
309 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
313 f = File.open( out_dir + Constants::FILE_SEPARATOR + basename + LOG_SUFFIX , 'a' )
316 rescue Exception => e
317 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
322 def obtain_inputfiles( input_dir, seq_names_files_suffix )
323 input_files = Array.new()
324 Dir.foreach( input_dir ) { |file_name|
325 if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
326 input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
332 def extract_mappings( mapping_file )
333 species_code_to_path = Hash.new()
334 File.open( mapping_file ) do | file |
335 while line = file.gets
336 if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
337 if ( line =~ /(\S+)\s+(\S+)/ )
340 if ( species_code_to_path.has_key?( species ) )
341 Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
343 if ( species_code_to_path.has_value?( path ) )
344 Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
346 if ( !File.exist?( path ) )
347 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
349 if ( !File.file?( path ) )
350 Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
352 if ( !File.readable?( path ) )
353 Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
355 if ( File.size( path ) < 10000 )
356 Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
358 if ( !Util.looks_like_fasta?( path ) )
359 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
361 species_code_to_path[ species ] = path
362 puts species + " -> " + path
370 def print_counts( per_species_counter, log, ld )
371 puts " [sum: " + per_species_counter.to_s + "]"
372 log << " [sum: " + per_species_counter.to_s + "]" + ld
375 def read_fasta_file( input )
379 msa = f.create_msa_from_file( input, FastaParser.new() )
380 rescue Exception => e
381 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
389 puts( " " + PRG_NAME + ".rb <sequence names file suffix> <input dir containing sequence names files " +
390 "and possibly genome multiple-sequence ('fasta') files> <output directory> [mapping file for " +
391 "genome multiple-sequence ('fasta') files not in input dir]" )
393 puts( " " + "Example: \"mse.rb .prot . seqs ../genome_locations.txt\"" )
397 end # class MultiSequenceExtractor