2 # = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class
4 # Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: multi_sequence_extractor.rb,v 1.10 2010/12/13 19:00:11 cmzmasek Exp $
10 require 'lib/evo/util/constants'
11 require 'lib/evo/util/util'
12 require 'lib/evo/msa/msa'
13 require 'lib/evo/msa/msa_factory'
14 require 'lib/evo/io/msa_io'
15 require 'lib/evo/io/parser/fasta_parser'
16 require 'lib/evo/io/writer/fasta_writer'
17 require 'lib/evo/util/command_line_arguments'
23 class MultiSequenceExtractor
27 PRG_DESC = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
28 PRG_DATE = "2012.07.19"
29 COPYRIGHT = "2008-2012 Christian M Zmasek"
30 CONTACT = "phylosoft@gmail.com"
31 WWW = "www.phylosoft.org"
32 HELP_OPTION_1 = 'help'
36 EXTRACT_LINKERS_OPTION = 'l'
37 LOG_SUFFIX = ".mse_log"
38 LINKERS_SUFFIX = ".linkers"
39 FASTA_SUFFIX = ".fasta"
40 FASTA_WITH_NORMALIZED_IDS_SUFFIX = ".ni.fasta"
41 NORMALIZED_IDS_MAP_SUFFIX = ".nim"
42 PROTEINS_LIST_FILE_SEPARATOR = "\t"
47 Util.print_program_information( PRG_NAME,
56 ld = Constants::LINE_DELIMITER
59 cla = CommandLineArguments.new( ARGV )
60 rescue ArgumentError => e
61 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
64 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
65 cla.is_option_set?( HELP_OPTION_2 ) )
70 if ( cla.get_number_of_files != 4 && cla.get_number_of_files != 5 )
75 allowed_opts = Array.new
76 allowed_opts.push(EXT_OPTION)
77 allowed_opts.push(EXTRACT_LINKERS_OPTION)
79 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
80 if ( disallowed.length > 0 )
81 Util.fatal_error( PRG_NAME,
82 "unknown option(s): " + disallowed,
86 seq_names_files_suffix = cla.get_file_name( 0 )
87 input_dir = cla.get_file_name( 1 )
88 out_dir = cla.get_file_name( 2 )
89 out_dir_doms = cla.get_file_name( 3 )
92 if ( cla.get_number_of_files == 5 )
93 mapping_file = cla.get_file_name( 4 )
95 Util.check_file_for_readability( mapping_file )
96 rescue ArgumentError => e
97 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
102 if cla.is_option_set?(EXT_OPTION)
103 extension = cla.get_option_value_as_int(EXT_OPTION)
109 extract_linkers = false
110 if cla.is_option_set?(EXTRACT_LINKERS_OPTION)
111 extract_linkers = true
114 if !File.exist?( input_dir )
115 Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
117 if !File.exist?( out_dir )
118 Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
120 if !File.exist?( out_dir_doms )
121 Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" )
123 if !File.directory?( input_dir )
124 Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
126 if !File.directory?( out_dir )
127 Util.fatal_error( PRG_NAME, "error: [#{out_dir}] is not a directory" )
129 if !File.directory?( out_dir_doms )
130 Util.fatal_error( PRG_NAME, "error: [#{out_dir_doms}] is not a directory" )
136 log << "Program : " + PRG_NAME + ld
137 log << "Version : " + PRG_VERSION + ld
138 log << "Program date : " + PRG_DATE + ld
141 puts( "Sequence names files suffix: " + seq_names_files_suffix )
142 log << "Sequence names files suffix: " + seq_names_files_suffix + ld
143 puts( "Input dir : " + input_dir )
144 log << "Input dir : " + input_dir + ld
145 puts( "Output dir : " + out_dir )
146 log << "Output dir : " + out_dir + ld
147 puts( "Output dir domains : " + out_dir_doms )
148 log << "Output dir domains : " + out_dir_doms + ld
149 if ( mapping_file != nil )
150 puts( "Mapping file : " + mapping_file )
151 log << "Mapping file : " + mapping_file + ld
154 puts( "Extension : " + extension.to_s )
155 log << "Extension : " + extension.to_s + ld
157 log << "Date : " + Time.now.to_s + ld
160 if ( mapping_file != nil )
161 species_codes_to_paths = extract_mappings( mapping_file )
164 input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
168 input_files.each { |input_file|
172 puts counter.to_s + "/" + input_files.size.to_s
173 read_seq_family_file( input_file,
174 seq_names_files_suffix,
176 species_codes_to_paths,
185 Util.print_message( PRG_NAME, "OK" )
191 def read_seq_family_file( input_file,
192 seq_names_files_suffix,
194 species_codes_to_paths,
203 Util.check_file_for_readability( input_file )
204 rescue ArgumentError => e
205 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
207 basename = File.basename( input_file, seq_names_files_suffix )
208 out_file_path_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
209 out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
210 out_file_path_ids_map = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX
211 doms_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains" + FASTA_SUFFIX
212 doms_ext_out_file_path_fasta_file = nil
214 doms_ext_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains_ext_" + extension.to_s + FASTA_SUFFIX
217 Util.check_file_for_writability( out_file_path_fasta_file )
218 Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
219 Util.check_file_for_writability( out_file_path_ids_map )
220 Util.check_file_for_writability( doms_out_file_path_fasta_file )
221 rescue ArgumentError => e
222 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
227 ids_map_writer = File.open( out_file_path_ids_map, 'a' )
228 rescue Exception => e
229 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
235 new_msa_normalized_ids = Msa.new
236 new_msa_domains = Msa.new
237 new_msa_domains_extended = Msa.new
238 per_species_counter = 0
243 File.open( input_file ) do | file |
244 while line = file.gets
246 if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
247 values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
249 if ( values.length < 2 )
250 Util.fatal_error( PRG_NAME, "unexpected format: " + line )
252 species = values[ 0 ]
253 if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA" || species == "EIMTE"
256 seq_name = values[ 1 ]
258 if ( values.length > 3 )
259 domain_ranges_block = values[ 3 ]
260 domain_ranges = domain_ranges_block.split( "/" )
262 if ( species != current_species )
263 current_species = species
264 my_file = input_dir + Constants::FILE_SEPARATOR + current_species
266 if ( !File.exist?( my_file ) )
267 if species_codes_to_paths == nil
268 Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
269 elsif ( !species_codes_to_paths.has_key?( current_species ) )
270 Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
272 my_file = species_codes_to_paths[ current_species ]
274 my_path = File.expand_path( my_file )
275 my_readlink = my_path
276 if ( File.symlink?( my_path ) )
277 my_readlink = File.readlink( my_path )
279 current_msa = read_fasta_file( my_file )
281 if ( per_species_counter > 0 )
282 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
283 per_species_counter = 0
285 puts " " + current_species + " [" + my_readlink + "]"
286 log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER
289 log << " " + seq_name + Constants::LINE_DELIMITER
290 per_species_counter = per_species_counter + 1
293 if current_msa.find_by_name_start( seq_name, true ).size > 0
295 seq = current_msa.get_by_name_start( seq_name, true ).copy
296 rescue ArgumentError => e
297 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
300 # Not found, try finding by partial match.
302 seq = current_msa.get_by_name( seq_name, true, true )
303 rescue ArgumentError => e
304 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
308 normalized_id = per_species_counter.to_s( 16 ).upcase +
309 "_" + current_species
311 per_species_counter.to_i
313 ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER )
317 orig_name = seq.get_name
318 seq.set_name( seq.get_name + " [" + current_species + "]" )
319 new_msa.add_sequence( seq )
321 Util.fatal_error( PRG_NAME, "unexected error: seq is nil" )
324 if domain_ranges != nil
328 domain_ranges.each { |range|
329 if range != nil && range.length > 0
333 new_msa_domains.add_sequence( Sequence.new( orig_name +
336 "] [" + basename + "] [" +
337 current_species + "]",
338 seq.get_sequence_as_string[from..to] ) )
340 from_e = from - extension
344 to_e = to + extension
345 if to_e > seq.get_sequence_as_string.length - 1
346 to_e = seq.get_sequence_as_string.length - 1
348 new_msa_domains_extended.add_sequence( Sequence.new( orig_name +
353 "] [" + basename + "] [" +
354 current_species + "]",
355 seq.get_sequence_as_string[ from_e..to_e ] ) )
365 mod_line = line + "\t[" + get_linker_sequence( f, t, seq ) + "|"
367 mod_line += get_linker_sequence( prev_to + 1, from - 1, seq ) + "|"
371 end # range != nil && range.length > 0
373 if extract_linkers && prev_to > 0
375 t = seq.get_sequence_as_string.length - 1
379 mod_line += get_linker_sequence( f, t, seq ) + "]"
383 new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
385 linkers << mod_line + Constants::LINE_DELIMITER
387 end # !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
388 end # while line = file.gets
394 if ( per_species_counter > 0 )
395 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
400 fasta_writer = FastaWriter.new()
401 fasta_writer.remove_gap_chars
405 io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
406 rescue Exception => e
407 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
410 if new_msa_domains != nil
412 io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer )
413 rescue Exception => e
414 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
418 if extension > 0 && new_msa_domains_extended != nil
420 io.write_to_file( new_msa_domains_extended, doms_ext_out_file_path_fasta_file, fasta_writer )
421 rescue Exception => e
422 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
427 io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
428 rescue Exception => e
429 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
433 f = File.open( out_dir + Constants::FILE_SEPARATOR + basename + LOG_SUFFIX , 'a' )
436 rescue Exception => e
437 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
442 f = File.open( out_dir + Constants::FILE_SEPARATOR + basename + LINKERS_SUFFIX , 'a' )
445 rescue Exception => e
446 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
454 def get_linker_sequence( from, to, seq )
458 if to > seq.get_sequence_as_string.length - 1
459 to = seq.get_sequence_as_string.length - 1
464 return from.to_s + "-" + to.to_s + ":" + seq.get_sequence_as_string[ from..to ]
468 def obtain_inputfiles( input_dir, seq_names_files_suffix )
469 input_files = Array.new()
470 Dir.foreach( input_dir ) { |file_name|
471 if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
472 input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
478 def extract_mappings( mapping_file )
479 species_code_to_path = Hash.new()
480 File.open( mapping_file ) do | file |
481 while line = file.gets
482 if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
483 if ( line =~ /(\S+)\s+(\S+)/ )
486 if ( species_code_to_path.has_key?( species ) )
487 Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
489 if ( species_code_to_path.has_value?( path ) )
490 Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
492 if ( !File.exist?( path ) )
493 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
495 if ( !File.file?( path ) )
496 Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
498 if ( !File.readable?( path ) )
499 Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
501 if ( File.size( path ) < 10000 )
502 Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
504 if ( !Util.looks_like_fasta?( path ) )
505 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
507 species_code_to_path[ species ] = path
508 puts species + " -> " + path
516 def print_counts( per_species_counter, log, ld )
517 puts " [sum: " + per_species_counter.to_s + "]"
518 log << " [sum: " + per_species_counter.to_s + "]" + ld
521 def read_fasta_file( input )
525 msa = f.create_msa_from_file( input, FastaParser.new() )
526 rescue Exception => e
527 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
535 puts( " " + PRG_NAME + ".rb <sequence names file suffix> <input dir containing sequence names files " +
536 "and possibly genome multiple-sequence ('fasta') files> <output directory for sequences> <output directory for domains> [mapping file for " +
537 "genome multiple-sequence ('fasta') files not in input dir]" )
539 puts( " option: -" + EXT_OPTION + "=<int>: to extend extracted domains" )
540 puts( " -" + EXTRACT_LINKERS_OPTION + " : to extract linkers" )
542 puts( " " + "Example: \"mse.rb .prot . protein_seqs domain_seqs ../genome_locations.txt\"" )
546 end # class MultiSequenceExtractor