2 # = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class
4 # Copyright:: Copyright (C) 2014 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
9 require 'lib/evo/util/constants'
10 require 'lib/evo/util/util'
11 require 'lib/evo/msa/msa'
12 require 'lib/evo/msa/msa_factory'
13 require 'lib/evo/io/msa_io'
14 require 'lib/evo/io/parser/fasta_parser'
15 require 'lib/evo/io/writer/fasta_writer'
16 require 'lib/evo/util/command_line_arguments'
20 class MultiSequenceExtractor
24 PRG_DESC = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
26 COPYRIGHT = "2014 Christian M Zmasek"
27 CONTACT = "phyloxml@gmail.com"
28 WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
29 HELP_OPTION_1 = 'help'
33 EXTRACT_LINKERS_OPTION = 'l'
34 LOG_SUFFIX = ".mse_log"
35 LINKERS_SUFFIX = ".linkers"
36 FASTA_SUFFIX = ".fasta"
37 FASTA_WITH_NORMALIZED_IDS_SUFFIX = ".ni.fasta"
38 NORMALIZED_IDS_MAP_SUFFIX = ".nim"
39 PROTEINS_LIST_FILE_SEPARATOR = "\t"
42 @file_to_msa = Hash.new
48 Util.print_program_information( PRG_NAME,
57 ld = Constants::LINE_DELIMITER
60 cla = CommandLineArguments.new( ARGV )
61 rescue ArgumentError => e
62 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
65 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
66 cla.is_option_set?( HELP_OPTION_2 ) )
71 if ( cla.get_number_of_files != 5 )
76 allowed_opts = Array.new
77 allowed_opts.push(EXT_OPTION)
78 allowed_opts.push(EXTRACT_LINKERS_OPTION)
80 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
81 if ( disallowed.length > 0 )
82 Util.fatal_error( PRG_NAME,
83 "unknown option(s): " + disallowed,
87 seq_names_files_suffix = cla.get_file_name( 0 )
88 input_dir = cla.get_file_name( 1 )
89 out_dir = cla.get_file_name( 2 )
90 out_dir_doms = cla.get_file_name( 3 )
91 mapping_file = cla.get_file_name( 4 )
94 Util.check_file_for_readability( mapping_file )
95 rescue ArgumentError => e
96 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
100 if cla.is_option_set?(EXT_OPTION)
101 extension = cla.get_option_value_as_int(EXT_OPTION)
107 extract_linkers = false
108 if cla.is_option_set?(EXTRACT_LINKERS_OPTION)
109 extract_linkers = true
112 unless File.exist?( out_dir )
115 unless File.exist?( out_dir_doms )
116 Dir.mkdir( out_dir_doms )
119 unless File.exist?( input_dir )
120 Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
122 unless File.exist?( out_dir )
123 Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
125 unless File.exist?( out_dir_doms )
126 Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" )
128 unless File.directory?( input_dir )
129 Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
131 unless File.directory?( out_dir )
132 Util.fatal_error( PRG_NAME, "error: [#{out_dir}] is not a directory" )
134 unless File.directory?( out_dir_doms )
135 Util.fatal_error( PRG_NAME, "error: [#{out_dir_doms}] is not a directory" )
140 log << "Program : " + PRG_NAME + ld
141 log << "Version : " + PRG_VERSION + ld
142 log << "Program date : " + PRG_DATE + ld
145 puts( "Sequence names files suffix: " + seq_names_files_suffix )
146 log << "Sequence names files suffix: " + seq_names_files_suffix + ld
147 puts( "Input dir : " + input_dir )
148 log << "Input dir : " + input_dir + ld
149 puts( "Output dir : " + out_dir )
150 log << "Output dir : " + out_dir + ld
151 puts( "Output dir domains : " + out_dir_doms )
152 log << "Output dir domains : " + out_dir_doms + ld
153 puts( "Mapping file : " + mapping_file )
154 log << "Mapping file : " + mapping_file + ld
156 puts( "Extension : " + extension.to_s )
157 log << "Extension : " + extension.to_s + ld
160 puts( "Extract linkers : true" )
161 log << "Extract linkers : true" + ld
163 log << "Date : " + Time.now.to_s + ld
166 species_codes_to_paths = extract_mappings( mapping_file )
168 input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
172 input_files.each { |input_file|
176 puts counter.to_s + "/" + input_files.size.to_s
177 read_seq_family_file( input_file,
178 seq_names_files_suffix,
180 species_codes_to_paths,
189 Util.print_message( PRG_NAME, "OK" )
195 def read_seq_family_file( input_file,
196 seq_names_files_suffix,
198 species_codes_to_paths,
207 Util.check_file_for_readability( input_file )
208 rescue ArgumentError => e
209 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
211 basename = File.basename( input_file, seq_names_files_suffix )
212 out_file_path_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
213 out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
214 out_file_path_ids_map = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX
215 doms_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains" + FASTA_SUFFIX
216 doms_ext_out_file_path_fasta_file = nil
218 doms_ext_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains_ext_" + extension.to_s + FASTA_SUFFIX
221 Util.check_file_for_writability( out_file_path_fasta_file )
222 Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
223 Util.check_file_for_writability( out_file_path_ids_map )
224 Util.check_file_for_writability( doms_out_file_path_fasta_file )
225 rescue ArgumentError => e
226 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
231 ids_map_writer = File.open( out_file_path_ids_map, 'a' )
232 rescue Exception => e
233 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
239 new_msa_normalized_ids = Msa.new
240 new_msa_domains = Msa.new
241 new_msa_domains_extended = Msa.new
242 per_species_counter = 0
247 File.open( input_file ) do | file |
249 while line = file.gets
251 if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
252 values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
254 if ( values.length < 2 )
255 Util.fatal_error( PRG_NAME, "unexpected format: " + line )
257 species = values[ 0 ]
258 #if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA" || species == "EIMTE"
261 seq_name = values[ 1 ]
263 if ( values.length > 3 )
264 domain_ranges_block = values[ 3 ]
265 domain_ranges = domain_ranges_block.split( "/" )
267 if ( species != current_species )
268 current_species = species
269 my_file = input_dir + Constants::FILE_SEPARATOR + current_species
271 if ( !File.exist?( my_file ) )
272 if species_codes_to_paths == nil
273 Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
274 elsif ( !species_codes_to_paths.has_key?( current_species ) )
275 Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
277 my_file = species_codes_to_paths[ current_species ]
279 my_path = File.expand_path( my_file )
280 my_readlink = my_path
281 if ( File.symlink?( my_path ) )
282 my_readlink = File.readlink( my_path )
284 current_msa = read_fasta_file( my_file )
286 if ( per_species_counter > 0 )
287 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
288 per_species_counter = 0
290 puts " " + species_counter.to_s + ":" + current_species + " [" + my_readlink + "]"
291 log << species_counter.to_s << ": " << current_species << " [" + my_readlink + "]" << Constants::LINE_DELIMITER
294 log << " " << seq_name << Constants::LINE_DELIMITER
295 per_species_counter = per_species_counter + 1
298 indices = current_msa.find_by_name_start( seq_name, true )
300 seq = current_msa.get_sequence( indices[ 0 ] )
301 elsif indices.size == 0
302 # Not found, try finding by partial match.
304 seq = current_msa.get_by_name( seq_name, true, true )
305 rescue ArgumentError => e
306 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
309 Util.fatal_error( PRG_NAME, "error: seq name \"" + seq_name + "\" not unique" )
312 normalized_id = per_species_counter.to_s( 16 ).upcase +
313 "_" + current_species
315 per_species_counter.to_i
317 ids_map_writer.write( normalized_id + "\t" + seq.get_name + Constants::LINE_DELIMITER )
321 orig_name = seq.get_name
322 seq.set_name( seq.get_name + " [" + current_species + "]" )
323 new_msa.add_sequence( seq )
325 Util.fatal_error( PRG_NAME, "unexpected error: seq is nil" )
328 if domain_ranges != nil
332 domain_ranges.each { |range|
333 if range != nil && range.length > 0
337 new_msa_domains.add_sequence( Sequence.new( orig_name +
340 "] [" + basename + "] [" +
341 current_species + "]",
342 seq.get_sequence_as_string[from..to] ) )
344 from_e = from - extension
348 to_e = to + extension
349 if to_e > seq.get_sequence_as_string.length - 1
350 to_e = seq.get_sequence_as_string.length - 1
352 new_msa_domains_extended.add_sequence( Sequence.new( orig_name +
357 "] [" + basename + "] [" +
358 current_species + "]",
359 seq.get_sequence_as_string[ from_e..to_e ] ) )
369 mod_line = line + "\t[" + get_linker_sequence( f, t, seq ) + "|"
371 mod_line += get_linker_sequence( prev_to + 1, from - 1, seq ) + "|"
375 end # range != nil && range.length > 0
377 if extract_linkers && prev_to > 0
379 t = seq.get_sequence_as_string.length - 1
381 t = prev_to + extension
383 mod_line += get_linker_sequence( f, t, seq ) + "]"
387 new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
389 linkers << mod_line + Constants::LINE_DELIMITER
391 end # !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
392 end # while line = file.gets
398 if ( per_species_counter > 0 )
399 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
404 fasta_writer = FastaWriter.new()
405 fasta_writer.remove_gap_chars
409 io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
410 rescue Exception => e
411 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
414 if new_msa_domains != nil
416 io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer )
417 rescue Exception => e
418 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
422 if extension > 0 && new_msa_domains_extended != nil
424 io.write_to_file( new_msa_domains_extended, doms_ext_out_file_path_fasta_file, fasta_writer )
425 rescue Exception => e
426 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
431 io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
432 rescue Exception => e
433 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
437 f = File.open( out_dir + Constants::FILE_SEPARATOR + basename + LOG_SUFFIX , 'a' )
440 rescue Exception => e
441 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
444 if extract_linkers && linkers != nil
446 f = File.open( out_dir + Constants::FILE_SEPARATOR + basename + LINKERS_SUFFIX , 'a' )
449 rescue Exception => e
450 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
456 def get_linker_sequence( from, to, seq )
460 if to > seq.get_sequence_as_string.length - 1
461 to = seq.get_sequence_as_string.length - 1
466 return from.to_s + "-" + to.to_s + ":" + seq.get_sequence_as_string[ from..to ]
470 def obtain_inputfiles( input_dir, seq_names_files_suffix )
471 input_files = Array.new()
472 Dir.foreach( input_dir ) { |file_name|
473 if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
474 input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
480 def extract_mappings( mapping_file )
481 species_code_to_path = Hash.new()
482 File.open( mapping_file ) do | file |
483 while line = file.gets
484 if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
485 if ( line =~ /(\S+)\s+(\S+)/ )
488 if ( species_code_to_path.has_key?( species ) )
489 Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
491 if ( species_code_to_path.has_value?( path ) )
492 Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
494 unless ( File.exist?( path ) )
495 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
497 unless ( File.file?( path ) )
498 Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
500 unless ( File.readable?( path ) )
501 Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
503 if ( File.size( path ) < 1000 )
504 Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
506 unless ( Util.looks_like_fasta?( path ) )
507 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
509 species_code_to_path[ species ] = path
510 puts species + " -> " + path
518 def print_counts( per_species_counter, log, ld )
519 puts " sum: " + per_species_counter.to_s
520 log << " sum: " + per_species_counter.to_s + ld
523 def read_fasta_file( input )
524 if @file_to_msa.has_key?( input )
525 return @file_to_msa[ input ]
531 msa = f.create_msa_from_file( input, FastaParser.new() )
532 rescue Exception => e
533 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
536 @file_to_msa[ input ] = msa
537 @seqs += msa.get_number_of_seqs
538 puts " total seqs in memory: " + @seqs.to_s
546 puts( " " + PRG_NAME + ".rb <sequence id ('prot') files suffix> <dir containing sequence id ('prot') files>" +
547 " <output directory for protein sequences> <output directory for domain sequences> <genome locations file>" )
549 puts( " option: -" + EXT_OPTION + "=<int>: to extend extracted domains" )
550 puts( " -" + EXTRACT_LINKERS_OPTION + " : to extract linkers" )
552 puts( " " + "Example: \"mse.rb .prot . protein_seqs domain_seqs ../genome_locations.txt\"" )
556 end # class MultiSequenceExtractor