2 # = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class
4 # Copyright:: Copyright (C) 2014 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
8 require 'lib/evo/util/constants'
9 require 'lib/evo/util/util'
10 require 'lib/evo/msa/msa'
11 require 'lib/evo/msa/msa_factory'
12 require 'lib/evo/io/msa_io'
13 require 'lib/evo/io/parser/fasta_parser'
14 require 'lib/evo/io/writer/fasta_writer'
15 require 'lib/evo/util/command_line_arguments'
18 class MultiSequenceExtractor
22 PRG_DESC = "processing of \"surfacing\" output: extraction of sequences by name from multiple multi-sequence ('fasta') files"
24 WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
25 HELP_OPTION_1 = 'help'
29 EXTRACT_LINKERS_OPTION = 'l'
30 LOG_SUFFIX = ".mse_log"
31 LINKERS_SUFFIX = ".linkers"
32 FASTA_SUFFIX = ".fasta"
33 FASTA_WITH_NORMALIZED_IDS_SUFFIX = ".ni.fasta"
34 NORMALIZED_IDS_MAP_SUFFIX = ".nim"
35 PROTEINS_LIST_FILE_SEPARATOR = "\t"
37 @file_to_msa = Hash.new
43 Util.print_program_information( PRG_NAME,
50 ld = Constants::LINE_DELIMITER
53 cla = CommandLineArguments.new( ARGV )
54 rescue ArgumentError => e
55 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
58 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
59 cla.is_option_set?( HELP_OPTION_2 ) )
64 if ( cla.get_number_of_files != 5 )
69 allowed_opts = Array.new
70 allowed_opts.push(EXT_OPTION)
71 allowed_opts.push(EXTRACT_LINKERS_OPTION)
73 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
74 if ( disallowed.length > 0 )
75 Util.fatal_error( PRG_NAME,
76 "unknown option(s): " + disallowed,
80 seq_names_files_suffix = cla.get_file_name( 0 )
81 input_dir = cla.get_file_name( 1 )
82 out_dir = cla.get_file_name( 2 )
83 out_dir_doms = cla.get_file_name( 3 )
84 mapping_file = cla.get_file_name( 4 )
87 Util.check_file_for_readability( mapping_file )
89 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
93 if cla.is_option_set?(EXT_OPTION)
94 extension = cla.get_option_value_as_int(EXT_OPTION)
100 extract_linkers = false
101 if cla.is_option_set?(EXTRACT_LINKERS_OPTION)
102 extract_linkers = true
105 unless File.exist?( out_dir )
108 unless File.exist?( out_dir_doms )
109 Dir.mkdir( out_dir_doms )
112 unless File.exist?( input_dir )
113 Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
115 unless File.exist?( out_dir )
116 Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
118 unless File.exist?( out_dir_doms )
119 Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" )
121 unless File.directory?( input_dir )
122 Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
124 unless File.directory?( out_dir )
125 Util.fatal_error( PRG_NAME, "error: [#{out_dir}] is not a directory" )
127 unless File.directory?( out_dir_doms )
128 Util.fatal_error( PRG_NAME, "error: [#{out_dir_doms}] is not a directory" )
133 log << "Program : " + PRG_NAME + ld
134 log << "Version : " + PRG_VERSION + ld
135 log << "Program date : " + PRG_DATE + ld
138 puts( "Sequence names files suffix: " + seq_names_files_suffix )
139 log << "Sequence names files suffix: " + seq_names_files_suffix + ld
140 puts( "Input dir : " + input_dir )
141 log << "Input dir : " + input_dir + ld
142 puts( "Output dir : " + out_dir )
143 log << "Output dir : " + out_dir + ld
144 puts( "Output dir domains : " + out_dir_doms )
145 log << "Output dir domains : " + out_dir_doms + ld
146 puts( "Mapping file : " + mapping_file )
147 log << "Mapping file : " + mapping_file + ld
149 puts( "Extension : " + extension.to_s )
150 log << "Extension : " + extension.to_s + ld
153 puts( "Extract linkers : true" )
154 log << "Extract linkers : true" + ld
156 log << "Date : " + Time.now.to_s + ld
159 species_codes_to_paths = extract_mappings( mapping_file )
161 input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
165 input_files.each { |input_file|
169 puts counter.to_s + "/" + input_files.size.to_s
170 read_seq_family_file( input_file,
171 seq_names_files_suffix,
173 species_codes_to_paths,
182 Util.print_message( PRG_NAME, "OK" )
187 def read_seq_family_file( input_file,
188 seq_names_files_suffix,
190 species_codes_to_paths,
199 Util.check_file_for_readability( input_file )
200 rescue ArgumentError => e
201 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
203 basename = File.basename( input_file, seq_names_files_suffix )
204 out_file_path_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
205 out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
206 out_file_path_ids_map = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX
207 doms_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
208 doms_ext_out_file_path_fasta_file = nil
210 doms_ext_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_ext_" + extension.to_s + FASTA_SUFFIX
213 Util.check_file_for_writability( out_file_path_fasta_file )
214 Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
215 Util.check_file_for_writability( out_file_path_ids_map )
216 Util.check_file_for_writability( doms_out_file_path_fasta_file )
217 rescue ArgumentError => e
218 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
223 ids_map_writer = File.open( out_file_path_ids_map, 'a' )
224 rescue Exception => e
225 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
231 new_msa_normalized_ids = Msa.new
232 new_msa_domains = Msa.new
233 new_msa_domains_extended = Msa.new
234 per_species_counter = 0
239 File.open( input_file ) do | file |
241 while line = file.gets
243 if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
244 values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
246 if ( values.length < 2 )
247 Util.fatal_error( PRG_NAME, "unexpected format: " + line )
249 species = values[ 0 ]
251 seq_name = values[ 1 ]
253 if ( values.length > 3 )
254 domain_ranges_block = values[ 3 ]
255 domain_ranges = domain_ranges_block.split( "/" )
257 if ( species != current_species )
258 current_species = species
259 my_file = input_dir + Constants::FILE_SEPARATOR + current_species
261 if ( !File.exist?( my_file ) )
262 if species_codes_to_paths == nil
263 Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
264 elsif ( !species_codes_to_paths.has_key?( current_species ) )
265 Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
267 my_file = species_codes_to_paths[ current_species ]
269 my_path = File.expand_path( my_file )
270 my_readlink = my_path
271 if ( File.symlink?( my_path ) )
272 my_readlink = File.readlink( my_path )
274 current_msa = read_fasta_file( my_file )
276 if ( per_species_counter > 0 )
277 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
278 per_species_counter = 0
280 puts " " + species_counter.to_s + ":" + current_species + " [" + my_readlink + "]"
281 log << species_counter.to_s << ": " << current_species << " [" + my_readlink + "]" << Constants::LINE_DELIMITER
284 log << " " << seq_name << Constants::LINE_DELIMITER
285 per_species_counter = per_species_counter + 1
288 indices = current_msa.find_by_name_start( seq_name, true )
290 seq = current_msa.get_sequence( indices[ 0 ] )
291 elsif indices.size == 0
292 # Not found, try finding by partial match.
294 seq = current_msa.get_by_name( seq_name, true, true )
295 rescue ArgumentError => e
296 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
299 Util.fatal_error( PRG_NAME, "error: seq name \"" + seq_name + "\" not unique" )
302 normalized_id = per_species_counter.to_s( 16 ).upcase +
303 "_" + current_species
305 per_species_counter.to_i
307 ids_map_writer.write( normalized_id + "\t" + seq.get_name + Constants::LINE_DELIMITER )
311 orig_name = seq.get_name
312 seq.set_name( seq.get_name + " [" + current_species + "]" )
313 new_msa.add_sequence( seq )
315 Util.fatal_error( PRG_NAME, "unexpected error: seq is nil" )
318 if domain_ranges != nil
322 domain_ranges.each { |range|
323 if range != nil && range.length > 0
327 new_msa_domains.add_sequence( Sequence.new( orig_name +
330 "] [" + basename + "] [" +
331 current_species + "]",
332 seq.get_sequence_as_string[from..to] ) )
334 from_e = from - extension
338 to_e = to + extension
339 if to_e > seq.get_sequence_as_string.length - 1
340 to_e = seq.get_sequence_as_string.length - 1
342 new_msa_domains_extended.add_sequence( Sequence.new( orig_name +
347 "] [" + basename + "] [" +
348 current_species + "]",
349 seq.get_sequence_as_string[ from_e..to_e ] ) )
359 mod_line = line + "\t[" + get_linker_sequence( f, t, seq ) + "|"
361 mod_line += get_linker_sequence( prev_to + 1, from - 1, seq ) + "|"
365 end # range != nil && range.length > 0
367 if extract_linkers && prev_to > 0
369 t = seq.get_sequence_as_string.length - 1
371 t = prev_to + extension
373 mod_line += get_linker_sequence( f, t, seq ) + "]"
377 new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
379 linkers << mod_line + Constants::LINE_DELIMITER
381 end # !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
382 end # while line = file.gets
388 if ( per_species_counter > 0 )
389 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
394 fasta_writer = FastaWriter.new()
395 fasta_writer.remove_gap_chars
399 io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
400 rescue Exception => e
401 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
404 if new_msa_domains != nil
406 io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer )
407 rescue Exception => e
408 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
412 if extension > 0 && new_msa_domains_extended != nil
414 io.write_to_file( new_msa_domains_extended, doms_ext_out_file_path_fasta_file, fasta_writer )
415 rescue Exception => e
416 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
421 io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
422 rescue Exception => e
423 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
427 f = File.open( out_dir + Constants::FILE_SEPARATOR + basename + LOG_SUFFIX , 'a' )
430 rescue Exception => e
431 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
434 if extract_linkers && linkers != nil
436 f = File.open( out_dir + Constants::FILE_SEPARATOR + basename + LINKERS_SUFFIX , 'a' )
439 rescue Exception => e
440 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
445 def get_linker_sequence( from, to, seq )
449 if to > seq.get_sequence_as_string.length - 1
450 to = seq.get_sequence_as_string.length - 1
455 return from.to_s + "-" + to.to_s + ":" + seq.get_sequence_as_string[ from..to ]
459 def obtain_inputfiles( input_dir, seq_names_files_suffix )
460 input_files = Array.new()
461 Dir.foreach( input_dir ) { |file_name|
462 if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
463 input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
469 def extract_mappings( mapping_file )
470 species_code_to_path = Hash.new()
471 File.open( mapping_file ) do | file |
472 while line = file.gets
473 if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
474 if ( line =~ /(\S+)\s+(\S+)/ )
477 if ( species_code_to_path.has_key?( species ) )
478 Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
480 if ( species_code_to_path.has_value?( path ) )
481 Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
483 unless ( File.exist?( path ) )
484 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
486 unless ( File.file?( path ) )
487 Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
489 unless ( File.readable?( path ) )
490 Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
492 if ( File.size( path ) < 1000 )
493 Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
495 unless ( Util.looks_like_fasta?( path ) )
496 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
498 species_code_to_path[ species ] = path
499 puts species + " -> " + path
507 def print_counts( per_species_counter, log, ld )
508 puts " sum: " + per_species_counter.to_s
509 log << " sum: " + per_species_counter.to_s + ld
512 def read_fasta_file( input )
513 if @file_to_msa.has_key?( input )
514 return @file_to_msa[ input ]
520 msa = f.create_msa_from_file( input, FastaParser.new() )
521 rescue Exception => e
522 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
525 @file_to_msa[ input ] = msa
526 @seqs += msa.get_number_of_seqs
527 puts " total seqs in memory: " + @seqs.to_s
535 puts( " " + PRG_NAME + ".rb <sequence id ('prot') files suffix> <dir containing sequence id ('prot') files>" +
536 " <output directory for protein sequences> <output directory for domain sequences> <genome locations file>" )
538 puts( " option: -" + EXT_OPTION + "=<int>: to extend extracted domains" )
539 puts( " -" + EXTRACT_LINKERS_OPTION + " : to extract linkers" )
541 puts( " " + "Examples: mse.rb .prot . protein_seqs domain_seqs ../genome_locations.txt" )
542 puts( " " + " mse.rb .prot . FL_seqs DA_seqs ../../genome_locations.txt" )
546 end # class MultiSequenceExtractor