typo
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / multi_sequence_extractor.rb
1 #
2 # = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class
3 #
4 # Copyright::  Copyright (C) 2014 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7
8
9 require 'lib/evo/util/constants'
10 require 'lib/evo/util/util'
11 require 'lib/evo/msa/msa'
12 require 'lib/evo/msa/msa_factory'
13 require 'lib/evo/io/msa_io'
14 require 'lib/evo/io/parser/fasta_parser'
15 require 'lib/evo/io/writer/fasta_writer'
16 require 'lib/evo/util/command_line_arguments'
17
18 module Evoruby
19
20   class MultiSequenceExtractor
21
22     PRG_NAME                           = "mse"
23     PRG_VERSION                        = "1.04"
24     PRG_DESC                           = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
25     PRG_DATE                           = "140318"
26     COPYRIGHT                          = "2014 Christian M Zmasek"
27     CONTACT                            = "phyloxml@gmail.com"
28     WWW                                = "https://sites.google.com/site/cmzmasek/home/software/forester"
29     HELP_OPTION_1                      = 'help'
30     HELP_OPTION_2                      = 'h'
31
32     EXT_OPTION                          = 'e'
33     EXTRACT_LINKERS_OPTION              = 'l'
34     LOG_SUFFIX                          = ".mse_log"
35     LINKERS_SUFFIX                      = ".linkers"
36     FASTA_SUFFIX                        = ".fasta"
37     FASTA_WITH_NORMALIZED_IDS_SUFFIX    = ".ni.fasta"
38     NORMALIZED_IDS_MAP_SUFFIX           = ".nim"
39     PROTEINS_LIST_FILE_SEPARATOR        = "\t"
40
41     def initialize()
42       @file_to_msa = Hash.new
43       @seqs = 0
44     end
45
46     def run()
47
48       Util.print_program_information( PRG_NAME,
49         PRG_VERSION,
50         PRG_DESC ,
51         PRG_DATE,
52         COPYRIGHT,
53         CONTACT,
54         WWW,
55         STDOUT )
56
57       ld = Constants::LINE_DELIMITER
58
59       begin
60         cla = CommandLineArguments.new( ARGV )
61       rescue ArgumentError => e
62         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
63       end
64
65       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
66            cla.is_option_set?( HELP_OPTION_2 ) )
67         print_help
68         exit( 0 )
69       end
70
71       if ( cla.get_number_of_files != 5 )
72         print_help
73         exit( -1 )
74       end
75
76       allowed_opts = Array.new
77       allowed_opts.push(EXT_OPTION)
78       allowed_opts.push(EXTRACT_LINKERS_OPTION)
79
80       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
81       if ( disallowed.length > 0 )
82         Util.fatal_error( PRG_NAME,
83           "unknown option(s): " + disallowed,
84           STDOUT )
85       end
86
87       seq_names_files_suffix = cla.get_file_name( 0 )
88       input_dir              = cla.get_file_name( 1 )
89       out_dir                = cla.get_file_name( 2 )
90       out_dir_doms           = cla.get_file_name( 3 )
91       mapping_file           = cla.get_file_name( 4 )
92
93       begin
94         Util.check_file_for_readability( mapping_file )
95       rescue ArgumentError => e
96         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
97       end
98
99       extension = 0
100       if cla.is_option_set?(EXT_OPTION)
101         extension = cla.get_option_value_as_int(EXT_OPTION)
102         if extension < 0
103           extension = 0
104         end
105       end
106
107       extract_linkers = false
108       if cla.is_option_set?(EXTRACT_LINKERS_OPTION)
109         extract_linkers = true
110       end
111
112       unless File.exist?( out_dir )
113         Dir.mkdir( out_dir )
114       end
115       unless File.exist?( out_dir_doms )
116         Dir.mkdir( out_dir_doms )
117       end
118
119       unless File.exist?( input_dir )
120         Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
121       end
122       unless File.exist?( out_dir )
123         Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
124       end
125       unless File.exist?( out_dir_doms )
126         Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" )
127       end
128       unless File.directory?( input_dir )
129         Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
130       end
131       unless File.directory?( out_dir )
132         Util.fatal_error( PRG_NAME, "error:  [#{out_dir}] is not a directory" )
133       end
134       unless File.directory?( out_dir_doms )
135         Util.fatal_error( PRG_NAME, "error:  [#{out_dir_doms}] is not a directory" )
136       end
137
138       log = String.new
139
140       log << "Program            : " + PRG_NAME + ld
141       log << "Version            : " + PRG_VERSION + ld
142       log << "Program date       : " + PRG_DATE + ld
143
144       puts()
145       puts( "Sequence names files suffix: " + seq_names_files_suffix )
146       log << "Sequence names files suffix: " + seq_names_files_suffix + ld
147       puts( "Input dir                  : " + input_dir )
148       log << "Input dir                  : " + input_dir + ld
149       puts( "Output dir                 : " + out_dir )
150       log << "Output dir                 : " + out_dir + ld
151       puts( "Output dir domains         : " + out_dir_doms )
152       log << "Output dir domains         : " + out_dir_doms + ld
153       puts( "Mapping file               : " + mapping_file )
154       log << "Mapping file               : " + mapping_file + ld
155       if extension > 0
156         puts( "Extension                  : " + extension.to_s )
157         log << "Extension                  : " + extension.to_s + ld
158       end
159       if extract_linkers
160         puts( "Extract linkers            : true" )
161         log << "Extract linkers            : true" + ld
162       end
163       log << "Date                       : " + Time.now.to_s + ld
164       puts
165
166       species_codes_to_paths = extract_mappings( mapping_file )
167
168       input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
169
170       counter = 0
171
172       input_files.each { |input_file|
173         counter += 1
174         puts
175         puts
176         puts counter.to_s + "/" + input_files.size.to_s
177         read_seq_family_file( input_file,
178           seq_names_files_suffix,
179           input_dir,
180           species_codes_to_paths,
181           log,
182           out_dir,
183           out_dir_doms,
184           mapping_file,
185           extension,
186           extract_linkers )
187       }
188       puts
189       Util.print_message( PRG_NAME, "OK" )
190       puts
191
192     end
193
194
195     def read_seq_family_file( input_file,
196         seq_names_files_suffix,
197         input_dir,
198         species_codes_to_paths,
199         log,
200         out_dir,
201         out_dir_doms,
202         mapping_file,
203         extension,
204         extract_linkers )
205
206       begin
207         Util.check_file_for_readability( input_file )
208       rescue ArgumentError => e
209         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
210       end
211       basename = File.basename( input_file, seq_names_files_suffix )
212       out_file_path_fasta_file                = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
213       out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
214       out_file_path_ids_map                   = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX
215       doms_out_file_path_fasta_file           = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains" + FASTA_SUFFIX
216       doms_ext_out_file_path_fasta_file           = nil
217       if extension > 0
218         doms_ext_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains_ext_" + extension.to_s + FASTA_SUFFIX
219       end
220       begin
221         Util.check_file_for_writability( out_file_path_fasta_file )
222         Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
223         Util.check_file_for_writability( out_file_path_ids_map  )
224         Util.check_file_for_writability( doms_out_file_path_fasta_file  )
225       rescue ArgumentError => e
226         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
227       end
228
229       ids_map_writer = nil
230       begin
231         ids_map_writer = File.open( out_file_path_ids_map, 'a' )
232       rescue Exception => e
233         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
234       end
235
236       current_species         = ""
237       current_msa            = nil
238       new_msa                = Msa.new
239       new_msa_normalized_ids = Msa.new
240       new_msa_domains = Msa.new
241       new_msa_domains_extended = Msa.new
242       per_species_counter = 0
243       linkers = ""
244
245       puts basename
246
247       File.open( input_file ) do | file |
248         species_counter = 1
249         while line = file.gets
250           line.strip!
251           if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
252             values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
253             mod_line = nil
254             if ( values.length < 2 )
255               Util.fatal_error( PRG_NAME, "unexpected format: " + line )
256             end
257             species = values[ 0 ]
258             #if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA"  || species == "EIMTE"
259             #  next
260             #end
261             seq_name = values[ 1 ]
262             domain_ranges = nil
263             if ( values.length > 3 )
264               domain_ranges_block = values[ 3 ]
265               domain_ranges = domain_ranges_block.split( "/" )
266             end
267             if ( species != current_species )
268               current_species = species
269               my_file = input_dir + Constants::FILE_SEPARATOR + current_species
270
271               if ( !File.exist?( my_file ) )
272                 if species_codes_to_paths == nil
273                   Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
274                 elsif ( !species_codes_to_paths.has_key?( current_species ) )
275                   Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
276                 end
277                 my_file = species_codes_to_paths[ current_species ]
278               end
279               my_path = File.expand_path( my_file )
280               my_readlink = my_path
281               if ( File.symlink?( my_path ) )
282                 my_readlink = File.readlink( my_path )
283               end
284               current_msa = read_fasta_file( my_file )
285
286               if ( per_species_counter > 0 )
287                 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
288                 per_species_counter = 0
289               end
290               puts " " + species_counter.to_s +  ":" + current_species + " [" + my_readlink + "]"
291               log << species_counter.to_s <<  ": " << current_species << " [" + my_readlink + "]" << Constants::LINE_DELIMITER
292               species_counter += 1
293             end
294             log << "   " << seq_name << Constants::LINE_DELIMITER
295             per_species_counter = per_species_counter + 1
296             seq = nil
297
298             indices = current_msa.find_by_name_start( seq_name, true )
299             if indices.size == 1
300               seq = current_msa.get_sequence( indices[ 0 ] )
301             elsif indices.size == 0
302               # Not found, try finding by partial match.
303               begin
304                 seq = current_msa.get_by_name( seq_name, true, true )
305               rescue ArgumentError => e
306                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
307               end
308             else
309               Util.fatal_error( PRG_NAME, "error: seq name \"" + seq_name + "\" not unique"  )
310             end
311
312             normalized_id = per_species_counter.to_s( 16 ).upcase +
313              "_" + current_species
314
315             per_species_counter.to_i
316
317             ids_map_writer.write( normalized_id + "\t" + seq.get_name + Constants::LINE_DELIMITER )
318
319             orig_name = nil
320             if seq != nil
321               orig_name = seq.get_name
322               seq.set_name( seq.get_name + " [" + current_species + "]" )
323               new_msa.add_sequence( seq )
324             else
325               Util.fatal_error( PRG_NAME, "unexpected error: seq is nil" )
326             end
327
328             if domain_ranges != nil
329               first = true
330               prev_to = -1
331
332               domain_ranges.each { |range|
333                 if range != nil && range.length > 0
334                   s = range.split("-")
335                   from = s[ 0 ].to_i
336                   to = s[ 1 ].to_i
337                   new_msa_domains.add_sequence( Sequence.new( orig_name +
338                        " [" + from.to_s +
339                        "-" + to.to_s +
340                        "] [" + basename + "] [" +
341                        current_species + "]",
342                       seq.get_sequence_as_string[from..to] ) )
343                   if extension > 0
344                     from_e = from - extension
345                     if from_e < 0
346                       from_e = 0
347                     end
348                     to_e = to + extension
349                     if to_e > seq.get_sequence_as_string.length - 1
350                       to_e = seq.get_sequence_as_string.length - 1
351                     end
352                     new_msa_domains_extended.add_sequence( Sequence.new( orig_name +
353                          " [" + from.to_s +
354                          "-" + to.to_s  +
355                          "] [extended by " +
356                          extension.to_s +
357                          "] [" + basename + "] [" +
358                          current_species + "]",
359                         seq.get_sequence_as_string[ from_e..to_e ] ) )
360                   end # extension > 0
361                   if  extract_linkers
362                     if first
363                       first = false
364                       f = 0
365                       t = from - 1
366                       if extension > 0
367                         f = from - extension
368                       end
369                       mod_line = line + "\t[" + get_linker_sequence( f, t, seq ) + "|"
370                     else
371                       mod_line += get_linker_sequence( prev_to + 1, from - 1, seq ) + "|"
372                     end
373                     prev_to = to
374                   end
375                 end # range != nil && range.length > 0
376               }
377               if extract_linkers && prev_to > 0
378                 f = prev_to + 1
379                 t = seq.get_sequence_as_string.length - 1
380                 if extension > 0
381                   t = prev_to + extension
382                 end
383                 mod_line += get_linker_sequence( f, t, seq ) + "]"
384               end
385             end
386
387             new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
388             if mod_line
389               linkers << mod_line + Constants::LINE_DELIMITER
390             end
391           end # !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
392         end # while line = file.gets
393
394       end
395
396       ids_map_writer.close
397
398       if ( per_species_counter > 0 )
399         print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
400       end
401
402       io = MsaIO.new()
403
404       fasta_writer = FastaWriter.new()
405       fasta_writer.remove_gap_chars
406       fasta_writer.clean
407
408       begin
409         io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
410       rescue Exception => e
411         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
412       end
413
414       if new_msa_domains != nil
415         begin
416           io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer )
417         rescue Exception => e
418           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
419         end
420       end
421
422       if extension > 0 && new_msa_domains_extended != nil
423         begin
424           io.write_to_file( new_msa_domains_extended, doms_ext_out_file_path_fasta_file, fasta_writer )
425         rescue Exception => e
426           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
427         end
428       end
429
430       begin
431         io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
432       rescue Exception => e
433         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
434       end
435
436       begin
437         f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LOG_SUFFIX , 'a' )
438         f.print( log )
439         f.close
440       rescue Exception => e
441         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
442       end
443
444       if extract_linkers && linkers != nil
445         begin
446           f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LINKERS_SUFFIX , 'a' )
447           f.print( linkers )
448           f.close
449         rescue Exception => e
450           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
451         end
452       end
453     end
454
455
456     def get_linker_sequence( from, to, seq )
457       if from < 0
458         from = 0
459       end
460       if to > seq.get_sequence_as_string.length - 1
461         to = seq.get_sequence_as_string.length - 1
462       end
463       if from > to
464         return ""
465       else
466         return from.to_s + "-" + to.to_s + ":" + seq.get_sequence_as_string[ from..to ]
467       end
468     end
469
470     def obtain_inputfiles( input_dir, seq_names_files_suffix )
471       input_files = Array.new()
472       Dir.foreach( input_dir ) { |file_name|
473         if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
474           input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
475         end
476       }
477       input_files
478     end
479
480     def extract_mappings( mapping_file )
481       species_code_to_path = Hash.new()
482       File.open( mapping_file ) do | file |
483         while line = file.gets
484           if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
485             if ( line =~ /(\S+)\s+(\S+)/ )
486               species = $1
487               path = $2
488               if ( species_code_to_path.has_key?( species ) )
489                 Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
490               end
491               if ( species_code_to_path.has_value?( path ) )
492                 Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
493               end
494               unless ( File.exist?( path ) )
495                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
496               end
497               unless ( File.file?( path ) )
498                 Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
499               end
500               unless ( File.readable?( path ) )
501                 Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
502               end
503               if ( File.size( path ) < 1000 )
504                 Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
505               end
506               unless ( Util.looks_like_fasta?( path ) )
507                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
508               end
509               species_code_to_path[ species ] = path
510               puts species + " -> " + path
511             end
512           end
513         end
514       end
515       species_code_to_path
516     end
517
518     def print_counts( per_species_counter, log, ld )
519       puts "   sum: " + per_species_counter.to_s
520       log << "   sum: " + per_species_counter.to_s + ld
521     end
522
523     def read_fasta_file( input )
524       if @file_to_msa.has_key?( input )
525         return @file_to_msa[ input ]
526       end
527
528       f = MsaFactory.new()
529       msa = nil
530       begin
531         msa = f.create_msa_from_file( input, FastaParser.new() )
532       rescue Exception => e
533         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
534       end
535       if @seqs <= 400000
536         @file_to_msa[ input ] = msa
537         @seqs += msa.get_number_of_seqs
538         puts "   total seqs in memory: " + @seqs.to_s
539       end
540       msa
541     end
542
543     def print_help()
544       puts( "Usage:" )
545       puts()
546       puts( "  " + PRG_NAME + ".rb <sequence id ('prot') files suffix> <dir containing sequence id ('prot') files>" +
547          " <output directory for protein sequences> <output directory for domain sequences> <genome locations file>" )
548       puts()
549       puts( "  option: -" + EXT_OPTION  + "=<int>: to extend extracted domains" )
550       puts( "          -" + EXTRACT_LINKERS_OPTION  + "      : to extract linkers" )
551       puts()
552       puts( "  " + "Example: \"mse.rb .prot . protein_seqs domain_seqs ../genome_locations.txt\"" )
553       puts()
554     end
555
556   end # class MultiSequenceExtractor
557 end