in progress...
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / multi_sequence_extractor.rb
1 #
2 # = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class
3 #
4 # Copyright::  Copyright (C) 2014 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7
8 require 'lib/evo/util/constants'
9 require 'lib/evo/util/util'
10 require 'lib/evo/msa/msa'
11 require 'lib/evo/msa/msa_factory'
12 require 'lib/evo/io/msa_io'
13 require 'lib/evo/io/parser/fasta_parser'
14 require 'lib/evo/io/writer/fasta_writer'
15 require 'lib/evo/util/command_line_arguments'
16
17 module Evoruby
18   class MultiSequenceExtractor
19
20     PRG_NAME                           = "mse"
21     PRG_VERSION                        = "1.04"
22     PRG_DESC                           = "processing of \"surfacing\" output: extraction of sequences by name from multiple multi-sequence ('fasta') files"
23     PRG_DATE                           = "170327"
24     WWW                                = "https://sites.google.com/site/cmzmasek/home/software/forester"
25     HELP_OPTION_1                      = 'help'
26     HELP_OPTION_2                      = 'h'
27
28     EXT_OPTION                          = 'e'
29     EXTRACT_LINKERS_OPTION              = 'l'
30     LOG_SUFFIX                          = ".mse_log"
31     LINKERS_SUFFIX                      = ".linkers"
32     FASTA_SUFFIX                        = ".fasta"
33     FASTA_WITH_NORMALIZED_IDS_SUFFIX    = ".ni.fasta"
34     NORMALIZED_IDS_MAP_SUFFIX           = ".nim"
35     PROTEINS_LIST_FILE_SEPARATOR        = "\t"
36     def initialize()
37       @file_to_msa = Hash.new
38       @seqs = 0
39     end
40
41     def run()
42
43       Util.print_program_information( PRG_NAME,
44       PRG_VERSION,
45       PRG_DESC ,
46       PRG_DATE,
47       WWW,
48       STDOUT )
49
50       ld = Constants::LINE_DELIMITER
51
52       begin
53         cla = CommandLineArguments.new( ARGV )
54       rescue ArgumentError => e
55         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
56       end
57
58       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
59       cla.is_option_set?( HELP_OPTION_2 ) )
60         print_help
61         exit( 0 )
62       end
63
64       if ( cla.get_number_of_files != 5 )
65         print_help
66         exit( -1 )
67       end
68
69       allowed_opts = Array.new
70       allowed_opts.push(EXT_OPTION)
71       allowed_opts.push(EXTRACT_LINKERS_OPTION)
72
73       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
74       if ( disallowed.length > 0 )
75         Util.fatal_error( PRG_NAME,
76         "unknown option(s): " + disallowed,
77         STDOUT )
78       end
79
80       seq_names_files_suffix = cla.get_file_name( 0 )
81       input_dir              = cla.get_file_name( 1 )
82       out_dir                = cla.get_file_name( 2 )
83       out_dir_doms           = cla.get_file_name( 3 )
84       mapping_file           = cla.get_file_name( 4 )
85
86       begin
87         Util.check_file_for_readability( mapping_file )
88       rescue IOError => e
89         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
90       end
91
92       extension = 0
93       if cla.is_option_set?(EXT_OPTION)
94         extension = cla.get_option_value_as_int(EXT_OPTION)
95         if extension < 0
96           extension = 0
97         end
98       end
99
100       extract_linkers = false
101       if cla.is_option_set?(EXTRACT_LINKERS_OPTION)
102         extract_linkers = true
103       end
104
105       unless File.exist?( out_dir )
106         Dir.mkdir( out_dir )
107       end
108       unless File.exist?( out_dir_doms )
109         Dir.mkdir( out_dir_doms )
110       end
111
112       unless File.exist?( input_dir )
113         Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
114       end
115       unless File.exist?( out_dir )
116         Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
117       end
118       unless File.exist?( out_dir_doms )
119         Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" )
120       end
121       unless File.directory?( input_dir )
122         Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
123       end
124       unless File.directory?( out_dir )
125         Util.fatal_error( PRG_NAME, "error:  [#{out_dir}] is not a directory" )
126       end
127       unless File.directory?( out_dir_doms )
128         Util.fatal_error( PRG_NAME, "error:  [#{out_dir_doms}] is not a directory" )
129       end
130
131       log = String.new
132
133       log << "Program            : " + PRG_NAME + ld
134       log << "Version            : " + PRG_VERSION + ld
135       log << "Program date       : " + PRG_DATE + ld
136
137       puts()
138       puts( "Sequence names files suffix: " + seq_names_files_suffix )
139       log << "Sequence names files suffix: " + seq_names_files_suffix + ld
140       puts( "Input dir                  : " + input_dir )
141       log << "Input dir                  : " + input_dir + ld
142       puts( "Output dir                 : " + out_dir )
143       log << "Output dir                 : " + out_dir + ld
144       puts( "Output dir domains         : " + out_dir_doms )
145       log << "Output dir domains         : " + out_dir_doms + ld
146       puts( "Mapping file               : " + mapping_file )
147       log << "Mapping file               : " + mapping_file + ld
148       if extension > 0
149         puts( "Extension                  : " + extension.to_s )
150         log << "Extension                  : " + extension.to_s + ld
151       end
152       if extract_linkers
153         puts( "Extract linkers            : true" )
154         log << "Extract linkers            : true" + ld
155       end
156       log << "Date                       : " + Time.now.to_s + ld
157       puts
158
159       species_codes_to_paths = extract_mappings( mapping_file )
160
161       input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
162
163       counter = 0
164
165       input_files.each { |input_file|
166         counter += 1
167         puts
168         puts
169         puts counter.to_s + "/" + input_files.size.to_s
170         read_seq_family_file( input_file,
171         seq_names_files_suffix,
172         input_dir,
173         species_codes_to_paths,
174         log,
175         out_dir,
176         out_dir_doms,
177         mapping_file,
178         extension,
179         extract_linkers )
180       }
181       puts
182       Util.print_message( PRG_NAME, "OK" )
183       puts
184
185     end
186
187     def read_seq_family_file( input_file,
188       seq_names_files_suffix,
189       input_dir,
190       species_codes_to_paths,
191       log,
192       out_dir,
193       out_dir_doms,
194       mapping_file,
195       extension,
196       extract_linkers )
197
198       begin
199         Util.check_file_for_readability( input_file )
200       rescue ArgumentError => e
201         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
202       end
203       basename = File.basename( input_file, seq_names_files_suffix )
204       out_file_path_fasta_file                = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
205       out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
206       out_file_path_ids_map                   = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX
207       doms_out_file_path_fasta_file           = out_dir_doms + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
208       doms_ext_out_file_path_fasta_file           = nil
209       if extension > 0
210         doms_ext_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_ext_" + extension.to_s + FASTA_SUFFIX
211       end
212       begin
213         Util.check_file_for_writability( out_file_path_fasta_file )
214         Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
215         Util.check_file_for_writability( out_file_path_ids_map  )
216         Util.check_file_for_writability( doms_out_file_path_fasta_file  )
217       rescue ArgumentError => e
218         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
219       end
220
221       ids_map_writer = nil
222       begin
223         ids_map_writer = File.open( out_file_path_ids_map, 'a' )
224       rescue Exception => e
225         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
226       end
227
228       current_species         = ""
229       current_msa            = nil
230       new_msa                = Msa.new
231       new_msa_normalized_ids = Msa.new
232       new_msa_domains = Msa.new
233       new_msa_domains_extended = Msa.new
234       per_species_counter = 0
235       linkers = ""
236
237       puts basename
238
239       File.open( input_file ) do | file |
240         species_counter = 1
241         while line = file.gets
242           line.strip!
243           if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
244             values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
245             mod_line = nil
246             if ( values.length < 2 )
247               Util.fatal_error( PRG_NAME, "unexpected format: " + line )
248             end
249             species = values[ 0 ]
250
251             seq_name = values[ 1 ]
252             domain_ranges = nil
253             if ( values.length > 3 )
254               domain_ranges_block = values[ 3 ]
255               domain_ranges = domain_ranges_block.split( "/" )
256             end
257             if ( species != current_species )
258               current_species = species
259               my_file = input_dir + Constants::FILE_SEPARATOR + current_species
260
261               if ( !File.exist?( my_file ) )
262                 if species_codes_to_paths == nil
263                   Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
264                 elsif ( !species_codes_to_paths.has_key?( current_species ) )
265                   Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
266                 end
267                 my_file = species_codes_to_paths[ current_species ]
268               end
269               my_path = File.expand_path( my_file )
270               my_readlink = my_path
271               if ( File.symlink?( my_path ) )
272                 my_readlink = File.readlink( my_path )
273               end
274               current_msa = read_fasta_file( my_file )
275
276               if ( per_species_counter > 0 )
277                 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
278                 per_species_counter = 0
279               end
280               puts " " + species_counter.to_s +  ":" + current_species + " [" + my_readlink + "]"
281               log << species_counter.to_s <<  ": " << current_species << " [" + my_readlink + "]" << Constants::LINE_DELIMITER
282               species_counter += 1
283             end
284             log << "   " << seq_name << Constants::LINE_DELIMITER
285             per_species_counter = per_species_counter + 1
286             seq = nil
287
288             indices = current_msa.find_by_name_start( seq_name, true )
289             if indices.size == 1
290               seq = current_msa.get_sequence( indices[ 0 ] )
291             elsif indices.size == 0
292               # Not found, try finding by partial match.
293               begin
294                 seq = current_msa.get_by_name( seq_name, true, true )
295               rescue ArgumentError => e
296                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
297               end
298             else
299               Util.fatal_error( PRG_NAME, "error: seq name \"" + seq_name + "\" not unique"  )
300             end
301
302             normalized_id = per_species_counter.to_s( 16 ).upcase +
303             "_" + current_species
304
305             per_species_counter.to_i
306
307             ids_map_writer.write( normalized_id + "\t" + seq.get_name + Constants::LINE_DELIMITER )
308
309             orig_name = nil
310             if seq != nil
311               orig_name = seq.get_name
312               seq.set_name( seq.get_name + " [" + current_species + "]" )
313               new_msa.add_sequence( seq )
314             else
315               Util.fatal_error( PRG_NAME, "unexpected error: seq is nil" )
316             end
317
318             if domain_ranges != nil
319               first = true
320               prev_to = -1
321
322               domain_ranges.each { |range|
323                 if range != nil && range.length > 0
324                   s = range.split("-")
325                   from = s[ 0 ].to_i
326                   to = s[ 1 ].to_i
327                   new_msa_domains.add_sequence( Sequence.new( orig_name +
328                   " [" + from.to_s +
329                   "-" + to.to_s +
330                   "] [" + basename + "] [" +
331                   current_species + "]",
332                   seq.get_sequence_as_string[from..to] ) )
333                   if extension > 0
334                     from_e = from - extension
335                     if from_e < 0
336                       from_e = 0
337                     end
338                     to_e = to + extension
339                     if to_e > seq.get_sequence_as_string.length - 1
340                       to_e = seq.get_sequence_as_string.length - 1
341                     end
342                     new_msa_domains_extended.add_sequence( Sequence.new( orig_name +
343                     " [" + from.to_s +
344                     "-" + to.to_s  +
345                     "] [extended by " +
346                     extension.to_s +
347                     "] [" + basename + "] [" +
348                     current_species + "]",
349                     seq.get_sequence_as_string[ from_e..to_e ] ) )
350                   end # extension > 0
351                   if  extract_linkers
352                     if first
353                       first = false
354                       f = 0
355                       t = from - 1
356                       if extension > 0
357                         f = from - extension
358                       end
359                       mod_line = line + "\t[" + get_linker_sequence( f, t, seq ) + "|"
360                     else
361                       mod_line += get_linker_sequence( prev_to + 1, from - 1, seq ) + "|"
362                     end
363                     prev_to = to
364                   end
365                 end # range != nil && range.length > 0
366               }
367               if extract_linkers && prev_to > 0
368                 f = prev_to + 1
369                 t = seq.get_sequence_as_string.length - 1
370                 if extension > 0
371                   t = prev_to + extension
372                 end
373                 mod_line += get_linker_sequence( f, t, seq ) + "]"
374               end
375             end
376
377             new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
378             if mod_line
379               linkers << mod_line + Constants::LINE_DELIMITER
380             end
381           end # !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
382         end # while line = file.gets
383
384       end
385
386       ids_map_writer.close
387
388       if ( per_species_counter > 0 )
389         print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
390       end
391
392       io = MsaIO.new()
393
394       fasta_writer = FastaWriter.new()
395       fasta_writer.remove_gap_chars
396       fasta_writer.clean
397
398       begin
399         io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
400       rescue Exception => e
401         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
402       end
403
404       if new_msa_domains != nil
405         begin
406           io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer )
407         rescue Exception => e
408           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
409         end
410       end
411
412       if extension > 0 && new_msa_domains_extended != nil
413         begin
414           io.write_to_file( new_msa_domains_extended, doms_ext_out_file_path_fasta_file, fasta_writer )
415         rescue Exception => e
416           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
417         end
418       end
419
420       begin
421         io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
422       rescue Exception => e
423         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
424       end
425
426       begin
427         f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LOG_SUFFIX , 'a' )
428         f.print( log )
429         f.close
430       rescue Exception => e
431         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
432       end
433
434       if extract_linkers && linkers != nil
435         begin
436           f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LINKERS_SUFFIX , 'a' )
437           f.print( linkers )
438           f.close
439         rescue Exception => e
440           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
441         end
442       end
443     end
444
445     def get_linker_sequence( from, to, seq )
446       if from < 0
447         from = 0
448       end
449       if to > seq.get_sequence_as_string.length - 1
450         to = seq.get_sequence_as_string.length - 1
451       end
452       if from > to
453         return ""
454       else
455         return from.to_s + "-" + to.to_s + ":" + seq.get_sequence_as_string[ from..to ]
456       end
457     end
458
459     def obtain_inputfiles( input_dir, seq_names_files_suffix )
460       input_files = Array.new()
461       Dir.foreach( input_dir ) { |file_name|
462         if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
463           input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
464         end
465       }
466       input_files
467     end
468
469     def extract_mappings( mapping_file )
470       species_code_to_path = Hash.new()
471       File.open( mapping_file ) do | file |
472         while line = file.gets
473           if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
474             if ( line =~ /(\S+)\s+(\S+)/ )
475               species = $1
476               path = $2
477               if ( species_code_to_path.has_key?( species ) )
478                 Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
479               end
480               if ( species_code_to_path.has_value?( path ) )
481                 Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
482               end
483               unless ( File.exist?( path ) )
484                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
485               end
486               unless ( File.file?( path ) )
487                 Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
488               end
489               unless ( File.readable?( path ) )
490                 Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
491               end
492               if ( File.size( path ) < 1000 )
493                 Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
494               end
495               unless ( Util.looks_like_fasta?( path ) )
496                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
497               end
498               species_code_to_path[ species ] = path
499               puts species + " -> " + path
500             end
501           end
502         end
503       end
504       species_code_to_path
505     end
506
507     def print_counts( per_species_counter, log, ld )
508       puts "   sum: " + per_species_counter.to_s
509       log << "   sum: " + per_species_counter.to_s + ld
510     end
511
512     def read_fasta_file( input )
513       if @file_to_msa.has_key?( input )
514         return @file_to_msa[ input ]
515       end
516
517       f = MsaFactory.new()
518       msa = nil
519       begin
520         msa = f.create_msa_from_file( input, FastaParser.new() )
521       rescue Exception => e
522         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
523       end
524       if @seqs <= 400000
525         @file_to_msa[ input ] = msa
526         @seqs += msa.get_number_of_seqs
527         puts "   total seqs in memory: " + @seqs.to_s
528       end
529       msa
530     end
531
532     def print_help()
533       puts( "Usage:" )
534       puts()
535       puts( "  " + PRG_NAME + ".rb <sequence id ('prot') files suffix> <dir containing sequence id ('prot') files>" +
536       " <output directory for protein sequences> <output directory for domain sequences> <genome locations file>" )
537       puts()
538       puts( "  option: -" + EXT_OPTION  + "=<int>: to extend extracted domains" )
539       puts( "          -" + EXTRACT_LINKERS_OPTION  + "      : to extract linkers" )
540       puts()
541       puts( "  " + "Examples: mse.rb .prot . protein_seqs domain_seqs ../genome_locations.txt" )
542       puts( "  " + "          mse.rb .prot . FL_seqs DA_seqs ../../genome_locations.txt" )
543       puts()
544     end
545
546   end # class MultiSequenceExtractor
547 end