inprogress
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / multi_sequence_extractor.rb
1 #
2 # = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class
3 #
4 # Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: multi_sequence_extractor.rb,v 1.10 2010/12/13 19:00:11 cmzmasek Exp $
8
9
10 require 'lib/evo/util/constants'
11 require 'lib/evo/util/util'
12 require 'lib/evo/msa/msa'
13 require 'lib/evo/msa/msa_factory'
14 require 'lib/evo/io/msa_io'
15 require 'lib/evo/io/parser/fasta_parser'
16 require 'lib/evo/io/writer/fasta_writer'
17 require 'lib/evo/util/command_line_arguments'
18
19
20
21 module Evoruby
22
23   class MultiSequenceExtractor
24
25     PRG_NAME                           = "mse"
26     PRG_VERSION                        = "1.03"
27     PRG_DESC                           = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
28     PRG_DATE                           = "131127"
29     COPYRIGHT                          = "2008-2013 Christian M Zmasek"
30     CONTACT                            = "phylosoft@gmail.com"
31     WWW                                = "https://sites.google.com/site/cmzmasek/home/software/forester"
32     HELP_OPTION_1                      = 'help'
33     HELP_OPTION_2                      = 'h'
34
35     EXT_OPTION                          = 'e'
36     EXTRACT_LINKERS_OPTION              = 'l'
37     LOG_SUFFIX                          = ".mse_log"
38     LINKERS_SUFFIX                      = ".linkers"
39     FASTA_SUFFIX                        = ".fasta"
40     FASTA_WITH_NORMALIZED_IDS_SUFFIX    = ".ni.fasta"
41     NORMALIZED_IDS_MAP_SUFFIX           = ".nim"
42     PROTEINS_LIST_FILE_SEPARATOR        = "\t"
43
44     def initialize()
45       @file_to_msa = Hash.new
46       @seqs = 0
47     end
48
49     def run()
50
51       Util.print_program_information( PRG_NAME,
52         PRG_VERSION,
53         PRG_DESC ,
54         PRG_DATE,
55         COPYRIGHT,
56         CONTACT,
57         WWW,
58         STDOUT )
59
60       ld = Constants::LINE_DELIMITER
61
62       begin
63         cla = CommandLineArguments.new( ARGV )
64       rescue ArgumentError => e
65         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
66       end
67
68       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
69            cla.is_option_set?( HELP_OPTION_2 ) )
70         print_help
71         exit( 0 )
72       end
73
74       if ( cla.get_number_of_files != 4 && cla.get_number_of_files != 5 )
75         print_help
76         exit( -1 )
77       end
78
79       allowed_opts = Array.new
80       allowed_opts.push(EXT_OPTION)
81       allowed_opts.push(EXTRACT_LINKERS_OPTION)
82
83       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
84       if ( disallowed.length > 0 )
85         Util.fatal_error( PRG_NAME,
86           "unknown option(s): " + disallowed,
87           STDOUT )
88       end
89
90       seq_names_files_suffix = cla.get_file_name( 0 )
91       input_dir              = cla.get_file_name( 1 )
92       out_dir                = cla.get_file_name( 2 )
93       out_dir_doms           = cla.get_file_name( 3 )
94       mapping_file            = nil
95
96       if ( cla.get_number_of_files == 5 )
97         mapping_file = cla.get_file_name( 4 )
98         begin
99           Util.check_file_for_readability( mapping_file )
100         rescue ArgumentError => e
101           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
102         end
103       end
104
105       extension = 0
106       if cla.is_option_set?(EXT_OPTION)
107         extension = cla.get_option_value_as_int(EXT_OPTION)
108         if extension < 0
109           extension = 0
110         end
111       end
112
113       extract_linkers = false
114       if cla.is_option_set?(EXTRACT_LINKERS_OPTION)
115         extract_linkers = true
116       end
117
118       if  !File.exist?( input_dir )
119         Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
120       end
121       if  !File.exist?( out_dir )
122         Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
123       end
124       if  !File.exist?( out_dir_doms )
125         Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" )
126       end
127       if !File.directory?( input_dir )
128         Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
129       end
130       if !File.directory?( out_dir )
131         Util.fatal_error( PRG_NAME, "error:  [#{out_dir}] is not a directory" )
132       end
133       if !File.directory?( out_dir_doms )
134         Util.fatal_error( PRG_NAME, "error:  [#{out_dir_doms}] is not a directory" )
135       end
136
137
138       log = String.new
139
140       log << "Program            : " + PRG_NAME + ld
141       log << "Version            : " + PRG_VERSION + ld
142       log << "Program date       : " + PRG_DATE + ld
143
144       puts()
145       puts( "Sequence names files suffix: " + seq_names_files_suffix )
146       log << "Sequence names files suffix: " + seq_names_files_suffix + ld
147       puts( "Input dir                  : " + input_dir )
148       log << "Input dir                  : " + input_dir + ld
149       puts( "Output dir                 : " + out_dir )
150       log << "Output dir                 : " + out_dir + ld
151       puts( "Output dir domains         : " + out_dir_doms )
152       log << "Output dir domains         : " + out_dir_doms + ld
153       if ( mapping_file != nil )
154         puts( "Mapping file               : " + mapping_file )
155         log << "Mapping file               : " + mapping_file + ld
156       end
157       if extension > 0
158         puts( "Extension                  : " + extension.to_s )
159         log << "Extension                  : " + extension.to_s + ld
160       end
161       if extract_linkers
162         puts( "Extract linkers            : true" )
163         log << "Extract linkers            : true" + ld
164       end
165       log << "Date                       : " + Time.now.to_s + ld
166       puts
167
168       if ( mapping_file != nil )
169         species_codes_to_paths = extract_mappings( mapping_file )
170       end
171
172       input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
173
174       counter = 0
175
176       input_files.each { |input_file|
177         counter += 1
178         puts
179         puts
180         puts counter.to_s + "/" + input_files.size.to_s
181         read_seq_family_file( input_file,
182           seq_names_files_suffix,
183           input_dir,
184           species_codes_to_paths,
185           log,
186           out_dir,
187           out_dir_doms,
188           mapping_file,
189           extension,
190           extract_linkers )
191       }
192       puts
193       Util.print_message( PRG_NAME, "OK" )
194       puts
195
196     end
197
198
199     def read_seq_family_file( input_file,
200         seq_names_files_suffix,
201         input_dir,
202         species_codes_to_paths,
203         log,
204         out_dir,
205         out_dir_doms,
206         mapping_file,
207         extension,
208         extract_linkers )
209
210       begin
211         Util.check_file_for_readability( input_file )
212       rescue ArgumentError => e
213         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
214       end
215       basename = File.basename( input_file, seq_names_files_suffix )
216       out_file_path_fasta_file                = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
217       out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
218       out_file_path_ids_map                   = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX
219       doms_out_file_path_fasta_file           = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains" + FASTA_SUFFIX
220       doms_ext_out_file_path_fasta_file           = nil
221       if extension > 0
222         doms_ext_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains_ext_" + extension.to_s + FASTA_SUFFIX
223       end
224       begin
225         Util.check_file_for_writability( out_file_path_fasta_file )
226         Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
227         Util.check_file_for_writability( out_file_path_ids_map  )
228         Util.check_file_for_writability( doms_out_file_path_fasta_file  )
229       rescue ArgumentError => e
230         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
231       end
232
233       ids_map_writer = nil
234       begin
235         ids_map_writer = File.open( out_file_path_ids_map, 'a' )
236       rescue Exception => e
237         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
238       end
239
240       current_species         = ""
241       current_msa            = nil
242       new_msa                = Msa.new
243       new_msa_normalized_ids = Msa.new
244       new_msa_domains = Msa.new
245       new_msa_domains_extended = Msa.new
246       per_species_counter = 0
247       linkers = ""
248
249       puts basename
250
251       File.open( input_file ) do | file |
252         species_counter = 1
253         while line = file.gets
254           line.strip!
255           if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
256             values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
257             mod_line = nil
258             if ( values.length < 2 )
259               Util.fatal_error( PRG_NAME, "unexpected format: " + line )
260             end
261             species = values[ 0 ]
262             #if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA"  || species == "EIMTE"
263             #  next
264             #end
265             seq_name = values[ 1 ]
266             domain_ranges = nil
267             if ( values.length > 3 )
268               domain_ranges_block = values[ 3 ]
269               domain_ranges = domain_ranges_block.split( "/" )
270             end
271             if ( species != current_species )
272               current_species = species
273               my_file = input_dir + Constants::FILE_SEPARATOR + current_species
274
275               if ( !File.exist?( my_file ) )
276                 if species_codes_to_paths == nil
277                   Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
278                 elsif ( !species_codes_to_paths.has_key?( current_species ) )
279                   Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
280                 end
281                 my_file = species_codes_to_paths[ current_species ]
282               end
283               my_path = File.expand_path( my_file )
284               my_readlink = my_path
285               if ( File.symlink?( my_path ) )
286                 my_readlink = File.readlink( my_path )
287               end
288               current_msa = read_fasta_file( my_file )
289
290               if ( per_species_counter > 0 )
291                 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
292                 per_species_counter = 0
293               end
294               puts " " + species_counter.to_s +  ":" + current_species + " [" + my_readlink + "]"
295               log << species_counter.to_s +  ": " + current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER
296               species_counter += 1
297             end
298             puts "   " + seq_name
299             log << "   " + seq_name + Constants::LINE_DELIMITER
300             per_species_counter = per_species_counter + 1
301             seq = nil
302
303             indices = current_msa.find_by_name_start( seq_name, true )
304             if indices.size == 1
305               seq =  current_msa.get_sequence( indices[ 0 ] )
306             elsif indices.size == 0
307               # Not found, try finding by partial match.
308               begin
309                 seq = current_msa.get_by_name( seq_name, true, true )
310               rescue ArgumentError => e
311                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
312               end
313             else
314               Util.fatal_error( PRG_NAME, "error: seq name \"" + seq_name + "\" not unique"  )
315             end
316
317             normalized_id = per_species_counter.to_s( 16 ).upcase +
318              "_" + current_species
319
320             per_species_counter.to_i
321
322             ids_map_writer.write( normalized_id + "\t" + seq.get_name + Constants::LINE_DELIMITER )
323
324             orig_name = nil
325             if seq != nil
326               orig_name = seq.get_name
327               seq.set_name( seq.get_name + " [" + current_species + "]" )
328               new_msa.add_sequence( seq )
329             else
330               Util.fatal_error( PRG_NAME, "unexected error: seq is nil" )
331             end
332
333             if domain_ranges != nil
334               first = true
335               prev_to = -1
336
337               domain_ranges.each { |range|
338                 if range != nil && range.length > 0
339                   s = range.split("-")
340                   from = s[ 0 ].to_i
341                   to = s[ 1 ].to_i
342                   new_msa_domains.add_sequence( Sequence.new( orig_name +
343                        " [" + from.to_s +
344                        "-" + to.to_s +
345                        "] [" + basename + "] [" +
346                        current_species + "]",
347                       seq.get_sequence_as_string[from..to] ) )
348                   if extension > 0
349                     from_e = from - extension
350                     if from_e < 0
351                       from_e = 0
352                     end
353                     to_e = to + extension
354                     if to_e > seq.get_sequence_as_string.length - 1
355                       to_e = seq.get_sequence_as_string.length - 1
356                     end
357                     new_msa_domains_extended.add_sequence( Sequence.new( orig_name +
358                          " [" + from.to_s +
359                          "-" + to.to_s  +
360                          "] [extended by " +
361                          extension.to_s +
362                          "] [" + basename + "] [" +
363                          current_species + "]",
364                         seq.get_sequence_as_string[ from_e..to_e ] ) )
365                   end # extension > 0
366                   if  extract_linkers
367                     if first
368                       first = false
369                       f = 0
370                       t = from - 1
371                       if extension > 0
372                         f = from - extension
373                       end
374                       mod_line = line + "\t[" + get_linker_sequence( f, t, seq ) + "|"
375                     else
376                       mod_line += get_linker_sequence( prev_to + 1, from - 1, seq ) + "|"
377                     end
378                     prev_to = to
379                   end
380                 end # range != nil && range.length > 0
381               }
382               if extract_linkers && prev_to > 0
383                 f = prev_to + 1
384                 t = seq.get_sequence_as_string.length - 1
385                 if extension > 0
386                   t = prev_to + extension
387                 end
388                 mod_line += get_linker_sequence( f, t, seq ) + "]"
389               end
390             end
391
392             new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
393             if mod_line
394               linkers << mod_line + Constants::LINE_DELIMITER
395             end
396           end # !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
397         end # while line = file.gets
398
399       end
400
401       ids_map_writer.close
402
403       if ( per_species_counter > 0 )
404         print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
405       end
406
407       io = MsaIO.new()
408
409       fasta_writer = FastaWriter.new()
410       fasta_writer.remove_gap_chars
411       fasta_writer.clean
412
413       begin
414         io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
415       rescue Exception => e
416         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
417       end
418
419       if new_msa_domains != nil
420         begin
421           io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer )
422         rescue Exception => e
423           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
424         end
425       end
426
427       if extension > 0 && new_msa_domains_extended != nil
428         begin
429           io.write_to_file( new_msa_domains_extended, doms_ext_out_file_path_fasta_file, fasta_writer )
430         rescue Exception => e
431           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
432         end
433       end
434
435       begin
436         io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
437       rescue Exception => e
438         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
439       end
440
441       begin
442         f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LOG_SUFFIX , 'a' )
443         f.print( log )
444         f.close
445       rescue Exception => e
446         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
447       end
448
449       if extract_linkers && linkers != nil
450         begin
451           f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LINKERS_SUFFIX , 'a' )
452           f.print( linkers )
453           f.close
454         rescue Exception => e
455           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
456         end
457       end
458     end
459
460
461     def get_linker_sequence( from, to, seq )
462       if from < 0
463         from = 0
464       end
465       if to > seq.get_sequence_as_string.length - 1
466         to = seq.get_sequence_as_string.length - 1
467       end
468       if from > to
469         return ""
470       else
471         return from.to_s + "-" + to.to_s + ":" + seq.get_sequence_as_string[ from..to ]
472       end
473     end
474
475     def obtain_inputfiles( input_dir, seq_names_files_suffix )
476       input_files = Array.new()
477       Dir.foreach( input_dir ) { |file_name|
478         if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
479           input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
480         end
481       }
482       input_files
483     end
484
485     def extract_mappings( mapping_file )
486       species_code_to_path = Hash.new()
487       File.open( mapping_file ) do | file |
488         while line = file.gets
489           if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
490             if ( line =~ /(\S+)\s+(\S+)/ )
491               species = $1
492               path = $2
493               if ( species_code_to_path.has_key?( species ) )
494                 Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
495               end
496               if ( species_code_to_path.has_value?( path ) )
497                 Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
498               end
499               if ( !File.exist?( path ) )
500                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
501               end
502               if ( !File.file?( path ) )
503                 Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
504               end
505               if ( !File.readable?( path ) )
506                 Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
507               end
508               if ( File.size( path ) < 10000 )
509                 Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
510               end
511               if ( !Util.looks_like_fasta?( path ) )
512                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
513               end
514               species_code_to_path[ species ] = path
515               puts species + " -> " + path
516             end
517           end
518         end
519       end
520       species_code_to_path
521     end
522
523     def print_counts( per_species_counter, log, ld )
524       puts "   [sum: " + per_species_counter.to_s + "]"
525       log << "   [sum: " + per_species_counter.to_s + "]" + ld
526     end
527
528     def read_fasta_file( input )
529       if @file_to_msa.has_key?( input )
530         return @file_to_msa[ input ]
531       end
532
533       f = MsaFactory.new()
534       msa = nil
535       begin
536         msa = f.create_msa_from_file( input, FastaParser.new() )
537       rescue Exception => e
538         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
539       end
540       if @seqs <= 500000
541         @file_to_msa[ input ] = msa
542         @seqs += msa.get_number_of_seqs
543         puts "   total seqs in memory: " + @seqs.to_s
544       end
545       msa
546     end
547
548     def print_help()
549       puts( "Usage:" )
550       puts()
551       puts( "  " + PRG_NAME + ".rb <sequence names file suffix> <input dir containing sequence names files " +
552          "and possibly genome multiple-sequence ('fasta') files> <output directory for sequences> <output directory for domains> [mapping file for " +
553          "genome multiple-sequence ('fasta') files not in input dir]" )
554       puts()
555       puts( "  option: -" + EXT_OPTION  + "=<int>: to extend extracted domains" )
556       puts( "          -" + EXTRACT_LINKERS_OPTION  + "      : to extract linkers" )
557       puts()
558       puts( "  " + "Example: \"mse.rb .prot . protein_seqs domain_seqs ../genome_locations.txt\"" )
559       puts()
560     end
561
562   end # class MultiSequenceExtractor
563 end