inprogress
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / multi_sequence_extractor.rb
1 #
2 # = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class
3 #
4 # Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: multi_sequence_extractor.rb,v 1.10 2010/12/13 19:00:11 cmzmasek Exp $
8
9
10 require 'lib/evo/util/constants'
11 require 'lib/evo/util/util'
12 require 'lib/evo/msa/msa'
13 require 'lib/evo/msa/msa_factory'
14 require 'lib/evo/io/msa_io'
15 require 'lib/evo/io/parser/fasta_parser'
16 require 'lib/evo/io/writer/fasta_writer'
17 require 'lib/evo/util/command_line_arguments'
18
19
20
21 module Evoruby
22
23   class MultiSequenceExtractor
24
25     PRG_NAME                           = "mse"
26     PRG_VERSION                        = "1.03"
27     PRG_DESC                           = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
28     PRG_DATE                           = "131127"
29     COPYRIGHT                          = "2008-2013 Christian M Zmasek"
30     CONTACT                            = "phylosoft@gmail.com"
31     WWW                                = "https://sites.google.com/site/cmzmasek/home/software/forester"
32     HELP_OPTION_1                      = 'help'
33     HELP_OPTION_2                      = 'h'
34
35     EXT_OPTION                          = 'e'
36     EXTRACT_LINKERS_OPTION              = 'l'
37     LOG_SUFFIX                          = ".mse_log"
38     LINKERS_SUFFIX                      = ".linkers"
39     FASTA_SUFFIX                        = ".fasta"
40     FASTA_WITH_NORMALIZED_IDS_SUFFIX    = ".ni.fasta"
41     NORMALIZED_IDS_MAP_SUFFIX           = ".nim"
42     PROTEINS_LIST_FILE_SEPARATOR        = "\t"
43
44     def initialize()
45       @file_to_msa = Hash.new
46     end
47
48     def run()
49
50       Util.print_program_information( PRG_NAME,
51         PRG_VERSION,
52         PRG_DESC ,
53         PRG_DATE,
54         COPYRIGHT,
55         CONTACT,
56         WWW,
57         STDOUT )
58
59       ld = Constants::LINE_DELIMITER
60
61       begin
62         cla = CommandLineArguments.new( ARGV )
63       rescue ArgumentError => e
64         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
65       end
66
67       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
68            cla.is_option_set?( HELP_OPTION_2 ) )
69         print_help
70         exit( 0 )
71       end
72
73       if ( cla.get_number_of_files != 4 && cla.get_number_of_files != 5 )
74         print_help
75         exit( -1 )
76       end
77
78       allowed_opts = Array.new
79       allowed_opts.push(EXT_OPTION)
80       allowed_opts.push(EXTRACT_LINKERS_OPTION)
81
82       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
83       if ( disallowed.length > 0 )
84         Util.fatal_error( PRG_NAME,
85           "unknown option(s): " + disallowed,
86           STDOUT )
87       end
88
89       seq_names_files_suffix = cla.get_file_name( 0 )
90       input_dir              = cla.get_file_name( 1 )
91       out_dir                = cla.get_file_name( 2 )
92       out_dir_doms           = cla.get_file_name( 3 )
93       mapping_file            = nil
94
95       if ( cla.get_number_of_files == 5 )
96         mapping_file = cla.get_file_name( 4 )
97         begin
98           Util.check_file_for_readability( mapping_file )
99         rescue ArgumentError => e
100           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
101         end
102       end
103
104       extension = 0
105       if cla.is_option_set?(EXT_OPTION)
106         extension = cla.get_option_value_as_int(EXT_OPTION)
107         if extension < 0
108           extension = 0
109         end
110       end
111
112       extract_linkers = false
113       if cla.is_option_set?(EXTRACT_LINKERS_OPTION)
114         extract_linkers = true
115       end
116
117       if  !File.exist?( input_dir )
118         Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
119       end
120       if  !File.exist?( out_dir )
121         Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
122       end
123       if  !File.exist?( out_dir_doms )
124         Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" )
125       end
126       if !File.directory?( input_dir )
127         Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
128       end
129       if !File.directory?( out_dir )
130         Util.fatal_error( PRG_NAME, "error:  [#{out_dir}] is not a directory" )
131       end
132       if !File.directory?( out_dir_doms )
133         Util.fatal_error( PRG_NAME, "error:  [#{out_dir_doms}] is not a directory" )
134       end
135
136
137       log = String.new
138
139       log << "Program            : " + PRG_NAME + ld
140       log << "Version            : " + PRG_VERSION + ld
141       log << "Program date       : " + PRG_DATE + ld
142
143       puts()
144       puts( "Sequence names files suffix: " + seq_names_files_suffix )
145       log << "Sequence names files suffix: " + seq_names_files_suffix + ld
146       puts( "Input dir                  : " + input_dir )
147       log << "Input dir                  : " + input_dir + ld
148       puts( "Output dir                 : " + out_dir )
149       log << "Output dir                 : " + out_dir + ld
150       puts( "Output dir domains         : " + out_dir_doms )
151       log << "Output dir domains         : " + out_dir_doms + ld
152       if ( mapping_file != nil )
153         puts( "Mapping file               : " + mapping_file )
154         log << "Mapping file               : " + mapping_file + ld
155       end
156       if extension > 0
157         puts( "Extension                  : " + extension.to_s )
158         log << "Extension                  : " + extension.to_s + ld
159       end
160       if extract_linkers
161         puts( "Extract linkers            : true" )
162         log << "Extract linkers            : true" + ld
163       end
164       log << "Date                       : " + Time.now.to_s + ld
165       puts
166
167       if ( mapping_file != nil )
168         species_codes_to_paths = extract_mappings( mapping_file )
169       end
170
171       input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
172
173       counter = 0
174
175       input_files.each { |input_file|
176         counter += 1
177         puts
178         puts
179         puts counter.to_s + "/" + input_files.size.to_s
180         read_seq_family_file( input_file,
181           seq_names_files_suffix,
182           input_dir,
183           species_codes_to_paths,
184           log,
185           out_dir,
186           out_dir_doms,
187           mapping_file,
188           extension,
189           extract_linkers )
190       }
191       puts
192       Util.print_message( PRG_NAME, "OK" )
193       puts
194
195     end
196
197
198     def read_seq_family_file( input_file,
199         seq_names_files_suffix,
200         input_dir,
201         species_codes_to_paths,
202         log,
203         out_dir,
204         out_dir_doms,
205         mapping_file,
206         extension,
207         extract_linkers )
208
209       begin
210         Util.check_file_for_readability( input_file )
211       rescue ArgumentError => e
212         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
213       end
214       basename = File.basename( input_file, seq_names_files_suffix )
215       out_file_path_fasta_file                = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
216       out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
217       out_file_path_ids_map                   = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX
218       doms_out_file_path_fasta_file           = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains" + FASTA_SUFFIX
219       doms_ext_out_file_path_fasta_file           = nil
220       if extension > 0
221         doms_ext_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains_ext_" + extension.to_s + FASTA_SUFFIX
222       end
223       begin
224         Util.check_file_for_writability( out_file_path_fasta_file )
225         Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
226         Util.check_file_for_writability( out_file_path_ids_map  )
227         Util.check_file_for_writability( doms_out_file_path_fasta_file  )
228       rescue ArgumentError => e
229         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
230       end
231
232       ids_map_writer = nil
233       begin
234         ids_map_writer = File.open( out_file_path_ids_map, 'a' )
235       rescue Exception => e
236         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
237       end
238
239       current_species         = ""
240       current_msa            = nil
241       new_msa                = Msa.new
242       new_msa_normalized_ids = Msa.new
243       new_msa_domains = Msa.new
244       new_msa_domains_extended = Msa.new
245       per_species_counter = 0
246       linkers = ""
247
248       puts basename
249
250       File.open( input_file ) do | file |
251         while line = file.gets
252           line.strip!
253           if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
254             values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
255             mod_line = nil
256             if ( values.length < 2 )
257               Util.fatal_error( PRG_NAME, "unexpected format: " + line )
258             end
259             species = values[ 0 ]
260             #if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA"  || species == "EIMTE"
261             #  next
262             #end
263             seq_name = values[ 1 ]
264             domain_ranges = nil
265             if ( values.length > 3 )
266               domain_ranges_block = values[ 3 ]
267               domain_ranges = domain_ranges_block.split( "/" )
268             end
269             if ( species != current_species )
270               current_species = species
271               my_file = input_dir + Constants::FILE_SEPARATOR + current_species
272
273               if ( !File.exist?( my_file ) )
274                 if species_codes_to_paths == nil
275                   Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
276                 elsif ( !species_codes_to_paths.has_key?( current_species ) )
277                   Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
278                 end
279                 my_file = species_codes_to_paths[ current_species ]
280               end
281               my_path = File.expand_path( my_file )
282               my_readlink = my_path
283               if ( File.symlink?( my_path ) )
284                 my_readlink = File.readlink( my_path )
285               end
286               current_msa = read_fasta_file( my_file )
287
288               if ( per_species_counter > 0 )
289                 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
290                 per_species_counter = 0
291               end
292               puts " " + current_species + " [" + my_readlink + "]"
293               log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER
294             end
295             puts "   " + seq_name
296             log << "   " + seq_name + Constants::LINE_DELIMITER
297             per_species_counter = per_species_counter + 1
298             seq = nil
299
300             indices = current_msa.find_by_name_start( seq_name, true )
301             if indices.size == 1
302               seq =  current_msa.get_sequence( indices[ 0 ] )
303             elsif indices.size == 0
304               # Not found, try finding by partial match.
305               begin
306                 seq = current_msa.get_by_name( seq_name, true, true )
307               rescue ArgumentError => e
308                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
309               end
310             else
311               Util.fatal_error( PRG_NAME, "error: seq name \"" + seq_name + "\" not unique"  )
312             end
313
314             # if current_msa.find_by_name_start( seq_name, true ).size > 0
315             #   begin
316             #     seq = current_msa.get_by_name_start( seq_name, true ).copy
317             #   rescue ArgumentError => e
318             #     Util.fatal_error( PRG_NAME, "error: " + e.to_s )
319             #   end
320             # else
321             #   # Not found, try finding by partial match.
322             #   begin
323             #     seq = current_msa.get_by_name( seq_name, true, true )
324             #   rescue ArgumentError => e
325             #     Util.fatal_error( PRG_NAME, "error: " + e.to_s )
326             #   end
327             # end
328
329             normalized_id = per_species_counter.to_s( 16 ).upcase +
330              "_" + current_species
331
332             per_species_counter.to_i
333
334             ids_map_writer.write( normalized_id + "\t" + seq.get_name + Constants::LINE_DELIMITER )
335
336             orig_name = nil
337             if seq != nil
338               orig_name = seq.get_name
339               seq.set_name( seq.get_name + " [" + current_species + "]" )
340               new_msa.add_sequence( seq )
341             else
342               Util.fatal_error( PRG_NAME, "unexected error: seq is nil" )
343             end
344
345             if domain_ranges != nil
346               first = true
347               prev_to = -1
348
349               domain_ranges.each { |range|
350                 if range != nil && range.length > 0
351                   s = range.split("-")
352                   from = s[ 0 ].to_i
353                   to = s[ 1 ].to_i
354                   new_msa_domains.add_sequence( Sequence.new( orig_name +
355                        " [" + from.to_s +
356                        "-" + to.to_s +
357                        "] [" + basename + "] [" +
358                        current_species + "]",
359                       seq.get_sequence_as_string[from..to] ) )
360                   if extension > 0
361                     from_e = from - extension
362                     if from_e < 0
363                       from_e = 0
364                     end
365                     to_e = to + extension
366                     if to_e > seq.get_sequence_as_string.length - 1
367                       to_e = seq.get_sequence_as_string.length - 1
368                     end
369                     new_msa_domains_extended.add_sequence( Sequence.new( orig_name +
370                          " [" + from.to_s +
371                          "-" + to.to_s  +
372                          "] [extended by " +
373                          extension.to_s +
374                          "] [" + basename + "] [" +
375                          current_species + "]",
376                         seq.get_sequence_as_string[ from_e..to_e ] ) )
377                   end # extension > 0
378                   if  extract_linkers
379                     if first
380                       first = false
381                       f = 0
382                       t = from - 1
383                       if extension > 0
384                         f = from - extension
385                       end
386                       mod_line = line + "\t[" + get_linker_sequence( f, t, seq ) + "|"
387                     else
388                       mod_line += get_linker_sequence( prev_to + 1, from - 1, seq ) + "|"
389                     end
390                     prev_to = to
391                   end
392                 end # range != nil && range.length > 0
393               }
394               if extract_linkers && prev_to > 0
395                 f = prev_to + 1
396                 t = seq.get_sequence_as_string.length - 1
397                 if extension > 0
398                   t = prev_to + extension
399                 end
400                 mod_line += get_linker_sequence( f, t, seq ) + "]"
401               end
402             end
403
404             new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
405             if mod_line
406               linkers << mod_line + Constants::LINE_DELIMITER
407             end
408           end # !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
409         end # while line = file.gets
410
411       end
412
413       ids_map_writer.close
414
415       if ( per_species_counter > 0 )
416         print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
417       end
418
419       io = MsaIO.new()
420
421       fasta_writer = FastaWriter.new()
422       fasta_writer.remove_gap_chars
423       fasta_writer.clean
424
425       begin
426         io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
427       rescue Exception => e
428         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
429       end
430
431       if new_msa_domains != nil
432         begin
433           io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer )
434         rescue Exception => e
435           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
436         end
437       end
438
439       if extension > 0 && new_msa_domains_extended != nil
440         begin
441           io.write_to_file( new_msa_domains_extended, doms_ext_out_file_path_fasta_file, fasta_writer )
442         rescue Exception => e
443           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
444         end
445       end
446
447       begin
448         io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
449       rescue Exception => e
450         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
451       end
452
453       begin
454         f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LOG_SUFFIX , 'a' )
455         f.print( log )
456         f.close
457       rescue Exception => e
458         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
459       end
460
461       if extract_linkers && linkers != nil
462         begin
463           f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LINKERS_SUFFIX , 'a' )
464           f.print( linkers )
465           f.close
466         rescue Exception => e
467           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
468         end
469       end
470     end
471
472
473     def get_linker_sequence( from, to, seq )
474       if from < 0
475         from = 0
476       end
477       if to > seq.get_sequence_as_string.length - 1
478         to = seq.get_sequence_as_string.length - 1
479       end
480       if from > to
481         return ""
482       else
483         return from.to_s + "-" + to.to_s + ":" + seq.get_sequence_as_string[ from..to ]
484       end
485     end
486
487     def obtain_inputfiles( input_dir, seq_names_files_suffix )
488       input_files = Array.new()
489       Dir.foreach( input_dir ) { |file_name|
490         if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
491           input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
492         end
493       }
494       input_files
495     end
496
497     def extract_mappings( mapping_file )
498       species_code_to_path = Hash.new()
499       File.open( mapping_file ) do | file |
500         while line = file.gets
501           if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
502             if ( line =~ /(\S+)\s+(\S+)/ )
503               species = $1
504               path = $2
505               if ( species_code_to_path.has_key?( species ) )
506                 Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
507               end
508               if ( species_code_to_path.has_value?( path ) )
509                 Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
510               end
511               if ( !File.exist?( path ) )
512                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
513               end
514               if ( !File.file?( path ) )
515                 Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
516               end
517               if ( !File.readable?( path ) )
518                 Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
519               end
520               if ( File.size( path ) < 10000 )
521                 Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
522               end
523               if ( !Util.looks_like_fasta?( path ) )
524                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
525               end
526               species_code_to_path[ species ] = path
527               puts species + " -> " + path
528             end
529           end
530         end
531       end
532       species_code_to_path
533     end
534
535     def print_counts( per_species_counter, log, ld )
536       puts "   [sum: " + per_species_counter.to_s + "]"
537       log << "   [sum: " + per_species_counter.to_s + "]" + ld
538     end
539
540     def read_fasta_file( input )
541       if @file_to_msa.has_key?( input )
542         return @file_to_msa[ input ]
543       end
544
545       f = MsaFactory.new()
546       msa = nil
547       begin
548         msa = f.create_msa_from_file( input, FastaParser.new() )
549       rescue Exception => e
550         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
551       end
552       if @file_to_msa.size < 500 && msa.get_number_of_seqs <= 50000
553         @file_to_msa[ input ] = msa
554       end
555       msa
556     end
557
558     def print_help()
559       puts( "Usage:" )
560       puts()
561       puts( "  " + PRG_NAME + ".rb <sequence names file suffix> <input dir containing sequence names files " +
562          "and possibly genome multiple-sequence ('fasta') files> <output directory for sequences> <output directory for domains> [mapping file for " +
563          "genome multiple-sequence ('fasta') files not in input dir]" )
564       puts()
565       puts( "  option: -" + EXT_OPTION  + "=<int>: to extend extracted domains" )
566       puts( "          -" + EXTRACT_LINKERS_OPTION  + "      : to extract linkers" )
567       puts()
568       puts( "  " + "Example: \"mse.rb .prot . protein_seqs domain_seqs ../genome_locations.txt\"" )
569       puts()
570     end
571
572   end # class MultiSequenceExtractor
573 end