inprogress
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / multi_sequence_extractor.rb
1 #
2 # = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class
3 #
4 # Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: multi_sequence_extractor.rb,v 1.10 2010/12/13 19:00:11 cmzmasek Exp $
8
9
10 require 'lib/evo/util/constants'
11 require 'lib/evo/util/util'
12 require 'lib/evo/msa/msa'
13 require 'lib/evo/msa/msa_factory'
14 require 'lib/evo/io/msa_io'
15 require 'lib/evo/io/parser/fasta_parser'
16 require 'lib/evo/io/writer/fasta_writer'
17 require 'lib/evo/util/command_line_arguments'
18
19
20
21 module Evoruby
22
23   class MultiSequenceExtractor
24
25     PRG_NAME                           = "mse"
26     PRG_VERSION                        = "1.03"
27     PRG_DESC                           = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
28     PRG_DATE                           = "131127"
29     COPYRIGHT                          = "2008-2013 Christian M Zmasek"
30     CONTACT                            = "phylosoft@gmail.com"
31     WWW                                = "https://sites.google.com/site/cmzmasek/home/software/forester"
32     HELP_OPTION_1                      = 'help'
33     HELP_OPTION_2                      = 'h'
34
35     EXT_OPTION                          = 'e'
36     EXTRACT_LINKERS_OPTION              = 'l'
37     LOG_SUFFIX                          = ".mse_log"
38     LINKERS_SUFFIX                      = ".linkers"
39     FASTA_SUFFIX                        = ".fasta"
40     FASTA_WITH_NORMALIZED_IDS_SUFFIX    = ".ni.fasta"
41     NORMALIZED_IDS_MAP_SUFFIX           = ".nim"
42     PROTEINS_LIST_FILE_SEPARATOR        = "\t"
43
44     def initialize()
45       @file_to_msa = Hash.new
46       @seqs = 0
47     end
48
49     def run()
50
51       Util.print_program_information( PRG_NAME,
52         PRG_VERSION,
53         PRG_DESC ,
54         PRG_DATE,
55         COPYRIGHT,
56         CONTACT,
57         WWW,
58         STDOUT )
59
60       ld = Constants::LINE_DELIMITER
61
62       begin
63         cla = CommandLineArguments.new( ARGV )
64       rescue ArgumentError => e
65         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
66       end
67
68       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
69            cla.is_option_set?( HELP_OPTION_2 ) )
70         print_help
71         exit( 0 )
72       end
73
74       if ( cla.get_number_of_files != 4 && cla.get_number_of_files != 5 )
75         print_help
76         exit( -1 )
77       end
78
79       allowed_opts = Array.new
80       allowed_opts.push(EXT_OPTION)
81       allowed_opts.push(EXTRACT_LINKERS_OPTION)
82
83       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
84       if ( disallowed.length > 0 )
85         Util.fatal_error( PRG_NAME,
86           "unknown option(s): " + disallowed,
87           STDOUT )
88       end
89
90       seq_names_files_suffix = cla.get_file_name( 0 )
91       input_dir              = cla.get_file_name( 1 )
92       out_dir                = cla.get_file_name( 2 )
93       out_dir_doms           = cla.get_file_name( 3 )
94       mapping_file            = nil
95
96       if ( cla.get_number_of_files == 5 )
97         mapping_file = cla.get_file_name( 4 )
98         begin
99           Util.check_file_for_readability( mapping_file )
100         rescue ArgumentError => e
101           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
102         end
103       end
104
105       extension = 0
106       if cla.is_option_set?(EXT_OPTION)
107         extension = cla.get_option_value_as_int(EXT_OPTION)
108         if extension < 0
109           extension = 0
110         end
111       end
112
113       extract_linkers = false
114       if cla.is_option_set?(EXTRACT_LINKERS_OPTION)
115         extract_linkers = true
116       end
117
118       if  !File.exist?( input_dir )
119         Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
120       end
121       if  !File.exist?( out_dir )
122         Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
123       end
124       if  !File.exist?( out_dir_doms )
125         Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" )
126       end
127       if !File.directory?( input_dir )
128         Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
129       end
130       if !File.directory?( out_dir )
131         Util.fatal_error( PRG_NAME, "error:  [#{out_dir}] is not a directory" )
132       end
133       if !File.directory?( out_dir_doms )
134         Util.fatal_error( PRG_NAME, "error:  [#{out_dir_doms}] is not a directory" )
135       end
136
137
138       log = String.new
139
140       log << "Program            : " + PRG_NAME + ld
141       log << "Version            : " + PRG_VERSION + ld
142       log << "Program date       : " + PRG_DATE + ld
143
144       puts()
145       puts( "Sequence names files suffix: " + seq_names_files_suffix )
146       log << "Sequence names files suffix: " + seq_names_files_suffix + ld
147       puts( "Input dir                  : " + input_dir )
148       log << "Input dir                  : " + input_dir + ld
149       puts( "Output dir                 : " + out_dir )
150       log << "Output dir                 : " + out_dir + ld
151       puts( "Output dir domains         : " + out_dir_doms )
152       log << "Output dir domains         : " + out_dir_doms + ld
153       if ( mapping_file != nil )
154         puts( "Mapping file               : " + mapping_file )
155         log << "Mapping file               : " + mapping_file + ld
156       end
157       if extension > 0
158         puts( "Extension                  : " + extension.to_s )
159         log << "Extension                  : " + extension.to_s + ld
160       end
161       if extract_linkers
162         puts( "Extract linkers            : true" )
163         log << "Extract linkers            : true" + ld
164       end
165       log << "Date                       : " + Time.now.to_s + ld
166       puts
167
168       if ( mapping_file != nil )
169         species_codes_to_paths = extract_mappings( mapping_file )
170       end
171
172       input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
173
174       counter = 0
175
176       input_files.each { |input_file|
177         counter += 1
178         puts
179         puts
180         puts counter.to_s + "/" + input_files.size.to_s
181         read_seq_family_file( input_file,
182           seq_names_files_suffix,
183           input_dir,
184           species_codes_to_paths,
185           log,
186           out_dir,
187           out_dir_doms,
188           mapping_file,
189           extension,
190           extract_linkers )
191       }
192       puts
193       Util.print_message( PRG_NAME, "OK" )
194       puts
195
196     end
197
198
199     def read_seq_family_file( input_file,
200         seq_names_files_suffix,
201         input_dir,
202         species_codes_to_paths,
203         log,
204         out_dir,
205         out_dir_doms,
206         mapping_file,
207         extension,
208         extract_linkers )
209
210       begin
211         Util.check_file_for_readability( input_file )
212       rescue ArgumentError => e
213         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
214       end
215       basename = File.basename( input_file, seq_names_files_suffix )
216       out_file_path_fasta_file                = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
217       out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
218       out_file_path_ids_map                   = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX
219       doms_out_file_path_fasta_file           = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains" + FASTA_SUFFIX
220       doms_ext_out_file_path_fasta_file           = nil
221       if extension > 0
222         doms_ext_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains_ext_" + extension.to_s + FASTA_SUFFIX
223       end
224       begin
225         Util.check_file_for_writability( out_file_path_fasta_file )
226         Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
227         Util.check_file_for_writability( out_file_path_ids_map  )
228         Util.check_file_for_writability( doms_out_file_path_fasta_file  )
229       rescue ArgumentError => e
230         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
231       end
232
233       ids_map_writer = nil
234       begin
235         ids_map_writer = File.open( out_file_path_ids_map, 'a' )
236       rescue Exception => e
237         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
238       end
239
240       current_species         = ""
241       current_msa            = nil
242       new_msa                = Msa.new
243       new_msa_normalized_ids = Msa.new
244       new_msa_domains = Msa.new
245       new_msa_domains_extended = Msa.new
246       per_species_counter = 0
247       linkers = ""
248
249       puts basename
250
251       File.open( input_file ) do | file |
252         species_counter = 1
253         while line = file.gets
254           line.strip!
255           if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
256             values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
257             mod_line = nil
258             if ( values.length < 2 )
259               Util.fatal_error( PRG_NAME, "unexpected format: " + line )
260             end
261             species = values[ 0 ]
262             #if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA"  || species == "EIMTE"
263             #  next
264             #end
265             seq_name = values[ 1 ]
266             domain_ranges = nil
267             if ( values.length > 3 )
268               domain_ranges_block = values[ 3 ]
269               domain_ranges = domain_ranges_block.split( "/" )
270             end
271             if ( species != current_species )
272               current_species = species
273               my_file = input_dir + Constants::FILE_SEPARATOR + current_species
274
275               if ( !File.exist?( my_file ) )
276                 if species_codes_to_paths == nil
277                   Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
278                 elsif ( !species_codes_to_paths.has_key?( current_species ) )
279                   Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
280                 end
281                 my_file = species_codes_to_paths[ current_species ]
282               end
283               my_path = File.expand_path( my_file )
284               my_readlink = my_path
285               if ( File.symlink?( my_path ) )
286                 my_readlink = File.readlink( my_path )
287               end
288               current_msa = read_fasta_file( my_file )
289
290               if ( per_species_counter > 0 )
291                 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
292                 per_species_counter = 0
293               end
294               puts " " + species_counter.to_s +  ":" + current_species + " [" + my_readlink + "]"
295               log << species_counter.to_s <<  ": " << current_species << " [" + my_readlink + "]" << Constants::LINE_DELIMITER
296               species_counter += 1
297             end
298             log << "   " << seq_name << Constants::LINE_DELIMITER
299             per_species_counter = per_species_counter + 1
300             seq = nil
301
302             indices = current_msa.find_by_name_start( seq_name, true )
303             if indices.size == 1
304               seq =  current_msa.get_sequence( indices[ 0 ] )
305             elsif indices.size == 0
306               # Not found, try finding by partial match.
307               begin
308                 seq = current_msa.get_by_name( seq_name, true, true )
309               rescue ArgumentError => e
310                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
311               end
312             else
313               Util.fatal_error( PRG_NAME, "error: seq name \"" + seq_name + "\" not unique"  )
314             end
315
316             normalized_id = per_species_counter.to_s( 16 ).upcase +
317              "_" + current_species
318
319             per_species_counter.to_i
320
321             ids_map_writer.write( normalized_id + "\t" + seq.get_name + Constants::LINE_DELIMITER )
322
323             orig_name = nil
324             if seq != nil
325               orig_name = seq.get_name
326               seq.set_name( seq.get_name + " [" + current_species + "]" )
327               new_msa.add_sequence( seq )
328             else
329               Util.fatal_error( PRG_NAME, "unexected error: seq is nil" )
330             end
331
332             if domain_ranges != nil
333               first = true
334               prev_to = -1
335
336               domain_ranges.each { |range|
337                 if range != nil && range.length > 0
338                   s = range.split("-")
339                   from = s[ 0 ].to_i
340                   to = s[ 1 ].to_i
341                   new_msa_domains.add_sequence( Sequence.new( orig_name +
342                        " [" + from.to_s +
343                        "-" + to.to_s +
344                        "] [" + basename + "] [" +
345                        current_species + "]",
346                       seq.get_sequence_as_string[from..to] ) )
347                   if extension > 0
348                     from_e = from - extension
349                     if from_e < 0
350                       from_e = 0
351                     end
352                     to_e = to + extension
353                     if to_e > seq.get_sequence_as_string.length - 1
354                       to_e = seq.get_sequence_as_string.length - 1
355                     end
356                     new_msa_domains_extended.add_sequence( Sequence.new( orig_name +
357                          " [" + from.to_s +
358                          "-" + to.to_s  +
359                          "] [extended by " +
360                          extension.to_s +
361                          "] [" + basename + "] [" +
362                          current_species + "]",
363                         seq.get_sequence_as_string[ from_e..to_e ] ) )
364                   end # extension > 0
365                   if  extract_linkers
366                     if first
367                       first = false
368                       f = 0
369                       t = from - 1
370                       if extension > 0
371                         f = from - extension
372                       end
373                       mod_line = line + "\t[" + get_linker_sequence( f, t, seq ) + "|"
374                     else
375                       mod_line += get_linker_sequence( prev_to + 1, from - 1, seq ) + "|"
376                     end
377                     prev_to = to
378                   end
379                 end # range != nil && range.length > 0
380               }
381               if extract_linkers && prev_to > 0
382                 f = prev_to + 1
383                 t = seq.get_sequence_as_string.length - 1
384                 if extension > 0
385                   t = prev_to + extension
386                 end
387                 mod_line += get_linker_sequence( f, t, seq ) + "]"
388               end
389             end
390
391             new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
392             if mod_line
393               linkers << mod_line + Constants::LINE_DELIMITER
394             end
395           end # !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
396         end # while line = file.gets
397
398       end
399
400       ids_map_writer.close
401
402       if ( per_species_counter > 0 )
403         print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
404       end
405
406       io = MsaIO.new()
407
408       fasta_writer = FastaWriter.new()
409       fasta_writer.remove_gap_chars
410       fasta_writer.clean
411
412       begin
413         io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
414       rescue Exception => e
415         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
416       end
417
418       if new_msa_domains != nil
419         begin
420           io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer )
421         rescue Exception => e
422           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
423         end
424       end
425
426       if extension > 0 && new_msa_domains_extended != nil
427         begin
428           io.write_to_file( new_msa_domains_extended, doms_ext_out_file_path_fasta_file, fasta_writer )
429         rescue Exception => e
430           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
431         end
432       end
433
434       begin
435         io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
436       rescue Exception => e
437         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
438       end
439
440       begin
441         f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LOG_SUFFIX , 'a' )
442         f.print( log )
443         f.close
444       rescue Exception => e
445         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
446       end
447
448       if extract_linkers && linkers != nil
449         begin
450           f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LINKERS_SUFFIX , 'a' )
451           f.print( linkers )
452           f.close
453         rescue Exception => e
454           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
455         end
456       end
457     end
458
459
460     def get_linker_sequence( from, to, seq )
461       if from < 0
462         from = 0
463       end
464       if to > seq.get_sequence_as_string.length - 1
465         to = seq.get_sequence_as_string.length - 1
466       end
467       if from > to
468         return ""
469       else
470         return from.to_s + "-" + to.to_s + ":" + seq.get_sequence_as_string[ from..to ]
471       end
472     end
473
474     def obtain_inputfiles( input_dir, seq_names_files_suffix )
475       input_files = Array.new()
476       Dir.foreach( input_dir ) { |file_name|
477         if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
478           input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
479         end
480       }
481       input_files
482     end
483
484     def extract_mappings( mapping_file )
485       species_code_to_path = Hash.new()
486       File.open( mapping_file ) do | file |
487         while line = file.gets
488           if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
489             if ( line =~ /(\S+)\s+(\S+)/ )
490               species = $1
491               path = $2
492               if ( species_code_to_path.has_key?( species ) )
493                 Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
494               end
495               if ( species_code_to_path.has_value?( path ) )
496                 Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
497               end
498               if ( !File.exist?( path ) )
499                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
500               end
501               if ( !File.file?( path ) )
502                 Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
503               end
504               if ( !File.readable?( path ) )
505                 Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
506               end
507               if ( File.size( path ) < 10000 )
508                 Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
509               end
510               if ( !Util.looks_like_fasta?( path ) )
511                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
512               end
513               species_code_to_path[ species ] = path
514               puts species + " -> " + path
515             end
516           end
517         end
518       end
519       species_code_to_path
520     end
521
522     def print_counts( per_species_counter, log, ld )
523       puts "   sum: " + per_species_counter.to_s
524       log << "   sum: " + per_species_counter.to_s + ld
525     end
526
527     def read_fasta_file( input )
528       if @file_to_msa.has_key?( input )
529         return @file_to_msa[ input ]
530       end
531
532       f = MsaFactory.new()
533       msa = nil
534       begin
535         msa = f.create_msa_from_file( input, FastaParser.new() )
536       rescue Exception => e
537         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
538       end
539       if @seqs <= 400000
540         @file_to_msa[ input ] = msa
541         @seqs += msa.get_number_of_seqs
542         puts "   total seqs in memory: " + @seqs.to_s
543       end
544       msa
545     end
546
547     def print_help()
548       puts( "Usage:" )
549       puts()
550       puts( "  " + PRG_NAME + ".rb <sequence names file suffix> <input dir containing sequence names files " +
551          "and possibly genome multiple-sequence ('fasta') files> <output directory for sequences> <output directory for domains> [mapping file for " +
552          "genome multiple-sequence ('fasta') files not in input dir]" )
553       puts()
554       puts( "  option: -" + EXT_OPTION  + "=<int>: to extend extracted domains" )
555       puts( "          -" + EXTRACT_LINKERS_OPTION  + "      : to extract linkers" )
556       puts()
557       puts( "  " + "Example: \"mse.rb .prot . protein_seqs domain_seqs ../genome_locations.txt\"" )
558       puts()
559     end
560
561   end # class MultiSequenceExtractor
562 end