inprogress
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / multi_sequence_extractor.rb
1 #
2 # = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class
3 #
4 # Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: multi_sequence_extractor.rb,v 1.10 2010/12/13 19:00:11 cmzmasek Exp $
8
9
10 require 'lib/evo/util/constants'
11 require 'lib/evo/util/util'
12 require 'lib/evo/msa/msa'
13 require 'lib/evo/msa/msa_factory'
14 require 'lib/evo/io/msa_io'
15 require 'lib/evo/io/parser/fasta_parser'
16 require 'lib/evo/io/writer/fasta_writer'
17 require 'lib/evo/util/command_line_arguments'
18
19
20
21 module Evoruby
22
23   class MultiSequenceExtractor
24
25     PRG_NAME                           = "mse"
26     PRG_VERSION                        = "1.03"
27     PRG_DESC                           = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
28     PRG_DATE                           = "131127"
29     COPYRIGHT                          = "2008-2013 Christian M Zmasek"
30     CONTACT                            = "phylosoft@gmail.com"
31     WWW                                = "https://sites.google.com/site/cmzmasek/home/software/forester"
32     HELP_OPTION_1                      = 'help'
33     HELP_OPTION_2                      = 'h'
34
35     EXT_OPTION                          = 'e'
36     EXTRACT_LINKERS_OPTION              = 'l'
37     LOG_SUFFIX                          = ".mse_log"
38     LINKERS_SUFFIX                      = ".linkers"
39     FASTA_SUFFIX                        = ".fasta"
40     FASTA_WITH_NORMALIZED_IDS_SUFFIX    = ".ni.fasta"
41     NORMALIZED_IDS_MAP_SUFFIX           = ".nim"
42     PROTEINS_LIST_FILE_SEPARATOR        = "\t"
43
44     def initialize()
45       @file_to_msa = Hash.new
46       @seqs = 0
47     end
48
49     def run()
50
51       Util.print_program_information( PRG_NAME,
52         PRG_VERSION,
53         PRG_DESC ,
54         PRG_DATE,
55         COPYRIGHT,
56         CONTACT,
57         WWW,
58         STDOUT )
59
60       ld = Constants::LINE_DELIMITER
61
62       begin
63         cla = CommandLineArguments.new( ARGV )
64       rescue ArgumentError => e
65         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
66       end
67
68       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
69            cla.is_option_set?( HELP_OPTION_2 ) )
70         print_help
71         exit( 0 )
72       end
73
74       if ( cla.get_number_of_files != 4 && cla.get_number_of_files != 5 )
75         print_help
76         exit( -1 )
77       end
78
79       allowed_opts = Array.new
80       allowed_opts.push(EXT_OPTION)
81       allowed_opts.push(EXTRACT_LINKERS_OPTION)
82
83       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
84       if ( disallowed.length > 0 )
85         Util.fatal_error( PRG_NAME,
86           "unknown option(s): " + disallowed,
87           STDOUT )
88       end
89
90       seq_names_files_suffix = cla.get_file_name( 0 )
91       input_dir              = cla.get_file_name( 1 )
92       out_dir                = cla.get_file_name( 2 )
93       out_dir_doms           = cla.get_file_name( 3 )
94       mapping_file            = nil
95
96       if ( cla.get_number_of_files == 5 )
97         mapping_file = cla.get_file_name( 4 )
98         begin
99           Util.check_file_for_readability( mapping_file )
100         rescue ArgumentError => e
101           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
102         end
103       end
104
105       extension = 0
106       if cla.is_option_set?(EXT_OPTION)
107         extension = cla.get_option_value_as_int(EXT_OPTION)
108         if extension < 0
109           extension = 0
110         end
111       end
112
113       extract_linkers = false
114       if cla.is_option_set?(EXTRACT_LINKERS_OPTION)
115         extract_linkers = true
116       end
117
118       if  !File.exist?( input_dir )
119         Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
120       end
121       if  !File.exist?( out_dir )
122         Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
123       end
124       if  !File.exist?( out_dir_doms )
125         Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" )
126       end
127       if !File.directory?( input_dir )
128         Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
129       end
130       if !File.directory?( out_dir )
131         Util.fatal_error( PRG_NAME, "error:  [#{out_dir}] is not a directory" )
132       end
133       if !File.directory?( out_dir_doms )
134         Util.fatal_error( PRG_NAME, "error:  [#{out_dir_doms}] is not a directory" )
135       end
136
137
138       log = String.new
139
140       log << "Program            : " + PRG_NAME + ld
141       log << "Version            : " + PRG_VERSION + ld
142       log << "Program date       : " + PRG_DATE + ld
143
144       puts()
145       puts( "Sequence names files suffix: " + seq_names_files_suffix )
146       log << "Sequence names files suffix: " + seq_names_files_suffix + ld
147       puts( "Input dir                  : " + input_dir )
148       log << "Input dir                  : " + input_dir + ld
149       puts( "Output dir                 : " + out_dir )
150       log << "Output dir                 : " + out_dir + ld
151       puts( "Output dir domains         : " + out_dir_doms )
152       log << "Output dir domains         : " + out_dir_doms + ld
153       if ( mapping_file != nil )
154         puts( "Mapping file               : " + mapping_file )
155         log << "Mapping file               : " + mapping_file + ld
156       end
157       if extension > 0
158         puts( "Extension                  : " + extension.to_s )
159         log << "Extension                  : " + extension.to_s + ld
160       end
161       if extract_linkers
162         puts( "Extract linkers            : true" )
163         log << "Extract linkers            : true" + ld
164       end
165       log << "Date                       : " + Time.now.to_s + ld
166       puts
167
168       if ( mapping_file != nil )
169         species_codes_to_paths = extract_mappings( mapping_file )
170       end
171
172       input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
173
174       counter = 0
175
176       input_files.each { |input_file|
177         counter += 1
178         puts
179         puts
180         puts counter.to_s + "/" + input_files.size.to_s
181         read_seq_family_file( input_file,
182           seq_names_files_suffix,
183           input_dir,
184           species_codes_to_paths,
185           log,
186           out_dir,
187           out_dir_doms,
188           mapping_file,
189           extension,
190           extract_linkers )
191       }
192       puts
193       Util.print_message( PRG_NAME, "OK" )
194       puts
195
196     end
197
198
199     def read_seq_family_file( input_file,
200         seq_names_files_suffix,
201         input_dir,
202         species_codes_to_paths,
203         log,
204         out_dir,
205         out_dir_doms,
206         mapping_file,
207         extension,
208         extract_linkers )
209
210       begin
211         Util.check_file_for_readability( input_file )
212       rescue ArgumentError => e
213         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
214       end
215       basename = File.basename( input_file, seq_names_files_suffix )
216       out_file_path_fasta_file                = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
217       out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
218       out_file_path_ids_map                   = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX
219       doms_out_file_path_fasta_file           = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains" + FASTA_SUFFIX
220       doms_ext_out_file_path_fasta_file           = nil
221       if extension > 0
222         doms_ext_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains_ext_" + extension.to_s + FASTA_SUFFIX
223       end
224       begin
225         Util.check_file_for_writability( out_file_path_fasta_file )
226         Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
227         Util.check_file_for_writability( out_file_path_ids_map  )
228         Util.check_file_for_writability( doms_out_file_path_fasta_file  )
229       rescue ArgumentError => e
230         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
231       end
232
233       ids_map_writer = nil
234       begin
235         ids_map_writer = File.open( out_file_path_ids_map, 'a' )
236       rescue Exception => e
237         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
238       end
239
240       current_species         = ""
241       current_msa            = nil
242       new_msa                = Msa.new
243       new_msa_normalized_ids = Msa.new
244       new_msa_domains = Msa.new
245       new_msa_domains_extended = Msa.new
246       per_species_counter = 0
247       linkers = ""
248
249       puts basename
250
251       File.open( input_file ) do | file |
252         while line = file.gets
253           line.strip!
254           if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
255             values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
256             mod_line = nil
257             if ( values.length < 2 )
258               Util.fatal_error( PRG_NAME, "unexpected format: " + line )
259             end
260             species = values[ 0 ]
261             #if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA"  || species == "EIMTE"
262             #  next
263             #end
264             seq_name = values[ 1 ]
265             domain_ranges = nil
266             if ( values.length > 3 )
267               domain_ranges_block = values[ 3 ]
268               domain_ranges = domain_ranges_block.split( "/" )
269             end
270             if ( species != current_species )
271               current_species = species
272               my_file = input_dir + Constants::FILE_SEPARATOR + current_species
273
274               if ( !File.exist?( my_file ) )
275                 if species_codes_to_paths == nil
276                   Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
277                 elsif ( !species_codes_to_paths.has_key?( current_species ) )
278                   Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
279                 end
280                 my_file = species_codes_to_paths[ current_species ]
281               end
282               my_path = File.expand_path( my_file )
283               my_readlink = my_path
284               if ( File.symlink?( my_path ) )
285                 my_readlink = File.readlink( my_path )
286               end
287               current_msa = read_fasta_file( my_file )
288
289               if ( per_species_counter > 0 )
290                 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
291                 per_species_counter = 0
292               end
293               puts " " + current_species + " [" + my_readlink + "]"
294               log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER
295             end
296             puts "   " + seq_name
297             log << "   " + seq_name + Constants::LINE_DELIMITER
298             per_species_counter = per_species_counter + 1
299             seq = nil
300
301             indices = current_msa.find_by_name_start( seq_name, true )
302             if indices.size == 1
303               seq =  current_msa.get_sequence( indices[ 0 ] )
304             elsif indices.size == 0
305               # Not found, try finding by partial match.
306               begin
307                 seq = current_msa.get_by_name( seq_name, true, true )
308               rescue ArgumentError => e
309                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
310               end
311             else
312               Util.fatal_error( PRG_NAME, "error: seq name \"" + seq_name + "\" not unique"  )
313             end
314
315             # if current_msa.find_by_name_start( seq_name, true ).size > 0
316             #   begin
317             #     seq = current_msa.get_by_name_start( seq_name, true ).copy
318             #   rescue ArgumentError => e
319             #     Util.fatal_error( PRG_NAME, "error: " + e.to_s )
320             #   end
321             # else
322             #   # Not found, try finding by partial match.
323             #   begin
324             #     seq = current_msa.get_by_name( seq_name, true, true )
325             #   rescue ArgumentError => e
326             #     Util.fatal_error( PRG_NAME, "error: " + e.to_s )
327             #   end
328             # end
329
330             normalized_id = per_species_counter.to_s( 16 ).upcase +
331              "_" + current_species
332
333             per_species_counter.to_i
334
335             ids_map_writer.write( normalized_id + "\t" + seq.get_name + Constants::LINE_DELIMITER )
336
337             orig_name = nil
338             if seq != nil
339               orig_name = seq.get_name
340               seq.set_name( seq.get_name + " [" + current_species + "]" )
341               new_msa.add_sequence( seq )
342             else
343               Util.fatal_error( PRG_NAME, "unexected error: seq is nil" )
344             end
345
346             if domain_ranges != nil
347               first = true
348               prev_to = -1
349
350               domain_ranges.each { |range|
351                 if range != nil && range.length > 0
352                   s = range.split("-")
353                   from = s[ 0 ].to_i
354                   to = s[ 1 ].to_i
355                   new_msa_domains.add_sequence( Sequence.new( orig_name +
356                        " [" + from.to_s +
357                        "-" + to.to_s +
358                        "] [" + basename + "] [" +
359                        current_species + "]",
360                       seq.get_sequence_as_string[from..to] ) )
361                   if extension > 0
362                     from_e = from - extension
363                     if from_e < 0
364                       from_e = 0
365                     end
366                     to_e = to + extension
367                     if to_e > seq.get_sequence_as_string.length - 1
368                       to_e = seq.get_sequence_as_string.length - 1
369                     end
370                     new_msa_domains_extended.add_sequence( Sequence.new( orig_name +
371                          " [" + from.to_s +
372                          "-" + to.to_s  +
373                          "] [extended by " +
374                          extension.to_s +
375                          "] [" + basename + "] [" +
376                          current_species + "]",
377                         seq.get_sequence_as_string[ from_e..to_e ] ) )
378                   end # extension > 0
379                   if  extract_linkers
380                     if first
381                       first = false
382                       f = 0
383                       t = from - 1
384                       if extension > 0
385                         f = from - extension
386                       end
387                       mod_line = line + "\t[" + get_linker_sequence( f, t, seq ) + "|"
388                     else
389                       mod_line += get_linker_sequence( prev_to + 1, from - 1, seq ) + "|"
390                     end
391                     prev_to = to
392                   end
393                 end # range != nil && range.length > 0
394               }
395               if extract_linkers && prev_to > 0
396                 f = prev_to + 1
397                 t = seq.get_sequence_as_string.length - 1
398                 if extension > 0
399                   t = prev_to + extension
400                 end
401                 mod_line += get_linker_sequence( f, t, seq ) + "]"
402               end
403             end
404
405             new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
406             if mod_line
407               linkers << mod_line + Constants::LINE_DELIMITER
408             end
409           end # !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
410         end # while line = file.gets
411
412       end
413
414       ids_map_writer.close
415
416       if ( per_species_counter > 0 )
417         print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
418       end
419
420       io = MsaIO.new()
421
422       fasta_writer = FastaWriter.new()
423       fasta_writer.remove_gap_chars
424       fasta_writer.clean
425
426       begin
427         io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
428       rescue Exception => e
429         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
430       end
431
432       if new_msa_domains != nil
433         begin
434           io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer )
435         rescue Exception => e
436           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
437         end
438       end
439
440       if extension > 0 && new_msa_domains_extended != nil
441         begin
442           io.write_to_file( new_msa_domains_extended, doms_ext_out_file_path_fasta_file, fasta_writer )
443         rescue Exception => e
444           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
445         end
446       end
447
448       begin
449         io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
450       rescue Exception => e
451         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
452       end
453
454       begin
455         f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LOG_SUFFIX , 'a' )
456         f.print( log )
457         f.close
458       rescue Exception => e
459         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
460       end
461
462       if extract_linkers && linkers != nil
463         begin
464           f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LINKERS_SUFFIX , 'a' )
465           f.print( linkers )
466           f.close
467         rescue Exception => e
468           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
469         end
470       end
471     end
472
473
474     def get_linker_sequence( from, to, seq )
475       if from < 0
476         from = 0
477       end
478       if to > seq.get_sequence_as_string.length - 1
479         to = seq.get_sequence_as_string.length - 1
480       end
481       if from > to
482         return ""
483       else
484         return from.to_s + "-" + to.to_s + ":" + seq.get_sequence_as_string[ from..to ]
485       end
486     end
487
488     def obtain_inputfiles( input_dir, seq_names_files_suffix )
489       input_files = Array.new()
490       Dir.foreach( input_dir ) { |file_name|
491         if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
492           input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
493         end
494       }
495       input_files
496     end
497
498     def extract_mappings( mapping_file )
499       species_code_to_path = Hash.new()
500       File.open( mapping_file ) do | file |
501         while line = file.gets
502           if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
503             if ( line =~ /(\S+)\s+(\S+)/ )
504               species = $1
505               path = $2
506               if ( species_code_to_path.has_key?( species ) )
507                 Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
508               end
509               if ( species_code_to_path.has_value?( path ) )
510                 Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
511               end
512               if ( !File.exist?( path ) )
513                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
514               end
515               if ( !File.file?( path ) )
516                 Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
517               end
518               if ( !File.readable?( path ) )
519                 Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
520               end
521               if ( File.size( path ) < 10000 )
522                 Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
523               end
524               if ( !Util.looks_like_fasta?( path ) )
525                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
526               end
527               species_code_to_path[ species ] = path
528               puts species + " -> " + path
529             end
530           end
531         end
532       end
533       species_code_to_path
534     end
535
536     def print_counts( per_species_counter, log, ld )
537       puts "   [sum: " + per_species_counter.to_s + "]"
538       log << "   [sum: " + per_species_counter.to_s + "]" + ld
539     end
540
541     def read_fasta_file( input )
542       if @file_to_msa.has_key?( input )
543         return @file_to_msa[ input ]
544       end
545
546       f = MsaFactory.new()
547       msa = nil
548       begin
549         msa = f.create_msa_from_file( input, FastaParser.new() )
550       rescue Exception => e
551         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
552       end
553       if @seqs <= 10000000
554         @file_to_msa[ input ] = msa
555         @seqs += msa.get_number_of_seqs
556         puts "total seqs in memory: " + @seqs.to_str
557       end
558       msa
559     end
560
561     def print_help()
562       puts( "Usage:" )
563       puts()
564       puts( "  " + PRG_NAME + ".rb <sequence names file suffix> <input dir containing sequence names files " +
565          "and possibly genome multiple-sequence ('fasta') files> <output directory for sequences> <output directory for domains> [mapping file for " +
566          "genome multiple-sequence ('fasta') files not in input dir]" )
567       puts()
568       puts( "  option: -" + EXT_OPTION  + "=<int>: to extend extracted domains" )
569       puts( "          -" + EXTRACT_LINKERS_OPTION  + "      : to extract linkers" )
570       puts()
571       puts( "  " + "Example: \"mse.rb .prot . protein_seqs domain_seqs ../genome_locations.txt\"" )
572       puts()
573     end
574
575   end # class MultiSequenceExtractor
576 end