linkers!
[jalview.git] / forester / ruby / evoruby / lib / evo / apps / multi_sequence_extractor.rb
1 #
2 # = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class
3 #
4 # Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: multi_sequence_extractor.rb,v 1.10 2010/12/13 19:00:11 cmzmasek Exp $
8
9
10 require 'lib/evo/util/constants'
11 require 'lib/evo/util/util'
12 require 'lib/evo/msa/msa'
13 require 'lib/evo/msa/msa_factory'
14 require 'lib/evo/io/msa_io'
15 require 'lib/evo/io/parser/fasta_parser'
16 require 'lib/evo/io/writer/fasta_writer'
17 require 'lib/evo/util/command_line_arguments'
18
19
20
21 module Evoruby
22
23   class MultiSequenceExtractor
24
25     PRG_NAME                           = "mse"
26     PRG_VERSION                        = "1.01"
27     PRG_DESC                           = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
28     PRG_DATE                           = "2012.07.19"
29     COPYRIGHT                          = "2008-2012 Christian M Zmasek"
30     CONTACT                            = "phylosoft@gmail.com"
31     WWW                                = "www.phylosoft.org"
32     HELP_OPTION_1                      = 'help'
33     HELP_OPTION_2                      = 'h'
34
35     EXT_OPTION                          = 'e'
36     EXTRACT_LINKERS_OPTION              = 'l'
37     LOG_SUFFIX                          = ".mse_log"
38     FASTA_SUFFIX                        = ".fasta"
39     FASTA_WITH_NORMALIZED_IDS_SUFFIX    = ".ni.fasta"
40     NORMALIZED_IDS_MAP_SUFFIX           = ".nim"
41     PROTEINS_LIST_FILE_SEPARATOR        = "\t"
42
43
44     def run()
45
46       Util.print_program_information( PRG_NAME,
47         PRG_VERSION,
48         PRG_DESC ,
49         PRG_DATE,
50         COPYRIGHT,
51         CONTACT,
52         WWW,
53         STDOUT )
54
55       ld = Constants::LINE_DELIMITER
56
57       begin
58         cla = CommandLineArguments.new( ARGV )
59       rescue ArgumentError => e
60         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
61       end
62
63       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
64            cla.is_option_set?( HELP_OPTION_2 ) )
65         print_help
66         exit( 0 )
67       end
68
69       if ( cla.get_number_of_files != 4 && cla.get_number_of_files != 5 )
70         print_help
71         exit( -1 )
72       end
73
74       allowed_opts = Array.new
75       allowed_opts.push(EXT_OPTION)
76       allowed_opts.push(EXTRACT_LINKERS_OPTION)
77
78       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
79       if ( disallowed.length > 0 )
80         Util.fatal_error( PRG_NAME,
81           "unknown option(s): " + disallowed,
82           STDOUT )
83       end
84
85       seq_names_files_suffix = cla.get_file_name( 0 )
86       input_dir              = cla.get_file_name( 1 )
87       out_dir                = cla.get_file_name( 2 )
88       out_dir_doms           = cla.get_file_name( 3 )
89       mapping_file            = nil
90
91       if ( cla.get_number_of_files == 5 )
92         mapping_file = cla.get_file_name( 4 )
93         begin
94           Util.check_file_for_readability( mapping_file )
95         rescue ArgumentError => e
96           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
97         end
98       end
99
100       extension = 0
101       if cla.is_option_set?(EXT_OPTION)
102         extension = cla.get_option_value_as_int(EXT_OPTION)
103         if extension < 0
104           extension = 0
105         end
106       end
107
108       extract_linkers = false
109       if cla.is_option_set?(EXTRACT_LINKERS_OPTION)
110         extract_linkers = true
111       end
112
113       if  !File.exist?( input_dir )
114         Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
115       end
116       if  !File.exist?( out_dir )
117         Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
118       end
119       if  !File.exist?( out_dir_doms )
120         Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" )
121       end
122       if !File.directory?( input_dir )
123         Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
124       end
125       if !File.directory?( out_dir )
126         Util.fatal_error( PRG_NAME, "error:  [#{out_dir}] is not a directory" )
127       end
128       if !File.directory?( out_dir_doms )
129         Util.fatal_error( PRG_NAME, "error:  [#{out_dir_doms}] is not a directory" )
130       end
131
132
133       log = String.new
134
135       log << "Program            : " + PRG_NAME + ld
136       log << "Version            : " + PRG_VERSION + ld
137       log << "Program date       : " + PRG_DATE + ld
138
139       puts()
140       puts( "Sequence names files suffix: " + seq_names_files_suffix )
141       log << "Sequence names files suffix: " + seq_names_files_suffix + ld
142       puts( "Input dir                  : " + input_dir )
143       log << "Input dir                  : " + input_dir + ld
144       puts( "Output dir                 : " + out_dir )
145       log << "Output dir                 : " + out_dir + ld
146       puts( "Output dir domains         : " + out_dir_doms )
147       log << "Output dir domains         : " + out_dir_doms + ld
148       if ( mapping_file != nil )
149         puts( "Mapping file               : " + mapping_file )
150         log << "Mapping file               : " + mapping_file + ld
151       end
152       if ( extension > 0 )
153         puts( "Extension                  : " + extension.to_s )
154         log << "Extension                  : " + extension.to_s + ld
155       end
156       log << "Date                       : " + Time.now.to_s + ld
157       puts
158
159       if ( mapping_file != nil )
160         species_codes_to_paths = extract_mappings( mapping_file )
161       end
162
163       input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
164
165       counter = 0
166
167       input_files.each { |input_file|
168         counter += 1
169         puts
170         puts
171         puts counter.to_s + "/" + input_files.size.to_s
172         read_seq_family_file( input_file,
173           seq_names_files_suffix,
174           input_dir,
175           species_codes_to_paths,
176           log,
177           out_dir,
178           out_dir_doms,
179           mapping_file,
180           extension,
181           extract_linkers )
182       }
183       puts
184       Util.print_message( PRG_NAME, "OK" )
185       puts
186
187     end
188
189
190     def read_seq_family_file( input_file,
191         seq_names_files_suffix,
192         input_dir,
193         species_codes_to_paths,
194         log,
195         out_dir,
196         out_dir_doms,
197         mapping_file,
198         extension,
199         extract_linkers )
200
201       begin
202         Util.check_file_for_readability( input_file )
203       rescue ArgumentError => e
204         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
205       end
206       basename = File.basename( input_file, seq_names_files_suffix )
207       out_file_path_fasta_file                = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
208       out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
209       out_file_path_ids_map                   = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX
210       doms_out_file_path_fasta_file           = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains" + FASTA_SUFFIX
211       doms_ext_out_file_path_fasta_file           = nil
212       if extension > 0
213         doms_ext_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains_ext_" + extension.to_s + FASTA_SUFFIX
214       end
215       begin
216         Util.check_file_for_writability( out_file_path_fasta_file )
217         Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
218         Util.check_file_for_writability( out_file_path_ids_map  )
219         Util.check_file_for_writability( doms_out_file_path_fasta_file  )
220       rescue ArgumentError => e
221         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
222       end
223
224       ids_map_writer = nil
225       begin
226         ids_map_writer = File.open( out_file_path_ids_map, 'a' )
227       rescue Exception => e
228         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
229       end
230
231       current_species         = ""
232       current_msa            = nil
233       new_msa                = Msa.new
234       new_msa_normalized_ids = Msa.new
235       new_msa_domains = Msa.new
236       new_msa_domains_extended = Msa.new
237       per_species_counter = 0
238
239       puts basename
240
241       File.open( input_file ) do | file |
242         while line = file.gets
243           line.strip!
244           if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
245             values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
246             mod_line = nil
247             if ( values.length < 2 )
248               Util.fatal_error( PRG_NAME, "unexpected format: " + line )
249             end
250             species = values[ 0 ]
251             if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA"  || species == "EIMTE"
252               next
253             end
254             seq_name = values[ 1 ]
255             domain_ranges = nil
256             if ( values.length > 3 )
257               domain_ranges_block = values[ 3 ]
258               domain_ranges = domain_ranges_block.split( "/" )
259             end
260             if ( species != current_species )
261               current_species = species
262               my_file = input_dir + Constants::FILE_SEPARATOR + current_species
263
264               if ( !File.exist?( my_file ) )
265                 if species_codes_to_paths == nil
266                   Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
267                 elsif ( !species_codes_to_paths.has_key?( current_species ) )
268                   Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
269                 end
270                 my_file = species_codes_to_paths[ current_species ]
271               end
272               my_path = File.expand_path( my_file )
273               my_readlink = my_path
274               if ( File.symlink?( my_path ) )
275                 my_readlink = File.readlink( my_path )
276               end
277               current_msa = read_fasta_file( my_file )
278
279               if ( per_species_counter > 0 )
280                 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
281                 per_species_counter = 0
282               end
283               puts " " + current_species + " [" + my_readlink + "]"
284               log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER
285             end
286             puts "   " + seq_name
287             log << "   " + seq_name + Constants::LINE_DELIMITER
288             per_species_counter = per_species_counter + 1
289             seq = nil
290
291             if current_msa.find_by_name_start( seq_name, true ).size > 0
292               begin
293                 seq = current_msa.get_by_name_start( seq_name, true ).copy
294               rescue ArgumentError => e
295                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
296               end
297             else
298               # Not found, try finding by partial match.
299               begin
300                 seq = current_msa.get_by_name( seq_name, true, true )
301               rescue ArgumentError => e
302                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
303               end
304             end
305
306             normalized_id = per_species_counter.to_s( 16 ).upcase +
307              "_" + current_species
308
309             per_species_counter.to_i
310
311             ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER )
312
313             orig_name = nil
314             if seq != nil
315               orig_name = seq.get_name
316               seq.set_name( seq.get_name + " [" + current_species + "]" )
317               new_msa.add_sequence( seq )
318             else
319               Util.fatal_error( PRG_NAME, "unexected error: seq is nil" )
320             end
321
322             if domain_ranges != nil
323               first = true
324               prev_to = -1
325
326               domain_ranges.each { |range|
327                 if range != nil && range.length > 0
328                   s = range.split("-")
329                   from = s[ 0 ].to_i
330                   to = s[ 1 ].to_i
331                   new_msa_domains.add_sequence( Sequence.new( orig_name +
332                        " [" + from.to_s +
333                        "-" + to.to_s +
334                        "] [" + basename + "] [" +
335                        current_species + "]",
336                       seq.get_sequence_as_string[from..to] ) )
337                   if extension > 0
338                     from_e = from - extension
339                     if from_e < 0
340                       from_e = 0
341                     end
342                     to_e = to + extension
343                     if to_e > seq.get_sequence_as_string.length - 1
344                       to_e = seq.get_sequence_as_string.length - 1
345                     end
346                     new_msa_domains_extended.add_sequence( Sequence.new( orig_name +
347                          " [" + from.to_s +
348                          "-" + to.to_s  +
349                          "] [extended by " +
350                          extension.to_s +
351                          "] [" + basename + "] [" +
352                          current_species + "]",
353                         seq.get_sequence_as_string[ from_e..to_e ] ) )
354                   end # extension > 0
355                   if  extract_linkers
356                     if first
357                       first = false
358                       f = 0
359                       t = from - 1
360                       if extension > 0
361                         f = t - extension
362                       end
363                       mod_line = line + "\t[" + get_linker_sequence( f, t, seq ) + "|"
364                     else
365                       mod_line += get_linker_sequence( prev_to + 1, from - 1, seq ) + "|"
366                     end
367                     prev_to = to
368                   end
369                 end # range != nil && range.length > 0
370               }
371               if extract_linkers && prev_to > 0
372                 f = prev_to + 1
373                 t = seq.get_sequence_as_string.length - 1
374                 if extension > 0
375                   t = f + extension
376                 end
377                 mod_line += get_linker_sequence( f, t, seq ) + "]"
378               end
379             end
380
381             new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
382             if mod_line
383               puts mod_line
384             end
385           end # !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
386         end # while line = file.gets
387
388       end
389
390       ids_map_writer.close
391
392       if ( per_species_counter > 0 )
393         print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
394       end
395
396       io = MsaIO.new()
397
398       fasta_writer = FastaWriter.new()
399       fasta_writer.remove_gap_chars
400       fasta_writer.clean
401
402       begin
403         io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
404       rescue Exception => e
405         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
406       end
407
408       if new_msa_domains != nil
409         begin
410           io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer )
411         rescue Exception => e
412           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
413         end
414       end
415
416       if extension > 0 && new_msa_domains_extended != nil
417         begin
418           io.write_to_file( new_msa_domains_extended, doms_ext_out_file_path_fasta_file, fasta_writer )
419         rescue Exception => e
420           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
421         end
422       end
423
424       begin
425         io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
426       rescue Exception => e
427         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
428       end
429
430       begin
431         f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LOG_SUFFIX , 'a' )
432         f.print( log )
433         f.close
434       rescue Exception => e
435         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
436       end
437
438     end
439
440
441     def get_linker_sequence( from, to, seq )
442       if from < 0
443         from = 0
444       end
445       if to > seq.get_sequence_as_string.length - 1
446         to = seq.get_sequence_as_string.length - 1
447       end
448       if from > to
449         return ""
450       else
451         return from.to_s + "-" + to.to_s + ":" + seq.get_sequence_as_string[ from..to ]
452       end
453     end
454
455     def obtain_inputfiles( input_dir, seq_names_files_suffix )
456       input_files = Array.new()
457       Dir.foreach( input_dir ) { |file_name|
458         if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
459           input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
460         end
461       }
462       input_files
463     end
464
465     def extract_mappings( mapping_file )
466       species_code_to_path = Hash.new()
467       File.open( mapping_file ) do | file |
468         while line = file.gets
469           if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
470             if ( line =~ /(\S+)\s+(\S+)/ )
471               species = $1
472               path = $2
473               if ( species_code_to_path.has_key?( species ) )
474                 Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
475               end
476               if ( species_code_to_path.has_value?( path ) )
477                 Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
478               end
479               if ( !File.exist?( path ) )
480                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
481               end
482               if ( !File.file?( path ) )
483                 Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
484               end
485               if ( !File.readable?( path ) )
486                 Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
487               end
488               if ( File.size( path ) < 10000 )
489                 Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
490               end
491               if ( !Util.looks_like_fasta?( path ) )
492                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
493               end
494               species_code_to_path[ species ] = path
495               puts species + " -> " + path
496             end
497           end
498         end
499       end
500       species_code_to_path
501     end
502
503     def print_counts( per_species_counter, log, ld )
504       puts "   [sum: " + per_species_counter.to_s + "]"
505       log << "   [sum: " + per_species_counter.to_s + "]" + ld
506     end
507
508     def read_fasta_file( input )
509       f = MsaFactory.new()
510       msa = nil
511       begin
512         msa = f.create_msa_from_file( input, FastaParser.new() )
513       rescue Exception => e
514         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
515       end
516       msa
517     end
518
519     def print_help()
520       puts( "Usage:" )
521       puts()
522       puts( "  " + PRG_NAME + ".rb <sequence names file suffix> <input dir containing sequence names files " +
523          "and possibly genome multiple-sequence ('fasta') files> <output directory for sequences> <output directory for domains> [mapping file for " +
524          "genome multiple-sequence ('fasta') files not in input dir]" )
525       puts()
526       puts( "  option: -" + EXT_OPTION  + "=<int>: to extend extracted domains" )
527       puts( "          -" + EXTRACT_LINKERS_OPTION  + "      : to extract linkers" )
528       puts()
529       puts( "  " + "Example: \"mse.rb .prot . protein_seqs domain_seqs ../genome_locations.txt\"" )
530       puts()
531     end
532
533   end # class MultiSequenceExtractor
534 end