inprogress
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / multi_sequence_extractor.rb
1 #
2 # = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class
3 #
4 # Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: multi_sequence_extractor.rb,v 1.10 2010/12/13 19:00:11 cmzmasek Exp $
8
9
10 require 'lib/evo/util/constants'
11 require 'lib/evo/util/util'
12 require 'lib/evo/msa/msa'
13 require 'lib/evo/msa/msa_factory'
14 require 'lib/evo/io/msa_io'
15 require 'lib/evo/io/parser/fasta_parser'
16 require 'lib/evo/io/writer/fasta_writer'
17 require 'lib/evo/util/command_line_arguments'
18
19
20
21 module Evoruby
22
23   class MultiSequenceExtractor
24
25     PRG_NAME                           = "mse"
26     PRG_VERSION                        = "1.03"
27     PRG_DESC                           = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
28     PRG_DATE                           = "131127"
29     COPYRIGHT                          = "2008-2013 Christian M Zmasek"
30     CONTACT                            = "phylosoft@gmail.com"
31     WWW                                = "https://sites.google.com/site/cmzmasek/home/software/forester"
32     HELP_OPTION_1                      = 'help'
33     HELP_OPTION_2                      = 'h'
34
35     EXT_OPTION                          = 'e'
36     EXTRACT_LINKERS_OPTION              = 'l'
37     LOG_SUFFIX                          = ".mse_log"
38     LINKERS_SUFFIX                      = ".linkers"
39     FASTA_SUFFIX                        = ".fasta"
40     FASTA_WITH_NORMALIZED_IDS_SUFFIX    = ".ni.fasta"
41     NORMALIZED_IDS_MAP_SUFFIX           = ".nim"
42     PROTEINS_LIST_FILE_SEPARATOR        = "\t"
43
44
45     def run()
46
47       Util.print_program_information( PRG_NAME,
48         PRG_VERSION,
49         PRG_DESC ,
50         PRG_DATE,
51         COPYRIGHT,
52         CONTACT,
53         WWW,
54         STDOUT )
55
56       ld = Constants::LINE_DELIMITER
57
58       begin
59         cla = CommandLineArguments.new( ARGV )
60       rescue ArgumentError => e
61         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
62       end
63
64       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
65            cla.is_option_set?( HELP_OPTION_2 ) )
66         print_help
67         exit( 0 )
68       end
69
70       if ( cla.get_number_of_files != 4 && cla.get_number_of_files != 5 )
71         print_help
72         exit( -1 )
73       end
74
75       allowed_opts = Array.new
76       allowed_opts.push(EXT_OPTION)
77       allowed_opts.push(EXTRACT_LINKERS_OPTION)
78
79       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
80       if ( disallowed.length > 0 )
81         Util.fatal_error( PRG_NAME,
82           "unknown option(s): " + disallowed,
83           STDOUT )
84       end
85
86       seq_names_files_suffix = cla.get_file_name( 0 )
87       input_dir              = cla.get_file_name( 1 )
88       out_dir                = cla.get_file_name( 2 )
89       out_dir_doms           = cla.get_file_name( 3 )
90       mapping_file            = nil
91
92       if ( cla.get_number_of_files == 5 )
93         mapping_file = cla.get_file_name( 4 )
94         begin
95           Util.check_file_for_readability( mapping_file )
96         rescue ArgumentError => e
97           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
98         end
99       end
100
101       extension = 0
102       if cla.is_option_set?(EXT_OPTION)
103         extension = cla.get_option_value_as_int(EXT_OPTION)
104         if extension < 0
105           extension = 0
106         end
107       end
108
109       extract_linkers = false
110       if cla.is_option_set?(EXTRACT_LINKERS_OPTION)
111         extract_linkers = true
112       end
113
114       if  !File.exist?( input_dir )
115         Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
116       end
117       if  !File.exist?( out_dir )
118         Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
119       end
120       if  !File.exist?( out_dir_doms )
121         Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" )
122       end
123       if !File.directory?( input_dir )
124         Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
125       end
126       if !File.directory?( out_dir )
127         Util.fatal_error( PRG_NAME, "error:  [#{out_dir}] is not a directory" )
128       end
129       if !File.directory?( out_dir_doms )
130         Util.fatal_error( PRG_NAME, "error:  [#{out_dir_doms}] is not a directory" )
131       end
132
133
134       log = String.new
135
136       log << "Program            : " + PRG_NAME + ld
137       log << "Version            : " + PRG_VERSION + ld
138       log << "Program date       : " + PRG_DATE + ld
139
140       puts()
141       puts( "Sequence names files suffix: " + seq_names_files_suffix )
142       log << "Sequence names files suffix: " + seq_names_files_suffix + ld
143       puts( "Input dir                  : " + input_dir )
144       log << "Input dir                  : " + input_dir + ld
145       puts( "Output dir                 : " + out_dir )
146       log << "Output dir                 : " + out_dir + ld
147       puts( "Output dir domains         : " + out_dir_doms )
148       log << "Output dir domains         : " + out_dir_doms + ld
149       if ( mapping_file != nil )
150         puts( "Mapping file               : " + mapping_file )
151         log << "Mapping file               : " + mapping_file + ld
152       end
153       if extension > 0
154         puts( "Extension                  : " + extension.to_s )
155         log << "Extension                  : " + extension.to_s + ld
156       end
157       if extract_linkers
158         puts( "Extract linkers            : true" )
159         log << "Extract linkers            : true" + ld
160       end
161       log << "Date                       : " + Time.now.to_s + ld
162       puts
163
164       if ( mapping_file != nil )
165         species_codes_to_paths = extract_mappings( mapping_file )
166       end
167
168       input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
169
170       counter = 0
171
172       input_files.each { |input_file|
173         counter += 1
174         puts
175         puts
176         puts counter.to_s + "/" + input_files.size.to_s
177         read_seq_family_file( input_file,
178           seq_names_files_suffix,
179           input_dir,
180           species_codes_to_paths,
181           log,
182           out_dir,
183           out_dir_doms,
184           mapping_file,
185           extension,
186           extract_linkers )
187       }
188       puts
189       Util.print_message( PRG_NAME, "OK" )
190       puts
191
192     end
193
194
195     def read_seq_family_file( input_file,
196         seq_names_files_suffix,
197         input_dir,
198         species_codes_to_paths,
199         log,
200         out_dir,
201         out_dir_doms,
202         mapping_file,
203         extension,
204         extract_linkers )
205
206       begin
207         Util.check_file_for_readability( input_file )
208       rescue ArgumentError => e
209         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
210       end
211       basename = File.basename( input_file, seq_names_files_suffix )
212       out_file_path_fasta_file                = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
213       out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
214       out_file_path_ids_map                   = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX
215       doms_out_file_path_fasta_file           = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains" + FASTA_SUFFIX
216       doms_ext_out_file_path_fasta_file           = nil
217       if extension > 0
218         doms_ext_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains_ext_" + extension.to_s + FASTA_SUFFIX
219       end
220       begin
221         Util.check_file_for_writability( out_file_path_fasta_file )
222         Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
223         Util.check_file_for_writability( out_file_path_ids_map  )
224         Util.check_file_for_writability( doms_out_file_path_fasta_file  )
225       rescue ArgumentError => e
226         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
227       end
228
229       ids_map_writer = nil
230       begin
231         ids_map_writer = File.open( out_file_path_ids_map, 'a' )
232       rescue Exception => e
233         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
234       end
235
236       current_species         = ""
237       current_msa            = nil
238       new_msa                = Msa.new
239       new_msa_normalized_ids = Msa.new
240       new_msa_domains = Msa.new
241       new_msa_domains_extended = Msa.new
242       per_species_counter = 0
243       linkers = ""
244
245       puts basename
246
247       File.open( input_file ) do | file |
248         while line = file.gets
249           line.strip!
250           if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
251             values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
252             mod_line = nil
253             if ( values.length < 2 )
254               Util.fatal_error( PRG_NAME, "unexpected format: " + line )
255             end
256             species = values[ 0 ]
257             #if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA"  || species == "EIMTE"
258             #  next
259             #end
260             seq_name = values[ 1 ]
261             domain_ranges = nil
262             if ( values.length > 3 )
263               domain_ranges_block = values[ 3 ]
264               domain_ranges = domain_ranges_block.split( "/" )
265             end
266             if ( species != current_species )
267               current_species = species
268               my_file = input_dir + Constants::FILE_SEPARATOR + current_species
269
270               if ( !File.exist?( my_file ) )
271                 if species_codes_to_paths == nil
272                   Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
273                 elsif ( !species_codes_to_paths.has_key?( current_species ) )
274                   Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
275                 end
276                 my_file = species_codes_to_paths[ current_species ]
277               end
278               my_path = File.expand_path( my_file )
279               my_readlink = my_path
280               if ( File.symlink?( my_path ) )
281                 my_readlink = File.readlink( my_path )
282               end
283               current_msa = read_fasta_file( my_file )
284
285               if ( per_species_counter > 0 )
286                 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
287                 per_species_counter = 0
288               end
289               puts " " + current_species + " [" + my_readlink + "]"
290               log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER
291             end
292             puts "   " + seq_name
293             log << "   " + seq_name + Constants::LINE_DELIMITER
294             per_species_counter = per_species_counter + 1
295             seq = nil
296
297             indices = current_msa.find_by_name_start( seq_name, true )
298             if indices.size == 1
299               seq =  current_msa.get_sequence( indices[ 0 ] )
300             elsif indices.size == 0
301               # Not found, try finding by partial match.
302               begin
303                 seq = current_msa.get_by_name( seq_name, true, true )
304               rescue ArgumentError => e
305                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
306               end
307             else
308               Util.fatal_error( PRG_NAME, "error: seq name \"" + seq_name + "\" not unique"  )
309             end
310
311             # if current_msa.find_by_name_start( seq_name, true ).size > 0
312             #   begin
313             #     seq = current_msa.get_by_name_start( seq_name, true ).copy
314             #   rescue ArgumentError => e
315             #     Util.fatal_error( PRG_NAME, "error: " + e.to_s )
316             #   end
317             # else
318             #   # Not found, try finding by partial match.
319             #   begin
320             #     seq = current_msa.get_by_name( seq_name, true, true )
321             #   rescue ArgumentError => e
322             #     Util.fatal_error( PRG_NAME, "error: " + e.to_s )
323             #   end
324             # end
325
326             normalized_id = per_species_counter.to_s( 16 ).upcase +
327              "_" + current_species
328
329             per_species_counter.to_i
330
331             ids_map_writer.write( normalized_id + "\t" + seq.get_name + Constants::LINE_DELIMITER )
332
333             orig_name = nil
334             if seq != nil
335               orig_name = seq.get_name
336               seq.set_name( seq.get_name + " [" + current_species + "]" )
337               new_msa.add_sequence( seq )
338             else
339               Util.fatal_error( PRG_NAME, "unexected error: seq is nil" )
340             end
341
342             if domain_ranges != nil
343               first = true
344               prev_to = -1
345
346               domain_ranges.each { |range|
347                 if range != nil && range.length > 0
348                   s = range.split("-")
349                   from = s[ 0 ].to_i
350                   to = s[ 1 ].to_i
351                   new_msa_domains.add_sequence( Sequence.new( orig_name +
352                        " [" + from.to_s +
353                        "-" + to.to_s +
354                        "] [" + basename + "] [" +
355                        current_species + "]",
356                       seq.get_sequence_as_string[from..to] ) )
357                   if extension > 0
358                     from_e = from - extension
359                     if from_e < 0
360                       from_e = 0
361                     end
362                     to_e = to + extension
363                     if to_e > seq.get_sequence_as_string.length - 1
364                       to_e = seq.get_sequence_as_string.length - 1
365                     end
366                     new_msa_domains_extended.add_sequence( Sequence.new( orig_name +
367                          " [" + from.to_s +
368                          "-" + to.to_s  +
369                          "] [extended by " +
370                          extension.to_s +
371                          "] [" + basename + "] [" +
372                          current_species + "]",
373                         seq.get_sequence_as_string[ from_e..to_e ] ) )
374                   end # extension > 0
375                   if  extract_linkers
376                     if first
377                       first = false
378                       f = 0
379                       t = from - 1
380                       if extension > 0
381                         f = from - extension
382                       end
383                       mod_line = line + "\t[" + get_linker_sequence( f, t, seq ) + "|"
384                     else
385                       mod_line += get_linker_sequence( prev_to + 1, from - 1, seq ) + "|"
386                     end
387                     prev_to = to
388                   end
389                 end # range != nil && range.length > 0
390               }
391               if extract_linkers && prev_to > 0
392                 f = prev_to + 1
393                 t = seq.get_sequence_as_string.length - 1
394                 if extension > 0
395                   t = prev_to + extension
396                 end
397                 mod_line += get_linker_sequence( f, t, seq ) + "]"
398               end
399             end
400
401             new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
402             if mod_line
403               linkers << mod_line + Constants::LINE_DELIMITER
404             end
405           end # !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
406         end # while line = file.gets
407
408       end
409
410       ids_map_writer.close
411
412       if ( per_species_counter > 0 )
413         print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
414       end
415
416       io = MsaIO.new()
417
418       fasta_writer = FastaWriter.new()
419       fasta_writer.remove_gap_chars
420       fasta_writer.clean
421
422       begin
423         io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
424       rescue Exception => e
425         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
426       end
427
428       if new_msa_domains != nil
429         begin
430           io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer )
431         rescue Exception => e
432           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
433         end
434       end
435
436       if extension > 0 && new_msa_domains_extended != nil
437         begin
438           io.write_to_file( new_msa_domains_extended, doms_ext_out_file_path_fasta_file, fasta_writer )
439         rescue Exception => e
440           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
441         end
442       end
443
444       begin
445         io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
446       rescue Exception => e
447         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
448       end
449
450       begin
451         f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LOG_SUFFIX , 'a' )
452         f.print( log )
453         f.close
454       rescue Exception => e
455         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
456       end
457
458       if extract_linkers && linkers != nil
459         begin
460           f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LINKERS_SUFFIX , 'a' )
461           f.print( linkers )
462           f.close
463         rescue Exception => e
464           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
465         end
466       end
467
468
469     end
470
471
472     def get_linker_sequence( from, to, seq )
473       if from < 0
474         from = 0
475       end
476       if to > seq.get_sequence_as_string.length - 1
477         to = seq.get_sequence_as_string.length - 1
478       end
479       if from > to
480         return ""
481       else
482         return from.to_s + "-" + to.to_s + ":" + seq.get_sequence_as_string[ from..to ]
483       end
484     end
485
486     def obtain_inputfiles( input_dir, seq_names_files_suffix )
487       input_files = Array.new()
488       Dir.foreach( input_dir ) { |file_name|
489         if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
490           input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
491         end
492       }
493       input_files
494     end
495
496     def extract_mappings( mapping_file )
497       species_code_to_path = Hash.new()
498       File.open( mapping_file ) do | file |
499         while line = file.gets
500           if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
501             if ( line =~ /(\S+)\s+(\S+)/ )
502               species = $1
503               path = $2
504               if ( species_code_to_path.has_key?( species ) )
505                 Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
506               end
507               if ( species_code_to_path.has_value?( path ) )
508                 Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
509               end
510               if ( !File.exist?( path ) )
511                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
512               end
513               if ( !File.file?( path ) )
514                 Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
515               end
516               if ( !File.readable?( path ) )
517                 Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
518               end
519               if ( File.size( path ) < 10000 )
520                 Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
521               end
522               if ( !Util.looks_like_fasta?( path ) )
523                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
524               end
525               species_code_to_path[ species ] = path
526               puts species + " -> " + path
527             end
528           end
529         end
530       end
531       species_code_to_path
532     end
533
534     def print_counts( per_species_counter, log, ld )
535       puts "   [sum: " + per_species_counter.to_s + "]"
536       log << "   [sum: " + per_species_counter.to_s + "]" + ld
537     end
538
539     def read_fasta_file( input )
540       f = MsaFactory.new()
541       msa = nil
542       begin
543         msa = f.create_msa_from_file( input, FastaParser.new() )
544       rescue Exception => e
545         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
546       end
547       msa
548     end
549
550     def print_help()
551       puts( "Usage:" )
552       puts()
553       puts( "  " + PRG_NAME + ".rb <sequence names file suffix> <input dir containing sequence names files " +
554          "and possibly genome multiple-sequence ('fasta') files> <output directory for sequences> <output directory for domains> [mapping file for " +
555          "genome multiple-sequence ('fasta') files not in input dir]" )
556       puts()
557       puts( "  option: -" + EXT_OPTION  + "=<int>: to extend extracted domains" )
558       puts( "          -" + EXTRACT_LINKERS_OPTION  + "      : to extract linkers" )
559       puts()
560       puts( "  " + "Example: \"mse.rb .prot . protein_seqs domain_seqs ../genome_locations.txt\"" )
561       puts()
562     end
563
564   end # class MultiSequenceExtractor
565 end