in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / apps / multi_sequence_extractor.rb
1 #
2 # = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class
3 #
4 # Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: multi_sequence_extractor.rb,v 1.10 2010/12/13 19:00:11 cmzmasek Exp $
8
9
10 require 'lib/evo/util/constants'
11 require 'lib/evo/util/util'
12 require 'lib/evo/msa/msa'
13 require 'lib/evo/msa/msa_factory'
14 require 'lib/evo/io/msa_io'
15 require 'lib/evo/io/parser/fasta_parser'
16 require 'lib/evo/io/writer/fasta_writer'
17 require 'lib/evo/util/command_line_arguments'
18
19
20
21 module Evoruby
22
23   class MultiSequenceExtractor
24
25     PRG_NAME                           = "mse"
26     PRG_VERSION                        = "1.01"
27     PRG_DESC                           = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
28     PRG_DATE                           = "2012.07.19"
29     COPYRIGHT                          = "2008-2012 Christian M Zmasek"
30     CONTACT                            = "phylosoft@gmail.com"
31     WWW                                = "www.phylosoft.org"
32     HELP_OPTION_1                      = 'help'
33     HELP_OPTION_2                      = 'h'
34
35     EXT_OPTION                          = 'e'
36     LOG_SUFFIX                          = ".mse_log"
37     FASTA_SUFFIX                        = ".fasta"
38     FASTA_WITH_NORMALIZED_IDS_SUFFIX    = ".ni.fasta"
39     NORMALIZED_IDS_MAP_SUFFIX           = ".nim"
40     PROTEINS_LIST_FILE_SEPARATOR        = "\t"
41
42     def run()
43
44       Util.print_program_information( PRG_NAME,
45         PRG_VERSION,
46         PRG_DESC ,
47         PRG_DATE,
48         COPYRIGHT,
49         CONTACT,
50         WWW,
51         STDOUT )
52
53       ld = Constants::LINE_DELIMITER
54
55       begin
56         cla = CommandLineArguments.new( ARGV )
57       rescue ArgumentError => e
58         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
59       end
60
61       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
62            cla.is_option_set?( HELP_OPTION_2 ) )
63         print_help
64         exit( 0 )
65       end
66
67       if ( cla.get_number_of_files != 4 && cla.get_number_of_files != 5 )
68         print_help
69         exit( -1 )
70       end
71
72       allowed_opts = Array.new
73       allowed_opts.push(EXT_OPTION)
74
75       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
76       if ( disallowed.length > 0 )
77         Util.fatal_error( PRG_NAME,
78           "unknown option(s): " + disallowed,
79           STDOUT )
80       end
81
82       seq_names_files_suffix = cla.get_file_name( 0 )
83       input_dir              = cla.get_file_name( 1 )
84       out_dir                = cla.get_file_name( 2 )
85       out_dir_doms           = cla.get_file_name( 3 )
86       mapping_file            = nil
87
88       if ( cla.get_number_of_files == 5 )
89         mapping_file = cla.get_file_name( 4 )
90         begin
91           Util.check_file_for_readability( mapping_file )
92         rescue ArgumentError => e
93           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
94         end
95       end
96
97       extension = 0
98       if cla.is_option_set?(EXT_OPTION)
99         extension = cla.get_option_value_as_int(EXT_OPTION)
100         if extension < 0
101           extension = 0
102         end
103       end
104
105       if  !File.exist?( input_dir )
106         Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
107       end
108       if  !File.exist?( out_dir )
109         Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
110       end
111       if  !File.exist?( out_dir_doms )
112         Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" )
113       end
114       if !File.directory?( input_dir )
115         Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
116       end
117       if !File.directory?( out_dir )
118         Util.fatal_error( PRG_NAME, "error:  [#{out_dir}] is not a directory" )
119       end
120       if !File.directory?( out_dir_doms )
121         Util.fatal_error( PRG_NAME, "error:  [#{out_dir_doms}] is not a directory" )
122       end
123
124
125       log = String.new
126
127       log << "Program            : " + PRG_NAME + ld
128       log << "Version            : " + PRG_VERSION + ld
129       log << "Program date       : " + PRG_DATE + ld
130
131       puts()
132       puts( "Sequence names files suffix: " + seq_names_files_suffix )
133       log << "Sequence names files suffix: " + seq_names_files_suffix + ld
134       puts( "Input dir                  : " + input_dir )
135       log << "Input dir                  : " + input_dir + ld
136       puts( "Output dir                 : " + out_dir )
137       log << "Output dir                 : " + out_dir + ld
138       puts( "Output dir domains         : " + out_dir_doms )
139       log << "Output dir domains         : " + out_dir_doms + ld
140       if ( mapping_file != nil )
141         puts( "Mapping file               : " + mapping_file )
142         log << "Mapping file               : " + mapping_file + ld
143       end
144       if ( extension > 0 )
145         puts( "Extension                  : " + extension.to_s )
146         log << "Extension                  : " + extension.to_s + ld
147       end
148       log << "Date                       : " + Time.now.to_s + ld
149       puts
150
151       if ( mapping_file != nil )
152         species_codes_to_paths = extract_mappings( mapping_file )
153       end
154
155       input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
156
157       counter = 0
158
159       input_files.each { |input_file|
160         counter += 1
161         puts
162         puts
163         puts counter.to_s + "/" + input_files.size.to_s
164         read_seq_family_file( input_file,
165           seq_names_files_suffix,
166           input_dir,
167           species_codes_to_paths,
168           log,
169           out_dir,
170           out_dir_doms,
171           mapping_file,
172           extension )
173       }
174       puts
175       Util.print_message( PRG_NAME, "OK" )
176       puts
177
178     end
179
180
181     def read_seq_family_file( input_file,
182         seq_names_files_suffix,
183         input_dir,
184         species_codes_to_paths,
185         log,
186         out_dir,
187         out_dir_doms,
188         mapping_file,
189         extension )
190
191       begin
192         Util.check_file_for_readability( input_file )
193       rescue ArgumentError => e
194         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
195       end
196       basename = File.basename( input_file, seq_names_files_suffix )
197       out_file_path_fasta_file                = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
198       out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
199       out_file_path_ids_map                   = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX
200       doms_out_file_path_fasta_file           = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains" + FASTA_SUFFIX
201       doms_ext_out_file_path_fasta_file           = nil
202       if extension > 0
203         doms_ext_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains_ext_" + extension.to_s + FASTA_SUFFIX
204       end
205       begin
206         Util.check_file_for_writability( out_file_path_fasta_file )
207         Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
208         Util.check_file_for_writability( out_file_path_ids_map  )
209         Util.check_file_for_writability( doms_out_file_path_fasta_file  )
210       rescue ArgumentError => e
211         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
212       end
213
214       ids_map_writer = nil
215       begin
216         ids_map_writer = File.open( out_file_path_ids_map, 'a' )
217       rescue Exception => e
218         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
219       end
220
221       current_species         = ""
222       current_msa            = nil
223       new_msa                = Msa.new
224       new_msa_normalized_ids = Msa.new
225       new_msa_domains = Msa.new
226       new_msa_domains_extended = Msa.new
227       per_species_counter = 0
228
229       puts basename
230
231       File.open( input_file ) do | file |
232         while line = file.gets
233           line.strip!
234           if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
235             values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
236
237             if ( values.length < 2 )
238               Util.fatal_error( PRG_NAME, "unexpected format: " + line )
239             end
240             species = values[ 0 ]
241             if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA"  || species == "EIMTE"
242               next
243             end
244             seq_name = values[ 1 ]
245             domain_ranges = nil
246             if ( values.length > 3 )
247               domain_ranges_block = values[ 3 ]
248               domain_ranges = domain_ranges_block.split( "/" )
249             end
250             if ( species != current_species )
251               current_species = species
252               my_file = input_dir + Constants::FILE_SEPARATOR + current_species
253
254               if ( !File.exist?( my_file ) )
255                 if species_codes_to_paths == nil
256                   Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
257                 elsif ( !species_codes_to_paths.has_key?( current_species ) )
258                   Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
259                 end
260                 my_file = species_codes_to_paths[ current_species ]
261               end
262               my_path = File.expand_path( my_file )
263               my_readlink = my_path
264               if ( File.symlink?( my_path ) )
265                 my_readlink = File.readlink( my_path )
266               end
267               current_msa = read_fasta_file( my_file )
268
269               if ( per_species_counter > 0 )
270                 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
271                 per_species_counter = 0
272               end
273               puts " " + current_species + " [" + my_readlink + "]"
274               log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER
275             end
276             puts "   " + seq_name
277             log << "   " + seq_name + Constants::LINE_DELIMITER
278             per_species_counter = per_species_counter + 1
279             seq = nil
280
281             if current_msa.find_by_name_start( seq_name, true ).size > 0
282               begin
283                 seq = current_msa.get_by_name_start( seq_name, true ).copy
284               rescue ArgumentError => e
285                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
286               end
287             else
288               # Not found, try finding by partial match.
289               begin
290                 seq = current_msa.get_by_name( seq_name, true, true )
291               rescue ArgumentError => e
292                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
293               end
294             end
295
296             normalized_id = per_species_counter.to_s( 16 ).upcase +
297              "_" + current_species
298
299             per_species_counter.to_i
300
301             ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER )
302
303             orig_name = nil
304             if ( seq != nil )
305               orig_name = seq.get_name
306               seq.set_name( seq.get_name + " [" + current_species + "]" )
307               new_msa.add_sequence( seq )
308             else
309               Util.fatal_error( PRG_NAME, "unexected error: seq is nil" )
310             end
311
312             if  domain_ranges != nil
313               domain_ranges.each { |range|
314                 if range != nil && range.length > 0
315                   s = range.split("-")
316                   from = s[ 0 ].to_i
317                   to = s[ 1 ].to_i
318                   new_msa_domains.add_sequence( Sequence.new( orig_name + " [" + from.to_s + "-" + to.to_s + "] [" + basename + "] [" + current_species + "]", seq.get_sequence_as_string[from..to] ) )
319                   if extension > 0
320                     from_e = from - extension
321                     if from_e < 0
322                       from_e = 0
323                     end
324                     to_e = to + extension
325                     if to_e > seq.get_sequence_as_string.length - 1
326                       to_e = seq.get_sequence_as_string.length - 1
327                     end
328                     new_msa_domains_extended.add_sequence( Sequence.new( orig_name + " [" + from.to_s + "-" + to.to_s  + "] [extended by " + extension.to_s + "] [" + basename + "] [" + current_species + "]",
329                         seq.get_sequence_as_string[ from_e..to_e ] ) )
330                   end
331                 end
332               }
333             end
334
335             new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
336
337           end
338         end
339
340       end
341
342       ids_map_writer.close
343
344       if ( per_species_counter > 0 )
345         print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
346       end
347
348       io = MsaIO.new()
349
350       fasta_writer = FastaWriter.new()
351       fasta_writer.remove_gap_chars
352       fasta_writer.clean
353
354       begin
355         io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
356       rescue Exception => e
357         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
358       end
359
360       if new_msa_domains != nil
361         begin
362           io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer )
363         rescue Exception => e
364           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
365         end
366       end
367
368       if extension > 0 && new_msa_domains_extended != nil
369         begin
370           io.write_to_file( new_msa_domains_extended, doms_ext_out_file_path_fasta_file, fasta_writer )
371         rescue Exception => e
372           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
373         end
374       end
375
376       begin
377         io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
378       rescue Exception => e
379         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
380       end
381
382       begin
383         f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LOG_SUFFIX , 'a' )
384         f.print( log )
385         f.close
386       rescue Exception => e
387         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
388       end
389
390     end
391
392     def obtain_inputfiles( input_dir, seq_names_files_suffix )
393       input_files = Array.new()
394       Dir.foreach( input_dir ) { |file_name|
395         if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
396           input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
397         end
398       }
399       input_files
400     end
401
402     def extract_mappings( mapping_file )
403       species_code_to_path = Hash.new()
404       File.open( mapping_file ) do | file |
405         while line = file.gets
406           if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
407             if ( line =~ /(\S+)\s+(\S+)/ )
408               species = $1
409               path = $2
410               if ( species_code_to_path.has_key?( species ) )
411                 Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
412               end
413               if ( species_code_to_path.has_value?( path ) )
414                 Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
415               end
416               if ( !File.exist?( path ) )
417                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
418               end
419               if ( !File.file?( path ) )
420                 Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
421               end
422               if ( !File.readable?( path ) )
423                 Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
424               end
425               if ( File.size( path ) < 10000 )
426                 Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
427               end
428               if ( !Util.looks_like_fasta?( path ) )
429                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
430               end
431               species_code_to_path[ species ] = path
432               puts species + " -> " + path
433             end
434           end
435         end
436       end
437       species_code_to_path
438     end
439
440     def print_counts( per_species_counter, log, ld )
441       puts "   [sum: " + per_species_counter.to_s + "]"
442       log << "   [sum: " + per_species_counter.to_s + "]" + ld
443     end
444
445     def read_fasta_file( input )
446       f = MsaFactory.new()
447       msa = nil
448       begin
449         msa = f.create_msa_from_file( input, FastaParser.new() )
450       rescue Exception => e
451         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
452       end
453       msa
454     end
455
456     def print_help()
457       puts( "Usage:" )
458       puts()
459       puts( "  " + PRG_NAME + ".rb <sequence names file suffix> <input dir containing sequence names files " +
460          "and possibly genome multiple-sequence ('fasta') files> <output directory for sequences> <output directory for domains> [mapping file for " +
461          "genome multiple-sequence ('fasta') files not in input dir]" )
462       puts()
463       puts( "  option: -" + EXT_OPTION  + "=<int>: to extend extracted domains" )
464       puts()
465       puts( "  " + "Example: \"mse.rb .prot . seqs doms ../genome_locations.txt\"" )
466       puts()
467     end
468
469   end # class MultiSequenceExtractor
470 end