in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / apps / multi_sequence_extractor.rb
1 #
2 # = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class
3 #
4 # Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: multi_sequence_extractor.rb,v 1.10 2010/12/13 19:00:11 cmzmasek Exp $
8
9
10 require 'lib/evo/util/constants'
11 require 'lib/evo/util/util'
12 require 'lib/evo/msa/msa'
13 require 'lib/evo/msa/msa_factory'
14 require 'lib/evo/io/msa_io'
15 require 'lib/evo/io/parser/fasta_parser'
16 require 'lib/evo/io/writer/fasta_writer'
17 require 'lib/evo/util/command_line_arguments'
18
19
20
21 module Evoruby
22
23   class MultiSequenceExtractor
24
25     PRG_NAME                           = "mse"
26     PRG_VERSION                        = "1.0.0"
27     PRG_DESC                           = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
28     PRG_DATE                           = "2008.08.13"
29     COPYRIGHT                          = "2008-2009 Christian M Zmasek"
30     CONTACT                            = "phylosoft@gmail.com"
31     WWW                                = "www.phylosoft.org"
32     HELP_OPTION_1                      = 'help'
33     HELP_OPTION_2                      = 'h'
34
35     LOG_SUFFIX                          = ".mse_log"
36     FASTA_SUFFIX                        = ".fasta"
37     FASTA_WITH_NORMALIZED_IDS_SUFFIX    = ".ni.fasta"
38     NORMALIZED_IDS_MAP_SUFFIX           = ".nim"
39     PROTEINS_LIST_FILE_SEPARATOR        = "\t"
40
41     def run()
42
43       Util.print_program_information( PRG_NAME,
44         PRG_VERSION,
45         PRG_DESC ,
46         PRG_DATE,
47         COPYRIGHT,
48         CONTACT,
49         WWW,
50         STDOUT )
51
52       ld = Constants::LINE_DELIMITER
53
54       begin
55         cla = CommandLineArguments.new( ARGV )
56       rescue ArgumentError => e
57         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
58       end
59
60       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
61            cla.is_option_set?( HELP_OPTION_2 ) )
62         print_help
63         exit( 0 )
64       end
65
66       if ( cla.get_number_of_files != 4 && cla.get_number_of_files != 5 )
67         print_help
68         exit( -1 )
69       end
70
71       allowed_opts = Array.new
72
73       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
74       if ( disallowed.length > 0 )
75         Util.fatal_error( PRG_NAME,
76           "unknown option(s): " + disallowed,
77           STDOUT )
78       end
79
80       seq_names_files_suffix = cla.get_file_name( 0 )
81       input_dir              = cla.get_file_name( 1 )
82       out_dir                = cla.get_file_name( 2 )
83       out_dir_doms           = cla.get_file_name( 3 )
84       mapping_file            = nil
85
86       if ( cla.get_number_of_files == 5 )
87         mapping_file = cla.get_file_name( 4 )
88         begin
89           Util.check_file_for_readability( mapping_file )
90         rescue ArgumentError => e
91           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
92         end
93       end
94
95       if  !File.exist?( input_dir )
96         Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
97       end
98       if  !File.exist?( out_dir )
99         Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
100       end
101       if  !File.exist?( out_dir_doms )
102         Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" )
103       end
104       if !File.directory?( input_dir )
105         Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
106       end
107       if !File.directory?( out_dir )
108         Util.fatal_error( PRG_NAME, "error:  [#{out_dir}] is not a directory" )
109       end
110       if !File.directory?( out_dir_doms )
111         Util.fatal_error( PRG_NAME, "error:  [#{out_dir_doms}] is not a directory" )
112       end
113
114
115       log = String.new
116
117       log << "Program            : " + PRG_NAME + ld
118       log << "Version            : " + PRG_VERSION + ld
119       log << "Program date       : " + PRG_DATE + ld
120
121       puts()
122       puts( "Sequence names files suffix: " + seq_names_files_suffix )
123       log << "Sequence names files suffix: " + seq_names_files_suffix + ld
124       puts( "Input dir                  : " + input_dir )
125       log << "Input dir                  : " + input_dir + ld
126       puts( "Output dir                 : " + out_dir )
127       log << "Output dir                 : " + out_dir + ld
128       puts( "Output dir domains         : " + out_dir_doms )
129       log << "Output dir domains         : " + out_dir_doms + ld
130       if ( mapping_file != nil )
131         puts( "Mapping file               : " + mapping_file )
132         log << "Mapping file               : " + mapping_file + ld
133       end
134       log << "Date                       : " + Time.now.to_s + ld
135       puts
136
137       if ( mapping_file != nil )
138         species_codes_to_paths = extract_mappings( mapping_file )
139       end
140
141       input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
142
143       counter = 0
144
145       input_files.each { |input_file|
146         counter += 1
147         puts
148         puts
149         puts counter.to_s + "/" + input_files.size.to_s
150         read_seq_family_file( input_file,
151           seq_names_files_suffix,
152           input_dir,
153           species_codes_to_paths,
154           log,
155           out_dir,
156           out_dir_doms,
157           mapping_file )
158       }
159       puts
160       Util.print_message( PRG_NAME, "OK" )
161       puts
162
163     end
164
165
166     def read_seq_family_file( input_file,
167         seq_names_files_suffix,
168         input_dir,
169         species_codes_to_paths,
170         log,
171         out_dir,
172         out_dir_doms,
173         mapping_file )
174
175       begin
176         Util.check_file_for_readability( input_file )
177       rescue ArgumentError => e
178         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
179       end
180       basename = File.basename( input_file, seq_names_files_suffix )
181       out_file_path_fasta_file                = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
182       out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
183       out_file_path_ids_map                   = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX
184       doms_out_file_path_fasta_file           = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains" + FASTA_SUFFIX
185       begin
186         Util.check_file_for_writability( out_file_path_fasta_file )
187         Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
188         Util.check_file_for_writability( out_file_path_ids_map  )
189         Util.check_file_for_writability( doms_out_file_path_fasta_file  )
190       rescue ArgumentError => e
191         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
192       end
193
194       ids_map_writer = nil
195       begin
196         ids_map_writer = File.open( out_file_path_ids_map, 'a' )
197       rescue Exception => e
198         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
199       end
200
201       current_species         = ""
202       current_msa            = nil
203       new_msa                = Msa.new
204       new_msa_normalized_ids = Msa.new
205       new_msa_domains = Msa.new
206       per_species_counter = 0
207
208       puts basename
209
210       File.open( input_file ) do | file |
211         while line = file.gets
212           line.strip!
213           if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
214             values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
215
216             if ( values.length < 2 )
217               Util.fatal_error( PRG_NAME, "unexpected format: " + line )
218             end
219             species = values[ 0 ]
220             if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA"  || species == "EIMTE"
221               next
222             end
223             seq_name = values[ 1 ]
224             domain_ranges = nil
225             if ( values.length > 3 )
226               domain_ranges_block = values[ 3 ]
227               domain_ranges = domain_ranges_block.split( "/" )
228             end
229             if ( species != current_species )
230               current_species = species
231               my_file = input_dir + Constants::FILE_SEPARATOR + current_species
232
233               if ( !File.exist?( my_file ) )
234                 if species_codes_to_paths == nil
235                   Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
236                 elsif ( !species_codes_to_paths.has_key?( current_species ) )
237                   Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
238                 end
239                 my_file = species_codes_to_paths[ current_species ]
240               end
241               my_path = File.expand_path( my_file )
242               my_readlink = my_path
243               if ( File.symlink?( my_path ) )
244                 my_readlink = File.readlink( my_path )
245               end
246               current_msa = read_fasta_file( my_file )
247
248               if ( per_species_counter > 0 )
249                 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
250                 per_species_counter = 0
251               end
252               puts " " + current_species + " [" + my_readlink + "]"
253               log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER
254             end
255             puts "   " + seq_name
256             log << "   " + seq_name + Constants::LINE_DELIMITER
257             per_species_counter = per_species_counter + 1
258             seq = nil
259
260             if current_msa.find_by_name_start( seq_name, true ).size > 0
261               begin
262                 seq = current_msa.get_by_name_start( seq_name, true ).copy
263               rescue ArgumentError => e
264                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
265               end
266             else
267               # Not found, try finding by partial match.
268               begin
269                 seq = current_msa.get_by_name( seq_name, true, true )
270               rescue ArgumentError => e
271                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
272               end
273             end
274
275             normalized_id = per_species_counter.to_s( 16 ).upcase +
276              "_" + current_species
277
278             per_species_counter.to_i
279
280             ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER )
281
282             orig_name = nil
283             if ( seq != nil )
284               orig_name = seq.get_name
285               seq.set_name( seq.get_name + " [" + current_species + "]" )
286               new_msa.add_sequence( seq )
287             else
288               Util.fatal_error( PRG_NAME, "unexected error: seq is nil" )
289             end
290
291             if  domain_ranges != nil
292               domain_ranges.each { |range|
293                 if range != nil && range.length > 0
294                   s= range.split("-")
295                   from = s[ 0 ]
296                   to = s[ 1 ]
297                   new_msa_domains.add_sequence( Sequence.new( orig_name + "/" + from + "-" + to + " [" + basename + "] [" + current_species + "]", seq.get_sequence_as_string[from.to_i..to.to_i] ) )
298                 end
299               }
300             end
301
302             new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
303
304           end
305         end
306
307       end
308
309       ids_map_writer.close
310
311       if ( per_species_counter > 0 )
312         print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
313       end
314
315       io = MsaIO.new()
316
317       fasta_writer = FastaWriter.new()
318       fasta_writer.remove_gap_chars
319       fasta_writer.clean
320
321       begin
322         io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
323       rescue Exception => e
324         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
325       end
326
327       if  new_msa_domains != nil
328         begin
329           io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer )
330         rescue Exception => e
331           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
332         end
333       end
334
335       begin
336         io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
337       rescue Exception => e
338         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
339       end
340
341       begin
342         f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LOG_SUFFIX , 'a' )
343         f.print( log )
344         f.close
345       rescue Exception => e
346         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
347       end
348
349     end
350
351     def obtain_inputfiles( input_dir, seq_names_files_suffix )
352       input_files = Array.new()
353       Dir.foreach( input_dir ) { |file_name|
354         if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
355           input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
356         end
357       }
358       input_files
359     end
360
361     def extract_mappings( mapping_file )
362       species_code_to_path = Hash.new()
363       File.open( mapping_file ) do | file |
364         while line = file.gets
365           if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
366             if ( line =~ /(\S+)\s+(\S+)/ )
367               species = $1
368               path = $2
369               if ( species_code_to_path.has_key?( species ) )
370                 Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
371               end
372               if ( species_code_to_path.has_value?( path ) )
373                 Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
374               end
375               if ( !File.exist?( path ) )
376                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
377               end
378               if ( !File.file?( path ) )
379                 Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
380               end
381               if ( !File.readable?( path ) )
382                 Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
383               end
384               if ( File.size( path ) < 10000 )
385                 Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
386               end
387               if ( !Util.looks_like_fasta?( path ) )
388                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
389               end
390               species_code_to_path[ species ] = path
391               puts species + " -> " + path
392             end
393           end
394         end
395       end
396       species_code_to_path
397     end
398
399     def print_counts( per_species_counter, log, ld )
400       puts "   [sum: " + per_species_counter.to_s + "]"
401       log << "   [sum: " + per_species_counter.to_s + "]" + ld
402     end
403
404     def read_fasta_file( input )
405       f = MsaFactory.new()
406       msa = nil
407       begin
408         msa = f.create_msa_from_file( input, FastaParser.new() )
409       rescue Exception => e
410         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
411       end
412       msa
413     end
414
415     def print_help()
416       puts( "Usage:" )
417       puts()
418       puts( "  " + PRG_NAME + ".rb <sequence names file suffix> <input dir containing sequence names files " +
419          "and possibly genome multiple-sequence ('fasta') files> <output directory for sequences> <output directory for domains> [mapping file for " +
420          "genome multiple-sequence ('fasta') files not in input dir]" )
421       puts()
422       puts( "  " + "Example: \"mse.rb .prot . seqs doms ../genome_locations.txt\"" )
423       puts()
424     end
425
426   end # class MultiSequenceExtractor
427 end