initial commit
[jalview.git] / forester / ruby / evoruby / lib / evo / apps / multi_sequence_extractor.rb
1 #
2 # = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class
3 #
4 # Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: multi_sequence_extractor.rb,v 1.10 2010/12/13 19:00:11 cmzmasek Exp $
8
9
10 require 'lib/evo/util/constants'
11 require 'lib/evo/util/util'
12 require 'lib/evo/msa/msa'
13 require 'lib/evo/msa/msa_factory'
14 require 'lib/evo/io/msa_io'
15 require 'lib/evo/io/parser/fasta_parser'
16 require 'lib/evo/io/writer/fasta_writer'
17 require 'lib/evo/util/command_line_arguments'
18
19
20
21 module Evoruby
22
23     class MultiSequenceExtractor
24
25         PRG_NAME                           = "mse"
26         PRG_VERSION                        = "1.0.0"
27         PRG_DESC                           = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
28         PRG_DATE                           = "2008.08.13"
29         COPYRIGHT                          = "2008-2009 Christian M Zmasek"
30         CONTACT                            = "phylosoft@gmail.com"
31         WWW                                = "www.phylosoft.org"
32         HELP_OPTION_1                      = 'help'
33         HELP_OPTION_2                      = 'h'
34
35         LOG_SUFFIX                          = ".mse_log"
36         FASTA_SUFFIX                        = ".fasta"
37         FASTA_WITH_NORMALIZED_IDS_SUFFIX    = ".ni.fasta"
38         NORMALIZED_IDS_MAP_SUFFIX           = ".nim"
39         PROTEINS_LIST_FILE_SEPARATOR        = "\t"
40         CACHE_GENOMES                       = false
41
42         def run()
43
44             Util.print_program_information( PRG_NAME,
45                 PRG_VERSION,
46                 PRG_DESC ,
47                 PRG_DATE,
48                 COPYRIGHT,
49                 CONTACT,
50                 WWW,
51                 STDOUT )
52
53             ld = Constants::LINE_DELIMITER
54
55             begin
56                 cla = CommandLineArguments.new( ARGV )
57             rescue ArgumentError => e
58                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
59             end
60
61             if ( cla.is_option_set?( HELP_OPTION_1 ) ||
62                      cla.is_option_set?( HELP_OPTION_2 ) )
63                 print_help
64                 exit( 0 )
65             end
66
67             if ( cla.get_number_of_files != 3 && cla.get_number_of_files != 4 )
68                 print_help
69                 exit( -1 )
70             end
71
72             allowed_opts = Array.new
73
74             disallowed = cla.validate_allowed_options_as_str( allowed_opts )
75             if ( disallowed.length > 0 )
76                 Util.fatal_error( PRG_NAME,
77                     "unknown option(s): " + disallowed,
78                     STDOUT )
79             end
80
81             seq_names_files_suffix = cla.get_file_name( 0 )
82             input_dir              = cla.get_file_name( 1 )
83             out_dir                = cla.get_file_name( 2 )
84             mapping_file            = nil
85
86             if ( cla.get_number_of_files == 4 )
87                 mapping_file = cla.get_file_name( 3 )
88                 begin
89                     Util.check_file_for_readability( mapping_file )
90                 rescue ArgumentError => e
91                     Util.fatal_error( PRG_NAME, "error: " + e.to_s )
92                 end
93             end
94
95             if  !File.exist?( input_dir )
96                 Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
97             end
98             if  !File.exist?( out_dir )
99                 Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
100             end
101             if !File.directory?( input_dir )
102                 Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
103             end
104             if !File.directory?( out_dir )
105                 Util.fatal_error( PRG_NAME, "error:  [#{out_dir}] is not a directory" )
106             end
107
108
109             log = String.new
110
111             log << "Program            : " + PRG_NAME + ld
112             log << "Version            : " + PRG_VERSION + ld
113             log << "Program date       : " + PRG_DATE + ld
114
115             puts()
116             puts( "Sequence names files suffix: " + seq_names_files_suffix )
117             log << "Sequence names files suffix: " + seq_names_files_suffix + ld
118             puts( "Input dir                  : " + input_dir )
119             log << "Input dir                  : " + input_dir + ld
120             puts( "Output dir                 : " + out_dir )
121             log << "Output dir                 : " + out_dir + ld
122             if ( mapping_file != nil )
123                 puts( "Mapping file               : " + mapping_file )
124                 log << "Mapping file               : " + mapping_file + ld
125             end
126             log << "Date                       : " + Time.now.to_s + ld
127             puts
128
129             if ( mapping_file != nil )
130                 species_codes_to_paths = extract_mappings( mapping_file )
131             end
132
133             input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
134
135             counter = 0
136             species_to_genomes = Hash.new()
137
138             input_files.each { |input_file|
139                 counter += 1
140                 puts
141                 puts
142                 puts counter.to_s + "/" + input_files.size.to_s
143                 read_seq_family_file( input_file,
144                     seq_names_files_suffix,
145                     input_dir,
146                     species_codes_to_paths,
147                     species_to_genomes,
148                     log,
149                     out_dir,
150                     mapping_file )
151             }
152             puts
153             Util.print_message( PRG_NAME, "OK" )
154             puts
155
156         end
157
158
159         def read_seq_family_file( input_file,
160                 seq_names_files_suffix,
161                 input_dir,
162                 species_codes_to_paths,
163                 species_to_genomes,
164                 log,
165                 out_dir,
166                 mapping_file )
167
168             begin
169                 Util.check_file_for_readability( input_file )
170             rescue ArgumentError => e
171                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
172             end
173
174             basename = File.basename( input_file, seq_names_files_suffix )
175             out_file_path_fasta_file                = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
176             out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
177             out_file_path_ids_map                   = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX 
178             begin
179                 Util.check_file_for_writability( out_file_path_fasta_file )
180                 Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
181                 Util.check_file_for_writability( out_file_path_ids_map  )
182             rescue ArgumentError => e
183                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
184             end
185           
186             ids_map_writer = nil
187             begin
188                 ids_map_writer = File.open( out_file_path_ids_map, 'a' )
189             rescue Exception => e
190                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
191             end
192             
193             current_species         = ""
194             current_msa            = nil
195             new_msa                = Msa.new
196             new_msa_normalized_ids = Msa.new
197             per_species_counter = 0
198
199             puts basename
200
201             File.open( input_file ) do | file |
202                 while line = file.gets
203                     if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
204                         values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
205                         if ( values.length < 2 )
206                             Util.fatal_error( PRG_NAME, "unexpected format: " + line )
207                         end
208                         species = values[ 0 ]
209                         seq_name = values[ 1 ]
210                         if ( species != current_species )
211                             current_species = species
212                             my_file = input_dir + Constants::FILE_SEPARATOR + current_species
213
214                             if ( !File.exist?( my_file ) )
215                                 if species_codes_to_paths == nil
216                                     Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
217                                 elsif ( !species_codes_to_paths.has_key?( current_species ) )
218                                     Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
219                                 end
220                                 my_file = species_codes_to_paths[ current_species ]
221                             end
222                             my_path = File.expand_path( my_file )
223                             my_readlink = my_path
224                             if ( File.symlink?( my_path ) )
225                                 my_readlink = File.readlink( my_path )
226                             end
227                             current_msa = nil
228                             if ( CACHE_GENOMES && species_to_genomes.has_key?( species ) )
229                                 current_msa = species_to_genomes[ species ]
230                             else
231                                 current_msa = read_fasta_file( my_file )
232                                 if CACHE_GENOMES
233                                     species_to_genomes[ species ] = current_msa
234                                 end
235                             end
236
237                             if ( per_species_counter > 0 )
238                                 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
239                                 per_species_counter = 0
240                             end
241                             puts " " + current_species + " [" + my_readlink + "]"
242                             log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER
243                         end
244                         puts "   " + seq_name
245                         log << "   " + seq_name + Constants::LINE_DELIMITER
246                         per_species_counter = per_species_counter + 1
247                         seq = nil
248                         
249                         if current_msa.find_by_name_start( seq_name, true ).size > 0
250                             begin
251                                 seq = current_msa.get_by_name_start( seq_name, true ).copy
252                             rescue ArgumentError => e
253                                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
254                             end
255                         elsif    
256                             # Not found, try finding by partial match.
257                             begin
258                                 seq = current_msa.get_by_name( seq_name, true, true )
259                             rescue ArgumentError => e
260                                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
261                             end
262                         end
263
264                         normalized_id = per_species_counter.to_s( 16 ).upcase +
265                          "_" + current_species
266                         
267                         per_species_counter.to_i
268                         
269                         ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER )
270                         
271                         if ( seq != nil )
272                             seq.set_name( seq.get_name + " [" + current_species + "]" )
273                             new_msa.add_sequence( seq )
274                         else
275                             Util.fatal_error( PRG_NAME, "unexected error: seq is nil" )
276                         end
277                         
278                         new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
279                        
280                     end
281                 end
282
283             end
284          
285             ids_map_writer.close
286            
287             if ( per_species_counter > 0 )
288                 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
289             end
290
291             io = MsaIO.new()
292
293             fasta_writer = FastaWriter.new()
294             fasta_writer.remove_gap_chars
295             fasta_writer.clean
296             
297             begin
298                 io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
299             rescue Exception => e
300                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
301             end
302             
303             begin
304                 io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
305             rescue Exception => e
306                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
307             end
308
309             begin
310                 f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LOG_SUFFIX , 'a' )
311                 f.print( log )
312                 f.close
313             rescue Exception => e
314                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
315             end
316
317         end
318
319         def obtain_inputfiles( input_dir, seq_names_files_suffix )
320             input_files = Array.new()
321             Dir.foreach( input_dir ) { |file_name|
322                 if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
323                     input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
324                 end
325             }
326             input_files
327         end
328
329         def extract_mappings( mapping_file )
330             species_code_to_path = Hash.new()
331             File.open( mapping_file ) do | file |
332                 while line = file.gets
333                     if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
334                         if ( line =~ /(\S+)\s+(\S+)/ )
335                             species = $1
336                             path = $2
337                             if ( species_code_to_path.has_key?( species ) )
338                                 Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
339                             end
340                             if ( species_code_to_path.has_value?( path ) )
341                                 Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
342                             end
343                             if ( !File.exist?( path ) )
344                                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
345                             end
346                             if ( !File.file?( path ) )
347                                 Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
348                             end
349                             if ( !File.readable?( path ) )
350                                 Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
351                             end
352                             if ( File.size( path ) < 10000 )
353                                 Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
354                             end
355                             if ( !Util.looks_like_fasta?( path ) )
356                                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
357                             end
358                             species_code_to_path[ species ] = path
359                             puts species + " -> " + path
360                         end
361                     end
362                 end
363             end
364             species_code_to_path
365         end
366
367         def print_counts( per_species_counter, log, ld )
368             puts "   [sum: " + per_species_counter.to_s + "]"
369             log << "   [sum: " + per_species_counter.to_s + "]" + ld
370         end
371
372         def read_fasta_file( input )
373             f = MsaFactory.new()
374             msa = nil
375             begin
376                 msa = f.create_msa_from_file( input, FastaParser.new() )
377             rescue Exception => e
378                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
379             end
380             msa
381         end
382
383         def print_help()
384             puts( "Usage:" )
385             puts()
386             puts( "  " + PRG_NAME + ".rb <sequence names file suffix> <input dir containing sequence names files " +
387                  "and possibly genome multiple-sequence ('fasta') files> <output directory> [mapping file for " +
388                  "genome multiple-sequence ('fasta') files not in input dir]" )
389             puts()
390             puts( "  " + "Example: \"mse.rb .prot . seqs ../genome_locations.txt\"" )
391             puts()
392         end
393
394     end # class MultiSequenceExtractor
395 end