in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / apps / multi_sequence_extractor.rb
1 #
2 # = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class
3 #
4 # Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: multi_sequence_extractor.rb,v 1.10 2010/12/13 19:00:11 cmzmasek Exp $
8
9
10 require 'lib/evo/util/constants'
11 require 'lib/evo/util/util'
12 require 'lib/evo/msa/msa'
13 require 'lib/evo/msa/msa_factory'
14 require 'lib/evo/io/msa_io'
15 require 'lib/evo/io/parser/fasta_parser'
16 require 'lib/evo/io/writer/fasta_writer'
17 require 'lib/evo/util/command_line_arguments'
18
19
20
21 module Evoruby
22
23     class MultiSequenceExtractor
24
25         PRG_NAME                           = "mse"
26         PRG_VERSION                        = "1.0.0"
27         PRG_DESC                           = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
28         PRG_DATE                           = "2008.08.13"
29         COPYRIGHT                          = "2008-2009 Christian M Zmasek"
30         CONTACT                            = "phylosoft@gmail.com"
31         WWW                                = "www.phylosoft.org"
32         HELP_OPTION_1                      = 'help'
33         HELP_OPTION_2                      = 'h'
34
35         LOG_SUFFIX                          = ".mse_log"
36         FASTA_SUFFIX                        = ".fasta"
37         FASTA_WITH_NORMALIZED_IDS_SUFFIX    = ".ni.fasta"
38         NORMALIZED_IDS_MAP_SUFFIX           = ".nim"
39         PROTEINS_LIST_FILE_SEPARATOR        = "\t"
40         CACHE_GENOMES                       = false
41
42         def run()
43
44             Util.print_program_information( PRG_NAME,
45                 PRG_VERSION,
46                 PRG_DESC ,
47                 PRG_DATE,
48                 COPYRIGHT,
49                 CONTACT,
50                 WWW,
51                 STDOUT )
52
53             ld = Constants::LINE_DELIMITER
54
55             begin
56                 cla = CommandLineArguments.new( ARGV )
57             rescue ArgumentError => e
58                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
59             end
60
61             if ( cla.is_option_set?( HELP_OPTION_1 ) ||
62                      cla.is_option_set?( HELP_OPTION_2 ) )
63                 print_help
64                 exit( 0 )
65             end
66
67             if ( cla.get_number_of_files != 3 && cla.get_number_of_files != 4 )
68                 print_help
69                 exit( -1 )
70             end
71
72             allowed_opts = Array.new
73
74             disallowed = cla.validate_allowed_options_as_str( allowed_opts )
75             if ( disallowed.length > 0 )
76                 Util.fatal_error( PRG_NAME,
77                     "unknown option(s): " + disallowed,
78                     STDOUT )
79             end
80
81             seq_names_files_suffix = cla.get_file_name( 0 )
82             input_dir              = cla.get_file_name( 1 )
83             out_dir                = cla.get_file_name( 2 )
84             mapping_file            = nil
85
86             if ( cla.get_number_of_files == 4 )
87                 mapping_file = cla.get_file_name( 3 )
88                 begin
89                     Util.check_file_for_readability( mapping_file )
90                 rescue ArgumentError => e
91                     Util.fatal_error( PRG_NAME, "error: " + e.to_s )
92                 end
93             end
94
95             if  !File.exist?( input_dir )
96                 Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
97             end
98             if  !File.exist?( out_dir )
99                 Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
100             end
101             if !File.directory?( input_dir )
102                 Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
103             end
104             if !File.directory?( out_dir )
105                 Util.fatal_error( PRG_NAME, "error:  [#{out_dir}] is not a directory" )
106             end
107
108
109             log = String.new
110
111             log << "Program            : " + PRG_NAME + ld
112             log << "Version            : " + PRG_VERSION + ld
113             log << "Program date       : " + PRG_DATE + ld
114
115             puts()
116             puts( "Sequence names files suffix: " + seq_names_files_suffix )
117             log << "Sequence names files suffix: " + seq_names_files_suffix + ld
118             puts( "Input dir                  : " + input_dir )
119             log << "Input dir                  : " + input_dir + ld
120             puts( "Output dir                 : " + out_dir )
121             log << "Output dir                 : " + out_dir + ld
122             if ( mapping_file != nil )
123                 puts( "Mapping file               : " + mapping_file )
124                 log << "Mapping file               : " + mapping_file + ld
125             end
126             log << "Date                       : " + Time.now.to_s + ld
127             puts
128
129             if ( mapping_file != nil )
130                 species_codes_to_paths = extract_mappings( mapping_file )
131             end
132
133             input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
134
135             counter = 0
136             species_to_genomes = Hash.new()
137
138             input_files.each { |input_file|
139                 counter += 1
140                 puts
141                 puts
142                 puts counter.to_s + "/" + input_files.size.to_s
143                 read_seq_family_file( input_file,
144                     seq_names_files_suffix,
145                     input_dir,
146                     species_codes_to_paths,
147                     species_to_genomes,
148                     log,
149                     out_dir,
150                     mapping_file )
151             }
152             puts
153             Util.print_message( PRG_NAME, "OK" )
154             puts
155
156         end
157
158
159         def read_seq_family_file( input_file,
160                 seq_names_files_suffix,
161                 input_dir,
162                 species_codes_to_paths,
163                 species_to_genomes,
164                 log,
165                 out_dir,
166                 mapping_file )
167
168             begin
169                 Util.check_file_for_readability( input_file )
170             rescue ArgumentError => e
171                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
172             end
173
174             basename = File.basename( input_file, seq_names_files_suffix )
175             out_file_path_fasta_file                = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
176             out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
177             out_file_path_ids_map                   = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX 
178             begin
179                 Util.check_file_for_writability( out_file_path_fasta_file )
180                 Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
181                 Util.check_file_for_writability( out_file_path_ids_map  )
182             rescue ArgumentError => e
183                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
184             end
185           
186             ids_map_writer = nil
187             begin
188                 ids_map_writer = File.open( out_file_path_ids_map, 'a' )
189             rescue Exception => e
190                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
191             end
192             
193             current_species         = ""
194             current_msa            = nil
195             new_msa                = Msa.new
196             new_msa_normalized_ids = Msa.new
197             per_species_counter = 0
198
199             puts basename
200
201             File.open( input_file ) do | file |
202                 while line = file.gets
203                     if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
204                         values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
205                         if ( values.length < 2 )
206                             Util.fatal_error( PRG_NAME, "unexpected format: " + line )
207                         end
208                         species = values[ 0 ]
209                         if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA"  || species == "EIMTE"
210                           next
211                         end
212                         seq_name = values[ 1 ]
213                         if ( species != current_species )
214                             current_species = species
215                             my_file = input_dir + Constants::FILE_SEPARATOR + current_species
216
217                             if ( !File.exist?( my_file ) )
218                                 if species_codes_to_paths == nil
219                                     Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
220                                 elsif ( !species_codes_to_paths.has_key?( current_species ) )
221                                     Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
222                                 end
223                                 my_file = species_codes_to_paths[ current_species ]
224                             end
225                             my_path = File.expand_path( my_file )
226                             my_readlink = my_path
227                             if ( File.symlink?( my_path ) )
228                                 my_readlink = File.readlink( my_path )
229                             end
230                             current_msa = nil
231                             if ( CACHE_GENOMES && species_to_genomes.has_key?( species ) )
232                                 current_msa = species_to_genomes[ species ]
233                             else
234                                 current_msa = read_fasta_file( my_file )
235                                 if CACHE_GENOMES
236                                     species_to_genomes[ species ] = current_msa
237                                 end
238                             end
239
240                             if ( per_species_counter > 0 )
241                                 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
242                                 per_species_counter = 0
243                             end
244                             puts " " + current_species + " [" + my_readlink + "]"
245                             log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER
246                         end
247                         puts "   " + seq_name
248                         log << "   " + seq_name + Constants::LINE_DELIMITER
249                         per_species_counter = per_species_counter + 1
250                         seq = nil
251                         
252                         if current_msa.find_by_name_start( seq_name, true ).size > 0
253                             begin
254                                 seq = current_msa.get_by_name_start( seq_name, true ).copy
255                             rescue ArgumentError => e
256                                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
257                             end
258                         elsif    
259                             # Not found, try finding by partial match.
260                             begin
261                                 seq = current_msa.get_by_name( seq_name, true, true )
262                             rescue ArgumentError => e
263                                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
264                             end
265                         end
266
267                         normalized_id = per_species_counter.to_s( 16 ).upcase +
268                          "_" + current_species
269                         
270                         per_species_counter.to_i
271                         
272                         ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER )
273                         
274                         if ( seq != nil )
275                             seq.set_name( seq.get_name + " [" + current_species + "]" )
276                             new_msa.add_sequence( seq )
277                         else
278                             Util.fatal_error( PRG_NAME, "unexected error: seq is nil" )
279                         end
280                         
281                         new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
282                        
283                     end
284                 end
285
286             end
287          
288             ids_map_writer.close
289            
290             if ( per_species_counter > 0 )
291                 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
292             end
293
294             io = MsaIO.new()
295
296             fasta_writer = FastaWriter.new()
297             fasta_writer.remove_gap_chars
298             fasta_writer.clean
299             
300             begin
301                 io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
302             rescue Exception => e
303                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
304             end
305             
306             begin
307                 io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
308             rescue Exception => e
309                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
310             end
311
312             begin
313                 f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LOG_SUFFIX , 'a' )
314                 f.print( log )
315                 f.close
316             rescue Exception => e
317                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
318             end
319
320         end
321
322         def obtain_inputfiles( input_dir, seq_names_files_suffix )
323             input_files = Array.new()
324             Dir.foreach( input_dir ) { |file_name|
325                 if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
326                     input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
327                 end
328             }
329             input_files
330         end
331
332         def extract_mappings( mapping_file )
333             species_code_to_path = Hash.new()
334             File.open( mapping_file ) do | file |
335                 while line = file.gets
336                     if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
337                         if ( line =~ /(\S+)\s+(\S+)/ )
338                             species = $1
339                             path = $2
340                             if ( species_code_to_path.has_key?( species ) )
341                                 Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
342                             end
343                             if ( species_code_to_path.has_value?( path ) )
344                                 Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
345                             end
346                             if ( !File.exist?( path ) )
347                                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
348                             end
349                             if ( !File.file?( path ) )
350                                 Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
351                             end
352                             if ( !File.readable?( path ) )
353                                 Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
354                             end
355                             if ( File.size( path ) < 10000 )
356                                 Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
357                             end
358                             if ( !Util.looks_like_fasta?( path ) )
359                                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
360                             end
361                             species_code_to_path[ species ] = path
362                             puts species + " -> " + path
363                         end
364                     end
365                 end
366             end
367             species_code_to_path
368         end
369
370         def print_counts( per_species_counter, log, ld )
371             puts "   [sum: " + per_species_counter.to_s + "]"
372             log << "   [sum: " + per_species_counter.to_s + "]" + ld
373         end
374
375         def read_fasta_file( input )
376             f = MsaFactory.new()
377             msa = nil
378             begin
379                 msa = f.create_msa_from_file( input, FastaParser.new() )
380             rescue Exception => e
381                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
382             end
383             msa
384         end
385
386         def print_help()
387             puts( "Usage:" )
388             puts()
389             puts( "  " + PRG_NAME + ".rb <sequence names file suffix> <input dir containing sequence names files " +
390                  "and possibly genome multiple-sequence ('fasta') files> <output directory> [mapping file for " +
391                  "genome multiple-sequence ('fasta') files not in input dir]" )
392             puts()
393             puts( "  " + "Example: \"mse.rb .prot . seqs ../genome_locations.txt\"" )
394             puts()
395         end
396
397     end # class MultiSequenceExtractor
398 end