in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / apps / multi_sequence_extractor.rb
1 #
2 # = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class
3 #
4 # Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: multi_sequence_extractor.rb,v 1.10 2010/12/13 19:00:11 cmzmasek Exp $
8
9
10 require 'lib/evo/util/constants'
11 require 'lib/evo/util/util'
12 require 'lib/evo/msa/msa'
13 require 'lib/evo/msa/msa_factory'
14 require 'lib/evo/io/msa_io'
15 require 'lib/evo/io/parser/fasta_parser'
16 require 'lib/evo/io/writer/fasta_writer'
17 require 'lib/evo/util/command_line_arguments'
18
19
20
21 module Evoruby
22
23     class MultiSequenceExtractor
24
25         PRG_NAME                           = "mse"
26         PRG_VERSION                        = "1.0.0"
27         PRG_DESC                           = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
28         PRG_DATE                           = "2008.08.13"
29         COPYRIGHT                          = "2008-2009 Christian M Zmasek"
30         CONTACT                            = "phylosoft@gmail.com"
31         WWW                                = "www.phylosoft.org"
32         HELP_OPTION_1                      = 'help'
33         HELP_OPTION_2                      = 'h'
34
35         LOG_SUFFIX                          = ".mse_log"
36         FASTA_SUFFIX                        = ".fasta"
37         FASTA_WITH_NORMALIZED_IDS_SUFFIX    = ".ni.fasta"
38         NORMALIZED_IDS_MAP_SUFFIX           = ".nim"
39         PROTEINS_LIST_FILE_SEPARATOR        = "\t"
40
41         def run()
42
43             Util.print_program_information( PRG_NAME,
44                 PRG_VERSION,
45                 PRG_DESC ,
46                 PRG_DATE,
47                 COPYRIGHT,
48                 CONTACT,
49                 WWW,
50                 STDOUT )
51
52             ld = Constants::LINE_DELIMITER
53
54             begin
55                 cla = CommandLineArguments.new( ARGV )
56             rescue ArgumentError => e
57                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
58             end
59
60             if ( cla.is_option_set?( HELP_OPTION_1 ) ||
61                      cla.is_option_set?( HELP_OPTION_2 ) )
62                 print_help
63                 exit( 0 )
64             end
65
66             if ( cla.get_number_of_files != 4 && cla.get_number_of_files != 5 )
67                 print_help
68                 exit( -1 )
69             end
70
71             allowed_opts = Array.new
72
73             disallowed = cla.validate_allowed_options_as_str( allowed_opts )
74             if ( disallowed.length > 0 )
75                 Util.fatal_error( PRG_NAME,
76                     "unknown option(s): " + disallowed,
77                     STDOUT )
78             end
79
80             seq_names_files_suffix = cla.get_file_name( 0 )
81             input_dir              = cla.get_file_name( 1 )
82             out_dir                = cla.get_file_name( 2 )
83             out_dir_doms           = cla.get_file_name( 3 )
84             mapping_file            = nil
85
86             if ( cla.get_number_of_files == 5 )
87                 mapping_file = cla.get_file_name( 4 )
88                 begin
89                     Util.check_file_for_readability( mapping_file )
90                 rescue ArgumentError => e
91                     Util.fatal_error( PRG_NAME, "error: " + e.to_s )
92                 end
93             end
94
95             if  !File.exist?( input_dir )
96                 Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
97             end
98             if  !File.exist?( out_dir )
99                 Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
100             end
101               if  !File.exist?( out_dir_doms )
102                 Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" )
103             end
104             if !File.directory?( input_dir )
105                 Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
106             end
107             if !File.directory?( out_dir )
108                 Util.fatal_error( PRG_NAME, "error:  [#{out_dir}] is not a directory" )
109             end
110             if !File.directory?( out_dir_doms )
111                 Util.fatal_error( PRG_NAME, "error:  [#{out_dir_doms}] is not a directory" )
112             end
113
114
115             log = String.new
116
117             log << "Program            : " + PRG_NAME + ld
118             log << "Version            : " + PRG_VERSION + ld
119             log << "Program date       : " + PRG_DATE + ld
120
121             puts()
122             puts( "Sequence names files suffix: " + seq_names_files_suffix )
123             log << "Sequence names files suffix: " + seq_names_files_suffix + ld
124             puts( "Input dir                  : " + input_dir )
125             log << "Input dir                  : " + input_dir + ld
126             puts( "Output dir                 : " + out_dir )
127             log << "Output dir                 : " + out_dir + ld
128             puts( "Output dir domains         : " + out_dir_doms )
129             log << "Output dir domains         : " + out_dir_doms + ld
130             if ( mapping_file != nil )
131                 puts( "Mapping file               : " + mapping_file )
132                 log << "Mapping file               : " + mapping_file + ld
133             end
134             log << "Date                       : " + Time.now.to_s + ld
135             puts
136
137             if ( mapping_file != nil )
138                 species_codes_to_paths = extract_mappings( mapping_file )
139             end
140
141             input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
142
143             counter = 0
144             species_to_genomes = Hash.new()
145
146             input_files.each { |input_file|
147                 counter += 1
148                 puts
149                 puts
150                 puts counter.to_s + "/" + input_files.size.to_s
151                 read_seq_family_file( input_file,
152                     seq_names_files_suffix,
153                     input_dir,
154                     species_codes_to_paths,
155                     species_to_genomes,
156                     log,
157                     out_dir,
158                     out_dir_doms,
159                     mapping_file )
160             }
161             puts
162             Util.print_message( PRG_NAME, "OK" )
163             puts
164
165         end
166
167
168         def read_seq_family_file( input_file,
169                 seq_names_files_suffix,
170                 input_dir,
171                 species_codes_to_paths,
172                 species_to_genomes,
173                 log,
174                 out_dir,
175                 out_dir_doms,
176                 mapping_file )
177
178             begin
179                 Util.check_file_for_readability( input_file )
180             rescue ArgumentError => e
181                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
182             end
183             basename = File.basename( input_file, seq_names_files_suffix )
184             out_file_path_fasta_file                = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
185             out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
186             out_file_path_ids_map                   = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX 
187             doms_out_file_path_fasta_file           = out_dir_doms + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
188             begin
189                 Util.check_file_for_writability( out_file_path_fasta_file )
190                 Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
191                 Util.check_file_for_writability( out_file_path_ids_map  )
192                 Util.check_file_for_writability( doms_out_file_path_fasta_file  )
193             rescue ArgumentError => e
194                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
195             end
196           
197             ids_map_writer = nil
198             begin
199                 ids_map_writer = File.open( out_file_path_ids_map, 'a' )
200             rescue Exception => e
201                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
202             end
203             
204             current_species         = ""
205             current_msa            = nil
206             new_msa                = Msa.new
207             new_msa_normalized_ids = Msa.new
208             new_msa_domains = Msa.new
209             per_species_counter = 0
210
211             puts basename
212
213             File.open( input_file ) do | file |
214                 while line = file.gets
215                     if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
216                         values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
217                        
218                         if ( values.length < 2 )
219                             Util.fatal_error( PRG_NAME, "unexpected format: " + line )
220                         end
221                         species = values[ 0 ]
222                         if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA"  || species == "EIMTE"
223                           next
224                         end
225                         seq_name = values[ 1 ]
226                         domain_ranges = nil
227                         if ( values.length > 2 )        
228                           domain_ranges_block = values[ 2 ]
229                           domain_ranges = domain_ranges_block.split( "/" )
230                         end
231                         if ( species != current_species )
232                             current_species = species
233                             my_file = input_dir + Constants::FILE_SEPARATOR + current_species
234
235                             if ( !File.exist?( my_file ) )
236                                 if species_codes_to_paths == nil
237                                     Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
238                                 elsif ( !species_codes_to_paths.has_key?( current_species ) )
239                                     Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
240                                 end
241                                 my_file = species_codes_to_paths[ current_species ]
242                             end
243                             my_path = File.expand_path( my_file )
244                             my_readlink = my_path
245                             if ( File.symlink?( my_path ) )
246                                 my_readlink = File.readlink( my_path )
247                             end
248                             current_msa = read_fasta_file( my_file )
249                             end
250
251                             if ( per_species_counter > 0 )
252                                 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
253                                 per_species_counter = 0
254                             end
255                             puts " " + current_species + " [" + my_readlink + "]"
256                             log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER
257                         end
258                         puts "   " + seq_name
259                         log << "   " + seq_name + Constants::LINE_DELIMITER
260                         per_species_counter = per_species_counter + 1
261                         seq = nil
262                         
263                         if current_msa.find_by_name_start( seq_name, true ).size > 0
264                             begin
265                                 seq = current_msa.get_by_name_start( seq_name, true ).copy
266                             rescue ArgumentError => e
267                                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
268                             end
269                         elsif    
270                             # Not found, try finding by partial match.
271                             begin
272                                 seq = current_msa.get_by_name( seq_name, true, true )
273                             rescue ArgumentError => e
274                                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
275                             end
276                         end
277
278                         normalized_id = per_species_counter.to_s( 16 ).upcase +
279                          "_" + current_species
280                         
281                         per_species_counter.to_i
282                         
283                         ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER )
284                         
285                         orig_name = nil
286                         if ( seq != nil )
287                             orig_name = seq.get_name
288                             seq.set_name( seq.get_name + " [" + current_species + "]" )
289                             new_msa.add_sequence( seq )
290                         else
291                             Util.fatal_error( PRG_NAME, "unexected error: seq is nil" )
292                         end
293                         
294                         if  domain_ranges != nil 
295                             domain_ranges.each { |range| 
296                                 if range != nil
297                                     s= range.split("-")
298                                     from = s[ 0 ]
299                                     to = s[ 1 ]
300                                     puts from + "-" + to
301                                     new_msa_domains.add_sequence( Sequence.new( orig_name + "/" + from + "-" + to + " [" + basename + "] [" + current_species + "]", seq.get_sequence_as_string[from..to] ) )
302                                 end   
303                             }
304                         end
305                         
306                         new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
307                        
308                     end
309                 end
310
311             end
312          
313             ids_map_writer.close
314            
315             if ( per_species_counter > 0 )
316                 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
317             end
318
319             io = MsaIO.new()
320
321             fasta_writer = FastaWriter.new()
322             fasta_writer.remove_gap_chars
323             fasta_writer.clean
324             
325             begin
326                 io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
327             rescue Exception => e
328                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
329             end
330             
331             if  new_msa_domains != nil 
332                 begin
333                     io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer )
334                 rescue Exception => e
335                     Util.fatal_error( PRG_NAME, "error: " + e.to_s )
336                 end
337            end
338             
339             begin
340                 io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
341             rescue Exception => e
342                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
343             end
344
345             begin
346                 f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LOG_SUFFIX , 'a' )
347                 f.print( log )
348                 f.close
349             rescue Exception => e
350                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
351             end
352
353         end
354
355         def obtain_inputfiles( input_dir, seq_names_files_suffix )
356             input_files = Array.new()
357             Dir.foreach( input_dir ) { |file_name|
358                 if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
359                     input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
360                 end
361             }
362             input_files
363         end
364
365         def extract_mappings( mapping_file )
366             species_code_to_path = Hash.new()
367             File.open( mapping_file ) do | file |
368                 while line = file.gets
369                     if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
370                         if ( line =~ /(\S+)\s+(\S+)/ )
371                             species = $1
372                             path = $2
373                             if ( species_code_to_path.has_key?( species ) )
374                                 Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
375                             end
376                             if ( species_code_to_path.has_value?( path ) )
377                                 Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
378                             end
379                             if ( !File.exist?( path ) )
380                                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
381                             end
382                             if ( !File.file?( path ) )
383                                 Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
384                             end
385                             if ( !File.readable?( path ) )
386                                 Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
387                             end
388                             if ( File.size( path ) < 10000 )
389                                 Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
390                             end
391                             if ( !Util.looks_like_fasta?( path ) )
392                                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
393                             end
394                             species_code_to_path[ species ] = path
395                             puts species + " -> " + path
396                         end
397                     end
398                 end
399             end
400             species_code_to_path
401         end
402
403         def print_counts( per_species_counter, log, ld )
404             puts "   [sum: " + per_species_counter.to_s + "]"
405             log << "   [sum: " + per_species_counter.to_s + "]" + ld
406         end
407
408         def read_fasta_file( input )
409             f = MsaFactory.new()
410             msa = nil
411             begin
412                 msa = f.create_msa_from_file( input, FastaParser.new() )
413             rescue Exception => e
414                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
415             end
416             msa
417         end
418
419         def print_help()
420             puts( "Usage:" )
421             puts()
422             puts( "  " + PRG_NAME + ".rb <sequence names file suffix> <input dir containing sequence names files " +
423                  "and possibly genome multiple-sequence ('fasta') files> <output directory for sequences> <output directory for domains> [mapping file for " +
424                  "genome multiple-sequence ('fasta') files not in input dir]" )
425             puts()
426             puts( "  " + "Example: \"mse.rb .prot . seqs doms ../genome_locations.txt\"" )
427             puts()
428         end
429
430     end # class MultiSequenceExtractor
431 end