in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / apps / multi_sequence_extractor.rb
1 #
2 # = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class
3 #
4 # Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: multi_sequence_extractor.rb,v 1.10 2010/12/13 19:00:11 cmzmasek Exp $
8
9
10 require 'lib/evo/util/constants'
11 require 'lib/evo/util/util'
12 require 'lib/evo/msa/msa'
13 require 'lib/evo/msa/msa_factory'
14 require 'lib/evo/io/msa_io'
15 require 'lib/evo/io/parser/fasta_parser'
16 require 'lib/evo/io/writer/fasta_writer'
17 require 'lib/evo/util/command_line_arguments'
18
19
20
21 module Evoruby
22
23     class MultiSequenceExtractor
24
25         PRG_NAME                           = "mse"
26         PRG_VERSION                        = "1.0.0"
27         PRG_DESC                           = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
28         PRG_DATE                           = "2008.08.13"
29         COPYRIGHT                          = "2008-2009 Christian M Zmasek"
30         CONTACT                            = "phylosoft@gmail.com"
31         WWW                                = "www.phylosoft.org"
32         HELP_OPTION_1                      = 'help'
33         HELP_OPTION_2                      = 'h'
34
35         LOG_SUFFIX                          = ".mse_log"
36         FASTA_SUFFIX                        = ".fasta"
37         FASTA_WITH_NORMALIZED_IDS_SUFFIX    = ".ni.fasta"
38         NORMALIZED_IDS_MAP_SUFFIX           = ".nim"
39         PROTEINS_LIST_FILE_SEPARATOR        = "\t"
40
41         def run()
42
43             Util.print_program_information( PRG_NAME,
44                 PRG_VERSION,
45                 PRG_DESC ,
46                 PRG_DATE,
47                 COPYRIGHT,
48                 CONTACT,
49                 WWW,
50                 STDOUT )
51
52             ld = Constants::LINE_DELIMITER
53
54             begin
55                 cla = CommandLineArguments.new( ARGV )
56             rescue ArgumentError => e
57                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
58             end
59
60             if ( cla.is_option_set?( HELP_OPTION_1 ) ||
61                      cla.is_option_set?( HELP_OPTION_2 ) )
62                 print_help
63                 exit( 0 )
64             end
65
66             if ( cla.get_number_of_files != 4 && cla.get_number_of_files != 5 )
67                 print_help
68                 exit( -1 )
69             end
70
71             allowed_opts = Array.new
72
73             disallowed = cla.validate_allowed_options_as_str( allowed_opts )
74             if ( disallowed.length > 0 )
75                 Util.fatal_error( PRG_NAME,
76                     "unknown option(s): " + disallowed,
77                     STDOUT )
78             end
79
80             seq_names_files_suffix = cla.get_file_name( 0 )
81             input_dir              = cla.get_file_name( 1 )
82             out_dir                = cla.get_file_name( 2 )
83             out_dir_doms           = cla.get_file_name( 3 )
84             mapping_file            = nil
85
86             if ( cla.get_number_of_files == 5 )
87                 mapping_file = cla.get_file_name( 4 )
88                 begin
89                     Util.check_file_for_readability( mapping_file )
90                 rescue ArgumentError => e
91                     Util.fatal_error( PRG_NAME, "error: " + e.to_s )
92                 end
93             end
94
95             if  !File.exist?( input_dir )
96                 Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
97             end
98             if  !File.exist?( out_dir )
99                 Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
100             end
101               if  !File.exist?( out_dir_doms )
102                 Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" )
103             end
104             if !File.directory?( input_dir )
105                 Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
106             end
107             if !File.directory?( out_dir )
108                 Util.fatal_error( PRG_NAME, "error:  [#{out_dir}] is not a directory" )
109             end
110             if !File.directory?( out_dir_doms )
111                 Util.fatal_error( PRG_NAME, "error:  [#{out_dir_doms}] is not a directory" )
112             end
113
114
115             log = String.new
116
117             log << "Program            : " + PRG_NAME + ld
118             log << "Version            : " + PRG_VERSION + ld
119             log << "Program date       : " + PRG_DATE + ld
120
121             puts()
122             puts( "Sequence names files suffix: " + seq_names_files_suffix )
123             log << "Sequence names files suffix: " + seq_names_files_suffix + ld
124             puts( "Input dir                  : " + input_dir )
125             log << "Input dir                  : " + input_dir + ld
126             puts( "Output dir                 : " + out_dir )
127             log << "Output dir                 : " + out_dir + ld
128             puts( "Output dir domains         : " + out_dir_doms )
129             log << "Output dir domains         : " + out_dir_doms + ld
130             if ( mapping_file != nil )
131                 puts( "Mapping file               : " + mapping_file )
132                 log << "Mapping file               : " + mapping_file + ld
133             end
134             log << "Date                       : " + Time.now.to_s + ld
135             puts
136
137             if ( mapping_file != nil )
138                 species_codes_to_paths = extract_mappings( mapping_file )
139             end
140
141             input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
142
143             counter = 0
144             species_to_genomes = Hash.new()
145
146             input_files.each { |input_file|
147                 counter += 1
148                 puts
149                 puts
150                 puts counter.to_s + "/" + input_files.size.to_s
151                 read_seq_family_file( input_file,
152                     seq_names_files_suffix,
153                     input_dir,
154                     species_codes_to_paths,
155                     species_to_genomes,
156                     log,
157                     out_dir,
158                     out_dir_doms,
159                     mapping_file )
160             }
161             puts
162             Util.print_message( PRG_NAME, "OK" )
163             puts
164
165         end
166
167
168         def read_seq_family_file( input_file,
169                 seq_names_files_suffix,
170                 input_dir,
171                 species_codes_to_paths,
172                 species_to_genomes,
173                 log,
174                 out_dir,
175                 out_dir_doms,
176                 mapping_file )
177
178             begin
179                 Util.check_file_for_readability( input_file )
180             rescue ArgumentError => e
181                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
182             end
183             basename = File.basename( input_file, seq_names_files_suffix )
184             out_file_path_fasta_file                = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
185             out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
186             out_file_path_ids_map                   = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX 
187             doms_out_file_path_fasta_file           = out_dir_doms + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
188             begin
189                 Util.check_file_for_writability( out_file_path_fasta_file )
190                 Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
191                 Util.check_file_for_writability( out_file_path_ids_map  )
192                 Util.check_file_for_writability( doms_out_file_path_fasta_file  )
193             rescue ArgumentError => e
194                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
195             end
196           
197             ids_map_writer = nil
198             begin
199                 ids_map_writer = File.open( out_file_path_ids_map, 'a' )
200             rescue Exception => e
201                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
202             end
203             
204             current_species         = ""
205             current_msa            = nil
206             new_msa                = Msa.new
207             new_msa_normalized_ids = Msa.new
208             new_msa_domains = Msa.new
209             per_species_counter = 0
210
211             puts basename
212
213             File.open( input_file ) do | file |
214                 while line = file.gets
215                     if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
216                         values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
217                        
218                         if ( values.length < 2 )
219                             Util.fatal_error( PRG_NAME, "unexpected format: " + line )
220                         end
221                         species = values[ 0 ]
222                         if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA"  || species == "EIMTE"
223                           next
224                         end
225                         seq_name = values[ 1 ]
226                         domain_ranges = nil
227                         if ( values.length > 3 )        
228                           domain_ranges_block = values[ 3 ]
229                           puts "domain_ranges_bloc=" + domain_ranges_block
230                           domain_ranges = domain_ranges_block.split( "/" )
231                         end
232                         if ( species != current_species )
233                             current_species = species
234                             my_file = input_dir + Constants::FILE_SEPARATOR + current_species
235
236                             if ( !File.exist?( my_file ) )
237                                 if species_codes_to_paths == nil
238                                     Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
239                                 elsif ( !species_codes_to_paths.has_key?( current_species ) )
240                                     Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
241                                 end
242                                 my_file = species_codes_to_paths[ current_species ]
243                             end
244                             my_path = File.expand_path( my_file )
245                             my_readlink = my_path
246                             if ( File.symlink?( my_path ) )
247                                 my_readlink = File.readlink( my_path )
248                             end
249                             current_msa = read_fasta_file( my_file )
250                             
251
252                             if ( per_species_counter > 0 )
253                                 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
254                                 per_species_counter = 0
255                             end
256                             puts " " + current_species + " [" + my_readlink + "]"
257                             log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER
258                         end
259                         puts "   " + seq_name
260                         log << "   " + seq_name + Constants::LINE_DELIMITER
261                         per_species_counter = per_species_counter + 1
262                         seq = nil
263                         
264                         if current_msa.find_by_name_start( seq_name, true ).size > 0
265                             begin
266                                 seq = current_msa.get_by_name_start( seq_name, true ).copy
267                             rescue ArgumentError => e
268                                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
269                             end
270                         elsif    
271                             # Not found, try finding by partial match.
272                             begin
273                                 seq = current_msa.get_by_name( seq_name, true, true )
274                             rescue ArgumentError => e
275                                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
276                             end
277                         end
278
279                         normalized_id = per_species_counter.to_s( 16 ).upcase +
280                          "_" + current_species
281                         
282                         per_species_counter.to_i
283                         
284                         ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER )
285                         
286                         orig_name = nil
287                         if ( seq != nil )
288                             orig_name = seq.get_name
289                             seq.set_name( seq.get_name + " [" + current_species + "]" )
290                             new_msa.add_sequence( seq )
291                         else
292                             Util.fatal_error( PRG_NAME, "unexected error: seq is nil" )
293                         end
294                         
295                         if  domain_ranges != nil 
296                             domain_ranges.each { |range| 
297                                 puts "reange:" + range + ":\n"  
298                                 if range != nil && range.length > 0
299                                     s= range.split("-")
300                                     from = s[ 0 ]
301                                     to = s[ 1 ]
302                                     puts from + "-" + to
303                                     new_msa_domains.add_sequence( Sequence.new( orig_name + "/" + from + "-" + to + " [" + basename + "] [" + current_species + "]", seq.get_sequence_as_string[from.to_i..to.to_i] ) )
304                                 end   
305                             }
306                         end
307                         
308                         new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
309                        
310                     end
311                 end
312
313             end
314          
315             ids_map_writer.close
316            
317             if ( per_species_counter > 0 )
318                 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
319             end
320
321             io = MsaIO.new()
322
323             fasta_writer = FastaWriter.new()
324             fasta_writer.remove_gap_chars
325             fasta_writer.clean
326             
327             begin
328                 io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
329             rescue Exception => e
330                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
331             end
332             
333             if  new_msa_domains != nil 
334                 begin
335                     io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer )
336                 rescue Exception => e
337                     Util.fatal_error( PRG_NAME, "error: " + e.to_s )
338                 end
339             end
340             
341             begin
342                 io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
343             rescue Exception => e
344                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
345             end
346
347             begin
348                 f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LOG_SUFFIX , 'a' )
349                 f.print( log )
350                 f.close
351             rescue Exception => e
352                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
353             end
354
355         end
356
357         def obtain_inputfiles( input_dir, seq_names_files_suffix )
358             input_files = Array.new()
359             Dir.foreach( input_dir ) { |file_name|
360                 if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
361                     input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
362                 end
363             }
364             input_files
365         end
366
367         def extract_mappings( mapping_file )
368             species_code_to_path = Hash.new()
369             File.open( mapping_file ) do | file |
370                 while line = file.gets
371                     if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
372                         if ( line =~ /(\S+)\s+(\S+)/ )
373                             species = $1
374                             path = $2
375                             if ( species_code_to_path.has_key?( species ) )
376                                 Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
377                             end
378                             if ( species_code_to_path.has_value?( path ) )
379                                 Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
380                             end
381                             if ( !File.exist?( path ) )
382                                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
383                             end
384                             if ( !File.file?( path ) )
385                                 Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
386                             end
387                             if ( !File.readable?( path ) )
388                                 Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
389                             end
390                             if ( File.size( path ) < 10000 )
391                                 Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
392                             end
393                             if ( !Util.looks_like_fasta?( path ) )
394                                 Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
395                             end
396                             species_code_to_path[ species ] = path
397                             puts species + " -> " + path
398                         end
399                     end
400                 end
401             end
402             species_code_to_path
403         end
404
405         def print_counts( per_species_counter, log, ld )
406             puts "   [sum: " + per_species_counter.to_s + "]"
407             log << "   [sum: " + per_species_counter.to_s + "]" + ld
408         end
409
410         def read_fasta_file( input )
411             f = MsaFactory.new()
412             msa = nil
413             begin
414                 msa = f.create_msa_from_file( input, FastaParser.new() )
415             rescue Exception => e
416                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
417             end
418             msa
419         end
420
421         def print_help()
422             puts( "Usage:" )
423             puts()
424             puts( "  " + PRG_NAME + ".rb <sequence names file suffix> <input dir containing sequence names files " +
425                  "and possibly genome multiple-sequence ('fasta') files> <output directory for sequences> <output directory for domains> [mapping file for " +
426                  "genome multiple-sequence ('fasta') files not in input dir]" )
427             puts()
428             puts( "  " + "Example: \"mse.rb .prot . seqs doms ../genome_locations.txt\"" )
429             puts()
430         end
431
432     end # class MultiSequenceExtractor
433 end