in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / apps / multi_sequence_extractor.rb
index bf38d07..f8838a7 100644 (file)
@@ -20,412 +20,408 @@ require 'lib/evo/util/command_line_arguments'
 
 module Evoruby
 
-    class MultiSequenceExtractor
-
-        PRG_NAME                           = "mse"
-        PRG_VERSION                        = "1.0.0"
-        PRG_DESC                           = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
-        PRG_DATE                           = "2008.08.13"
-        COPYRIGHT                          = "2008-2009 Christian M Zmasek"
-        CONTACT                            = "phylosoft@gmail.com"
-        WWW                                = "www.phylosoft.org"
-        HELP_OPTION_1                      = 'help'
-        HELP_OPTION_2                      = 'h'
-
-        LOG_SUFFIX                          = ".mse_log"
-        FASTA_SUFFIX                        = ".fasta"
-        FASTA_WITH_NORMALIZED_IDS_SUFFIX    = ".ni.fasta"
-        NORMALIZED_IDS_MAP_SUFFIX           = ".nim"
-        PROTEINS_LIST_FILE_SEPARATOR        = "\t"
-
-        def run()
-
-            Util.print_program_information( PRG_NAME,
-                PRG_VERSION,
-                PRG_DESC ,
-                PRG_DATE,
-                COPYRIGHT,
-                CONTACT,
-                WWW,
-                STDOUT )
-
-            ld = Constants::LINE_DELIMITER
-
-            begin
-                cla = CommandLineArguments.new( ARGV )
-            rescue ArgumentError => e
-                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+  class MultiSequenceExtractor
+
+    PRG_NAME                           = "mse"
+    PRG_VERSION                        = "1.0.0"
+    PRG_DESC                           = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
+    PRG_DATE                           = "2008.08.13"
+    COPYRIGHT                          = "2008-2009 Christian M Zmasek"
+    CONTACT                            = "phylosoft@gmail.com"
+    WWW                                = "www.phylosoft.org"
+    HELP_OPTION_1                      = 'help'
+    HELP_OPTION_2                      = 'h'
+
+    LOG_SUFFIX                          = ".mse_log"
+    FASTA_SUFFIX                        = ".fasta"
+    FASTA_WITH_NORMALIZED_IDS_SUFFIX    = ".ni.fasta"
+    NORMALIZED_IDS_MAP_SUFFIX           = ".nim"
+    PROTEINS_LIST_FILE_SEPARATOR        = "\t"
+
+    def run()
+
+      Util.print_program_information( PRG_NAME,
+        PRG_VERSION,
+        PRG_DESC ,
+        PRG_DATE,
+        COPYRIGHT,
+        CONTACT,
+        WWW,
+        STDOUT )
+
+      ld = Constants::LINE_DELIMITER
+
+      begin
+        cla = CommandLineArguments.new( ARGV )
+      rescue ArgumentError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+      if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+           cla.is_option_set?( HELP_OPTION_2 ) )
+        print_help
+        exit( 0 )
+      end
+
+      if ( cla.get_number_of_files != 4 && cla.get_number_of_files != 5 )
+        print_help
+        exit( -1 )
+      end
+
+      allowed_opts = Array.new
+
+      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+      if ( disallowed.length > 0 )
+        Util.fatal_error( PRG_NAME,
+          "unknown option(s): " + disallowed,
+          STDOUT )
+      end
+
+      seq_names_files_suffix = cla.get_file_name( 0 )
+      input_dir              = cla.get_file_name( 1 )
+      out_dir                = cla.get_file_name( 2 )
+      out_dir_doms           = cla.get_file_name( 3 )
+      mapping_file            = nil
+
+      if ( cla.get_number_of_files == 5 )
+        mapping_file = cla.get_file_name( 4 )
+        begin
+          Util.check_file_for_readability( mapping_file )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+        end
+      end
+
+      if  !File.exist?( input_dir )
+        Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
+      end
+      if  !File.exist?( out_dir )
+        Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
+      end
+      if  !File.exist?( out_dir_doms )
+        Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" )
+      end
+      if !File.directory?( input_dir )
+        Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
+      end
+      if !File.directory?( out_dir )
+        Util.fatal_error( PRG_NAME, "error:  [#{out_dir}] is not a directory" )
+      end
+      if !File.directory?( out_dir_doms )
+        Util.fatal_error( PRG_NAME, "error:  [#{out_dir_doms}] is not a directory" )
+      end
+
+
+      log = String.new
+
+      log << "Program            : " + PRG_NAME + ld
+      log << "Version            : " + PRG_VERSION + ld
+      log << "Program date       : " + PRG_DATE + ld
+
+      puts()
+      puts( "Sequence names files suffix: " + seq_names_files_suffix )
+      log << "Sequence names files suffix: " + seq_names_files_suffix + ld
+      puts( "Input dir                  : " + input_dir )
+      log << "Input dir                  : " + input_dir + ld
+      puts( "Output dir                 : " + out_dir )
+      log << "Output dir                 : " + out_dir + ld
+      puts( "Output dir domains         : " + out_dir_doms )
+      log << "Output dir domains         : " + out_dir_doms + ld
+      if ( mapping_file != nil )
+        puts( "Mapping file               : " + mapping_file )
+        log << "Mapping file               : " + mapping_file + ld
+      end
+      log << "Date                       : " + Time.now.to_s + ld
+      puts
+
+      if ( mapping_file != nil )
+        species_codes_to_paths = extract_mappings( mapping_file )
+      end
+
+      input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
+
+      counter = 0
+
+      input_files.each { |input_file|
+        counter += 1
+        puts
+        puts
+        puts counter.to_s + "/" + input_files.size.to_s
+        read_seq_family_file( input_file,
+          seq_names_files_suffix,
+          input_dir,
+          species_codes_to_paths,
+          log,
+          out_dir,
+          out_dir_doms,
+          mapping_file )
+      }
+      puts
+      Util.print_message( PRG_NAME, "OK" )
+      puts
+
+    end
+
+
+    def read_seq_family_file( input_file,
+        seq_names_files_suffix,
+        input_dir,
+        species_codes_to_paths,
+        log,
+        out_dir,
+        out_dir_doms,
+        mapping_file )
+
+      begin
+        Util.check_file_for_readability( input_file )
+      rescue ArgumentError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+      basename = File.basename( input_file, seq_names_files_suffix )
+      out_file_path_fasta_file                = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
+      out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
+      out_file_path_ids_map                   = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX
+      doms_out_file_path_fasta_file           = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains" + FASTA_SUFFIX
+      begin
+        Util.check_file_for_writability( out_file_path_fasta_file )
+        Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
+        Util.check_file_for_writability( out_file_path_ids_map  )
+        Util.check_file_for_writability( doms_out_file_path_fasta_file  )
+      rescue ArgumentError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+      ids_map_writer = nil
+      begin
+        ids_map_writer = File.open( out_file_path_ids_map, 'a' )
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+      current_species         = ""
+      current_msa            = nil
+      new_msa                = Msa.new
+      new_msa_normalized_ids = Msa.new
+      new_msa_domains = Msa.new
+      per_species_counter = 0
+
+      puts basename
+
+      File.open( input_file ) do | file |
+        while line = file.gets
+          line.strip!
+          if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
+            values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
+
+            if ( values.length < 2 )
+              Util.fatal_error( PRG_NAME, "unexpected format: " + line )
             end
-
-            if ( cla.is_option_set?( HELP_OPTION_1 ) ||
-                     cla.is_option_set?( HELP_OPTION_2 ) )
-                print_help
-                exit( 0 )
+            species = values[ 0 ]
+            if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA"  || species == "EIMTE"
+              next
             end
-
-            if ( cla.get_number_of_files != 4 && cla.get_number_of_files != 5 )
-                print_help
-                exit( -1 )
-            end
-
-            allowed_opts = Array.new
-
-            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
-            if ( disallowed.length > 0 )
-                Util.fatal_error( PRG_NAME,
-                    "unknown option(s): " + disallowed,
-                    STDOUT )
+            seq_name = values[ 1 ]
+            domain_ranges = nil
+            if ( values.length > 3 )
+              domain_ranges_block = values[ 3 ]
+              domain_ranges = domain_ranges_block.split( "/" )
             end
-
-            seq_names_files_suffix = cla.get_file_name( 0 )
-            input_dir              = cla.get_file_name( 1 )
-            out_dir                = cla.get_file_name( 2 )
-            out_dir_doms           = cla.get_file_name( 3 )
-            mapping_file            = nil
-
-            if ( cla.get_number_of_files == 5 )
-                mapping_file = cla.get_file_name( 4 )
-                begin
-                    Util.check_file_for_readability( mapping_file )
-                rescue ArgumentError => e
-                    Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            if ( species != current_species )
+              current_species = species
+              my_file = input_dir + Constants::FILE_SEPARATOR + current_species
+
+              if ( !File.exist?( my_file ) )
+                if species_codes_to_paths == nil
+                  Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
+                elsif ( !species_codes_to_paths.has_key?( current_species ) )
+                  Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
                 end
+                my_file = species_codes_to_paths[ current_species ]
+              end
+              my_path = File.expand_path( my_file )
+              my_readlink = my_path
+              if ( File.symlink?( my_path ) )
+                my_readlink = File.readlink( my_path )
+              end
+              current_msa = read_fasta_file( my_file )
+
+              if ( per_species_counter > 0 )
+                print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
+                per_species_counter = 0
+              end
+              puts " " + current_species + " [" + my_readlink + "]"
+              log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER
             end
-
-            if  !File.exist?( input_dir )
-                Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
-            end
-            if  !File.exist?( out_dir )
-                Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
-            end
-              if  !File.exist?( out_dir_doms )
-                Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" )
-            end
-            if !File.directory?( input_dir )
-                Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
-            end
-            if !File.directory?( out_dir )
-                Util.fatal_error( PRG_NAME, "error:  [#{out_dir}] is not a directory" )
-            end
-            if !File.directory?( out_dir_doms )
-                Util.fatal_error( PRG_NAME, "error:  [#{out_dir_doms}] is not a directory" )
+            puts "   " + seq_name
+            log << "   " + seq_name + Constants::LINE_DELIMITER
+            per_species_counter = per_species_counter + 1
+            seq = nil
+
+            if current_msa.find_by_name_start( seq_name, true ).size > 0
+              begin
+                seq = current_msa.get_by_name_start( seq_name, true ).copy
+              rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+              end
+            else
+              # Not found, try finding by partial match.
+              begin
+                seq = current_msa.get_by_name( seq_name, true, true )
+              rescue ArgumentError => e
+                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+              end
             end
 
+            normalized_id = per_species_counter.to_s( 16 ).upcase +
+             "_" + current_species
 
-            log = String.new
-
-            log << "Program            : " + PRG_NAME + ld
-            log << "Version            : " + PRG_VERSION + ld
-            log << "Program date       : " + PRG_DATE + ld
-
-            puts()
-            puts( "Sequence names files suffix: " + seq_names_files_suffix )
-            log << "Sequence names files suffix: " + seq_names_files_suffix + ld
-            puts( "Input dir                  : " + input_dir )
-            log << "Input dir                  : " + input_dir + ld
-            puts( "Output dir                 : " + out_dir )
-            log << "Output dir                 : " + out_dir + ld
-            puts( "Output dir domains         : " + out_dir_doms )
-            log << "Output dir domains         : " + out_dir_doms + ld
-            if ( mapping_file != nil )
-                puts( "Mapping file               : " + mapping_file )
-                log << "Mapping file               : " + mapping_file + ld
+            per_species_counter.to_i
+
+            ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER )
+
+            orig_name = nil
+            if ( seq != nil )
+              orig_name = seq.get_name
+              seq.set_name( seq.get_name + " [" + current_species + "]" )
+              new_msa.add_sequence( seq )
+            else
+              Util.fatal_error( PRG_NAME, "unexected error: seq is nil" )
             end
-            log << "Date                       : " + Time.now.to_s + ld
-            puts
 
-            if ( mapping_file != nil )
-                species_codes_to_paths = extract_mappings( mapping_file )
+            if  domain_ranges != nil
+              domain_ranges.each { |range|
+                if range != nil && range.length > 0
+                  s= range.split("-")
+                  from = s[ 0 ]
+                  to = s[ 1 ]
+                  new_msa_domains.add_sequence( Sequence.new( orig_name + "/" + from + "-" + to + " [" + basename + "] [" + current_species + "]", seq.get_sequence_as_string[from.to_i..to.to_i] ) )
+                end
+              }
             end
 
-            input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
-
-            counter = 0
-            species_to_genomes = Hash.new()
-
-            input_files.each { |input_file|
-                counter += 1
-                puts
-                puts
-                puts counter.to_s + "/" + input_files.size.to_s
-                read_seq_family_file( input_file,
-                    seq_names_files_suffix,
-                    input_dir,
-                    species_codes_to_paths,
-                    species_to_genomes,
-                    log,
-                    out_dir,
-                    out_dir_doms,
-                    mapping_file )
-            }
-            puts
-            Util.print_message( PRG_NAME, "OK" )
-            puts
+            new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
 
+          end
         end
 
+      end
 
-        def read_seq_family_file( input_file,
-                seq_names_files_suffix,
-                input_dir,
-                species_codes_to_paths,
-                species_to_genomes,
-                log,
-                out_dir,
-                out_dir_doms,
-                mapping_file )
+      ids_map_writer.close
 
-            begin
-                Util.check_file_for_readability( input_file )
-            rescue ArgumentError => e
-                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
-            end
-            basename = File.basename( input_file, seq_names_files_suffix )
-            out_file_path_fasta_file                = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
-            out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
-            out_file_path_ids_map                   = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX 
-            doms_out_file_path_fasta_file           = out_dir_doms + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
-            begin
-                Util.check_file_for_writability( out_file_path_fasta_file )
-                Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
-                Util.check_file_for_writability( out_file_path_ids_map  )
-                Util.check_file_for_writability( doms_out_file_path_fasta_file  )
-            rescue ArgumentError => e
-                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
-            end
-          
-            ids_map_writer = nil
-            begin
-                ids_map_writer = File.open( out_file_path_ids_map, 'a' )
-            rescue Exception => e
-                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
-            end
-            
-            current_species         = ""
-            current_msa            = nil
-            new_msa                = Msa.new
-            new_msa_normalized_ids = Msa.new
-            new_msa_domains = Msa.new
-            per_species_counter = 0
-
-            puts basename
-
-            File.open( input_file ) do | file |
-                while line = file.gets
-                    if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
-                        values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
-                       
-                        if ( values.length < 2 )
-                            Util.fatal_error( PRG_NAME, "unexpected format: " + line )
-                        end
-                        species = values[ 0 ]
-                        if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA"  || species == "EIMTE"
-                          next
-                        end
-                        seq_name = values[ 1 ]
-                        domain_ranges = nil
-                        if ( values.length > 3 )        
-                          domain_ranges_block = values[ 3 ]
-                          domain_ranges = domain_ranges_block.split( "/" )
-                        end
-                        if ( species != current_species )
-                            current_species = species
-                            my_file = input_dir + Constants::FILE_SEPARATOR + current_species
-
-                            if ( !File.exist?( my_file ) )
-                                if species_codes_to_paths == nil
-                                    Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
-                                elsif ( !species_codes_to_paths.has_key?( current_species ) )
-                                    Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
-                                end
-                                my_file = species_codes_to_paths[ current_species ]
-                            end
-                            my_path = File.expand_path( my_file )
-                            my_readlink = my_path
-                            if ( File.symlink?( my_path ) )
-                                my_readlink = File.readlink( my_path )
-                            end
-                            current_msa = read_fasta_file( my_file )
-                            end
-
-                            if ( per_species_counter > 0 )
-                                print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
-                                per_species_counter = 0
-                            end
-                            puts " " + current_species + " [" + my_readlink + "]"
-                            log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER
-                        end
-                        puts "   " + seq_name
-                        log << "   " + seq_name + Constants::LINE_DELIMITER
-                        per_species_counter = per_species_counter + 1
-                        seq = nil
-                        
-                        if current_msa.find_by_name_start( seq_name, true ).size > 0
-                            begin
-                                seq = current_msa.get_by_name_start( seq_name, true ).copy
-                            rescue ArgumentError => e
-                                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
-                            end
-                        elsif    
-                            # Not found, try finding by partial match.
-                            begin
-                                seq = current_msa.get_by_name( seq_name, true, true )
-                            rescue ArgumentError => e
-                                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
-                            end
-                        end
-
-                        normalized_id = per_species_counter.to_s( 16 ).upcase +
-                         "_" + current_species
-                        
-                        per_species_counter.to_i
-                        
-                        ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER )
-                        
-                        orig_name = nil
-                        if ( seq != nil )
-                            orig_name = seq.get_name
-                            seq.set_name( seq.get_name + " [" + current_species + "]" )
-                            new_msa.add_sequence( seq )
-                        else
-                            Util.fatal_error( PRG_NAME, "unexected error: seq is nil" )
-                        end
-                        
-                        if  domain_ranges != nil 
-                            domain_ranges.each { |range| 
-                                if range != nil
-                                    s= range.split("-")
-                                    from = s[ 0 ]
-                                    to = s[ 1 ]
-                                    puts from + "-" + to
-                                    new_msa_domains.add_sequence( Sequence.new( orig_name + "/" + from + "-" + to + " [" + basename + "] [" + current_species + "]", seq.get_sequence_as_string[from..to] ) )
-                                end   
-                            }
-                        end
-                        
-                        new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
-                       
-                    end
-                end
+      if ( per_species_counter > 0 )
+        print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
+      end
 
-            end
-         
-            ids_map_writer.close
-           
-            if ( per_species_counter > 0 )
-                print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
-            end
+      io = MsaIO.new()
 
-            io = MsaIO.new()
+      fasta_writer = FastaWriter.new()
+      fasta_writer.remove_gap_chars
+      fasta_writer.clean
 
-            fasta_writer = FastaWriter.new()
-            fasta_writer.remove_gap_chars
-            fasta_writer.clean
-            
-            begin
-                io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
-            rescue Exception => e
-                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
-            end
-            
-            if  new_msa_domains != nil 
-                begin
-                    io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer )
-                rescue Exception => e
-                    Util.fatal_error( PRG_NAME, "error: " + e.to_s )
-                end
-           end
-            
-            begin
-                io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
-            rescue Exception => e
-                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
-            end
-
-            begin
-                f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LOG_SUFFIX , 'a' )
-                f.print( log )
-                f.close
-            rescue Exception => e
-                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
-            end
+      begin
+        io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
 
+      if  new_msa_domains != nil
+        begin
+          io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer )
+        rescue Exception => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s )
         end
-
-        def obtain_inputfiles( input_dir, seq_names_files_suffix )
-            input_files = Array.new()
-            Dir.foreach( input_dir ) { |file_name|
-                if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
-                    input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
-                end
-            }
-            input_files
-        end
-
-        def extract_mappings( mapping_file )
-            species_code_to_path = Hash.new()
-            File.open( mapping_file ) do | file |
-                while line = file.gets
-                    if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
-                        if ( line =~ /(\S+)\s+(\S+)/ )
-                            species = $1
-                            path = $2
-                            if ( species_code_to_path.has_key?( species ) )
-                                Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
-                            end
-                            if ( species_code_to_path.has_value?( path ) )
-                                Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
-                            end
-                            if ( !File.exist?( path ) )
-                                Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
-                            end
-                            if ( !File.file?( path ) )
-                                Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
-                            end
-                            if ( !File.readable?( path ) )
-                                Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
-                            end
-                            if ( File.size( path ) < 10000 )
-                                Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
-                            end
-                            if ( !Util.looks_like_fasta?( path ) )
-                                Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
-                            end
-                            species_code_to_path[ species ] = path
-                            puts species + " -> " + path
-                        end
-                    end
-                end
-            end
-            species_code_to_path
-        end
-
-        def print_counts( per_species_counter, log, ld )
-            puts "   [sum: " + per_species_counter.to_s + "]"
-            log << "   [sum: " + per_species_counter.to_s + "]" + ld
+      end
+
+      begin
+        io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+      begin
+        f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LOG_SUFFIX , 'a' )
+        f.print( log )
+        f.close
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+    end
+
+    def obtain_inputfiles( input_dir, seq_names_files_suffix )
+      input_files = Array.new()
+      Dir.foreach( input_dir ) { |file_name|
+        if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
+          input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
         end
-
-        def read_fasta_file( input )
-            f = MsaFactory.new()
-            msa = nil
-            begin
-                msa = f.create_msa_from_file( input, FastaParser.new() )
-            rescue Exception => e
-                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      }
+      input_files
+    end
+
+    def extract_mappings( mapping_file )
+      species_code_to_path = Hash.new()
+      File.open( mapping_file ) do | file |
+        while line = file.gets
+          if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
+            if ( line =~ /(\S+)\s+(\S+)/ )
+              species = $1
+              path = $2
+              if ( species_code_to_path.has_key?( species ) )
+                Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
+              end
+              if ( species_code_to_path.has_value?( path ) )
+                Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
+              end
+              if ( !File.exist?( path ) )
+                Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
+              end
+              if ( !File.file?( path ) )
+                Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
+              end
+              if ( !File.readable?( path ) )
+                Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
+              end
+              if ( File.size( path ) < 10000 )
+                Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
+              end
+              if ( !Util.looks_like_fasta?( path ) )
+                Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
+              end
+              species_code_to_path[ species ] = path
+              puts species + " -> " + path
             end
-            msa
+          end
         end
-
-        def print_help()
-            puts( "Usage:" )
-            puts()
-            puts( "  " + PRG_NAME + ".rb <sequence names file suffix> <input dir containing sequence names files " +
-                 "and possibly genome multiple-sequence ('fasta') files> <output directory for sequences> <output directory for domains> [mapping file for " +
-                 "genome multiple-sequence ('fasta') files not in input dir]" )
-            puts()
-            puts( "  " + "Example: \"mse.rb .prot . seqs doms ../genome_locations.txt\"" )
-            puts()
-        end
-
-    end # class MultiSequenceExtractor
+      end
+      species_code_to_path
+    end
+
+    def print_counts( per_species_counter, log, ld )
+      puts "   [sum: " + per_species_counter.to_s + "]"
+      log << "   [sum: " + per_species_counter.to_s + "]" + ld
+    end
+
+    def read_fasta_file( input )
+      f = MsaFactory.new()
+      msa = nil
+      begin
+        msa = f.create_msa_from_file( input, FastaParser.new() )
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+      msa
+    end
+
+    def print_help()
+      puts( "Usage:" )
+      puts()
+      puts( "  " + PRG_NAME + ".rb <sequence names file suffix> <input dir containing sequence names files " +
+         "and possibly genome multiple-sequence ('fasta') files> <output directory for sequences> <output directory for domains> [mapping file for " +
+         "genome multiple-sequence ('fasta') files not in input dir]" )
+      puts()
+      puts( "  " + "Example: \"mse.rb .prot . seqs doms ../genome_locations.txt\"" )
+      puts()
+    end
+
+  end # class MultiSequenceExtractor
 end
\ No newline at end of file