+ class MultiSequenceExtractor
+
+ # Program identification; handed to Util.print_program_information by #run
+ # and used as the message prefix for Util.fatal_error / Util.print_message.
+ # All string constants are frozen so they cannot be mutated in place by
+ # accident (e.g. via <<) somewhere else in the program.
+ PRG_NAME = "mse".freeze
+ PRG_VERSION = "1.0.0".freeze
+ PRG_DESC = "extraction of sequences by name from multiple multi-sequence ('fasta') files".freeze
+ PRG_DATE = "2008.08.13".freeze
+ COPYRIGHT = "2008-2009 Christian M Zmasek".freeze
+ CONTACT = "phylosoft@gmail.com".freeze
+ WWW = "www.phylosoft.org".freeze
+
+ # Command line options that make #run print the help text and exit.
+ HELP_OPTION_1 = 'help'.freeze
+ HELP_OPTION_2 = 'h'.freeze
+
+ # Suffixes appended to the base name of an input file to build the names
+ # of the generated output files.
+ LOG_SUFFIX = ".mse_log".freeze
+ FASTA_SUFFIX = ".fasta".freeze
+ FASTA_WITH_NORMALIZED_IDS_SUFFIX = ".ni.fasta".freeze
+ NORMALIZED_IDS_MAP_SUFFIX = ".nim".freeze
+
+ # Column separator used to split each line of a sequence names file.
+ PROTEINS_LIST_FILE_SEPARATOR = "\t".freeze
+
+ def run()
+
+ Util.print_program_information( PRG_NAME,
+ PRG_VERSION,
+ PRG_DESC ,
+ PRG_DATE,
+ COPYRIGHT,
+ CONTACT,
+ WWW,
+ STDOUT )
+
+ ld = Constants::LINE_DELIMITER
+
+ begin
+ cla = CommandLineArguments.new( ARGV )
+ rescue ArgumentError => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+ end
+
+ if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+ cla.is_option_set?( HELP_OPTION_2 ) )
+ print_help
+ exit( 0 )
+ end
+
+ if ( cla.get_number_of_files != 4 && cla.get_number_of_files != 5 )
+ print_help
+ exit( -1 )
+ end
+
+ allowed_opts = Array.new
+
+ disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+ if ( disallowed.length > 0 )
+ Util.fatal_error( PRG_NAME,
+ "unknown option(s): " + disallowed,
+ STDOUT )
+ end
+
+ seq_names_files_suffix = cla.get_file_name( 0 )
+ input_dir = cla.get_file_name( 1 )
+ out_dir = cla.get_file_name( 2 )
+ out_dir_doms = cla.get_file_name( 3 )
+ mapping_file = nil
+
+ if ( cla.get_number_of_files == 5 )
+ mapping_file = cla.get_file_name( 4 )
+ begin
+ Util.check_file_for_readability( mapping_file )
+ rescue ArgumentError => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+ end
+ end
+
+ if !File.exist?( input_dir )
+ Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
+ end
+ if !File.exist?( out_dir )
+ Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
+ end
+ if !File.exist?( out_dir_doms )
+ Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" )
+ end
+ if !File.directory?( input_dir )
+ Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
+ end
+ if !File.directory?( out_dir )
+ Util.fatal_error( PRG_NAME, "error: [#{out_dir}] is not a directory" )
+ end
+ if !File.directory?( out_dir_doms )
+ Util.fatal_error( PRG_NAME, "error: [#{out_dir_doms}] is not a directory" )
+ end
+
+
+ log = String.new
+
+ log << "Program : " + PRG_NAME + ld
+ log << "Version : " + PRG_VERSION + ld
+ log << "Program date : " + PRG_DATE + ld
+
+ puts()
+ puts( "Sequence names files suffix: " + seq_names_files_suffix )
+ log << "Sequence names files suffix: " + seq_names_files_suffix + ld
+ puts( "Input dir : " + input_dir )
+ log << "Input dir : " + input_dir + ld
+ puts( "Output dir : " + out_dir )
+ log << "Output dir : " + out_dir + ld
+ puts( "Output dir domains : " + out_dir_doms )
+ log << "Output dir domains : " + out_dir_doms + ld
+ if ( mapping_file != nil )
+ puts( "Mapping file : " + mapping_file )
+ log << "Mapping file : " + mapping_file + ld
+ end
+ log << "Date : " + Time.now.to_s + ld
+ puts
+
+ if ( mapping_file != nil )
+ species_codes_to_paths = extract_mappings( mapping_file )
+ end
+
+ input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
+
+ counter = 0
+
+ input_files.each { |input_file|
+ counter += 1
+ puts
+ puts
+ puts counter.to_s + "/" + input_files.size.to_s
+ read_seq_family_file( input_file,
+ seq_names_files_suffix,
+ input_dir,
+ species_codes_to_paths,
+ log,
+ out_dir,
+ out_dir_doms,
+ mapping_file )
+ }
+ puts
+ Util.print_message( PRG_NAME, "OK" )
+ puts
+
+ end
+
+
+ def read_seq_family_file( input_file,
+ seq_names_files_suffix,
+ input_dir,
+ species_codes_to_paths,
+ log,
+ out_dir,
+ out_dir_doms,
+ mapping_file )
+
+ begin
+ Util.check_file_for_readability( input_file )
+ rescue ArgumentError => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+ end
+ basename = File.basename( input_file, seq_names_files_suffix )
+ out_file_path_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
+ out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
+ out_file_path_ids_map = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX
+ doms_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains" + FASTA_SUFFIX
+ begin
+ Util.check_file_for_writability( out_file_path_fasta_file )
+ Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
+ Util.check_file_for_writability( out_file_path_ids_map )
+ Util.check_file_for_writability( doms_out_file_path_fasta_file )
+ rescue ArgumentError => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+ end
+
+ ids_map_writer = nil
+ begin
+ ids_map_writer = File.open( out_file_path_ids_map, 'a' )
+ rescue Exception => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+ end
+
+ current_species = ""
+ current_msa = nil
+ new_msa = Msa.new
+ new_msa_normalized_ids = Msa.new
+ new_msa_domains = Msa.new
+ per_species_counter = 0
+
+ puts basename
+
+ File.open( input_file ) do | file |
+ while line = file.gets
+ if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
+ values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
+
+ if ( values.length < 2 )
+ Util.fatal_error( PRG_NAME, "unexpected format: " + line )