From 6e8476f2d8e828d94551a499ed70f0ae912e6921 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Fri, 20 Jul 2012 00:27:33 +0000 Subject: [PATCH] in progress --- .../lib/evo/apps/multi_sequence_extractor.rb | 63 ++++++++++++++++---- 1 file changed, 53 insertions(+), 10 deletions(-) diff --git a/forester/ruby/evoruby/lib/evo/apps/multi_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/apps/multi_sequence_extractor.rb index f8838a7..aa860f1 100644 --- a/forester/ruby/evoruby/lib/evo/apps/multi_sequence_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/apps/multi_sequence_extractor.rb @@ -23,15 +23,16 @@ module Evoruby class MultiSequenceExtractor PRG_NAME = "mse" - PRG_VERSION = "1.0.0" + PRG_VERSION = "1.01" PRG_DESC = "extraction of sequences by name from multiple multi-sequence ('fasta') files" - PRG_DATE = "2008.08.13" - COPYRIGHT = "2008-2009 Christian M Zmasek" + PRG_DATE = "2012.07.19" + COPYRIGHT = "2008-2012 Christian M Zmasek" CONTACT = "phylosoft@gmail.com" WWW = "www.phylosoft.org" HELP_OPTION_1 = 'help' HELP_OPTION_2 = 'h' + EXT_OPTION = 'e' LOG_SUFFIX = ".mse_log" FASTA_SUFFIX = ".fasta" FASTA_WITH_NORMALIZED_IDS_SUFFIX = ".ni.fasta" @@ -69,6 +70,7 @@ module Evoruby end allowed_opts = Array.new + allowed_opts.push(EXT_OPTION) disallowed = cla.validate_allowed_options_as_str( allowed_opts ) if ( disallowed.length > 0 ) @@ -92,6 +94,14 @@ module Evoruby end end + extension = 0 + if cla.is_option_set?(EXT_OPTION) + extension = cla.get_option_value_as_int(EXT_OPTION) + if extension < 0 + extension = 0 + end + end + if !File.exist?( input_dir ) Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" ) end @@ -131,6 +141,10 @@ module Evoruby puts( "Mapping file : " + mapping_file ) log << "Mapping file : " + mapping_file + ld end + if ( extension > 0 ) + puts( "Extension : " + extension.to_s ) + log << "Extension : " + extension.to_s + ld + end log << "Date : " + Time.now.to_s + ld puts @@ -154,7 +168,8 @@ module Evoruby 
log, out_dir, out_dir_doms, - mapping_file ) + mapping_file, + extension ) } puts Util.print_message( PRG_NAME, "OK" ) @@ -170,7 +185,8 @@ module Evoruby log, out_dir, out_dir_doms, - mapping_file ) + mapping_file, + extension ) begin Util.check_file_for_readability( input_file ) @@ -182,6 +198,10 @@ module Evoruby out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX out_file_path_ids_map = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX doms_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains" + FASTA_SUFFIX + doms_ext_out_file_path_fasta_file = nil + if extension > 0 + doms_ext_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains_ext_" + extension.to_s + FASTA_SUFFIX + end begin Util.check_file_for_writability( out_file_path_fasta_file ) Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file ) @@ -203,6 +223,7 @@ module Evoruby new_msa = Msa.new new_msa_normalized_ids = Msa.new new_msa_domains = Msa.new + new_msa_domains_extended = Msa.new per_species_counter = 0 puts basename @@ -291,10 +312,22 @@ module Evoruby if domain_ranges != nil domain_ranges.each { |range| if range != nil && range.length > 0 - s= range.split("-") - from = s[ 0 ] - to = s[ 1 ] - new_msa_domains.add_sequence( Sequence.new( orig_name + "/" + from + "-" + to + " [" + basename + "] [" + current_species + "]", seq.get_sequence_as_string[from.to_i..to.to_i] ) ) + s = range.split("-") + from = s[ 0 ].to_i + to = s[ 1 ].to_i + new_msa_domains.add_sequence( Sequence.new( orig_name + "/" + from.to_s + "-" + to.to_s + " [" + basename + "] [" + current_species + "]", seq.get_sequence_as_string[from..to] ) ) + if extension > 0 + from_e = from - extension + if from_e < 0 + from_e = 0 + end + to_e = to + extension + if to_e > seq.get_sequence_as_string.length - 1 + to_e = seq.get_sequence_as_string.length - 1 + end + 
new_msa_domains_extended.add_sequence( Sequence.new( orig_name + "/" + from.to_s + "-" + to.to_s + " [ext: " + extension.to_s + "] [" + basename + "] [" + current_species + "]", + seq.get_sequence_as_string[ from_e..to_e ] ) ) + end end } end @@ -324,7 +357,7 @@ module Evoruby Util.fatal_error( PRG_NAME, "error: " + e.to_s ) end - if new_msa_domains != nil + if new_msa_domains != nil begin io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer ) rescue Exception => e @@ -332,6 +365,14 @@ module Evoruby end end + if extension > 0 && new_msa_domains_extended != nil + begin + io.write_to_file( new_msa_domains_extended, doms_ext_out_file_path_fasta_file, fasta_writer ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + end + begin io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer ) rescue Exception => e @@ -419,6 +460,8 @@ module Evoruby "and possibly genome multiple-sequence ('fasta') files> [mapping file for " + "genome multiple-sequence ('fasta') files not in input dir]" ) puts() + puts( " option: -" + EXT_OPTION + "=: to extend extracted domains" ) + puts() puts( " " + "Example: \"mse.rb .prot . seqs doms ../genome_locations.txt\"" ) puts() end -- 1.7.10.2