X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fruby%2Fevoruby%2Flib%2Fevo%2Fapps%2Fmulti_sequence_extractor.rb;h=b499c7262fa555d38560d9656740d58e5bdad257;hb=0217958a797b6789dee8fabadaee2ca30719fb9b;hp=43aa80fbcd74e94672e29412b75cee92cbf99e96;hpb=dcdfe712174898d24c6c60f42a9296a0b8805388;p=jalview.git diff --git a/forester/ruby/evoruby/lib/evo/apps/multi_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/apps/multi_sequence_extractor.rb index 43aa80f..b499c72 100644 --- a/forester/ruby/evoruby/lib/evo/apps/multi_sequence_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/apps/multi_sequence_extractor.rb @@ -23,21 +23,25 @@ module Evoruby class MultiSequenceExtractor PRG_NAME = "mse" - PRG_VERSION = "1.0.0" + PRG_VERSION = "1.02" PRG_DESC = "extraction of sequences by name from multiple multi-sequence ('fasta') files" - PRG_DATE = "2008.08.13" - COPYRIGHT = "2008-2009 Christian M Zmasek" + PRG_DATE = "2012.07.20" + COPYRIGHT = "2008-2012 Christian M Zmasek" CONTACT = "phylosoft@gmail.com" WWW = "www.phylosoft.org" HELP_OPTION_1 = 'help' HELP_OPTION_2 = 'h' + EXT_OPTION = 'e' + EXTRACT_LINKERS_OPTION = 'l' LOG_SUFFIX = ".mse_log" + LINKERS_SUFFIX = ".linkers" FASTA_SUFFIX = ".fasta" FASTA_WITH_NORMALIZED_IDS_SUFFIX = ".ni.fasta" NORMALIZED_IDS_MAP_SUFFIX = ".nim" PROTEINS_LIST_FILE_SEPARATOR = "\t" + def run() Util.print_program_information( PRG_NAME, @@ -69,6 +73,8 @@ module Evoruby end allowed_opts = Array.new + allowed_opts.push(EXT_OPTION) + allowed_opts.push(EXTRACT_LINKERS_OPTION) disallowed = cla.validate_allowed_options_as_str( allowed_opts ) if ( disallowed.length > 0 ) @@ -92,6 +98,19 @@ module Evoruby end end + extension = 0 + if cla.is_option_set?(EXT_OPTION) + extension = cla.get_option_value_as_int(EXT_OPTION) + if extension < 0 + extension = 0 + end + end + + extract_linkers = false + if cla.is_option_set?(EXTRACT_LINKERS_OPTION) + extract_linkers = true + end + if !File.exist?( input_dir ) Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" ) end @@ -131,6 +150,14 @@ module Evoruby puts( "Mapping file : " + mapping_file ) log << "Mapping file : " + mapping_file + ld end + if extension > 0 + puts( "Extension : " + extension.to_s ) + log << "Extension : " + extension.to_s + ld + end + if extract_linkers + puts( "Extract linkers : true" ) + log << "Extract linkers : true" + ld + end log << "Date : " + Time.now.to_s + ld puts @@ -154,7 +181,9 @@ module Evoruby log, out_dir, out_dir_doms, - mapping_file ) + mapping_file, + extension, + extract_linkers ) } puts Util.print_message( PRG_NAME, "OK" ) @@ -170,7 +199,9 @@ module Evoruby log, out_dir, out_dir_doms, - mapping_file ) + mapping_file, + extension, + extract_linkers ) begin Util.check_file_for_readability( input_file ) @@ -182,6 +213,10 @@ module Evoruby out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX out_file_path_ids_map = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX doms_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains" + FASTA_SUFFIX + doms_ext_out_file_path_fasta_file = nil + if extension > 0 + doms_ext_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains_ext_" + extension.to_s + FASTA_SUFFIX + end begin Util.check_file_for_writability( out_file_path_fasta_file ) Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file ) @@ -203,15 +238,18 @@ module Evoruby new_msa = Msa.new new_msa_normalized_ids = Msa.new new_msa_domains = Msa.new + new_msa_domains_extended = Msa.new per_species_counter = 0 + linkers = "" puts basename File.open( input_file ) do | file | while line = file.gets - if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) ) + line.strip! + if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) values = line.split( PROTEINS_LIST_FILE_SEPARATOR ) - + mod_line = nil if ( values.length < 2 ) Util.fatal_error( PRG_NAME, "unexpected format: " + line ) end @@ -279,7 +317,7 @@ module Evoruby ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER ) orig_name = nil - if ( seq != nil ) + if seq != nil orig_name = seq.get_name seq.set_name( seq.get_name + " [" + current_species + "]" ) new_msa.add_sequence( seq ) @@ -287,21 +325,71 @@ module Evoruby Util.fatal_error( PRG_NAME, "unexected error: seq is nil" ) end - if domain_ranges != nil + if domain_ranges != nil + first = true + prev_to = -1 + domain_ranges.each { |range| if range != nil && range.length > 0 - s= range.split("-") - from = s[ 0 ] - to = s[ 1 ] - new_msa_domains.add_sequence( Sequence.new( orig_name + "/" + from + "-" + to + " [" + basename + "] [" + current_species + "]", seq.get_sequence_as_string[from.to_i..to.to_i] ) ) - end + s = range.split("-") + from = s[ 0 ].to_i + to = s[ 1 ].to_i + new_msa_domains.add_sequence( Sequence.new( orig_name + + " [" + from.to_s + + "-" + to.to_s + + "] [" + basename + "] [" + + current_species + "]", + seq.get_sequence_as_string[from..to] ) ) + if extension > 0 + from_e = from - extension + if from_e < 0 + from_e = 0 + end + to_e = to + extension + if to_e > seq.get_sequence_as_string.length - 1 + to_e = seq.get_sequence_as_string.length - 1 + end + new_msa_domains_extended.add_sequence( Sequence.new( orig_name + + " [" + from.to_s + + "-" + to.to_s + + "] [extended by " + + extension.to_s + + "] [" + basename + "] [" + + current_species + "]", + seq.get_sequence_as_string[ from_e..to_e ] ) ) + end # extension > 0 + if extract_linkers + if first + first = false + f = 0 + t = from - 1 + if extension > 0 + f = from - extension + end + mod_line = line + "\t[" + get_linker_sequence( f, t, seq ) + "|" + else + mod_line += get_linker_sequence( prev_to + 1, from - 1, seq ) + "|" + end + prev_to = to + end + end # range != nil && range.length > 0 } + if extract_linkers && prev_to > 0 + f = prev_to + 1 + t = seq.get_sequence_as_string.length - 1 + if extension > 0 + t = prev_to + extension + end + mod_line += get_linker_sequence( f, t, seq ) + "]" + end end new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) ) - - end - end + if mod_line + linkers << mod_line + Constants::LINE_DELIMITER + end + end # !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) + end # while line = file.gets end @@ -323,7 +411,7 @@ module Evoruby Util.fatal_error( PRG_NAME, "error: " + e.to_s ) end - if new_msa_domains != nil + if new_msa_domains != nil begin io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer ) rescue Exception => e @@ -331,6 +419,14 @@ module Evoruby end end + if extension > 0 && new_msa_domains_extended != nil + begin + io.write_to_file( new_msa_domains_extended, doms_ext_out_file_path_fasta_file, fasta_writer ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + end + begin io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer ) rescue Exception => e @@ -345,6 +441,32 @@ module Evoruby Util.fatal_error( PRG_NAME, "error: " + e.to_s ) end + if extract_linkers && linkers != nil + begin + f = File.open( out_dir + Constants::FILE_SEPARATOR + basename + LINKERS_SUFFIX , 'a' ) + f.print( linkers ) + f.close + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + end + + + end + + + def get_linker_sequence( from, to, seq ) + if from < 0 + from = 0 + end + if to > seq.get_sequence_as_string.length - 1 + to = seq.get_sequence_as_string.length - 1 + end + if from > to + return "" + else + return from.to_s + "-" + to.to_s + ":" + seq.get_sequence_as_string[ from..to ] + end end def obtain_inputfiles( input_dir, seq_names_files_suffix ) @@ -418,7 +540,10 @@ module Evoruby "and possibly genome multiple-sequence ('fasta') files> [mapping file for " + "genome multiple-sequence ('fasta') files not in input dir]" ) puts() - puts( " " + "Example: \"mse.rb .prot . seqs doms ../genome_locations.txt\"" ) + puts( " option: -" + EXT_OPTION + "=: to extend extracted domains" ) + puts( " -" + EXTRACT_LINKERS_OPTION + " : to extract linkers" ) + puts() + puts( " " + "Example: \"mse.rb .prot . protein_seqs domain_seqs ../genome_locations.txt\"" ) puts() end