X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fruby%2Fevoruby%2Flib%2Fevo%2Ftool%2Fmulti_sequence_extractor.rb;h=40b1211ffca6858fe65d39692eeef35a7e0ac934;hb=205d83c44194c54ae2165623f6de89c6f4fdea11;hp=b499c7262fa555d38560d9656740d58e5bdad257;hpb=dbf5b588d65d1c62094dd5d339eca5056a5ade5f;p=jalview.git diff --git a/forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb index b499c72..40b1211 100644 --- a/forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb @@ -23,12 +23,12 @@ module Evoruby class MultiSequenceExtractor PRG_NAME = "mse" - PRG_VERSION = "1.02" + PRG_VERSION = "1.03" PRG_DESC = "extraction of sequences by name from multiple multi-sequence ('fasta') files" - PRG_DATE = "2012.07.20" - COPYRIGHT = "2008-2012 Christian M Zmasek" + PRG_DATE = "131127" + COPYRIGHT = "2008-2013 Christian M Zmasek" CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" + WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" HELP_OPTION_1 = 'help' HELP_OPTION_2 = 'h' @@ -41,6 +41,10 @@ module Evoruby NORMALIZED_IDS_MAP_SUFFIX = ".nim" PROTEINS_LIST_FILE_SEPARATOR = "\t" + def initialize() + @file_to_msa = Hash.new + @seqs = 0 + end def run() @@ -245,6 +249,7 @@ module Evoruby puts basename File.open( input_file ) do | file | + species_counter = 1 while line = file.gets line.strip! if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) @@ -254,9 +259,9 @@ module Evoruby Util.fatal_error( PRG_NAME, "unexpected format: " + line ) end species = values[ 0 ] - if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA" || species == "EIMTE" - next - end + #if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA" || species == "EIMTE" + # next + #end seq_name = values[ 1 ] domain_ranges = nil if ( values.length > 3 ) @@ -286,35 +291,50 @@ module Evoruby print_counts( per_species_counter, log, Constants::LINE_DELIMITER ) per_species_counter = 0 end - puts " " + current_species + " [" + my_readlink + "]" - log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER + puts " " + species_counter.to_s + ":" + current_species + " [" + my_readlink + "]" + log << species_counter.to_s + ": " + current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER + species_counter += 1 end puts " " + seq_name log << " " + seq_name + Constants::LINE_DELIMITER per_species_counter = per_species_counter + 1 seq = nil - if current_msa.find_by_name_start( seq_name, true ).size > 0 - begin - seq = current_msa.get_by_name_start( seq_name, true ).copy - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - else + indices = current_msa.find_by_name_start( seq_name, true ) + if indices.size == 1 + seq = current_msa.get_sequence( indices[ 0 ] ) + elsif indices.size == 0 # Not found, try finding by partial match. begin seq = current_msa.get_by_name( seq_name, true, true ) rescue ArgumentError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s ) end + else + Util.fatal_error( PRG_NAME, "error: seq name \"" + seq_name + "\" not unique" ) end + # if current_msa.find_by_name_start( seq_name, true ).size > 0 + # begin + # seq = current_msa.get_by_name_start( seq_name, true ).copy + # rescue ArgumentError => e + # Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + # end + # else + # # Not found, try finding by partial match. + # begin + # seq = current_msa.get_by_name( seq_name, true, true ) + # rescue ArgumentError => e + # Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + # end + # end + normalized_id = per_species_counter.to_s( 16 ).upcase + "_" + current_species per_species_counter.to_i - ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER ) + ids_map_writer.write( normalized_id + "\t" + seq.get_name + Constants::LINE_DELIMITER ) orig_name = nil if seq != nil @@ -450,8 +470,6 @@ module Evoruby Util.fatal_error( PRG_NAME, "error: " + e.to_s ) end end - - end @@ -523,6 +541,10 @@ module Evoruby end def read_fasta_file( input ) + if @file_to_msa.has_key?( input ) + return @file_to_msa[ input ] + end + f = MsaFactory.new() msa = nil begin @@ -530,6 +552,11 @@ module Evoruby rescue Exception => e Util.fatal_error( PRG_NAME, "error: " + e.to_s ) end + if @seqs <= 100000000 + @file_to_msa[ input ] = msa + @seqs += msa.get_number_of_seqs + puts " total seqs in memory: " + @seqs.to_s + end msa end