X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fruby%2Fevoruby%2Flib%2Fevo%2Ftool%2Fmulti_sequence_extractor.rb;h=0c525adf94bf1314eab522719b766c1381f4df64;hb=af5b37d5113266b2e849729ff1c0b5ef94e628f9;hp=0df81fef6aec01110c8cf67d4f258b6b6b931469;hpb=5f63a927ff38eee44356b56ab120ef5d3aba6173;p=jalview.git diff --git a/forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb index 0df81fe..0c525ad 100644 --- a/forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb @@ -43,6 +43,7 @@ module Evoruby def initialize() @file_to_msa = Hash.new + @seqs = 0 end def run() @@ -248,6 +249,7 @@ module Evoruby puts basename File.open( input_file ) do | file | + species_counter = 1 while line = file.gets line.strip! if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) @@ -289,17 +291,17 @@ module Evoruby print_counts( per_species_counter, log, Constants::LINE_DELIMITER ) per_species_counter = 0 end - puts " " + current_species + " [" + my_readlink + "]" - log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER + puts " " + species_counter.to_s + ":" + current_species + " [" + my_readlink + "]" + log << species_counter.to_s << ": " << current_species << " [" + my_readlink + "]" << Constants::LINE_DELIMITER + species_counter += 1 end - puts " " + seq_name - log << " " + seq_name + Constants::LINE_DELIMITER + log << " " << seq_name << Constants::LINE_DELIMITER per_species_counter = per_species_counter + 1 seq = nil indices = current_msa.find_by_name_start( seq_name, true ) if indices.size == 1 - seq = current_msa.get_sequence( indices[ 0 ] ) + seq = current_msa.get_sequence( indices[ 0 ] ) elsif indices.size == 0 # Not found, try finding by partial match. begin @@ -311,21 +313,6 @@ module Evoruby Util.fatal_error( PRG_NAME, "error: seq name \"" + seq_name + "\" not unique" ) end - # if current_msa.find_by_name_start( seq_name, true ).size > 0 - # begin - # seq = current_msa.get_by_name_start( seq_name, true ).copy - # rescue ArgumentError => e - # Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - # end - # else - # # Not found, try finding by partial match. - # begin - # seq = current_msa.get_by_name( seq_name, true, true ) - # rescue ArgumentError => e - # Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - # end - # end - normalized_id = per_species_counter.to_s( 16 ).upcase + "_" + current_species @@ -533,8 +520,8 @@ module Evoruby end def print_counts( per_species_counter, log, ld ) - puts " [sum: " + per_species_counter.to_s + "]" - log << " [sum: " + per_species_counter.to_s + "]" + ld + puts " sum: " + per_species_counter.to_s + log << " sum: " + per_species_counter.to_s + ld end def read_fasta_file( input ) @@ -549,8 +536,10 @@ module Evoruby rescue Exception => e Util.fatal_error( PRG_NAME, "error: " + e.to_s ) end - if @file_to_msa.size < 400 && msa.get_number_of_seqs < 40000 + if @seqs <= 400000 @file_to_msa[ input ] = msa + @seqs += msa.get_number_of_seqs + puts " total seqs in memory: " + @seqs.to_s end msa end