From 599a0dfe599da928ccfe1a8bcba65dd78c21422c Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Wed, 27 Nov 2013 23:46:40 +0000 Subject: [PATCH] inprogress --- forester/ruby/evoruby/lib/evo/msa/msa.rb | 22 ++++++++++++-- .../lib/evo/tool/multi_sequence_extractor.rb | 32 ++++++++++++++------ 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/forester/ruby/evoruby/lib/evo/msa/msa.rb b/forester/ruby/evoruby/lib/evo/msa/msa.rb index 631eab0..ca6832d 100644 --- a/forester/ruby/evoruby/lib/evo/msa/msa.rb +++ b/forester/ruby/evoruby/lib/evo/msa/msa.rb @@ -19,6 +19,8 @@ module Evoruby def initialize() @sequences = Array.new @identical_seqs_detected = Array.new + @name_to_seq_indices = Hash.new + @namestart_to_seq_indices = Hash.new end @@ -47,6 +49,8 @@ module Evoruby " sequences" raise ArgumentError, error_msg end + @name_to_seq_indices.clear + @namestart_to_seq_indices.clear @sequences.delete_at( index ) end @@ -70,6 +74,9 @@ module Evoruby end def find_by_name( name, case_sensitive, partial_match ) + if case_sensitive && !partial_match && @name_to_seq_indices.has_key?( name ) + return @name_to_seq_indices[ name ] + end indices = Array.new() for i in 0 ... get_number_of_seqs() current_name = get_sequence( i ).get_name() @@ -82,6 +89,9 @@ module Evoruby indices.push( i ) end end + if case_sensitive && !partial_match + @name_to_seq_indices[ name ] = indices + end indices end @@ -116,6 +126,9 @@ module Evoruby end def find_by_name_start( name, case_sensitive ) + if case_sensitive && @namestart_to_seq_indices.has_key?( name ) + return @namestart_to_seq_indices[ name ] + end indices = [] for i in 0 ... get_number_of_seqs() get_sequence( i ).get_name() =~ /^\s*(\S+)/ @@ -124,10 +137,13 @@ module Evoruby current_name = current_name.downcase name = name.downcase end - if ( current_name == name ) + if current_name == name indices.push( i ) end end + if case_sensitive + @namestart_to_seq_indices[ name ] = indices + end indices end @@ -162,10 +178,10 @@ module Evoruby # throws ArgumentError def get_by_name_start( name, case_sensitive = true ) indices = find_by_name_start( name, case_sensitive ) - if ( indices.length > 1 ) + if indices.length > 1 error_msg = "\"" + name + "\" not unique" raise ArgumentError, error_msg - elsif ( indices.length < 1 ) + elsif indices.length < 1 error_msg = "\"" + name + "\" not found" raise ArgumentError, error_msg end diff --git a/forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb index 31f0272..ffabc45 100644 --- a/forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb @@ -23,9 +23,9 @@ module Evoruby class MultiSequenceExtractor PRG_NAME = "mse" - PRG_VERSION = "1.02" + PRG_VERSION = "1.03" PRG_DESC = "extraction of sequences by name from multiple multi-sequence ('fasta') files" - PRG_DATE = "130322" + PRG_DATE = "131127" COPYRIGHT = "2008-2013 Christian M Zmasek" CONTACT = "phylosoft@gmail.com" WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" @@ -294,21 +294,35 @@ module Evoruby per_species_counter = per_species_counter + 1 seq = nil - if current_msa.find_by_name_start( seq_name, true ).size > 0 - begin - seq = current_msa.get_by_name_start( seq_name, true ).copy - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - else + indices = current_msa.find_by_name_start( seq_name, true ) + if indices.size == 1 + seq = current_msa.get_sequence( indices[ 0 ] ) + elsif indices.size == 0 # Not found, try finding by partial match. begin seq = current_msa.get_by_name( seq_name, true, true ) rescue ArgumentError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s ) end + else + Util.fatal_error( PRG_NAME, "error: seq name \"" + seq_name + "\" not unique" ) end + # if current_msa.find_by_name_start( seq_name, true ).size > 0 + # begin + # seq = current_msa.get_by_name_start( seq_name, true ).copy + # rescue ArgumentError => e + # Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + # end + # else + # # Not found, try finding by partial match. + # begin + # seq = current_msa.get_by_name( seq_name, true, true ) + # rescue ArgumentError => e + # Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + # end + # end + normalized_id = per_species_counter.to_s( 16 ).upcase + "_" + current_species -- 1.7.10.2