inprogress
author cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 27 Nov 2013 23:46:40 +0000 (23:46 +0000)
committer cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 27 Nov 2013 23:46:40 +0000 (23:46 +0000)
forester/ruby/evoruby/lib/evo/msa/msa.rb
forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb

index 631eab0..ca6832d 100644 (file)
@@ -19,6 +19,8 @@ module Evoruby
     def initialize()
       @sequences = Array.new
       @identical_seqs_detected = Array.new
+      @name_to_seq_indices = Hash.new
+      @namestart_to_seq_indices = Hash.new
     end
 
 
@@ -47,6 +49,8 @@ module Evoruby
          " sequences"
         raise ArgumentError, error_msg
       end
+      @name_to_seq_indices.clear
+      @namestart_to_seq_indices.clear
       @sequences.delete_at( index )
     end
 
@@ -70,6 +74,9 @@ module Evoruby
     end
 
     def find_by_name( name, case_sensitive, partial_match )
+      if case_sensitive && !partial_match && @name_to_seq_indices.has_key?( name )
+        return @name_to_seq_indices[ name ]
+      end
       indices = Array.new()
       for i in 0 ... get_number_of_seqs()
         current_name = get_sequence( i ).get_name()
@@ -82,6 +89,9 @@ module Evoruby
           indices.push( i )
         end
       end
+      if case_sensitive && !partial_match
+        @name_to_seq_indices[ name ] = indices
+      end
       indices
     end
 
@@ -116,6 +126,9 @@ module Evoruby
     end
 
     def find_by_name_start( name, case_sensitive )
+      if case_sensitive && @namestart_to_seq_indices.has_key?( name )
+        return @namestart_to_seq_indices[ name ]
+      end
       indices = []
       for i in 0 ... get_number_of_seqs()
         get_sequence( i ).get_name() =~ /^\s*(\S+)/
@@ -124,10 +137,13 @@ module Evoruby
           current_name = current_name.downcase
           name = name.downcase
         end
-        if  ( current_name == name )
+        if  current_name == name
           indices.push( i )
         end
       end
+      if case_sensitive
+        @namestart_to_seq_indices[ name ] = indices
+      end
       indices
     end
 
@@ -162,10 +178,10 @@ module Evoruby
     # throws ArgumentError
     def get_by_name_start( name, case_sensitive = true )
       indices = find_by_name_start( name, case_sensitive )
-      if ( indices.length > 1 )
+      if indices.length > 1
         error_msg = "\"" + name + "\" not unique"
         raise ArgumentError, error_msg
-      elsif ( indices.length < 1 )
+      elsif  indices.length < 1
         error_msg = "\"" + name + "\" not found"
         raise ArgumentError, error_msg
       end
index 31f0272..ffabc45 100644 (file)
@@ -23,9 +23,9 @@ module Evoruby
   class MultiSequenceExtractor
 
     PRG_NAME                           = "mse"
-    PRG_VERSION                        = "1.02"
+    PRG_VERSION                        = "1.03"
     PRG_DESC                           = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
-    PRG_DATE                           = "130322"
+    PRG_DATE                           = "131127"
     COPYRIGHT                          = "2008-2013 Christian M Zmasek"
     CONTACT                            = "phylosoft@gmail.com"
     WWW                                = "https://sites.google.com/site/cmzmasek/home/software/forester"
@@ -294,21 +294,35 @@ module Evoruby
             per_species_counter = per_species_counter + 1
             seq = nil
 
-            if current_msa.find_by_name_start( seq_name, true ).size > 0
-              begin
-                seq = current_msa.get_by_name_start( seq_name, true ).copy
-              rescue ArgumentError => e
-                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
-              end
-            else
+            indices = current_msa.find_by_name_start( seq_name, true )
+            if indices.size == 1
+              seq =  current_msa.get_sequence( indices[ 0 ] )
+            elsif indices.size == 0
               # Not found, try finding by partial match.
               begin
                 seq = current_msa.get_by_name( seq_name, true, true )
               rescue ArgumentError => e
                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
               end
+            else
+              Util.fatal_error( PRG_NAME, "error: seq name \"" + seq_name + "\" not unique"  )
             end
 
+            # if current_msa.find_by_name_start( seq_name, true ).size > 0
+            #   begin
+            #     seq = current_msa.get_by_name_start( seq_name, true ).copy
+            #   rescue ArgumentError => e
+            #     Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            #   end
+            # else
+            #   # Not found, try finding by partial match.
+            #   begin
+            #     seq = current_msa.get_by_name( seq_name, true, true )
+            #   rescue ArgumentError => e
+            #     Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+            #   end
+            # end
+
             normalized_id = per_species_counter.to_s( 16 ).upcase +
              "_" + current_species