inprogress
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / multi_sequence_extractor.rb
index 5695cd4..0c525ad 100644 (file)
@@ -23,9 +23,9 @@ module Evoruby
   class MultiSequenceExtractor
 
     PRG_NAME                           = "mse"
-    PRG_VERSION                        = "1.02"
+    PRG_VERSION                        = "1.03"
     PRG_DESC                           = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
-    PRG_DATE                           = "130322"
+    PRG_DATE                           = "131127"
     COPYRIGHT                          = "2008-2013 Christian M Zmasek"
     CONTACT                            = "phylosoft@gmail.com"
     WWW                                = "https://sites.google.com/site/cmzmasek/home/software/forester"
@@ -41,6 +41,10 @@ module Evoruby
     NORMALIZED_IDS_MAP_SUFFIX           = ".nim"
     PROTEINS_LIST_FILE_SEPARATOR        = "\t"
 
+    def initialize() # sets up the per-instance cache state used by read_fasta_file
+      @file_to_msa = Hash.new # cache: read_fasta_file's input argument => the msa it produced for it
+      @seqs = 0 # running total of sequences held in the cached msas
+    end
 
     def run()
 
@@ -245,6 +249,7 @@ module Evoruby
       puts basename
 
       File.open( input_file ) do | file |
+        species_counter = 1
         while line = file.gets
           line.strip!
           if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
@@ -254,9 +259,9 @@ module Evoruby
               Util.fatal_error( PRG_NAME, "unexpected format: " + line )
             end
             species = values[ 0 ]
-            if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA"  || species == "EIMTE"
-              next
-            end
+            #if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA"  || species == "EIMTE"
+            #  next
+            #end
             seq_name = values[ 1 ]
             domain_ranges = nil
             if ( values.length > 3 )
@@ -286,27 +291,26 @@ module Evoruby
                 print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
                 per_species_counter = 0
               end
-              puts " " + current_species + " [" + my_readlink + "]"
-              log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER
+              puts " " + species_counter.to_s +  ":" + current_species + " [" + my_readlink + "]"
+              log << species_counter.to_s <<  ": " << current_species << " [" + my_readlink + "]" << Constants::LINE_DELIMITER
+              species_counter += 1
             end
-            puts "   " + seq_name
-            log << "   " + seq_name + Constants::LINE_DELIMITER
+            log << "   " << seq_name << Constants::LINE_DELIMITER
             per_species_counter = per_species_counter + 1
             seq = nil
 
-            if current_msa.find_by_name_start( seq_name, true ).size > 0
-              begin
-                seq = current_msa.get_by_name_start( seq_name, true ).copy
-              rescue ArgumentError => e
-                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
-              end
-            else
+            indices = current_msa.find_by_name_start( seq_name, true )
+            if indices.size == 1
+              seq = current_msa.get_sequence( indices[ 0 ] )
+            elsif indices.size == 0
               # Not found, try finding by partial match.
               begin
                 seq = current_msa.get_by_name( seq_name, true, true )
               rescue ArgumentError => e
                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
               end
+            else
+              Util.fatal_error( PRG_NAME, "error: seq name \"" + seq_name + "\" not unique"  )
             end
 
             normalized_id = per_species_counter.to_s( 16 ).upcase +
@@ -450,8 +454,6 @@ module Evoruby
           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
         end
       end
-
-
     end
 
 
@@ -518,11 +520,15 @@ module Evoruby
     end
 
     def print_counts( per_species_counter, log, ld )
-      puts "   [sum: " + per_species_counter.to_s + "]"
-      log << "   [sum: " + per_species_counter.to_s + "]" + ld
+      puts "   sum: " + per_species_counter.to_s # echo the per-species count to stdout
+      log << "   sum: " + per_species_counter.to_s + ld # append the same text to log, terminated by ld (the line delimiter passed by the caller)
     end
 
     def read_fasta_file( input )
+      if @file_to_msa.has_key?( input ) # cache hit: this input was read before
+        return @file_to_msa[ input ] # reuse the stored msa instead of reading again
+      end
+
       f = MsaFactory.new()
       msa = nil
       begin
@@ -530,6 +536,11 @@ module Evoruby
       rescue Exception => e
         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
       end
+      if @seqs <= 400000 # cap on cached sequences; tested before adding, so the total can end above 400000
+        @file_to_msa[ input ] = msa # remember this msa so later calls with the same input hit the cache
+        @seqs += msa.get_number_of_seqs # grow the running total by this msa's sequence count
+        puts "   total seqs in memory: " + @seqs.to_s # progress report, printed only when something was cached
+      end # once past the cap nothing new is stored, so uncached inputs take the full read path on every call
       msa
     end