in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / io / parser / hmmscan_multi_domain_extractor.rb
index c35e15c..f773e26 100644 (file)
@@ -4,7 +4,11 @@
 # Copyright::    Copyright (C) 2017 Christian M. Zmasek
 # License::      GNU Lesser General Public License (LGPL)
 #
-# Last modified: 2017/02/20
+# Last modified: 2017/03/08
+
+####
+# Import: if multiple copies of same domain, thresholds need to be same!
+####
 
 require 'lib/evo/util/constants'
 require 'lib/evo/msa/msa_factory'
@@ -16,15 +20,28 @@ require 'lib/evo/io/parser/hmmscan_parser'
 module Evoruby
   class HmmscanMultiDomainExtractor
 
-    OUTPUT_ID = 'MDSX'
+    TESTING = false
+    OUTPUT_ID = 'mdsx'
+    DOMAIN_DELIMITER = ' -- '
+
+    IE_CUTOFF_FOR_DA_OVERVIEW      = 1E-6
+    REL_LEN_CUTOFF_FOR_DA_OVERVIEW = 0.5
 
-    PASSING_FL_SEQS_SUFFIX   = "_#{OUTPUT_ID}_passing_full_length_seqs.fasta"
-    FAILING_FL_SEQS_SUFFIX   = "_#{OUTPUT_ID}_failing_full_length_seqs.fasta"
-    TARGET_DA_SUFFIX         = "_#{OUTPUT_ID}_target_da.fasta"
-    CONCAT_TARGET_DOM_SUFFIX = "_#{OUTPUT_ID}_concat_target_dom.fasta"
+    PASSING_FL_SEQS_SUFFIX    = "_#{OUTPUT_ID}_passing_full_length_seqs.fasta"
+    FAILING_FL_SEQS_SUFFIX    = "_#{OUTPUT_ID}_failing_full_length_seqs.fasta"
+    TARGET_DA_SUFFIX          = "_#{OUTPUT_ID}_target_da.fasta"
+    CONCAT_TARGET_DOM_SUFFIX  = "_#{OUTPUT_ID}_concat_target_doms.fasta"
+    TARGET_DOM_OUTPUT_MIDPART = "_#{OUTPUT_ID}_target_dom_"
+    LOG_FILE_SUFFIX           = "_#{OUTPUT_ID}.log"
     def initialize
-      @passed = Hash.new
-      @non_passsing_domain_architectures = Hash.new
+      @passing_domains_data = nil
+      @failing_domains_data = nil
+      @failing_domain_architectures = nil
+      @passsing_domain_architectures = nil
+      @failing_proteins_bc_not_all_target_doms_present = nil
+      @failing_proteins_bc_missing_cutoffs = nil
+      @encountered_domain_architectures = nil
+      @log = nil
     end
 
     # raises ArgumentError, IOError, StandardError
@@ -38,6 +55,7 @@ module Evoruby
       failing_fl_seqs_outfile   = outfile_base + FAILING_FL_SEQS_SUFFIX
       target_da_outfile         = outfile_base + TARGET_DA_SUFFIX
       concat_target_dom_outfile = outfile_base + CONCAT_TARGET_DOM_SUFFIX
+      logfile                   = outfile_base + LOG_FILE_SUFFIX
 
       Util.check_file_for_readability( hmmscan_output )
       Util.check_file_for_readability( fasta_sequence_file )
@@ -45,6 +63,9 @@ module Evoruby
       Util.check_file_for_writability( failing_fl_seqs_outfile )
       Util.check_file_for_writability( target_da_outfile )
       Util.check_file_for_writability( concat_target_dom_outfile )
+      Util.check_file_for_writability( logfile )
+
+      @log = log_str
 
       in_msa = nil
       factory = MsaFactory.new()
@@ -55,73 +76,34 @@ module Evoruby
         raise IOError, error_msg
       end
 
-      out_msa = Msa.new
-
       failed_seqs_msa = Msa.new
       passed_seqs_msa = Msa.new
-      # passed_multi_seqs_msa = Msa.new
-
-      ld = Constants::LINE_DELIMITER
-
-      domain_pass_counter                = 0
-      domain_fail_counter                = 0
-      passing_domains_per_protein        = 0
-      proteins_with_failing_domains      = 0
-      domain_not_present_counter         = 0
-      protein_counter                    = 1
-      max_domain_copy_number_per_protein = -1
-      max_domain_copy_number_sequence    = ""
-      passing_target_length_sum          = 0
-      overall_target_length_sum          = 0
-      overall_target_length_min          = 10000000
-      overall_target_length_max          = -1
-      passing_target_length_min          = 10000000
-      passing_target_length_max          = -1
-
-      overall_target_ie_min          = 10000000
-      overall_target_ie_max          = -1
-      passing_target_ie_min          = 10000000
-      passing_target_ie_max          = -1
 
       hmmscan_parser = HmmscanParser.new( hmmscan_output )
       results = hmmscan_parser.parse
 
-      ####
-      # Import: if multiple copies of same domain, threshold need to be same!
-      target_domain_ary = Array.new
-
-      target_domain_ary.push(TargetDomain.new('DNA_pol_B_exo1', 1e-6, -1, 0.6, 0 ))
-      target_domain_ary.push(TargetDomain.new('DNA_pol_B', 1e-6, -1, 0.6, 1 ))
-
-      # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.6, 0 ))
-      # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.6, 1 ))
-      # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 ))
-      # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 ))
-
-      #target_domain_ary.push(TargetDomain.new('BH4', 0.1, -1, 0.5, 0 ))
-      #target_domain_ary.push(TargetDomain.new('Bcl-2', 0.1, -1, 0.5, 1 ))
-      # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 ))
-
-      #  target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 ))
-      #  target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 1 ))
-
-      #target_domain_ary.push(TargetDomain.new('Chordopox_A33R', 1000, -1, 0.1 ))
+      target_domain_ary = parse_da target_da
 
       target_domains = Hash.new
 
-      target_domain_architecure = ""
+      target_domain_architecure = ''
 
       target_domain_ary.each do |target_domain|
         target_domains[target_domain.name] = target_domain
         if target_domain_architecure.length > 0
-          target_domain_architecure += ">"
+          target_domain_architecure << DOMAIN_DELIMITER
         end
-        target_domain_architecure += target_domain.name
+        target_domain_architecure << target_domain.name
       end
 
       target_domain_architecure.freeze
 
-      puts 'Target domain architecture: ' + target_domain_architecure
+      log 'Hmmscan outputfile             : ' + hmmscan_output
+      log 'Full length fasta sequence file: ' + fasta_sequence_file
+      log 'Target domain architecture     : ' + target_domain_architecure
+      target_domain_ary.each do |x|
+        log x.to_str
+      end
 
       target_domain_names = Set.new
 
@@ -132,18 +114,24 @@ module Evoruby
       passing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq.
       failing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq.
       total_sequences = 0
-      @passed = Hash.new
+
+      @failing_domain_architectures = Hash.new
+      @passsing_domain_architectures = Hash.new
+      @passing_domains_data = Hash.new
+      @failing_domains_data = Hash.new
+      @encountered_domain_architectures= Hash.new
+      @failing_proteins_bc_not_all_target_doms_present = 0
+      @failing_proteins_bc_missing_cutoffs = 0
       out_domain_msas = Hash.new
       out_domain_architecture_msa = Msa.new
       out_concatenated_domains_msa = Msa.new
       results.each do |hmmscan_result|
         if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name )
-          if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_contactenated_domains_msa, target_domain_architecure)
+          if compare(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
             passing_sequences.push(domains_in_query_seq)
           else
             failing_sequences.push(domains_in_query_seq)
           end
-
           domains_in_query_seq = Array.new
           total_sequences += 1
         end
@@ -152,91 +140,172 @@ module Evoruby
       end # each result
 
       if prev_query_seq_name != nil
-        total_sequences += 1
-        if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_contactenated_domains_msa, target_domain_architecure)
+        if compare(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
           passing_sequences.push(domains_in_query_seq)
         else
           failing_sequences.push(domains_in_query_seq)
         end
+        total_sequences += 1
       end
 
+      if in_msa.get_number_of_seqs < total_sequences
+        error_msg = "hmmscan output contains more protein sequences than fasta sequence file"
+        raise IOError, error_msg
+      end
+
+      log
+      log 'Domain architecture overview using default iE-cutoff ' +
+      IE_CUTOFF_FOR_DA_OVERVIEW.to_s + ' and relative length cutoff ' + REL_LEN_CUTOFF_FOR_DA_OVERVIEW.to_s + ':'
+
+      @encountered_domain_architectures = @encountered_domain_architectures.sort_by {|k, v| v}.reverse.to_h
+      counter = 1;
+      @encountered_domain_architectures.each do |k, v|
+        log counter.to_s.rjust(2) + ') ' +  v.to_s.rjust(5) + ': ' + k
+        counter += 1
+        if counter > 40
+          break
+        end
+      end
+
+      log
+      log 'Passing domain arrangements of target domain(s):'
+      @passsing_domain_architectures = @passsing_domain_architectures.sort{|a, b|a<=>b}.to_h
+      passing_da_sum = 0
+      @passsing_domain_architectures.each do |da, count|
+        passing_da_sum += count
+        log count.to_s.rjust(4) + ': ' + da
+      end
+      log
+      log 'Failing domain arrangements of target domain(s):'
+      @failing_domain_architectures = @failing_domain_architectures.sort{|a, b|a<=>b}.to_h
+      failing_da_sum = 0
+      @failing_domain_architectures .each do |da, count|
+        failing_da_sum += count
+        log count.to_s.rjust(4) + ': ' + da
+      end
+      log
+      log 'Passing target domain(s):'
+      @passing_domains_data = @passing_domains_data.sort{|a, b|a<=>b}.to_h
+      @passing_domains_data.each do |n, d|
+        log d.to_str
+      end
+      log
+      log 'Failing target domain(s) (in proteins sequences with target domain architecture):'
+      @failing_domains_data = @failing_domains_data.sort{|a, b|a<=>b}.to_h
+      @failing_domains_data.each do |n, d|
+        log d.to_str
+      end
+
+      unless total_sequences == (passing_sequences.size + failing_sequences.size)
+        error_msg = "this should not have happened: total seqs not equal to passing plus failing seqs"
+        raise StandardError, error_msg
+      end
+
+      unless failing_sequences.size == (@failing_proteins_bc_not_all_target_doms_present + @failing_proteins_bc_missing_cutoffs)
+        error_msg = "this should not have happened: failing seqs sums not consistent"
+        raise StandardError, error_msg
+      end
+
+      unless @failing_proteins_bc_missing_cutoffs >= failing_da_sum
+        error_msg = "this should not have happened: failing seqs larger than failing da sum"
+        raise StandardError, error_msg
+      end
+
+      unless passing_sequences.size == passing_da_sum
+        error_msg = "this should not have happened: passing seqs not equal to passing da sum"
+        raise StandardError, error_msg
+      end
+
+      log
+      log "Protein sequences in sequence (fasta) file: " + in_msa.get_number_of_seqs.to_s.rjust(5)
+      log "Protein sequences in hmmscan output file  : " + total_sequences.to_s.rjust(5)
+      log "  Passing protein sequences               : " + passing_sequences.size.to_s.rjust(5)
+      log "  Failing protein sequences               : " + failing_sequences.size.to_s.rjust(5)
+      log "    Not all target domain present         : " + @failing_proteins_bc_not_all_target_doms_present.to_s.rjust(5)
+      log "    Target domain(s) failing cutoffs      : " + @failing_proteins_bc_missing_cutoffs.to_s.rjust(5)
+      log
+
       out_domain_msas.keys.sort.each do |domain_name|
-        puts "writing #{domain_name}"
-        write_msa( out_domain_msas[domain_name], domain_name + ".fasta" )
+        file_name = outfile_base + TARGET_DOM_OUTPUT_MIDPART + domain_name + '.fasta'
+        write_msa out_domain_msas[domain_name], file_name
+        log "Wrote passing target domain sequence for " +  domain_name.ljust(16) + ': ' + file_name
       end
 
-      puts "writing target_domain_architecure"
-      write_msa( out_domain_architecture_msa, target_da_outfile )
+      write_msa out_domain_architecture_msa, target_da_outfile
+      log 'Wrote target domain architecture                         : ' + target_da_outfile
 
-      puts "writing contactenated_domains_msa"
-      write_msa( out_concatenated_domains_msa, concat_target_dom_outfile )
+      write_msa out_concatenated_domains_msa, concat_target_dom_outfile
+      log 'Wrote concatenated target domain(s)                      : ' + concat_target_dom_outfile
 
       passing_sequences.each do | domains |
         query_name = domains[0].query
-        if passed_seqs_msa.find_by_name_start( query_name, true ).length < 1
+        if (!TESTING) || (passed_seqs_msa.find_by_name_start( query_name, true ).length < 1)
           add_sequence( query_name, in_msa, passed_seqs_msa )
         else
-          error_msg = "this should not have happened"
+          error_msg = 'this should not have happened'
           raise StandardError, error_msg
         end
       end
 
       failing_sequences.each do | domains |
         query_name = domains[0].query
-        if failed_seqs_msa.find_by_name_start( query_name, true ).length < 1
+        if (!TESTING) || (failed_seqs_msa.find_by_name_start( query_name, true ).length < 1)
           add_sequence( query_name, in_msa, failed_seqs_msa )
         else
-          error_msg = "this should not have happened"
+          error_msg = 'this should not have happened'
           raise StandardError, error_msg
         end
       end
 
-      puts
-
-      puts 'Non passing domain architectures containing target domain(s):'
-      @non_passsing_domain_architectures = @non_passsing_domain_architectures.sort{|a, b|a<=>b}.to_h
-      @non_passsing_domain_architectures.each do |da, count|
-        puts da + ': ' + count.to_s
-      end
-
-      puts
+      write_msa passed_seqs_msa, passing_fl_seqs_outfile
+      log 'Wrote passing full length protein sequences              : ' + passing_fl_seqs_outfile
+      write_msa failed_seqs_msa, failing_fl_seqs_outfile
+      log 'Wrote failing full length protein sequences              : ' + failing_fl_seqs_outfile
 
-      puts 'Passing target domain counts:'
-      @passed = @passed.sort{|a, b|a<=>b}.to_h
-      @passed.each do |dom, count|
-        puts dom + ': ' + count.to_s
+      begin
+        f = File.open( logfile, 'w' )
+        f.print( @log )
+        f.close
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
       end
-
-      puts
-
-      puts "total sequences  : " + total_sequences.to_s
-      puts "passing sequences: " + passing_sequences.size.to_s
-
-      write_msa( passed_seqs_msa, passing_fl_seqs_outfile )
-      write_msa( failed_seqs_msa, failing_fl_seqs_outfile )
-
-      log_str << ld
-      log_str << "passing target domains                       : " + domain_pass_counter.to_s + ld
-      log_str << "failing target domains                       : " + domain_fail_counter.to_s + ld
-      log_str << "proteins in sequence (fasta) file            : " + in_msa.get_number_of_seqs.to_s + ld
-      log_str << "proteins in hmmscan outputfile               : " + protein_counter.to_s + ld
-      log_str << "proteins with passing target domain(s)       : " + passed_seqs_msa.get_number_of_seqs.to_s + ld
-      log_str << "proteins with no passing target domain       : " + proteins_with_failing_domains.to_s + ld
-      log_str << "proteins with no target domain               : " + domain_not_present_counter.to_s + ld
-
-      log_str << ld
-
-      return domain_pass_counter
+      log 'Wrote log file                                           : ' + logfile
 
     end # parse
 
     private
 
+    # domains: Array of HmmscanResult
+    def collect(domains, ie_cutoff, rel_len_cutoff)
+      passed = Array.new
+      domains.each do |domain|
+        length = 1 + domain.env_to - domain.env_from
+        if (domain.i_e_value <= ie_cutoff) && (length >= (rel_len_cutoff * domain.tlen))
+          passed.push domain
+        end
+      end
+      passed.sort! { |a,b| a.ali_from <=> b.ali_from }
+      s = ''
+      passed.each do |domain|
+        if s.length > 0
+          s << DOMAIN_DELIMITER
+        end
+        s << domain.model
+      end
+      if s.length > 0
+        if @encountered_domain_architectures.has_key? s
+          @encountered_domain_architectures[s] = 1 + @encountered_domain_architectures[s]
+        else
+          @encountered_domain_architectures[s] = 1
+        end
+      end
+    end
+
     # domains_in_query_seq: Array of HmmscanResult
     # target_domain_names: Set of String
     # target_domains: Hash String->TargetDomain
     # target_domain_architecture: String
-    def checkit2(domains_in_query_seq,
+    def compare(domains_in_query_seq,
       target_domain_names,
       target_domains,
       in_msa,
@@ -245,34 +314,39 @@ module Evoruby
       out_contactenated_domains_msa,
       target_domain_architecture)
 
+      collect(domains_in_query_seq, IE_CUTOFF_FOR_DA_OVERVIEW, REL_LEN_CUTOFF_FOR_DA_OVERVIEW)
+
       domain_names_in_query_seq = Set.new
       domains_in_query_seq.each do |domain|
         domain_names_in_query_seq.add(domain.model)
       end
-      if (target_domain_names.subset?(domain_names_in_query_seq))
 
-        passed_domains = Array.new
-        passed_domains_counts = Hash.new
+      passed_domains = Array.new
+      passed_domains_counts = Hash.new
+
+      if (domain_names_in_query_seq.length > 0) && (target_domain_names.subset?(domain_names_in_query_seq))
 
         domains_in_query_seq.each do |domain|
           if target_domains.has_key?(domain.model)
             target_domain = target_domains[domain.model]
 
-            if (target_domain.i_e_value != nil) && (target_domain.i_e_value >= 0)
+            if target_domain.i_e_value >= 0
               if domain.i_e_value > target_domain.i_e_value
+                addToFailingDomainData(domain)
                 next
               end
             end
-            if (target_domain.abs_len != nil) && (target_domain.abs_len > 0)
+            if target_domain.abs_len > 0
               length = 1 + domain.env_to - domain.env_from
               if length < target_domain.abs_len
+                addToFailingDomainData(domain)
                 next
               end
             end
-            if (target_domain.rel_len != nil) && (target_domain.rel_len > 0)
+            if target_domain.rel_len > 0
               length = 1 + domain.env_to - domain.env_from
-              #  puts (target_domain.rel_len * domain.tlen)
               if length < (target_domain.rel_len * domain.tlen)
+                addToFailingDomainData(domain)
                 next
               end
             end
@@ -285,28 +359,30 @@ module Evoruby
               passed_domains_counts[domain.model] = 1
             end
 
-            if (@passed.key?(domain.model))
-              @passed[domain.model] = @passed[domain.model] + 1
-            else
-              @passed[domain.model] = 1
-            end
+            addToPassingDomainData(domain)
+
           end # if target_domains.has_key?(domain.model)
         end # domains_in_query_seq.each do |domain|
       else
+        @failing_proteins_bc_not_all_target_doms_present += 1
+        return false
+      end # target_domain_names.subset?(domain_names_in_query_seq)
+
+      if passed_domains.length < 1
+        @failing_proteins_bc_missing_cutoffs += 1
         return false
       end
 
       passed_domains.sort! { |a,b| a.ali_from <=> b.ali_from }
       # Compare architectures
-      if !compareArchitectures(target_domain_architecture, passed_domains)
+      if !compareArchitectures(target_domain_architecture, passed_domains, false)
+        @failing_proteins_bc_missing_cutoffs += 1
         return false
       end
 
       domain_counter = Hash.new
 
       min_env_from = 10000000
-      max_env_from = 0
-      min_env_to = 10000000
       max_env_to = 0
 
       query_seq = nil
@@ -347,20 +423,11 @@ module Evoruby
         if domain.env_from < min_env_from
           min_env_from = domain.env_from
         end
-        if domain.env_from > max_env_from
-          max_env_from = domain.env_from
-        end
-        if domain.env_to < min_env_to
-          min_env_to = domain.env_to
-        end
         if domain.env_to > max_env_to
           max_env_to = domain.env_to
         end
       end
 
-      #puts "env from: #{min_env_from} - #{max_env_from}"
-      #puts "env to  : #{min_env_to} - #{max_env_to}"
-
       extract_domain( query_seq,
       1,
       1,
@@ -369,19 +436,31 @@ module Evoruby
       in_msa,
       out_domain_architecture_msa )
 
-      puts passed_domains_counts
-
       out_contactenated_domains_msa.add( query_seq, concatenated_domains)
 
       true
     end
 
+    def addToPassingDomainData(domain)
+      unless ( @passing_domains_data.key?(domain.model))
+        @passing_domains_data[domain.model] = DomainData.new(domain.model)
+      end
+      @passing_domains_data[domain.model].add( domain.model, (1 + domain.env_to - domain.env_from), domain.i_e_value)
+    end
+
+    def addToFailingDomainData(domain)
+      unless ( @failing_domains_data.key?(domain.model))
+        @failing_domains_data[domain.model] = DomainData.new(domain.model)
+      end
+      @failing_domains_data[domain.model].add( domain.model, (1 + domain.env_to - domain.env_from), domain.i_e_value)
+    end
+
     # passed_domains needs to be sorted!
     def compareArchitectures(target_domain_architecture, passed_domains, strict = false)
-      domain_architecture = ""
+      domain_architecture = ''
       passed_domains.each do |domain|
         if domain_architecture.length > 0
-          domain_architecture += ">"
+          domain_architecture += DOMAIN_DELIMITER
         end
         domain_architecture += domain.model
       end
@@ -391,11 +470,17 @@ module Evoruby
       else
         pass = (domain_architecture.index(target_domain_architecture) != nil)
       end
-      if !pass
-        if (@non_passsing_domain_architectures.key?(domain_architecture))
-          @non_passsing_domain_architectures[domain_architecture] = @non_passsing_domain_architectures[domain_architecture] + 1
+      if pass
+        if (@passsing_domain_architectures.key?(domain_architecture))
+          @passsing_domain_architectures[domain_architecture] = @passsing_domain_architectures[domain_architecture] + 1
+        else
+          @passsing_domain_architectures[domain_architecture] = 1
+        end
+      else
+        if ( @failing_domain_architectures.key?(domain_architecture))
+          @failing_domain_architectures[domain_architecture] =  @failing_domain_architectures[domain_architecture] + 1
         else
-          @non_passsing_domain_architectures[domain_architecture] = 1
+          @failing_domain_architectures[domain_architecture] = 1
         end
       end
       return pass
@@ -406,7 +491,7 @@ module Evoruby
       w = FastaWriter.new()
       w.set_line_width( 60 )
       w.clean( true )
-      File.delete(filename) if File.exist?(filename) #TODO remove me
+      File.delete(filename) if File.exist?(filename)
       begin
         io.write_to_file( msa, filename, w )
       rescue Exception
@@ -469,25 +554,152 @@ module Evoruby
       seq.get_sequence_as_string
     end
 
+    def parse_da( target_da_str )
+      target_domain_hash = Hash.new
+      target_domain_ary = Array.new
+      target_das = target_da_str.split '--'
+      target_das.each do |x|
+        inds = x.split '='
+        unless inds.size == 1 || inds.size == 4
+          raise IOError, 'domain architecture is ill formatted: ' + x
+        end
+
+        target_domain_name = inds[0]
+        ie_cutoff = inds.size == 4 ? Float(inds[1]) : IE_CUTOFF_FOR_DA_OVERVIEW
+        abs_len_cutoff = inds.size == 4 ? Integer(inds[2]) : 0
+        rel_len_cutoff = inds.size == 4 ? Float(inds[3]) : REL_LEN_CUTOFF_FOR_DA_OVERVIEW
+        if target_domain_hash.has_key? target_domain_name
+          target_domain_ary.push target_domain_hash[target_domain_name]
+        else
+          td = TargetDomain.new(target_domain_name, ie_cutoff, abs_len_cutoff, rel_len_cutoff)
+          target_domain_hash[target_domain_name] = td
+          target_domain_ary.push td
+        end
+      end
+      target_domain_ary
+    end
+
+    def log(str = '')
+      puts str
+      @log << str << Constants::LINE_DELIMITER
+    end
+
   end # class HmmscanMultiDomainExtractor
 
+  class DomainData
+    def initialize( name )
+      if (name == nil) || name.size < 1
+        error_msg = "domain name nil or empty"
+        raise IOError, error_msg
+      end
+      @name = name
+      @count = 0
+      @i_e_value_min = 10000000.0
+      @i_e_value_max = -1.0
+      @i_e_value_sum = 0.0
+      @len_min = 10000000
+      @len_max = -1
+      @len_sum = 0.0
+    end
+
+    def add( name, length, i_e_value)
+      if name != @name
+        error_msg = "domain names do not match"
+        raise IOError, error_msg
+      end
+
+      if length < 0
+        error_msg = "length cannot me negative"
+        raise IOError, error_msg
+      end
+      if i_e_value < 0
+        error_msg = "iE-value cannot me negative"
+        raise IOError, error_msg
+      end
+      @count += 1
+      @i_e_value_sum += i_e_value
+      @len_sum += length
+      if i_e_value > @i_e_value_max
+        @i_e_value_max = i_e_value
+      end
+      if i_e_value < @i_e_value_min
+        @i_e_value_min = i_e_value
+      end
+      if length > @len_max
+        @len_max = length
+      end
+      if length < @len_min
+        @len_min = length
+      end
+    end
+
+    def avg_length
+      if @count == 0
+        return 0
+      end
+      @len_sum / @count
+    end
+
+    def avg_i_e_value
+      if @count == 0
+        return 0
+      end
+      @i_e_value_sum / @count
+    end
+
+    def to_str
+      s = ''
+      s << @name.rjust(16) + ': '
+      s << @count.to_s.rjust(4) + '  '
+      s << avg_length.round(1).to_s.rjust(6) + ' '
+      s << @len_min.to_s.rjust(4) + ' -'
+      s << @len_max.to_s.rjust(4) + '  '
+      s << ("%.2E" % avg_i_e_value).rjust(9) + ' '
+      s << ("%.2E" % @i_e_value_min).rjust(9) + ' -'
+      s << ("%.2E" % @i_e_value_max).rjust(9) + ' '
+      s
+    end
+
+    attr_reader :name, :count, :i_e_value_min, :i_e_value_max, :length_min, :length_max
+
+  end
+
   class TargetDomain
-    def initialize( name, i_e_value, abs_len, rel_len, position )
+    def initialize(name, i_e_value, abs_len, rel_len)
       if (name == nil) || name.size < 1
         error_msg = "target domain name nil or empty"
-        raise StandardError, error_msg
+        raise IOError, error_msg
       end
       if rel_len > 1
-        error_msg = "target domain relative length is greater than 1"
-        raise StandardError, error_msg
+        error_msg = name + ": target domain relative length is greater than 1"
+        raise IOError, error_msg
+      end
+      if (abs_len <= 0) && (rel_len <= 0)
+        error_msg = name + ": need to have either absolute length or relative length cutoff"
+        raise IOError, error_msg
+      end
+      if (abs_len > 0) && (rel_len > 0)
+        error_msg = name + ": cannot have both absolute length and relative length cutoff"
+        raise IOError, error_msg
       end
       @name = name
       @i_e_value = i_e_value
       @abs_len = abs_len
       @rel_len = rel_len
-      @position = position
     end
-    attr_reader :name, :i_e_value, :abs_len, :rel_len, :position
+
+    def to_str
+      s = @name.rjust(16) + ':'
+      s << ' iE-cutoff: ' + ("%.2E" % @i_e_value).rjust(9)
+      if @abs_len > 0
+        s << ', abs len-cutoff: ' + @abs_len.to_s.rjust(4)
+      end
+      if @rel_len > 0
+        s << ', rel len-cutoff: ' + @rel_len.to_s.rjust(4)
+      end
+      s
+    end
+    attr_reader :name, :i_e_value, :abs_len, :rel_len
   end
 
 end # module Evoruby