in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / io / parser / hmmscan_multi_domain_extractor.rb
index c8cbc8d..f773e26 100644 (file)
@@ -4,7 +4,11 @@
 # Copyright::    Copyright (C) 2017 Christian M. Zmasek
 # License::      GNU Lesser General Public License (LGPL)
 #
-# Last modified: 2017/02/20
+# Last modified: 2017/03/08
+
+####
+# Import: if multiple copies of same domain, thresholds need to be same!
+####
 
 require 'lib/evo/util/constants'
 require 'lib/evo/msa/msa_factory'
@@ -20,6 +24,9 @@ module Evoruby
     OUTPUT_ID = 'mdsx'
     DOMAIN_DELIMITER = ' -- '
 
+    IE_CUTOFF_FOR_DA_OVERVIEW      = 1E-6
+    REL_LEN_CUTOFF_FOR_DA_OVERVIEW = 0.5
+
     PASSING_FL_SEQS_SUFFIX    = "_#{OUTPUT_ID}_passing_full_length_seqs.fasta"
     FAILING_FL_SEQS_SUFFIX    = "_#{OUTPUT_ID}_failing_full_length_seqs.fasta"
     TARGET_DA_SUFFIX          = "_#{OUTPUT_ID}_target_da.fasta"
@@ -33,6 +40,7 @@ module Evoruby
       @passsing_domain_architectures = nil
       @failing_proteins_bc_not_all_target_doms_present = nil
       @failing_proteins_bc_missing_cutoffs = nil
+      @encountered_domain_architectures = nil
       @log = nil
     end
 
@@ -74,39 +82,18 @@ module Evoruby
       hmmscan_parser = HmmscanParser.new( hmmscan_output )
       results = hmmscan_parser.parse
 
-      ####
-      # Import: if multiple copies of same domain, thresholds need to be same!
-
-      #target_domain_ary.push(TargetDomain.new('DNA_pol_B_exo1', 1e-6, -1, 0.6, 0 ))
-      #target_domain_ary.push(TargetDomain.new('DNA_pol_B', 1e-6, -1, 0.6, 1 ))
-
-      #target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.9, 0 ))
-      #target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.9, 1 ))
-      # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 ))
-      # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 ))
-
       target_domain_ary = parse_da target_da
 
-      # target_domain_ary.push(TargetDomain.new('BH4', 1, -1, 0.2, 0 ))
-      # target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 1 ))
-      # target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 2 ))
-      # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 ))
-
-      #  target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 ))
-      #  target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 1 ))
-
-      #target_domain_ary.push(TargetDomain.new('Chordopox_A33R', 1000, -1, 0.1 ))
-
       target_domains = Hash.new
 
-      target_domain_architecure = ""
+      target_domain_architecure = ''
 
       target_domain_ary.each do |target_domain|
         target_domains[target_domain.name] = target_domain
         if target_domain_architecure.length > 0
-          target_domain_architecure += DOMAIN_DELIMITER
+          target_domain_architecure << DOMAIN_DELIMITER
         end
-        target_domain_architecure += target_domain.name
+        target_domain_architecure << target_domain.name
       end
 
       target_domain_architecure.freeze
@@ -132,6 +119,7 @@ module Evoruby
       @passsing_domain_architectures = Hash.new
       @passing_domains_data = Hash.new
       @failing_domains_data = Hash.new
+      @encountered_domain_architectures= Hash.new
       @failing_proteins_bc_not_all_target_doms_present = 0
       @failing_proteins_bc_missing_cutoffs = 0
       out_domain_msas = Hash.new
@@ -166,7 +154,21 @@ module Evoruby
       end
 
       log
-      log 'Passing domain architectures containing target domain(s):'
+      log 'Domain architecture overview using default iE-cutoff ' +
+      IE_CUTOFF_FOR_DA_OVERVIEW.to_s + ' and relative length cutoff ' + REL_LEN_CUTOFF_FOR_DA_OVERVIEW.to_s + ':'
+
+      @encountered_domain_architectures = @encountered_domain_architectures.sort_by {|k, v| v}.reverse.to_h
+      counter = 1;
+      @encountered_domain_architectures.each do |k, v|
+        log counter.to_s.rjust(2) + ') ' +  v.to_s.rjust(5) + ': ' + k
+        counter += 1
+        if counter > 40
+          break
+        end
+      end
+
+      log
+      log 'Passing domain arrangements of target domain(s):'
       @passsing_domain_architectures = @passsing_domain_architectures.sort{|a, b|a<=>b}.to_h
       passing_da_sum = 0
       @passsing_domain_architectures.each do |da, count|
@@ -174,8 +176,8 @@ module Evoruby
         log count.to_s.rjust(4) + ': ' + da
       end
       log
-      log 'Failing domain architectures containing one or more target domain(s):'
-      @failing_domain_architectures = @failing_domain_architectures .sort{|a, b|a<=>b}.to_h
+      log 'Failing domain arrangements of target domain(s):'
+      @failing_domain_architectures = @failing_domain_architectures.sort{|a, b|a<=>b}.to_h
       failing_da_sum = 0
       @failing_domain_architectures .each do |da, count|
         failing_da_sum += count
@@ -188,7 +190,7 @@ module Evoruby
         log d.to_str
       end
       log
-      log'Failing target domain(s):'
+      log 'Failing target domain(s) (in proteins sequences with target domain architecture):'
       @failing_domains_data = @failing_domains_data.sort{|a, b|a<=>b}.to_h
       @failing_domains_data.each do |n, d|
         log d.to_str
@@ -204,8 +206,8 @@ module Evoruby
         raise StandardError, error_msg
       end
 
-      unless @failing_proteins_bc_missing_cutoffs == failing_da_sum
-        error_msg = "this should not have happened: failing seqs not equal to failing da sum"
+      unless @failing_proteins_bc_missing_cutoffs >= failing_da_sum
+        error_msg = "this should not have happened: failing seqs larger than failing da sum"
         raise StandardError, error_msg
       end
 
@@ -273,6 +275,32 @@ module Evoruby
 
     private
 
+    # domains: Array of HmmscanResult
+    def collect(domains, ie_cutoff, rel_len_cutoff)
+      passed = Array.new
+      domains.each do |domain|
+        length = 1 + domain.env_to - domain.env_from
+        if (domain.i_e_value <= ie_cutoff) && (length >= (rel_len_cutoff * domain.tlen))
+          passed.push domain
+        end
+      end
+      passed.sort! { |a,b| a.ali_from <=> b.ali_from }
+      s = ''
+      passed.each do |domain|
+        if s.length > 0
+          s << DOMAIN_DELIMITER
+        end
+        s << domain.model
+      end
+      if s.length > 0
+        if @encountered_domain_architectures.has_key? s
+          @encountered_domain_architectures[s] = 1 + @encountered_domain_architectures[s]
+        else
+          @encountered_domain_architectures[s] = 1
+        end
+      end
+    end
+
     # domains_in_query_seq: Array of HmmscanResult
     # target_domain_names: Set of String
     # target_domains: Hash String->TargetDomain
@@ -286,33 +314,36 @@ module Evoruby
       out_contactenated_domains_msa,
       target_domain_architecture)
 
+      collect(domains_in_query_seq, IE_CUTOFF_FOR_DA_OVERVIEW, REL_LEN_CUTOFF_FOR_DA_OVERVIEW)
+
       domain_names_in_query_seq = Set.new
       domains_in_query_seq.each do |domain|
         domain_names_in_query_seq.add(domain.model)
       end
-      if target_domain_names.subset?(domain_names_in_query_seq)
 
-        passed_domains = Array.new
-        passed_domains_counts = Hash.new
+      passed_domains = Array.new
+      passed_domains_counts = Hash.new
+
+      if (domain_names_in_query_seq.length > 0) && (target_domain_names.subset?(domain_names_in_query_seq))
 
         domains_in_query_seq.each do |domain|
           if target_domains.has_key?(domain.model)
             target_domain = target_domains[domain.model]
 
-            if (target_domain.i_e_value != nil) && (target_domain.i_e_value >= 0)
+            if target_domain.i_e_value >= 0
               if domain.i_e_value > target_domain.i_e_value
                 addToFailingDomainData(domain)
                 next
               end
             end
-            if (target_domain.abs_len != nil) && (target_domain.abs_len > 0)
+            if target_domain.abs_len > 0
               length = 1 + domain.env_to - domain.env_from
               if length < target_domain.abs_len
                 addToFailingDomainData(domain)
                 next
               end
             end
-            if (target_domain.rel_len != nil) && (target_domain.rel_len > 0)
+            if target_domain.rel_len > 0
               length = 1 + domain.env_to - domain.env_from
               if length < (target_domain.rel_len * domain.tlen)
                 addToFailingDomainData(domain)
@@ -337,9 +368,14 @@ module Evoruby
         return false
       end # target_domain_names.subset?(domain_names_in_query_seq)
 
+      if passed_domains.length < 1
+        @failing_proteins_bc_missing_cutoffs += 1
+        return false
+      end
+
       passed_domains.sort! { |a,b| a.ali_from <=> b.ali_from }
       # Compare architectures
-      if !compareArchitectures(target_domain_architecture, passed_domains)
+      if !compareArchitectures(target_domain_architecture, passed_domains, false)
         @failing_proteins_bc_missing_cutoffs += 1
         return false
       end
@@ -441,10 +477,10 @@ module Evoruby
           @passsing_domain_architectures[domain_architecture] = 1
         end
       else
-        if ( @failing_domain_architectures .key?(domain_architecture))
-          @failing_domain_architectures [domain_architecture] =  @failing_domain_architectures [domain_architecture] + 1
+        if ( @failing_domain_architectures.key?(domain_architecture))
+          @failing_domain_architectures[domain_architecture] =  @failing_domain_architectures[domain_architecture] + 1
         else
-          @failing_domain_architectures [domain_architecture] = 1
+          @failing_domain_architectures[domain_architecture] = 1
         end
       end
       return pass
@@ -524,13 +560,14 @@ module Evoruby
       target_das = target_da_str.split '--'
       target_das.each do |x|
         inds = x.split '='
-        unless inds.size == 4
+        unless inds.size == 1 || inds.size == 4
           raise IOError, 'domain architecture is ill formatted: ' + x
         end
+
         target_domain_name = inds[0]
-        ie_cutoff = Float(inds[1])
-        abs_len_cutoff = Integer(inds[2])
-        rel_len_cutoff = Float(inds[3])
+        ie_cutoff = inds.size == 4 ? Float(inds[1]) : IE_CUTOFF_FOR_DA_OVERVIEW
+        abs_len_cutoff = inds.size == 4 ? Integer(inds[2]) : 0
+        rel_len_cutoff = inds.size == 4 ? Float(inds[3]) : REL_LEN_CUTOFF_FOR_DA_OVERVIEW
         if target_domain_hash.has_key? target_domain_name
           target_domain_ary.push target_domain_hash[target_domain_name]
         else