in progress...
[jalview.git] / forester / ruby / evoruby / lib / evo / io / parser / hmmscan_domain_extractor.rb
index 89e0fe7..0a7f2b1 100644 (file)
@@ -4,7 +4,7 @@
 # Copyright::    Copyright (C) 2017 Christian M. Zmasek
 # License::      GNU Lesser General Public License (LGPL)
 #
-# Last modified: 2017/02/16
+# Last modified: 2017/02/20
 
 require 'lib/evo/util/constants'
 require 'lib/evo/msa/msa_factory'
@@ -79,6 +79,7 @@ module Evoruby
 
       domain_pass_counter                = 0
       domain_fail_counter                = 0
+      passing_domains_per_protein        = 0
       proteins_with_failing_domains      = 0
       domain_not_present_counter         = 0
       protein_counter                    = 1
@@ -87,9 +88,14 @@ module Evoruby
       passing_target_length_sum          = 0
       overall_target_length_sum          = 0
       overall_target_length_min          = 10000000
-      overall_target_length_max          = 0
+      overall_target_length_max          = -1
       passing_target_length_min          = 10000000
-      passing_target_length_max          = 0
+      passing_target_length_max          = -1
+
+      overall_target_ie_min          = 10000000
+      overall_target_ie_max          = -1
+      passing_target_ie_min          = 10000000
+      passing_target_ie_max          = -1
 
       hmmscan_datas = []
 
@@ -103,6 +109,7 @@ module Evoruby
 
         if ( prev_query != nil ) && ( r.query != prev_query )
           protein_counter += 1
+          passing_domains_per_protein = 0
           if !saw_target
             log << domain_not_present_counter.to_s + ": " + prev_query.to_s + " lacks target domain" + ld
             domain_not_present_counter += 1
@@ -126,7 +133,7 @@ module Evoruby
         i_e_value = r.i_e_value
         prev_query = r.query
 
-        length = env_to - env_from + 1
+        length = 1 + env_to - env_from
 
         overall_target_length_sum += length
         if length > overall_target_length_max
@@ -136,19 +143,33 @@ module Evoruby
           overall_target_length_min = length
         end
 
+        if i_e_value > overall_target_ie_max
+          overall_target_ie_max = i_e_value
+        end
+        if i_e_value < overall_target_ie_min
+          overall_target_ie_min = i_e_value
+        end
+
         if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
         ( ( length_threshold <= 0 ) || ( length >= length_threshold.to_f ) ) )
           hmmscan_datas << HmmsearchData.new( sequence, number, out_of, env_from, env_to, i_e_value )
           passing_target_length_sum += length
+          passing_domains_per_protein += 1
           if length > passing_target_length_max
             passing_target_length_max = length
           end
           if length < passing_target_length_min
             passing_target_length_min = length
           end
-          if ( number > max_domain_copy_number_per_protein )
+          if i_e_value > passing_target_ie_max
+            passing_target_ie_max = i_e_value
+          end
+          if i_e_value < passing_target_ie_min
+            passing_target_ie_min = i_e_value
+          end
+          if ( passing_domains_per_protein > max_domain_copy_number_per_protein )
             max_domain_copy_number_sequence    = sequence
-            max_domain_copy_number_per_protein = number
+            max_domain_copy_number_per_protein = passing_domains_per_protein
           end
         else # no pass
           log << domain_fail_counter.to_s + ": " + sequence.to_s + " fails threshold(s)"
@@ -230,24 +251,30 @@ module Evoruby
       puts( "Passing target domain lengths: average: " + avg_pass.to_s  )
       log << "Passing target domain lengths: average: " + avg_pass.to_s
       log << ld
-      puts( "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + "-"  + passing_target_length_max.to_s)
-      log << "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + "-"  + passing_target_length_max.to_s
+      puts( "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + " - "  + passing_target_length_max.to_s)
+      log << "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + " - "  + passing_target_length_max.to_s
       log << ld
-      puts( "Passing target domain lengths:     sum: " + domain_pass_counter.to_s  )
-      log << "Passing target domain lengths:     sum: " + domain_pass_counter.to_s
+      puts( "Passing target domain iE:      min-max: " + passing_target_ie_min.to_s + " - "  + passing_target_ie_max.to_s)
+      log << "Passing target domain iE:      min-max: " + passing_target_ie_min.to_s + " - "  + passing_target_ie_max.to_s
+      log << ld
+      puts( "Passing target domains:            sum: " + domain_pass_counter.to_s  )
+      log << "Passing target domains:            sum: " + domain_pass_counter.to_s
       log << ld
       log << ld
       puts
       sum = domain_pass_counter + domain_fail_counter
       avg_all = overall_target_length_sum / sum
       puts( "All target domain lengths:     average: " + avg_all.to_s  )
-      log << "All target domain lengths:     average: " +avg_all.to_s
+      log << "All target domain lengths:     average: " + avg_all.to_s
+      log << ld
+      puts( "All target domain lengths:     min-max: " + overall_target_length_min.to_s + " - "  + overall_target_length_max.to_s)
+      log << "All target domain lengths:     min-max: " + overall_target_length_min.to_s + " - "  + overall_target_length_max.to_s
       log << ld
-      puts( "All target domain lengths:     min-max: " + overall_target_length_min.to_s + "-"  + overall_target_length_max.to_s)
-      log << "All target domain lengths:     min-max: " + overall_target_length_min.to_s + "-"  + overall_target_length_max.to_s
+      puts( "All target domain iE:          min-max: " + overall_target_ie_min.to_s + " - "  + overall_target_ie_max.to_s)
+      log << "All target domain iE:          min-max: " + overall_target_ie_min.to_s + " - "  + overall_target_ie_max.to_s
       log << ld
-      puts( "All target domain lengths:         sum: " + sum.to_s  )
-      log << "All target domain lengths:         sum: " + sum.to_s
+      puts( "All target domains:                sum: " + sum.to_s  )
+      log << "All target domains:                sum: " + sum.to_s
 
       puts
       puts( "Proteins with passing target domain(s): " + passed_seqs.get_number_of_seqs.to_s )
@@ -257,8 +284,8 @@ module Evoruby
       log << ld
       log << ld
       puts
-      puts( "Max target domain copy number per protein (includes non-passing): " + max_domain_copy_number_per_protein.to_s )
-      log << "Max target domain copy number per protein (includes non-passing): " + max_domain_copy_number_per_protein.to_s
+      puts( "Max target domain copy number per protein: " + max_domain_copy_number_per_protein.to_s )
+      log << "Max target domain copy number per protein: " + max_domain_copy_number_per_protein.to_s
       log << ld
 
       if ( max_domain_copy_number_per_protein > 1 )