in progress... multi-domain extractor started

author cmzmasek <chris.zma@outlook.com>

Wed, 22 Feb 2017 01:42:47 +0000 (17:42 -0800)

committer cmzmasek <chris.zma@outlook.com>

Wed, 22 Feb 2017 01:42:47 +0000 (17:42 -0800)
author cmzmasek <chris.zma@outlook.com>
Wed, 22 Feb 2017 01:42:47 +0000 (17:42 -0800)
committer cmzmasek <chris.zma@outlook.com>
Wed, 22 Feb 2017 01:42:47 +0000 (17:42 -0800)
diff --git a/forester/ruby/evoruby/exe/dsx.rb b/forester/ruby/evoruby/exe/dsx.rb

index 02091bf..0ae40d0 100755 (executable)
--- a/forester/ruby/evoruby/exe/dsx.rb
+++ b/forester/ruby/evoruby/exe/dsx.rb
@@ -2,12 +2,9 @@
  #
  # = exe/dsx
  #
-# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
-#
-# $Id: dsx.rb,v 1.3 2008/08/28 17:09:06 cmzmasek Exp $
-#
-# last modified: 06/11/2007
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
+
  
  require 'lib/evo/tool/domain_sequence_extractor'
  
diff --git a/forester/ruby/evoruby/exe/mdsx.rb b/forester/ruby/evoruby/exe/mdsx.rb

new file mode 100755 (executable)

index 0000000..0078a40
--- /dev/null
+++ b/forester/ruby/evoruby/exe/mdsx.rb
@@ -0,0 +1,17 @@
+#!/usr/local/bin/ruby -w
+#
+# = exe/dsx
+#
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
+
+
+require 'lib/evo/tool/multi_domain_seq_extractor'
+
+module Evoruby
+
+    mdsx = MultiDomainSeqExtractor.new()
+
+    mdsx.run()
+
+end  # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb

index fc70c0d..0a7f2b1 100644 (file)
--- a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb
+++ b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb
@@ -133,7 +133,7 @@ module Evoruby
          i_e_value = r.i_e_value
          prev_query = r.query
  
-        length = env_to - env_from + 1
+        length = 1 + env_to - env_from
  
          overall_target_length_sum += length
          if length > overall_target_length_max
diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb

new file mode 100644 (file)

index 0000000..0ed3811
--- /dev/null
+++ b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb
@@ -0,0 +1,507 @@
+#
+# = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanMultiDomainExtractor class
+#
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
+#
+# Last modified: 2017/02/20
+
+require 'lib/evo/util/constants'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/io/msa_io'
+require 'lib/evo/io/writer/fasta_writer'
+require 'lib/evo/io/parser/fasta_parser'
+require 'lib/evo/io/parser/hmmscan_parser'
+
+module Evoruby
+  class HmmscanMultiDomainExtractor
+    def initialize
+    end
+
+    # raises ArgumentError, IOError, StandardError
+    def parse( domain_id,
+      hmmscan_output,
+      fasta_sequence_file,
+      outfile,
+      passed_seqs_outfile,
+      failed_seqs_outfile,
+      e_value_threshold,
+      length_threshold,
+      add_position,
+      add_domain_number,
+      add_species,
+      log )
+
+      Util.check_file_for_readability( hmmscan_output )
+      Util.check_file_for_readability( fasta_sequence_file )
+      Util.check_file_for_writability( outfile + ".fasta" )
+      Util.check_file_for_writability( passed_seqs_outfile )
+      Util.check_file_for_writability( failed_seqs_outfile )
+
+      in_msa = nil
+      factory = MsaFactory.new()
+      in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
+
+      if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
+        error_msg = "could not find fasta sequences in " + fasta_sequence_file
+        raise IOError, error_msg
+      end
+
+      out_msa = Msa.new
+
+      failed_seqs = Msa.new
+      passed_seqs = Msa.new
+
+      ld = Constants::LINE_DELIMITER
+
+      domain_pass_counter                = 0
+      domain_fail_counter                = 0
+      passing_domains_per_protein        = 0
+      proteins_with_failing_domains      = 0
+      domain_not_present_counter         = 0
+      protein_counter                    = 1
+      max_domain_copy_number_per_protein = -1
+      max_domain_copy_number_sequence    = ""
+      passing_target_length_sum          = 0
+      overall_target_length_sum          = 0
+      overall_target_length_min          = 10000000
+      overall_target_length_max          = -1
+      passing_target_length_min          = 10000000
+      passing_target_length_max          = -1
+
+      overall_target_ie_min          = 10000000
+      overall_target_ie_max          = -1
+      passing_target_ie_min          = 10000000
+      passing_target_ie_max          = -1
+
+      hmmscan_datas = []
+
+      hmmscan_parser = HmmscanParser.new( hmmscan_output )
+      results = hmmscan_parser.parse
+
+      ####
+
+      pc0 = PassCondition.new('Bcl-2', 0.01, -1, 0.5 )
+
+      pcs = Hash.new
+      pcs["Bcl-2"] = pc0
+
+      prev_query = nil
+      domains_in_query_ary = Array.new
+      queries_ary = Array.new
+      results.each do |hmmscan_result|
+        if ( prev_query != nil ) && ( hmmscan_result.query != prev_query )
+          ###--
+          pass = true
+          domains_in_query_ary.each do |domain_in_query|
+            if pcs.has_key?(domain_in_query.model)
+              pc = pcs[domain_in_query.model]
+              #  @abs_len = abs_len
+              #  @percent_len = rel_len
+              if (pc.i_e_value != nil) && (pc.i_e_value >= 0)
+                if domain_in_query.i_e_value > pc.i_e_value
+                  pass = false
+                  #break
+                end
+              end
+              if (pc.abs_len != nil) && (pc.abs_len > 0)
+                length = 1 + domain_in_query.env_to - domain_in_query.env_from
+                if length < pc.abs_len
+                  pass = false
+                  #break
+                end
+              end
+              if (pc.rel_len != nil) && (pc.rel_len > 0)
+                length = 1 + domain_in_query.env_to - domain_in_query.env_from
+                if length < (pc.rel_len * domain_in_query.tlen)
+                  pass = false
+                  #break
+                end
+              end
+            end
+          end
+          if pass == true
+            queries_ary.push(domains_in_query_ary)
+          end
+          domains_in_query_ary = Array.new
+          ###--
+        end
+        prev_query = hmmscan_result.query
+        domains_in_query_ary.push(hmmscan_result)
+      end
+      if prev_query != nil
+        queries_ary.push(domains_in_query_ary)
+      end
+      puts  queries_ary[0][0].model
+      puts  queries_ary[0][0].i_e_value
+      puts  queries_ary[0][1].model
+      puts  queries_ary[0][2].model
+      puts  queries_ary[1][0].model
+      puts  queries_ary[1][0].i_e_value
+      puts  queries_ary[1][1].model
+      puts  queries_ary[2][2].model
+      queries_ary.each do | query_ary |
+        query_ary.each do | domain |
+          # puts domain.model
+        end
+        #puts
+      end
+
+      ####
+
+      prev_query = nil
+      saw_target = false
+
+      results.each do | r |
+
+        if ( prev_query != nil ) && ( r.query != prev_query )
+          protein_counter += 1
+          passing_domains_per_protein = 0
+          if !saw_target
+            log << domain_not_present_counter.to_s + ": " + prev_query.to_s + " lacks target domain" + ld
+            domain_not_present_counter += 1
+          end
+          saw_target = false
+        end
+
+        prev_query = r.query
+
+        if domain_id != r.model
+          next
+        end
+
+        saw_target = true
+
+        #   target    = r.model
+        sequence  = r.query
+        # sequence_len = r.qlen
+        number    = r.number
+        out_of    = r.out_of
+        env_from  = r.env_from
+        env_to    = r.env_to
+        i_e_value = r.i_e_value
+        prev_query = r.query
+
+        length = 1 + env_to - env_from
+
+        overall_target_length_sum += length
+        if length > overall_target_length_max
+          overall_target_length_max = length
+        end
+        if length < overall_target_length_min
+          overall_target_length_min = length
+        end
+
+        if i_e_value > overall_target_ie_max
+          overall_target_ie_max = i_e_value
+        end
+        if i_e_value < overall_target_ie_min
+          overall_target_ie_min = i_e_value
+        end
+
+        if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
+        ( ( length_threshold <= 0 ) || ( length >= length_threshold.to_f ) ) )
+          hmmscan_datas << HmmscanData.new( sequence, number, out_of, env_from, env_to, i_e_value )
+          passing_target_length_sum += length
+          passing_domains_per_protein += 1
+          if length > passing_target_length_max
+            passing_target_length_max = length
+          end
+          if length < passing_target_length_min
+            passing_target_length_min = length
+          end
+          if i_e_value > passing_target_ie_max
+            passing_target_ie_max = i_e_value
+          end
+          if i_e_value < passing_target_ie_min
+            passing_target_ie_min = i_e_value
+          end
+          if ( passing_domains_per_protein > max_domain_copy_number_per_protein )
+            max_domain_copy_number_sequence    = sequence
+            max_domain_copy_number_per_protein = passing_domains_per_protein
+          end
+        else # no pass
+          log << domain_fail_counter.to_s + ": " + sequence.to_s + " fails threshold(s)"
+          if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) )
+            log << " iE=" + i_e_value.to_s
+          end
+          if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f )
+            le = env_to - env_from + 1
+            log << " l=" + le.to_s
+          end
+          log << ld
+          domain_fail_counter += 1
+        end
+
+        if number > out_of
+          error_msg = "number > out_of (this should not have happened)"
+          raise StandardError, error_msg
+        end
+
+        if number == out_of
+          if !hmmscan_datas.empty?
+            process_hmmscan_datas( hmmscan_datas,
+            in_msa,
+            add_position,
+            add_domain_number,
+            add_species,
+            out_msa )
+            domain_pass_counter += hmmscan_datas.length
+            if passed_seqs.find_by_name_start( sequence, true ).length < 1
+              add_sequence( sequence, in_msa, passed_seqs )
+            else
+              error_msg = "this should not have happened"
+              raise StandardError, error_msg
+            end
+          else # no pass
+            if failed_seqs.find_by_name_start( sequence, true ).length < 1
+              add_sequence( sequence, in_msa, failed_seqs )
+              proteins_with_failing_domains += 1
+            else
+              error_msg = "this should not have happened"
+              raise StandardError, error_msg
+            end
+          end
+          hmmscan_datas.clear
+        end
+
+      end # results.each do | r |
+
+      if (prev_query != nil) && (!saw_target)
+        log << domain_not_present_counter.to_s + ": " + prev_query.to_s + " lacks target domain" + ld
+        domain_not_present_counter += 1
+      end
+
+      if domain_pass_counter < 1
+        error_msg = "no domain sequences were extracted"
+        raise IOError, error_msg
+      end
+
+      if ( domain_not_present_counter + passed_seqs.get_number_of_seqs + proteins_with_failing_domains ) != protein_counter
+        error_msg = "not present + passing + not passing != proteins in sequence (fasta) file (this should not have happened)"
+        raise StandardError, error_msg
+      end
+
+      puts
+      log << ld
+
+      log << ld
+      avg_pass = ( passing_target_length_sum / domain_pass_counter )
+      puts( "Passing target domain lengths: average: " + avg_pass.to_s  )
+      log << "Passing target domain lengths: average: " + avg_pass.to_s
+      log << ld
+      puts( "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + " - "  + passing_target_length_max.to_s)
+      log << "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + " - "  + passing_target_length_max.to_s
+      log << ld
+      puts( "Passing target domain iE:      min-max: " + passing_target_ie_min.to_s + " - "  + passing_target_ie_max.to_s)
+      log << "Passing target domain iE:      min-max: " + passing_target_ie_min.to_s + " - "  + passing_target_ie_max.to_s
+      log << ld
+      puts( "Passing target domains:            sum: " + domain_pass_counter.to_s  )
+      log << "Passing target domains:            sum: " + domain_pass_counter.to_s
+      log << ld
+      log << ld
+      puts
+      sum = domain_pass_counter + domain_fail_counter
+      avg_all = overall_target_length_sum / sum
+      puts( "All target domain lengths:     average: " + avg_all.to_s  )
+      log << "All target domain lengths:     average: " + avg_all.to_s
+      log << ld
+      puts( "All target domain lengths:     min-max: " + overall_target_length_min.to_s + " - "  + overall_target_length_max.to_s)
+      log << "All target domain lengths:     min-max: " + overall_target_length_min.to_s + " - "  + overall_target_length_max.to_s
+      log << ld
+      puts( "All target target domain iE:   min-max: " + overall_target_ie_min.to_s + " - "  + overall_target_ie_max.to_s)
+      log << "All target target domain iE:   min-max: " + overall_target_ie_min.to_s + " - "  + overall_target_ie_max.to_s
+      log << ld
+      puts( "All target domains:                sum: " + sum.to_s  )
+      log << "All target domains:                sum: " + sum.to_s
+
+      puts
+      puts( "Proteins with passing target domain(s): " + passed_seqs.get_number_of_seqs.to_s )
+      puts( "Proteins with no passing target domain: " + proteins_with_failing_domains.to_s )
+      puts( "Proteins with no target domain        : " + domain_not_present_counter.to_s )
+
+      log << ld
+      log << ld
+      puts
+      puts( "Max target domain copy number per protein: " + max_domain_copy_number_per_protein.to_s )
+      log << "Max target domain copy number per protein: " + max_domain_copy_number_per_protein.to_s
+      log << ld
+
+      if ( max_domain_copy_number_per_protein > 1 )
+        puts( "First target protein with this copy number: " + max_domain_copy_number_sequence )
+        log << "First target protein with this copy number: " + max_domain_copy_number_sequence
+        log << ld
+      end
+
+      write_msa( out_msa, outfile + ".fasta"  )
+      write_msa( passed_seqs, passed_seqs_outfile )
+      write_msa( failed_seqs, failed_seqs_outfile )
+
+      log << ld
+      log << "passing target domains                       : " + domain_pass_counter.to_s + ld
+      log << "failing target domains                       : " + domain_fail_counter.to_s + ld
+      log << "proteins in sequence (fasta) file            : " + in_msa.get_number_of_seqs.to_s + ld
+      log << "proteins in hmmscan outputfile               : " + protein_counter.to_s + ld
+      log << "proteins with passing target domain(s)       : " + passed_seqs.get_number_of_seqs.to_s + ld
+      log << "proteins with no passing target domain       : " + proteins_with_failing_domains.to_s + ld
+      log << "proteins with no target domain               : " + domain_not_present_counter.to_s + ld
+
+      log << ld
+
+      return domain_pass_counter
+
+    end # parse
+
+    private
+
+    def write_msa( msa, filename )
+      io = MsaIO.new()
+      w = FastaWriter.new()
+      w.set_line_width( 60 )
+      w.clean( true )
+      begin
+        io.write_to_file( msa, filename, w )
+      rescue Exception
+        error_msg = "could not write to \"" + filename + "\""
+        raise IOError, error_msg
+      end
+    end
+
+    def add_sequence( sequence_name, in_msa, add_to_msa )
+      seqs = in_msa.find_by_name_start( sequence_name, true )
+      if ( seqs.length < 1 )
+        error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
+        raise StandardError, error_msg
+      end
+      if ( seqs.length > 1 )
+        error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
+        raise StandardError, error_msg
+      end
+      seq = in_msa.get_sequence( seqs[ 0 ] )
+      add_to_msa.add_sequence( seq )
+    end
+
+    def process_hmmscan_datas( hmmscan_datas,
+      in_msa,
+      add_position,
+      add_domain_number,
+      add_species,
+      out_msa )
+
+      actual_out_of = hmmscan_datas.size
+
+      seq_name = ""
+      prev_seq_name = nil
+
+      hmmscan_datas.each_with_index do |hmmscan_data, index|
+        if hmmscan_data.number < ( index + 1 )
+          error_msg = "hmmscan_data.number < ( index + 1 ) (this should not have happened)"
+          raise StandardError, error_msg
+        end
+
+        seq_name = hmmscan_data.seq_name
+
+        extract_domain( seq_name,
+        index + 1,
+        actual_out_of,
+        hmmscan_data.env_from,
+        hmmscan_data.env_to,
+        in_msa,
+        out_msa,
+        add_position,
+        add_domain_number,
+        add_species )
+
+        if prev_seq_name && prev_seq_name != seq_name
+          error_msg = "this should not have happened"
+          raise StandardError, error_msg
+        end
+        prev_seq_name = seq_name
+      end # each
+    end # def process_hmmscan_data
+
+    def extract_domain( sequence,
+      number,
+      out_of,
+      seq_from,
+      seq_to,
+      in_msa,
+      out_msa,
+      add_position,
+      add_domain_number,
+      add_species )
+      if number.is_a?( Fixnum ) && ( number < 1 || out_of < 1 || number > out_of )
+        error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s
+        raise StandardError, error_msg
+      end
+      if seq_from < 1 || seq_to < 1 || seq_from >= seq_to
+        error_msg = "impossible: seq-from=" + seq_from.to_s + ", seq-to=" + seq_to.to_s
+        raise StandardError, error_msg
+      end
+      seqs = in_msa.find_by_name_start( sequence, true )
+      if seqs.length < 1
+        error_msg = "sequence \"" + sequence + "\" not found in sequence file"
+        raise IOError, error_msg
+      end
+      if seqs.length > 1
+        error_msg = "sequence \"" + sequence + "\" not unique in sequence file"
+        raise IOError, error_msg
+      end
+      # hmmscan is 1 based, whereas sequences are 0 bases in this package.
+      seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
+
+      orig_name = seq.get_name
+
+      seq.set_name( orig_name.split[ 0 ] )
+
+      if add_position
+        seq.set_name( seq.get_name + "_" + seq_from.to_s + "-" + seq_to.to_s )
+      end
+
+      if out_of != 1 && add_domain_number
+        seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
+      end
+
+      if add_species
+        a = orig_name.rindex "["
+        b = orig_name.rindex "]"
+        unless a && b
+          error_msg = "species not found in " + orig_name
+          raise StandardError, error_msg
+        end
+        species = orig_name[ a .. b ]
+        seq.set_name( seq.get_name + " " + species )
+      end
+      out_msa.add_sequence( seq )
+    end
+
+    def is_ignorable?( line )
+      return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
+    end
+
+  end # class HmmscanDomainExtractor
+
+  class HmmscanData
+    def initialize( seq_name, number, out_of, env_from, env_to, i_e_value )
+      @seq_name = seq_name
+      @number = number
+      @out_of = out_of
+      @env_from = env_from
+      @env_to = env_to
+      @i_e_value = i_e_value
+    end
+    attr_reader :seq_name, :number, :out_of, :env_from, :env_to, :i_e_value
+  end
+
+  class PassCondition
+    def initialize( hmm, i_e_value, abs_len, rel_len )
+      @hmm = hmm
+      @i_e_value = i_e_value
+      @abs_len = abs_len
+      @percent_len = rel_len
+    end
+    attr_reader :hmm, :i_e_value, :abs_len, :rel_len
+  end
+
+end # module Evoruby
+
diff --git a/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb

index 5448f4a..c1e23b0 100644 (file)
--- a/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb
+++ b/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb
@@ -1,5 +1,5 @@
  #
-# = lib/evo/apps/taxonomy_processor - TaxonomyProcessor class
+# = lib/evo/tool/domain_sequence_extractor - DomainSequenceExtractor class
  #
  # Copyright::    Copyright (C) 2017 Christian M. Zmasek
  # License::      GNU Lesser General Public License (LGPL)
diff --git a/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb

new file mode 100644 (file)

index 0000000..6c74e8a
--- /dev/null
+++ b/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb
@@ -0,0 +1,286 @@
+#
+# = lib/evo/tool/multi_domain_seq_extractor - MultiDomainSeqExtractor class
+#
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
+
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/util/command_line_arguments'
+require 'lib/evo/io/parser/hmmscan_multi_domain_extractor'
+
+module Evoruby
+  class MultiDomainSeqExtractor
+
+    PRG_NAME       = "mdsx"
+    PRG_VERSION    = "1.000"
+    PRG_DESC       = "Extraction of multi domain sequences from hmmscan output"
+    PRG_DATE       = "20170220"
+    WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
+
+    E_VALUE_THRESHOLD_DEFAULT = 0.1
+    LENGTH_THRESHOLD_DEFAULT  = 50
+
+    E_VALUE_THRESHOLD_OPTION           = 'e'
+    LENGTH_THRESHOLD_OPTION            = 'l'
+    ADD_POSITION_OPTION                = 'p'
+    ADD_DOMAIN_NUMBER_OPTION           = 'd'
+    ADD_SPECIES                        = 's'
+    LOG_FILE_SUFFIX                    = '_multi_domain_seq_extr.log'
+    PASSED_SEQS_SUFFIX                 = '_with_passing_domains.fasta'
+    FAILED_SEQS_SUFFIX                 = '_with_no_passing_domains.fasta'
+    HELP_OPTION_1                      = 'help'
+    HELP_OPTION_2                      = 'h'
+    def run()
+
+      Util.print_program_information( PRG_NAME,
+      PRG_VERSION,
+      PRG_DESC ,
+      PRG_DATE,
+      WWW,
+      STDOUT )
+
+      if ( ARGV == nil || ( ARGV.length < 1 )  )
+        print_help
+        exit( -1 )
+      end
+
+      begin
+        cla = CommandLineArguments.new( ARGV )
+      rescue ArgumentError
+        Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT )
+      end
+
+      if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+      cla.is_option_set?( HELP_OPTION_2 ) )
+        print_help
+        exit( 0 )
+      end
+
+      unless ( cla.get_number_of_files == 2 || cla.get_number_of_files == 3 || cla.get_number_of_files == 4 )
+        print_help
+        exit( -1 )
+      end
+
+      allowed_opts = Array.new
+      allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
+      allowed_opts.push( ADD_POSITION_OPTION )
+      allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION )
+      allowed_opts.push( LENGTH_THRESHOLD_OPTION )
+      allowed_opts.push( ADD_SPECIES )
+
+      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+      if ( disallowed.length > 0 )
+        Util.fatal_error( PRG_NAME,
+        "unknown option(s): " + disallowed,
+        STDOUT )
+      end
+
+      e_value_threshold = E_VALUE_THRESHOLD_DEFAULT
+      if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
+        begin
+          e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+        if ( e_value_threshold < 0.0 )
+          Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
+        end
+      end
+
+      length_threshold = LENGTH_THRESHOLD_DEFAULT
+      if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) )
+        begin
+          length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+        if ( length_threshold < 0)
+          Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT )
+        end
+      end
+
+      domain_id           = cla.get_file_name( 0 )
+      hmmscan_output      = cla.get_file_name( 1 )
+      fasta_sequence_file = ""
+      outfile             = ""
+
+      if (cla.get_number_of_files == 4)
+        fasta_sequence_file = cla.get_file_name( 2 )
+        outfile             = cla.get_file_name( 3 )
+      elsif (cla.get_number_of_files == 2 || cla.get_number_of_files == 3)
+        if (cla.get_number_of_files == 3)
+          fasta_sequence_file = cla.get_file_name( 2 )
+        else
+          hmmscan_index = hmmscan_output.index(Constants::HMMSCAN)
+          if ( hmmscan_index != nil )
+            prefix = hmmscan_output[0 .. hmmscan_index-1 ]
+            suffix = Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX
+            files = Dir.entries( "." )
+            matching_files = Util.get_matching_files( files, prefix, suffix)
+            if matching_files.length < 1
+              Util.fatal_error( PRG_NAME, 'no file matching [' + prefix +
+              '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
+            end
+            if matching_files.length > 1
+              Util.fatal_error( PRG_NAME, 'more than one file matching [' +
+              prefix  + '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
+            end
+            fasta_sequence_file = matching_files[ 0 ]
+          else
+            Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' )
+          end
+        end
+        hmmscan_index = hmmscan_output.index(Constants::HMMSCAN)
+        if ( hmmscan_index != nil )
+          outfile = hmmscan_output.sub(Constants::HMMSCAN, "_") + "_" + domain_id
+          e = e_value_threshold >= 1 ? e_value_threshold.to_i : e_value_threshold.to_s.sub!('.','_')
+          outfile += "_E" + e.to_s
+          outfile += "_L" + length_threshold.to_i.to_s
+        else
+          Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' )
+        end
+      end
+
+      if outfile.downcase.end_with?( ".fasta" )
+        outfile = outfile[ 0 .. outfile.length - 7 ]
+      elsif outfile.downcase.end_with?( ".fsa" )
+        outfile = outfile[ 0 .. outfile.length - 5 ]
+      end
+
+      add_position = false
+      if ( cla.is_option_set?( ADD_POSITION_OPTION ) )
+        add_position = true
+      end
+
+      add_domain_number = false
+      if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) )
+        add_domain_number = true
+      end
+
+      add_species = false
+      if cla.is_option_set?( ADD_SPECIES )
+        add_species = true
+      end
+
+      log = String.new
+      ld = Constants::LINE_DELIMITER
+
+      puts()
+      puts( "Domain                                                                             : " + domain_id )
+      log << "Domain                                                                             : " + domain_id + ld
+      puts( "Hmmscan outputfile                                                                 : " + hmmscan_output )
+      log << "Hmmscan outputfile                                                                 : " + hmmscan_output + ld
+      puts( "Fasta sequencefile (complete sequences)                                            : " + fasta_sequence_file )
+      log << "Fasta sequencefile (complete sequences)                                            : " + fasta_sequence_file + ld
+      puts( "Outputfile                                                                         : " + outfile + ".fasta" )
+      log << "Outputfile                                                                         : " + outfile + ld
+      puts( "Passed sequences outfile (fasta)                                                   : " + outfile + PASSED_SEQS_SUFFIX )
+      log << "Passed sequences outfile (fasta)                                                   : " + outfile + PASSED_SEQS_SUFFIX + ld
+      puts( "Failed sequences outfile (fasta)                                                   : " + outfile + FAILED_SEQS_SUFFIX )
+      log << "Failed sequences outfile (fasta)                                                   : " + outfile + FAILED_SEQS_SUFFIX + ld
+      puts( "Logfile                                                                            : " + outfile + LOG_FILE_SUFFIX )
+      log << "Logfile                                                                            : " + outfile + LOG_FILE_SUFFIX + ld
+      if ( e_value_threshold >= 0.0 )
+        puts( "iE-value threshold                                                                 : " + e_value_threshold.to_s )
+        log << "iE-value threshold                                                                 : " + e_value_threshold.to_s + ld
+      else
+        puts( "iE-value threshold                                                                 : no threshold" )
+        log << "iE-value threshold                                                                 : no threshold" + ld
+      end
+      if ( length_threshold > 0 )
+        puts( "Length threshold (env)                                                             : " + length_threshold.to_s )
+        log << "Length threshold (env)                                                             : " + length_threshold.to_s + ld
+      else
+        puts( "Length threshold (env)                                                             : no threshold" )
+        log << "Length threshold  (env)                                                            : no threshold" + ld
+      end
+      if ( add_position )
+        puts( "Add positions (rel to complete seq) to extracted domains                           : true" )
+        log << "Add positions (rel to complete seq) to extracted domains                           : true" + ld
+      else
+        puts( "Add positions (rel to complete seq) to extracted domains                           : false" )
+        log << "Add positions (rel to complete seq) to extracted domains                           : false" + ld
+      end
+
+      if ( add_domain_number )
+        puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" )
+        log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld
+      else
+        puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" )
+        log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld
+      end
+
+      puts
+      log <<  ld
+
+      domain_count = 0
+      begin
+        parser = HmmscanMultiDomainExtractor.new()
+        domain_count = parser.parse( domain_id,
+        hmmscan_output,
+        fasta_sequence_file,
+        outfile,
+        outfile + PASSED_SEQS_SUFFIX,
+        outfile + FAILED_SEQS_SUFFIX,
+        e_value_threshold,
+        length_threshold,
+        add_position,
+        add_domain_number,
+        add_species,
+        log )
+      rescue ArgumentError, IOError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+
+      rescue Exception => e
+        puts e.backtrace
+        Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT )
+
+      end
+
+      puts
+      Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" )
+      Util.print_message( PRG_NAME, "wrote: " + outfile + ".fasta")
+      Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX )
+      Util.print_message( PRG_NAME, "wrote: " + outfile + PASSED_SEQS_SUFFIX )
+      Util.print_message( PRG_NAME, "wrote: " + outfile + FAILED_SEQS_SUFFIX )
+
+      begin
+        f = File.open( outfile + LOG_FILE_SUFFIX, 'a' )
+        f.print( log )
+        f.close
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+
+      puts
+      Util.print_message( PRG_NAME, "OK" )
+      puts
+
+    end
+
+    def print_help()
+      puts()
+      puts( "Usage:" )
+      puts()
+      puts( "  " + PRG_NAME + ".rb [options] <target domain> <hmmscan outputfile> [file containing complete sequences in fasta format] [outputfile]" )
+      puts()
+      puts( "  options: -" + E_VALUE_THRESHOLD_OPTION  + "=<f> : iE-value threshold for target domain, default is " + E_VALUE_THRESHOLD_DEFAULT.to_s )
+      puts( "           -" + LENGTH_THRESHOLD_OPTION   + "=<i> : length threshold target domain (env), default is " + LENGTH_THRESHOLD_DEFAULT.to_s )
+      puts( "           -" + ADD_DOMAIN_NUMBER_OPTION  + "     : to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" )
+      puts( "           -" + ADD_POSITION_OPTION  + "     : to add positions (rel to complete seq) to extracted domains" )
+      puts( "           -" + ADD_SPECIES  + "     : to add species [in brackets]" )
+      puts()
+      puts( "Examples:" )
+      puts
+      puts( "  " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 P53_ni.fasta P53_#{Constants::PFAM_V_FOR_EX}_10_Pkinase_E1_0e-06_L50" )
+      puts
+      puts( "  " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 P53_ni.fasta" )
+      puts
+      puts( "  " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10" )
+      puts()
+    end
+
+  end # class DomainSequenceExtractor
+
+end # module Evoruby
author	cmzmasek <chris.zma@outlook.com>
	Wed, 22 Feb 2017 01:42:47 +0000 (17:42 -0800)
committer	cmzmasek <chris.zma@outlook.com>
	Wed, 22 Feb 2017 01:42:47 +0000 (17:42 -0800)
forester/ruby/evoruby/exe/dsx.rb		patch \| blob \| history
forester/ruby/evoruby/exe/mdsx.rb	[new file with mode: 0755]	patch \| blob
forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb		patch \| blob \| history
forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb	[new file with mode: 0644]	patch \| blob
forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb		patch \| blob \| history
forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb	[new file with mode: 0644]	patch \| blob