in progress...

author cmzmasek <chris.zma@outlook.com>

Wed, 8 Mar 2017 02:55:59 +0000 (18:55 -0800)

committer cmzmasek <chris.zma@outlook.com>

Wed, 8 Mar 2017 02:55:59 +0000 (18:55 -0800)
author cmzmasek <chris.zma@outlook.com>
Wed, 8 Mar 2017 02:55:59 +0000 (18:55 -0800)
committer cmzmasek <chris.zma@outlook.com>
Wed, 8 Mar 2017 02:55:59 +0000 (18:55 -0800)
diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb

index 33e48a4..35b32df 100644 (file)
--- a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb
+++ b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb
@@ -16,14 +16,16 @@ require 'lib/evo/io/parser/hmmscan_parser'
  module Evoruby
    class HmmscanMultiDomainExtractor
  
+    TESTING = false
      OUTPUT_ID = 'mdsx'
      DOMAIN_DELIMITER = ' -- '
  
      PASSING_FL_SEQS_SUFFIX    = "_#{OUTPUT_ID}_passing_full_length_seqs.fasta"
      FAILING_FL_SEQS_SUFFIX    = "_#{OUTPUT_ID}_failing_full_length_seqs.fasta"
      TARGET_DA_SUFFIX          = "_#{OUTPUT_ID}_target_da.fasta"
-    CONCAT_TARGET_DOM_SUFFIX  = "_#{OUTPUT_ID}_concat_target_dom.fasta"
+    CONCAT_TARGET_DOM_SUFFIX  = "_#{OUTPUT_ID}_concat_target_doms.fasta"
      TARGET_DOM_OUTPUT_MIDPART = "_#{OUTPUT_ID}_target_dom_"
+    LOG_FILE_SUFFIX           = "_#{OUTPUT_ID}.log"
      def initialize
        @passing_domains_data = nil
        @failing_domains_data = nil
@@ -31,6 +33,7 @@ module Evoruby
        @passsing_domain_architectures = nil
        @failing_proteins_bc_not_all_target_doms_present = nil
        @failing_proteins_bc_missing_cutoffs = nil
+      @log = nil
      end
  
      # raises ArgumentError, IOError, StandardError
@@ -44,6 +47,7 @@ module Evoruby
        failing_fl_seqs_outfile   = outfile_base + FAILING_FL_SEQS_SUFFIX
        target_da_outfile         = outfile_base + TARGET_DA_SUFFIX
        concat_target_dom_outfile = outfile_base + CONCAT_TARGET_DOM_SUFFIX
+      logfile                   = outfile_base + LOG_FILE_SUFFIX
  
        Util.check_file_for_readability( hmmscan_output )
        Util.check_file_for_readability( fasta_sequence_file )
@@ -51,6 +55,9 @@ module Evoruby
        Util.check_file_for_writability( failing_fl_seqs_outfile )
        Util.check_file_for_writability( target_da_outfile )
        Util.check_file_for_writability( concat_target_dom_outfile )
+      Util.check_file_for_writability( logfile )
+
+      @log = log_str
  
        in_msa = nil
        factory = MsaFactory.new()
@@ -64,14 +71,11 @@ module Evoruby
        failed_seqs_msa = Msa.new
        passed_seqs_msa = Msa.new
  
-      ld = Constants::LINE_DELIMITER
-
        hmmscan_parser = HmmscanParser.new( hmmscan_output )
        results = hmmscan_parser.parse
  
        ####
        # Import: if multiple copies of same domain, thresholds need to be same!
-      target_domain_ary = Array.new
  
        #target_domain_ary.push(TargetDomain.new('DNA_pol_B_exo1', 1e-6, -1, 0.6, 0 ))
        #target_domain_ary.push(TargetDomain.new('DNA_pol_B', 1e-6, -1, 0.6, 1 ))
@@ -81,9 +85,11 @@ module Evoruby
        # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 ))
        # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 ))
  
-      target_domain_ary.push(TargetDomain.new('BH4', 1, -1, 0.2, 0 ))
-      target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 1 ))
-      target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 2 ))
+      target_domain_ary = parse_da target_da
+
+      # target_domain_ary.push(TargetDomain.new('BH4', 1, -1, 0.2, 0 ))
+      # target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 1 ))
+      # target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 2 ))
        # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 ))
  
        #  target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 ))
@@ -105,7 +111,12 @@ module Evoruby
  
        target_domain_architecure.freeze
  
-      puts 'Target domain architecture: ' + target_domain_architecure
+      log 'Hmmscan outputfile             : ' + hmmscan_output
+      log 'Full length fasta sequence file: ' + fasta_sequence_file
+      log 'Target domain architecture     : ' + target_domain_architecure
+      target_domain_ary.each do |x|
+        log x.to_str
+      end
  
        target_domain_names = Set.new
  
@@ -128,7 +139,7 @@ module Evoruby
        out_concatenated_domains_msa = Msa.new
        results.each do |hmmscan_result|
          if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name )
-          if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
+          if compare(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
              passing_sequences.push(domains_in_query_seq)
            else
              failing_sequences.push(domains_in_query_seq)
@@ -141,44 +152,46 @@ module Evoruby
        end # each result
  
        if prev_query_seq_name != nil
-        total_sequences += 1
-        if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
+        if compare(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
            passing_sequences.push(domains_in_query_seq)
          else
            failing_sequences.push(domains_in_query_seq)
          end
+        total_sequences += 1
        end
  
-      puts
-
-      puts 'Failing domain architectures containing one or more target domain(s):'
-      @failing_domain_architectures = @failing_domain_architectures .sort{|a, b|a<=>b}.to_h
-      failing_da_sum = 0
-      @failing_domain_architectures .each do |da, count|
-        failing_da_sum += count
-        puts count.to_s.rjust(4) + ': ' + da
+      if in_msa.get_number_of_seqs < total_sequences
+        error_msg = "hmmscan output contains more protein sequences than fasta sequence file"
+        raise IOError, error_msg
        end
  
-      puts
-
-      puts 'Passing domain architectures containing target domain(s):'
+      log
+      log 'Passing domain architectures containing target domain(s):'
        @passsing_domain_architectures = @passsing_domain_architectures.sort{|a, b|a<=>b}.to_h
        passing_da_sum = 0
        @passsing_domain_architectures.each do |da, count|
          passing_da_sum += count
-        puts count.to_s.rjust(4) + ': ' + da
+        log count.to_s.rjust(4) + ': ' + da
        end
-
-      puts 'Passing domain(s):'
+      log
+      log 'Failing domain architectures containing one or more target domain(s):'
+      @failing_domain_architectures = @failing_domain_architectures .sort{|a, b|a<=>b}.to_h
+      failing_da_sum = 0
+      @failing_domain_architectures .each do |da, count|
+        failing_da_sum += count
+        log count.to_s.rjust(4) + ': ' + da
+      end
+      log
+      log 'Passing target domain(s):'
        @passing_domains_data = @passing_domains_data.sort{|a, b|a<=>b}.to_h
        @passing_domains_data.each do |n, d|
-        puts d.to_str
+        log d.to_str
        end
-
-      puts 'Failing domain(s):'
+      log
+      log'Failing target domain(s):'
        @failing_domains_data = @failing_domains_data.sort{|a, b|a<=>b}.to_h
        @failing_domains_data.each do |n, d|
-        puts d.to_str
+        log d.to_str
        end
  
        unless total_sequences == (passing_sequences.size + failing_sequences.size)
@@ -201,60 +214,60 @@ module Evoruby
          raise StandardError, error_msg
        end
  
-      puts
-
-      puts "Protein sequences in sequence (fasta) file: " + in_msa.get_number_of_seqs.to_s.rjust(5)
-      puts "Protein sequences in hmmscan output file  : " + total_sequences.to_s.rjust(5)
-      puts "  Passing protein sequences               : " + passing_sequences.size.to_s.rjust(5)
-      puts "  Failing protein sequences               : " + failing_sequences.size.to_s.rjust(5)
-      puts "    Not all target domain present         : " + @failing_proteins_bc_not_all_target_doms_present.to_s.rjust(5)
-      puts "    Target domain(s) failing cutoffs      : " + @failing_proteins_bc_missing_cutoffs.to_s.rjust(5)
-
-      puts
+      log
+      log "Protein sequences in sequence (fasta) file: " + in_msa.get_number_of_seqs.to_s.rjust(5)
+      log "Protein sequences in hmmscan output file  : " + total_sequences.to_s.rjust(5)
+      log "  Passing protein sequences               : " + passing_sequences.size.to_s.rjust(5)
+      log "  Failing protein sequences               : " + failing_sequences.size.to_s.rjust(5)
+      log "    Not all target domain present         : " + @failing_proteins_bc_not_all_target_doms_present.to_s.rjust(5)
+      log "    Target domain(s) failing cutoffs      : " + @failing_proteins_bc_missing_cutoffs.to_s.rjust(5)
+      log
  
        out_domain_msas.keys.sort.each do |domain_name|
          file_name = outfile_base + TARGET_DOM_OUTPUT_MIDPART + domain_name + '.fasta'
-
-        write_msa( out_domain_msas[domain_name], file_name )
-        puts "wrote #{domain_name}"
+        write_msa out_domain_msas[domain_name], file_name
+        log "Wrote passing target domain sequence for " +  domain_name.ljust(16) + ': ' + file_name
        end
  
-      write_msa( out_domain_architecture_msa, target_da_outfile )
-      puts "wrote target_domain_architecure"
+      write_msa out_domain_architecture_msa, target_da_outfile
+      log 'Wrote target domain architecture                         : ' + target_da_outfile
  
-      write_msa( out_concatenated_domains_msa, concat_target_dom_outfile )
-      puts "wrote concatenated_domains_msa"
+      write_msa out_concatenated_domains_msa, concat_target_dom_outfile
+      log 'Wrote concatenated target domain(s)                      : ' + concat_target_dom_outfile
  
        passing_sequences.each do | domains |
          query_name = domains[0].query
-        if passed_seqs_msa.find_by_name_start( query_name, true ).length < 1
+        if (!TESTING) || (passed_seqs_msa.find_by_name_start( query_name, true ).length < 1)
            add_sequence( query_name, in_msa, passed_seqs_msa )
          else
-          error_msg = "this should not have happened"
+          error_msg = 'this should not have happened'
            raise StandardError, error_msg
          end
        end
  
        failing_sequences.each do | domains |
          query_name = domains[0].query
-        if failed_seqs_msa.find_by_name_start( query_name, true ).length < 1
+        if (!TESTING) || (failed_seqs_msa.find_by_name_start( query_name, true ).length < 1)
            add_sequence( query_name, in_msa, failed_seqs_msa )
          else
-          error_msg = "this should not have happened"
+          error_msg = 'this should not have happened'
            raise StandardError, error_msg
          end
        end
  
-      write_msa( passed_seqs_msa, passing_fl_seqs_outfile )
-      puts "wrote ..."
-      write_msa( failed_seqs_msa, failing_fl_seqs_outfile )
-      puts "wrote ..."
-
-      log_str << ld
+      write_msa passed_seqs_msa, passing_fl_seqs_outfile
+      log 'Wrote passing full length protein sequences              : ' + passing_fl_seqs_outfile
+      write_msa failed_seqs_msa, failing_fl_seqs_outfile
+      log 'Wrote failing full length protein sequences              : ' + failing_fl_seqs_outfile
  
-      log_str << "proteins in sequence (fasta) file            : " + in_msa.get_number_of_seqs.to_s + ld
-
-      log_str << ld
+      begin
+        f = File.open( logfile, 'w' )
+        f.print( @log )
+        f.close
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+      log 'Wrote log file                                           : ' + logfile
  
      end # parse
  
@@ -264,7 +277,7 @@ module Evoruby
      # target_domain_names: Set of String
      # target_domains: Hash String->TargetDomain
      # target_domain_architecture: String
-    def checkit2(domains_in_query_seq,
+    def compare(domains_in_query_seq,
        target_domain_names,
        target_domains,
        in_msa,
@@ -442,7 +455,7 @@ module Evoruby
        w = FastaWriter.new()
        w.set_line_width( 60 )
        w.clean( true )
-      File.delete(filename) if File.exist?(filename) #TODO remove me
+      File.delete(filename) if File.exist?(filename)
        begin
          io.write_to_file( msa, filename, w )
        rescue Exception
@@ -505,13 +518,42 @@ module Evoruby
        seq.get_sequence_as_string
      end
  
+    def parse_da( target_da_str )
+      target_domain_hash = Hash.new
+      target_domain_ary = Array.new
+      target_das = target_da_str.split '--'
+      target_das.each do |x|
+        inds = x.split '='
+        unless inds.size == 4
+          raise IOError, 'domain architecture is ill formatted: ' + x
+        end
+        target_domain_name = inds[0]
+        ie_cutoff = inds[1].to_f
+        abs_len_cutoff = inds[2].to_i
+        rel_len_cutoff = inds[3].to_f
+        if target_domain_hash.has_key? target_domain_name
+          target_domain_ary.push target_domain_hash[target_domain_name]
+        else
+          td = TargetDomain.new(target_domain_name, ie_cutoff, abs_len_cutoff, rel_len_cutoff)
+          target_domain_hash[target_domain_name] = td
+          target_domain_ary.push td
+        end
+      end
+      target_domain_ary
+    end
+
+    def log(str = '')
+      puts str
+      @log << str << Constants::LINE_DELIMITER
+    end
+
    end # class HmmscanMultiDomainExtractor
  
    class DomainData
      def initialize( name )
        if (name == nil) || name.size < 1
          error_msg = "domain name nil or empty"
-        raise StandardError, error_msg
+        raise IOError, error_msg
        end
        @name = name
        @count = 0
@@ -526,16 +568,16 @@ module Evoruby
      def add( name, length, i_e_value)
        if name != @name
          error_msg = "domain names do not match"
-        raise StandardError, error_msg
+        raise IOError, error_msg
        end
  
        if length < 0
          error_msg = "length cannot me negative"
-        raise StandardError, error_msg
+        raise IOError, error_msg
        end
        if i_e_value < 0
          error_msg = "iE-value cannot me negative"
-        raise StandardError, error_msg
+        raise IOError, error_msg
        end
        @count += 1
        @i_e_value_sum += i_e_value
@@ -570,7 +612,7 @@ module Evoruby
  
      def to_str
        s = ''
-      s << @name.rjust(24) + ': '
+      s << @name.rjust(16) + ': '
        s << @count.to_s.rjust(4) + '  '
        s << avg_length.round(1).to_s.rjust(6) + ' '
        s << @len_min.to_s.rjust(4) + ' -'
@@ -586,22 +628,41 @@ module Evoruby
    end
  
    class TargetDomain
-    def initialize( name, i_e_value, abs_len, rel_len, position )
+    def initialize(name, i_e_value, abs_len, rel_len)
        if (name == nil) || name.size < 1
          error_msg = "target domain name nil or empty"
-        raise StandardError, error_msg
+        raise IOError, error_msg
        end
        if rel_len > 1
-        error_msg = "target domain relative length is greater than 1"
-        raise StandardError, error_msg
+        error_msg = name + ": target domain relative length is greater than 1"
+        raise IOError, error_msg
+      end
+      if (abs_len <= 0) && (rel_len <= 0)
+        error_msg = name + ": need to have either absolute length or relative length cutoff"
+        raise IOError, error_msg
+      end
+      if (abs_len > 0) && (rel_len > 0)
+        error_msg = name + ": cannot have both absolute length and relative length cutoff"
+        raise IOError, error_msg
        end
        @name = name
        @i_e_value = i_e_value
        @abs_len = abs_len
        @rel_len = rel_len
-      @position = position
      end
-    attr_reader :name, :i_e_value, :abs_len, :rel_len, :position
+
+    def to_str
+      s = @name.rjust(16) + ':'
+      s << ' iE-cutoff: ' + ("%.2E" % @i_e_value).rjust(9)
+      if @abs_len > 0
+        s << ', abs len-cutoff: ' + @abs_len.to_s.rjust(4)
+      end
+      if @rel_len > 0
+        s << ', rel len-cutoff: ' + @rel_len.to_s.rjust(4)
+      end
+      s
+    end
+    attr_reader :name, :i_e_value, :abs_len, :rel_len
    end
  
  end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/msa/msa.rb b/forester/ruby/evoruby/lib/evo/msa/msa.rb

index 549ae2c..0dcad71 100644 (file)
--- a/forester/ruby/evoruby/lib/evo/msa/msa.rb
+++ b/forester/ruby/evoruby/lib/evo/msa/msa.rb
@@ -6,15 +6,12 @@
  #
  # Last modified: 2017/02/07
  
-
  require 'lib/evo/util/constants'
  require 'lib/evo/util/util'
  require 'lib/evo/sequence/sequence'
  
  module Evoruby
-
    class Msa
-
      def initialize()
        @sequences = Array.new
        @identical_seqs_detected = Array.new
@@ -22,7 +19,6 @@ module Evoruby
        @namestart_to_seq_indices = Hash.new
      end
  
-
      def add_sequence( sequence )
        @sequences.push( sequence )
      end
@@ -34,8 +30,8 @@ module Evoruby
      def get_sequence( index )
        if ( index < 0 || index > get_number_of_seqs() - 1 )
          error_msg = "attempt to get sequence " <<
-         index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
-         " sequences"
+        index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
+        " sequences"
          raise ArgumentError, error_msg
        end
        return @sequences[ index ]
@@ -44,8 +40,8 @@ module Evoruby
      def remove_sequence!( index )
        if ( index < 0 || index > get_number_of_seqs() - 1 )
          error_msg = "attempt to remove sequence " <<
-         index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
-         " sequences"
+        index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
+        " sequences"
          raise ArgumentError, error_msg
        end
        @name_to_seq_indices.clear
@@ -57,7 +53,6 @@ module Evoruby
        @identical_seqs_detected
      end
  
-
      def is_aligned()
        if ( get_number_of_seqs < 1 )
          return false
@@ -90,7 +85,7 @@ module Evoruby
            name = name.downcase
          end
          if current_name == name ||
-           ( partial_match && current_name.include?( name ) )
+        ( partial_match && current_name.include?( name ) )
            indices.push( i )
          end
        end
@@ -127,26 +122,26 @@ module Evoruby
        get_sequence( indices[ 0 ] )
      end
  
-    def find_by_name_start( name, case_sensitive )
+    def find_by_name_start(name, case_sensitive = true)
        if case_sensitive && @namestart_to_seq_indices.has_key?( name )
          return @namestart_to_seq_indices[ name ]
        end
        indices = []
-      for i in 0 ... get_number_of_seqs()
-        get_sequence( i ).get_name() =~ /^\s*(\S+)/
+      for i in 0 ... get_number_of_seqs
+        get_sequence(i).get_name =~ /^\s*(\S+)/
          current_name = $1
          if case_sensitive
-          if !@namestart_to_seq_indices.has_key?( current_name )
-            @namestart_to_seq_indices[ current_name ] = []
+          unless @namestart_to_seq_indices.has_key?(current_name)
+            @namestart_to_seq_indices[current_name] = []
            end
-          @namestart_to_seq_indices[ current_name ].push( i )
+          @namestart_to_seq_indices[current_name].push( i )
          end
          if !case_sensitive
            current_name = current_name.downcase
            name = name.downcase
          end
-        if  current_name == name
-          indices.push( i )
+        if current_name == name
+          indices.push(i)
          end
        end
        indices
@@ -160,7 +155,7 @@ module Evoruby
            name = name.downcase
          end
          if current_name == name ||
-           ( partial_match && current_name.include?( name ) )
+        ( partial_match && current_name.include?( name ) )
            return true
          end
        end
@@ -193,7 +188,6 @@ module Evoruby
        get_sequence( indices[ 0 ] )
      end
  
-
      def get_sub_alignment( seq_numbers )
        msa = Msa.new()
        for i in 0 ... seq_numbers.length()
@@ -230,7 +224,6 @@ module Evoruby
        s
      end
  
-
      def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 )
        if ( !is_aligned() )
          error_msg = "attempt to get overlap diagram of unaligned msa"
@@ -273,7 +266,7 @@ module Evoruby
        overlap_count = 0
        for i in 0...seq_1.get_length()
          if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
-           !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
+        !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
            overlap_count += 1
            if overlap_count >= min_overlap
              return true
@@ -289,7 +282,7 @@ module Evoruby
        overlap_count = 0
        for i in 0...seq_1.get_length
          if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
-           !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
+        !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
            overlap_count += 1
          end
        end
@@ -302,10 +295,10 @@ module Evoruby
        identities_count = 0
        for i in 0...seq_1.get_length
          if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
-           !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) &&
-           seq_1.get_character_code( i ) != 63 &&
-           ( seq_1.get_residue( i ).downcase() ==
-             seq_2.get_residue( i ).downcase() )
+        !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) &&
+        seq_1.get_character_code( i ) != 63 &&
+        ( seq_1.get_residue( i ).downcase() ==
+        seq_2.get_residue( i ).downcase() )
            identities_count += 1
          end
        end
@@ -325,7 +318,6 @@ module Evoruby
        remove_columns!( get_gap_columns_w_gap_ratio( gap_ratio ) )
      end
  
-
      def remove_sequences_by_gap_ratio!( gap_ratio )
        if ( !is_aligned() )
          error_msg = "attempt to remove sequences by gap ratio on unaligned msa"
@@ -345,7 +337,6 @@ module Evoruby
        removed
      end
  
-
      def remove_redundant_sequences!( consider_taxonomy = false, verbose = false )
        n = get_number_of_seqs
        removed = Array.new
@@ -355,10 +346,10 @@ module Evoruby
          for j in ( i + 1 ) ... n
            if !to_be_removed.include?( i ) && !to_be_removed.include?( j )
              if  !consider_taxonomy ||
-               ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) ||
-                 ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) )
+            ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) ||
+            ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) )
                if Util.clean_seq_str( get_sequence( i ).get_sequence_as_string ) ==
-                 Util.clean_seq_str( get_sequence( j ).get_sequence_as_string )
+              Util.clean_seq_str( get_sequence( j ).get_sequence_as_string )
                  to_be_removed.add( j )
  
                  tax_i = ""
@@ -389,7 +380,6 @@ module Evoruby
        removed
      end
  
-
      def remove_sequences_by_non_gap_length!( min_non_gap_length )
        n = get_number_of_seqs
        removed = Array.new
@@ -504,7 +494,6 @@ module Evoruby
        return cols
      end
  
-
      # Split an alignment into n alignemnts of equal size, except last one
      def split( n, verbose = false )
        if ( n < 2 || n > get_number_of_seqs )
@@ -538,7 +527,6 @@ module Evoruby
        msas
      end
  
-
      private
  
      def get_overlaps( min_overlap = 1 )
@@ -588,7 +576,6 @@ module Evoruby
        return self
      end
  
-
    end # class Msa
  
  end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb

index c1a35ab..762556b 100644 (file)
--- a/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb
+++ b/forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb
@@ -15,10 +15,9 @@ module Evoruby
      PRG_NAME       = "mdsx"
      PRG_VERSION    = "1.000"
      PRG_DESC       = "Extraction of multi domain sequences from hmmscan output"
-    PRG_DATE       = "20170220"
+    PRG_DATE       = "20170307"
      WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
  
-    LOG_FILE_SUFFIX                    = '_MDSX.log'
      HELP_OPTION_1                      = 'help'
      HELP_OPTION_2                      = 'h'
      def run()
@@ -64,7 +63,7 @@ module Evoruby
        domain_id           = cla.get_file_name( 0 )
        hmmscan_output      = cla.get_file_name( 1 )
        fasta_sequence_file = ""
-      outfile             = ""
+      outfile_base        = ""
  
        if cla.get_number_of_files == 3
          fasta_sequence_file = cla.get_file_name( 2 )
@@ -90,41 +89,26 @@ module Evoruby
        end
        hmmscan_index = hmmscan_output.index(Constants::HMMSCAN)
        if hmmscan_index != nil
-        outfile = hmmscan_output.sub(Constants::HMMSCAN, '_')
+        outfile_base = hmmscan_output.sub(Constants::HMMSCAN, '_')
        else
          Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' )
        end
  
-      log = String.new
-      ld = Constants::LINE_DELIMITER
-
-      #      puts()
-      #      puts( "Domain                                                                             : " + domain_id )
-      #      log << "Domain                                                                             : " + domain_id + ld
-      #      puts( "Hmmscan outputfile                                                                 : " + hmmscan_output )
-      #      log << "Hmmscan outputfile                                                                 : " + hmmscan_output + ld
-      #      puts( "Fasta sequencefile (complete sequences)                                            : " + fasta_sequence_file )
-      #      log << "Fasta sequencefile (complete sequences)                                            : " + fasta_sequence_file + ld
-      #      puts( "Outputfile                                                                         : " + outfile + ".fasta" )
-      #      log << "Outputfile                                                                         : " + outfile + ld
-      #      puts( "Passed sequences outfile (fasta)                                                   : " + outfile + PASSED_SEQS_SUFFIX )
-      #      log << "Passed sequences outfile (fasta)                                                   : " + outfile + PASSED_SEQS_SUFFIX + ld
-      #      puts( "Failed sequences outfile (fasta)                                                   : " + outfile + FAILED_SEQS_SUFFIX )
-      #      log << "Failed sequences outfile (fasta)                                                   : " + outfile + FAILED_SEQS_SUFFIX + ld
-      #      puts( "Logfile                                                                            : " + outfile + LOG_FILE_SUFFIX )
-      #      log << "Logfile                                                                            : " + outfile + LOG_FILE_SUFFIX + ld
+      log_str = ''
  
-      puts
-      log <<  ld
+      log_str << PRG_NAME << Constants::LINE_DELIMITER
+      log_str <<  PRG_VERSION << Constants::LINE_DELIMITER
+      log_str << PRG_DESC << Constants::LINE_DELIMITER
+      log_str << PRG_DATE << Constants::LINE_DELIMITER
+      log_str << Constants::LINE_DELIMITER
  
-      domain_count = 0
        begin
          parser = HmmscanMultiDomainExtractor.new()
-        domain_count = parser.parse( domain_id,
+        parser.parse( domain_id,
          hmmscan_output,
          fasta_sequence_file,
-        outfile,
-        log )
+        outfile_base,
+        log_str)
        rescue ArgumentError, IOError => e
          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
  
@@ -134,42 +118,26 @@ module Evoruby
        end
  
        puts
-  
-      #  Util.print_message( PRG_NAME, "wrote: " + outfile + ".fasta")
-      #  Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX )
-      #  Util.print_message( PRG_NAME, "wrote: " + outfile + PASSED_SEQS_SUFFIX )
-      #  Util.print_message( PRG_NAME, "wrote: " + outfile + FAILED_SEQS_SUFFIX )
-
-      begin
-        f = File.open( outfile + LOG_FILE_SUFFIX, 'a' )
-        f.print( log )
-        f.close
-      rescue Exception => e
-        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
-      end
-
-      puts
        Util.print_message( PRG_NAME, "OK" )
        puts
  
      end
  
      def print_help()
-      puts()
-      puts( "Usage:" )
-      puts()
-      puts( "  " + PRG_NAME + ".rb <da> <hmmscan outputfile> [file containing complete sequences in fasta format]" )
-      puts()
-      puts( "  options: -"  )
-      puts()
-      puts( "Examples:" )
        puts
-      puts( "  " + PRG_NAME + ".rb " )
+      puts "Usage:"
+      puts
+      puts "  " + PRG_NAME + ".rb <da> <hmmscan outputfile> [file containing complete sequences in fasta format]"
+      puts
+      puts "  options: -"
+      puts
+      puts "Examples:"
+      puts
+      puts "  " + PRG_NAME + ".rb "
+      puts
        puts
-
-      puts()
      end
  
-  end # class DomainSequenceExtractor
+  end # class MultiDomainSeqExtractor
  
  end # module Evoruby
author	cmzmasek <chris.zma@outlook.com>
	Wed, 8 Mar 2017 02:55:59 +0000 (18:55 -0800)
committer	cmzmasek <chris.zma@outlook.com>
	Wed, 8 Mar 2017 02:55:59 +0000 (18:55 -0800)
forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb		patch \| blob \| history
forester/ruby/evoruby/lib/evo/msa/msa.rb		patch \| blob \| history
forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb		patch \| blob \| history