in progress...
authorcmzmasek <chris.zma@outlook.com>
Wed, 8 Mar 2017 02:55:59 +0000 (18:55 -0800)
committercmzmasek <chris.zma@outlook.com>
Wed, 8 Mar 2017 02:55:59 +0000 (18:55 -0800)
forester/ruby/evoruby/lib/evo/io/parser/hmmscan_multi_domain_extractor.rb
forester/ruby/evoruby/lib/evo/msa/msa.rb
forester/ruby/evoruby/lib/evo/tool/multi_domain_seq_extractor.rb

index 33e48a4..35b32df 100644 (file)
@@ -16,14 +16,16 @@ require 'lib/evo/io/parser/hmmscan_parser'
 module Evoruby
   class HmmscanMultiDomainExtractor
 
+    TESTING = false
     OUTPUT_ID = 'mdsx'
     DOMAIN_DELIMITER = ' -- '
 
     PASSING_FL_SEQS_SUFFIX    = "_#{OUTPUT_ID}_passing_full_length_seqs.fasta"
     FAILING_FL_SEQS_SUFFIX    = "_#{OUTPUT_ID}_failing_full_length_seqs.fasta"
     TARGET_DA_SUFFIX          = "_#{OUTPUT_ID}_target_da.fasta"
-    CONCAT_TARGET_DOM_SUFFIX  = "_#{OUTPUT_ID}_concat_target_dom.fasta"
+    CONCAT_TARGET_DOM_SUFFIX  = "_#{OUTPUT_ID}_concat_target_doms.fasta"
     TARGET_DOM_OUTPUT_MIDPART = "_#{OUTPUT_ID}_target_dom_"
+    LOG_FILE_SUFFIX           = "_#{OUTPUT_ID}.log"
     def initialize
       @passing_domains_data = nil
       @failing_domains_data = nil
@@ -31,6 +33,7 @@ module Evoruby
       @passsing_domain_architectures = nil
       @failing_proteins_bc_not_all_target_doms_present = nil
       @failing_proteins_bc_missing_cutoffs = nil
+      @log = nil
     end
 
     # raises ArgumentError, IOError, StandardError
@@ -44,6 +47,7 @@ module Evoruby
       failing_fl_seqs_outfile   = outfile_base + FAILING_FL_SEQS_SUFFIX
       target_da_outfile         = outfile_base + TARGET_DA_SUFFIX
       concat_target_dom_outfile = outfile_base + CONCAT_TARGET_DOM_SUFFIX
+      logfile                   = outfile_base + LOG_FILE_SUFFIX
 
       Util.check_file_for_readability( hmmscan_output )
       Util.check_file_for_readability( fasta_sequence_file )
@@ -51,6 +55,9 @@ module Evoruby
       Util.check_file_for_writability( failing_fl_seqs_outfile )
       Util.check_file_for_writability( target_da_outfile )
       Util.check_file_for_writability( concat_target_dom_outfile )
+      Util.check_file_for_writability( logfile )
+
+      @log = log_str
 
       in_msa = nil
       factory = MsaFactory.new()
@@ -64,14 +71,11 @@ module Evoruby
       failed_seqs_msa = Msa.new
       passed_seqs_msa = Msa.new
 
-      ld = Constants::LINE_DELIMITER
-
       hmmscan_parser = HmmscanParser.new( hmmscan_output )
       results = hmmscan_parser.parse
 
       ####
       # Import: if multiple copies of same domain, thresholds need to be same!
-      target_domain_ary = Array.new
 
       #target_domain_ary.push(TargetDomain.new('DNA_pol_B_exo1', 1e-6, -1, 0.6, 0 ))
       #target_domain_ary.push(TargetDomain.new('DNA_pol_B', 1e-6, -1, 0.6, 1 ))
@@ -81,9 +85,11 @@ module Evoruby
       # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 ))
       # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 ))
 
-      target_domain_ary.push(TargetDomain.new('BH4', 1, -1, 0.2, 0 ))
-      target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 1 ))
-      target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 2 ))
+      target_domain_ary = parse_da target_da
+
+      # target_domain_ary.push(TargetDomain.new('BH4', 1, -1, 0.2, 0 ))
+      # target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 1 ))
+      # target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 2 ))
       # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 ))
 
       #  target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 ))
@@ -105,7 +111,12 @@ module Evoruby
 
       target_domain_architecure.freeze
 
-      puts 'Target domain architecture: ' + target_domain_architecure
+      log 'Hmmscan outputfile             : ' + hmmscan_output
+      log 'Full length fasta sequence file: ' + fasta_sequence_file
+      log 'Target domain architecture     : ' + target_domain_architecure
+      target_domain_ary.each do |x|
+        log x.to_str
+      end
 
       target_domain_names = Set.new
 
@@ -128,7 +139,7 @@ module Evoruby
       out_concatenated_domains_msa = Msa.new
       results.each do |hmmscan_result|
         if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name )
-          if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
+          if compare(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
             passing_sequences.push(domains_in_query_seq)
           else
             failing_sequences.push(domains_in_query_seq)
@@ -141,44 +152,46 @@ module Evoruby
       end # each result
 
       if prev_query_seq_name != nil
-        total_sequences += 1
-        if checkit2(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
+        if compare(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
           passing_sequences.push(domains_in_query_seq)
         else
           failing_sequences.push(domains_in_query_seq)
         end
+        total_sequences += 1
       end
 
-      puts
-
-      puts 'Failing domain architectures containing one or more target domain(s):'
-      @failing_domain_architectures = @failing_domain_architectures .sort{|a, b|a<=>b}.to_h
-      failing_da_sum = 0
-      @failing_domain_architectures .each do |da, count|
-        failing_da_sum += count
-        puts count.to_s.rjust(4) + ': ' + da
+      if in_msa.get_number_of_seqs < total_sequences
+        error_msg = "hmmscan output contains more protein sequences than fasta sequence file"
+        raise IOError, error_msg
       end
 
-      puts
-
-      puts 'Passing domain architectures containing target domain(s):'
+      log
+      log 'Passing domain architectures containing target domain(s):'
       @passsing_domain_architectures = @passsing_domain_architectures.sort{|a, b|a<=>b}.to_h
       passing_da_sum = 0
       @passsing_domain_architectures.each do |da, count|
         passing_da_sum += count
-        puts count.to_s.rjust(4) + ': ' + da
+        log count.to_s.rjust(4) + ': ' + da
       end
-
-      puts 'Passing domain(s):'
+      log
+      log 'Failing domain architectures containing one or more target domain(s):'
+      @failing_domain_architectures = @failing_domain_architectures .sort{|a, b|a<=>b}.to_h
+      failing_da_sum = 0
+      @failing_domain_architectures .each do |da, count|
+        failing_da_sum += count
+        log count.to_s.rjust(4) + ': ' + da
+      end
+      log
+      log 'Passing target domain(s):'
       @passing_domains_data = @passing_domains_data.sort{|a, b|a<=>b}.to_h
       @passing_domains_data.each do |n, d|
-        puts d.to_str
+        log d.to_str
       end
-
-      puts 'Failing domain(s):'
+      log
+      log'Failing target domain(s):'
       @failing_domains_data = @failing_domains_data.sort{|a, b|a<=>b}.to_h
       @failing_domains_data.each do |n, d|
-        puts d.to_str
+        log d.to_str
       end
 
       unless total_sequences == (passing_sequences.size + failing_sequences.size)
@@ -201,60 +214,60 @@ module Evoruby
         raise StandardError, error_msg
       end
 
-      puts
-
-      puts "Protein sequences in sequence (fasta) file: " + in_msa.get_number_of_seqs.to_s.rjust(5)
-      puts "Protein sequences in hmmscan output file  : " + total_sequences.to_s.rjust(5)
-      puts "  Passing protein sequences               : " + passing_sequences.size.to_s.rjust(5)
-      puts "  Failing protein sequences               : " + failing_sequences.size.to_s.rjust(5)
-      puts "    Not all target domain present         : " + @failing_proteins_bc_not_all_target_doms_present.to_s.rjust(5)
-      puts "    Target domain(s) failing cutoffs      : " + @failing_proteins_bc_missing_cutoffs.to_s.rjust(5)
-
-      puts
+      log
+      log "Protein sequences in sequence (fasta) file: " + in_msa.get_number_of_seqs.to_s.rjust(5)
+      log "Protein sequences in hmmscan output file  : " + total_sequences.to_s.rjust(5)
+      log "  Passing protein sequences               : " + passing_sequences.size.to_s.rjust(5)
+      log "  Failing protein sequences               : " + failing_sequences.size.to_s.rjust(5)
+      log "    Not all target domain present         : " + @failing_proteins_bc_not_all_target_doms_present.to_s.rjust(5)
+      log "    Target domain(s) failing cutoffs      : " + @failing_proteins_bc_missing_cutoffs.to_s.rjust(5)
+      log
 
       out_domain_msas.keys.sort.each do |domain_name|
         file_name = outfile_base + TARGET_DOM_OUTPUT_MIDPART + domain_name + '.fasta'
-
-        write_msa( out_domain_msas[domain_name], file_name )
-        puts "wrote #{domain_name}"
+        write_msa out_domain_msas[domain_name], file_name
+        log "Wrote passing target domain sequence for " +  domain_name.ljust(16) + ': ' + file_name
       end
 
-      write_msa( out_domain_architecture_msa, target_da_outfile )
-      puts "wrote target_domain_architecure"
+      write_msa out_domain_architecture_msa, target_da_outfile
+      log 'Wrote target domain architecture                         : ' + target_da_outfile
 
-      write_msa( out_concatenated_domains_msa, concat_target_dom_outfile )
-      puts "wrote concatenated_domains_msa"
+      write_msa out_concatenated_domains_msa, concat_target_dom_outfile
+      log 'Wrote concatenated target domain(s)                      : ' + concat_target_dom_outfile
 
       passing_sequences.each do | domains |
         query_name = domains[0].query
-        if passed_seqs_msa.find_by_name_start( query_name, true ).length < 1
+        if (!TESTING) || (passed_seqs_msa.find_by_name_start( query_name, true ).length < 1)
           add_sequence( query_name, in_msa, passed_seqs_msa )
         else
-          error_msg = "this should not have happened"
+          error_msg = 'this should not have happened'
           raise StandardError, error_msg
         end
       end
 
       failing_sequences.each do | domains |
         query_name = domains[0].query
-        if failed_seqs_msa.find_by_name_start( query_name, true ).length < 1
+        if (!TESTING) || (failed_seqs_msa.find_by_name_start( query_name, true ).length < 1)
           add_sequence( query_name, in_msa, failed_seqs_msa )
         else
-          error_msg = "this should not have happened"
+          error_msg = 'this should not have happened'
           raise StandardError, error_msg
         end
       end
 
-      write_msa( passed_seqs_msa, passing_fl_seqs_outfile )
-      puts "wrote ..."
-      write_msa( failed_seqs_msa, failing_fl_seqs_outfile )
-      puts "wrote ..."
-
-      log_str << ld
+      write_msa passed_seqs_msa, passing_fl_seqs_outfile
+      log 'Wrote passing full length protein sequences              : ' + passing_fl_seqs_outfile
+      write_msa failed_seqs_msa, failing_fl_seqs_outfile
+      log 'Wrote failing full length protein sequences              : ' + failing_fl_seqs_outfile
 
-      log_str << "proteins in sequence (fasta) file            : " + in_msa.get_number_of_seqs.to_s + ld
-
-      log_str << ld
+      begin
+        f = File.open( logfile, 'w' )
+        f.print( @log )
+        f.close
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+      log 'Wrote log file                                           : ' + logfile
 
     end # parse
 
@@ -264,7 +277,7 @@ module Evoruby
     # target_domain_names: Set of String
     # target_domains: Hash String->TargetDomain
     # target_domain_architecture: String
-    def checkit2(domains_in_query_seq,
+    def compare(domains_in_query_seq,
       target_domain_names,
       target_domains,
       in_msa,
@@ -442,7 +455,7 @@ module Evoruby
       w = FastaWriter.new()
       w.set_line_width( 60 )
       w.clean( true )
-      File.delete(filename) if File.exist?(filename) #TODO remove me
+      File.delete(filename) if File.exist?(filename)
       begin
         io.write_to_file( msa, filename, w )
       rescue Exception
@@ -505,13 +518,42 @@ module Evoruby
       seq.get_sequence_as_string
     end
 
+    def parse_da( target_da_str )
+      target_domain_hash = Hash.new
+      target_domain_ary = Array.new
+      target_das = target_da_str.split '--'
+      target_das.each do |x|
+        inds = x.split '='
+        unless inds.size == 4
+          raise IOError, 'domain architecture is ill formatted: ' + x
+        end
+        target_domain_name = inds[0]
+        ie_cutoff = inds[1].to_f
+        abs_len_cutoff = inds[2].to_i
+        rel_len_cutoff = inds[3].to_f
+        if target_domain_hash.has_key? target_domain_name
+          target_domain_ary.push target_domain_hash[target_domain_name]
+        else
+          td = TargetDomain.new(target_domain_name, ie_cutoff, abs_len_cutoff, rel_len_cutoff)
+          target_domain_hash[target_domain_name] = td
+          target_domain_ary.push td
+        end
+      end
+      target_domain_ary
+    end
+
+    def log(str = '')
+      puts str
+      @log << str << Constants::LINE_DELIMITER
+    end
+
   end # class HmmscanMultiDomainExtractor
 
   class DomainData
     def initialize( name )
       if (name == nil) || name.size < 1
         error_msg = "domain name nil or empty"
-        raise StandardError, error_msg
+        raise IOError, error_msg
       end
       @name = name
       @count = 0
@@ -526,16 +568,16 @@ module Evoruby
     def add( name, length, i_e_value)
       if name != @name
         error_msg = "domain names do not match"
-        raise StandardError, error_msg
+        raise IOError, error_msg
       end
 
       if length < 0
         error_msg = "length cannot me negative"
-        raise StandardError, error_msg
+        raise IOError, error_msg
       end
       if i_e_value < 0
         error_msg = "iE-value cannot me negative"
-        raise StandardError, error_msg
+        raise IOError, error_msg
       end
       @count += 1
       @i_e_value_sum += i_e_value
@@ -570,7 +612,7 @@ module Evoruby
 
     def to_str
       s = ''
-      s << @name.rjust(24) + ': '
+      s << @name.rjust(16) + ': '
       s << @count.to_s.rjust(4) + '  '
       s << avg_length.round(1).to_s.rjust(6) + ' '
       s << @len_min.to_s.rjust(4) + ' -'
@@ -586,22 +628,41 @@ module Evoruby
   end
 
   class TargetDomain
-    def initialize( name, i_e_value, abs_len, rel_len, position )
+    def initialize(name, i_e_value, abs_len, rel_len)
       if (name == nil) || name.size < 1
         error_msg = "target domain name nil or empty"
-        raise StandardError, error_msg
+        raise IOError, error_msg
       end
       if rel_len > 1
-        error_msg = "target domain relative length is greater than 1"
-        raise StandardError, error_msg
+        error_msg = name + ": target domain relative length is greater than 1"
+        raise IOError, error_msg
+      end
+      if (abs_len <= 0) && (rel_len <= 0)
+        error_msg = name + ": need to have either absolute length or relative length cutoff"
+        raise IOError, error_msg
+      end
+      if (abs_len > 0) && (rel_len > 0)
+        error_msg = name + ": cannot have both absolute length and relative length cutoff"
+        raise IOError, error_msg
       end
       @name = name
       @i_e_value = i_e_value
       @abs_len = abs_len
       @rel_len = rel_len
-      @position = position
     end
-    attr_reader :name, :i_e_value, :abs_len, :rel_len, :position
+
+    def to_str
+      s = @name.rjust(16) + ':'
+      s << ' iE-cutoff: ' + ("%.2E" % @i_e_value).rjust(9)
+      if @abs_len > 0
+        s << ', abs len-cutoff: ' + @abs_len.to_s.rjust(4)
+      end
+      if @rel_len > 0
+        s << ', rel len-cutoff: ' + @rel_len.to_s.rjust(4)
+      end
+      s
+    end
+    attr_reader :name, :i_e_value, :abs_len, :rel_len
   end
 
 end # module Evoruby
index 549ae2c..0dcad71 100644 (file)
@@ -6,15 +6,12 @@
 #
 # Last modified: 2017/02/07
 
-
 require 'lib/evo/util/constants'
 require 'lib/evo/util/util'
 require 'lib/evo/sequence/sequence'
 
 module Evoruby
-
   class Msa
-
     def initialize()
       @sequences = Array.new
       @identical_seqs_detected = Array.new
@@ -22,7 +19,6 @@ module Evoruby
       @namestart_to_seq_indices = Hash.new
     end
 
-
     def add_sequence( sequence )
       @sequences.push( sequence )
     end
@@ -34,8 +30,8 @@ module Evoruby
     def get_sequence( index )
       if ( index < 0 || index > get_number_of_seqs() - 1 )
         error_msg = "attempt to get sequence " <<
-         index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
-         " sequences"
+        index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
+        " sequences"
         raise ArgumentError, error_msg
       end
       return @sequences[ index ]
@@ -44,8 +40,8 @@ module Evoruby
     def remove_sequence!( index )
       if ( index < 0 || index > get_number_of_seqs() - 1 )
         error_msg = "attempt to remove sequence " <<
-         index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
-         " sequences"
+        index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
+        " sequences"
         raise ArgumentError, error_msg
       end
       @name_to_seq_indices.clear
@@ -57,7 +53,6 @@ module Evoruby
       @identical_seqs_detected
     end
 
-
     def is_aligned()
       if ( get_number_of_seqs < 1 )
         return false
@@ -90,7 +85,7 @@ module Evoruby
           name = name.downcase
         end
         if current_name == name ||
-           ( partial_match && current_name.include?( name ) )
+        ( partial_match && current_name.include?( name ) )
           indices.push( i )
         end
       end
@@ -127,26 +122,26 @@ module Evoruby
       get_sequence( indices[ 0 ] )
     end
 
-    def find_by_name_start( name, case_sensitive )
+    def find_by_name_start(name, case_sensitive = true)
       if case_sensitive && @namestart_to_seq_indices.has_key?( name )
         return @namestart_to_seq_indices[ name ]
       end
       indices = []
-      for i in 0 ... get_number_of_seqs()
-        get_sequence( i ).get_name() =~ /^\s*(\S+)/
+      for i in 0 ... get_number_of_seqs
+        get_sequence(i).get_name =~ /^\s*(\S+)/
         current_name = $1
         if case_sensitive
-          if !@namestart_to_seq_indices.has_key?( current_name )
-            @namestart_to_seq_indices[ current_name ] = []
+          unless @namestart_to_seq_indices.has_key?(current_name)
+            @namestart_to_seq_indices[current_name] = []
           end
-          @namestart_to_seq_indices[ current_name ].push( i )
+          @namestart_to_seq_indices[current_name].push( i )
         end
         if !case_sensitive
           current_name = current_name.downcase
           name = name.downcase
         end
-        if  current_name == name
-          indices.push( i )
+        if current_name == name
+          indices.push(i)
         end
       end
       indices
@@ -160,7 +155,7 @@ module Evoruby
           name = name.downcase
         end
         if current_name == name ||
-           ( partial_match && current_name.include?( name ) )
+        ( partial_match && current_name.include?( name ) )
           return true
         end
       end
@@ -193,7 +188,6 @@ module Evoruby
       get_sequence( indices[ 0 ] )
     end
 
-
     def get_sub_alignment( seq_numbers )
       msa = Msa.new()
       for i in 0 ... seq_numbers.length()
@@ -230,7 +224,6 @@ module Evoruby
       s
     end
 
-
     def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 )
       if ( !is_aligned() )
         error_msg = "attempt to get overlap diagram of unaligned msa"
@@ -273,7 +266,7 @@ module Evoruby
       overlap_count = 0
       for i in 0...seq_1.get_length()
         if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
-           !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
+        !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
           overlap_count += 1
           if overlap_count >= min_overlap
             return true
@@ -289,7 +282,7 @@ module Evoruby
       overlap_count = 0
       for i in 0...seq_1.get_length
         if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
-           !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
+        !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
           overlap_count += 1
         end
       end
@@ -302,10 +295,10 @@ module Evoruby
       identities_count = 0
       for i in 0...seq_1.get_length
         if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
-           !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) &&
-           seq_1.get_character_code( i ) != 63 &&
-           ( seq_1.get_residue( i ).downcase() ==
-             seq_2.get_residue( i ).downcase() )
+        !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) &&
+        seq_1.get_character_code( i ) != 63 &&
+        ( seq_1.get_residue( i ).downcase() ==
+        seq_2.get_residue( i ).downcase() )
           identities_count += 1
         end
       end
@@ -325,7 +318,6 @@ module Evoruby
       remove_columns!( get_gap_columns_w_gap_ratio( gap_ratio ) )
     end
 
-
     def remove_sequences_by_gap_ratio!( gap_ratio )
       if ( !is_aligned() )
         error_msg = "attempt to remove sequences by gap ratio on unaligned msa"
@@ -345,7 +337,6 @@ module Evoruby
       removed
     end
 
-
     def remove_redundant_sequences!( consider_taxonomy = false, verbose = false )
       n = get_number_of_seqs
       removed = Array.new
@@ -355,10 +346,10 @@ module Evoruby
         for j in ( i + 1 ) ... n
           if !to_be_removed.include?( i ) && !to_be_removed.include?( j )
             if  !consider_taxonomy ||
-               ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) ||
-                 ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) )
+            ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) ||
+            ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) )
               if Util.clean_seq_str( get_sequence( i ).get_sequence_as_string ) ==
-                 Util.clean_seq_str( get_sequence( j ).get_sequence_as_string )
+              Util.clean_seq_str( get_sequence( j ).get_sequence_as_string )
                 to_be_removed.add( j )
 
                 tax_i = ""
@@ -389,7 +380,6 @@ module Evoruby
       removed
     end
 
-
     def remove_sequences_by_non_gap_length!( min_non_gap_length )
       n = get_number_of_seqs
       removed = Array.new
@@ -504,7 +494,6 @@ module Evoruby
       return cols
     end
 
-
     # Split an alignment into n alignemnts of equal size, except last one
     def split( n, verbose = false )
       if ( n < 2 || n > get_number_of_seqs )
@@ -538,7 +527,6 @@ module Evoruby
       msas
     end
 
-
     private
 
     def get_overlaps( min_overlap = 1 )
@@ -588,7 +576,6 @@ module Evoruby
       return self
     end
 
-
   end # class Msa
 
 end # module Evoruby
index c1a35ab..762556b 100644 (file)
@@ -15,10 +15,9 @@ module Evoruby
     PRG_NAME       = "mdsx"
     PRG_VERSION    = "1.000"
     PRG_DESC       = "Extraction of multi domain sequences from hmmscan output"
-    PRG_DATE       = "20170220"
+    PRG_DATE       = "20170307"
     WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
 
-    LOG_FILE_SUFFIX                    = '_MDSX.log'
     HELP_OPTION_1                      = 'help'
     HELP_OPTION_2                      = 'h'
     def run()
@@ -64,7 +63,7 @@ module Evoruby
       domain_id           = cla.get_file_name( 0 )
       hmmscan_output      = cla.get_file_name( 1 )
       fasta_sequence_file = ""
-      outfile             = ""
+      outfile_base        = ""
 
       if cla.get_number_of_files == 3
         fasta_sequence_file = cla.get_file_name( 2 )
@@ -90,41 +89,26 @@ module Evoruby
       end
       hmmscan_index = hmmscan_output.index(Constants::HMMSCAN)
       if hmmscan_index != nil
-        outfile = hmmscan_output.sub(Constants::HMMSCAN, '_')
+        outfile_base = hmmscan_output.sub(Constants::HMMSCAN, '_')
       else
         Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' )
       end
 
-      log = String.new
-      ld = Constants::LINE_DELIMITER
-
-      #      puts()
-      #      puts( "Domain                                                                             : " + domain_id )
-      #      log << "Domain                                                                             : " + domain_id + ld
-      #      puts( "Hmmscan outputfile                                                                 : " + hmmscan_output )
-      #      log << "Hmmscan outputfile                                                                 : " + hmmscan_output + ld
-      #      puts( "Fasta sequencefile (complete sequences)                                            : " + fasta_sequence_file )
-      #      log << "Fasta sequencefile (complete sequences)                                            : " + fasta_sequence_file + ld
-      #      puts( "Outputfile                                                                         : " + outfile + ".fasta" )
-      #      log << "Outputfile                                                                         : " + outfile + ld
-      #      puts( "Passed sequences outfile (fasta)                                                   : " + outfile + PASSED_SEQS_SUFFIX )
-      #      log << "Passed sequences outfile (fasta)                                                   : " + outfile + PASSED_SEQS_SUFFIX + ld
-      #      puts( "Failed sequences outfile (fasta)                                                   : " + outfile + FAILED_SEQS_SUFFIX )
-      #      log << "Failed sequences outfile (fasta)                                                   : " + outfile + FAILED_SEQS_SUFFIX + ld
-      #      puts( "Logfile                                                                            : " + outfile + LOG_FILE_SUFFIX )
-      #      log << "Logfile                                                                            : " + outfile + LOG_FILE_SUFFIX + ld
+      log_str = ''
 
-      puts
-      log <<  ld
+      log_str << PRG_NAME << Constants::LINE_DELIMITER
+      log_str <<  PRG_VERSION << Constants::LINE_DELIMITER
+      log_str << PRG_DESC << Constants::LINE_DELIMITER
+      log_str << PRG_DATE << Constants::LINE_DELIMITER
+      log_str << Constants::LINE_DELIMITER
 
-      domain_count = 0
       begin
         parser = HmmscanMultiDomainExtractor.new()
-        domain_count = parser.parse( domain_id,
+        parser.parse( domain_id,
         hmmscan_output,
         fasta_sequence_file,
-        outfile,
-        log )
+        outfile_base,
+        log_str)
       rescue ArgumentError, IOError => e
         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 
@@ -134,42 +118,26 @@ module Evoruby
       end
 
       puts
-  
-      #  Util.print_message( PRG_NAME, "wrote: " + outfile + ".fasta")
-      #  Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX )
-      #  Util.print_message( PRG_NAME, "wrote: " + outfile + PASSED_SEQS_SUFFIX )
-      #  Util.print_message( PRG_NAME, "wrote: " + outfile + FAILED_SEQS_SUFFIX )
-
-      begin
-        f = File.open( outfile + LOG_FILE_SUFFIX, 'a' )
-        f.print( log )
-        f.close
-      rescue Exception => e
-        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
-      end
-
-      puts
       Util.print_message( PRG_NAME, "OK" )
       puts
 
     end
 
     def print_help()
-      puts()
-      puts( "Usage:" )
-      puts()
-      puts( "  " + PRG_NAME + ".rb <da> <hmmscan outputfile> [file containing complete sequences in fasta format]" )
-      puts()
-      puts( "  options: -"  )
-      puts()
-      puts( "Examples:" )
       puts
-      puts( "  " + PRG_NAME + ".rb " )
+      puts "Usage:"
+      puts
+      puts "  " + PRG_NAME + ".rb <da> <hmmscan outputfile> [file containing complete sequences in fasta format]"
+      puts
+      puts "  options: -"
+      puts
+      puts "Examples:"
+      puts
+      puts "  " + PRG_NAME + ".rb "
+      puts
       puts
-
-      puts()
     end
 
-  end # class DomainSequenceExtractor
+  end # class MultiDomainSeqExtractor
 
 end # module Evoruby