inprogress
authorcmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Tue, 19 Mar 2013 21:09:48 +0000 (21:09 +0000)
committercmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Tue, 19 Mar 2013 21:09:48 +0000 (21:09 +0000)
forester/ruby/evoruby/lib/evo/msa/msa.rb
forester/ruby/evoruby/lib/evo/sequence/sequence.rb
forester/ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb
forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb

index f338093..1681827 100644 (file)
@@ -85,8 +85,31 @@ module Evoruby
       indices
     end
 
+    def find_by_name_pattern( name_re )
+      indices = []
+      for i in 0 ... get_number_of_seqs()
+        if name_re.match( get_sequence( i ).get_name() )
+          indices.push( i )
+        end
+      end
+      indices
+    end
+
+    # throws ArgumentError
+    def get_by_name_pattern( name_re )
+      indices = find_by_name_pattern( name_re )
+      if ( indices.length > 1 )
+        error_msg = "pattern \"" + name_re.to_s + "\" not unique"
+        raise ArgumentError, error_msg
+      elsif ( indices.length < 1 )
+        error_msg = "pattern \"" + name_re.to_s + "\" not found"
+        raise ArgumentError, error_msg
+      end
+      get_sequence( indices[ 0 ] )
+    end
+
     def find_by_name_start( name, case_sensitive )
-      indices = Array.new()
+      indices = []
       for i in 0 ... get_number_of_seqs()
         get_sequence( i ).get_name() =~ /^\s*(\S+)/
         current_name = $1
@@ -168,7 +191,7 @@ module Evoruby
     end
 
     def to_str
-      to_fasta 
+      to_fasta
     end
 
     def to_fasta
@@ -178,8 +201,8 @@ module Evoruby
       end
       s
     end
-    
-    
+
+
     def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 )
       if ( !is_aligned() )
         error_msg = "attempt to get overlap diagram of unaligned msa"
@@ -328,7 +351,7 @@ module Evoruby
           end
         end
       end
-    
+
       to_be_removed_ary = to_be_removed.to_a.sort.reverse
 
       to_be_removed_ary.each { | index |
index 09f8428..a5c2a98 100644 (file)
@@ -159,11 +159,11 @@ module Evoruby
         def to_str
             return "[" + @name + "] " + @molecular_sequence
         end
-        
+
         def to_fasta
             return ">" + @name + Constants::LINE_DELIMITER  + @molecular_sequence
         end
-        
+
 
     end # class Sequence
 
index 08d74c1..3b48317 100644 (file)
@@ -23,21 +23,18 @@ module Evoruby
 
   class HmmscanAnalysis
 
-    PRG_NAME       = "hsp"
-    PRG_VERSION    = "2.001"
+    PRG_NAME       = "hsa"
+    PRG_VERSION    = "1.000"
     PRG_DESC       = "hmmscan summary"
-    PRG_DATE       = "2013.10.23"
+    PRG_DATE       = "130319"
     COPYRIGHT      = "2013 Christian M Zmasek"
     CONTACT        = "phyloxml@gmail.com"
     WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
 
-    DELIMITER_OPTION              = "d"
     SPECIES_OPTION                = "s"
     I_E_VALUE_THRESHOLD_OPTION    = "ie"
     FS_E_VALUE_THRESHOLD_OPTION   = "pe"
     HMM_FOR_PROTEIN_OUTPUT        = "m"
-    IGNORE_DUF_OPTION             = "i"
-    PARSE_OUT_DESCRIPITION_OPTION = "a"
     HELP_OPTION_1                 = "help"
     HELP_OPTION_2                 = "h"
 
@@ -78,11 +75,8 @@ module Evoruby
       end
 
       allowed_opts = Array.new
-      allowed_opts.push( DELIMITER_OPTION )
       allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
       allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
-      allowed_opts.push( IGNORE_DUF_OPTION )
-      allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
       allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
       allowed_opts.push( SPECIES_OPTION )
 
@@ -105,15 +99,6 @@ module Evoruby
         msa = read_fasta_file(seq_file_path )
       end
 
-      column_delimiter = "\t"
-      if ( cla.is_option_set?( DELIMITER_OPTION ) )
-        begin
-          column_delimiter = cla.get_option_value( DELIMITER_OPTION )
-        rescue ArgumentError => e
-          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
-        end
-      end
-
       i_e_value_threshold = -1
       if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
         begin
@@ -142,7 +127,7 @@ module Evoruby
       if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
         begin
           hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
-          hmm_for_protein_outputs = hmm_for_protein_output.split( "~" );
+          hmm_for_protein_outputs = hmm_for_protein_output.split( "/" );
         rescue ArgumentError => e
           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
         end
@@ -157,27 +142,13 @@ module Evoruby
         end
       end
 
-      ignore_dufs = false
-      if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
-        ignore_dufs = true
-      end
-
-      parse_descriptions = false
-      if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
-        parse_descriptions = true
-      end
-
-
       begin
         parse( inpath,
-          column_delimiter,
           i_e_value_threshold,
-          ignore_dufs,
-          parse_descriptions,
           fs_e_value_threshold,
           hmm_for_protein_outputs,
           species,
-          msa        )
+          msa )
       rescue IOError => e
         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
       end
@@ -200,10 +171,7 @@ module Evoruby
 
     # raises ArgumentError, IOError
     def parse( inpath,
-        column_delimiter,
         i_e_value_threshold,
-        ignore_dufs,
-        get_descriptions,
         fs_e_value_threshold,
         hmm_for_protein_outputs,
         species,
@@ -214,25 +182,14 @@ module Evoruby
       hmmscan_parser = HmmscanParser.new( inpath )
       results = hmmscan_parser.parse
 
-
-      query     = ""
-      desc      = ""
-      model     = ""
-      env_from  = ""
-      env_to    = ""
-      i_e_value = ""
-
       hmmscan_results_per_protein = []
 
+      query     = ""
       prev_query = ""
 
       results.each do | r |
-        model     = r.model
-        query     = r.query
-        i_e_value = r.i_e_value
-        env_from  = r.env_from
-        env_to    = r.env_to
 
+        query     = r.query
 
         if !prev_query.empty? && prev_query != query
           if !hmmscan_results_per_protein.empty?
@@ -241,7 +198,7 @@ module Evoruby
               hmm_for_protein_outputs,
               i_e_value_threshold,
               species,
-              msa            )
+              msa )
           end
           hmmscan_results_per_protein.clear
         end
@@ -263,7 +220,7 @@ module Evoruby
           hmm_for_protein_outputs,
           i_e_value_threshold,
           species,
-          msa        )
+          msa )
       end
 
 
@@ -339,52 +296,46 @@ module Evoruby
         end
       end
 
-      s = ""
+
       query = nil
+      qlen = nil
       owns.each do | own |
-        s << own.query + "\t"
-        query = own.query
+
+        if query == nil
+          query = own.query
+
+          qlen = own.qlen
+        else
+          # sanity checks
+          raise StandardError, "failed sanity check" if query != own.query || qlen != own.qlen
+          raise StandardError, "failed sanity check: qlen != own.qlen" if qlen != own.qlen
+        end
       end
+
+      s = query + "\t"
       s << species + "\t"
       owns.each do | own |
         s << own.fs_e_value.to_s + "\t"
       end
-      owns.each do | own |
-        s << own.qlen.to_s + "\t" #TODO !
-      end
 
-      #   dcs.each do | dc |
-      #     s << dc.to_s + "\t"
-      #   end
+      s << qlen.to_s + "\t"
+
       s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
       hmmscan_results_per_protein_filtered.each do | r |
         s << r.model + " "
       end
       s << "\t"
-
-
-
-      # overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
-
-      #   s << overview  + "\t"
-
-      #  s << calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )  + "\t"
-
+      s <<  make_overview_da( hmmscan_results_per_protein_filtered )
+      s << "\t"
       prev_r = nil
       hmmscan_results_per_protein_filtered.each do | r |
-
         if  prev_r != nil
           s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
-
           if ( target_hmms.length == 2 && prev_r.model == target_hmms[ 0 ] && r.model == target_hmms[ 1 ] )
-            puts "xxx"
-            linker( prev_r.env_to, r.env_from, query, msa )
+            extract_linker( prev_r.env_to, r.env_from, query, msa )
           end
-
-
         else
           s << make_interdomain_sequence( r.env_from, false )
-
         end
         s << r.model
         s << "["
@@ -393,58 +344,90 @@ module Evoruby
         s << "]"
         prev_r = r
       end
-      # s << make_interdomain_sequence( own.qlen - prev_r.env_from, false )
+      s << make_interdomain_sequence( qlen - prev_r.env_to, false )
+
       puts s
     end
 
+    def extract_linkers( hmmscan_results_per_protein_filtered, target_hmms, query, msa  )
+      prev_r = nil
+      hmmscan_results_per_protein_filtered.each do | r |
+        if  prev_r != nil
+          if ( target_hmms.length == 2 && prev_r.model == target_hmms[ 0 ] && r.model == target_hmms[ 1 ] )
+            extract_linker( prev_r.env_to, r.env_from, query, msa )
+          end
+        end
+        prev_r = r
+      end
+    end
 
-    def linker( first, last , query , msa)
-      puts first.to_s + "-" + last.to_s
+    def extract_linker( first, last , query , msa)
       if ( last - first >= 1 )
-        seq = msa.get_by_name( query, true, false )
-        linker = seq.get_subsequence( first -1 , last - 1 )
-        puts linker.get_sequence_as_string
+        seq = msa.get_by_name_pattern( /\b#{Regexp.quote(query)}\b/ )
+        linker = seq.get_subsequence( first - 1 , last - 1 )
+        linker.get_sequence_as_string
       end
-
     end
 
+    def make_detailed_da( hmmscan_results_per_protein_filtered  )
+      s = ""
+      prev_r = nil
+      hmmscan_results_per_protein_filtered.each do | r |
+        if  prev_r != nil
+          s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
+        else
+          s << make_interdomain_sequence( r.env_from, false )
+        end
+        s << r.model
+        s << "["
+        s << r.env_from.to_s << "-" << r.env_to.to_s
+        s << " " << r.i_e_value.to_s
+        s << "]"
+        prev_r = r
+      end
+      s << make_interdomain_sequence( qlen - prev_r.env_to, false )
+      s
+    end
 
-    def calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )
-      linkers = ""
+    def make_overview_da( hmmscan_results_per_protein_filtered )
+      overview = ""
       prev_r = nil
       hmmscan_results_per_protein_filtered.each do | r |
-        if r.model == hmm_for_protein_output
-          if  prev_r != nil
-            linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
+
+        if prev_r == nil
+          overview << r.model
+        else
+          if  ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
+            overview << "~" << r.model
+          else
+            overview << "----" << r.model
           end
-          prev_r = r
         end
+        prev_r = r
+
       end
-      linkers
+      overview
     end
 
 
 
-    def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
-      overview = ""
+
+
+    def calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )
+      linkers = ""
       prev_r = nil
       hmmscan_results_per_protein_filtered.each do | r |
         if r.model == hmm_for_protein_output
-          if prev_r == nil
-            overview << hmm_for_protein_output
-          else
-            if  ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
-              overview << "~" << hmm_for_protein_output
-            else
-              overview << "----" << hmm_for_protein_output
-            end
+          if  prev_r != nil
+            linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
           end
           prev_r = r
         end
       end
-      overview
+      linkers
     end
 
+
     def make_interdomain_sequence( d, mark_short = true )
       s = ""
       d /= 20
@@ -466,10 +449,7 @@ module Evoruby
       puts()
       puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
       puts()
-      puts( "  options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
-      puts( "           -" + I_E_VALUE_THRESHOLD_OPTION  + ": i-E-value threshold, default is no threshold" )
-      puts( "           -" + PARSE_OUT_DESCRIPITION_OPTION  + ": parse query description (in addition to query name)" )
-      puts( "           -" + IGNORE_DUF_OPTION  + ": ignore DUFs" )
+      puts( "  options: -" + I_E_VALUE_THRESHOLD_OPTION  + ": i-E-value threshold, default is no threshold" )
       puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + ": E-value threshold for full protein sequences, only for protein summary" )
       puts( "           -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
       puts( "           -" + SPECIES_OPTION + ": species for protein summary" )
index bb32da5..08de9c1 100644 (file)
@@ -13,16 +13,15 @@ require 'lib/evo/util/constants'
 require 'lib/evo/util/util'
 require 'lib/evo/util/command_line_arguments'
 require 'lib/evo/io/parser/hmmscan_parser'
-require 'lib/evo/io/web/uniprotkb'
 
 module Evoruby
 
   class HmmscanSummary
 
     PRG_NAME       = "hsp"
-    PRG_VERSION    = "2.001"
+    PRG_VERSION    = "2.002"
     PRG_DESC       = "hmmscan summary"
-    PRG_DATE       = "2013.10.23"
+    PRG_DATE       = "130319"
     COPYRIGHT      = "2013 Christian M Zmasek"
     CONTACT        = "phyloxml@gmail.com"
     WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
@@ -34,7 +33,6 @@ module Evoruby
     HMM_FOR_PROTEIN_OUTPUT        = "m"
     IGNORE_DUF_OPTION             = "i"
     PARSE_OUT_DESCRIPITION_OPTION = "a"
-    UNIPROT                       = "u"
     HELP_OPTION_1                 = "help"
     HELP_OPTION_2                 = "h"
 
@@ -48,14 +46,14 @@ module Evoruby
 
     def run
 
-   #   Util.print_program_information( PRG_NAME,
-   #     PRG_VERSION,
-   #     PRG_DESC,
-   #     PRG_DATE,
-   #     COPYRIGHT,
-   #     CONTACT,
-   #     WWW,
-   #     STDOUT )
+      Util.print_program_information( PRG_NAME,
+        PRG_VERSION,
+        PRG_DESC,
+        PRG_DATE,
+        COPYRIGHT,
+        CONTACT,
+        WWW,
+        STDOUT )
 
       begin
         cla = CommandLineArguments.new( ARGV )
@@ -81,7 +79,6 @@ module Evoruby
       allowed_opts.push( IGNORE_DUF_OPTION )
       allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
       allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
-      allowed_opts.push( UNIPROT )
       allowed_opts.push( SPECIES_OPTION )
 
       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
@@ -115,8 +112,6 @@ module Evoruby
         end
       end
 
-
-
       fs_e_value_threshold = -1.0
       if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
         begin
@@ -138,15 +133,6 @@ module Evoruby
         end
       end
 
-      uniprot = ""
-      if ( cla.is_option_set?( UNIPROT ) )
-        begin
-          uniprot = cla.get_option_value( UNIPROT )
-        rescue ArgumentError => e
-          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
-        end
-      end
-
       species = "HUMAN"
       if ( cla.is_option_set?( SPECIES_OPTION ) )
         begin
@@ -166,42 +152,39 @@ module Evoruby
         parse_descriptions = true
       end
 
-#      puts()
-#      puts( "hmmpfam outputfile  : " + inpath )
-#      puts( "outputfile          : " + outpath )
-#      puts( "species             : " + species )
-#      if ( i_e_value_threshold >= 0.0 )
-#        puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
-#      else
-#        puts( "i-E-value threshold : no threshold" )
-#      end
-#      if ( parse_descriptions )
-#        puts( "parse descriptions  : true" )
-#      else
-#        puts( "parse descriptions  : false" )
-#      end
-#      if ( ignore_dufs )
-#        puts( "ignore DUFs         : true" )
-#      else
-#        puts( "ignore DUFs         : false" )
-#      end
-#      if ( column_delimiter == "\t" )
-#        puts( "column delimiter    : TAB" )
-#      else
-#        puts( "column delimiter     : " + column_delimiter )
-#      end
-#      if fs_e_value_threshold >= 0.0
-#        puts( "E-value threshold   : " + fs_e_value_threshold.to_s )
-#      else
-#        puts( "E-value threshold   : no threshold" )
-#      end
-#      if !hmm_for_protein_output.empty?
-#        puts( "HMM for proteins    : " + hmm_for_protein_output )
-#      end
-#      if !uniprot.empty?
-#        puts( "Uniprot             : " + uniprot )
-#      end
-#      puts()
+      puts()
+      puts( "hmmpfam outputfile  : " + inpath )
+      puts( "outputfile          : " + outpath )
+      puts( "species             : " + species )
+      if ( i_e_value_threshold >= 0.0 )
+        puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
+      else
+        puts( "i-E-value threshold : no threshold" )
+      end
+      if ( parse_descriptions )
+        puts( "parse descriptions  : true" )
+      else
+        puts( "parse descriptions  : false" )
+      end
+      if ( ignore_dufs )
+        puts( "ignore DUFs         : true" )
+      else
+        puts( "ignore DUFs         : false" )
+      end
+      if ( column_delimiter == "\t" )
+        puts( "column delimiter    : TAB" )
+      else
+        puts( "column delimiter     : " + column_delimiter )
+      end
+      if fs_e_value_threshold >= 0.0
+        puts( "E-value threshold   : " + fs_e_value_threshold.to_s )
+      else
+        puts( "E-value threshold   : no threshold" )
+      end
+      if !hmm_for_protein_output.empty?
+        puts( "HMM for proteins    : " + hmm_for_protein_output )
+      end
+      puts()
 
       begin
         parse( inpath,
@@ -212,21 +195,20 @@ module Evoruby
           parse_descriptions,
           fs_e_value_threshold,
           hmm_for_protein_output,
-          uniprot,
           species )
       rescue IOError => e
         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
       end
       domain_counts = get_domain_counts()
 
-#      puts
-#      puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
-#      puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
-#      puts
-#      puts( Util.draw_histogram( domain_counts, "#" ) )
-#      puts
-#      Util.print_message( PRG_NAME, 'OK' )
-#      puts 
+      puts
+      puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
+      puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
+      puts
+      puts( Util.draw_histogram( domain_counts, "#" ) )
+      puts
+      Util.print_message( PRG_NAME, 'OK' )
+      puts
 
     end # def run
 
@@ -241,7 +223,6 @@ module Evoruby
         get_descriptions,
         fs_e_value_threshold,
         hmm_for_protein_output,
-        uniprot,
         species )
 
       Util.check_file_for_readability( inpath )
@@ -296,7 +277,6 @@ module Evoruby
                 fs_e_value_threshold,
                 hmm_for_protein_output,
                 i_e_value_threshold,
-                uniprot,
                 species )
             end
             hmmscan_results_per_protein.clear
@@ -318,7 +298,6 @@ module Evoruby
           fs_e_value_threshold,
           hmm_for_protein_output,
           i_e_value_threshold,
-          uniprot,
           species )
       end
 
@@ -347,7 +326,6 @@ module Evoruby
         fs_e_value_threshold,
         hmm_for_protein_output,
         i_e_value_threshold,
-        uniprotkb,
         species )
 
       dc = 0
@@ -397,15 +375,6 @@ module Evoruby
       end
       s << "\t"
 
-      if !uniprotkb.empty?
-        #e = UniprotKB::get_entry_by_id( process_id( own.query ) )
-
-        #if e != nil
-        #  s << uniprot_annotation( e )
-        # # s << "\uniprot_annotationt"
-        #end
-      end
-
       overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
 
       s << overview  + "\t"
@@ -414,7 +383,6 @@ module Evoruby
 
       prev_r = nil
       hmmscan_results_per_protein_filtered.each do | r |
-
         if  prev_r != nil
           s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
         else
@@ -428,23 +396,10 @@ module Evoruby
         s << "]"
         prev_r = r
       end
-      s << make_interdomain_sequence( own.qlen - prev_r.env_from, false )
+      s << make_interdomain_sequence( own.qlen - prev_r.env_to, false )
       puts s
     end
 
-    def uniprot_annotation( e )
-      s = ""
-      pdb_ids = e.get_pdb_ids
-      if !pdb_ids.empty?
-        pdb_ids.each do | pdb |
-          s << pdb << ", "
-        end
-      else
-        s << "-"
-      end
-      s
-    end
-
     def calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )
       linkers = ""
       prev_r = nil