in progress...
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / hmmscan_summary.rb
index 75f5dd8..6e9afb6 100644 (file)
@@ -1,32 +1,23 @@
 #
 # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
 #
-# Copyright::  Copyright (C) 2012 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
-#
-# $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
-#
-# last modified: 121003
+# Copyright::    Copyright (C) 2017 Christian M Zmasek
+# License::      GNU Lesser General Public License (LGPL)
 
 require 'set'
-
 require 'lib/evo/util/constants'
 require 'lib/evo/util/util'
 require 'lib/evo/util/command_line_arguments'
 require 'lib/evo/io/parser/hmmscan_parser'
-require 'lib/evo/io/web/uniprotkb'
 
 module Evoruby
-
   class HmmscanSummary
 
     PRG_NAME       = "hsp"
-    PRG_VERSION    = "2.000"
-    PRG_DESC       = "hmmscan summary"
-    PRG_DATE       = "2012.10.23"
-    COPYRIGHT      = "2012 Christian M Zmasek"
-    CONTACT        = "phylosoft@gmail.com"
-    WWW            = "www.phylosoft.org"
+    PRG_VERSION    = "2.003"
+    PRG_DESC       = "Summarize hmmscan output tables into simpler tables"
+    PRG_DATE       = "170213"
+    WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
 
     DELIMITER_OPTION              = "d"
     SPECIES_OPTION                = "s"
@@ -35,13 +26,12 @@ module Evoruby
     HMM_FOR_PROTEIN_OUTPUT        = "m"
     IGNORE_DUF_OPTION             = "i"
     PARSE_OUT_DESCRIPITION_OPTION = "a"
-    UNIPROT                       = "u"
     HELP_OPTION_1                 = "help"
     HELP_OPTION_2                 = "h"
 
-    USE_AVOID_HMMS = true
-    AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
-    LIMIT_FOR_CLOSE_DOMAINS = 20
+    USE_AVOID_HMMS = false
+    AVOID_HHMS = [ "x", "y", "z" ]
+    LIMIT_FOR_CLOSE_DOMAINS = 20 # Used for protein architecture summary
 
     def initialize
       @domain_counts = Hash.new
@@ -49,16 +39,17 @@ module Evoruby
 
     def run
 
-
-
       Util.print_program_information( PRG_NAME,
-        PRG_VERSION,
-        PRG_DESC,
-        PRG_DATE,
-        COPYRIGHT,
-        CONTACT,
-        WWW,
-        STDOUT )
+      PRG_VERSION,
+      PRG_DESC,
+      PRG_DATE,
+      WWW,
+      STDOUT )
+
+      if ( ARGV == nil || ( ARGV.length < 1 )  )
+        print_help
+        exit( -1 )
+      end
 
       begin
         cla = CommandLineArguments.new( ARGV )
@@ -67,16 +58,11 @@ module Evoruby
       end
 
       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
-            cla.is_option_set?( HELP_OPTION_2 ) )
+      cla.is_option_set?( HELP_OPTION_2 ) )
         print_help
         exit( 0 )
       end
 
-      if ( cla.get_number_of_files != 2 )
-        print_help
-        exit( -1 )
-      end
-
       allowed_opts = Array.new
       allowed_opts.push( DELIMITER_OPTION )
       allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
@@ -84,18 +70,26 @@ module Evoruby
       allowed_opts.push( IGNORE_DUF_OPTION )
       allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
       allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
-      allowed_opts.push( UNIPROT )
       allowed_opts.push( SPECIES_OPTION )
 
       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
       if ( disallowed.length > 0 )
         Util.fatal_error( PRG_NAME,
-          "unknown option(s): " + disallowed,
-          STDOUT )
+        "unknown option(s): " + disallowed,
+        STDOUT )
       end
 
       inpath = cla.get_file_name( 0 )
-      outpath = cla.get_file_name( 1 )
+
+      outpath = ""
+      if ( cla.get_number_of_files == 1 )
+        outpath = inpath + Constants::DOMAIN_TABLE_SUFFIX
+      elsif ( cla.get_number_of_files == 2 )
+        outpath = cla.get_file_name( 1 )
+      else
+        print_help
+        exit( -1 )
+      end
 
       column_delimiter = "\t"
       if ( cla.is_option_set?( DELIMITER_OPTION ) )
@@ -118,8 +112,6 @@ module Evoruby
         end
       end
 
-      
-
       fs_e_value_threshold = -1.0
       if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
         begin
@@ -141,17 +133,8 @@ module Evoruby
         end
       end
 
-      uniprot = ""
-      if ( cla.is_option_set?( UNIPROT ) )
-        begin
-          uniprot = cla.get_option_value( UNIPROT )
-        rescue ArgumentError => e
-          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
-        end
-      end
-
       species = "HUMAN"
-       if ( cla.is_option_set?( SPECIES_OPTION ) )
+      if ( cla.is_option_set?( SPECIES_OPTION ) )
         begin
           species = cla.get_option_value( SPECIES_OPTION )
         rescue ArgumentError => e
@@ -170,9 +153,9 @@ module Evoruby
       end
 
       puts()
-      puts( "hmmpfam outputfile  : " + inpath )
+      puts( "hmmscan outputfile  : " + inpath )
       puts( "outputfile          : " + outpath )
-      puts( "species             : " + species )
+
       if ( i_e_value_threshold >= 0.0 )
         puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
       else
@@ -191,44 +174,42 @@ module Evoruby
       if ( column_delimiter == "\t" )
         puts( "column delimiter    : TAB" )
       else
-        puts( "column delimiter     : " + column_delimiter )
-      end
-      if fs_e_value_threshold >= 0.0
-        puts( "E-value threshold   : " + fs_e_value_threshold.to_s )
-      else
-        puts( "E-value threshold   : no threshold" )
+        puts( "column delimiter    : " + column_delimiter )
       end
       if !hmm_for_protein_output.empty?
         puts( "HMM for proteins    : " + hmm_for_protein_output )
-      end
-      if !uniprot.empty?
-        puts( "Uniprot             : " + uniprot )
+        puts( "species             : " + species )
+        if fs_e_value_threshold >= 0.0
+          puts( "E-value threshold   : " + fs_e_value_threshold.to_s )
+        else
+          puts( "E-value threshold   : no threshold" )
+        end
       end
       puts()
 
       begin
         parse( inpath,
-          outpath,
-          column_delimiter,
-          i_e_value_threshold,
-          ignore_dufs,
-          parse_descriptions,
-          fs_e_value_threshold,
-          hmm_for_protein_output,
-          uniprot,
-          species )
+        outpath,
+        column_delimiter,
+        i_e_value_threshold,
+        ignore_dufs,
+        parse_descriptions,
+        fs_e_value_threshold,
+        hmm_for_protein_output,
+        species )
       rescue IOError => e
         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
       end
       domain_counts = get_domain_counts()
 
-
       puts
       puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
       puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
       puts
       puts( Util.draw_histogram( domain_counts, "#" ) )
       puts
+      Util.print_message( PRG_NAME, "wrote: " + outpath )
+      Util.print_message( PRG_NAME, "next step in standard analysis pipeline: d2f.rb")
       Util.print_message( PRG_NAME, 'OK' )
       puts
 
@@ -238,17 +219,14 @@ module Evoruby
 
     # raises ArgumentError, IOError
     def parse( inpath,
-        outpath,
-        column_delimiter,
-        i_e_value_threshold,
-        ignore_dufs,
-        get_descriptions,
-        fs_e_value_threshold,
-        hmm_for_protein_output,
-        uniprot,
-        species )
-
-
+      outpath,
+      column_delimiter,
+      i_e_value_threshold,
+      ignore_dufs,
+      get_descriptions,
+      fs_e_value_threshold,
+      hmm_for_protein_output,
+      species )
 
       Util.check_file_for_readability( inpath )
       Util.check_file_for_writability( outpath )
@@ -267,33 +245,32 @@ module Evoruby
 
       hmmscan_results_per_protein = []
 
-
-
       prev_query = ""
 
       results.each do | r |
         model     = r.model
+        desc      = r.desc
         query     = r.query
         i_e_value = r.i_e_value
         env_from  = r.env_from
         env_to    = r.env_to
 
         if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
-            ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
+        ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
           count_model( model )
           outfile.print( query +
-              column_delimiter )
+          column_delimiter )
           if ( get_descriptions )
             outfile.print( desc +
-                column_delimiter )
+            column_delimiter )
           end
           outfile.print( model +
-              column_delimiter +
-              env_from.to_s +
-              column_delimiter +
-              env_to.to_s +
-              column_delimiter +
-              i_e_value.to_s )
+          column_delimiter +
+          env_from.to_s +
+          column_delimiter +
+          env_to.to_s +
+          column_delimiter +
+          i_e_value.to_s )
           outfile.print( Constants::LINE_DELIMITER )
         end
 
@@ -301,11 +278,10 @@ module Evoruby
           if  !prev_query.empty? && prev_query != query
             if !hmmscan_results_per_protein.empty?
               process_hmmscan_results_per_protein( hmmscan_results_per_protein,
-                fs_e_value_threshold,
-                hmm_for_protein_output,
-                i_e_value_threshold,
-                false,
-                species )
+              fs_e_value_threshold,
+              hmm_for_protein_output,
+              i_e_value_threshold,
+              species )
             end
             hmmscan_results_per_protein.clear
           end
@@ -320,20 +296,17 @@ module Evoruby
           end
         end
       end
-      puts hmm_for_protein_output
-      puts hmmscan_results_per_protein
+
       if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty?
         process_hmmscan_results_per_protein( hmmscan_results_per_protein,
-          fs_e_value_threshold,
-          hmm_for_protein_output,
-          i_e_value_threshold,
-          false,
-          species )
+        fs_e_value_threshold,
+        hmm_for_protein_output,
+        i_e_value_threshold,
+        species )
       end
 
       outfile.flush()
       outfile.close()
-
     end # def parse
 
     def process_id( id )
@@ -343,8 +316,6 @@ module Evoruby
       id
     end
 
-
-
     def count_model( model )
       if ( @domain_counts.has_key?( model ) )
         count = @domain_counts[ model ].to_i
@@ -356,13 +327,10 @@ module Evoruby
     end
 
     def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
-        fs_e_value_threshold,
-        hmm_for_protein_output,
-        i_e_value_threshold,
-        uniprotkb,
-        species      )
-      
-      puts "+"
+      fs_e_value_threshold,
+      hmm_for_protein_output,
+      i_e_value_threshold,
+      species )
 
       dc = 0
       # filter according to i-Evalue threshold
@@ -370,12 +338,13 @@ module Evoruby
       hmmscan_results_per_protein_filtered = []
 
       hmmscan_results_per_protein.each do | r |
+
         if r.model == hmm_for_protein_output
-          if r.fs_e_value > fs_e_value_threshold
+          if fs_e_value_threshold > 0.0 && r.fs_e_value > fs_e_value_threshold
             return
           end
         end
-        if r.i_e_value <= i_e_value_threshold
+        if i_e_value_threshold <= 0 || r.i_e_value <= i_e_value_threshold
           hmmscan_results_per_protein_filtered << r
           if r.model == hmm_for_protein_output
             dc += 1
@@ -408,13 +377,6 @@ module Evoruby
         s << r.model + " "
       end
       s << "\t"
-      puts s
-      #e = UniprotKB::get_entry_by_id( process_id( own.query ) )
-
-      #if e != nil
-      #  s << uniprot_annotation( e )
-      # # s << "\uniprot_annotationt"
-      #end
 
       overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
 
@@ -424,7 +386,6 @@ module Evoruby
 
       prev_r = nil
       hmmscan_results_per_protein_filtered.each do | r |
-
         if  prev_r != nil
           s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
         else
@@ -438,23 +399,10 @@ module Evoruby
         s << "]"
         prev_r = r
       end
-      s << make_interdomain_sequence( own.qlen - prev_r.env_from, false )
+      s << make_interdomain_sequence( own.qlen - prev_r.env_to, false )
       puts s
     end
 
-    def uniprot_annotation( e )
-      s = ""
-      pdb_ids = e.get_pdb_ids
-      if !pdb_ids.empty?
-        pdb_ids.each do | pdb |
-          s << pdb << ", "
-        end
-      else
-        s << "-"
-      end
-      s
-    end
-
     def calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )
       linkers = ""
       prev_r = nil
@@ -508,19 +456,26 @@ module Evoruby
       s
     end
 
-
-
     def print_help()
       puts( "Usage:" )
       puts()
-      puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
+      puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> [outputfile]" )
+      puts()
+      puts( "  options: -" + DELIMITER_OPTION + "=<s> : column delimiter for outputfile, default is TAB" )
+      puts( "           -" + I_E_VALUE_THRESHOLD_OPTION  + "=<f>: i-E-value threshold, default is no threshold" )
+      puts( "           -" + PARSE_OUT_DESCRIPITION_OPTION  + "     : parse query description (in addition to query name)" )
+      puts( "           -" + IGNORE_DUF_OPTION  + "     : ignore DUFs" )
+      puts( "           -" + HMM_FOR_PROTEIN_OUTPUT + "=<s> : HMM for protein architectures summary" )
+      puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + "=<f>: E-value threshold for full protein sequences, only for protein architectures summary" )
+      puts( "           -" + SPECIES_OPTION + "=<s> : species for protein architectures summary" )
+      puts()
+      puts( "  [next step in standard analysis pipeline: d2f.rb]")
+      puts()
+      puts( "Examples:" )
+      puts()
+      puts( "  " + "hmmscan --max --domtblout P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 -E 10 Pfam-A.hmm P53_ni.fasta" )
       puts()
-      puts( "  options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
-      puts( "           -" + I_E_VALUE_THRESHOLD_OPTION  + ": i-E-value threshold, default is no threshold" )
-      puts( "           -" + PARSE_OUT_DESCRIPITION_OPTION  + ": parse query description (in addition to query name)" )
-      puts( "           -" + IGNORE_DUF_OPTION  + ": ignore DUFs" )
-      puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + ": E-value threshold for full protein sequences, only for protein summary" )
-      puts( "           -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
+      puts( "  " + PRG_NAME + ".rb P53_hmmscan_300_10" )
       puts()
     end