inprogress
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / hmmscan_summary.rb
index e6ba67d..1cbbece 100644 (file)
@@ -8,11 +8,13 @@
 #
 # last modified: 121003
 
+require 'set'
+
 require 'lib/evo/util/constants'
 require 'lib/evo/util/util'
 require 'lib/evo/util/command_line_arguments'
 require 'lib/evo/io/parser/hmmscan_parser'
-require 'lib/evo/io/parser/uniprot_parser'
+require 'lib/evo/io/web/uniprotkb'
 
 module Evoruby
 
@@ -27,6 +29,7 @@ module Evoruby
     WWW            = "www.phylosoft.org"
 
     DELIMITER_OPTION              = "d"
+    SPECIES_OPTION                = "s"
     I_E_VALUE_THRESHOLD_OPTION    = "ie"
     FS_E_VALUE_THRESHOLD_OPTION   = "pe"
     HMM_FOR_PROTEIN_OUTPUT        = "m"
@@ -46,6 +49,8 @@ module Evoruby
 
     def run
 
+
+
       Util.print_program_information( PRG_NAME,
         PRG_VERSION,
         PRG_DESC,
@@ -62,7 +67,7 @@ module Evoruby
       end
 
       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
-           cla.is_option_set?( HELP_OPTION_2 ) )
+            cla.is_option_set?( HELP_OPTION_2 ) )
         print_help
         exit( 0 )
       end
@@ -80,6 +85,7 @@ module Evoruby
       allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
       allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
       allowed_opts.push( UNIPROT )
+      allowed_opts.push( SPECIES_OPTION )
 
       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
       if ( disallowed.length > 0 )
@@ -112,6 +118,8 @@ module Evoruby
         end
       end
 
+      
+
       fs_e_value_threshold = -1.0
       if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
         begin
@@ -132,11 +140,20 @@ module Evoruby
           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
         end
       end
-      
+
       uniprot = ""
-       if ( cla.is_option_set?( UNIPROT ) )
+      if ( cla.is_option_set?( UNIPROT ) )
         begin
-           uniprot = cla.get_option_value( UNIPROT )
+          uniprot = cla.get_option_value( UNIPROT )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+
+      species = "HUMAN"
+       if ( cla.is_option_set?( SPECIES_OPTION ) )
+        begin
+          species = cla.get_option_value( SPECIES_OPTION )
         rescue ArgumentError => e
           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
         end
@@ -155,6 +172,7 @@ module Evoruby
       puts()
       puts( "hmmpfam outputfile  : " + inpath )
       puts( "outputfile          : " + outpath )
+      puts( "species             : " + species )
       if ( i_e_value_threshold >= 0.0 )
         puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
       else
@@ -175,15 +193,15 @@ module Evoruby
       else
         puts( "column delimiter     : " + column_delimiter )
       end
-      if fs_e_value_threshold >= 0.0 
+      if fs_e_value_threshold >= 0.0
         puts( "E-value threshold   : " + fs_e_value_threshold.to_s )
       else
         puts( "E-value threshold   : no threshold" )
       end
-      if !hmm_for_protein_output.empty? 
+      if !hmm_for_protein_output.empty?
         puts( "HMM for proteins    : " + hmm_for_protein_output )
       end
-      if !uniprot.empty? 
+      if !uniprot.empty?
         puts( "Uniprot             : " + uniprot )
       end
       puts()
@@ -197,8 +215,9 @@ module Evoruby
           parse_descriptions,
           fs_e_value_threshold,
           hmm_for_protein_output,
-          uniprot )
-      rescue ArgumentError, IOError => e
+          uniprot,
+          species )
+      rescue IOError => e
         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
       end
       domain_counts = get_domain_counts()
@@ -226,18 +245,17 @@ module Evoruby
         get_descriptions,
         fs_e_value_threshold,
         hmm_for_protein_output,
-        uniprot )
+        uniprot,
+        species )
+
+
+
       Util.check_file_for_readability( inpath )
       Util.check_file_for_writability( outpath )
 
       hmmscan_parser = HmmscanParser.new( inpath )
       results = hmmscan_parser.parse
-      
-      uniprot_entries = nil
-      if !uniprot.empty? 
-        uniprot_entries = read_uniprot( results, uniprot  )
-      end
-      
+
       outfile = File.open( outpath, "a" )
 
       query     = ""
@@ -249,7 +267,7 @@ module Evoruby
 
       hmmscan_results_per_protein = []
 
-      
+
 
       prev_query = ""
 
@@ -261,21 +279,21 @@ module Evoruby
         env_to    = r.env_to
 
         if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
-           ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
+            ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
           count_model( model )
           outfile.print( query +
-             column_delimiter )
+              column_delimiter )
           if ( get_descriptions )
             outfile.print( desc +
-               column_delimiter )
+                column_delimiter )
           end
           outfile.print( model +
-             column_delimiter +
-             env_from.to_s +
-             column_delimiter +
-             env_to.to_s +
-             column_delimiter +
-             i_e_value.to_s )
+              column_delimiter +
+              env_from.to_s +
+              column_delimiter +
+              env_to.to_s +
+              column_delimiter +
+              i_e_value.to_s )
           outfile.print( Constants::LINE_DELIMITER )
         end
 
@@ -285,7 +303,9 @@ module Evoruby
               process_hmmscan_results_per_protein( hmmscan_results_per_protein,
                 fs_e_value_threshold,
                 hmm_for_protein_output,
-                i_e_value_threshold )
+                i_e_value_threshold,
+                false,
+                species )
             end
             hmmscan_results_per_protein.clear
           end
@@ -300,31 +320,29 @@ module Evoruby
           end
         end
       end
-      if !hmm_for_protein_output.empty?
-        if !hmmscan_results_per_protein.empty?
-          process_hmmscan_results_per_protein( hmmscan_results_per_protein,
-            fs_e_value_threshold,
-            hmm_for_protein_output,
-            i_e_value_threshold, 
-             uniprot_entries )
-        end
+      if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty?
+        process_hmmscan_results_per_protein( hmmscan_results_per_protein,
+          fs_e_value_threshold,
+          hmm_for_protein_output,
+          i_e_value_threshold,
+          false,
+          species )
       end
+
       outfile.flush()
       outfile.close()
 
     end # def parse
 
-    
-     def read_uniprot( hmmscan_results, uniprot  )  
-        ids = []
-         hmmscan_results.each do | r |
-           ids << r.query
-         end 
-         uniprot_parser = UniprotParser.new uniprot
-         uniprot_entries = uniprot_parser.parse ids 
-         uniprot_entries
-      end
-    
+    def process_id( id )
+      if  id =~ /(sp|tr)\|\S+\|(\S+)/
+        id = $2
+      end
+      id
+    end
+
+
+
     def count_model( model )
       if ( @domain_counts.has_key?( model ) )
         count = @domain_counts[ model ].to_i
@@ -339,7 +357,8 @@ module Evoruby
         fs_e_value_threshold,
         hmm_for_protein_output,
         i_e_value_threshold,
-         uniprot_entries      )
+        uniprotkb,
+        species      )
 
       dc = 0
       # filter according to i-Evalue threshold
@@ -376,7 +395,7 @@ module Evoruby
 
       s = ""
       s << own.query + "\t"
-      s << "HUMAN" + "\t"
+      s << species + "\t"
       s << own.fs_e_value.to_s + "\t"
       s << own.qlen.to_s + "\t"
       s << dc.to_s + "\t"
@@ -385,11 +404,16 @@ module Evoruby
         s << r.model + " "
       end
       s << "\t"
-     s <<  uniprot_entries[  own.query ]
-       s << "\t"
+      #e = UniprotKB::get_entry_by_id( process_id( own.query ) )
+
+      #if e != nil
+      #  s << uniprot_annotation( e )
+      # # s << "\uniprot_annotationt"
+      #end
+
       overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
 
-      s << overview   + "\t"
+      s << overview  + "\t"
 
       s << calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )  + "\t"
 
@@ -413,6 +437,18 @@ module Evoruby
       puts s
     end
 
+    def uniprot_annotation( e )
+      s = ""
+      pdb_ids = e.get_pdb_ids
+      if !pdb_ids.empty?
+        pdb_ids.each do | pdb |
+          s << pdb << ", "
+        end
+      else
+        s << "-"
+      end
+      s
+    end
 
     def calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )
       linkers = ""