in progress
author cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 24 Oct 2012 02:22:12 +0000 (02:22 +0000)
committer cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 24 Oct 2012 02:22:12 +0000 (02:22 +0000)
forester/ruby/evoruby/lib/evo/io/parser/uniprot_parser.rb [new file with mode: 0644]
forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb

diff --git a/forester/ruby/evoruby/lib/evo/io/parser/uniprot_parser.rb b/forester/ruby/evoruby/lib/evo/io/parser/uniprot_parser.rb
new file mode 100644 (file)
index 0000000..28a5730
--- /dev/null
@@ -0,0 +1,94 @@
+#
+# = lib/evo/io/parser/uniprot_parser - UniprotParser class
+#
+# Copyright::  Copyright (C) 2012 Christian M. Zmasek
+# License::    GNU Lesser General Public License (LGPL)
+#
+# $Id:  Exp $
+#
+# last modified: 121003
+
+
+#require 'iconv'
+
+
+require 'lib/evo/util/util'
+
+module Evoruby
+
+  class UniprotParser
+
+    ID = "ID"
+    DE = "DE"
+    DR = "DR"
+    LAST = '//'
+
+    def initialize file
+      Util.check_file_for_readability file
+      @file = file
+    end
+
+   
+
+    def parse( ids )
+      #ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' )
+      entries = []
+      de = []
+      dr = []
+      read = false
+      File.open( @file ).each do | line |
+        if line.index ID == 0
+          ids.each do | id |
+            if line.index id == 0
+              read = true
+              break
+            end
+          end
+        end
+        if read
+          if line.index LAST == 0
+            read = false
+            e = UniprotEntry.new
+            e.de = de
+            e.dr = dr
+            entries << e
+            de = []
+            dr = []
+          else
+            if line.index DE == 0
+              add( line, de )
+            elsif line.index DR == 0
+              add( line, dr )
+            end
+          end
+        end
+      end
+      entries
+    end
+
+    private
+
+    def add( line, ary )
+      line =~/[A-Z]{2}\s+(.+)/
+      ary << $1
+    end
+
+
+  end # class UniprotParser
+
+  class UniprotEntry
+
+    attr_accessor :id
+    attr_accessor :ac
+    attr_accessor :de
+    attr_accessor :gn
+    attr_accessor :os
+    attr_accessor :ox
+    attr_accessor :dr
+    attr_accessor :pe
+    attr_accessor :kw
+
+  end
+
+
+end # module Evoruby
index 9c5e3a9..82b2782 100644 (file)
@@ -1,17 +1,18 @@
 #
-# = lib/evo/apps/hmmscan_parser.rb - HmmscanParser class
+# = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
 #
-# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
+# Copyright::  Copyright (C) 2012 Christian M. Zmasek
 # License::    GNU Lesser General Public License (LGPL)
 #
 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
 #
-# last modified: 11/24/2009
+# last modified: 121003
 
 require 'lib/evo/util/constants'
 require 'lib/evo/util/util'
 require 'lib/evo/util/command_line_arguments'
 require 'lib/evo/io/parser/hmmscan_parser'
+require 'lib/evo/io/parser/uniprot_parser'
 
 module Evoruby
 
@@ -19,7 +20,7 @@ module Evoruby
 
     PRG_NAME       = "hsp"
     PRG_VERSION    = "2.000"
-    PRG_DESC       = "hmmscan parser"
+    PRG_DESC       = "hmmscan summary"
     PRG_DATE       = "2012.10.19"
     COPYRIGHT      = "2012 Christian M Zmasek"
     CONTACT        = "phylosoft@gmail.com"
@@ -31,6 +32,7 @@ module Evoruby
     HMM_FOR_PROTEIN_OUTPUT        = "m"
     IGNORE_DUF_OPTION             = "i"
     PARSE_OUT_DESCRIPITION_OPTION = "a"
+    UNIPROT                       = "u"
     HELP_OPTION_1                 = "help"
     HELP_OPTION_2                 = "h"
 
@@ -77,6 +79,7 @@ module Evoruby
       allowed_opts.push( IGNORE_DUF_OPTION )
       allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
       allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
+       allowed_opts.push( UNIPROT )
 
       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
       if ( disallowed.length > 0 )
@@ -129,6 +132,15 @@ module Evoruby
           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
         end
       end
+      
+      uniprot = ""
+       if ( cla.is_option_set?( UNIPROT ) )
+        begin
+           uniprot = cla.get_option_value( UNIPROT )
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
 
       ignore_dufs = false
       if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
@@ -163,14 +175,17 @@ module Evoruby
       else
         puts( "column delimiter     : " + column_delimiter )
       end
-      if ( fs_e_value_threshold >= 0.0 )
+      if fs_e_value_threshold >= 0.0 
         puts( "E-value threshold   : " + fs_e_value_threshold.to_s )
       else
         puts( "E-value threshold   : no threshold" )
       end
-      if ( !hmm_for_protein_output.empty? )
+      if !hmm_for_protein_output.empty? 
         puts( "HMM for proteins    : " + hmm_for_protein_output )
       end
+      if !uniprot.empty? 
+        puts( "Uniprot             : " + uniprot )
+      end
       puts()
 
       begin
@@ -181,7 +196,8 @@ module Evoruby
           ignore_dufs,
           parse_descriptions,
           fs_e_value_threshold,
-          hmm_for_protein_output )
+          hmm_for_protein_output,
+          uniprot )
       rescue ArgumentError, IOError => e
         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
       end
@@ -209,10 +225,19 @@ module Evoruby
         ignore_dufs,
         get_descriptions,
         fs_e_value_threshold,
-        hmm_for_protein_output )
+        hmm_for_protein_output,
+        uniprot )
       Util.check_file_for_readability( inpath )
       Util.check_file_for_writability( outpath )
 
+      hmmscan_parser = HmmscanParser.new( inpath )
+      results = hmmscan_parser.parse
+      
+      uniprot_entries = nil
+      if !uniprot.empty? 
+        uniprot_entries = read_uniprot( results, uniprot  )
+      end
+      
       outfile = File.open( outpath, "a" )
 
       query     = ""
@@ -224,11 +249,11 @@ module Evoruby
 
       hmmscan_results_per_protein = []
 
-      hmmscan_parser = HmmscanParser.new( inpath )
+      
 
       prev_query = ""
 
-      hmmscan_parser.parse.each do | r |
+      results.each do | r |
         model     = r.model
         query     = r.query
         i_e_value = r.i_e_value
@@ -288,6 +313,17 @@ module Evoruby
 
     end # def parse
 
+    
+     def read_uniprot( hmmscan_results, uniprot  )  
+        ids = []
+         hmmscan_results.each do | r |
+           ids << r.query
+         end 
+         uniprot_parser = UniprotParser.new uniprot
+         uniprot_entries = uniprot_parser.parse ids 
+         uniprot_entries
+      end
+    
     def count_model( model )
       if ( @domain_counts.has_key?( model ) )
         count = @domain_counts[ model ].to_i