From fea8aa1f2d8d2eebfbbb57ede28edc003367c12f Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Wed, 24 Oct 2012 02:22:12 +0000 Subject: [PATCH] in progress --- .../evoruby/lib/evo/io/parser/uniprot_parser.rb | 94 ++++++++++++++++++++ .../ruby/evoruby/lib/evo/tool/hmmscan_summary.rb | 56 +++++++++--- 2 files changed, 140 insertions(+), 10 deletions(-) create mode 100644 forester/ruby/evoruby/lib/evo/io/parser/uniprot_parser.rb diff --git a/forester/ruby/evoruby/lib/evo/io/parser/uniprot_parser.rb b/forester/ruby/evoruby/lib/evo/io/parser/uniprot_parser.rb new file mode 100644 index 0000000..28a5730 --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/io/parser/uniprot_parser.rb @@ -0,0 +1,94 @@ +# +# = lib/evo/io/parser/uniprot_parser - UniprotParser class +# +# Copyright:: Copyright (C) 2012 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: Exp $ +# +# last modified: 121003 + + +#require 'iconv' + + +require 'lib/evo/util/util' + +module Evoruby + + class UniprotParser + + ID = "ID" + DE = "DE" + DR = "DR" + LAST = '//' + + def initialize file + Util.check_file_for_readability file + @file = file + end + + + + def parse( ids ) + #ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' ) + entries = [] + de = [] + dr = [] + read = false + File.open( @file ).each do | line | + if line.index ID == 0 + ids.each do | id | + if line.index id == 0 + read = true + break + end + end + end + if read + if line.index LAST == 0 + read = false + e = UniprotEntry.new + e.de = de + e.dr = dr + entries << e + de = [] + dr = [] + else + if line.index DE == 0 + add( line, de ) + elsif line.index DR == 0 + add( line, dr ) + end + end + end + end + entries + end + + private + + def add( line, ary ) + line =~/[A-Z]{2}\s+(.+)/ + ary << $1 + end + + + end # class UniprotParser + + class UniprotEntry + + attr_accessor :id + attr_accessor :ac + attr_accessor :de + attr_accessor :gn + attr_accessor :os + attr_accessor :ox + attr_accessor :dr + attr_accessor :pe + attr_accessor :kw + + end + + +end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb index 9c5e3a9..82b2782 100644 --- a/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb +++ b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb @@ -1,17 +1,18 @@ # -# = lib/evo/apps/hmmscan_parser.rb - HmmscanParser class +# = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class # -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# Copyright:: Copyright (C) 2012 Christian M. Zmasek # License:: GNU Lesser General Public License (LGPL) # # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $ # -# last modified: 11/24/2009 +# last modified: 121003 require 'lib/evo/util/constants' require 'lib/evo/util/util' require 'lib/evo/util/command_line_arguments' require 'lib/evo/io/parser/hmmscan_parser' +require 'lib/evo/io/parser/uniprot_parser' module Evoruby @@ -19,7 +20,7 @@ module Evoruby PRG_NAME = "hsp" PRG_VERSION = "2.000" - PRG_DESC = "hmmscan parser" + PRG_DESC = "hmmscan summary" PRG_DATE = "2012.10.19" COPYRIGHT = "2012 Christian M Zmasek" CONTACT = "phylosoft@gmail.com" @@ -31,6 +32,7 @@ module Evoruby HMM_FOR_PROTEIN_OUTPUT = "m" IGNORE_DUF_OPTION = "i" PARSE_OUT_DESCRIPITION_OPTION = "a" + UNIPROT = "u" HELP_OPTION_1 = "help" HELP_OPTION_2 = "h" @@ -77,6 +79,7 @@ module Evoruby allowed_opts.push( IGNORE_DUF_OPTION ) allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION ) allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT ) + allowed_opts.push( UNIPROT ) disallowed = cla.validate_allowed_options_as_str( allowed_opts ) if ( disallowed.length > 0 ) @@ -129,6 +132,15 @@ module Evoruby Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end end + + uniprot = "" + if ( cla.is_option_set?( UNIPROT ) ) + begin + uniprot = cla.get_option_value( UNIPROT ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end ignore_dufs = false if ( cla.is_option_set?( IGNORE_DUF_OPTION ) ) @@ -163,14 +175,17 @@ module Evoruby else puts( "column delimiter : " + column_delimiter ) end - if ( fs_e_value_threshold >= 0.0 ) + if fs_e_value_threshold >= 0.0 puts( "E-value threshold : " + fs_e_value_threshold.to_s ) else puts( "E-value threshold : no threshold" ) end - if ( !hmm_for_protein_output.empty? ) + if !hmm_for_protein_output.empty? puts( "HMM for proteins : " + hmm_for_protein_output ) end + if !uniprot.empty? + puts( "Uniprot : " + uniprot ) + end puts() begin @@ -181,7 +196,8 @@ module Evoruby ignore_dufs, parse_descriptions, fs_e_value_threshold, - hmm_for_protein_output ) + hmm_for_protein_output, + uniprot ) rescue ArgumentError, IOError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end @@ -209,10 +225,19 @@ module Evoruby ignore_dufs, get_descriptions, fs_e_value_threshold, - hmm_for_protein_output ) + hmm_for_protein_output, + uniprot ) Util.check_file_for_readability( inpath ) Util.check_file_for_writability( outpath ) + hmmscan_parser = HmmscanParser.new( inpath ) + results = hmmscan_parser.parse + + uniprot_entries = nil + if !uniprot.empty? + uniprot_entries = read_uniprot( results, uniprot ) + end + outfile = File.open( outpath, "a" ) query = "" @@ -224,11 +249,11 @@ module Evoruby hmmscan_results_per_protein = [] - hmmscan_parser = HmmscanParser.new( inpath ) + prev_query = "" - hmmscan_parser.parse.each do | r | + results.each do | r | model = r.model query = r.query i_e_value = r.i_e_value @@ -288,6 +313,17 @@ module Evoruby end # def parse + + def read_uniprot( hmmscan_results, uniprot ) + ids = [] + hmmscan_results.each do | r | + ids << r.query + end + uniprot_parser = UniprotParser.new uniprot + uniprot_entries = uniprot_parser.parse ids + uniprot_entries + end + def count_model( model ) if ( @domain_counts.has_key?( model ) ) count = @domain_counts[ model ].to_i -- 1.7.10.2