X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fruby%2Fevoruby%2Flib%2Fevo%2Ftool%2Fhmmscan_summary.rb;h=a8f15907c5641fa7860b50d7dec2e29ba348aac7;hb=d03733a9998b73a5880b9dab0f688e6a69661019;hp=9c5e3a99637f7ec4759e27638356df8205e0f748;hpb=79fbb03cb44e42992dcb01f0f64386d59c243838;p=jalview.git diff --git a/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb index 9c5e3a9..a8f1590 100644 --- a/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb +++ b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb @@ -1,17 +1,21 @@ # -# = lib/evo/apps/hmmscan_parser.rb - HmmscanParser class +# = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class # -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# Copyright:: Copyright (C) 2012 Christian M. Zmasek # License:: GNU Lesser General Public License (LGPL) # # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $ # -# last modified: 11/24/2009 +# last modified: 121003 + +require 'set' require 'lib/evo/util/constants' require 'lib/evo/util/util' require 'lib/evo/util/command_line_arguments' require 'lib/evo/io/parser/hmmscan_parser' +require 'lib/evo/io/parser/uniprot_parser' +require 'lib/evo/io/web/uniprotkb' module Evoruby @@ -19,8 +23,8 @@ module Evoruby PRG_NAME = "hsp" PRG_VERSION = "2.000" - PRG_DESC = "hmmscan parser" - PRG_DATE = "2012.10.19" + PRG_DESC = "hmmscan summary" + PRG_DATE = "2012.10.23" COPYRIGHT = "2012 Christian M Zmasek" CONTACT = "phylosoft@gmail.com" WWW = "www.phylosoft.org" @@ -31,6 +35,7 @@ module Evoruby HMM_FOR_PROTEIN_OUTPUT = "m" IGNORE_DUF_OPTION = "i" PARSE_OUT_DESCRIPITION_OPTION = "a" + UNIPROT = "u" HELP_OPTION_1 = "help" HELP_OPTION_2 = "h" @@ -44,6 +49,9 @@ module Evoruby def run + ukb = UniprotKB.new + ukb.get + Util.print_program_information( PRG_NAME, PRG_VERSION, PRG_DESC, @@ -77,6 +85,7 @@ module Evoruby allowed_opts.push( IGNORE_DUF_OPTION ) allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION ) allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT ) + allowed_opts.push( UNIPROT ) disallowed = cla.validate_allowed_options_as_str( allowed_opts ) if ( disallowed.length > 0 ) @@ -130,6 +139,15 @@ module Evoruby end end + uniprot = "" + if ( cla.is_option_set?( UNIPROT ) ) + begin + uniprot = cla.get_option_value( UNIPROT ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + ignore_dufs = false if ( cla.is_option_set?( IGNORE_DUF_OPTION ) ) ignore_dufs = true @@ -163,14 +181,17 @@ module Evoruby else puts( "column delimiter : " + column_delimiter ) end - if ( fs_e_value_threshold >= 0.0 ) + if fs_e_value_threshold >= 0.0 puts( "E-value threshold : " + fs_e_value_threshold.to_s ) else puts( "E-value threshold : no threshold" ) end - if ( !hmm_for_protein_output.empty? ) + if !hmm_for_protein_output.empty? puts( "HMM for proteins : " + hmm_for_protein_output ) end + if !uniprot.empty? + puts( "Uniprot : " + uniprot ) + end puts() begin @@ -181,8 +202,9 @@ module Evoruby ignore_dufs, parse_descriptions, fs_e_value_threshold, - hmm_for_protein_output ) - rescue ArgumentError, IOError => e + hmm_for_protein_output, + uniprot ) + rescue IOError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end domain_counts = get_domain_counts() @@ -209,10 +231,22 @@ module Evoruby ignore_dufs, get_descriptions, fs_e_value_threshold, - hmm_for_protein_output ) + hmm_for_protein_output, + uniprot ) + + + Util.check_file_for_readability( inpath ) Util.check_file_for_writability( outpath ) + hmmscan_parser = HmmscanParser.new( inpath ) + results = hmmscan_parser.parse + + uniprot_entries = nil + if !uniprot.empty? && !hmm_for_protein_output.empty? + uniprot_entries = read_uniprot( results, uniprot ) + end + outfile = File.open( outpath, "a" ) query = "" @@ -224,11 +258,11 @@ module Evoruby hmmscan_results_per_protein = [] - hmmscan_parser = HmmscanParser.new( inpath ) + prev_query = "" - hmmscan_parser.parse.each do | r | + results.each do | r | model = r.model query = r.query i_e_value = r.i_e_value @@ -260,7 +294,8 @@ module Evoruby process_hmmscan_results_per_protein( hmmscan_results_per_protein, fs_e_value_threshold, hmm_for_protein_output, - i_e_value_threshold ) + i_e_value_threshold, + uniprot_entries ) end hmmscan_results_per_protein.clear end @@ -275,19 +310,37 @@ module Evoruby end end end - if !hmm_for_protein_output.empty? - if !hmmscan_results_per_protein.empty? - process_hmmscan_results_per_protein( hmmscan_results_per_protein, - fs_e_value_threshold, - hmm_for_protein_output, - i_e_value_threshold ) - end + if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty? + process_hmmscan_results_per_protein( hmmscan_results_per_protein, + fs_e_value_threshold, + hmm_for_protein_output, + i_e_value_threshold, + uniprot_entries ) end + outfile.flush() outfile.close() end # def parse + def process_id( id ) + if id =~ /(sp|tr)\|\S+\|(\S+)/ + id = $2 + end + id + end + + def read_uniprot( hmmscan_results, uniprot ) + ids = Set.new + hmmscan_results.each do | r | + + ids << process_id( r.query ) + end + uniprot_parser = UniprotParser.new uniprot + uniprot_entries = uniprot_parser.parse ids + uniprot_entries + end + def count_model( model ) if ( @domain_counts.has_key?( model ) ) count = @domain_counts[ model ].to_i @@ -301,7 +354,8 @@ module Evoruby def process_hmmscan_results_per_protein( hmmscan_results_per_protein, fs_e_value_threshold, hmm_for_protein_output, - i_e_value_threshold ) + i_e_value_threshold, + uniprot_entries ) dc = 0 # filter according to i-Evalue threshold @@ -347,7 +401,23 @@ module Evoruby s << r.model + " " end s << "\t" + e = uniprot_entries[ process_id( own.query ) ] + if e != nil && e.de != nil + e.de.each { |i| s << i + " " } + else + s << "-" + end + s << "\t" + + if e != nil && e.gn != nil + e.gn.each { |i| s << i + " " } + else + s << "-" + end + + + s << "\t" overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) s << overview + "\t"