X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fruby%2Fevoruby%2Flib%2Fevo%2Ftool%2Fhmmscan_summary.rb;h=08de9c125e9385c009c34134d8a87d8e4cf90f0e;hb=f7f405fad4bc9dc65154cef12d11b47decc6e3f1;hp=9c5e3a99637f7ec4759e27638356df8205e0f748;hpb=79fbb03cb44e42992dcb01f0f64386d59c243838;p=jalview.git diff --git a/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb index 9c5e3a9..08de9c1 100644 --- a/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb +++ b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb @@ -1,12 +1,13 @@ # -# = lib/evo/apps/hmmscan_parser.rb - HmmscanParser class +# = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class # -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# Copyright:: Copyright (C) 2012 Christian M. Zmasek # License:: GNU Lesser General Public License (LGPL) # # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $ # -# last modified: 11/24/2009 + +require 'set' require 'lib/evo/util/constants' require 'lib/evo/util/util' @@ -18,14 +19,15 @@ module Evoruby class HmmscanSummary PRG_NAME = "hsp" - PRG_VERSION = "2.000" - PRG_DESC = "hmmscan parser" - PRG_DATE = "2012.10.19" - COPYRIGHT = "2012 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" + PRG_VERSION = "2.002" + PRG_DESC = "hmmscan summary" + PRG_DATE = "130319" + COPYRIGHT = "2013 Christian M Zmasek" + CONTACT = "phyloxml@gmail.com" + WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" DELIMITER_OPTION = "d" + SPECIES_OPTION = "s" I_E_VALUE_THRESHOLD_OPTION = "ie" FS_E_VALUE_THRESHOLD_OPTION = "pe" HMM_FOR_PROTEIN_OUTPUT = "m" @@ -77,6 +79,7 @@ module Evoruby allowed_opts.push( IGNORE_DUF_OPTION ) allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION ) allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT ) + allowed_opts.push( SPECIES_OPTION ) disallowed = cla.validate_allowed_options_as_str( allowed_opts ) if ( disallowed.length > 0 ) @@ -130,6 +133,15 @@ module Evoruby end end + species = "HUMAN" + if ( cla.is_option_set?( SPECIES_OPTION ) ) + begin + species = cla.get_option_value( SPECIES_OPTION ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + ignore_dufs = false if ( cla.is_option_set?( IGNORE_DUF_OPTION ) ) ignore_dufs = true @@ -143,6 +155,7 @@ module Evoruby puts() puts( "hmmpfam outputfile : " + inpath ) puts( "outputfile : " + outpath ) + puts( "species : " + species ) if ( i_e_value_threshold >= 0.0 ) puts( "i-E-value threshold : " + i_e_value_threshold.to_s ) else @@ -163,12 +176,12 @@ module Evoruby else puts( "column delimiter : " + column_delimiter ) end - if ( fs_e_value_threshold >= 0.0 ) + if fs_e_value_threshold >= 0.0 puts( "E-value threshold : " + fs_e_value_threshold.to_s ) else puts( "E-value threshold : no threshold" ) end - if ( !hmm_for_protein_output.empty? ) + if !hmm_for_protein_output.empty? puts( "HMM for proteins : " + hmm_for_protein_output ) end puts() @@ -181,13 +194,13 @@ module Evoruby ignore_dufs, parse_descriptions, fs_e_value_threshold, - hmm_for_protein_output ) - rescue ArgumentError, IOError => e + hmm_for_protein_output, + species ) + rescue IOError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end domain_counts = get_domain_counts() - puts puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" ) puts( "(number of different domains: " + domain_counts.length.to_s + ")" ) @@ -209,10 +222,15 @@ module Evoruby ignore_dufs, get_descriptions, fs_e_value_threshold, - hmm_for_protein_output ) + hmm_for_protein_output, + species ) + Util.check_file_for_readability( inpath ) Util.check_file_for_writability( outpath ) + hmmscan_parser = HmmscanParser.new( inpath ) + results = hmmscan_parser.parse + outfile = File.open( outpath, "a" ) query = "" @@ -224,11 +242,9 @@ module Evoruby hmmscan_results_per_protein = [] - hmmscan_parser = HmmscanParser.new( inpath ) - prev_query = "" - hmmscan_parser.parse.each do | r | + results.each do | r | model = r.model query = r.query i_e_value = r.i_e_value @@ -260,7 +276,8 @@ module Evoruby process_hmmscan_results_per_protein( hmmscan_results_per_protein, fs_e_value_threshold, hmm_for_protein_output, - i_e_value_threshold ) + i_e_value_threshold, + species ) end hmmscan_results_per_protein.clear end @@ -275,19 +292,26 @@ module Evoruby end end end - if !hmm_for_protein_output.empty? - if !hmmscan_results_per_protein.empty? - process_hmmscan_results_per_protein( hmmscan_results_per_protein, - fs_e_value_threshold, - hmm_for_protein_output, - i_e_value_threshold ) - end + + if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty? + process_hmmscan_results_per_protein( hmmscan_results_per_protein, + fs_e_value_threshold, + hmm_for_protein_output, + i_e_value_threshold, + species ) end + outfile.flush() outfile.close() - end # def parse + def process_id( id ) + if id =~ /(sp|tr)\|\S+\|(\S+)/ + id = $2 + end + id + end + def count_model( model ) if ( @domain_counts.has_key?( model ) ) count = @domain_counts[ model ].to_i @@ -301,7 +325,8 @@ module Evoruby def process_hmmscan_results_per_protein( hmmscan_results_per_protein, fs_e_value_threshold, hmm_for_protein_output, - i_e_value_threshold ) + i_e_value_threshold, + species ) dc = 0 # filter according to i-Evalue threshold @@ -309,12 +334,14 @@ module Evoruby hmmscan_results_per_protein_filtered = [] hmmscan_results_per_protein.each do | r | + + if r.model == hmm_for_protein_output - if r.fs_e_value > fs_e_value_threshold + if fs_e_value_threshold > 0.0 && r.fs_e_value > fs_e_value_threshold return end end - if r.i_e_value <= i_e_value_threshold + if i_e_value_threshold <= 0 || r.i_e_value <= i_e_value_threshold hmmscan_results_per_protein_filtered << r if r.model == hmm_for_protein_output dc += 1 @@ -338,7 +365,7 @@ module Evoruby s = "" s << own.query + "\t" - s << "HUMAN" + "\t" + s << species + "\t" s << own.fs_e_value.to_s + "\t" s << own.qlen.to_s + "\t" s << dc.to_s + "\t" @@ -350,13 +377,12 @@ module Evoruby overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) - s << overview + "\t" + s << overview + "\t" s << calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) + "\t" prev_r = nil hmmscan_results_per_protein_filtered.each do | r | - if prev_r != nil s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 ) else @@ -370,11 +396,10 @@ module Evoruby s << "]" prev_r = r end - s << make_interdomain_sequence( own.qlen - prev_r.env_from, false ) + s << make_interdomain_sequence( own.qlen - prev_r.env_to, false ) puts s end - def calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) linkers = "" prev_r = nil @@ -429,7 +454,6 @@ module Evoruby end - def print_help() puts( "Usage:" ) puts() @@ -441,6 +465,7 @@ module Evoruby puts( " -" + IGNORE_DUF_OPTION + ": ignore DUFs" ) puts( " -" + FS_E_VALUE_THRESHOLD_OPTION + ": E-value threshold for full protein sequences, only for protein summary" ) puts( " -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" ) + puts( " -" + SPECIES_OPTION + ": species for protein summary" ) puts() end