#
-# = lib/evo/apps/hmmscan_parser.rb - HmmscanParser class
+# = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
#
-# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
+# Copyright:: Copyright (C) 2012 Christian M. Zmasek
# License:: GNU Lesser General Public License (LGPL)
#
# $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
#
-# last modified: 11/24/2009
+
+require 'set'
require 'lib/evo/util/constants'
require 'lib/evo/util/util'
class HmmscanSummary
PRG_NAME = "hsp"
- PRG_VERSION = "2.000"
- PRG_DESC = "hmmscan parser"
- PRG_DATE = "2012.10.19"
- COPYRIGHT = "2012 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
+ PRG_VERSION = "2.002"
+ PRG_DESC = "hmmscan summary"
+ PRG_DATE = "130319"
+ COPYRIGHT = "2013 Christian M Zmasek"
+ CONTACT = "phyloxml@gmail.com"
+ WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
DELIMITER_OPTION = "d"
+ SPECIES_OPTION = "s"
I_E_VALUE_THRESHOLD_OPTION = "ie"
FS_E_VALUE_THRESHOLD_OPTION = "pe"
HMM_FOR_PROTEIN_OUTPUT = "m"
allowed_opts.push( IGNORE_DUF_OPTION )
allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
+ allowed_opts.push( SPECIES_OPTION )
disallowed = cla.validate_allowed_options_as_str( allowed_opts )
if ( disallowed.length > 0 )
end
end
+ species = "HUMAN"
+ if ( cla.is_option_set?( SPECIES_OPTION ) )
+ begin
+ species = cla.get_option_value( SPECIES_OPTION )
+ rescue ArgumentError => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ end
+ end
+
ignore_dufs = false
if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
ignore_dufs = true
puts()
puts( "hmmpfam outputfile : " + inpath )
puts( "outputfile : " + outpath )
+ puts( "species : " + species )
if ( i_e_value_threshold >= 0.0 )
puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
else
else
puts( "column delimiter : " + column_delimiter )
end
- if ( fs_e_value_threshold >= 0.0 )
+ if fs_e_value_threshold >= 0.0
puts( "E-value threshold : " + fs_e_value_threshold.to_s )
else
puts( "E-value threshold : no threshold" )
end
- if ( !hmm_for_protein_output.empty? )
+ if !hmm_for_protein_output.empty?
puts( "HMM for proteins : " + hmm_for_protein_output )
end
puts()
ignore_dufs,
parse_descriptions,
fs_e_value_threshold,
- hmm_for_protein_output )
- rescue ArgumentError, IOError => e
+ hmm_for_protein_output,
+ species )
+ rescue IOError => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
domain_counts = get_domain_counts()
-
puts
puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
ignore_dufs,
get_descriptions,
fs_e_value_threshold,
- hmm_for_protein_output )
+ hmm_for_protein_output,
+ species )
+
Util.check_file_for_readability( inpath )
Util.check_file_for_writability( outpath )
+ hmmscan_parser = HmmscanParser.new( inpath )
+ results = hmmscan_parser.parse
+
outfile = File.open( outpath, "a" )
query = ""
hmmscan_results_per_protein = []
- hmmscan_parser = HmmscanParser.new( inpath )
-
prev_query = ""
- hmmscan_parser.parse.each do | r |
+ results.each do | r |
model = r.model
query = r.query
i_e_value = r.i_e_value
process_hmmscan_results_per_protein( hmmscan_results_per_protein,
fs_e_value_threshold,
hmm_for_protein_output,
- i_e_value_threshold )
+ i_e_value_threshold,
+ species )
end
hmmscan_results_per_protein.clear
end
end
end
end
- if !hmm_for_protein_output.empty?
- if !hmmscan_results_per_protein.empty?
- process_hmmscan_results_per_protein( hmmscan_results_per_protein,
- fs_e_value_threshold,
- hmm_for_protein_output,
- i_e_value_threshold )
- end
+
+ if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty?
+ process_hmmscan_results_per_protein( hmmscan_results_per_protein,
+ fs_e_value_threshold,
+ hmm_for_protein_output,
+ i_e_value_threshold,
+ species )
end
+
outfile.flush()
outfile.close()
-
end # def parse
+ def process_id( id )
+ if id =~ /(sp|tr)\|\S+\|(\S+)/
+ id = $2
+ end
+ id
+ end
+
def count_model( model )
if ( @domain_counts.has_key?( model ) )
count = @domain_counts[ model ].to_i
def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
fs_e_value_threshold,
hmm_for_protein_output,
- i_e_value_threshold )
+ i_e_value_threshold,
+ species )
dc = 0
# filter according to i-Evalue threshold
hmmscan_results_per_protein_filtered = []
hmmscan_results_per_protein.each do | r |
+
+
if r.model == hmm_for_protein_output
- if r.fs_e_value > fs_e_value_threshold
+ if fs_e_value_threshold > 0.0 && r.fs_e_value > fs_e_value_threshold
return
end
end
- if r.i_e_value <= i_e_value_threshold
+ if i_e_value_threshold <= 0 || r.i_e_value <= i_e_value_threshold
hmmscan_results_per_protein_filtered << r
if r.model == hmm_for_protein_output
dc += 1
s = ""
s << own.query + "\t"
- s << "HUMAN" + "\t"
+ s << species + "\t"
s << own.fs_e_value.to_s + "\t"
s << own.qlen.to_s + "\t"
s << dc.to_s + "\t"
overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
- s << overview + "\t"
+ s << overview + "\t"
s << calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) + "\t"
prev_r = nil
hmmscan_results_per_protein_filtered.each do | r |
-
if prev_r != nil
s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
else
s << "]"
prev_r = r
end
- s << make_interdomain_sequence( own.qlen - prev_r.env_from, false )
+ s << make_interdomain_sequence( own.qlen - prev_r.env_to, false )
puts s
end
-
def calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
linkers = ""
prev_r = nil
end
-
def print_help()
puts( "Usage:" )
puts()
puts( " -" + IGNORE_DUF_OPTION + ": ignore DUFs" )
puts( " -" + FS_E_VALUE_THRESHOLD_OPTION + ": E-value threshold for full protein sequences, only for protein summary" )
puts( " -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
+ puts( " -" + SPECIES_OPTION + ": species for protein summary" )
puts()
end