#
# last modified: 121003
+require 'set'
+
require 'lib/evo/util/constants'
require 'lib/evo/util/util'
require 'lib/evo/util/command_line_arguments'
require 'lib/evo/io/parser/hmmscan_parser'
-require 'lib/evo/io/parser/uniprot_parser'
+require 'lib/evo/io/web/uniprotkb'
module Evoruby
PRG_NAME = "hsp"
PRG_VERSION = "2.000"
PRG_DESC = "hmmscan summary"
- PRG_DATE = "2012.10.19"
+ PRG_DATE = "2012.10.23"
COPYRIGHT = "2012 Christian M Zmasek"
CONTACT = "phylosoft@gmail.com"
WWW = "www.phylosoft.org"
DELIMITER_OPTION = "d"
+ SPECIES_OPTION = "s"
I_E_VALUE_THRESHOLD_OPTION = "ie"
FS_E_VALUE_THRESHOLD_OPTION = "pe"
HMM_FOR_PROTEIN_OUTPUT = "m"
def run
+
+
Util.print_program_information( PRG_NAME,
PRG_VERSION,
PRG_DESC,
end
if ( cla.is_option_set?( HELP_OPTION_1 ) ||
- cla.is_option_set?( HELP_OPTION_2 ) )
+ cla.is_option_set?( HELP_OPTION_2 ) )
print_help
exit( 0 )
end
allowed_opts.push( IGNORE_DUF_OPTION )
allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
- allowed_opts.push( UNIPROT )
+ allowed_opts.push( UNIPROT )
+ allowed_opts.push( SPECIES_OPTION )
disallowed = cla.validate_allowed_options_as_str( allowed_opts )
if ( disallowed.length > 0 )
end
end
+
+
fs_e_value_threshold = -1.0
if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
begin
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
end
-
+
uniprot = ""
- if ( cla.is_option_set?( UNIPROT ) )
+ if ( cla.is_option_set?( UNIPROT ) )
+ begin
+ uniprot = cla.get_option_value( UNIPROT )
+ rescue ArgumentError => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ end
+ end
+
+ species = "HUMAN"
+ if ( cla.is_option_set?( SPECIES_OPTION ) )
begin
- uniprot = cla.get_option_value( UNIPROT )
+ species = cla.get_option_value( SPECIES_OPTION )
rescue ArgumentError => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
puts()
puts( "hmmpfam outputfile : " + inpath )
puts( "outputfile : " + outpath )
+ puts( "species : " + species )
if ( i_e_value_threshold >= 0.0 )
puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
else
else
puts( "column delimiter : " + column_delimiter )
end
- if fs_e_value_threshold >= 0.0
+ if fs_e_value_threshold >= 0.0
puts( "E-value threshold : " + fs_e_value_threshold.to_s )
else
puts( "E-value threshold : no threshold" )
end
- if !hmm_for_protein_output.empty?
+ if !hmm_for_protein_output.empty?
puts( "HMM for proteins : " + hmm_for_protein_output )
end
- if !uniprot.empty?
+ if !uniprot.empty?
puts( "Uniprot : " + uniprot )
end
puts()
parse_descriptions,
fs_e_value_threshold,
hmm_for_protein_output,
- uniprot )
- rescue ArgumentError, IOError => e
+ uniprot,
+ species )
+ rescue IOError => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
domain_counts = get_domain_counts()
get_descriptions,
fs_e_value_threshold,
hmm_for_protein_output,
- uniprot )
+ uniprot,
+ species )
+
+
+
Util.check_file_for_readability( inpath )
Util.check_file_for_writability( outpath )
hmmscan_parser = HmmscanParser.new( inpath )
results = hmmscan_parser.parse
-
- uniprot_entries = nil
- if !uniprot.empty?
- uniprot_entries = read_uniprot( results, uniprot )
- end
-
+
outfile = File.open( outpath, "a" )
query = ""
hmmscan_results_per_protein = []
-
+
prev_query = ""
env_to = r.env_to
if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
- ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
+ ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
count_model( model )
outfile.print( query +
- column_delimiter )
+ column_delimiter )
if ( get_descriptions )
outfile.print( desc +
- column_delimiter )
+ column_delimiter )
end
outfile.print( model +
- column_delimiter +
- env_from.to_s +
- column_delimiter +
- env_to.to_s +
- column_delimiter +
- i_e_value.to_s )
+ column_delimiter +
+ env_from.to_s +
+ column_delimiter +
+ env_to.to_s +
+ column_delimiter +
+ i_e_value.to_s )
outfile.print( Constants::LINE_DELIMITER )
end
process_hmmscan_results_per_protein( hmmscan_results_per_protein,
fs_e_value_threshold,
hmm_for_protein_output,
- i_e_value_threshold )
+ i_e_value_threshold,
+ false,
+ species )
end
hmmscan_results_per_protein.clear
end
end
end
end
- if !hmm_for_protein_output.empty?
- if !hmmscan_results_per_protein.empty?
- process_hmmscan_results_per_protein( hmmscan_results_per_protein,
- fs_e_value_threshold,
- hmm_for_protein_output,
- i_e_value_threshold )
- end
+
+ if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty?
+ process_hmmscan_results_per_protein( hmmscan_results_per_protein,
+ fs_e_value_threshold,
+ hmm_for_protein_output,
+ i_e_value_threshold,
+ false,
+ species )
end
+
outfile.flush()
outfile.close()
end # def parse
-
- def read_uniprot( hmmscan_results, uniprot )
- ids = []
- hmmscan_results.each do | r |
- ids << r.query
- end
- uniprot_parser = UniprotParser.new uniprot
- uniprot_entries = uniprot_parser.parse ids
- uniprot_entries
+ def process_id( id )
+ if id =~ /(sp|tr)\|\S+\|(\S+)/
+ id = $2
end
-
+ id
+ end
+
+
+
def count_model( model )
if ( @domain_counts.has_key?( model ) )
count = @domain_counts[ model ].to_i
def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
fs_e_value_threshold,
hmm_for_protein_output,
- i_e_value_threshold )
+ i_e_value_threshold,
+ uniprotkb,
+ species )
+
+
dc = 0
# filter according to i-Evalue threshold
hmmscan_results_per_protein_filtered = []
hmmscan_results_per_protein.each do | r |
+
+ puts r.model
+ puts r.fs_e_value
+
if r.model == hmm_for_protein_output
if r.fs_e_value > fs_e_value_threshold
return
s = ""
s << own.query + "\t"
- s << "HUMAN" + "\t"
+ s << species + "\t"
s << own.fs_e_value.to_s + "\t"
s << own.qlen.to_s + "\t"
s << dc.to_s + "\t"
s << r.model + " "
end
s << "\t"
+ puts s
+ #e = UniprotKB::get_entry_by_id( process_id( own.query ) )
+
+ #if e != nil
+ # s << uniprot_annotation( e )
+ # # s << "\uniprot_annotationt"
+ #end
overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
- s << overview + "\t"
+ s << overview + "\t"
s << calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) + "\t"
puts s
end
+ def uniprot_annotation( e )
+ s = ""
+ pdb_ids = e.get_pdb_ids
+ if !pdb_ids.empty?
+ pdb_ids.each do | pdb |
+ s << pdb << ", "
+ end
+ else
+ s << "-"
+ end
+ s
+ end
def calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
linkers = ""