#
# $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
#
-# last modified: 121003
require 'set'
class HmmscanSummary
PRG_NAME = "hsp"
- PRG_VERSION = "2.000"
+ PRG_VERSION = "2.001"
PRG_DESC = "hmmscan summary"
- PRG_DATE = "2012.10.23"
- COPYRIGHT = "2012 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
+ PRG_DATE = "2013.10.23"
+ COPYRIGHT = "2013 Christian M Zmasek"
+ CONTACT = "phyloxml@gmail.com"
+ WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
DELIMITER_OPTION = "d"
SPECIES_OPTION = "s"
def run
-
-
- Util.print_program_information( PRG_NAME,
- PRG_VERSION,
- PRG_DESC,
- PRG_DATE,
- COPYRIGHT,
- CONTACT,
- WWW,
- STDOUT )
+ # Util.print_program_information( PRG_NAME,
+ # PRG_VERSION,
+ # PRG_DESC,
+ # PRG_DATE,
+ # COPYRIGHT,
+ # CONTACT,
+ # WWW,
+ # STDOUT )
begin
cla = CommandLineArguments.new( ARGV )
end
if ( cla.is_option_set?( HELP_OPTION_1 ) ||
- cla.is_option_set?( HELP_OPTION_2 ) )
+ cla.is_option_set?( HELP_OPTION_2 ) )
print_help
exit( 0 )
end
end
end
-
+
fs_e_value_threshold = -1.0
if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
end
species = "HUMAN"
- if ( cla.is_option_set?( SPECIES_OPTION ) )
+ if ( cla.is_option_set?( SPECIES_OPTION ) )
begin
species = cla.get_option_value( SPECIES_OPTION )
rescue ArgumentError => e
parse_descriptions = true
end
- puts()
- puts( "hmmpfam outputfile : " + inpath )
- puts( "outputfile : " + outpath )
- puts( "species : " + species )
- if ( i_e_value_threshold >= 0.0 )
- puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
- else
- puts( "i-E-value threshold : no threshold" )
- end
- if ( parse_descriptions )
- puts( "parse descriptions : true" )
- else
- puts( "parse descriptions : false" )
- end
- if ( ignore_dufs )
- puts( "ignore DUFs : true" )
- else
- puts( "ignore DUFs : false" )
- end
- if ( column_delimiter == "\t" )
- puts( "column delimiter : TAB" )
- else
- puts( "column delimiter : " + column_delimiter )
- end
- if fs_e_value_threshold >= 0.0
- puts( "E-value threshold : " + fs_e_value_threshold.to_s )
- else
- puts( "E-value threshold : no threshold" )
- end
- if !hmm_for_protein_output.empty?
- puts( "HMM for proteins : " + hmm_for_protein_output )
- end
- if !uniprot.empty?
- puts( "Uniprot : " + uniprot )
- end
- puts()
+# puts()
+# puts( "hmmpfam outputfile : " + inpath )
+# puts( "outputfile : " + outpath )
+# puts( "species : " + species )
+# if ( i_e_value_threshold >= 0.0 )
+# puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
+# else
+# puts( "i-E-value threshold : no threshold" )
+# end
+# if ( parse_descriptions )
+# puts( "parse descriptions : true" )
+# else
+# puts( "parse descriptions : false" )
+# end
+# if ( ignore_dufs )
+# puts( "ignore DUFs : true" )
+# else
+# puts( "ignore DUFs : false" )
+# end
+# if ( column_delimiter == "\t" )
+# puts( "column delimiter : TAB" )
+# else
+# puts( "column delimiter : " + column_delimiter )
+# end
+# if fs_e_value_threshold >= 0.0
+# puts( "E-value threshold : " + fs_e_value_threshold.to_s )
+# else
+# puts( "E-value threshold : no threshold" )
+# end
+# if !hmm_for_protein_output.empty?
+# puts( "HMM for proteins : " + hmm_for_protein_output )
+# end
+# if !uniprot.empty?
+# puts( "Uniprot : " + uniprot )
+# end
+# puts()
begin
parse( inpath,
end
domain_counts = get_domain_counts()
-
- puts
- puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
- puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
- puts
- puts( Util.draw_histogram( domain_counts, "#" ) )
- puts
- Util.print_message( PRG_NAME, 'OK' )
- puts
+# puts
+# puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" )
+# puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
+# puts
+# puts( Util.draw_histogram( domain_counts, "#" ) )
+# puts
+# Util.print_message( PRG_NAME, 'OK' )
+# puts
end # def run
uniprot,
species )
-
-
Util.check_file_for_readability( inpath )
Util.check_file_for_writability( outpath )
hmmscan_results_per_protein = []
-
-
prev_query = ""
results.each do | r |
env_to = r.env_to
if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
- ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
+ ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
count_model( model )
outfile.print( query +
- column_delimiter )
+ column_delimiter )
if ( get_descriptions )
outfile.print( desc +
- column_delimiter )
+ column_delimiter )
end
outfile.print( model +
- column_delimiter +
- env_from.to_s +
- column_delimiter +
- env_to.to_s +
- column_delimiter +
- i_e_value.to_s )
+ column_delimiter +
+ env_from.to_s +
+ column_delimiter +
+ env_to.to_s +
+ column_delimiter +
+ i_e_value.to_s )
outfile.print( Constants::LINE_DELIMITER )
end
fs_e_value_threshold,
hmm_for_protein_output,
i_e_value_threshold,
- false,
+ uniprot,
species )
end
hmmscan_results_per_protein.clear
end
end
end
-
+
if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty?
process_hmmscan_results_per_protein( hmmscan_results_per_protein,
fs_e_value_threshold,
hmm_for_protein_output,
i_e_value_threshold,
- false,
+ uniprot,
species )
end
outfile.flush()
outfile.close()
-
end # def parse
def process_id( id )
id
end
-
-
def count_model( model )
if ( @domain_counts.has_key?( model ) )
count = @domain_counts[ model ].to_i
hmm_for_protein_output,
i_e_value_threshold,
uniprotkb,
- species )
-
-
+ species )
dc = 0
# filter according to i-Evalue threshold
hmmscan_results_per_protein_filtered = []
hmmscan_results_per_protein.each do | r |
-
- puts r
-
+
+
if r.model == hmm_for_protein_output
- if r.fs_e_value > fs_e_value_threshold
+ if fs_e_value_threshold > 0.0 && r.fs_e_value > fs_e_value_threshold
return
end
end
- if r.i_e_value <= i_e_value_threshold
+ if i_e_value_threshold <= 0 || r.i_e_value <= i_e_value_threshold
hmmscan_results_per_protein_filtered << r
if r.model == hmm_for_protein_output
dc += 1
s << r.model + " "
end
s << "\t"
- puts s
- #e = UniprotKB::get_entry_by_id( process_id( own.query ) )
- #if e != nil
- # s << uniprot_annotation( e )
- # # s << "\uniprot_annotationt"
- #end
+ if !uniprotkb.empty?
+ #e = UniprotKB::get_entry_by_id( process_id( own.query ) )
+
+ #if e != nil
+ # s << uniprot_annotation( e )
+ # # s << "\uniprot_annotationt"
+ #end
+ end
overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
end
-
def print_help()
puts( "Usage:" )
puts()
puts( " -" + IGNORE_DUF_OPTION + ": ignore DUFs" )
puts( " -" + FS_E_VALUE_THRESHOLD_OPTION + ": E-value threshold for full protein sequences, only for protein summary" )
puts( " -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
+ puts( " -" + SPECIES_OPTION + ": species for protein summary" )
puts()
end