2 # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
4 # Copyright:: Copyright (C) 2012 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
12 require 'lib/evo/util/constants'
13 require 'lib/evo/util/util'
14 require 'lib/evo/util/command_line_arguments'
15 require 'lib/evo/io/parser/hmmscan_parser'
16 require 'lib/evo/msa/msa'
17 require 'lib/evo/msa/msa_factory'
18 require 'lib/evo/io/msa_io'
19 require 'lib/evo/io/parser/fasta_parser'
20 require 'lib/evo/io/writer/fasta_writer'
28 PRG_DESC = "hmmscan summary"
29 PRG_DATE = "2013.10.23"
30 COPYRIGHT = "2013 Christian M Zmasek"
31 CONTACT = "phyloxml@gmail.com"
32 WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
34 DELIMITER_OPTION = "d"
36 I_E_VALUE_THRESHOLD_OPTION = "ie"
37 FS_E_VALUE_THRESHOLD_OPTION = "pe"
38 HMM_FOR_PROTEIN_OUTPUT = "m"
39 IGNORE_DUF_OPTION = "i"
40 PARSE_OUT_DESCRIPITION_OPTION = "a"
41 HELP_OPTION_1 = "help"
44 USE_AVOID_HMMS = false
45 AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
46 LIMIT_FOR_CLOSE_DOMAINS = 20
49 @domain_counts = Hash.new
54 # Util.print_program_information( PRG_NAME,
64 cla = CommandLineArguments.new( ARGV )
65 rescue ArgumentError => e
66 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
69 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
70 cla.is_option_set?( HELP_OPTION_2 ) )
75 if ( cla.get_number_of_files != 1 && cla.get_number_of_files != 2 )
80 allowed_opts = Array.new
81 allowed_opts.push( DELIMITER_OPTION )
82 allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
83 allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
84 allowed_opts.push( IGNORE_DUF_OPTION )
85 allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
86 allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
87 allowed_opts.push( SPECIES_OPTION )
89 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
90 if ( disallowed.length > 0 )
91 Util.fatal_error( PRG_NAME,
92 "unknown option(s): " + disallowed,
96 inpath = cla.get_file_name( 0 )
99 if ( cla.get_number_of_files == 2 )
100 seq_file_path = cla.get_file_name( 1 )
104 if seq_file_path != nil
105 msa = read_fasta_file(seq_file_path )
108 column_delimiter = "\t"
109 if ( cla.is_option_set?( DELIMITER_OPTION ) )
111 column_delimiter = cla.get_option_value( DELIMITER_OPTION )
112 rescue ArgumentError => e
113 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
117 i_e_value_threshold = -1
118 if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
120 i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
121 rescue ArgumentError => e
122 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
124 if ( i_e_value_threshold < 0.0 )
125 Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
129 fs_e_value_threshold = -1
130 if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
132 fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
133 rescue ArgumentError => e
134 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
136 if ( fs_e_value_threshold < 0.0 )
137 Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
141 hmm_for_protein_outputs = []
142 if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
144 hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
145 hmm_for_protein_outputs = hmm_for_protein_output.split( "~" );
146 rescue ArgumentError => e
147 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
152 if ( cla.is_option_set?( SPECIES_OPTION ) )
154 species = cla.get_option_value( SPECIES_OPTION )
155 rescue ArgumentError => e
156 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
161 if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
165 parse_descriptions = false
166 if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
167 parse_descriptions = true
177 fs_e_value_threshold,
178 hmm_for_protein_outputs,
182 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
189 def read_fasta_file( input )
193 msa = f.create_msa_from_file( input, FastaParser.new() )
194 rescue Exception => e
195 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
201 # raises ArgumentError, IOError
207 fs_e_value_threshold,
208 hmm_for_protein_outputs,
212 Util.check_file_for_readability( inpath )
214 hmmscan_parser = HmmscanParser.new( inpath )
215 results = hmmscan_parser.parse
225 hmmscan_results_per_protein = []
229 results.each do | r |
232 i_e_value = r.i_e_value
233 env_from = r.env_from
237 if !prev_query.empty? && prev_query != query
238 if !hmmscan_results_per_protein.empty?
239 process_hmmscan_results_per_protein( hmmscan_results_per_protein,
240 fs_e_value_threshold,
241 hmm_for_protein_outputs,
246 hmmscan_results_per_protein.clear
251 if !AVOID_HHMS.include? r.model
252 hmmscan_results_per_protein << r
255 hmmscan_results_per_protein << r
260 if !hmm_for_protein_outputs.empty? && !hmmscan_results_per_protein.empty?
261 process_hmmscan_results_per_protein( hmmscan_results_per_protein,
262 fs_e_value_threshold,
263 hmm_for_protein_outputs,
273 if id =~ /(sp|tr)\|\S+\|(\S+)/
281 def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
282 fs_e_value_threshold,
288 raise StandardError, "target hmms is empty" if target_hmms.length < 1
289 raise StandardError, "results is empty" if hmmscan_results_per_protein.length < 1
291 # filter according to i-Evalue threshold
292 # abort if fs Evalue too high
295 if fs_e_value_threshold >= 0.0
296 hmmscan_results_per_protein.each do | r |
297 target_hmms.each do | hmm |
298 if r.model == hmm && r.fs_e_value > fs_e_value_threshold
306 hmmscan_results_per_protein_filtered = []
309 hmmscan_results_per_protein.each do | r |
310 if i_e_value_threshold < 0 || r.i_e_value <= i_e_value_threshold
311 hmmscan_results_per_protein_filtered << r
312 target_hmms.each do | hmm |
323 if matched.length < target_hmms.length
326 if hmmscan_results_per_protein_filtered.length < 1
330 hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
333 target_hmms.each do | hmm |
334 hmmscan_results_per_protein_filtered.each do | r |
345 s << own.query + "\t"
350 s << own.fs_e_value.to_s + "\t"
353 s << own.qlen.to_s + "\t" #TODO !
357 # s << dc.to_s + "\t"
359 s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
360 hmmscan_results_per_protein_filtered.each do | r |
367 # overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
369 # s << overview + "\t"
371 # s << calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) + "\t"
374 hmmscan_results_per_protein_filtered.each do | r |
377 s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
379 if ( target_hmms.length == 2 && prev_r.model == target_hmms[ 0 ] && r.model == target_hmms[ 1 ] )
381 linker( prev_r.env_to, r.env_from, query, msa )
386 s << make_interdomain_sequence( r.env_from, false )
391 s << r.env_from.to_s << "-" << r.env_to.to_s
392 s << " " << r.i_e_value.to_s
396 # s << make_interdomain_sequence( own.qlen - prev_r.env_from, false )
401 def linker( first, last , query , msa)
402 puts first.to_s + "-" + last.to_s
403 if ( last - first >= 1 )
404 seq = msa.get_by_name( query, true, false )
405 linker = seq.get_subsequence( first -1 , last - 1 )
406 puts linker.get_sequence_as_string
412 def calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
415 hmmscan_results_per_protein_filtered.each do | r |
416 if r.model == hmm_for_protein_output
418 linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
428 def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
431 hmmscan_results_per_protein_filtered.each do | r |
432 if r.model == hmm_for_protein_output
434 overview << hmm_for_protein_output
436 if ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
437 overview << "~" << hmm_for_protein_output
439 overview << "----" << hmm_for_protein_output
448 def make_interdomain_sequence( d, mark_short = true )
467 puts( " " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
469 puts( " options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
470 puts( " -" + I_E_VALUE_THRESHOLD_OPTION + ": i-E-value threshold, default is no threshold" )
471 puts( " -" + PARSE_OUT_DESCRIPITION_OPTION + ": parse query description (in addition to query name)" )
472 puts( " -" + IGNORE_DUF_OPTION + ": ignore DUFs" )
473 puts( " -" + FS_E_VALUE_THRESHOLD_OPTION + ": E-value threshold for full protein sequences, only for protein summary" )
474 puts( " -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
475 puts( " -" + SPECIES_OPTION + ": species for protein summary" )