2 # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
4 # Copyright:: Copyright (C) 2012 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
12 require 'lib/evo/util/constants'
13 require 'lib/evo/util/util'
14 require 'lib/evo/util/command_line_arguments'
15 require 'lib/evo/io/parser/hmmscan_parser'
16 require 'lib/evo/msa/msa'
17 require 'lib/evo/msa/msa_factory'
18 require 'lib/evo/io/msa_io'
19 require 'lib/evo/io/parser/fasta_parser'
20 require 'lib/evo/io/writer/fasta_writer'
# Program metadata strings.
# NOTE(review): presumably shown in a program banner together with PRG_NAME
# (the only visible call is commented out) -- confirm.
28 PRG_DESC = "hmmscan summary"
30 COPYRIGHT = "2013 Christian M Zmasek"
31 CONTACT = "phyloxml@gmail.com"
32 WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
# Command-line option names (shown with a leading "-" in the usage text).
# "ie": i-E-value threshold; read as a float, negative values are rejected;
#       when the option is not given, no threshold is applied.
35 I_E_VALUE_THRESHOLD_OPTION = "ie"
# "pe": E-value threshold for full protein sequences (protein summary only);
#       read as a float, negative values are rejected.
36 FS_E_VALUE_THRESHOLD_OPTION = "pe"
# "m": HMM name(s) for the protein summary; the option value is split on "/",
#      so several HMM names can be passed in one argument.
37 HMM_FOR_PROTEIN_OUTPUT = "m"
# "help": one of two options that trigger the help output.
38 HELP_OPTION_1 = "help"
# NOTE(review): presumably switches the AVOID_HHMS filter on/off; the line that
# reads this flag is not visible here -- confirm.
41 USE_AVOID_HMMS = false
# Model names checked with AVOID_HHMS.include?( r.model ) while collecting
# per-protein results; a result is kept on that path only if its model is NOT
# listed here. (The "HHMS" spelling is the name used by the code; do not rename
# without updating its users.)
42 AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
# Maximum gap (r.env_from - prev_r.env_to - 1) for which two consecutive domains
# are joined with "~" in the overview string; larger gaps are joined with "----".
43 LIMIT_FOR_CLOSE_DOMAINS = 20
46 @domain_counts = Hash.new
51 # Util.print_program_information( PRG_NAME,
61 cla = CommandLineArguments.new( ARGV )
62 rescue ArgumentError => e
63 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
66 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
67 cla.is_option_set?( HELP_OPTION_2 ) )
72 if ( cla.get_number_of_files != 1 && cla.get_number_of_files != 2 )
77 allowed_opts = Array.new
78 allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
79 allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
80 allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
81 allowed_opts.push( SPECIES_OPTION )
83 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
84 if ( disallowed.length > 0 )
85 Util.fatal_error( PRG_NAME,
86 "unknown option(s): " + disallowed,
90 inpath = cla.get_file_name( 0 )
93 if ( cla.get_number_of_files == 2 )
94 seq_file_path = cla.get_file_name( 1 )
98 if seq_file_path != nil
99 msa = read_fasta_file(seq_file_path )
102 i_e_value_threshold = -1
103 if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
105 i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
106 rescue ArgumentError => e
107 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
109 if ( i_e_value_threshold < 0.0 )
110 Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
114 fs_e_value_threshold = -1
115 if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
117 fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
118 rescue ArgumentError => e
119 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
121 if ( fs_e_value_threshold < 0.0 )
122 Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
126 hmm_for_protein_outputs = []
127 if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
129 hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
130 hmm_for_protein_outputs = hmm_for_protein_output.split( "/" );
131 rescue ArgumentError => e
132 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
137 if ( cla.is_option_set?( SPECIES_OPTION ) )
139 species = cla.get_option_value( SPECIES_OPTION )
140 rescue ArgumentError => e
141 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
148 fs_e_value_threshold,
149 hmm_for_protein_outputs,
153 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
160 def read_fasta_file( input )
164 msa = f.create_msa_from_file( input, FastaParser.new() )
165 rescue Exception => e
166 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
172 # raises ArgumentError, IOError
175 fs_e_value_threshold,
176 hmm_for_protein_outputs,
180 Util.check_file_for_readability( inpath )
182 hmmscan_parser = HmmscanParser.new( inpath )
183 results = hmmscan_parser.parse
185 hmmscan_results_per_protein = []
190 results.each do | r |
194 if !prev_query.empty? && prev_query != query
195 if !hmmscan_results_per_protein.empty?
196 process_hmmscan_results_per_protein( hmmscan_results_per_protein,
197 fs_e_value_threshold,
198 hmm_for_protein_outputs,
203 hmmscan_results_per_protein.clear
208 if !AVOID_HHMS.include? r.model
209 hmmscan_results_per_protein << r
212 hmmscan_results_per_protein << r
217 if !hmm_for_protein_outputs.empty? && !hmmscan_results_per_protein.empty?
218 process_hmmscan_results_per_protein( hmmscan_results_per_protein,
219 fs_e_value_threshold,
220 hmm_for_protein_outputs,
230 if id =~ /(sp|tr)\|\S+\|(\S+)/
238 def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
239 fs_e_value_threshold,
245 raise StandardError, "target hmms is empty" if target_hmms.length < 1
246 raise StandardError, "results is empty" if hmmscan_results_per_protein.length < 1
248 # filter according to i-Evalue threshold
249 # abort if fs Evalue too high
252 if fs_e_value_threshold >= 0.0
253 hmmscan_results_per_protein.each do | r |
254 target_hmms.each do | hmm |
255 if r.model == hmm && r.fs_e_value > fs_e_value_threshold
263 hmmscan_results_per_protein_filtered = []
266 hmmscan_results_per_protein.each do | r |
267 if i_e_value_threshold < 0 || r.i_e_value <= i_e_value_threshold
268 hmmscan_results_per_protein_filtered << r
269 target_hmms.each do | hmm |
280 if matched.length < target_hmms.length
283 if hmmscan_results_per_protein_filtered.length < 1
287 hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
290 target_hmms.each do | hmm |
291 hmmscan_results_per_protein_filtered.each do | r |
310 raise StandardError, "failed sanity check" if query != own.query || qlen != own.qlen
311 raise StandardError, "failed sanity check: qlen != own.qlen" if qlen != own.qlen
318 s << own.fs_e_value.to_s + "\t"
321 s << qlen.to_s + "\t"
323 s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
324 hmmscan_results_per_protein_filtered.each do | r |
328 s << make_overview_da( hmmscan_results_per_protein_filtered )
331 hmmscan_results_per_protein_filtered.each do | r |
333 s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
334 if ( target_hmms.length == 2 && prev_r.model == target_hmms[ 0 ] && r.model == target_hmms[ 1 ] )
335 extract_linker( prev_r.env_to, r.env_from, query, msa )
338 s << make_interdomain_sequence( r.env_from, false )
342 s << r.env_from.to_s << "-" << r.env_to.to_s
343 s << " " << r.i_e_value.to_s
347 s << make_interdomain_sequence( qlen - prev_r.env_to, false )
352 def extract_linkers( hmmscan_results_per_protein_filtered, target_hmms, query, msa )
354 hmmscan_results_per_protein_filtered.each do | r |
356 if ( target_hmms.length == 2 && prev_r.model == target_hmms[ 0 ] && r.model == target_hmms[ 1 ] )
357 extract_linker( prev_r.env_to, r.env_from, query, msa )
364 def extract_linker( first, last , query , msa)
365 if ( last - first >= 1 )
366 seq = msa.get_by_name_pattern( /\b#{Regexp.quote(query)}\b/ )
367 linker = seq.get_subsequence( first - 1 , last - 1 )
368 linker.get_sequence_as_string
372 def make_detailed_da( hmmscan_results_per_protein_filtered )
375 hmmscan_results_per_protein_filtered.each do | r |
377 s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
379 s << make_interdomain_sequence( r.env_from, false )
383 s << r.env_from.to_s << "-" << r.env_to.to_s
384 s << " " << r.i_e_value.to_s
388 s << make_interdomain_sequence( qlen - prev_r.env_to, false )
392 def make_overview_da( hmmscan_results_per_protein_filtered )
395 hmmscan_results_per_protein_filtered.each do | r |
400 if ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
401 overview << "~" << r.model
403 overview << "----" << r.model
416 def calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
419 hmmscan_results_per_protein_filtered.each do | r |
420 if r.model == hmm_for_protein_output
422 linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
431 def make_interdomain_sequence( d, mark_short = true )
450 puts( " " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
452 puts( " options: -" + I_E_VALUE_THRESHOLD_OPTION + ": i-E-value threshold, default is no threshold" )
453 puts( " -" + FS_E_VALUE_THRESHOLD_OPTION + ": E-value threshold for full protein sequences, only for protein summary" )
454 puts( " -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
455 puts( " -" + SPECIES_OPTION + ": species for protein summary" )