2 # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
4 # Copyright:: Copyright (C) 2012 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
12 require 'lib/evo/util/constants'
13 require 'lib/evo/util/util'
14 require 'lib/evo/util/command_line_arguments'
15 require 'lib/evo/io/parser/hmmscan_parser'
16 require 'lib/evo/msa/msa'
17 require 'lib/evo/msa/msa_factory'
18 require 'lib/evo/io/msa_io'
19 require 'lib/evo/io/parser/fasta_parser'
20 require 'lib/evo/io/writer/fasta_writer'
28 PRG_DESC = "hmmscan analysis"
30 COPYRIGHT = "2013 Christian M Zmasek"
31 CONTACT = "phyloxml@gmail.com"
32 WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
34 I_E_VALUE_THRESHOLD_OPTION = "ie"
35 FS_E_VALUE_THRESHOLD_OPTION = "pe"
38 HELP_OPTION_1 = "help"
41 USE_AVOID_HMMS = false
42 AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
43 LIMIT_FOR_CLOSE_DOMAINS = 20
48 Util.print_program_information( PRG_NAME,
58 cla = CommandLineArguments.new( ARGV )
59 rescue ArgumentError => e
60 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
63 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
64 cla.is_option_set?( HELP_OPTION_2 ) )
69 if ( cla.get_number_of_files != 1 && cla.get_number_of_files != 3 )
74 allowed_opts = Array.new
75 allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
76 allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
77 allowed_opts.push( TARGET_MODELS )
78 allowed_opts.push( EXTRACTION )
80 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
81 if ( disallowed.length > 0 )
82 Util.fatal_error( PRG_NAME,
83 "unknown option(s): " + disallowed,
87 inpath = cla.get_file_name( 0 )
88 Util.check_file_for_readability( inpath )
90 extraction_output = nil
92 if ( cla.get_number_of_files == 3 )
93 seq_file_path = cla.get_file_name( 1 )
94 Util.check_file_for_readability( seq_file_path )
95 extraction_output = cla.get_file_name( 2 )
96 if File.exist?( extraction_output )
97 Util.fatal_error( PRG_NAME, "error: [#{extraction_output}] already exists" )
103 if seq_file_path != nil
104 msa = read_fasta_file( seq_file_path )
107 i_e_value_threshold = -1
108 if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
110 i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
111 rescue ArgumentError => e
112 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
114 if ( i_e_value_threshold < 0.0 )
115 Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
119 fs_e_value_threshold = -1
120 if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
122 fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
123 rescue ArgumentError => e
124 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
126 if ( fs_e_value_threshold < 0.0 )
127 Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
132 if ( cla.is_option_set?( TARGET_MODELS ) )
134 hmm_for_protein_output = cla.get_option_value( TARGET_MODELS )
135 target_models = hmm_for_protein_output.split( "/" );
136 rescue ArgumentError => e
137 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
142 if ( cla.is_option_set?( EXTRACTION ) )
144 hmm_for_protein_output = cla.get_option_value( EXTRACTION )
145 x_models = hmm_for_protein_output.split( "~" );
146 rescue ArgumentError => e
147 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
156 fs_e_value_threshold,
162 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
169 def read_fasta_file( input )
173 msa = f.create_msa_from_file( input, FastaParser.new() )
174 rescue Exception => e
175 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
181 # raises ArgumentError, IOError
184 fs_e_value_threshold,
190 extraction_output_file = nil
191 if extraction_output != nil
192 extraction_output_file = File.open( extraction_output, "a" )
195 hmmscan_parser = HmmscanParser.new( inpath )
197 results = hmmscan_parser.parse
199 hmmscan_results_per_protein = []
203 results.each do | r |
205 if !prev_query.empty? && prev_query != query
206 if !hmmscan_results_per_protein.empty?
207 process_hmmscan_results_per_protein( hmmscan_results_per_protein,
208 fs_e_value_threshold,
213 extraction_output_file )
215 hmmscan_results_per_protein.clear
220 if !AVOID_HHMS.include? r.model
221 hmmscan_results_per_protein << r
224 hmmscan_results_per_protein << r
229 if !hmmscan_results_per_protein.empty?
230 process_hmmscan_results_per_protein( hmmscan_results_per_protein,
231 fs_e_value_threshold,
236 extraction_output_file )
238 if extraction_output_file != nil
239 extraction_output_file.close
245 def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
246 fs_e_value_threshold,
251 extraction_output_file )
253 raise StandardError, "target hmms is empty" if target_hmms.length < 1
254 raise StandardError, "results is empty" if hmmscan_results_per_protein.length < 1
256 # filter according to i-Evalue threshold
257 # abort if fs Evalue too high
259 if fs_e_value_threshold >= 0.0
260 hmmscan_results_per_protein.each do | r |
261 target_hmms.each do | hmm |
262 if r.model == hmm && r.fs_e_value > fs_e_value_threshold
270 hmmscan_results_per_protein_filtered = []
273 hmmscan_results_per_protein.each do | r |
274 if i_e_value_threshold < 0 || r.i_e_value <= i_e_value_threshold
275 hmmscan_results_per_protein_filtered << r
276 target_hmms.each do | hmm |
286 if matched.length < target_hmms.length
289 if hmmscan_results_per_protein_filtered.length < 1
293 hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
296 target_hmms.each do | hmm |
297 hmmscan_results_per_protein_filtered.each do | r |
312 raise StandardError, "failed sanity check" if query != own.query || qlen != own.qlen
313 raise StandardError, "failed sanity check: qlen != own.qlen" if qlen != own.qlen
318 s << "species" + "\t"
320 s << own.fs_e_value.to_s + "\t"
323 s << qlen.to_s + "\t"
325 s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
326 hmmscan_results_per_protein_filtered.each do | r |
330 s << make_overview_da( hmmscan_results_per_protein_filtered )
332 s << make_detailed_da( hmmscan_results_per_protein_filtered, qlen )
334 if x_models != nil && x_models.length > 0
335 extract_linkers( hmmscan_results_per_protein_filtered, x_models, query, "lizrd", msa, extraction_output_file )
339 def extract_linkers( hmmscan_results_per_protein_filtered, x_models, seq, extraction_output_file )
340 raise StandardError, "extraction output file is nil" if extraction_output_file == nil
342 hmmscan_results_per_protein_filtered.each do | r |
344 if ( x_models.length == 2 && prev_r.model == x_models[ 0 ] && r.model == x_models[ 1 ] )
345 extract_linker( prev_r.env_to, r.env_from, seq, extraction_output_file )
353 def get_sequence( query, msa )
355 indices = msa.find_by_name_start( query , true )
356 if indices.length != 1
357 if query[ -1, 1 ] == "|"
360 seq = msa.get_by_name_pattern( /\b#{Regexp.quote(query)}\b/ )
362 seq = msa.get_sequence( indices[ 0 ] )
368 def extract_linker( first, last , seq, output_file )
370 if ( last - first >= 1 )
371 indices = msa.find_by_name_start( query , true )
372 if indices.length != 1
373 if query[ -1, 1 ] == "|"
376 seq = msa.get_by_name_pattern( /\b#{Regexp.quote(query)}\b/ )
378 seq = msa.get_sequence( indices[ 0 ] )
381 if seq.get_name =~ /\[([A-Z0-9]{3,5})\]/
385 output_file.print( ">" + seq.get_name + " [" + first.to_s + "-" + last.to_s + "]" + "\n")
386 output_file.print( seq.get_subsequence( first - 1 , last - 1 ).get_sequence_as_string + "\n" )
390 def make_detailed_da( hmmscan_results_per_protein_filtered , qlen )
393 hmmscan_results_per_protein_filtered.each do | r |
395 s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
397 s << make_interdomain_sequence( r.env_from, false )
401 s << r.env_from.to_s << "-" << r.env_to.to_s
402 s << " " << r.i_e_value.to_s
406 s << make_interdomain_sequence( qlen - prev_r.env_to, false )
410 def make_overview_da( hmmscan_results_per_protein_filtered )
413 hmmscan_results_per_protein_filtered.each do | r |
417 if ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
418 overview << "~" << r.model
420 overview << "----" << r.model
433 def calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
436 hmmscan_results_per_protein_filtered.each do | r |
437 if r.model == hmm_for_protein_output
439 linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
448 def make_interdomain_sequence( d, mark_short = true )
467 puts( " " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
469 puts( " options: -" + I_E_VALUE_THRESHOLD_OPTION + ": i-E-value threshold, default is no threshold" )
470 puts( " -" + FS_E_VALUE_THRESHOLD_OPTION + ": E-value threshold for full protein sequences, only for protein summary" )
471 puts( " -" + TARGET_MODELS + ": target HMMs" )