2 # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
4 # Copyright:: Copyright (C) 2012 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
10 require 'lib/evo/util/constants'
11 require 'lib/evo/util/util'
12 require 'lib/evo/util/command_line_arguments'
13 require 'lib/evo/io/parser/hmmscan_parser'
14 require 'lib/evo/msa/msa'
15 require 'lib/evo/msa/msa_factory'
16 require 'lib/evo/io/msa_io'
17 require 'lib/evo/io/parser/fasta_parser'
25 PRG_DESC = "hmmscan analysis"
27 COPYRIGHT = "2014 Christian Zmasek"
28 CONTACT = "phyloxml@gmail.com"
29 WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
31 I_E_VALUE_THRESHOLD_OPTION = "ie"
32 FS_E_VALUE_THRESHOLD_OPTION = "pe"
35 HELP_OPTION_1 = "help"
39 AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
40 LIMIT_FOR_CLOSE_DOMAINS = 20
45 Util.print_program_information( PRG_NAME,
55 cla = CommandLineArguments.new( ARGV )
56 rescue ArgumentError => e
57 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
60 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
61 cla.is_option_set?( HELP_OPTION_2 ) )
66 if ( cla.get_number_of_files != 2 && cla.get_number_of_files != 3 )
71 allowed_opts = Array.new
72 allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
73 allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
74 allowed_opts.push( TARGET_MODELS )
75 allowed_opts.push( EXTRACTION )
77 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
78 if ( disallowed.length > 0 )
79 Util.fatal_error( PRG_NAME,
80 "unknown option(s): " + disallowed,
84 inpath = cla.get_file_name( 0 )
85 Util.check_file_for_readability( inpath )
87 seq_file_path = cla.get_file_name( 1 )
88 Util.check_file_for_readability( seq_file_path )
90 extraction_output = nil
91 if ( cla.get_number_of_files == 3 )
92 extraction_output = cla.get_file_name( 2 )
93 if File.exist?( extraction_output )
94 Util.fatal_error( PRG_NAME, "error: [#{extraction_output}] already exists" )
100 if seq_file_path != nil
101 msa = read_fasta_file( seq_file_path )
104 i_e_value_threshold = -1
105 if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
107 i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
108 rescue ArgumentError => e
109 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
111 if ( i_e_value_threshold < 0.0 )
112 Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
116 fs_e_value_threshold = -1
117 if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
119 fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
120 rescue ArgumentError => e
121 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
123 if ( fs_e_value_threshold < 0.0 )
124 Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
129 if ( cla.is_option_set?( TARGET_MODELS ) )
131 hmm_for_protein_output = cla.get_option_value( TARGET_MODELS )
132 target_models = hmm_for_protein_output.split( "/" )
133 rescue ArgumentError => e
134 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
139 if ( cla.is_option_set?( EXTRACTION ) )
141 x_models = cla.get_option_value( EXTRACTION ).split( "/" )
142 rescue ArgumentError => e
143 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
148 if i_e_value_threshold > 0
149 puts "i E-value threshold:" + "\t" + i_e_value_threshold.to_s
151 puts "i E-value threshold:" + "\t" + "n/a"
154 if fs_e_value_threshold > 0
155 puts "fs E-value threshold:" + "\t" + fs_e_value_threshold.to_s
157 puts "fs E-value threshold:" + "\t" + "n/a"
163 title << "SPECIES" + "\t"
164 target_models.each do | t |
167 title << "LENGTH" + "\t"
168 title << "#DOMAINS" + "\t"
169 title << "DOMAINS" + "\t"
170 unless x_models.empty?
171 title << "LINKERS" + "\t"
174 title << "DETAILED DA"
180 fs_e_value_threshold,
186 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
194 def read_fasta_file( input )
198 msa = f.create_msa_from_file( input, FastaParser.new() )
199 rescue Exception => e
200 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
206 # raises ArgumentError, IOError
209 fs_e_value_threshold,
215 extraction_output_file = nil
216 if extraction_output != nil
217 extraction_output_file = File.open( extraction_output, "a" )
220 hmmscan_parser = HmmscanParser.new( inpath )
222 results = hmmscan_parser.parse
224 hmmscan_results_per_protein = []
228 results.each do | r |
230 if !prev_query.empty? && prev_query != query
231 if !hmmscan_results_per_protein.empty?
232 process_hmmscan_results_per_protein( hmmscan_results_per_protein,
233 fs_e_value_threshold,
238 extraction_output_file )
240 hmmscan_results_per_protein.clear
245 if !AVOID_HHMS.include? r.model
246 hmmscan_results_per_protein << r
249 hmmscan_results_per_protein << r
254 if !hmmscan_results_per_protein.empty?
255 process_hmmscan_results_per_protein( hmmscan_results_per_protein,
256 fs_e_value_threshold,
261 extraction_output_file )
263 if extraction_output_file != nil
264 extraction_output_file.close
269 def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
270 fs_e_value_threshold,
275 extraction_output_file )
277 raise StandardError, "target hmms is empty" if target_hmms.length < 1
278 raise StandardError, "results is empty" if hmmscan_results_per_protein.length < 1
279 raise StandardError, "msa is empty" if ( msa == nil || msa.get_number_of_seqs < 1 )
282 # filter according to i-Evalue threshold
283 # abort if fs Evalue too high
285 if fs_e_value_threshold >= 0
286 hmmscan_results_per_protein.each do | r |
287 target_hmms.each do | hmm |
288 if r.model == hmm && r.fs_e_value > fs_e_value_threshold
295 hmmscan_results_per_protein_filtered = []
298 hmmscan_results_per_protein.each do | r |
299 if i_e_value_threshold < 0 || r.i_e_value <= i_e_value_threshold
300 hmmscan_results_per_protein_filtered << r
301 target_hmms.each do | hmm |
310 if matched.length < target_hmms.length
313 if hmmscan_results_per_protein_filtered.length < 1
317 hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
320 target_hmms.each do | hmm |
321 hmmscan_results_per_protein_filtered.each do | r |
336 raise StandardError, "failed sanity check" if query != own.query || qlen != own.qlen
337 raise StandardError, "failed sanity check: qlen != own.qlen" if qlen != own.qlen
343 seq = get_sequence( query, msa )
344 species = get_species( seq )
345 raise StandardError, "could not get species" if species == nil || species.empty?
346 if x_models != nil && !x_models.empty?
347 ll = extract_linker( hmmscan_results_per_protein_filtered, x_models, seq, extraction_output_file )
353 s << own.fs_e_value.to_s + "\t"
356 s << qlen.to_s + "\t"
358 s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
359 hmmscan_results_per_protein_filtered.each do | r |
370 s << make_overview_da( hmmscan_results_per_protein_filtered )
372 s << make_detailed_da( hmmscan_results_per_protein_filtered, qlen )
378 def extract_linker( hmmscan_results_per_protein_filtered, x_models, seq, extraction_output_file )
379 raise StandardError, "extraction output file is nil" if extraction_output_file == nil
381 hmmscan_results_per_protein_filtered.each do | r |
383 if ( x_models.length == 2 && prev_r.model == x_models[ 0 ] && r.model == x_models[ 1 ] )
384 ll = output_linker( prev_r.env_to, r.env_from, seq, extraction_output_file )
394 def get_sequence( query, msa )
396 indices = msa.find_by_name_start( query , true )
397 if indices.length != 1
398 if query[ -1, 1 ] == "|"
401 seq = msa.get_by_name_pattern( /\b#{Regexp.quote(query)}\b/ )
403 seq = msa.get_sequence( indices[ 0 ] )
409 def get_species( seq )
411 if seq.get_name =~ /\[([A-Z0-9]{3,5})\]/
418 def output_linker( first, last , seq, output_file )
419 if ( last - first >= 1 )
420 output_file.print( ">" + seq.get_name + " [" + first.to_s + "-" + last.to_s + "]" + "\n")
421 output_file.print( seq.get_subsequence( first - 1 , last - 1 ).get_sequence_as_string + "\n" )
428 def make_detailed_da( hmmscan_results_per_protein_filtered , qlen )
431 hmmscan_results_per_protein_filtered.each do | r |
433 s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
435 s << make_interdomain_sequence( r.env_from, false )
439 s << r.env_from.to_s << "-" << r.env_to.to_s
440 s << " " << r.i_e_value.to_s
444 s << make_interdomain_sequence( qlen - prev_r.env_to, false )
449 def make_overview_da( hmmscan_results_per_protein_filtered )
452 hmmscan_results_per_protein_filtered.each do | r |
456 if ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
457 overview << "~" << r.model
459 overview << "----" << r.model
469 def make_interdomain_sequence( d, mark_short = true )
488 puts " " + PRG_NAME + ".rb [options] <hmmscan outputfile> <sequences in fasta format> [outputfile]"
490 puts " options: -" + I_E_VALUE_THRESHOLD_OPTION + ": i-E-value threshold, default is no threshold"
491 puts " -" + FS_E_VALUE_THRESHOLD_OPTION + ": E-value threshold for full protein sequences, only for protein summary"
492 puts " -" + TARGET_MODELS + ": target HMMs (separated by /)"
493 puts " -" + EXTRACTION + ": to extract matching sequences to [outputfile]"