e0f9a86d61177c3ba1ce141b1dc14ba7d16c2b56
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / hmmscan_analysis.rb
1 #
2 # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
3 #
4 # Copyright::  Copyright (C) 2012 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
8 #
9
10 require 'set'
11
12 require 'lib/evo/util/constants'
13 require 'lib/evo/util/util'
14 require 'lib/evo/util/command_line_arguments'
15 require 'lib/evo/io/parser/hmmscan_parser'
16 require 'lib/evo/msa/msa'
17 require 'lib/evo/msa/msa_factory'
18 require 'lib/evo/io/msa_io'
19 require 'lib/evo/io/parser/fasta_parser'
20 require 'lib/evo/io/writer/fasta_writer'
21
22 module Evoruby
23
24   class HmmscanAnalysis
25
26     PRG_NAME       = "hsa"
27     PRG_VERSION    = "1.000"
28     PRG_DESC       = "hmmscan analysis"
29     PRG_DATE       = "130319"
30     COPYRIGHT      = "2013 Christian M Zmasek"
31     CONTACT        = "phyloxml@gmail.com"
32     WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
33
34     I_E_VALUE_THRESHOLD_OPTION    = "ie"
35     FS_E_VALUE_THRESHOLD_OPTION   = "pe"
36     TARGET_MODELS                 = "m"
37     EXTRACTION                    = "x"
38     HELP_OPTION_1                 = "help"
39     HELP_OPTION_2                 = "h"
40
41     USE_AVOID_HMMS = false
42     AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
43     LIMIT_FOR_CLOSE_DOMAINS = 20
44
45
46     def run
47
48       Util.print_program_information( PRG_NAME,
49         PRG_VERSION,
50         PRG_DESC,
51         PRG_DATE,
52         COPYRIGHT,
53         CONTACT,
54         WWW,
55         STDOUT )
56
57       begin
58         cla = CommandLineArguments.new( ARGV )
59       rescue ArgumentError => e
60         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
61       end
62
63       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
64            cla.is_option_set?( HELP_OPTION_2 ) )
65         print_help
66         exit( 0 )
67       end
68
69       if ( cla.get_number_of_files != 1 && cla.get_number_of_files != 3 )
70         print_help
71         exit( -1 )
72       end
73
74       allowed_opts = Array.new
75       allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
76       allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
77       allowed_opts.push( TARGET_MODELS )
78       allowed_opts.push( EXTRACTION )
79
80       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
81       if ( disallowed.length > 0 )
82         Util.fatal_error( PRG_NAME,
83           "unknown option(s): " + disallowed,
84           STDOUT )
85       end
86
87       inpath = cla.get_file_name( 0 )
88       Util.check_file_for_readability( inpath )
89       seq_file_path = nil
90       extraction_output = nil
91
92       if ( cla.get_number_of_files == 3 )
93         seq_file_path = cla.get_file_name( 1 )
94         Util.check_file_for_readability(  seq_file_path  )
95         extraction_output = cla.get_file_name( 2 )
96         if File.exist?( extraction_output )
97           Util.fatal_error( PRG_NAME, "error: [#{extraction_output}] already exists" )
98         end
99
100       end
101
102       msa = nil
103       if seq_file_path != nil
104         msa = read_fasta_file( seq_file_path )
105       end
106
107       i_e_value_threshold = -1
108       if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
109         begin
110           i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
111         rescue ArgumentError => e
112           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
113         end
114         if ( i_e_value_threshold < 0.0 )
115           Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
116         end
117       end
118
119       fs_e_value_threshold = -1
120       if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
121         begin
122           fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
123         rescue ArgumentError => e
124           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
125         end
126         if ( fs_e_value_threshold < 0.0 )
127           Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
128         end
129       end
130
131       target_models = []
132       if ( cla.is_option_set?( TARGET_MODELS ) )
133         begin
134           hmm_for_protein_output = cla.get_option_value( TARGET_MODELS )
135           target_models = hmm_for_protein_output.split( "/" );
136         rescue ArgumentError => e
137           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
138         end
139       end
140
141       x_models = []
142       if ( cla.is_option_set?( EXTRACTION ) )
143         begin
144           hmm_for_protein_output = cla.get_option_value( EXTRACTION )
145           x_models = hmm_for_protein_output.split( "~" );
146         rescue ArgumentError => e
147           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
148         end
149       end
150
151
152
153       begin
154         parse( inpath,
155           i_e_value_threshold,
156           fs_e_value_threshold,
157           target_models,
158           x_models,
159           msa,
160           extraction_output )
161       rescue IOError => e
162         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
163       end
164
165     end # def run
166
167     private
168
169     def read_fasta_file( input )
170       f = MsaFactory.new()
171       msa = nil
172       begin
173         msa = f.create_msa_from_file( input, FastaParser.new() )
174       rescue Exception => e
175         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
176       end
177       msa
178     end
179
180
181     # raises ArgumentError, IOError
182     def parse( inpath,
183         i_e_value_threshold,
184         fs_e_value_threshold,
185         target_models,
186         x_models,
187         msa,
188         extraction_output )
189
190       extraction_output_file = nil
191       if extraction_output != nil
192         extraction_output_file = File.open( extraction_output, "a" )
193       end
194
195       hmmscan_parser = HmmscanParser.new( inpath )
196
197       results = hmmscan_parser.parse
198
199       hmmscan_results_per_protein = []
200
201       query     = ""
202       prev_query = ""
203       results.each do | r |
204         query = r.query
205         if !prev_query.empty? && prev_query != query
206           if !hmmscan_results_per_protein.empty?
207             process_hmmscan_results_per_protein( hmmscan_results_per_protein,
208               fs_e_value_threshold,
209               target_models,
210               x_models,
211               i_e_value_threshold,
212               msa,
213               extraction_output_file )
214           end
215           hmmscan_results_per_protein.clear
216         end
217         prev_query = query
218
219         if USE_AVOID_HMMS
220           if !AVOID_HHMS.include? r.model
221             hmmscan_results_per_protein << r
222           end
223         else
224           hmmscan_results_per_protein << r
225         end
226
227       end
228
229       if !hmmscan_results_per_protein.empty?
230         process_hmmscan_results_per_protein( hmmscan_results_per_protein,
231           fs_e_value_threshold,
232           target_models,
233           x_models,
234           i_e_value_threshold,
235           msa,
236           extraction_output_file )
237       end
238       if extraction_output_file != nil
239         extraction_output_file.close
240       end
241     end # def parse
242
243
244
245     def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
246         fs_e_value_threshold,
247         target_hmms,
248         x_models,
249         i_e_value_threshold,
250         msa,
251         extraction_output_file )
252
253       raise StandardError, "target hmms is empty" if target_hmms.length < 1
254       raise StandardError, "results is empty" if hmmscan_results_per_protein.length < 1
255
256       # filter according to i-Evalue threshold
257       # abort if fs Evalue too high
258
259       if fs_e_value_threshold >= 0.0
260         hmmscan_results_per_protein.each do | r |
261           target_hmms.each do | hmm |
262             if r.model == hmm && r.fs_e_value > fs_e_value_threshold
263               return
264             end
265           end
266         end
267       end
268
269       #  dcs = []
270       hmmscan_results_per_protein_filtered = []
271       matched = Set.new
272
273       hmmscan_results_per_protein.each do | r |
274         if i_e_value_threshold < 0 || r.i_e_value <= i_e_value_threshold
275           hmmscan_results_per_protein_filtered << r
276           target_hmms.each do | hmm |
277             if r.model == hmm
278               matched << hmm
279               break
280             end
281           end
282
283         end
284       end
285
286       if matched.length < target_hmms.length
287         return
288       end
289       if  hmmscan_results_per_protein_filtered.length < 1
290         return
291       end
292
293       hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
294
295       owns = []
296       target_hmms.each do | hmm |
297         hmmscan_results_per_protein_filtered.each do | r |
298           if r.model == hmm
299             owns << r
300             break
301           end
302         end
303       end
304
305       query = nil
306       qlen = nil
307       owns.each do | own |
308         if query == nil
309           query = own.query
310           qlen = own.qlen
311         else
312           raise StandardError, "failed sanity check" if query != own.query || qlen != own.qlen
313           raise StandardError, "failed sanity check: qlen != own.qlen" if qlen != own.qlen
314         end
315       end
316
317       s = query + "\t"
318       s << "species" + "\t"
319       owns.each do | own |
320         s << own.fs_e_value.to_s + "\t"
321       end
322
323       s << qlen.to_s + "\t"
324
325       s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
326       hmmscan_results_per_protein_filtered.each do | r |
327         s << r.model + " "
328       end
329       s << "\t"
330       s <<  make_overview_da( hmmscan_results_per_protein_filtered )
331       s << "\t"
332       s << make_detailed_da( hmmscan_results_per_protein_filtered, qlen )
333       puts s
334       if x_models != nil &&  x_models.length > 0
335         extract_linkers( hmmscan_results_per_protein_filtered, x_models, query, "lizrd",  msa,  extraction_output_file )
336       end
337     end
338
339     def extract_linkers( hmmscan_results_per_protein_filtered, x_models, seq, extraction_output_file )
340       raise StandardError, "extraction output file is nil" if extraction_output_file == nil
341       prev_r = nil
342       hmmscan_results_per_protein_filtered.each do | r |
343         if  prev_r != nil
344           if ( x_models.length == 2 && prev_r.model == x_models[ 0 ] && r.model == x_models[ 1 ] )
345             extract_linker( prev_r.env_to, r.env_from, seq, extraction_output_file )
346           end
347         end
348         prev_r = r
349       end
350     end
351
352     
353     def get_sequence( query, msa  )
354       seq = nil
355       indices = msa.find_by_name_start( query , true )
356       if indices.length != 1
357         if query[ -1, 1 ] == "|"
358           query.chop!
359         end
360         seq = msa.get_by_name_pattern( /\b#{Regexp.quote(query)}\b/ )
361       else
362         seq = msa.get_sequence( indices[ 0 ] )
363       end
364       seq
365     end
366     
367     
368     def extract_linker( first, last , seq,  output_file )
369      
370       if ( last - first >= 1 )
371         indices = msa.find_by_name_start( query , true )
372         if indices.length != 1
373           if query[ -1, 1 ] == "|"
374             query.chop!
375           end
376           seq = msa.get_by_name_pattern( /\b#{Regexp.quote(query)}\b/ )
377         else
378           seq = msa.get_sequence( indices[ 0 ] )
379         end
380         species = nil
381         if seq.get_name =~ /\[([A-Z0-9]{3,5})\]/
382           species = $1
383         end 
384        
385         output_file.print( ">" + seq.get_name + " [" +  first.to_s + "-" + last.to_s +  "]" + "\n")
386         output_file.print( seq.get_subsequence( first - 1 , last - 1 ).get_sequence_as_string + "\n" )
387       end
388     end
389
390     def make_detailed_da( hmmscan_results_per_protein_filtered , qlen )
391       s = ""
392       prev_r = nil
393       hmmscan_results_per_protein_filtered.each do | r |
394         if  prev_r != nil
395           s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
396         else
397           s << make_interdomain_sequence( r.env_from, false )
398         end
399         s << r.model
400         s << "["
401         s << r.env_from.to_s << "-" << r.env_to.to_s
402         s << " " << r.i_e_value.to_s
403         s << "]"
404         prev_r = r
405       end
406       s << make_interdomain_sequence( qlen - prev_r.env_to, false )
407       s
408     end
409
410     def make_overview_da( hmmscan_results_per_protein_filtered )
411       overview = ""
412       prev_r = nil
413       hmmscan_results_per_protein_filtered.each do | r |
414         if prev_r == nil
415           overview << r.model
416         else
417           if  ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
418             overview << "~" << r.model
419           else
420             overview << "----" << r.model
421           end
422         end
423         prev_r = r
424
425       end
426       overview
427     end
428
429
430
431
432
433     def calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )
434       linkers = ""
435       prev_r = nil
436       hmmscan_results_per_protein_filtered.each do | r |
437         if r.model == hmm_for_protein_output
438           if  prev_r != nil
439             linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
440           end
441           prev_r = r
442         end
443       end
444       linkers
445     end
446
447
448     def make_interdomain_sequence( d, mark_short = true )
449       s = ""
450       d /= 20
451       if d >= 10
452         s << "----//----"
453       elsif d >= 1
454         d.times do
455           s << "-"
456         end
457       elsif mark_short
458         s << "~"
459       end
460       s
461     end
462
463
464     def print_help()
465       puts( "Usage:" )
466       puts()
467       puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
468       puts()
469       puts( "  options: -" + I_E_VALUE_THRESHOLD_OPTION  + ": i-E-value threshold, default is no threshold" )
470       puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + ": E-value threshold for full protein sequences, only for protein summary" )
471       puts( "           -" + TARGET_MODELS + ": target HMMs" )
472       puts()
473     end
474
475   end # class
476
477 end # module Evoruby