08d74c12c851441bdb7b8165478e1ac41aade492
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / hmmscan_analysis.rb
1 #
2 # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
3 #
4 # Copyright::  Copyright (C) 2012 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
8 #
9
10 require 'set'
11
12 require 'lib/evo/util/constants'
13 require 'lib/evo/util/util'
14 require 'lib/evo/util/command_line_arguments'
15 require 'lib/evo/io/parser/hmmscan_parser'
16 require 'lib/evo/msa/msa'
17 require 'lib/evo/msa/msa_factory'
18 require 'lib/evo/io/msa_io'
19 require 'lib/evo/io/parser/fasta_parser'
20 require 'lib/evo/io/writer/fasta_writer'
21
22 module Evoruby
23
24   class HmmscanAnalysis
25
26     PRG_NAME       = "hsp"
27     PRG_VERSION    = "2.001"
28     PRG_DESC       = "hmmscan summary"
29     PRG_DATE       = "2013.10.23"
30     COPYRIGHT      = "2013 Christian M Zmasek"
31     CONTACT        = "phyloxml@gmail.com"
32     WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
33
34     DELIMITER_OPTION              = "d"
35     SPECIES_OPTION                = "s"
36     I_E_VALUE_THRESHOLD_OPTION    = "ie"
37     FS_E_VALUE_THRESHOLD_OPTION   = "pe"
38     HMM_FOR_PROTEIN_OUTPUT        = "m"
39     IGNORE_DUF_OPTION             = "i"
40     PARSE_OUT_DESCRIPITION_OPTION = "a"
41     HELP_OPTION_1                 = "help"
42     HELP_OPTION_2                 = "h"
43
44     USE_AVOID_HMMS = false
45     AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
46     LIMIT_FOR_CLOSE_DOMAINS = 20
47
48     def initialize
49       @domain_counts = Hash.new
50     end
51
52     def run
53
54       #   Util.print_program_information( PRG_NAME,
55       #     PRG_VERSION,
56       #     PRG_DESC,
57       #     PRG_DATE,
58       #     COPYRIGHT,
59       #     CONTACT,
60       #     WWW,
61       #     STDOUT )
62
63       begin
64         cla = CommandLineArguments.new( ARGV )
65       rescue ArgumentError => e
66         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
67       end
68
69       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
70            cla.is_option_set?( HELP_OPTION_2 ) )
71         print_help
72         exit( 0 )
73       end
74
75       if ( cla.get_number_of_files != 1 &&  cla.get_number_of_files != 2 )
76         print_help
77         exit( -1 )
78       end
79
80       allowed_opts = Array.new
81       allowed_opts.push( DELIMITER_OPTION )
82       allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
83       allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
84       allowed_opts.push( IGNORE_DUF_OPTION )
85       allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
86       allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
87       allowed_opts.push( SPECIES_OPTION )
88
89       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
90       if ( disallowed.length > 0 )
91         Util.fatal_error( PRG_NAME,
92           "unknown option(s): " + disallowed,
93           STDOUT )
94       end
95
96       inpath = cla.get_file_name( 0 )
97       seq_file_path = nil
98
99       if ( cla.get_number_of_files == 2 )
100         seq_file_path = cla.get_file_name( 1 )
101       end
102
103       msa = nil
104       if seq_file_path != nil
105         msa = read_fasta_file(seq_file_path )
106       end
107
108       column_delimiter = "\t"
109       if ( cla.is_option_set?( DELIMITER_OPTION ) )
110         begin
111           column_delimiter = cla.get_option_value( DELIMITER_OPTION )
112         rescue ArgumentError => e
113           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
114         end
115       end
116
117       i_e_value_threshold = -1
118       if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
119         begin
120           i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
121         rescue ArgumentError => e
122           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
123         end
124         if ( i_e_value_threshold < 0.0 )
125           Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
126         end
127       end
128
129       fs_e_value_threshold = -1
130       if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
131         begin
132           fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
133         rescue ArgumentError => e
134           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
135         end
136         if ( fs_e_value_threshold < 0.0 )
137           Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
138         end
139       end
140
141       hmm_for_protein_outputs = []
142       if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
143         begin
144           hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
145           hmm_for_protein_outputs = hmm_for_protein_output.split( "~" );
146         rescue ArgumentError => e
147           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
148         end
149       end
150
151       species = "HUMAN"
152       if ( cla.is_option_set?( SPECIES_OPTION ) )
153         begin
154           species = cla.get_option_value( SPECIES_OPTION )
155         rescue ArgumentError => e
156           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
157         end
158       end
159
160       ignore_dufs = false
161       if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
162         ignore_dufs = true
163       end
164
165       parse_descriptions = false
166       if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
167         parse_descriptions = true
168       end
169
170
171       begin
172         parse( inpath,
173           column_delimiter,
174           i_e_value_threshold,
175           ignore_dufs,
176           parse_descriptions,
177           fs_e_value_threshold,
178           hmm_for_protein_outputs,
179           species,
180           msa        )
181       rescue IOError => e
182         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
183       end
184
185     end # def run
186
187     private
188
189     def read_fasta_file( input )
190       f = MsaFactory.new()
191       msa = nil
192       begin
193         msa = f.create_msa_from_file( input, FastaParser.new() )
194       rescue Exception => e
195         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
196       end
197       msa
198     end
199
200
201     # raises ArgumentError, IOError
202     def parse( inpath,
203         column_delimiter,
204         i_e_value_threshold,
205         ignore_dufs,
206         get_descriptions,
207         fs_e_value_threshold,
208         hmm_for_protein_outputs,
209         species,
210         msa )
211
212       Util.check_file_for_readability( inpath )
213
214       hmmscan_parser = HmmscanParser.new( inpath )
215       results = hmmscan_parser.parse
216
217
218       query     = ""
219       desc      = ""
220       model     = ""
221       env_from  = ""
222       env_to    = ""
223       i_e_value = ""
224
225       hmmscan_results_per_protein = []
226
227       prev_query = ""
228
229       results.each do | r |
230         model     = r.model
231         query     = r.query
232         i_e_value = r.i_e_value
233         env_from  = r.env_from
234         env_to    = r.env_to
235
236
237         if !prev_query.empty? && prev_query != query
238           if !hmmscan_results_per_protein.empty?
239             process_hmmscan_results_per_protein( hmmscan_results_per_protein,
240               fs_e_value_threshold,
241               hmm_for_protein_outputs,
242               i_e_value_threshold,
243               species,
244               msa            )
245           end
246           hmmscan_results_per_protein.clear
247         end
248         prev_query = query
249
250         if USE_AVOID_HMMS
251           if !AVOID_HHMS.include? r.model
252             hmmscan_results_per_protein << r
253           end
254         else
255           hmmscan_results_per_protein << r
256         end
257
258       end
259
260       if !hmm_for_protein_outputs.empty? && !hmmscan_results_per_protein.empty?
261         process_hmmscan_results_per_protein( hmmscan_results_per_protein,
262           fs_e_value_threshold,
263           hmm_for_protein_outputs,
264           i_e_value_threshold,
265           species,
266           msa        )
267       end
268
269
270     end # def parse
271
272     def process_id( id )
273       if  id =~ /(sp|tr)\|\S+\|(\S+)/
274         id = $2
275       end
276       id
277     end
278
279
280
281     def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
282         fs_e_value_threshold,
283         target_hmms,
284         i_e_value_threshold,
285         species,
286         msa )
287
288       raise StandardError, "target hmms is empty" if target_hmms.length < 1
289       raise StandardError, "results is empty" if hmmscan_results_per_protein.length < 1
290
291       # filter according to i-Evalue threshold
292       # abort if fs Evalue too high
293
294
295       if fs_e_value_threshold >= 0.0
296         hmmscan_results_per_protein.each do | r |
297           target_hmms.each do | hmm |
298             if r.model == hmm && r.fs_e_value > fs_e_value_threshold
299               return
300             end
301           end
302         end
303       end
304
305       #  dcs = []
306       hmmscan_results_per_protein_filtered = []
307       matched = Set.new
308
309       hmmscan_results_per_protein.each do | r |
310         if i_e_value_threshold < 0 || r.i_e_value <= i_e_value_threshold
311           hmmscan_results_per_protein_filtered << r
312           target_hmms.each do | hmm |
313             if r.model == hmm
314
315               matched << hmm
316               break
317             end
318           end
319
320         end
321       end
322
323       if matched.length < target_hmms.length
324         return
325       end
326       if  hmmscan_results_per_protein_filtered.length < 1
327         return
328       end
329
330       hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
331
332       owns = []
333       target_hmms.each do | hmm |
334         hmmscan_results_per_protein_filtered.each do | r |
335           if r.model == hmm
336             owns << r
337             break
338           end
339         end
340       end
341
342       s = ""
343       query = nil
344       owns.each do | own |
345         s << own.query + "\t"
346         query = own.query
347       end
348       s << species + "\t"
349       owns.each do | own |
350         s << own.fs_e_value.to_s + "\t"
351       end
352       owns.each do | own |
353         s << own.qlen.to_s + "\t" #TODO !
354       end
355
356       #   dcs.each do | dc |
357       #     s << dc.to_s + "\t"
358       #   end
359       s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
360       hmmscan_results_per_protein_filtered.each do | r |
361         s << r.model + " "
362       end
363       s << "\t"
364
365
366
367       # overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
368
369       #   s << overview  + "\t"
370
371       #  s << calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )  + "\t"
372
373       prev_r = nil
374       hmmscan_results_per_protein_filtered.each do | r |
375
376         if  prev_r != nil
377           s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
378
379           if ( target_hmms.length == 2 && prev_r.model == target_hmms[ 0 ] && r.model == target_hmms[ 1 ] )
380             puts "xxx"
381             linker( prev_r.env_to, r.env_from, query, msa )
382           end
383
384
385         else
386           s << make_interdomain_sequence( r.env_from, false )
387
388         end
389         s << r.model
390         s << "["
391         s << r.env_from.to_s << "-" << r.env_to.to_s
392         s << " " << r.i_e_value.to_s
393         s << "]"
394         prev_r = r
395       end
396       # s << make_interdomain_sequence( own.qlen - prev_r.env_from, false )
397       puts s
398     end
399
400
401     def linker( first, last , query , msa)
402       puts first.to_s + "-" + last.to_s
403       if ( last - first >= 1 )
404         seq = msa.get_by_name( query, true, false )
405         linker = seq.get_subsequence( first -1 , last - 1 )
406         puts linker.get_sequence_as_string
407       end
408
409     end
410
411
412     def calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )
413       linkers = ""
414       prev_r = nil
415       hmmscan_results_per_protein_filtered.each do | r |
416         if r.model == hmm_for_protein_output
417           if  prev_r != nil
418             linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
419           end
420           prev_r = r
421         end
422       end
423       linkers
424     end
425
426
427
428     def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
429       overview = ""
430       prev_r = nil
431       hmmscan_results_per_protein_filtered.each do | r |
432         if r.model == hmm_for_protein_output
433           if prev_r == nil
434             overview << hmm_for_protein_output
435           else
436             if  ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
437               overview << "~" << hmm_for_protein_output
438             else
439               overview << "----" << hmm_for_protein_output
440             end
441           end
442           prev_r = r
443         end
444       end
445       overview
446     end
447
448     def make_interdomain_sequence( d, mark_short = true )
449       s = ""
450       d /= 20
451       if d >= 10
452         s << "----//----"
453       elsif d >= 1
454         d.times do
455           s << "-"
456         end
457       elsif mark_short
458         s << "~"
459       end
460       s
461     end
462
463
464     def print_help()
465       puts( "Usage:" )
466       puts()
467       puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
468       puts()
469       puts( "  options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
470       puts( "           -" + I_E_VALUE_THRESHOLD_OPTION  + ": i-E-value threshold, default is no threshold" )
471       puts( "           -" + PARSE_OUT_DESCRIPITION_OPTION  + ": parse query description (in addition to query name)" )
472       puts( "           -" + IGNORE_DUF_OPTION  + ": ignore DUFs" )
473       puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + ": E-value threshold for full protein sequences, only for protein summary" )
474       puts( "           -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
475       puts( "           -" + SPECIES_OPTION + ": species for protein summary" )
476       puts()
477     end
478
479   end # class
480
481 end # module Evoruby