inprogress
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / hmmscan_analysis.rb
1 #
2 # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
3 #
4 # Copyright::  Copyright (C) 2012 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
8 #
9
10 require 'set'
11
12 require 'lib/evo/util/constants'
13 require 'lib/evo/util/util'
14 require 'lib/evo/util/command_line_arguments'
15 require 'lib/evo/io/parser/hmmscan_parser'
16 require 'lib/evo/msa/msa'
17 require 'lib/evo/msa/msa_factory'
18 require 'lib/evo/io/msa_io'
19 require 'lib/evo/io/parser/fasta_parser'
20 require 'lib/evo/io/writer/fasta_writer'
21
22 module Evoruby
23
24   class HmmscanAnalysis
25
26     PRG_NAME       = "hsa"
27     PRG_VERSION    = "1.000"
28     PRG_DESC       = "hmmscan summary"
29     PRG_DATE       = "130319"
30     COPYRIGHT      = "2013 Christian M Zmasek"
31     CONTACT        = "phyloxml@gmail.com"
32     WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
33
34     SPECIES_OPTION                = "s"
35     I_E_VALUE_THRESHOLD_OPTION    = "ie"
36     FS_E_VALUE_THRESHOLD_OPTION   = "pe"
37     HMM_FOR_PROTEIN_OUTPUT        = "m"
38     HELP_OPTION_1                 = "help"
39     HELP_OPTION_2                 = "h"
40
41     USE_AVOID_HMMS = false
42     AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
43     LIMIT_FOR_CLOSE_DOMAINS = 20
44
45     def initialize
46       @domain_counts = Hash.new
47     end
48
49     def run
50
51       #   Util.print_program_information( PRG_NAME,
52       #     PRG_VERSION,
53       #     PRG_DESC,
54       #     PRG_DATE,
55       #     COPYRIGHT,
56       #     CONTACT,
57       #     WWW,
58       #     STDOUT )
59
60       begin
61         cla = CommandLineArguments.new( ARGV )
62       rescue ArgumentError => e
63         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
64       end
65
66       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
67            cla.is_option_set?( HELP_OPTION_2 ) )
68         print_help
69         exit( 0 )
70       end
71
72       if ( cla.get_number_of_files != 1 &&  cla.get_number_of_files != 2 )
73         print_help
74         exit( -1 )
75       end
76
77       allowed_opts = Array.new
78       allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
79       allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
80       allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
81       allowed_opts.push( SPECIES_OPTION )
82
83       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
84       if ( disallowed.length > 0 )
85         Util.fatal_error( PRG_NAME,
86           "unknown option(s): " + disallowed,
87           STDOUT )
88       end
89
90       inpath = cla.get_file_name( 0 )
91       seq_file_path = nil
92
93       if ( cla.get_number_of_files == 2 )
94         seq_file_path = cla.get_file_name( 1 )
95       end
96
97       msa = nil
98       if seq_file_path != nil
99         msa = read_fasta_file(seq_file_path )
100       end
101
102       i_e_value_threshold = -1
103       if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) )
104         begin
105           i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION )
106         rescue ArgumentError => e
107           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
108         end
109         if ( i_e_value_threshold < 0.0 )
110           Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT )
111         end
112       end
113
114       fs_e_value_threshold = -1
115       if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) )
116         begin
117           fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION )
118         rescue ArgumentError => e
119           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
120         end
121         if ( fs_e_value_threshold < 0.0 )
122           Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
123         end
124       end
125
126       hmm_for_protein_outputs = []
127       if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
128         begin
129           hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
130           hmm_for_protein_outputs = hmm_for_protein_output.split( "/" );
131         rescue ArgumentError => e
132           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
133         end
134       end
135
136       species = "HUMAN"
137       if ( cla.is_option_set?( SPECIES_OPTION ) )
138         begin
139           species = cla.get_option_value( SPECIES_OPTION )
140         rescue ArgumentError => e
141           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
142         end
143       end
144
145       begin
146         parse( inpath,
147           i_e_value_threshold,
148           fs_e_value_threshold,
149           hmm_for_protein_outputs,
150           species,
151           msa )
152       rescue IOError => e
153         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
154       end
155
156     end # def run
157
158     private
159
160     def read_fasta_file( input )
161       f = MsaFactory.new()
162       msa = nil
163       begin
164         msa = f.create_msa_from_file( input, FastaParser.new() )
165       rescue Exception => e
166         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
167       end
168       msa
169     end
170
171
172     # raises ArgumentError, IOError
173     def parse( inpath,
174         i_e_value_threshold,
175         fs_e_value_threshold,
176         hmm_for_protein_outputs,
177         species,
178         msa )
179
180       Util.check_file_for_readability( inpath )
181
182       hmmscan_parser = HmmscanParser.new( inpath )
183       results = hmmscan_parser.parse
184
185       hmmscan_results_per_protein = []
186
187       query     = ""
188       prev_query = ""
189
190       results.each do | r |
191
192         query     = r.query
193
194         if !prev_query.empty? && prev_query != query
195           if !hmmscan_results_per_protein.empty?
196             process_hmmscan_results_per_protein( hmmscan_results_per_protein,
197               fs_e_value_threshold,
198               hmm_for_protein_outputs,
199               i_e_value_threshold,
200               species,
201               msa )
202           end
203           hmmscan_results_per_protein.clear
204         end
205         prev_query = query
206
207         if USE_AVOID_HMMS
208           if !AVOID_HHMS.include? r.model
209             hmmscan_results_per_protein << r
210           end
211         else
212           hmmscan_results_per_protein << r
213         end
214
215       end
216
217       if !hmm_for_protein_outputs.empty? && !hmmscan_results_per_protein.empty?
218         process_hmmscan_results_per_protein( hmmscan_results_per_protein,
219           fs_e_value_threshold,
220           hmm_for_protein_outputs,
221           i_e_value_threshold,
222           species,
223           msa )
224       end
225
226
227     end # def parse
228
229     def process_id( id )
230       if  id =~ /(sp|tr)\|\S+\|(\S+)/
231         id = $2
232       end
233       id
234     end
235
236
237
238     def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
239         fs_e_value_threshold,
240         target_hmms,
241         i_e_value_threshold,
242         species,
243         msa )
244
245       raise StandardError, "target hmms is empty" if target_hmms.length < 1
246       raise StandardError, "results is empty" if hmmscan_results_per_protein.length < 1
247
248       # filter according to i-Evalue threshold
249       # abort if fs Evalue too high
250
251
252       if fs_e_value_threshold >= 0.0
253         hmmscan_results_per_protein.each do | r |
254           target_hmms.each do | hmm |
255             if r.model == hmm && r.fs_e_value > fs_e_value_threshold
256               return
257             end
258           end
259         end
260       end
261
262       #  dcs = []
263       hmmscan_results_per_protein_filtered = []
264       matched = Set.new
265
266       hmmscan_results_per_protein.each do | r |
267         if i_e_value_threshold < 0 || r.i_e_value <= i_e_value_threshold
268           hmmscan_results_per_protein_filtered << r
269           target_hmms.each do | hmm |
270             if r.model == hmm
271
272               matched << hmm
273               break
274             end
275           end
276
277         end
278       end
279
280       if matched.length < target_hmms.length
281         return
282       end
283       if  hmmscan_results_per_protein_filtered.length < 1
284         return
285       end
286
287       hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
288
289       owns = []
290       target_hmms.each do | hmm |
291         hmmscan_results_per_protein_filtered.each do | r |
292           if r.model == hmm
293             owns << r
294             break
295           end
296         end
297       end
298
299
300       query = nil
301       qlen = nil
302       owns.each do | own |
303
304         if query == nil
305           query = own.query
306
307           qlen = own.qlen
308         else
309           # sanity checks
310           raise StandardError, "failed sanity check" if query != own.query || qlen != own.qlen
311           raise StandardError, "failed sanity check: qlen != own.qlen" if qlen != own.qlen
312         end
313       end
314
315       s = query + "\t"
316       s << species + "\t"
317       owns.each do | own |
318         s << own.fs_e_value.to_s + "\t"
319       end
320
321       s << qlen.to_s + "\t"
322
323       s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
324       hmmscan_results_per_protein_filtered.each do | r |
325         s << r.model + " "
326       end
327       s << "\t"
328       s <<  make_overview_da( hmmscan_results_per_protein_filtered )
329       s << "\t"
330       prev_r = nil
331       hmmscan_results_per_protein_filtered.each do | r |
332         if  prev_r != nil
333           s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
334           if ( target_hmms.length == 2 && prev_r.model == target_hmms[ 0 ] && r.model == target_hmms[ 1 ] )
335             extract_linker( prev_r.env_to, r.env_from, query, msa )
336           end
337         else
338           s << make_interdomain_sequence( r.env_from, false )
339         end
340         s << r.model
341         s << "["
342         s << r.env_from.to_s << "-" << r.env_to.to_s
343         s << " " << r.i_e_value.to_s
344         s << "]"
345         prev_r = r
346       end
347       s << make_interdomain_sequence( qlen - prev_r.env_to, false )
348
349       puts s
350     end
351
352     def extract_linkers( hmmscan_results_per_protein_filtered, target_hmms, query, msa  )
353       prev_r = nil
354       hmmscan_results_per_protein_filtered.each do | r |
355         if  prev_r != nil
356           if ( target_hmms.length == 2 && prev_r.model == target_hmms[ 0 ] && r.model == target_hmms[ 1 ] )
357             extract_linker( prev_r.env_to, r.env_from, query, msa )
358           end
359         end
360         prev_r = r
361       end
362     end
363
364     def extract_linker( first, last , query , msa)
365       if ( last - first >= 1 )
366         seq = msa.get_by_name_pattern( /\b#{Regexp.quote(query)}\b/ )
367         linker = seq.get_subsequence( first - 1 , last - 1 )
368         linker.get_sequence_as_string
369       end
370     end
371
372     def make_detailed_da( hmmscan_results_per_protein_filtered  )
373       s = ""
374       prev_r = nil
375       hmmscan_results_per_protein_filtered.each do | r |
376         if  prev_r != nil
377           s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
378         else
379           s << make_interdomain_sequence( r.env_from, false )
380         end
381         s << r.model
382         s << "["
383         s << r.env_from.to_s << "-" << r.env_to.to_s
384         s << " " << r.i_e_value.to_s
385         s << "]"
386         prev_r = r
387       end
388       s << make_interdomain_sequence( qlen - prev_r.env_to, false )
389       s
390     end
391
392     def make_overview_da( hmmscan_results_per_protein_filtered )
393       overview = ""
394       prev_r = nil
395       hmmscan_results_per_protein_filtered.each do | r |
396
397         if prev_r == nil
398           overview << r.model
399         else
400           if  ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
401             overview << "~" << r.model
402           else
403             overview << "----" << r.model
404           end
405         end
406         prev_r = r
407
408       end
409       overview
410     end
411
412
413
414
415
416     def calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )
417       linkers = ""
418       prev_r = nil
419       hmmscan_results_per_protein_filtered.each do | r |
420         if r.model == hmm_for_protein_output
421           if  prev_r != nil
422             linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
423           end
424           prev_r = r
425         end
426       end
427       linkers
428     end
429
430
431     def make_interdomain_sequence( d, mark_short = true )
432       s = ""
433       d /= 20
434       if d >= 10
435         s << "----//----"
436       elsif d >= 1
437         d.times do
438           s << "-"
439         end
440       elsif mark_short
441         s << "~"
442       end
443       s
444     end
445
446
447     def print_help()
448       puts( "Usage:" )
449       puts()
450       puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
451       puts()
452       puts( "  options: -" + I_E_VALUE_THRESHOLD_OPTION  + ": i-E-value threshold, default is no threshold" )
453       puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + ": E-value threshold for full protein sequences, only for protein summary" )
454       puts( "           -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
455       puts( "           -" + SPECIES_OPTION + ": species for protein summary" )
456       puts()
457     end
458
459   end # class
460
461 end # module Evoruby