in progress... multi-domain extractor started
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / domain_sequence_extractor.rb
1 #
2 # = lib/evo/tool/domain_sequence_extractor - DomainSequenceExtractor class
3 #
4 # Copyright::    Copyright (C) 2017 Christian M. Zmasek
5 # License::      GNU Lesser General Public License (LGPL)
6
7 require 'lib/evo/util/constants'
8 require 'lib/evo/util/util'
9 require 'lib/evo/util/command_line_arguments'
10 require 'lib/evo/io/parser/hmmscan_domain_extractor'
11
12 module Evoruby
13   class DomainSequenceExtractor
14
15     PRG_NAME       = "dsx"
16     PRG_VERSION    = "2.002"
17     PRG_DESC       = "Extraction of domain sequences from hmmscan output"
18     PRG_DATE       = "20170220"
19     WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
20
21     E_VALUE_THRESHOLD_DEFAULT = 0.1
22     LENGTH_THRESHOLD_DEFAULT  = 50
23
24     E_VALUE_THRESHOLD_OPTION           = 'e'
25     LENGTH_THRESHOLD_OPTION            = 'l'
26     ADD_POSITION_OPTION                = 'p'
27     ADD_DOMAIN_NUMBER_OPTION           = 'd'
28     ADD_SPECIES                        = 's'
29     LOG_FILE_SUFFIX                    = '_domain_seq_extr.log'
30     PASSED_SEQS_SUFFIX                 = '_with_passing_domains.fasta'
31     FAILED_SEQS_SUFFIX                 = '_with_no_passing_domains.fasta'
32     HELP_OPTION_1                      = 'help'
33     HELP_OPTION_2                      = 'h'
34     def run()
35
36       Util.print_program_information( PRG_NAME,
37       PRG_VERSION,
38       PRG_DESC ,
39       PRG_DATE,
40       WWW,
41       STDOUT )
42
43       if ( ARGV == nil || ( ARGV.length < 1 )  )
44         print_help
45         exit( -1 )
46       end
47
48       begin
49         cla = CommandLineArguments.new( ARGV )
50       rescue ArgumentError
51         Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT )
52       end
53
54       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
55       cla.is_option_set?( HELP_OPTION_2 ) )
56         print_help
57         exit( 0 )
58       end
59
60       unless ( cla.get_number_of_files == 2 || cla.get_number_of_files == 3 || cla.get_number_of_files == 4 )
61         print_help
62         exit( -1 )
63       end
64
65       allowed_opts = Array.new
66       allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
67       allowed_opts.push( ADD_POSITION_OPTION )
68       allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION )
69       allowed_opts.push( LENGTH_THRESHOLD_OPTION )
70       allowed_opts.push( ADD_SPECIES )
71
72       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
73       if ( disallowed.length > 0 )
74         Util.fatal_error( PRG_NAME,
75         "unknown option(s): " + disallowed,
76         STDOUT )
77       end
78
79       e_value_threshold = E_VALUE_THRESHOLD_DEFAULT
80       if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
81         begin
82           e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
83         rescue ArgumentError => e
84           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
85         end
86         if ( e_value_threshold < 0.0 )
87           Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
88         end
89       end
90
91       length_threshold = LENGTH_THRESHOLD_DEFAULT
92       if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) )
93         begin
94           length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION )
95         rescue ArgumentError => e
96           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
97         end
98         if ( length_threshold < 0)
99           Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT )
100         end
101       end
102
103       domain_id           = cla.get_file_name( 0 )
104       hmmscan_output      = cla.get_file_name( 1 )
105       fasta_sequence_file = ""
106       outfile             = ""
107
108       if (cla.get_number_of_files == 4)
109         fasta_sequence_file = cla.get_file_name( 2 )
110         outfile             = cla.get_file_name( 3 )
111       elsif (cla.get_number_of_files == 2 || cla.get_number_of_files == 3)
112         if (cla.get_number_of_files == 3)
113           fasta_sequence_file = cla.get_file_name( 2 )
114         else
115           hmmscan_index = hmmscan_output.index(Constants::HMMSCAN)
116           if ( hmmscan_index != nil )
117             prefix = hmmscan_output[0 .. hmmscan_index-1 ]
118             suffix = Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX
119             files = Dir.entries( "." )
120             matching_files = Util.get_matching_files( files, prefix, suffix)
121             if matching_files.length < 1
122               Util.fatal_error( PRG_NAME, 'no file matching [' + prefix +
123               '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
124             end
125             if matching_files.length > 1
126               Util.fatal_error( PRG_NAME, 'more than one file matching [' +
127               prefix  + '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
128             end
129             fasta_sequence_file = matching_files[ 0 ]
130           else
131             Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' )
132           end
133         end
134         hmmscan_index = hmmscan_output.index(Constants::HMMSCAN)
135         if ( hmmscan_index != nil )
136           outfile = hmmscan_output.sub(Constants::HMMSCAN, "_") + "_" + domain_id
137           e = e_value_threshold >= 1 ? e_value_threshold.to_i : e_value_threshold.to_s.sub!('.','_')
138           outfile += "_E" + e.to_s
139           outfile += "_L" + length_threshold.to_i.to_s
140         else
141           Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' )
142         end
143       end
144
145       if outfile.downcase.end_with?( ".fasta" )
146         outfile = outfile[ 0 .. outfile.length - 7 ]
147       elsif outfile.downcase.end_with?( ".fsa" )
148         outfile = outfile[ 0 .. outfile.length - 5 ]
149       end
150
151       add_position = false
152       if ( cla.is_option_set?( ADD_POSITION_OPTION ) )
153         add_position = true
154       end
155
156       add_domain_number = false
157       if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) )
158         add_domain_number = true
159       end
160
161       add_species = false
162       if cla.is_option_set?( ADD_SPECIES )
163         add_species = true
164       end
165
166       log = String.new
167       ld = Constants::LINE_DELIMITER
168
169       puts()
170       puts( "Domain                                                                             : " + domain_id )
171       log << "Domain                                                                             : " + domain_id + ld
172       puts( "Hmmscan outputfile                                                                 : " + hmmscan_output )
173       log << "Hmmscan outputfile                                                                 : " + hmmscan_output + ld
174       puts( "Fasta sequencefile (complete sequences)                                            : " + fasta_sequence_file )
175       log << "Fasta sequencefile (complete sequences)                                            : " + fasta_sequence_file + ld
176       puts( "Outputfile                                                                         : " + outfile + ".fasta" )
177       log << "Outputfile                                                                         : " + outfile + ld
178       puts( "Passed sequences outfile (fasta)                                                   : " + outfile + PASSED_SEQS_SUFFIX )
179       log << "Passed sequences outfile (fasta)                                                   : " + outfile + PASSED_SEQS_SUFFIX + ld
180       puts( "Failed sequences outfile (fasta)                                                   : " + outfile + FAILED_SEQS_SUFFIX )
181       log << "Failed sequences outfile (fasta)                                                   : " + outfile + FAILED_SEQS_SUFFIX + ld
182       puts( "Logfile                                                                            : " + outfile + LOG_FILE_SUFFIX )
183       log << "Logfile                                                                            : " + outfile + LOG_FILE_SUFFIX + ld
184       if ( e_value_threshold >= 0.0 )
185         puts( "iE-value threshold                                                                 : " + e_value_threshold.to_s )
186         log << "iE-value threshold                                                                 : " + e_value_threshold.to_s + ld
187       else
188         puts( "iE-value threshold                                                                 : no threshold" )
189         log << "iE-value threshold                                                                 : no threshold" + ld
190       end
191       if ( length_threshold > 0 )
192         puts( "Length threshold (env)                                                             : " + length_threshold.to_s )
193         log << "Length threshold (env)                                                             : " + length_threshold.to_s + ld
194       else
195         puts( "Length threshold (env)                                                             : no threshold" )
196         log << "Length threshold  (env)                                                            : no threshold" + ld
197       end
198       if ( add_position )
199         puts( "Add positions (rel to complete seq) to extracted domains                           : true" )
200         log << "Add positions (rel to complete seq) to extracted domains                           : true" + ld
201       else
202         puts( "Add positions (rel to complete seq) to extracted domains                           : false" )
203         log << "Add positions (rel to complete seq) to extracted domains                           : false" + ld
204       end
205
206       if ( add_domain_number )
207         puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" )
208         log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld
209       else
210         puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" )
211         log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld
212       end
213
214       puts
215       log <<  ld
216
217       domain_count = 0
218       begin
219         parser = HmmscanDomainExtractor.new()
220         domain_count = parser.parse( domain_id,
221         hmmscan_output,
222         fasta_sequence_file,
223         outfile,
224         outfile + PASSED_SEQS_SUFFIX,
225         outfile + FAILED_SEQS_SUFFIX,
226         e_value_threshold,
227         length_threshold,
228         add_position,
229         add_domain_number,
230         add_species,
231         false,
232         log )
233       rescue ArgumentError, IOError => e
234         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
235
236       rescue Exception => e
237         puts e.backtrace
238         Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT )
239
240       end
241
242       puts
243       Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" )
244       Util.print_message( PRG_NAME, "wrote: " + outfile + ".fasta")
245       Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX )
246       Util.print_message( PRG_NAME, "wrote: " + outfile + PASSED_SEQS_SUFFIX )
247       Util.print_message( PRG_NAME, "wrote: " + outfile + FAILED_SEQS_SUFFIX )
248
249       begin
250         f = File.open( outfile + LOG_FILE_SUFFIX, 'a' )
251         f.print( log )
252         f.close
253       rescue Exception => e
254         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
255       end
256
257       puts
258       Util.print_message( PRG_NAME, "OK" )
259       puts
260
261     end
262
263     def print_help()
264       puts()
265       puts( "Usage:" )
266       puts()
267       puts( "  " + PRG_NAME + ".rb [options] <target domain> <hmmscan outputfile> [file containing complete sequences in fasta format] [outputfile]" )
268       puts()
269       puts( "  options: -" + E_VALUE_THRESHOLD_OPTION  + "=<f> : iE-value threshold for target domain, default is " + E_VALUE_THRESHOLD_DEFAULT.to_s )
270       puts( "           -" + LENGTH_THRESHOLD_OPTION   + "=<i> : length threshold target domain (env), default is " + LENGTH_THRESHOLD_DEFAULT.to_s )
271       puts( "           -" + ADD_DOMAIN_NUMBER_OPTION  + "     : to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" )
272       puts( "           -" + ADD_POSITION_OPTION  + "     : to add positions (rel to complete seq) to extracted domains" )
273       puts( "           -" + ADD_SPECIES  + "     : to add species [in brackets]" )
274       puts()
275       puts( "Examples:" )
276       puts
277       puts( "  " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 P53_ni.fasta P53_#{Constants::PFAM_V_FOR_EX}_10_Pkinase_E1_0e-06_L50" )
278       puts
279       puts( "  " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 P53_ni.fasta" )
280       puts
281       puts( "  " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10" )
282       puts()
283     end
284
285   end # class DomainSequenceExtractor
286
287 end # module Evoruby