2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanMultiDomainExtractor class
4 # Copyright:: Copyright (C) 2017 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # Last modified: 2017/02/20
9 require 'lib/evo/util/constants'
10 require 'lib/evo/msa/msa_factory'
11 require 'lib/evo/io/msa_io'
12 require 'lib/evo/io/writer/fasta_writer'
13 require 'lib/evo/io/parser/fasta_parser'
14 require 'lib/evo/io/parser/hmmscan_parser'
17 class HmmscanMultiDomainExtractor
21 # raises ArgumentError, IOError, StandardError
35 Util.check_file_for_readability( hmmscan_output )
36 Util.check_file_for_readability( fasta_sequence_file )
37 Util.check_file_for_writability( outfile + ".fasta" )
38 Util.check_file_for_writability( passed_seqs_outfile )
39 Util.check_file_for_writability( failed_seqs_outfile )
42 factory = MsaFactory.new()
43 in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
45 if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
46 error_msg = "could not find fasta sequences in " + fasta_sequence_file
47 raise IOError, error_msg
55 ld = Constants::LINE_DELIMITER
57 domain_pass_counter = 0
58 domain_fail_counter = 0
59 passing_domains_per_protein = 0
60 proteins_with_failing_domains = 0
61 domain_not_present_counter = 0
63 max_domain_copy_number_per_protein = -1
64 max_domain_copy_number_sequence = ""
65 passing_target_length_sum = 0
66 overall_target_length_sum = 0
67 overall_target_length_min = 10000000
68 overall_target_length_max = -1
69 passing_target_length_min = 10000000
70 passing_target_length_max = -1
72 overall_target_ie_min = 10000000
73 overall_target_ie_max = -1
74 passing_target_ie_min = 10000000
75 passing_target_ie_max = -1
79 hmmscan_parser = HmmscanParser.new( hmmscan_output )
80 results = hmmscan_parser.parse
84 pc0 = PassCondition.new('Bcl-2', 0.01, -1, 0.5 )
90 domains_in_query_ary = Array.new
91 queries_ary = Array.new
92 results.each do |hmmscan_result|
93 if ( prev_query != nil ) && ( hmmscan_result.query != prev_query )
96 domains_in_query_ary.each do |domain_in_query|
97 if pcs.has_key?(domain_in_query.model)
98 pc = pcs[domain_in_query.model]
100 # @percent_len = rel_len
101 if (pc.i_e_value != nil) && (pc.i_e_value >= 0)
102 if domain_in_query.i_e_value > pc.i_e_value
107 if (pc.abs_len != nil) && (pc.abs_len > 0)
108 length = 1 + domain_in_query.env_to - domain_in_query.env_from
109 if length < pc.abs_len
114 if (pc.rel_len != nil) && (pc.rel_len > 0)
115 length = 1 + domain_in_query.env_to - domain_in_query.env_from
116 if length < (pc.rel_len * domain_in_query.tlen)
124 queries_ary.push(domains_in_query_ary)
126 domains_in_query_ary = Array.new
129 prev_query = hmmscan_result.query
130 domains_in_query_ary.push(hmmscan_result)
133 queries_ary.push(domains_in_query_ary)
135 puts queries_ary[0][0].model
136 puts queries_ary[0][0].i_e_value
137 puts queries_ary[0][1].model
138 puts queries_ary[0][2].model
139 puts queries_ary[1][0].model
140 puts queries_ary[1][0].i_e_value
141 puts queries_ary[1][1].model
142 puts queries_ary[2][2].model
143 queries_ary.each do | query_ary |
144 query_ary.each do | domain |
155 results.each do | r |
157 if ( prev_query != nil ) && ( r.query != prev_query )
159 passing_domains_per_protein = 0
161 log << domain_not_present_counter.to_s + ": " + prev_query.to_s + " lacks target domain" + ld
162 domain_not_present_counter += 1
169 if domain_id != r.model
177 # sequence_len = r.qlen
180 env_from = r.env_from
182 i_e_value = r.i_e_value
185 length = 1 + env_to - env_from
187 overall_target_length_sum += length
188 if length > overall_target_length_max
189 overall_target_length_max = length
191 if length < overall_target_length_min
192 overall_target_length_min = length
195 if i_e_value > overall_target_ie_max
196 overall_target_ie_max = i_e_value
198 if i_e_value < overall_target_ie_min
199 overall_target_ie_min = i_e_value
202 if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
203 ( ( length_threshold <= 0 ) || ( length >= length_threshold.to_f ) ) )
204 hmmscan_datas << HmmscanData.new( sequence, number, out_of, env_from, env_to, i_e_value )
205 passing_target_length_sum += length
206 passing_domains_per_protein += 1
207 if length > passing_target_length_max
208 passing_target_length_max = length
210 if length < passing_target_length_min
211 passing_target_length_min = length
213 if i_e_value > passing_target_ie_max
214 passing_target_ie_max = i_e_value
216 if i_e_value < passing_target_ie_min
217 passing_target_ie_min = i_e_value
219 if ( passing_domains_per_protein > max_domain_copy_number_per_protein )
220 max_domain_copy_number_sequence = sequence
221 max_domain_copy_number_per_protein = passing_domains_per_protein
224 log << domain_fail_counter.to_s + ": " + sequence.to_s + " fails threshold(s)"
225 if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) )
226 log << " iE=" + i_e_value.to_s
228 if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f )
229 le = env_to - env_from + 1
230 log << " l=" + le.to_s
233 domain_fail_counter += 1
237 error_msg = "number > out_of (this should not have happened)"
238 raise StandardError, error_msg
242 if !hmmscan_datas.empty?
243 process_hmmscan_datas( hmmscan_datas,
249 domain_pass_counter += hmmscan_datas.length
250 if passed_seqs.find_by_name_start( sequence, true ).length < 1
251 add_sequence( sequence, in_msa, passed_seqs )
253 error_msg = "this should not have happened"
254 raise StandardError, error_msg
257 if failed_seqs.find_by_name_start( sequence, true ).length < 1
258 add_sequence( sequence, in_msa, failed_seqs )
259 proteins_with_failing_domains += 1
261 error_msg = "this should not have happened"
262 raise StandardError, error_msg
268 end # results.each do | r |
270 if (prev_query != nil) && (!saw_target)
271 log << domain_not_present_counter.to_s + ": " + prev_query.to_s + " lacks target domain" + ld
272 domain_not_present_counter += 1
275 if domain_pass_counter < 1
276 error_msg = "no domain sequences were extracted"
277 raise IOError, error_msg
280 if ( domain_not_present_counter + passed_seqs.get_number_of_seqs + proteins_with_failing_domains ) != protein_counter
281 error_msg = "not present + passing + not passing != proteins in sequence (fasta) file (this should not have happened)"
282 raise StandardError, error_msg
289 avg_pass = ( passing_target_length_sum / domain_pass_counter )
290 puts( "Passing target domain lengths: average: " + avg_pass.to_s )
291 log << "Passing target domain lengths: average: " + avg_pass.to_s
293 puts( "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + " - " + passing_target_length_max.to_s)
294 log << "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + " - " + passing_target_length_max.to_s
296 puts( "Passing target domain iE: min-max: " + passing_target_ie_min.to_s + " - " + passing_target_ie_max.to_s)
297 log << "Passing target domain iE: min-max: " + passing_target_ie_min.to_s + " - " + passing_target_ie_max.to_s
299 puts( "Passing target domains: sum: " + domain_pass_counter.to_s )
300 log << "Passing target domains: sum: " + domain_pass_counter.to_s
304 sum = domain_pass_counter + domain_fail_counter
305 avg_all = overall_target_length_sum / sum
306 puts( "All target domain lengths: average: " + avg_all.to_s )
307 log << "All target domain lengths: average: " + avg_all.to_s
309 puts( "All target domain lengths: min-max: " + overall_target_length_min.to_s + " - " + overall_target_length_max.to_s)
310 log << "All target domain lengths: min-max: " + overall_target_length_min.to_s + " - " + overall_target_length_max.to_s
312 puts( "All target target domain iE: min-max: " + overall_target_ie_min.to_s + " - " + overall_target_ie_max.to_s)
313 log << "All target target domain iE: min-max: " + overall_target_ie_min.to_s + " - " + overall_target_ie_max.to_s
315 puts( "All target domains: sum: " + sum.to_s )
316 log << "All target domains: sum: " + sum.to_s
319 puts( "Proteins with passing target domain(s): " + passed_seqs.get_number_of_seqs.to_s )
320 puts( "Proteins with no passing target domain: " + proteins_with_failing_domains.to_s )
321 puts( "Proteins with no target domain : " + domain_not_present_counter.to_s )
326 puts( "Max target domain copy number per protein: " + max_domain_copy_number_per_protein.to_s )
327 log << "Max target domain copy number per protein: " + max_domain_copy_number_per_protein.to_s
330 if ( max_domain_copy_number_per_protein > 1 )
331 puts( "First target protein with this copy number: " + max_domain_copy_number_sequence )
332 log << "First target protein with this copy number: " + max_domain_copy_number_sequence
336 write_msa( out_msa, outfile + ".fasta" )
337 write_msa( passed_seqs, passed_seqs_outfile )
338 write_msa( failed_seqs, failed_seqs_outfile )
341 log << "passing target domains : " + domain_pass_counter.to_s + ld
342 log << "failing target domains : " + domain_fail_counter.to_s + ld
343 log << "proteins in sequence (fasta) file : " + in_msa.get_number_of_seqs.to_s + ld
344 log << "proteins in hmmscan outputfile : " + protein_counter.to_s + ld
345 log << "proteins with passing target domain(s) : " + passed_seqs.get_number_of_seqs.to_s + ld
346 log << "proteins with no passing target domain : " + proteins_with_failing_domains.to_s + ld
347 log << "proteins with no target domain : " + domain_not_present_counter.to_s + ld
351 return domain_pass_counter
357 def write_msa( msa, filename )
359 w = FastaWriter.new()
360 w.set_line_width( 60 )
363 io.write_to_file( msa, filename, w )
365 error_msg = "could not write to \"" + filename + "\""
366 raise IOError, error_msg
370 def add_sequence( sequence_name, in_msa, add_to_msa )
371 seqs = in_msa.find_by_name_start( sequence_name, true )
372 if ( seqs.length < 1 )
373 error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
374 raise StandardError, error_msg
376 if ( seqs.length > 1 )
377 error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
378 raise StandardError, error_msg
380 seq = in_msa.get_sequence( seqs[ 0 ] )
381 add_to_msa.add_sequence( seq )
384 def process_hmmscan_datas( hmmscan_datas,
391 actual_out_of = hmmscan_datas.size
396 hmmscan_datas.each_with_index do |hmmscan_data, index|
397 if hmmscan_data.number < ( index + 1 )
398 error_msg = "hmmscan_data.number < ( index + 1 ) (this should not have happened)"
399 raise StandardError, error_msg
402 seq_name = hmmscan_data.seq_name
404 extract_domain( seq_name,
407 hmmscan_data.env_from,
415 if prev_seq_name && prev_seq_name != seq_name
416 error_msg = "this should not have happened"
417 raise StandardError, error_msg
419 prev_seq_name = seq_name
421 end # def process_hmmscan_data
423 def extract_domain( sequence,
433 if number.is_a?( Fixnum ) && ( number < 1 || out_of < 1 || number > out_of )
434 error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s
435 raise StandardError, error_msg
437 if seq_from < 1 || seq_to < 1 || seq_from >= seq_to
438 error_msg = "impossible: seq-from=" + seq_from.to_s + ", seq-to=" + seq_to.to_s
439 raise StandardError, error_msg
441 seqs = in_msa.find_by_name_start( sequence, true )
443 error_msg = "sequence \"" + sequence + "\" not found in sequence file"
444 raise IOError, error_msg
447 error_msg = "sequence \"" + sequence + "\" not unique in sequence file"
448 raise IOError, error_msg
450 # hmmscan is 1 based, whereas sequences are 0 bases in this package.
451 seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
453 orig_name = seq.get_name
455 seq.set_name( orig_name.split[ 0 ] )
458 seq.set_name( seq.get_name + "_" + seq_from.to_s + "-" + seq_to.to_s )
461 if out_of != 1 && add_domain_number
462 seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
466 a = orig_name.rindex "["
467 b = orig_name.rindex "]"
469 error_msg = "species not found in " + orig_name
470 raise StandardError, error_msg
472 species = orig_name[ a .. b ]
473 seq.set_name( seq.get_name + " " + species )
475 out_msa.add_sequence( seq )
478 def is_ignorable?( line )
479 return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
482 end # class HmmscanDomainExtractor
485 def initialize( seq_name, number, out_of, env_from, env_to, i_e_value )
491 @i_e_value = i_e_value
493 attr_reader :seq_name, :number, :out_of, :env_from, :env_to, :i_e_value
497 def initialize( hmm, i_e_value, abs_len, rel_len )
499 @i_e_value = i_e_value
501 @percent_len = rel_len
503 attr_reader :hmm, :i_e_value, :abs_len, :rel_len