in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / io / parser / hmmscan_multi_domain_extractor.rb
1 #
2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanMultiDomainExtractor class
3 #
4 # Copyright::    Copyright (C) 2017 Christian M. Zmasek
5 # License::      GNU Lesser General Public License (LGPL)
6 #
7 # Last modified: 2017/02/20
8
9 require 'lib/evo/util/constants'
10 require 'lib/evo/msa/msa_factory'
11 require 'lib/evo/io/msa_io'
12 require 'lib/evo/io/writer/fasta_writer'
13 require 'lib/evo/io/parser/fasta_parser'
14 require 'lib/evo/io/parser/hmmscan_parser'
15
16 module Evoruby
17   class HmmscanMultiDomainExtractor
18
19     TESTING = false
20     OUTPUT_ID = 'mdsx'
21     DOMAIN_DELIMITER = ' -- '
22
23     IE_CUTOFF_FOR_DA_OVERVIEW      = 1E-6
24     REL_LEN_CUTOFF_FOR_DA_OVERVIEW = 0.5
25
26     PASSING_FL_SEQS_SUFFIX    = "_#{OUTPUT_ID}_passing_full_length_seqs.fasta"
27     FAILING_FL_SEQS_SUFFIX    = "_#{OUTPUT_ID}_failing_full_length_seqs.fasta"
28     TARGET_DA_SUFFIX          = "_#{OUTPUT_ID}_target_da.fasta"
29     CONCAT_TARGET_DOM_SUFFIX  = "_#{OUTPUT_ID}_concat_target_doms.fasta"
30     TARGET_DOM_OUTPUT_MIDPART = "_#{OUTPUT_ID}_target_dom_"
31     LOG_FILE_SUFFIX           = "_#{OUTPUT_ID}.log"
32     def initialize
33       @passing_domains_data = nil
34       @failing_domains_data = nil
35       @failing_domain_architectures = nil
36       @passsing_domain_architectures = nil
37       @failing_proteins_bc_not_all_target_doms_present = nil
38       @failing_proteins_bc_missing_cutoffs = nil
39       @encountered_domain_architectures = nil
40       @log = nil
41     end
42
43     # raises ArgumentError, IOError, StandardError
44     def parse( target_da,
45       hmmscan_output,
46       fasta_sequence_file,
47       outfile_base,
48       log_str )
49
50       passing_fl_seqs_outfile   = outfile_base + PASSING_FL_SEQS_SUFFIX
51       failing_fl_seqs_outfile   = outfile_base + FAILING_FL_SEQS_SUFFIX
52       target_da_outfile         = outfile_base + TARGET_DA_SUFFIX
53       concat_target_dom_outfile = outfile_base + CONCAT_TARGET_DOM_SUFFIX
54       logfile                   = outfile_base + LOG_FILE_SUFFIX
55
56       Util.check_file_for_readability( hmmscan_output )
57       Util.check_file_for_readability( fasta_sequence_file )
58       Util.check_file_for_writability( passing_fl_seqs_outfile )
59       Util.check_file_for_writability( failing_fl_seqs_outfile )
60       Util.check_file_for_writability( target_da_outfile )
61       Util.check_file_for_writability( concat_target_dom_outfile )
62       Util.check_file_for_writability( logfile )
63
64       @log = log_str
65
66       in_msa = nil
67       factory = MsaFactory.new()
68       in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
69
70       if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
71         error_msg = "could not find fasta sequences in " + fasta_sequence_file
72         raise IOError, error_msg
73       end
74
75       failed_seqs_msa = Msa.new
76       passed_seqs_msa = Msa.new
77
78       hmmscan_parser = HmmscanParser.new( hmmscan_output )
79       results = hmmscan_parser.parse
80
81       ####
82       # Import: if multiple copies of same domain, thresholds need to be same!
83
84       #target_domain_ary.push(TargetDomain.new('DNA_pol_B_exo1', 1e-6, -1, 0.6, 0 ))
85       #target_domain_ary.push(TargetDomain.new('DNA_pol_B', 1e-6, -1, 0.6, 1 ))
86
87       #target_domain_ary.push(TargetDomain.new('Hexokinase_1', 1e-6, -1, 0.9, 0 ))
88       #target_domain_ary.push(TargetDomain.new('Hexokinase_2', 1e-6, -1, 0.9, 1 ))
89       # target_domain_ary.push(TargetDomain.new('Hexokinase_2', 0.1, -1, 0.5, 1 ))
90       # target_domain_ary.push(TargetDomain.new('Hexokinase_1', 0.1, -1, 0.5, 1 ))
91
92       target_domain_ary = parse_da target_da
93
94       # target_domain_ary.push(TargetDomain.new('BH4', 1, -1, 0.2, 0 ))
95       # target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 1 ))
96       # target_domain_ary.push(TargetDomain.new('Bcl-2', 1, -1, 0.2, 2 ))
97       # target_domain_ary.push(TargetDomain.new('Bcl-2_3', 0.01, -1, 0.5, 2 ))
98
99       #  target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 0 ))
100       #  target_domain_ary.push(TargetDomain.new('Nitrate_red_del', 1000, -1, 0.1, 1 ))
101
102       #target_domain_ary.push(TargetDomain.new('Chordopox_A33R', 1000, -1, 0.1 ))
103
104       target_domains = Hash.new
105
106       target_domain_architecure = ''
107
108       target_domain_ary.each do |target_domain|
109         target_domains[target_domain.name] = target_domain
110         if target_domain_architecure.length > 0
111           target_domain_architecure << DOMAIN_DELIMITER
112         end
113         target_domain_architecure << target_domain.name
114       end
115
116       target_domain_architecure.freeze
117
118       log 'Hmmscan outputfile             : ' + hmmscan_output
119       log 'Full length fasta sequence file: ' + fasta_sequence_file
120       log 'Target domain architecture     : ' + target_domain_architecure
121       target_domain_ary.each do |x|
122         log x.to_str
123       end
124
125       target_domain_names = Set.new
126
127       target_domains.each_key {|key| target_domain_names.add( key ) }
128
129       prev_query_seq_name = nil
130       domains_in_query_seq = Array.new
131       passing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq.
132       failing_sequences = Array.new # This will be an Array of Array of HmmscanResult for the same query seq.
133       total_sequences = 0
134
135       @failing_domain_architectures = Hash.new
136       @passsing_domain_architectures = Hash.new
137       @passing_domains_data = Hash.new
138       @failing_domains_data = Hash.new
139       @encountered_domain_architectures= Hash.new
140       @failing_proteins_bc_not_all_target_doms_present = 0
141       @failing_proteins_bc_missing_cutoffs = 0
142       out_domain_msas = Hash.new
143       out_domain_architecture_msa = Msa.new
144       out_concatenated_domains_msa = Msa.new
145       results.each do |hmmscan_result|
146         if ( prev_query_seq_name != nil ) && ( hmmscan_result.query != prev_query_seq_name )
147           if compare(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
148             passing_sequences.push(domains_in_query_seq)
149           else
150             failing_sequences.push(domains_in_query_seq)
151           end
152           domains_in_query_seq = Array.new
153           total_sequences += 1
154         end
155         prev_query_seq_name = hmmscan_result.query
156         domains_in_query_seq.push(hmmscan_result)
157       end # each result
158
159       if prev_query_seq_name != nil
160         if compare(domains_in_query_seq, target_domain_names, target_domains, in_msa, out_domain_msas, out_domain_architecture_msa, out_concatenated_domains_msa, target_domain_architecure)
161           passing_sequences.push(domains_in_query_seq)
162         else
163           failing_sequences.push(domains_in_query_seq)
164         end
165         total_sequences += 1
166       end
167
168       if in_msa.get_number_of_seqs < total_sequences
169         error_msg = "hmmscan output contains more protein sequences than fasta sequence file"
170         raise IOError, error_msg
171       end
172
173       log
174       log 'Domain architecture overview using default iE-cutoff ' +
175       IE_CUTOFF_FOR_DA_OVERVIEW.to_s + ' and relative length cutoff ' + REL_LEN_CUTOFF_FOR_DA_OVERVIEW.to_s + ':'
176
177       @encountered_domain_architectures = @encountered_domain_architectures.sort_by {|k, v| v}.reverse.to_h
178       counter = 1;
179       @encountered_domain_architectures.each do |k, v|
180         log counter.to_s.rjust(2) + ') ' +  v.to_s.rjust(5) + ': ' + k
181         counter += 1
182         if counter > 80
183           break
184         end
185       end
186
187       log
188       log 'Passing domain architectures containing target domain(s):'
189       @passsing_domain_architectures = @passsing_domain_architectures.sort{|a, b|a<=>b}.to_h
190       passing_da_sum = 0
191       @passsing_domain_architectures.each do |da, count|
192         passing_da_sum += count
193         log count.to_s.rjust(4) + ': ' + da
194       end
195       log
196       log 'Failing domain architectures containing one or more target domain(s):'
197       @failing_domain_architectures = @failing_domain_architectures .sort{|a, b|a<=>b}.to_h
198       failing_da_sum = 0
199       @failing_domain_architectures .each do |da, count|
200         failing_da_sum += count
201         log count.to_s.rjust(4) + ': ' + da
202       end
203       log
204       log 'Passing target domain(s):'
205       @passing_domains_data = @passing_domains_data.sort{|a, b|a<=>b}.to_h
206       @passing_domains_data.each do |n, d|
207         log d.to_str
208       end
209       log
210       log'Failing target domain(s):'
211       @failing_domains_data = @failing_domains_data.sort{|a, b|a<=>b}.to_h
212       @failing_domains_data.each do |n, d|
213         log d.to_str
214       end
215
216       unless total_sequences == (passing_sequences.size + failing_sequences.size)
217         error_msg = "this should not have happened: total seqs not equal to passing plus failing seqs"
218         raise StandardError, error_msg
219       end
220
221       unless failing_sequences.size == (@failing_proteins_bc_not_all_target_doms_present + @failing_proteins_bc_missing_cutoffs)
222         error_msg = "this should not have happened: failing seqs sums not consistent"
223         raise StandardError, error_msg
224       end
225
226       unless @failing_proteins_bc_missing_cutoffs == failing_da_sum
227         error_msg = "this should not have happened: failing seqs not equal to failing da sum"
228         raise StandardError, error_msg
229       end
230
231       unless passing_sequences.size == passing_da_sum
232         error_msg = "this should not have happened: passing seqs not equal to passing da sum"
233         raise StandardError, error_msg
234       end
235
236       log
237       log "Protein sequences in sequence (fasta) file: " + in_msa.get_number_of_seqs.to_s.rjust(5)
238       log "Protein sequences in hmmscan output file  : " + total_sequences.to_s.rjust(5)
239       log "  Passing protein sequences               : " + passing_sequences.size.to_s.rjust(5)
240       log "  Failing protein sequences               : " + failing_sequences.size.to_s.rjust(5)
241       log "    Not all target domain present         : " + @failing_proteins_bc_not_all_target_doms_present.to_s.rjust(5)
242       log "    Target domain(s) failing cutoffs      : " + @failing_proteins_bc_missing_cutoffs.to_s.rjust(5)
243       log
244
245       out_domain_msas.keys.sort.each do |domain_name|
246         file_name = outfile_base + TARGET_DOM_OUTPUT_MIDPART + domain_name + '.fasta'
247         write_msa out_domain_msas[domain_name], file_name
248         log "Wrote passing target domain sequence for " +  domain_name.ljust(16) + ': ' + file_name
249       end
250
251       write_msa out_domain_architecture_msa, target_da_outfile
252       log 'Wrote target domain architecture                         : ' + target_da_outfile
253
254       write_msa out_concatenated_domains_msa, concat_target_dom_outfile
255       log 'Wrote concatenated target domain(s)                      : ' + concat_target_dom_outfile
256
257       passing_sequences.each do | domains |
258         query_name = domains[0].query
259         if (!TESTING) || (passed_seqs_msa.find_by_name_start( query_name, true ).length < 1)
260           add_sequence( query_name, in_msa, passed_seqs_msa )
261         else
262           error_msg = 'this should not have happened'
263           raise StandardError, error_msg
264         end
265       end
266
267       failing_sequences.each do | domains |
268         query_name = domains[0].query
269         if (!TESTING) || (failed_seqs_msa.find_by_name_start( query_name, true ).length < 1)
270           add_sequence( query_name, in_msa, failed_seqs_msa )
271         else
272           error_msg = 'this should not have happened'
273           raise StandardError, error_msg
274         end
275       end
276
277       write_msa passed_seqs_msa, passing_fl_seqs_outfile
278       log 'Wrote passing full length protein sequences              : ' + passing_fl_seqs_outfile
279       write_msa failed_seqs_msa, failing_fl_seqs_outfile
280       log 'Wrote failing full length protein sequences              : ' + failing_fl_seqs_outfile
281
282       begin
283         f = File.open( logfile, 'w' )
284         f.print( @log )
285         f.close
286       rescue Exception => e
287         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
288       end
289       log 'Wrote log file                                           : ' + logfile
290
291     end # parse
292
293     private
294
295     # domains: Array of HmmscanResult
296     def collect(domains, ie_cutoff, rel_len_cutoff)
297       passed = Array.new
298       domains.each do |domain|
299         length = 1 + domain.env_to - domain.env_from
300         if (domain.i_e_value <= ie_cutoff) && (length >= (rel_len_cutoff * domain.tlen))
301           passed.push domain
302         end
303       end
304       passed.sort! { |a,b| a.ali_from <=> b.ali_from }
305       s = ''
306       passed.each do |domain|
307         if s.length > 0
308           s << DOMAIN_DELIMITER
309         end
310         s << domain.model
311       end
312       if s.length > 0
313         if @encountered_domain_architectures.has_key? s
314           @encountered_domain_architectures[s] = 1 + @encountered_domain_architectures[s]
315         else
316           @encountered_domain_architectures[s] = 1
317         end
318       end
319     end
320
321     # domains_in_query_seq: Array of HmmscanResult
322     # target_domain_names: Set of String
323     # target_domains: Hash String->TargetDomain
324     # target_domain_architecture: String
325     def compare(domains_in_query_seq,
326       target_domain_names,
327       target_domains,
328       in_msa,
329       out_domain_msas,
330       out_domain_architecture_msa,
331       out_contactenated_domains_msa,
332       target_domain_architecture)
333
334       collect(domains_in_query_seq, IE_CUTOFF_FOR_DA_OVERVIEW, REL_LEN_CUTOFF_FOR_DA_OVERVIEW)
335
336       domain_names_in_query_seq = Set.new
337       domains_in_query_seq.each do |domain|
338         domain_names_in_query_seq.add(domain.model)
339       end
340       if target_domain_names.subset?(domain_names_in_query_seq)
341
342         passed_domains = Array.new
343         passed_domains_counts = Hash.new
344
345         domains_in_query_seq.each do |domain|
346           if target_domains.has_key?(domain.model)
347             target_domain = target_domains[domain.model]
348
349             if (target_domain.i_e_value != nil) && (target_domain.i_e_value >= 0)
350               if domain.i_e_value > target_domain.i_e_value
351                 addToFailingDomainData(domain)
352                 next
353               end
354             end
355             if (target_domain.abs_len != nil) && (target_domain.abs_len > 0)
356               length = 1 + domain.env_to - domain.env_from
357               if length < target_domain.abs_len
358                 addToFailingDomainData(domain)
359                 next
360               end
361             end
362             if (target_domain.rel_len != nil) && (target_domain.rel_len > 0)
363               length = 1 + domain.env_to - domain.env_from
364               if length < (target_domain.rel_len * domain.tlen)
365                 addToFailingDomainData(domain)
366                 next
367               end
368             end
369
370             passed_domains.push(domain)
371
372             if (passed_domains_counts.key?(domain.model))
373               passed_domains_counts[domain.model] = passed_domains_counts[domain.model] + 1
374             else
375               passed_domains_counts[domain.model] = 1
376             end
377
378             addToPassingDomainData(domain)
379
380           end # if target_domains.has_key?(domain.model)
381         end # domains_in_query_seq.each do |domain|
382       else
383         @failing_proteins_bc_not_all_target_doms_present += 1
384         return false
385       end # target_domain_names.subset?(domain_names_in_query_seq)
386
387       passed_domains.sort! { |a,b| a.ali_from <=> b.ali_from }
388       # Compare architectures
389       if !compareArchitectures(target_domain_architecture, passed_domains)
390         @failing_proteins_bc_missing_cutoffs += 1
391         return false
392       end
393
394       domain_counter = Hash.new
395
396       min_env_from = 10000000
397       max_env_to = 0
398
399       query_seq = nil
400
401       concatenated_domains = ''
402
403       passed_domains.each do |domain|
404         domain_name = domain.model
405
406         unless out_domain_msas.has_key? domain_name
407           out_domain_msas[ domain_name ] = Msa.new
408         end
409
410         if domain_counter.key?(domain_name)
411           domain_counter[domain_name] = domain_counter[domain_name] + 1
412         else
413           domain_counter[domain_name] = 1
414         end
415
416         if query_seq == nil
417           query_seq = domain.query
418           query_seq.freeze
419         elsif query_seq != domain.query
420           error_msg = "seq names do not match: this should not have happened"
421           raise StandardError, error_msg
422         end
423
424         extracted_domain_seq = extract_domain( query_seq,
425         domain_counter[domain_name],
426         passed_domains_counts[domain_name],
427         domain.env_from,
428         domain.env_to,
429         in_msa,
430         out_domain_msas[ domain_name ] )
431
432         concatenated_domains += extracted_domain_seq
433
434         if domain.env_from < min_env_from
435           min_env_from = domain.env_from
436         end
437         if domain.env_to > max_env_to
438           max_env_to = domain.env_to
439         end
440       end
441
442       extract_domain( query_seq,
443       1,
444       1,
445       min_env_from,
446       max_env_to,
447       in_msa,
448       out_domain_architecture_msa )
449
450       out_contactenated_domains_msa.add( query_seq, concatenated_domains)
451
452       true
453     end
454
455     def addToPassingDomainData(domain)
456       unless ( @passing_domains_data.key?(domain.model))
457         @passing_domains_data[domain.model] = DomainData.new(domain.model)
458       end
459       @passing_domains_data[domain.model].add( domain.model, (1 + domain.env_to - domain.env_from), domain.i_e_value)
460     end
461
462     def addToFailingDomainData(domain)
463       unless ( @failing_domains_data.key?(domain.model))
464         @failing_domains_data[domain.model] = DomainData.new(domain.model)
465       end
466       @failing_domains_data[domain.model].add( domain.model, (1 + domain.env_to - domain.env_from), domain.i_e_value)
467     end
468
469     # passed_domains needs to be sorted!
470     def compareArchitectures(target_domain_architecture, passed_domains, strict = false)
471       domain_architecture = ''
472       passed_domains.each do |domain|
473         if domain_architecture.length > 0
474           domain_architecture += DOMAIN_DELIMITER
475         end
476         domain_architecture += domain.model
477       end
478       pass = false
479       if strict
480         pass = (target_domain_architecture == domain_architecture)
481       else
482         pass = (domain_architecture.index(target_domain_architecture) != nil)
483       end
484       if pass
485         if (@passsing_domain_architectures.key?(domain_architecture))
486           @passsing_domain_architectures[domain_architecture] = @passsing_domain_architectures[domain_architecture] + 1
487         else
488           @passsing_domain_architectures[domain_architecture] = 1
489         end
490       else
491         if ( @failing_domain_architectures .key?(domain_architecture))
492           @failing_domain_architectures [domain_architecture] =  @failing_domain_architectures [domain_architecture] + 1
493         else
494           @failing_domain_architectures [domain_architecture] = 1
495         end
496       end
497       return pass
498     end
499
500     def write_msa( msa, filename )
501       io = MsaIO.new()
502       w = FastaWriter.new()
503       w.set_line_width( 60 )
504       w.clean( true )
505       File.delete(filename) if File.exist?(filename)
506       begin
507         io.write_to_file( msa, filename, w )
508       rescue Exception
509         error_msg = "could not write to \"" + filename + "\""
510         raise IOError, error_msg
511       end
512     end
513
514     def add_sequence( sequence_name, in_msa, add_to_msa )
515       seqs = in_msa.find_by_name_start( sequence_name, true )
516       if ( seqs.length < 1 )
517         error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
518         raise StandardError, error_msg
519       end
520       if ( seqs.length > 1 )
521         error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
522         raise StandardError, error_msg
523       end
524       seq = in_msa.get_sequence( seqs[ 0 ] )
525       add_to_msa.add_sequence( seq )
526     end
527
528     def extract_domain( seq_name,
529       number,
530       out_of,
531       seq_from,
532       seq_to,
533       in_msa,
534       out_msa)
535       if number < 1 || out_of < 1 || number > out_of
536         error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s
537         raise StandardError, error_msg
538       end
539       if seq_from < 1 || seq_to < 1 || seq_from >= seq_to
540         error_msg = "impossible: seq-from=" + seq_from.to_s + ", seq-to=" + seq_to.to_s
541         raise StandardError, error_msg
542       end
543       seqs = in_msa.find_by_name_start(seq_name, true)
544       if seqs.length < 1
545         error_msg = "sequence \"" + seq_name + "\" not found in sequence file"
546         raise IOError, error_msg
547       end
548       if seqs.length > 1
549         error_msg = "sequence \"" + seq_name + "\" not unique in sequence file"
550         raise IOError, error_msg
551       end
552       # hmmscan is 1 based, whereas sequences are 0 bases in this package.
553       seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
554
555       orig_name = seq.get_name
556
557       seq.set_name( orig_name.split[ 0 ] )
558
559       if out_of != 1
560         seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
561       end
562
563       out_msa.add_sequence( seq )
564
565       seq.get_sequence_as_string
566     end
567
568     def parse_da( target_da_str )
569       target_domain_hash = Hash.new
570       target_domain_ary = Array.new
571       target_das = target_da_str.split '--'
572       target_das.each do |x|
573         inds = x.split '='
574         unless inds.size == 4
575           raise IOError, 'domain architecture is ill formatted: ' + x
576         end
577         target_domain_name = inds[0]
578         ie_cutoff = Float(inds[1])
579         abs_len_cutoff = Integer(inds[2])
580         rel_len_cutoff = Float(inds[3])
581         if target_domain_hash.has_key? target_domain_name
582           target_domain_ary.push target_domain_hash[target_domain_name]
583         else
584           td = TargetDomain.new(target_domain_name, ie_cutoff, abs_len_cutoff, rel_len_cutoff)
585           target_domain_hash[target_domain_name] = td
586           target_domain_ary.push td
587         end
588       end
589       target_domain_ary
590     end
591
592     def log(str = '')
593       puts str
594       @log << str << Constants::LINE_DELIMITER
595     end
596
597   end # class HmmscanMultiDomainExtractor
598
599   class DomainData
600     def initialize( name )
601       if (name == nil) || name.size < 1
602         error_msg = "domain name nil or empty"
603         raise IOError, error_msg
604       end
605       @name = name
606       @count = 0
607       @i_e_value_min = 10000000.0
608       @i_e_value_max = -1.0
609       @i_e_value_sum = 0.0
610       @len_min = 10000000
611       @len_max = -1
612       @len_sum = 0.0
613     end
614
615     def add( name, length, i_e_value)
616       if name != @name
617         error_msg = "domain names do not match"
618         raise IOError, error_msg
619       end
620
621       if length < 0
622         error_msg = "length cannot me negative"
623         raise IOError, error_msg
624       end
625       if i_e_value < 0
626         error_msg = "iE-value cannot me negative"
627         raise IOError, error_msg
628       end
629       @count += 1
630       @i_e_value_sum += i_e_value
631       @len_sum += length
632       if i_e_value > @i_e_value_max
633         @i_e_value_max = i_e_value
634       end
635       if i_e_value < @i_e_value_min
636         @i_e_value_min = i_e_value
637       end
638       if length > @len_max
639         @len_max = length
640       end
641       if length < @len_min
642         @len_min = length
643       end
644     end
645
646     def avg_length
647       if @count == 0
648         return 0
649       end
650       @len_sum / @count
651     end
652
653     def avg_i_e_value
654       if @count == 0
655         return 0
656       end
657       @i_e_value_sum / @count
658     end
659
660     def to_str
661       s = ''
662       s << @name.rjust(16) + ': '
663       s << @count.to_s.rjust(4) + '  '
664       s << avg_length.round(1).to_s.rjust(6) + ' '
665       s << @len_min.to_s.rjust(4) + ' -'
666       s << @len_max.to_s.rjust(4) + '  '
667       s << ("%.2E" % avg_i_e_value).rjust(9) + ' '
668       s << ("%.2E" % @i_e_value_min).rjust(9) + ' -'
669       s << ("%.2E" % @i_e_value_max).rjust(9) + ' '
670       s
671     end
672
673     attr_reader :name, :count, :i_e_value_min, :i_e_value_max, :length_min, :length_max
674
675   end
676
677   class TargetDomain
678     def initialize(name, i_e_value, abs_len, rel_len)
679       if (name == nil) || name.size < 1
680         error_msg = "target domain name nil or empty"
681         raise IOError, error_msg
682       end
683       if rel_len > 1
684         error_msg = name + ": target domain relative length is greater than 1"
685         raise IOError, error_msg
686       end
687       if (abs_len <= 0) && (rel_len <= 0)
688         error_msg = name + ": need to have either absolute length or relative length cutoff"
689         raise IOError, error_msg
690       end
691       if (abs_len > 0) && (rel_len > 0)
692         error_msg = name + ": cannot have both absolute length and relative length cutoff"
693         raise IOError, error_msg
694       end
695       @name = name
696       @i_e_value = i_e_value
697       @abs_len = abs_len
698       @rel_len = rel_len
699     end
700
701     def to_str
702       s = @name.rjust(16) + ':'
703       s << ' iE-cutoff: ' + ("%.2E" % @i_e_value).rjust(9)
704       if @abs_len > 0
705         s << ', abs len-cutoff: ' + @abs_len.to_s.rjust(4)
706       end
707       if @rel_len > 0
708         s << ', rel len-cutoff: ' + @rel_len.to_s.rjust(4)
709       end
710       s
711     end
712     attr_reader :name, :i_e_value, :abs_len, :rel_len
713   end
714
715 end # module Evoruby
716