- ####
-
- prev_query = nil
- saw_target = false
-
- results.each do | r |
-
- if ( prev_query != nil ) && ( r.query != prev_query )
- protein_counter += 1
- passing_domains_per_protein = 0
- if !saw_target
- log << domain_not_present_counter.to_s + ": " + prev_query.to_s + " lacks target domain" + ld
- domain_not_present_counter += 1
- end
- saw_target = false
- end
-
- prev_query = r.query
-
- if domain_id != r.model
- next
- end
-
- saw_target = true
-
- sequence = r.query
- number = r.number
- out_of = r.out_of
- env_from = r.env_from
- env_to = r.env_to
- i_e_value = r.i_e_value
- prev_query = r.query
-
- length = 1 + env_to - env_from
-
- overall_target_length_sum += length
- if length > overall_target_length_max
- overall_target_length_max = length
- end
- if length < overall_target_length_min
- overall_target_length_min = length
- end
-
- if i_e_value > overall_target_ie_max
- overall_target_ie_max = i_e_value
- end
- if i_e_value < overall_target_ie_min
- overall_target_ie_min = i_e_value
- end
-
- if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
- ( ( length_threshold <= 0 ) || ( length >= length_threshold.to_f ) ) )
- hmmscan_datas << HmmscanData.new( sequence, number, out_of, env_from, env_to, i_e_value )
- passing_target_length_sum += length
- passing_domains_per_protein += 1
- if length > passing_target_length_max
- passing_target_length_max = length
- end
- if length < passing_target_length_min
- passing_target_length_min = length
- end
- if i_e_value > passing_target_ie_max
- passing_target_ie_max = i_e_value
- end
- if i_e_value < passing_target_ie_min
- passing_target_ie_min = i_e_value
- end
- if ( passing_domains_per_protein > max_domain_copy_number_per_protein )
- max_domain_copy_number_sequence = sequence
- max_domain_copy_number_per_protein = passing_domains_per_protein
- end
- else # no pass
- log << domain_fail_counter.to_s + ": " + sequence.to_s + " fails threshold(s)"
- if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) )
- log << " iE=" + i_e_value.to_s
- end
- if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f )
- le = env_to - env_from + 1
- log << " l=" + le.to_s
- end
- log << ld
- domain_fail_counter += 1
- end
-
- if number > out_of
- error_msg = "number > out_of (this should not have happened)"
- raise StandardError, error_msg
- end
-
- if number == out_of
- if !hmmscan_datas.empty?
- process_hmmscan_datas( hmmscan_datas,
- in_msa,
- add_domain_number,
- out_msa )
- domain_pass_counter += hmmscan_datas.length
- if passed_seqs.find_by_name_start( sequence, true ).length < 1
- add_sequence( sequence, in_msa, passed_seqs )
- else
- error_msg = "this should not have happened"
- raise StandardError, error_msg
- end
- else # no pass
- if failed_seqs.find_by_name_start( sequence, true ).length < 1
- add_sequence( sequence, in_msa, failed_seqs )
- proteins_with_failing_domains += 1
- else
- error_msg = "this should not have happened"
- raise StandardError, error_msg
- end
- end
- hmmscan_datas.clear
- end
-
- end # results.each do | r |
-
- if (prev_query != nil) && (!saw_target)
- log << domain_not_present_counter.to_s + ": " + prev_query.to_s + " lacks target domain" + ld
- domain_not_present_counter += 1
- end
-
- if domain_pass_counter < 1
- error_msg = "no domain sequences were extracted"
- raise IOError, error_msg
- end
-
- if ( domain_not_present_counter + passed_seqs.get_number_of_seqs + proteins_with_failing_domains ) != protein_counter
- error_msg = "not present + passing + not passing != proteins in sequence (fasta) file (this should not have happened)"
- raise StandardError, error_msg
- end
-
- puts
- log << ld
-
- log << ld
- avg_pass = ( passing_target_length_sum / domain_pass_counter )
- puts( "Passing target domain lengths: average: " + avg_pass.to_s )
- log << "Passing target domain lengths: average: " + avg_pass.to_s
- log << ld
- puts( "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + " - " + passing_target_length_max.to_s)
- log << "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + " - " + passing_target_length_max.to_s
- log << ld
- puts( "Passing target domain iE: min-max: " + passing_target_ie_min.to_s + " - " + passing_target_ie_max.to_s)
- log << "Passing target domain iE: min-max: " + passing_target_ie_min.to_s + " - " + passing_target_ie_max.to_s
- log << ld
- puts( "Passing target domains: sum: " + domain_pass_counter.to_s )
- log << "Passing target domains: sum: " + domain_pass_counter.to_s
- log << ld
- log << ld
- puts
- sum = domain_pass_counter + domain_fail_counter
- avg_all = overall_target_length_sum / sum
- puts( "All target domain lengths: average: " + avg_all.to_s )
- log << "All target domain lengths: average: " + avg_all.to_s
- log << ld
- puts( "All target domain lengths: min-max: " + overall_target_length_min.to_s + " - " + overall_target_length_max.to_s)
- log << "All target domain lengths: min-max: " + overall_target_length_min.to_s + " - " + overall_target_length_max.to_s
- log << ld
- puts( "All target target domain iE: min-max: " + overall_target_ie_min.to_s + " - " + overall_target_ie_max.to_s)
- log << "All target target domain iE: min-max: " + overall_target_ie_min.to_s + " - " + overall_target_ie_max.to_s
- log << ld
- puts( "All target domains: sum: " + sum.to_s )
- log << "All target domains: sum: " + sum.to_s
-
- puts
- puts( "Proteins with passing target domain(s): " + passed_multi_seqs.get_number_of_seqs.to_s )
- #puts( "Proteins with passing target domain(s): " + passed_seqs.get_number_of_seqs.to_s )
- puts( "Proteins with no passing target domain: " + proteins_with_failing_domains.to_s )
- puts( "Proteins with no target domain : " + domain_not_present_counter.to_s )
-
- log << ld
- log << ld
- puts
- puts( "Max target domain copy number per protein: " + max_domain_copy_number_per_protein.to_s )
- log << "Max target domain copy number per protein: " + max_domain_copy_number_per_protein.to_s
- log << ld
-
- if ( max_domain_copy_number_per_protein > 1 )
- puts( "First target protein with this copy number: " + max_domain_copy_number_sequence )
- log << "First target protein with this copy number: " + max_domain_copy_number_sequence
- log << ld
- end
-
- write_msa( out_msa, outfile + ".fasta" )
- write_msa( passed_multi_seqs, passed_seqs_outfile )
- write_msa( failed_seqs, failed_seqs_outfile )