#
# Copyright:: Copyright (C) 2017 Christian M. Zmasek
# License:: GNU Lesser General Public License (LGPL)
+#
+# Last modified: 2017/02/16
require 'lib/evo/util/constants'
require 'lib/evo/msa/msa_factory'
ld = Constants::LINE_DELIMITER
- domain_pass_counter = 0
- domain_fail_counter = 0
- proteins_with_failing_domains = 0
+ domain_pass_counter = 0
+ domain_fail_counter = 0
+ proteins_with_failing_domains = 0
+ domain_not_present_counter = 0
+ protein_counter = 1
max_domain_copy_number_per_protein = -1
max_domain_copy_number_sequence = ""
+ passing_target_length_sum = 0
+ overall_target_length_sum = 0
+ overall_target_length_min = 10000000
+ overall_target_length_max = 0
+ passing_target_length_min = 10000000
+ passing_target_length_max = 0
hmmscan_datas = []
hmmscan_parser = HmmscanParser.new( hmmscan_output )
results = hmmscan_parser.parse
+ prev_query = nil
+ saw_target = false
+
results.each do | r |
+
+ if ( prev_query != nil ) && ( r.query != prev_query )
+ protein_counter += 1
+ if !saw_target
+ log << domain_not_present_counter.to_s + ": " + prev_query.to_s + " lacks target domain" + ld
+ domain_not_present_counter += 1
+ end
+ saw_target = false
+ end
+
+ prev_query = r.query
+
if domain_id != r.model
next
end
+ saw_target = true
+
sequence = r.query
number = r.number
out_of = r.out_of
env_from = r.env_from
env_to = r.env_to
i_e_value = r.i_e_value
+ prev_query = r.query
+
+ length = env_to - env_from + 1
+
+ overall_target_length_sum += length
+ if length > overall_target_length_max
+ overall_target_length_max = length
+ end
+ if length < overall_target_length_min
+ overall_target_length_min = length
+ end
if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
- ( ( length_threshold <= 0 ) || ( env_to - env_from + 1 ) >= length_threshold.to_f ) )
+ ( ( length_threshold <= 0 ) || ( length >= length_threshold.to_f ) ) )
hmmscan_datas << HmmsearchData.new( sequence, number, out_of, env_from, env_to, i_e_value )
+ passing_target_length_sum += length
+ if length > passing_target_length_max
+ passing_target_length_max = length
+ end
+ if length < passing_target_length_min
+ passing_target_length_min = length
+ end
if ( number > max_domain_copy_number_per_protein )
max_domain_copy_number_sequence = sequence
max_domain_copy_number_per_protein = number
end
- else # failed
- print( domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" )
- log << domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)"
+ else # no pass
+ log << domain_fail_counter.to_s + ": " + sequence.to_s + " fails threshold(s)"
if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) )
- print( " iE=" + i_e_value.to_s )
log << " iE=" + i_e_value.to_s
end
if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f )
le = env_to - env_from + 1
- print( " l=" + le.to_s )
log << " l=" + le.to_s
end
- print( ld )
log << ld
- domain_fail_counter += 1
+ domain_fail_counter += 1
end
if number > out_of
- error_msg = "number > out_of ! (this should not have happened)"
+ error_msg = "number > out_of (this should not have happened)"
raise StandardError, error_msg
end
error_msg = "this should not have happened"
raise StandardError, error_msg
end
- else
+ else # no pass
if failed_seqs.find_by_name_start( sequence, true ).length < 1
add_sequence( sequence, in_msa, failed_seqs )
proteins_with_failing_domains += 1
end
hmmscan_datas.clear
end
+
+ end # results.each do | r |
+
+ if (prev_query != nil) && (!saw_target)
+ log << domain_not_present_counter.to_s + ": " + prev_query.to_s + " lacks target domain" + ld
+ domain_not_present_counter += 1
end
if domain_pass_counter < 1
raise IOError, error_msg
end
+ if ( domain_not_present_counter + passed_seqs.get_number_of_seqs + proteins_with_failing_domains ) != protein_counter
+ error_msg = "not present + passing + not passing != proteins in sequence (fasta) file (this should not have happened)"
+ raise StandardError, error_msg
+ end
+
+ puts
+ log << ld
+
+ log << ld
+ avg_pass = ( passing_target_length_sum / domain_pass_counter )
+ puts( "Passing target domain lengths: average: " + avg_pass.to_s )
+ log << "Passing target domain lengths: average: " + avg_pass.to_s
+ log << ld
+ puts( "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + "-" + passing_target_length_max.to_s)
+ log << "Passing target domain lengths: min-max: " + passing_target_length_min.to_s + "-" + passing_target_length_max.to_s
+ log << ld
+ puts( "Passing target domain lengths: sum: " + domain_pass_counter.to_s )
+ log << "Passing target domain lengths: sum: " + domain_pass_counter.to_s
+ log << ld
+ log << ld
+ puts
+ sum = domain_pass_counter + domain_fail_counter
+ avg_all = overall_target_length_sum / sum
+ puts( "All target domain lengths: average: " + avg_all.to_s )
+ log << "All target domain lengths: average: " +avg_all.to_s
+ log << ld
+ puts( "All target domain lengths: min-max: " + overall_target_length_min.to_s + "-" + overall_target_length_max.to_s)
+ log << "All target domain lengths: min-max: " + overall_target_length_min.to_s + "-" + overall_target_length_max.to_s
+ log << ld
+ puts( "All target domain lengths: sum: " + sum.to_s )
+ log << "All target domain lengths: sum: " + sum.to_s
+
+ puts
+ puts( "Proteins with passing target domain(s): " + passed_seqs.get_number_of_seqs.to_s )
+ puts( "Proteins with no passing target domain: " + proteins_with_failing_domains.to_s )
+ puts( "Proteins with no target domain : " + domain_not_present_counter.to_s )
+
+ log << ld
log << ld
- puts( "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s )
- log << "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s
+ puts
+ puts( "Max target domain copy number per protein (includes non-passing): " + max_domain_copy_number_per_protein.to_s )
+ log << "Max target domain copy number per protein (includes non-passing): " + max_domain_copy_number_per_protein.to_s
log << ld
if ( max_domain_copy_number_per_protein > 1 )
- puts( "First protein with this copy number: " + max_domain_copy_number_sequence )
- log << "First protein with this copy number: " + max_domain_copy_number_sequence
+ puts( "First target protein with this copy number: " + max_domain_copy_number_sequence )
+ log << "First target protein with this copy number: " + max_domain_copy_number_sequence
log << ld
end
end
log << ld
- log << "passing domains : " + domain_pass_counter.to_s + ld
- log << "failing domains : " + domain_fail_counter.to_s + ld
- log << "input proteins : " + in_msa.get_number_of_seqs.to_s + ld
- log << "proteins with passing domains : " + passed_seqs.get_number_of_seqs.to_s + ld
- log << "proteins with no passing domains : " + proteins_with_failing_domains.to_s + ld
+ log << "passing target domains : " + domain_pass_counter.to_s + ld
+ log << "failing target domains : " + domain_fail_counter.to_s + ld
+ log << "proteins in sequence (fasta) file : " + in_msa.get_number_of_seqs.to_s + ld
+ log << "proteins in hmmscan outputfile : " + protein_counter.to_s + ld
+ log << "proteins with passing target domain(s) : " + passed_seqs.get_number_of_seqs.to_s + ld
+ log << "proteins with no passing target domain : " + proteins_with_failing_domains.to_s + ld
+ log << "proteins with no target domain : " + domain_not_present_counter.to_s + ld
if min_linker
- log << "min linker length : " + min_linker.to_s + ld
- log << "single domains : " + out_msa_singles.get_number_of_seqs.to_s + ld
- log << "domains in close pairs : " + (2 * out_msa_pairs.get_number_of_seqs).to_s + ld
- log << "isolated domains : " + out_msa_isolated.get_number_of_seqs.to_s + ld
- log << "proteins wih single domains : " + out_msa_single_domains_protein_seqs.get_number_of_seqs.to_s + ld
- log << "proteins wih close pair domains : " + out_msa_close_pairs_protein_seqs.get_number_of_seqs.to_s + ld
- log << "proteins wih close pair domains only : " + out_msa_close_pairs_only_protein_seqs.get_number_of_seqs.to_s + ld
- log << "proteins wih isolated domains : " + out_msa_isolated_protein_seqs.get_number_of_seqs.to_s + ld
- log << "proteins wih isolated domains only : " + out_msa_isolated_only_protein_seqs.get_number_of_seqs.to_s + ld
- log << "proteins wih close pair and isolated domains: " + out_msa_isolated_and_close_pair_protein_seqs.get_number_of_seqs.to_s + ld
+ log << "min linker length : " + min_linker.to_s + ld
+ log << "single domains : " + out_msa_singles.get_number_of_seqs.to_s + ld
+ log << "domains in close pairs : " + (2 * out_msa_pairs.get_number_of_seqs).to_s + ld
+ log << "isolated domains : " + out_msa_isolated.get_number_of_seqs.to_s + ld
+ log << "proteins with single domains : " + out_msa_single_domains_protein_seqs.get_number_of_seqs.to_s + ld
+ log << "proteins with close pair domains : " + out_msa_close_pairs_protein_seqs.get_number_of_seqs.to_s + ld
+ log << "proteins with close pair domains only : " + out_msa_close_pairs_only_protein_seqs.get_number_of_seqs.to_s + ld
+ log << "proteins with isolated domains : " + out_msa_isolated_protein_seqs.get_number_of_seqs.to_s + ld
+ log << "proteins with isolated domains only : " + out_msa_isolated_only_protein_seqs.get_number_of_seqs.to_s + ld
+ log << "proteins with close pair and isolated domains: " + out_msa_isolated_and_close_pair_protein_seqs.get_number_of_seqs.to_s + ld
end
log << ld
error_msg = "sequence \"" + sequence + "\" not unique in sequence file"
raise IOError, error_msg
end
- # hmmsearch is 1 based, wheres sequences are 0 bases in this package.
+ # hmmscan is 1 based, whereas sequences are 0 bases in this package.
seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
orig_name = seq.get_name
PRG_DATE = "20170214"
WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
+ E_VALUE_THRESHOLD_DEFAULT = 0.1
+ LENGTH_THRESHOLD_DEFAULT = 50
+
E_VALUE_THRESHOLD_OPTION = 'e'
LENGTH_THRESHOLD_OPTION = 'l'
ADD_POSITION_OPTION = 'p'
STDOUT )
end
- e_value_threshold = 0.1
+ e_value_threshold = E_VALUE_THRESHOLD_DEFAULT
if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
begin
e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
end
end
- length_threshold = 50
+ length_threshold = LENGTH_THRESHOLD_DEFAULT
if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) )
begin
length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION )
ld = Constants::LINE_DELIMITER
puts()
- puts( "Domain : " + domain_id )
- log << "Domain : " + domain_id + ld
- puts( "Hmmscan outputfile : " + hmmscan_output )
- log << "Hmmscan outputfile : " + hmmscan_output + ld
- puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file )
- log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld
- puts( "Outputfile : " + outfile + ".fasta" )
- log << "Outputfile : " + outfile + ld
- puts( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX )
- log << "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX + ld
- puts( "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX )
- log << "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX + ld
- puts( "Logfile : " + outfile + LOG_FILE_SUFFIX )
- log << "Logfile : " + outfile + LOG_FILE_SUFFIX + ld
+ puts( "Domain : " + domain_id )
+ log << "Domain : " + domain_id + ld
+ puts( "Hmmscan outputfile : " + hmmscan_output )
+ log << "Hmmscan outputfile : " + hmmscan_output + ld
+ puts( "Fasta sequencefile (complete sequences) : " + fasta_sequence_file )
+ log << "Fasta sequencefile (complete sequences) : " + fasta_sequence_file + ld
+ puts( "Outputfile : " + outfile + ".fasta" )
+ log << "Outputfile : " + outfile + ld
+ puts( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX )
+ log << "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX + ld
+ puts( "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX )
+ log << "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX + ld
+ puts( "Logfile : " + outfile + LOG_FILE_SUFFIX )
+ log << "Logfile : " + outfile + LOG_FILE_SUFFIX + ld
if ( e_value_threshold >= 0.0 )
- puts( "E-value threshold : " + e_value_threshold.to_s )
- log << "E-value threshold : " + e_value_threshold.to_s + ld
+ puts( "iE-value threshold : " + e_value_threshold.to_s )
+ log << "iE-value threshold : " + e_value_threshold.to_s + ld
else
- puts( "E-value threshold : no threshold" )
- log << "E-value threshold : no threshold" + ld
+ puts( "iE-value threshold : no threshold" )
+ log << "iE-value threshold : no threshold" + ld
end
if ( length_threshold > 0 )
- puts( "Length threshold : " + length_threshold.to_s )
- log << "Length threshold : " + length_threshold.to_s + ld
+ puts( "Length threshold (env) : " + length_threshold.to_s )
+ log << "Length threshold (env) : " + length_threshold.to_s + ld
else
- puts( "Length threshold : no threshold" )
- log << "Length threshold : no threshold" + ld
+ puts( "Length threshold (env) : no threshold" )
+ log << "Length threshold (env) : no threshold" + ld
end
-
if ( add_position )
- puts( "Add positions (rel to complete seq) to extracted domains: true" )
- log << "Add positions (rel to complete seq) to extracted domains: true" + ld
+ puts( "Add positions (rel to complete seq) to extracted domains : true" )
+ log << "Add positions (rel to complete seq) to extracted domains : true" + ld
else
- puts( "Add positions (rel to complete seq) to extracted domains: false" )
- log << "Add positions (rel to complete seq) to extracted domains: false" + ld
+ puts( "Add positions (rel to complete seq) to extracted domains : false" )
+ log << "Add positions (rel to complete seq) to extracted domains : false" + ld
end
if ( add_domain_number )
end
puts
+ log << ld
domain_count = 0
begin
puts
Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" )
- Util.print_message( PRG_NAME, "wrote; " + outfile + ".fasta")
- Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX )
- Util.print_message( PRG_NAME, "wrote: " + outfile + PASSED_SEQS_SUFFIX )
- Util.print_message( PRG_NAME, "wrote: " + outfile + FAILED_SEQS_SUFFIX )
+ Util.print_message( PRG_NAME, "wrote: " + outfile + ".fasta")
+ Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX )
+ Util.print_message( PRG_NAME, "wrote: " + outfile + PASSED_SEQS_SUFFIX )
+ Util.print_message( PRG_NAME, "wrote: " + outfile + FAILED_SEQS_SUFFIX )
begin
f = File.open( outfile + LOG_FILE_SUFFIX, 'a' )
puts()
puts( "Usage:" )
puts()
- puts( " " + PRG_NAME + ".rb [options] <domain> <hmmscan outputfile> [file containing complete sequences in fasta format] [outputfile]" )
+ puts( " " + PRG_NAME + ".rb [options] <target domain> <hmmscan outputfile> [file containing complete sequences in fasta format] [outputfile]" )
puts()
- puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=<f> : iE-value threshold, default is 0.1" )
- puts( " -" + LENGTH_THRESHOLD_OPTION + "=<i> : length threshold, default is 50" )
+ puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=<f> : iE-value threshold for target domain, default is " + E_VALUE_THRESHOLD_DEFAULT.to_s )
+ puts( " -" + LENGTH_THRESHOLD_OPTION + "=<i> : length threshold target domain (env), default is " + LENGTH_THRESHOLD_DEFAULT.to_s )
puts( " -" + ADD_DOMAIN_NUMBER_OPTION + " : to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" )
puts( " -" + ADD_POSITION_OPTION + " : to add positions (rel to complete seq) to extracted domains" )
puts( " -" + ADD_SPECIES + " : to add species [in brackets]" )
require 'date'
require 'set'
-
require 'lib/evo/util/constants'
require 'lib/evo/util/util'
require 'lib/evo/util/command_line_arguments'
require 'lib/evo/io/writer/msa_writer'
module Evoruby
-
class MsaProcessor
PRG_NAME = "msa_pro"
- PRG_DATE = "131112"
+ PRG_DATE = "170215"
PRG_DESC = "processing of multiple sequence alignments"
PRG_VERSION = "1.08"
- COPYRIGHT = "2008-2010 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
-
+ WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
NAME_LENGTH_DEFAULT = 10
WIDTH_DEFAULT_FASTA = 60
LOG_SUFFIX = "_msa_pro.log"
HELP_OPTION_1 = "help"
HELP_OPTION_2 = "h"
-
-
def initialize()
@input_format_set = false
@output_format_set = false
@last = -1
end
-
def run()
Util.print_program_information( PRG_NAME,
- PRG_VERSION,
- PRG_DESC,
- PRG_DATE,
- COPYRIGHT,
- CONTACT,
- WWW,
- STDOUT )
+ PRG_VERSION,
+ PRG_DESC,
+ PRG_DATE,
+ WWW,
+ STDOUT )
if ( ARGV == nil || ARGV.length < 1 )
Util.print_message( PRG_NAME, "Illegal number of arguments" )
end
if ( cla.is_option_set?( HELP_OPTION_1 ) ||
- cla.is_option_set?( HELP_OPTION_2 ) )
+ cla.is_option_set?( HELP_OPTION_2 ) )
print_help
exit( 0 )
end
disallowed = cla.validate_allowed_options_as_str( allowed_opts )
if ( disallowed.length > 0 )
Util.fatal_error( PRG_NAME,
- "unknown option(s): " + disallowed )
+ "unknown option(s): " + disallowed )
end
input = cla.get_file_name( 0 )
msa = sort( msa )
end
-
-
if ( @split > 0 )
begin
msas = msa.split( @split, true )
if removed.size > 0
identicals = msa.get_identical_seqs_detected
log << "the following " + identicals.size.to_s + " sequences are identical:" + ld
- identicals.each { | s |
- log << s + ld
+ identicals.each { | identical |
+ log << identical + ld
}
log << "ignoring the following " + removed.size.to_s + " redundant sequences:" + ld
removed.each { | seq_name |
Util.fatal_error( PRG_NAME, "error: " + e.to_s )
end
-
end
Util.print_message( PRG_NAME, "OK" )
puts
end
-
private
def sort( msa )
@fasta_input = fi
@input_format_set = true
end
+
def set_phylip_input( pi = true )
@phylip_input = pi
@input_format_set = true
end
+
def set_name_length( i )
@name_length = i
@name_length_set = true
end
+
def set_width( i )
@width = i
end
+
def set_fasta_output( fo = true )
@fasta_output = fo
@output_format_set = true
end
+
def set_pi_output( pso = true )
@pi_output = pso
@output_format_set = true
end
+
def set_nexus_output( nexus = true )
@nexus_output = nexus
@output_format_set = true
end
+
def set_clean( c = true )
@clean = c
end
+
def set_remove_gap_columns( rgc = true )
@rgc = rgc
end
+
def set_remove_gap_only_columns( rgoc = true )
@rgoc = rgoc
end
+
def set_remove_gaps( rg = true )
@rg = rg
end
+
def set_remove_gap_ratio( rgr )
@rgr = rgr
end
+
def set_remove_seqs_gap_ratio( rsgr )
@rsgr = rsgr
end
+
def set_remove_seqs_min_non_gap_length( rsl )
@rsl = rsl
end
+
def set_remove_seqs( file )
@seqs_name_file = file
@remove_seqs = true
@keep_seqs = false
end
+
def set_keep_seqs( file )
@seqs_name_file = file
@keep_seqs = true
@remove_seqs = false
end
+
def set_trim( first, last )
@trim = true
@first = first
@last = last
end
+
def set_remove_matching( remove )
@remove_matching = remove
end
+
def set_keep_matching( keep )
@keep_matching = keep
end
+
def set_rem_red( rr )
@rem_red = rr
end
-
-
def set_split( s )
if ( s > 0 )
@split = s
@die_if_name_too_long = true
end
-
end
def print_help()
puts()
end
-
-
-
-
end # class MsaProcessor
-
end # module Evoruby
DOMAINS_MAPFILE_SUFFIX = '_hmmscan_10.dff'
SLEEP_TIME = 0.05
REMOVE_NI = true
- IDS_ONLY = true #TODO this should be a command line option
+ IDS_ONLY = false #TODO this should be a command line option
FIXED_NIM_FILE = 'all.nim' #TODO this should be a command line option
TMP_FILE_1 = '___PD1___'
TMP_FILE_2 = '___PD2___'
PRG_DATE = "170209"
PRG_DESC = "decoration of phylogenies with sequence/species names and domain architectures"
PRG_VERSION = "1.02"
- COPYRIGHT = "2017 Christian M Zmasek"
- CONTACT = "phyloxml at gmail dot com"
WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
HELP_OPTION_1 = "help"
PRG_VERSION,
PRG_DESC,
PRG_DATE,
- COPYRIGHT,
- CONTACT,
WWW,
STDOUT )
log << 'input suffix : ' + in_suffix + NL
log << 'output suffix : ' + out_suffix + NL
- if ( File.exists?( TMP_FILE_1 ) )
+ if ( File.exist?( TMP_FILE_1 ) )
File.delete( TMP_FILE_1 )
end
- if ( File.exists?( TMP_FILE_2 ) )
+ if ( File.exist?( TMP_FILE_2 ) )
File.delete( TMP_FILE_2 )
end