+++ /dev/null
-#!/usr/local/bin/ruby -w
-#
-# = exe/fae
-#
-# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
-#
-# $Id: fae.rb,v 1.1 2008/09/10 02:16:34 cmzmasek Exp $
-
-
-require 'lib/evo/tools/fasta_extractor'
-
-module Evoruby
-
- mse = FastaExtractor.new()
-
- mse.run()
-
-end # module Evoruby
\ No newline at end of file
--- /dev/null
+#!/usr/local/bin/ruby -w
+#
+# = exe/fastx
+#
+#
+# Copyright:: Copyright (C) 2017 Christian M. Zmasek
+# License:: GNU Lesser General Public License (LGPL)
+
+require 'lib/evo/tool/fasta_extractor'
+
+module Evoruby
+
+ mse = FastaExtractor.new()
+
+ mse.run()
+
+end # module Evoruby
\ No newline at end of file
#
# = exe/hsp
#
-# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
-#
-# $Id: hsp.rb,v 1.1 2009/11/25 05:42:04 cmzmasek Exp $
-#
-# last modified: 11/24/2009
+# Copyright:: Copyright (C) 2017 Christian M. Zmasek
+# License:: GNU Lesser General Public License (LGPL)
require 'lib/evo/tool/hmmscan_analysis'
#
# = exe/hsp
#
-# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
-#
-# $Id: hsp.rb,v 1.1 2009/11/25 05:42:04 cmzmasek Exp $
-#
-# last modified: 11/24/2009
+# Copyright:: Copyright (C) 2017 Christian M. Zmasek
+# License:: GNU Lesser General Public License (LGPL)
require 'lib/evo/tool/hmmscan_summary'
module Evoruby
- hsp = HmmscanSummary.new()
+ hsp = HmmscanSummary.new()
- hsp.run()
+ hsp.run()
-end # module Evoruby
+end # module Evoruby
\ No newline at end of file
#
# = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanDomainExtractor class
#
-# Copyright:: Copyright (C) 2012 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
-#
-# $Id: $
-
+# Copyright:: Copyright (C) 2017 Christian M. Zmasek
+# License:: GNU Lesser General Public License (LGPL)
require 'lib/evo/util/constants'
require 'lib/evo/msa/msa_factory'
require 'lib/evo/io/parser/fasta_parser'
require 'lib/evo/io/parser/hmmscan_parser'
-
-
module Evoruby
-
class HmmscanDomainExtractor
ADD_TO_CLOSE_PAIRS = 0
-
def initialize
end
# raises ArgumentError, IOError, StandardError
def parse( domain_id,
- hmmscan_output,
- fasta_sequence_file,
- outfile,
- passed_seqs_outfile,
- failed_seqs_outfile,
- e_value_threshold,
- length_threshold,
- add_position,
- add_domain_number,
- add_species,
- min_linker,
- log )
+ hmmscan_output,
+ fasta_sequence_file,
+ outfile,
+ passed_seqs_outfile,
+ failed_seqs_outfile,
+ e_value_threshold,
+ length_threshold,
+ add_position,
+ add_domain_number,
+ add_species,
+ min_linker,
+ log )
Util.check_file_for_readability( hmmscan_output )
Util.check_file_for_readability( fasta_sequence_file )
i_e_value = r.i_e_value
if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
- ( ( length_threshold <= 0 ) || ( env_to - env_from + 1 ) >= length_threshold.to_f ) )
+ ( ( length_threshold <= 0 ) || ( env_to - env_from + 1 ) >= length_threshold.to_f ) )
hmmscan_datas << HmmsearchData.new( sequence, number, out_of, env_from, env_to, i_e_value )
if ( number > max_domain_copy_number_per_protein )
max_domain_copy_number_sequence = sequence
if number == out_of
if !hmmscan_datas.empty?
process_hmmscan_datas( hmmscan_datas,
- in_msa,
- add_position,
- add_domain_number,
- add_species,
- out_msa,
- out_msa_singles,
- out_msa_pairs,
- out_msa_isolated,
- min_linker,
- out_msa_single_domains_protein_seqs,
- out_msa_close_pairs_protein_seqs,
- out_msa_close_pairs_only_protein_seqs,
- out_msa_isolated_protein_seqs,
- out_msa_isolated_only_protein_seqs,
- out_msa_isolated_and_close_pair_protein_seqs )
+ in_msa,
+ add_position,
+ add_domain_number,
+ add_species,
+ out_msa,
+ out_msa_singles,
+ out_msa_pairs,
+ out_msa_isolated,
+ min_linker,
+ out_msa_single_domains_protein_seqs,
+ out_msa_close_pairs_protein_seqs,
+ out_msa_close_pairs_only_protein_seqs,
+ out_msa_isolated_protein_seqs,
+ out_msa_isolated_only_protein_seqs,
+ out_msa_isolated_and_close_pair_protein_seqs )
domain_pass_counter += hmmscan_datas.length
if passed_seqs.find_by_name_start( sequence, true ).length < 1
add_sequence( sequence, in_msa, passed_seqs )
write_msa( failed_seqs, failed_seqs_outfile )
if out_msa_pairs
- write_msa( out_msa_pairs, outfile +"_" + min_linker.to_s + ".fasta")
+ write_msa( out_msa_pairs, outfile + "_" + min_linker.to_s + ".fasta")
end
if out_msa_singles
- write_msa( out_msa_singles, outfile +"_singles.fasta")
+ write_msa( out_msa_singles, outfile + "_singles.fasta")
end
if out_msa_isolated
end
if out_msa_single_domains_protein_seqs
- write_msa( out_msa_single_domains_protein_seqs, outfile +"_proteins_with_singles.fasta" )
+ write_msa( out_msa_single_domains_protein_seqs, outfile + "_proteins_with_singles.fasta" )
end
if out_msa_close_pairs_protein_seqs
if min_linker
if ( out_msa_single_domains_protein_seqs.get_number_of_seqs +
- out_msa_close_pairs_only_protein_seqs.get_number_of_seqs +
- out_msa_isolated_only_protein_seqs.get_number_of_seqs +
- out_msa_isolated_and_close_pair_protein_seqs.get_number_of_seqs ) != passed_seqs.get_number_of_seqs
+ out_msa_close_pairs_only_protein_seqs.get_number_of_seqs +
+ out_msa_isolated_only_protein_seqs.get_number_of_seqs +
+ out_msa_isolated_and_close_pair_protein_seqs.get_number_of_seqs ) != passed_seqs.get_number_of_seqs
error_msg = "this should not have happened"
raise StandardError, error_msg
end
end # parse
-
private
def write_msa( msa, filename )
end
end
-
def add_sequence( sequence_name, in_msa, add_to_msa )
seqs = in_msa.find_by_name_start( sequence_name, true )
if ( seqs.length < 1 )
end
def process_hmmscan_datas( hmmscan_datas,
- in_msa,
- add_position,
- add_domain_number,
- add_species,
- out_msa,
- out_msa_singles,
- out_msa_pairs,
- out_msa_isolated,
- min_linker,
- out_msa_single_domains_protein_seqs,
- out_msa_close_pairs_protein_seqs,
- out_msa_close_pairs_only_protein_seqs,
- out_msa_isolated_protein_seqs,
- out_msa_isolated_only_protein_seqs,
- out_msa_isolated_and_close_pair_protein_seqs )
+ in_msa,
+ add_position,
+ add_domain_number,
+ add_species,
+ out_msa,
+ out_msa_singles,
+ out_msa_pairs,
+ out_msa_isolated,
+ min_linker,
+ out_msa_single_domains_protein_seqs,
+ out_msa_close_pairs_protein_seqs,
+ out_msa_close_pairs_only_protein_seqs,
+ out_msa_isolated_protein_seqs,
+ out_msa_isolated_only_protein_seqs,
+ out_msa_isolated_and_close_pair_protein_seqs )
actual_out_of = hmmscan_datas.size
saw_close_pair = false
seq_name = hmmscan_data.seq_name
extract_domain( seq_name,
- index + 1,
- actual_out_of,
- hmmscan_data.env_from,
- hmmscan_data.env_to,
- in_msa,
- out_msa,
- add_position,
- add_domain_number,
- add_species )
+ index + 1,
+ actual_out_of,
+ hmmscan_data.env_from,
+ hmmscan_data.env_to,
+ in_msa,
+ out_msa,
+ add_position,
+ add_domain_number,
+ add_species )
if min_linker
if actual_out_of == 1
extract_domain( seq_name,
- 1,
- 1,
- hmmscan_data.env_from,
- hmmscan_data.env_to,
- in_msa,
- out_msa_singles,
- add_position,
- add_domain_number,
- add_species )
+ 1,
+ 1,
+ hmmscan_data.env_from,
+ hmmscan_data.env_to,
+ in_msa,
+ out_msa_singles,
+ add_position,
+ add_domain_number,
+ add_species )
if out_msa_single_domains_protein_seqs.find_by_name_start( seq_name, true ).length < 1
add_sequence( seq_name, in_msa, out_msa_single_domains_protein_seqs )
else
last = index == hmmscan_datas.length - 1
if ( ( first && ( ( hmmscan_datas[ index + 1 ].env_from - hmmscan_data.env_to ) > min_linker) ) ||
- ( last && ( ( hmmscan_data.env_from - hmmscan_datas[ index - 1 ].env_to ) > min_linker ) ) ||
- ( !first && !last && ( ( hmmscan_datas[ index + 1 ].env_from - hmmscan_data.env_to ) > min_linker ) &&
- ( ( hmmscan_data.env_from - hmmscan_datas[ index - 1 ].env_to ) > min_linker ) ) )
+ ( last && ( ( hmmscan_data.env_from - hmmscan_datas[ index - 1 ].env_to ) > min_linker ) ) ||
+ ( !first && !last && ( ( hmmscan_datas[ index + 1 ].env_from - hmmscan_data.env_to ) > min_linker ) &&
+ ( ( hmmscan_data.env_from - hmmscan_datas[ index - 1 ].env_to ) > min_linker ) ) )
extract_domain( seq_name,
- index + 1,
- actual_out_of,
- hmmscan_data.env_from,
- hmmscan_data.env_to,
- in_msa,
- out_msa_isolated,
- add_position,
- add_domain_number,
- add_species )
+ index + 1,
+ actual_out_of,
+ hmmscan_data.env_from,
+ hmmscan_data.env_to,
+ in_msa,
+ out_msa_isolated,
+ add_position,
+ add_domain_number,
+ add_species )
saw_isolated = true
elsif !first
end
extract_domain( seq_name,
- index.to_s + "+" + ( index + 1 ).to_s,
- actual_out_of,
- from,
- to,
- in_msa,
- out_msa_pairs,
- add_position,
- add_domain_number,
- add_species )
+ index.to_s + "+" + ( index + 1 ).to_s,
+ actual_out_of,
+ from,
+ to,
+ in_msa,
+ out_msa_pairs,
+ add_position,
+ add_domain_number,
+ add_species )
saw_close_pair = true
end
end
end # def process_hmmscan_data
def extract_domain( sequence,
- number,
- out_of,
- seq_from,
- seq_to,
- in_msa,
- out_msa,
- add_position,
- add_domain_number,
- add_species )
+ number,
+ out_of,
+ seq_from,
+ seq_to,
+ in_msa,
+ out_msa,
+ add_position,
+ add_domain_number,
+ add_species )
if number.is_a?( Fixnum ) && ( number < 1 || out_of < 1 || number > out_of )
error_msg = "number=" + number.to_s + ", out of=" + out_of.to_s
raise StandardError, error_msg
class DomainSequenceExtractor
PRG_NAME = "dsx"
- PRG_VERSION = "2.001"
- PRG_DESC = "extraction of domain sequences from hmmscan output"
- PRG_DATE = "20170213"
+ PRG_VERSION = "2.002"
+ PRG_DESC = "Extraction of domain sequences from hmmscan output"
+ PRG_DATE = "20170214"
WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
E_VALUE_THRESHOLD_OPTION = 'e'
ADD_POSITION_OPTION = 'p'
ADD_DOMAIN_NUMBER_OPTION = 'd'
ADD_SPECIES = 's'
- MIN_LINKER_OPT = 'ml'
LOG_FILE_SUFFIX = '_domain_seq_extr.log'
PASSED_SEQS_SUFFIX = '_with_passing_domains.fasta'
FAILED_SEQS_SUFFIX = '_with_no_passing_domains.fasta'
WWW,
STDOUT )
- ld = Constants::LINE_DELIMITER
+ if ( ARGV == nil || ( ARGV.length < 1 ) )
+ print_help
+ exit( -1 )
+ end
begin
cla = CommandLineArguments.new( ARGV )
exit( 0 )
end
- if ( cla.get_number_of_files != 4 )
+ unless ( cla.get_number_of_files == 2 || cla.get_number_of_files == 3 || cla.get_number_of_files == 4 )
print_help
exit( -1 )
end
allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION )
allowed_opts.push( LENGTH_THRESHOLD_OPTION )
allowed_opts.push( ADD_SPECIES )
- allowed_opts.push( MIN_LINKER_OPT )
disallowed = cla.validate_allowed_options_as_str( allowed_opts )
if ( disallowed.length > 0 )
STDOUT )
end
- domain_id = cla.get_file_name( 0 )
- hmmsearch_output = cla.get_file_name( 1 )
- fasta_sequence_file = cla.get_file_name( 2 )
- outfile = cla.get_file_name( 3 )
-
- if outfile.downcase.end_with?( ".fasta" )
- outfile = outfile[ 0 .. outfile.length - 7 ]
- elsif outfile.downcase.end_with?( ".fsa" )
- outfile = outfile[ 0 .. outfile.length - 5 ]
- end
-
- add_position = false
- if ( cla.is_option_set?( ADD_POSITION_OPTION ) )
- add_position = true
- end
-
- add_domain_number = false
- if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) )
- add_domain_number = true
- end
-
- add_species = false
- if cla.is_option_set? ADD_SPECIES
- add_species = true
- end
-
- e_value_threshold = -1.0
+ e_value_threshold = 0.1
if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
begin
e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
rescue ArgumentError => e
- Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
if ( e_value_threshold < 0.0 )
- Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
+ Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
end
end
- length_threshold = -1
+ length_threshold = 50
if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) )
begin
length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION )
rescue ArgumentError => e
- Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
if ( length_threshold < 0)
- Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT )
+ Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT )
end
end
- min_linker = nil
- if ( cla.is_option_set?( MIN_LINKER_OPT ) )
- begin
- min_linker = cla.get_option_value_as_int( MIN_LINKER_OPT )
- rescue ArgumentError => e
- Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ domain_id = cla.get_file_name( 0 )
+ hmmscan_output = cla.get_file_name( 1 )
+ fasta_sequence_file = ""
+ outfile = ""
+
+ if (cla.get_number_of_files == 4)
+ fasta_sequence_file = cla.get_file_name( 2 )
+ outfile = cla.get_file_name( 3 )
+ elsif (cla.get_number_of_files == 2 || cla.get_number_of_files == 3)
+ if (cla.get_number_of_files == 3)
+ fasta_sequence_file = cla.get_file_name( 2 )
+ else
+ hmmscan_index = hmmscan_output.index(Constants::HMMSCAN)
+ if ( hmmscan_index != nil )
+ prefix = hmmscan_output[0 .. hmmscan_index-1 ]
+ suffix = Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX
+ files = Dir.entries( "." )
+ matching_files = Util.get_matching_files( files, prefix, suffix)
+ if matching_files.length < 1
+ Util.fatal_error( PRG_NAME, 'no file matching [' + prefix +
+ '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
+ end
+ if matching_files.length > 1
+ Util.fatal_error( PRG_NAME, 'more than one file matching [' +
+ prefix + '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
+ end
+ fasta_sequence_file = matching_files[ 0 ]
+ else
+ Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' )
+ end
end
- if ( !min_linker || min_linker > 100 || min_linker < -100 )
- Forester::Util.fatal_error( PRG_NAME, "unexpected value for min linker " + min_linker.to_s, STDOUT )
+ hmmscan_index = hmmscan_output.index(Constants::HMMSCAN)
+ if ( hmmscan_index != nil )
+ outfile = hmmscan_output.sub(Constants::HMMSCAN, "_") + "_" + domain_id
+ e = e_value_threshold >= 1 ? e_value_threshold.to_i : e_value_threshold.to_s.sub!('.','_')
+ outfile += "_E" + e.to_s
+ outfile += "_L" + length_threshold.to_i.to_s
+ else
+ Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' )
end
end
+ if outfile.downcase.end_with?( ".fasta" )
+ outfile = outfile[ 0 .. outfile.length - 7 ]
+ elsif outfile.downcase.end_with?( ".fsa" )
+ outfile = outfile[ 0 .. outfile.length - 5 ]
+ end
+
+ add_position = false
+ if ( cla.is_option_set?( ADD_POSITION_OPTION ) )
+ add_position = true
+ end
+
+ add_domain_number = false
+ if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) )
+ add_domain_number = true
+ end
+
+ add_species = false
+ if cla.is_option_set?( ADD_SPECIES )
+ add_species = true
+ end
+
log = String.new
+ ld = Constants::LINE_DELIMITER
puts()
puts( "Domain : " + domain_id )
log << "Domain : " + domain_id + ld
- puts( "Hmmscan outputfile : " + hmmsearch_output )
- log << "Hmmscan outputfile : " + hmmsearch_output + ld
+ puts( "Hmmscan outputfile : " + hmmscan_output )
+ log << "Hmmscan outputfile : " + hmmscan_output + ld
puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file )
log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld
puts( "Outputfile : " + outfile + ".fasta" )
puts( "Length threshold : no threshold" )
log << "Length threshold : no threshold" + ld
end
- if ( min_linker )
- puts( "Min linker : " + min_linker.to_s )
- log << "Min linker : " + min_linker.to_s + ld
-
- end
if ( add_position )
puts( "Add positions (rel to complete seq) to extracted domains: true" )
begin
parser = HmmscanDomainExtractor.new()
domain_count = parser.parse( domain_id,
- hmmsearch_output,
+ hmmscan_output,
fasta_sequence_file,
outfile,
outfile + PASSED_SEQS_SUFFIX,
add_position,
add_domain_number,
add_species,
- min_linker,
+ false,
log )
rescue ArgumentError, IOError => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
puts()
puts( "Usage:" )
puts()
- puts( " " + PRG_NAME + ".rb [options] <domain> <hmmscan outputfile> <file containing complete sequences in fasta format> <outputfile>" )
+ puts( " " + PRG_NAME + ".rb [options] <domain> <hmmscan outputfile> [file containing complete sequences in fasta format] [outputfile]" )
+ puts()
+ puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=<f> : iE-value threshold, default is 0.1" )
+ puts( " -" + LENGTH_THRESHOLD_OPTION + "=<i> : length threshold, default is 50" )
+ puts( " -" + ADD_DOMAIN_NUMBER_OPTION + " : to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" )
+ puts( " -" + ADD_POSITION_OPTION + " : to add positions (rel to complete seq) to extracted domains" )
+ puts( " -" + ADD_SPECIES + " : to add species [in brackets]" )
puts()
- puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=<f>: iE-value threshold, default is no threshold" )
- puts( " -" + LENGTH_THRESHOLD_OPTION + "=<i>: length threshold, default is no threshold" )
- puts( " -" + ADD_POSITION_OPTION + ": to add positions (rel to complete seq) to extracted domains" )
- puts( " -" + ADD_DOMAIN_NUMBER_OPTION + ": to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" )
- puts( " -" + ADD_SPECIES + ": to add species [in brackets]" )
- puts( " -" + MIN_LINKER_OPT + "=<i>: to extract pairs of same domains with a distance inbetween shorter than a given value" )
+ puts( "Examples:" )
+ puts
+ puts( " " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 P53_ni.fasta P53_#{Constants::PFAM_V_FOR_EX}_10_Pkinase_E1_0e-06_L50" )
+ puts
+ puts( " " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 P53_ni.fasta" )
+ puts
+ puts( " " + PRG_NAME + ".rb -d -e=1e-6 -l=50 Pkinase P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10" )
puts()
end
if ( cla.get_number_of_files == 2 )
original_sequences_file = cla.get_file_name( 1 )
else
- hmmscan_index = domains_list_file.index("hmmscan")
+ hmmscan_index = domains_list_file.index(Constants::HMMSCAN)
if ( hmmscan_index != nil )
prefix = domains_list_file[0 .. hmmscan_index-1 ]
suffix = Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX
prefix + '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
end
original_sequences_file = matching_files[ 0 ]
+ else
+ Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' )
end
+
end
outfile = domains_list_file
if (outfile.end_with?(Constants::DOMAIN_TABLE_SUFFIX) )
outfile = outfile.chomp(Constants::DOMAIN_TABLE_SUFFIX)
end
if ( e_value_threshold >= 0.0 )
- outfile = outfile + Constants::DOMAINS_TO_FORESTER_EVALUE_CUTOFF_SUFFIX + e_value_threshold.to_s
+ e = e_value_threshold >= 1 ? e_value_threshold.to_i : e_value_threshold.to_s.sub!('.','_')
+ outfile = outfile + Constants::DOMAINS_TO_FORESTER_EVALUE_CUTOFF_SUFFIX + e.to_s
end
outfile = outfile + Constants::DOMAINS_TO_FORESTER_OUTFILE_SUFFIX
end
puts
Util.print_message( PRG_NAME, "wrote: " + outfile )
- Util.print_message( PRG_NAME, "next steps in standard analysis pipeline: hmmsearch followed by dsx.rb")
+ Util.print_message( PRG_NAME, "next step in standard analysis pipeline: dsx.rb")
Util.print_message( PRG_NAME, 'OK' )
puts
puts
puts( " " + PRG_NAME + ".rb [options] <domain table (parsed hmmpfam output)> [file containing complete sequences in fasta format] [outputfile]" )
puts()
- puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=<f> : E-value threshold, default is no threshold" )
- puts( " -" + OVERWRITE_IF_SAME_FROM_TO_OPTION + " : overwrite domain with same start and end with domain with better E-value" )
+ puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=<f>: E-value threshold, default is no threshold" )
+ puts( " -" + OVERWRITE_IF_SAME_FROM_TO_OPTION + " : overwrite domain with same start and end with domain with better E-value" )
puts
+ puts( " [next step in standard analysis pipeline: dsx.rb]")
+ puts()
puts( "Examples:" )
puts
- puts( " " + PRG_NAME + ".rb P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table P53_ni.fasta P53_hmmscan_300_10.dff" )
+ puts( " " + PRG_NAME + ".rb -o P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table P53_ni.fasta P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10.dff" )
puts
- puts( " " + PRG_NAME + ".rb P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table P53_ni.fasta" )
+ puts( " " + PRG_NAME + ".rb -o P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table P53_ni.fasta" )
puts
- puts( " " + PRG_NAME + ".rb P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table" )
+ puts( " " + PRG_NAME + ".rb -o P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table" )
puts()
end
#
# = lib/evo/apps/fasta_extractor.rb - FastaExtractor class
#
-# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
-#
-# $Id: fasta_extractor.rb,v 1.2 2010/12/13 19:00:11 cmzmasek Exp $
-
+# Copyright:: Copyright (C) 2017 Christian M. Zmasek
+# License:: GNU Lesser General Public License (LGPL)
-require 'lib/evo/util/util'
require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
require 'lib/evo/util/command_line_arguments'
-
module Evoruby
-
- class FastaExtractor
-
- PRG_NAME = "fae"
- PRG_VERSION = "1.0.0"
- PRG_DESC = "extraction of nucleotide sequences from a fasta file by names from wublast search"
- PRG_DATE = "2008.08.09"
- COPYRIGHT = "2008-2009 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
- HELP_OPTION_1 = 'help'
- HELP_OPTION_2 = 'h'
-
-
- def run()
-
- Util.print_program_information( PRG_NAME,
- PRG_VERSION,
- PRG_DESC ,
- PRG_DATE,
- COPYRIGHT,
- CONTACT,
- WWW,
- STDOUT )
-
- ld = Constants::LINE_DELIMITER
-
- begin
- cla = CommandLineArguments.new( ARGV )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
-
- if ( cla.is_option_set?( HELP_OPTION_1 ) ||
- cla.is_option_set?( HELP_OPTION_2 ) )
- print_help
- exit( 0 )
- end
-
- if ( cla.get_number_of_files != 3 )
- print_help
- exit( -1 )
- end
-
- allowed_opts = Array.new
-
- disallowed = cla.validate_allowed_options_as_str( allowed_opts )
- if ( disallowed.length > 0 )
- Util.fatal_error( PRG_NAME,
- "unknown option(s): " + disallowed,
- STDOUT )
+ class FastaExtractor
+
+ PRG_NAME = "fastx"
+ PRG_VERSION = "1.000"
+ PRG_DESC = "extraction of molecular sequences from a fasta file"
+ PRG_DATE = "170215"
+ WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
+
+ HELP_OPTION_1 = 'help'
+ HELP_OPTION_2 = 'h'
+ def run()
+
+ Util.print_program_information( PRG_NAME,
+ PRG_VERSION,
+ PRG_DESC ,
+ PRG_DATE,
+ WWW,
+ STDOUT )
+
+ if ( ARGV == nil || ( ARGV.length < 1 ) )
+ print_help
+ exit( -1 )
+ end
+
+ begin
+ cla = CommandLineArguments.new( ARGV )
+ rescue ArgumentError => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+ end
+
+ if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+ cla.is_option_set?( HELP_OPTION_2 ) )
+ print_help
+ exit( 0 )
+ end
+
+ if ( cla.get_number_of_files != 3 )
+ print_help
+ exit( -1 )
+ end
+
+ allowed_opts = Array.new
+
+ disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+ if ( disallowed.length > 0 )
+ Util.fatal_error( PRG_NAME,
+ "unknown option(s): " + disallowed,
+ STDOUT )
+ end
+
+ input_file = cla.get_file_name( 0 )
+ query = cla.get_file_name( 1 )
+ output_file = cla.get_file_name( 2 )
+
+ if !File.exist?( input_file )
+ Util.fatal_error( PRG_NAME, "error: input file [#{input_file}] does not exist" )
+ end
+ if File.exist?( output_file )
+ Util.fatal_error( PRG_NAME, "error: [#{output_file}] already exists" )
+ end
+
+ results = extract_sequences( query, input_file, output_file )
+
+ Util.print_message( PRG_NAME, "matched: " + results )
+ Util.print_message( PRG_NAME, "wrote: " + output_file )
+ Util.print_message( PRG_NAME, "OK" )
+
+ end
+
+ def extract_sequences( query, fasta_file, output_file )
+ output = File.open( output_file, "a" )
+ matching_state = false
+ matches = 0
+ total = 0
+ File.open( fasta_file ) do | file |
+ while line = file.gets
+ if !Util.is_string_empty?( line )
+ if line =~ /^\s*>/
+ total += 1
+ if total % 10000 == 0
+ STDOUT.write "\r#{matches}/#{total}"
+ STDOUT.flush
+ end
+ if line =~ /#{query}/
+ matching_state = true
+ matches += 1
+ output.print( line )
+ else
+ matching_state = false
+ end
+ elsif matching_state
+ output.print( line )
end
-
- input_file = cla.get_file_name( 0 )
- names_file = cla.get_file_name( 1 )
- output_file = cla.get_file_name( 2 )
-
- if !File.exist?( input_file )
- Util.fatal_error( PRG_NAME, "error: input file [#{input_file}] does not exist" )
- end
- if !File.exist?( names_file )
- Util.fatal_error( PRG_NAME, "error: names file [#{names_file}] does not exist" )
- end
- if File.exist?( output_file )
- Util.fatal_error( PRG_NAME, "error: [#{output_file }] already exists" )
- end
-
- names = extract_names_with_frames( names_file )
-
- extract_sequences( names, input_file, output_file )
-
- puts
- Util.print_message( PRG_NAME, "OK" )
- puts
-
+ end
end
-
-
- def extract_names_with_frames( names_file )
- names = Hash.new()
- File.open( names_file ) do | file |
- while line = file.gets
- if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
- if ( line =~ /(\S+)\s+([+|-]\d)\s+\d+\s+(\S+)/ )
- name = $1
- frame = $2
- e = $3
- names[ name ] = "[" + frame + "] [" + e + "]"
- end
- end
- end
- end
- names
- end
-
- def extract_sequences( names, fasta_file, output_file )
- output = File.open( output_file, "a" )
- matching_state = false
- counter = 0
- File.open( fasta_file ) do | file |
- while line = file.gets
- if !Util.is_string_empty?( line )
- if ( line =~ /\s*>\s*(.+)/ )
- name = $1
- if names.has_key?( name )
- matching_state = true
- counter += 1
- puts counter.to_s + ". " +name + " " + names[ name ]
- output.print( ">" + name + " " + names[ name ] )
- output.print( Evoruby::Constants::LINE_DELIMITER )
- else
- matching_state = false
- end
- elsif matching_state
- output.print( line )
- end
- end
- end
- end
- output.close()
- end
-
- def print_help()
- puts( "Usage:" )
- puts()
- puts( " " + PRG_NAME + ".rb <input fasta file> <names file based on blast output> <output file>" )
- puts()
- end
-
- end # class FastaExtractor
+ end
+ output.close()
+ matches.to_s + "/" + total.to_s
+ end
+
+ def print_help()
+ puts( "Usage:" )
+ puts()
+ puts( " " + PRG_NAME + ".rb <input fasta file> <query> <output file>" )
+ puts()
+ puts( "Examples:" )
+ puts
+ puts( " " + PRG_NAME + ".rb Pfam-A.fasta kinase kinases" )
+ puts()
+ end
+
+ end # class FastaExtractor
end
\ No newline at end of file
#
# = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
#
-# Copyright:: Copyright (C) 2012 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
-#
-# $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
-#
+# Copyright:: Copyright (C) 2017 Christian M. Zmasek
+# License:: GNU Lesser General Public License (LGPL)
require 'lib/evo/util/constants'
require 'lib/evo/util/util'
end
puts()
- puts( "hmmpfam outputfile : " + inpath )
+ puts( "hmmscan outputfile : " + inpath )
puts( "outputfile : " + outpath )
if ( i_e_value_threshold >= 0.0 )
puts( " -" + FS_E_VALUE_THRESHOLD_OPTION + "=<f>: E-value threshold for full protein sequences, only for protein architectures summary" )
puts( " -" + SPECIES_OPTION + "=<s> : species for protein architectures summary" )
puts()
- puts( "Example:" )
+ puts( " [next step in standard analysis pipeline: d2f.rb]")
puts()
- puts( " " + "hmmscan --nobias --domtblout P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 -E 10 Pfam-A.hmm P53_ni.fasta" )
+ puts( "Examples:" )
+ puts()
+ puts( " " + "hmmscan --max --domtblout P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 -E 10 Pfam-A.hmm P53_ni.fasta" )
puts()
puts( " " + PRG_NAME + ".rb P53_hmmscan_300_10" )
puts()
class TaxonomyProcessor
PRG_NAME = "tap"
- PRG_DATE = "170213"
+ PRG_DATE = "170214"
PRG_DESC = "Replacement of labels in multiple sequence files"
PRG_VERSION = "2.004"
WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
puts( " options: -" + EXTRACT_TAXONOMY_OPTION + " : to extract taxonomy information from bracketed expressions" )
puts( " -" + ANNOTATION_OPTION + "=<s>: to add an annotation to all entries" )
puts()
+ puts( " [next steps in standard analysis pipeline: hmmscan followed by hsp.rb]")
+ puts()
puts( "Example:" )
puts()
puts( " " + PRG_NAME + ".rb P53.fasta" )
ID_NORMALIZED_FASTA_FILE_SUFFIX = "_ni.fasta"
ID_MAP_FILE_SUFFIX = ".nim"
DOMAIN_TABLE_SUFFIX = "_domain_table"
+ HMMSCAN = "_hmmscan_"
DOMAINS_TO_FORESTER_OUTFILE_SUFFIX = ".dff"
DOMAINS_TO_FORESTER_EVALUE_CUTOFF_SUFFIX = "_dtfE"
#!/usr/local/bin/ruby -w
#
-# = hmm_split
-#
-# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
-#
-# $Id: hmm_split.rb,v 1.5 2008/11/17 22:32:43 cmzmasek Exp $
-#
-# To split a Pfam HMM file into one file for each HMM.
+# = hmm_split
#
-
+# Copyright:: Copyright (C) 2017 Christian M Zmasek
+# License:: GNU Lesser General Public License (LGPL)
module ForesterScripts
-
- if RUBY_VERSION !~ /1.9/
- puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " )
- exit( -1 )
- end
-
-
- if ( ARGV == nil || ARGV.length != 3 )
- puts( "usage: hmm_split.rb <Pfam HMM file> <outfile suffix> <outdir>" )
- exit( -1 )
- end
-
- hmmfile = ARGV[ 0 ]
- suffix = ARGV[ 1 ]
- outdir = ARGV[ 2 ]
-
- if ( !File.exists?( outdir ) )
- puts( "outdir [" + outdir + "] does not exist" )
- exit( -1 )
- end
- if ( !File.exists?( hmmfile ) )
- puts( "Pfam HMM file [" + hmmfile + "] does not exist" )
- exit( -1 )
- end
-
- data = String.new
- name = String.new
- line_count = 0
- count = 0
-
- File.open( hmmfile ) do | file |
- while line = file.gets
- data = data + line
- line_count += 1
- if ( line =~ /NAME\s+(.+)/ )
- if name.length > 0
- puts( "Pfam HMM file [" + hmmfile + "] format error [line: " + line + "]" )
- exit( -1 )
- end
- name = $1
- elsif ( line =~ /\/\// )
- if name.length < 1
- puts( "Pfam HMM file [" + hmmfile + "] format error [line: " + line + "]" )
- exit( -1 )
- end
-
- outfile = outdir + '/' + name + suffix
- if ( File.exists?( outfile ) )
- puts( "file [" + outfile + "] already exists" )
- exit( -1 )
- end
- open( outfile, 'w' ) do | out |
- out.write( data )
- end
- count += 1
- puts( count.to_s + ": " + name )
- data = String.new
- name = String.new
- end
- end
- end
-
- puts()
- puts( "wrote " + count.to_s + " individual HMM files to " + outdir )
-
+
+ if ( ARGV == nil || ARGV.length != 3 )
+ puts( "usage: hmm_split.rb <Pfam HMM file> <outfile suffix> <outdir>" )
+ exit( -1 )
+ end
+
+ hmmfile = ARGV[ 0 ]
+ suffix = ARGV[ 1 ]
+ outdir = ARGV[ 2 ]
+
+ if ( !File.exist?( outdir ) )
+ puts( "outdir [" + outdir + "] does not exist" )
+ exit( -1 )
+ end
+ if ( !File.exist?( hmmfile ) )
+ puts( "Pfam HMM file [" + hmmfile + "] does not exist" )
+ exit( -1 )
+ end
+
+ data = String.new
+ name = String.new
+ line_count = 0
+ count = 0
+
+ File.open( hmmfile ) do | file |
+ while line = file.gets
+ data = data + line
+ line_count += 1
+ if ( line =~ /NAME\s+(.+)/ )
+ if name.length > 0
+ puts( "Pfam HMM file [" + hmmfile + "] format error [line: " + line + "]" )
+ exit( -1 )
+ end
+ name = $1
+ elsif ( line =~ /\/\// )
+ if name.length < 1
+ puts( "Pfam HMM file [" + hmmfile + "] format error [line: " + line + "]" )
+ exit( -1 )
+ end
+
+ outfile = outdir + '/' + name + suffix
+ if ( File.exist?( outfile ) )
+ puts( "file [" + outfile + "] already exists" )
+ exit( -1 )
+ end
+ open( outfile, 'w' ) do | out |
+ out.write( data )
+ end
+ count += 1
+ puts( count.to_s + ": " + name )
+ data = String.new
+ name = String.new
+ end
+ end
+ end
+
+ puts()
+ puts( "wrote " + count.to_s + " individual HMM files to " + outdir )
+
end
\ No newline at end of file