--- /dev/null
+#!/usr/local/bin/ruby -w
+#
+# = exe/dsx
+#
+# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
+# License:: GNU Lesser General Public License (LGPL)
+#
+# $Id: dsx.rb,v 1.3 2008/08/28 17:09:06 cmzmasek Exp $
+#
+# last modified: 06/11/2007
+
+require 'lib/evo/apps/domain_sequence_extractor_new'
+
+module Evoruby
+
+ dsx = DomainSequenceExtractor.new()
+
+ dsx.run()
+
+end # module Evoruby
end
end
w = FastaWriter.new
- w.write(unique_genes_msa, "uniques.fasta")
- w.write(longest_non_unique_genes_msa, "non_uniques_longest.fasta")
+ w.write(unique_genes_msa, "seqs_from_unique_genes.fasta")
+ w.write(longest_non_unique_genes_msa, "longest_seqs_from_nonunique_genes.fasta")
end
--- /dev/null
+#
+# = lib/evo/apps/domain_sequence_extractor.rb - DomainSequenceExtractor class
+#
+# Copyright:: Copyright (C) 2012 Christian M. Zmasek
+# License:: GNU Lesser General Public License (LGPL)
+#
+# $Id:Exp $
+
+
+require 'lib/evo/util/constants'
+require 'lib/evo/util/util'
+require 'lib/evo/util/command_line_arguments'
+require 'lib/evo/io/parser/hmmscan_domain_extractor'
+
+module Evoruby
+
+ class DomainSequenceExtractor
+
+ PRG_NAME = "dsx"
+ PRG_VERSION = "1.000"
+ PRG_DESC = "extraction of domain sequences from hmmscan output"
+ PRG_DATE = "20120906"
+ COPYRIGHT = "2012 Christian M Zmasek"
+ CONTACT = "phylosoft@gmail.com"
+ WWW = "www.phylosoft.org"
+
+ E_VALUE_THRESHOLD_OPTION = 'e'
+ LENGTH_THRESHOLD_OPTION = 'l'
+ ADD_POSITION_OPTION = 'p'
+ ADD_DOMAIN_NUMBER_OPTION = 'd'
+ ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT = 'dd'
+ ADD_DOMAIN_NUMBER_OPTION_AS_LETTER = 'dl'
+ TRIM_OPTION = 't'
+ LOG_FILE_SUFFIX = '_domain_seq_extr.log'
+ PASSED_SEQS_SUFFIX = '_domain_seq_extr_passed'
+ FAILED_SEQS_SUFFIX = '_domain_seq_extr_failed'
+ HELP_OPTION_1 = 'help'
+ HELP_OPTION_2 = 'h'
+
+ def run()
+
+ Util.print_program_information( PRG_NAME,
+ PRG_VERSION,
+ PRG_DESC ,
+ PRG_DATE,
+ COPYRIGHT,
+ CONTACT,
+ WWW,
+ STDOUT )
+
+ ld = Constants::LINE_DELIMITER
+
+ begin
+ cla = CommandLineArguments.new( ARGV )
+ rescue ArgumentError
+ Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT )
+ end
+
+ if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+ cla.is_option_set?( HELP_OPTION_2 ) )
+ print_help
+ exit( 0 )
+ end
+
+ if ( cla.get_number_of_files != 4 )
+ print_help
+ exit( -1 )
+ end
+
+ allowed_opts = Array.new
+ allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
+ allowed_opts.push( ADD_POSITION_OPTION )
+ allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION )
+ allowed_opts.push( LENGTH_THRESHOLD_OPTION )
+ allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT )
+ allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER )
+ allowed_opts.push( TRIM_OPTION )
+
+ disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+ if ( disallowed.length > 0 )
+ Util.fatal_error( PRG_NAME,
+ "unknown option(s): " + disallowed,
+ STDOUT )
+ end
+
+ domain_id = cla.get_file_name( 0 )
+ hmmsearch_output = cla.get_file_name( 1 )
+ fasta_sequence_file = cla.get_file_name( 2 )
+ outfile = cla.get_file_name( 3 )
+
+ add_position = false
+ if ( cla.is_option_set?( ADD_POSITION_OPTION ) )
+ add_position = true
+ end
+
+ trim = false
+ if ( cla.is_option_set?( TRIM_OPTION ) )
+ trim = true
+ end
+
+ add_domain_number = false
+ add_domain_number_as_letter = false
+ add_domain_number_as_digit = false
+
+ if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) )
+ add_domain_number = true
+ end
+ if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER ) )
+ add_domain_number_as_letter = true
+ end
+ if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT ) )
+ add_domain_number_as_digit = true
+ end
+
+ if ( add_domain_number_as_letter && add_domain_number_as_digit )
+ puts( "attempt to add domain number as letter and digit at the same time" )
+ print_help
+ exit( -1 )
+ end
+
+ e_value_threshold = -1.0
+ if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
+ begin
+ e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
+ rescue ArgumentError => e
+ Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ end
+ if ( e_value_threshold < 0.0 )
+ Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
+ end
+ end
+
+ length_threshold = -1
+ if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) )
+ begin
+ length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION )
+ rescue ArgumentError => e
+ Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ end
+ if ( length_threshold < 0)
+ Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT )
+ end
+ end
+
+ log = String.new
+
+ puts()
+ puts( "Domain : " + domain_id )
+ log << "Domain : " + domain_id + ld
+ puts( "Hmmscan outputfile : " + hmmsearch_output )
+ log << "Hmmscan outputfile : " + hmmsearch_output + ld
+ puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file )
+ log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld
+ puts( "Outputfile : " + outfile )
+ log << "Outputfile : " + outfile + ld
+ puts( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX )
+ log << "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX + ld
+ puts( "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX )
+ log << "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX + ld
+ puts( "Logfile : " + outfile + LOG_FILE_SUFFIX )
+ log << "Logfile : " + outfile + LOG_FILE_SUFFIX + ld
+ if ( e_value_threshold >= 0.0 )
+ puts( "E-value threshold : " + e_value_threshold.to_s )
+ log << "E-value threshold : " + e_value_threshold.to_s + ld
+ else
+ puts( "E-value threshold : no threshold" )
+ log << "E-value threshold : no threshold" + ld
+ end
+ if ( length_threshold > 0 )
+ puts( "Length threshold : " + length_threshold.to_s )
+ log << "Length threshold : " + length_threshold.to_s + ld
+ else
+ puts( "Length threshold : no threshold" )
+ log << "Length threshold : no threshold" + ld
+ end
+
+ if ( trim )
+ puts( "Trim last 2 chars : true" )
+ log << "Trim last 2 chars : true" + ld
+ else
+ puts( "Trim names : false" )
+ log << "Trim names : false" + ld
+ end
+
+
+ if ( add_position )
+ puts( "Add positions (rel to complete seq) to extracted domains: true" )
+ log << "Add positions (rel to complete seq) to extracted domains: true" + ld
+ else
+ puts( "Add positions (rel to complete seq) to extracted domains: false" )
+ log << "Add positions (rel to complete seq) to extracted domains: false" + ld
+ end
+
+ if ( add_domain_number || add_domain_number_as_digit || add_domain_number_as_letter )
+ puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" )
+ log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld
+ else
+ puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" )
+ log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld
+ end
+
+ puts
+
+ domain_count = 0
+ begin
+ parser = HmmscanDomainExtractor.new()
+ domain_count = parser.parse( domain_id,
+ hmmsearch_output,
+ fasta_sequence_file,
+ outfile,
+ outfile + PASSED_SEQS_SUFFIX,
+ outfile + FAILED_SEQS_SUFFIX,
+ e_value_threshold,
+ length_threshold,
+ add_position,
+ add_domain_number,
+ add_domain_number_as_digit,
+ add_domain_number_as_letter,
+ trim,
+ log )
+ rescue ArgumentError, IOError, StandardError => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ rescue Exception => e
+ Util.fatal_error( PRG_NAME, "unexpected exception!: " + e.to_s, STDOUT )
+ end
+
+ puts
+ Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" )
+ Util.print_message( PRG_NAME, "wrote; " + outfile )
+ Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX )
+ Util.print_message( PRG_NAME, "(wrote: " + outfile + PASSED_SEQS_SUFFIX + ")" )
+ Util.print_message( PRG_NAME, "(wrote: " + outfile + FAILED_SEQS_SUFFIX + ")" )
+
+ begin
+ f = File.open( outfile + LOG_FILE_SUFFIX, 'a' )
+ f.print( log )
+ f.close
+ rescue Exception => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+ end
+
+ puts
+ Util.print_message( PRG_NAME, "OK" )
+ puts
+
+ end
+
+ def print_help()
+ puts()
+ puts( "Usage:" )
+ puts()
+ puts( " " + PRG_NAME + ".rb [options] <domain> <hmmscan outputfile> <file containing complete sequences in fasta format> <outputfile>" )
+ puts()
+ puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=<f>: E-value threshold, default is no threshold" )
+ puts( " -" + LENGTH_THRESHOLD_OPTION + "=<i>: length threshold, default is no threshold" )
+ puts( " -" + ADD_POSITION_OPTION + ": to add positions (rel to complete seq) to extracted domains" )
+ puts( " -" + ADD_DOMAIN_NUMBER_OPTION + ": to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" )
+ puts( " -" + ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT + ": to add numbers to extracted domains as digit (example \"domain2\")" )
+ puts( " -" + ADD_DOMAIN_NUMBER_OPTION_AS_LETTER + ": to add numbers to extracted domains as letter (example \"domaina\")" )
+ puts( " -" + TRIM_OPTION + ": to remove the last 2 characters from sequence names" )
+ puts()
+ end
+
+ end # class DomainSequenceExtractor
+
+end # module Evoruby
# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
# License:: GNU Lesser General Public License (LGPL)
#
-# $Id: domains_to_forester.rb,v 1.11 2010/12/13 19:00:11 cmzmasek Exp $
+# $Id: Exp $
#
# last modified: 06/11/2007
module Evoruby
- class DomainsToForester
-
- PRG_NAME = "d2f"
- PRG_DESC = "parsed hmmpfam output to forester format"
- PRG_VERSION = "1.0.0"
- PRG_DATE = "2007.12.18"
- COPYRIGHT = "2007 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
-
- E_VALUE_THRESHOLD_OPTION = "e"
- OVERWRITE_IF_SAME_FROM_TO_OPTION = "o"
- HELP_OPTION_1 = "help"
- HELP_OPTION_2 = "h"
-
- def parse( domains_list_file,
- original_seqs_file,
- outfile,
- column_delimiter,
- e_value_threshold,
- overwrite_if_same_from_to )
- Util.check_file_for_readability( domains_list_file )
- Util.check_file_for_readability( original_seqs_file )
- Util.check_file_for_writability( outfile )
-
- domain_structures = Hash.new() # protein name is key, domain structure is value
-
- f = MsaFactory.new
-
- original_seqs = f.create_msa_from_file( original_seqs_file, FastaParser.new )
- if ( original_seqs.get_number_of_seqs < 1 )
- error_msg = "\"" + original_seqs_file + "\" appears devoid of sequences in fasta-format"
- raise ArgumentError, error_msg
+ class DomainsToForester
+
+ PRG_NAME = "d2f"
+ PRG_DESC = "parsed hmmpfam output to forester format"
+ PRG_VERSION = "1.001"
+ PRG_DATE = "20120807"
+ COPYRIGHT = "2012 Christian M Zmasek"
+ CONTACT = "phylosoft@gmail.com"
+ WWW = "www.phylosoft.org"
+
+ E_VALUE_THRESHOLD_OPTION = "e"
+ OVERWRITE_IF_SAME_FROM_TO_OPTION = "o"
+ HELP_OPTION_1 = "help"
+ HELP_OPTION_2 = "h"
+
+ def parse( domains_list_file,
+ original_seqs_file,
+ outfile,
+ column_delimiter,
+ e_value_threshold,
+ overwrite_if_same_from_to )
+ Util.check_file_for_readability( domains_list_file )
+ Util.check_file_for_readability( original_seqs_file )
+ Util.check_file_for_writability( outfile )
+
+ domain_structures = Hash.new() # protein name is key, domain structure is value
+
+ f = MsaFactory.new
+
+ original_seqs = f.create_msa_from_file( original_seqs_file, FastaParser.new )
+ if ( original_seqs.get_number_of_seqs < 1 )
+ error_msg = "\"" + original_seqs_file + "\" appears devoid of sequences in fasta-format"
+ raise ArgumentError, error_msg
+ end
+
+ File.open( domains_list_file ) do | file |
+ while line = file.gets
+ if !is_ignorable?( line )
+ a = line.split( column_delimiter )
+ l = a.length
+ if ( ( l < 4 ) || ( e_value_threshold >= 0.0 && l < 5 ) )
+ error_msg = "unexpected format at line: " + line
+ raise IOError, error_msg
end
-
- File.open( domains_list_file ) do | file |
- while line = file.gets
- if ( !is_ignorable?( line ) )
- a = line.split( column_delimiter )
- l = a.length
- if ( ( l < 4 ) || ( e_value_threshold >= 0.0 && l < 5 ) )
- error_msg = "unexpected format at line: " + line
- raise IOError, error_msg
- end
- protein_name = a[ 0 ]
- domain_name = a[ 1 ]
- seq_from = -1
- seq_to = -1
- begin
- seq_from = a[ 2 ].to_i
- rescue Exception
- error_msg = "failed to parse seq from from \"" + a[ 2 ] + "\" [line: " + line + "]"
- raise IOError, error_msg
- end
- begin
- seq_to = a[ 3 ].to_i
- rescue Exception
- error_msg = "failed to parse seq to from \"" + a[ 3 ] + "\" [line: " + line + "]"
- raise IOError, error_msg
- end
-
- e_value = -1
- if ( l > 4 )
- begin
- e_value = a[ 4 ].to_f
- rescue Exception
- error_msg = "failed to parse E-value from \"" + a[ 4 ] + "\" [line: " + line + "]"
- raise IOError, error_msg
- end
- end
-
- seq = original_seqs.get_by_name( protein_name, true, false )
-
- total_length = seq.get_length
-
- if ( ( ( e_value_threshold < 0.0 ) || ( e_value <= e_value_threshold ) ) )
- pd = ProteinDomain.new( domain_name, seq_from, seq_to, "", e_value )
- ds = nil
- if ( domain_structures.has_key?( protein_name ) )
- ds = domain_structures[ protein_name ]
- else
- ds = DomainStructure.new( total_length )
- domain_structures[ protein_name ] = ds
- end
- ds.add_domain( pd, overwrite_if_same_from_to )
- end
-
- end
- end
- end
-
- out = File.open( outfile, "a" )
- ds = domain_structures.sort
- for d in ds
- protein_name = d[ 0 ]
- domain_structure = d[ 1 ]
- out.print( protein_name.to_s )
- out.print( ":" )
- out.print( domain_structure.to_NHX )
- out.print( Constants::LINE_DELIMITER )
- end
-
- out.flush()
- out.close()
-
- end # parse
-
-
-
-
- def run()
-
- Util.print_program_information( PRG_NAME,
- PRG_VERSION,
- PRG_DESC,
- PRG_DATE,
- COPYRIGHT,
- CONTACT,
- WWW,
- STDOUT )
-
+ protein_name = a[ 0 ]
+ domain_name = a[ 1 ]
+ seq_from = -1
+ seq_to = -1
begin
- cla = CommandLineArguments.new( ARGV )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
-
- if ( cla.is_option_set?( HELP_OPTION_1 ) ||
- cla.is_option_set?( HELP_OPTION_2 ) )
- print_help
- exit( 0 )
- end
-
- if ( cla.get_number_of_files != 3 )
- print_help
- exit( -1 )
+ seq_from = a[ 2 ].to_i
+ rescue Exception
+ error_msg = "failed to parse seq from from \"" + a[ 2 ] + "\" [line: " + line + "]"
+ raise IOError, error_msg
end
-
- allowed_opts = Array.new
- allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
- allowed_opts.push( OVERWRITE_IF_SAME_FROM_TO_OPTION )
-
- disallowed = cla.validate_allowed_options_as_str( allowed_opts )
- if ( disallowed.length > 0 )
- Util.fatal_error( PRG_NAME,
- "unknown option(s): " + disallowed,
- STDOUT )
+ begin
+ seq_to = a[ 3 ].to_i
+ rescue Exception
+ error_msg = "failed to parse seq to from \"" + a[ 3 ] + "\" [line: " + line + "]"
+ raise IOError, error_msg
end
- domains_list_file = cla.get_file_name( 0 )
- original_sequences_file = cla.get_file_name( 1 )
- outfile = cla.get_file_name( 2 )
-
-
- e_value_threshold = -1.0
- if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
- begin
- e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- if ( e_value_threshold < 0.0 )
- Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
- end
- end
- overwrite_if_same_from_to = false
- if ( cla.is_option_set?( OVERWRITE_IF_SAME_FROM_TO_OPTION ) )
- overwrite_if_same_from_to = true
+ e_value = -1
+ if l > 4
+ begin
+ e_value = a[ 4 ].to_f
+ rescue Exception
+ error_msg = "failed to parse E-value from \"" + a[ 4 ] + "\" [line: " + line + "]"
+ raise IOError, error_msg
+ end
end
- puts()
- puts( "Domains list file : " + domains_list_file )
- puts( "Fasta sequencefile (complete sequences): " + original_sequences_file )
- puts( "Outputfile : " + outfile )
- if ( e_value_threshold >= 0.0 )
- puts( "E-value threshold : " + e_value_threshold.to_s )
- else
- puts( "E-value threshold : no threshold" )
- end
- if ( overwrite_if_same_from_to )
- puts( "Overwrite if same from and to : true" )
- else
- puts( "Overwrite if same from and to : false" )
- end
+ seq = original_seqs.get_by_name_start( protein_name )
- puts
+ total_length = seq.get_length
- begin
- parse( domains_list_file,
- original_sequences_file,
- outfile,
- " ",
- e_value_threshold,
- overwrite_if_same_from_to )
-
- rescue ArgumentError, IOError, StandardError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT )
+ if ( ( ( e_value_threshold < 0.0 ) || ( e_value <= e_value_threshold ) ) )
+ pd = ProteinDomain.new( domain_name, seq_from, seq_to, "", e_value )
+ ds = nil
+ if ( domain_structures.has_key?( protein_name ) )
+ ds = domain_structures[ protein_name ]
+ else
+ ds = DomainStructure.new( total_length )
+ domain_structures[ protein_name ] = ds
+ end
+ ds.add_domain( pd, overwrite_if_same_from_to )
end
-
- puts
- Util.print_message( PRG_NAME, 'OK' )
- puts
-
+ end
end
-
- private
-
- def print_help()
- puts()
- puts( "Usage:" )
- puts()
- puts( " " + PRG_NAME + ".rb [options] <domains list file (parsed hmmpfam output)> <file containing complete sequences in fasta format> <outputfile>" )
- puts()
- puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=<f> : E-value threshold, default is no threshold" )
- puts( " -" + OVERWRITE_IF_SAME_FROM_TO_OPTION + " : overwrite domain with same start and end with domain with better E-value" )
- puts()
+ end
+
+ out = File.open( outfile, "a" )
+ ds = domain_structures.sort
+ for d in ds
+ protein_name = d[ 0 ]
+ domain_structure = d[ 1 ]
+ out.print( protein_name.to_s )
+ out.print( ":" )
+ out.print( domain_structure.to_NHX )
+ out.print( Constants::LINE_DELIMITER )
+ end
+
+ out.flush()
+ out.close()
+
+ end # parse
+
+
+
+
+ def run()
+
+ Util.print_program_information( PRG_NAME,
+ PRG_VERSION,
+ PRG_DESC,
+ PRG_DATE,
+ COPYRIGHT,
+ CONTACT,
+ WWW,
+ STDOUT )
+
+ begin
+ cla = CommandLineArguments.new( ARGV )
+ rescue ArgumentError => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ end
+
+ if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+ cla.is_option_set?( HELP_OPTION_2 ) )
+ print_help
+ exit( 0 )
+ end
+
+ if cla.get_number_of_files != 3
+ print_help
+ exit( -1 )
+ end
+
+ allowed_opts = Array.new
+ allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
+ allowed_opts.push( OVERWRITE_IF_SAME_FROM_TO_OPTION )
+
+ disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+ if ( disallowed.length > 0 )
+ Util.fatal_error( PRG_NAME,
+ "unknown option(s): " + disallowed,
+ STDOUT )
+ end
+
+ domains_list_file = cla.get_file_name( 0 )
+ original_sequences_file = cla.get_file_name( 1 )
+ outfile = cla.get_file_name( 2 )
+
+
+ e_value_threshold = -1.0
+ if cla.is_option_set?( E_VALUE_THRESHOLD_OPTION )
+ begin
+ e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
+ rescue ArgumentError => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
-
-
-
- def is_ignorable?( line )
- return ( line !~ /[A-Za-z0-9-]/ || line =~ /^\s*#/)
+ if ( e_value_threshold < 0.0 )
+ Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
end
-
-
- end # class DomainsToForester
+ end
+ overwrite_if_same_from_to = false
+ if ( cla.is_option_set?( OVERWRITE_IF_SAME_FROM_TO_OPTION ) )
+ overwrite_if_same_from_to = true
+ end
+
+ puts
+ puts( "Domains list file : " + domains_list_file )
+ puts( "Fasta sequencefile (complete sequences): " + original_sequences_file )
+ puts( "Outputfile : " + outfile )
+ if ( e_value_threshold >= 0.0 )
+ puts( "E-value threshold : " + e_value_threshold.to_s )
+ else
+ puts( "E-value threshold : no threshold" )
+ end
+ if ( overwrite_if_same_from_to )
+ puts( "Overwrite if same from and to : true" )
+ else
+ puts( "Overwrite if same from and to : false" )
+ end
+
+ puts
+
+ begin
+ parse( domains_list_file,
+ original_sequences_file,
+ outfile,
+ " ",
+ e_value_threshold,
+ overwrite_if_same_from_to )
+
+ rescue ArgumentError, IOError, StandardError => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ rescue Exception => e
+ Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT )
+ end
+
+
+ puts
+ Util.print_message( PRG_NAME, 'OK' )
+ puts
+
+ end
+
+ private
+
+ def print_help()
+ puts
+ puts( "Usage:" )
+ puts
+ puts( " " + PRG_NAME + ".rb [options] <domains list file (parsed hmmpfam output)> <file containing complete sequences in fasta format> <outputfile>" )
+ puts()
+ puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=<f> : E-value threshold, default is no threshold" )
+ puts( " -" + OVERWRITE_IF_SAME_FROM_TO_OPTION + " : overwrite domain with same start and end with domain with better E-value" )
+ puts
+ end
+
+
+
+ def is_ignorable?( line )
+ return ( line !~ /[A-Za-z0-9-]/ || line =~ /^\s*#/)
+ end
+
+
+ end # class DomainsToForester
end # module Evoruby
--- /dev/null
+#
+# = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanDomainExtractor class
+#
+# Copyright:: Copyright (C) 2012 Christian M. Zmasek
+# License:: GNU Lesser General Public License (LGPL)
+#
+# $Id: $
+
+
+require 'lib/evo/util/constants'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/io/msa_io'
+require 'lib/evo/io/writer/fasta_writer'
+require 'lib/evo/io/parser/fasta_parser'
+
+
+module Evoruby
+
+ class HmmscanDomainExtractor
+
+ TRIM_BY = 2
+
+ def initialize
+ end
+
+ # raises ArgumentError, IOError, StandardError
+ def parse( domain_id,
+ hmmsearch_output,
+ fasta_sequence_file,
+ outfile,
+ passed_seqs_outfile,
+ failed_seqs_outfile,
+ e_value_threshold,
+ length_threshold,
+ add_position,
+ add_domain_number,
+ add_domain_number_as_digit,
+ add_domain_number_as_letter,
+ trim_name,
+ log )
+
+ Util.check_file_for_readability( hmmsearch_output )
+ Util.check_file_for_readability( fasta_sequence_file )
+ Util.check_file_for_writability( outfile )
+ Util.check_file_for_writability( passed_seqs_outfile )
+ Util.check_file_for_writability( failed_seqs_outfile )
+
+ in_msa = nil
+ factory = MsaFactory.new()
+ in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
+
+ if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
+ error_msg = "could not find fasta sequences in " + fasta_sequence_file
+ raise IOError, error_msg
+ end
+
+ out_msa = Msa.new
+ failed_seqs = Msa.new
+ passed_seqs = Msa.new
+
+ ld = Constants::LINE_DELIMITER
+
+ domain_pass_counter = 0
+ domain_fail_counter = 0
+ proteins_with_passing_domains = 0
+ proteins_with_failing_domains = 0
+ max_domain_copy_number_per_protein = -1
+ max_domain_copy_number_sequence = ''
+ failed_species_counts = Hash.new
+ passed_species_counts = Hash.new
+
+ File.open( hmmsearch_output ) do | file |
+ while line = file.gets
+ if !is_ignorable?( line ) && line =~ /^\S+\s+/
+
+ # tn acc tlen query acc qlen Evalue score bias # of c-E i-E score bias hf ht af at ef et acc desc
+ # 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
+ line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/
+
+ target_name = $1
+ if domain_id != target_name
+ next
+ end
+
+
+ sequence = $4
+ number = $10.to_i
+ out_of = $11.to_i
+ env_from = $20.to_i
+ env_to = $21.to_i
+ i_e_value = $13.to_f
+ if ( number > max_domain_copy_number_per_protein )
+ max_domain_copy_number_sequence = sequence
+ max_domain_copy_number_per_protein = number
+ end
+ if ( ( ( e_value_threshold.to_f < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
+ ( ( length_threshold.to_f <= 0 ) || ( env_to - env_from + 1 ) >= length_threshold.to_f ) )
+ extract_domain( sequence,
+ number,
+ out_of,
+ env_from,
+ env_to,
+ in_msa,
+ out_msa,
+ add_position,
+ add_domain_number,
+ add_domain_number_as_digit,
+ add_domain_number_as_letter,
+ trim_name )
+ domain_pass_counter += 1
+ count_species( sequence, passed_species_counts )
+ if passed_seqs.find_by_name_start( sequence, true ).length < 1
+ add_sequence( sequence, in_msa, passed_seqs )
+ proteins_with_passing_domains += 1
+ end
+ else
+ print( domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" )
+ log << domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)"
+ if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) )
+ print( " iE=" + i_e_value.to_s )
+ log << " iE=" + i_e_value.to_s
+ end
+ if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f )
+ le = env_to - env_from + 1
+ print( " l=" + le.to_s )
+ log << " l=" + le.to_s
+ end
+ print( Constants::LINE_DELIMITER )
+ log << Constants::LINE_DELIMITER
+ domain_fail_counter += 1
+ count_species( sequence, failed_species_counts )
+ if failed_seqs.find_by_name_start( sequence, true ).length < 1
+ add_sequence( sequence, in_msa, failed_seqs )
+ proteins_with_failing_domains += 1
+ end
+ end
+ end
+ end
+ end
+
+ if domain_pass_counter < 1
+ error_msg = "no domain sequences were extracted"
+ raise StandardError, error_msg
+ end
+
+ log << Constants::LINE_DELIMITER
+ puts( "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s )
+ log << "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s
+ log << Constants::LINE_DELIMITER
+
+ if ( max_domain_copy_number_per_protein > 1 )
+ puts( "First protein with this copy number: " + max_domain_copy_number_sequence )
+ log << "First protein with this copy number: " + max_domain_copy_number_sequence
+ log << Constants::LINE_DELIMITER
+ end
+
+ io = MsaIO.new()
+ w = FastaWriter.new()
+ w.set_line_width( 60 )
+ w.clean( true )
+
+ begin
+ io.write_to_file( out_msa, outfile, w )
+ rescue Exception
+ error_msg = "could not write to \"" + outfile + "\""
+ raise IOError, error_msg
+ end
+
+ begin
+ io.write_to_file( passed_seqs, passed_seqs_outfile, w )
+ rescue Exception
+ error_msg = "could not write to \"" + passed_seqs_outfile + "\""
+ raise IOError, error_msg
+ end
+
+ begin
+ io.write_to_file( failed_seqs, failed_seqs_outfile, w )
+ rescue Exception
+ error_msg = "could not write to \"" + failed_seqs_outfile + "\""
+ raise IOError, error_msg
+ end
+
+ log << ld
+ log << "passing domains : " + domain_pass_counter.to_s + ld
+ log << "failing domains : " + domain_fail_counter.to_s + ld
+ log << "proteins with passing domains: " + proteins_with_passing_domains.to_s + ld
+ log << "proteins with failing domains: " + proteins_with_failing_domains.to_s + ld
+ log << ld
+ log << 'passing domains counts per species: ' << ld
+ passed_species_counts.each_pair { | species, count | log << "#{species}: #{count}" << ld }
+ log << ld
+ log << 'failing domains counts per species: ' << ld
+ failed_species_counts.each_pair { | species, count | log << "#{species}: #{count}" << ld }
+ log << ld
+ return domain_pass_counter
+
+ end # parse
+
+ private
+
+
+ def add_sequence( sequence_name, in_msa, add_to_msa )
+ seqs = in_msa.find_by_name_start( sequence_name, true )
+ if ( seqs.length < 1 )
+ error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
+ raise StandardError, error_msg
+ end
+ if ( seqs.length > 1 )
+ error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
+ raise StandardError, error_msg
+ end
+ seq = in_msa.get_sequence( seqs[ 0 ] )
+ add_to_msa.add_sequence( seq )
+ end
+
+ # raises ArgumentError, StandardError
+ def extract_domain( sequence,
+ number,
+ out_of,
+ seq_from,
+ seq_to,
+ in_msa,
+ out_msa,
+ add_position,
+ add_domain_number,
+ add_domain_number_as_digit,
+ add_domain_number_as_letter,
+ trim_name )
+ if ( number < 1 || out_of < 1 || number > out_of )
+ error_msg = "impossible: number=" + number.to_s + ", out of=" + out_of.to_s
+ raise ArgumentError, error_msg
+ end
+ if ( seq_from < 1 || seq_to < 1 || seq_from >= seq_to )
+ error_msg = "impossible: seq-f=" + seq_from.to_s + ", seq-t=" + seq_to.to_s
+ raise ArgumentError, error_msg
+ end
+ seqs = in_msa.find_by_name_start( sequence, true )
+ if seqs.length < 1
+ error_msg = "sequence \"" + sequence + "\" not found in sequence file"
+ raise StandardError, error_msg
+ end
+ if seqs.length > 1
+ error_msg = "sequence \"" + sequence + "\" not unique in sequence file"
+ raise StandardError, error_msg
+ end
+ # hmmsearch is 1 based, wheres sequences are 0 bases in this package.
+ seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
+
+ seq.set_name( seq.get_name.split[ 0 ] )
+
+ if add_position
+ seq.set_name( seq.get_name + "_" + seq_from.to_s + "-" + seq_to.to_s )
+ end
+
+ if trim_name
+ seq.set_name( seq.get_name[ 0, seq.get_name.length - TRIM_BY ] )
+ end
+
+ if out_of != 1
+ if ( add_domain_number_as_digit )
+ seq.set_name( seq.get_name + number.to_s )
+ elsif ( add_domain_number_as_letter )
+ if number > 25
+ error_msg = 'too many identical domains per sequence, cannot use letters to distinguish them'
+ raise StandardError, error_msg
+ end
+ seq.set_name( seq.get_name + ( number + 96 ).chr )
+ elsif ( add_domain_number )
+ seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
+ end
+ end
+
+ # if ( seq.get_name.length > 10 )
+ # error_msg = "sequence name [" + seq.get_name + "] is longer than 10 characters"
+ # raise StandardError, error_msg
+ # end
+
+ out_msa.add_sequence( seq )
+ end
+
+ def count_species( sequence, species_counts_map )
+ species = get_species( sequence )
+ if species != nil
+ if !species_counts_map.has_key?( species )
+ species_counts_map[ species ] = 1
+ else
+ species_counts_map[ species ] = species_counts_map[ species ] + 1
+ end
+ end
+ end
+
+ def get_species( sequence_name )
+ if sequence_name =~ /^.+_(.+)$/
+ return $1
+ else
+ return nil
+ end
+ end
+
+ def is_ignorable?( line )
+ return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
+ end
+
+ end # class HmmscanDomainExtractor
+
+end # module Evoruby
+
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/
- # line =~ /^(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)/
sequence = $1
number = $10.to_i
out_of = $11.to_i