From: cmzmasek@gmail.com Date: Fri, 7 Sep 2012 17:24:58 +0000 (+0000) Subject: in progress X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=fbe3c1c03a26ef842dcff136357f7b4d4c279f24;p=jalview.git in progress --- diff --git a/forester/ruby/evoruby/exe/dsx2.rb b/forester/ruby/evoruby/exe/dsx2.rb new file mode 100755 index 0000000..dc793d1 --- /dev/null +++ b/forester/ruby/evoruby/exe/dsx2.rb @@ -0,0 +1,20 @@ +#!/usr/local/bin/ruby -w +# +# = exe/dsx +# +# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: dsx.rb,v 1.3 2008/08/28 17:09:06 cmzmasek Exp $ +# +# last modified: 06/11/2007 + +require 'lib/evo/apps/domain_sequence_extractor_new' + +module Evoruby + + dsx = DomainSequenceExtractor.new() + + dsx.run() + +end # module Evoruby diff --git a/forester/ruby/evoruby/exe/select_same_gn.rb b/forester/ruby/evoruby/exe/select_same_gn.rb index 7ab2c91..5458eef 100755 --- a/forester/ruby/evoruby/exe/select_same_gn.rb +++ b/forester/ruby/evoruby/exe/select_same_gn.rb @@ -77,6 +77,6 @@ module Evoruby end end w = FastaWriter.new - w.write(unique_genes_msa, "uniques.fasta") - w.write(longest_non_unique_genes_msa, "non_uniques_longest.fasta") + w.write(unique_genes_msa, "seqs_from_unique_genes.fasta") + w.write(longest_non_unique_genes_msa, "longest_seqs_from_nonunique_genes.fasta") end diff --git a/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor_new.rb b/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor_new.rb new file mode 100644 index 0000000..efa3968 --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/apps/domain_sequence_extractor_new.rb @@ -0,0 +1,266 @@ +# +# = lib/evo/apps/domain_sequence_extractor.rb - DomainSequenceExtractor class +# +# Copyright:: Copyright (C) 2012 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id:Exp $ + + +require 'lib/evo/util/constants' +require 'lib/evo/util/util' +require 'lib/evo/util/command_line_arguments' +require 'lib/evo/io/parser/hmmscan_domain_extractor' + +module Evoruby + + class DomainSequenceExtractor + + PRG_NAME = "dsx" + PRG_VERSION = "1.000" + PRG_DESC = "extraction of domain sequences from hmmscan output" + PRG_DATE = "20120906" + COPYRIGHT = "2012 Christian M Zmasek" + CONTACT = "phylosoft@gmail.com" + WWW = "www.phylosoft.org" + + E_VALUE_THRESHOLD_OPTION = 'e' + LENGTH_THRESHOLD_OPTION = 'l' + ADD_POSITION_OPTION = 'p' + ADD_DOMAIN_NUMBER_OPTION = 'd' + ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT = 'dd' + ADD_DOMAIN_NUMBER_OPTION_AS_LETTER = 'dl' + TRIM_OPTION = 't' + LOG_FILE_SUFFIX = '_domain_seq_extr.log' + PASSED_SEQS_SUFFIX = '_domain_seq_extr_passed' + FAILED_SEQS_SUFFIX = '_domain_seq_extr_failed' + HELP_OPTION_1 = 'help' + HELP_OPTION_2 = 'h' + + def run() + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC , + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + ld = Constants::LINE_DELIMITER + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError + Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT ) + end + + if ( cla.is_option_set?( HELP_OPTION_1 ) || + cla.is_option_set?( HELP_OPTION_2 ) ) + print_help + exit( 0 ) + end + + if ( cla.get_number_of_files != 4 ) + print_help + exit( -1 ) + end + + allowed_opts = Array.new + allowed_opts.push( E_VALUE_THRESHOLD_OPTION ) + allowed_opts.push( ADD_POSITION_OPTION ) + allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION ) + allowed_opts.push( LENGTH_THRESHOLD_OPTION ) + allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT ) + allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER ) + allowed_opts.push( TRIM_OPTION ) + + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, + "unknown option(s): " + disallowed, + STDOUT ) + end + + domain_id = cla.get_file_name( 0 ) + hmmsearch_output = cla.get_file_name( 1 ) + fasta_sequence_file = cla.get_file_name( 2 ) + outfile = cla.get_file_name( 3 ) + + add_position = false + if ( cla.is_option_set?( ADD_POSITION_OPTION ) ) + add_position = true + end + + trim = false + if ( cla.is_option_set?( TRIM_OPTION ) ) + trim = true + end + + add_domain_number = false + add_domain_number_as_letter = false + add_domain_number_as_digit = false + + if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) ) + add_domain_number = true + end + if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_LETTER ) ) + add_domain_number_as_letter = true + end + if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT ) ) + add_domain_number_as_digit = true + end + + if ( add_domain_number_as_letter && add_domain_number_as_digit ) + puts( "attempt to add domain number as letter and digit at the same time" ) + print_help + exit( -1 ) + end + + e_value_threshold = -1.0 + if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) ) + begin + e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION ) + rescue ArgumentError => e + Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + if ( e_value_threshold < 0.0 ) + Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT ) + end + end + + length_threshold = -1 + if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) ) + begin + length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION ) + rescue ArgumentError => e + Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + if ( length_threshold < 0) + Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT ) + end + end + + log = String.new + + puts() + puts( "Domain : " + domain_id ) + log << "Domain : " + domain_id + ld + puts( "Hmmscan outputfile : " + hmmsearch_output ) + log << "Hmmscan outputfile : " + hmmsearch_output + ld + puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file ) + log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld + puts( "Outputfile : " + outfile ) + log << "Outputfile : " + outfile + ld + puts( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX ) + log << "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX + ld + puts( "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX ) + log << "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX + ld + puts( "Logfile : " + outfile + LOG_FILE_SUFFIX ) + log << "Logfile : " + outfile + LOG_FILE_SUFFIX + ld + if ( e_value_threshold >= 0.0 ) + puts( "E-value threshold : " + e_value_threshold.to_s ) + log << "E-value threshold : " + e_value_threshold.to_s + ld + else + puts( "E-value threshold : no threshold" ) + log << "E-value threshold : no threshold" + ld + end + if ( length_threshold > 0 ) + puts( "Length threshold : " + length_threshold.to_s ) + log << "Length threshold : " + length_threshold.to_s + ld + else + puts( "Length threshold : no threshold" ) + log << "Length threshold : no threshold" + ld + end + + if ( trim ) + puts( "Trim last 2 chars : true" ) + log << "Trim last 2 chars : true" + ld + else + puts( "Trim names : false" ) + log << "Trim names : false" + ld + end + + + if ( add_position ) + puts( "Add positions (rel to complete seq) to extracted domains: true" ) + log << "Add positions (rel to complete seq) to extracted domains: true" + ld + else + puts( "Add positions (rel to complete seq) to extracted domains: false" ) + log << "Add positions (rel to complete seq) to extracted domains: false" + ld + end + + if ( add_domain_number || add_domain_number_as_digit || add_domain_number_as_letter ) + puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" ) + log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld + else + puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" ) + log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld + end + + puts + + domain_count = 0 + begin + parser = HmmscanDomainExtractor.new() + domain_count = parser.parse( domain_id, + hmmsearch_output, + fasta_sequence_file, + outfile, + outfile + PASSED_SEQS_SUFFIX, + outfile + FAILED_SEQS_SUFFIX, + e_value_threshold, + length_threshold, + add_position, + add_domain_number, + add_domain_number_as_digit, + add_domain_number_as_letter, + trim, + log ) + rescue ArgumentError, IOError, StandardError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "unexpected exception!: " + e.to_s, STDOUT ) + end + + puts + Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" ) + Util.print_message( PRG_NAME, "wrote; " + outfile ) + Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX ) + Util.print_message( PRG_NAME, "(wrote: " + outfile + PASSED_SEQS_SUFFIX + ")" ) + Util.print_message( PRG_NAME, "(wrote: " + outfile + FAILED_SEQS_SUFFIX + ")" ) + + begin + f = File.open( outfile + LOG_FILE_SUFFIX, 'a' ) + f.print( log ) + f.close + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + + puts + Util.print_message( PRG_NAME, "OK" ) + puts + + end + + def print_help() + puts() + puts( "Usage:" ) + puts() + puts( " " + PRG_NAME + ".rb [options] " ) + puts() + puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=: E-value threshold, default is no threshold" ) + puts( " -" + LENGTH_THRESHOLD_OPTION + "=: length threshold, default is no threshold" ) + puts( " -" + ADD_POSITION_OPTION + ": to add positions (rel to complete seq) to extracted domains" ) + puts( " -" + ADD_DOMAIN_NUMBER_OPTION + ": to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" ) + puts( " -" + ADD_DOMAIN_NUMBER_OPTION_AS_DIGIT + ": to add numbers to extracted domains as digit (example \"domain2\")" ) + puts( " -" + ADD_DOMAIN_NUMBER_OPTION_AS_LETTER + ": to add numbers to extracted domains as letter (example \"domaina\")" ) + puts( " -" + TRIM_OPTION + ": to remove the last 2 characters from sequence names" ) + puts() + end + + end # class DomainSequenceExtractor + +end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/apps/domains_to_forester.rb b/forester/ruby/evoruby/lib/evo/apps/domains_to_forester.rb index a03cc2a..a031150 100644 --- a/forester/ruby/evoruby/lib/evo/apps/domains_to_forester.rb +++ b/forester/ruby/evoruby/lib/evo/apps/domains_to_forester.rb @@ -4,7 +4,7 @@ # Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek # License:: GNU Lesser General Public License (LGPL) # -# $Id: domains_to_forester.rb,v 1.11 2010/12/13 19:00:11 cmzmasek Exp $ +# $Id: Exp $ # # last modified: 06/11/2007 @@ -18,235 +18,235 @@ require 'lib/evo/sequence/domain_structure' module Evoruby - class DomainsToForester - - PRG_NAME = "d2f" - PRG_DESC = "parsed hmmpfam output to forester format" - PRG_VERSION = "1.0.0" - PRG_DATE = "2007.12.18" - COPYRIGHT = "2007 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" - - E_VALUE_THRESHOLD_OPTION = "e" - OVERWRITE_IF_SAME_FROM_TO_OPTION = "o" - HELP_OPTION_1 = "help" - HELP_OPTION_2 = "h" - - def parse( domains_list_file, - original_seqs_file, - outfile, - column_delimiter, - e_value_threshold, - overwrite_if_same_from_to ) - Util.check_file_for_readability( domains_list_file ) - Util.check_file_for_readability( original_seqs_file ) - Util.check_file_for_writability( outfile ) - - domain_structures = Hash.new() # protein name is key, domain structure is value - - f = MsaFactory.new - - original_seqs = f.create_msa_from_file( original_seqs_file, FastaParser.new ) - if ( original_seqs.get_number_of_seqs < 1 ) - error_msg = "\"" + original_seqs_file + "\" appears devoid of sequences in fasta-format" - raise ArgumentError, error_msg + class DomainsToForester + + PRG_NAME = "d2f" + PRG_DESC = "parsed hmmpfam output to forester format" + PRG_VERSION = "1.001" + PRG_DATE = "20120807" + COPYRIGHT = "2012 Christian M Zmasek" + CONTACT = "phylosoft@gmail.com" + WWW = "www.phylosoft.org" + + E_VALUE_THRESHOLD_OPTION = "e" + OVERWRITE_IF_SAME_FROM_TO_OPTION = "o" + HELP_OPTION_1 = "help" + HELP_OPTION_2 = "h" + + def parse( domains_list_file, + original_seqs_file, + outfile, + column_delimiter, + e_value_threshold, + overwrite_if_same_from_to ) + Util.check_file_for_readability( domains_list_file ) + Util.check_file_for_readability( original_seqs_file ) + Util.check_file_for_writability( outfile ) + + domain_structures = Hash.new() # protein name is key, domain structure is value + + f = MsaFactory.new + + original_seqs = f.create_msa_from_file( original_seqs_file, FastaParser.new ) + if ( original_seqs.get_number_of_seqs < 1 ) + error_msg = "\"" + original_seqs_file + "\" appears devoid of sequences in fasta-format" + raise ArgumentError, error_msg + end + + File.open( domains_list_file ) do | file | + while line = file.gets + if !is_ignorable?( line ) + a = line.split( column_delimiter ) + l = a.length + if ( ( l < 4 ) || ( e_value_threshold >= 0.0 && l < 5 ) ) + error_msg = "unexpected format at line: " + line + raise IOError, error_msg end - - File.open( domains_list_file ) do | file | - while line = file.gets - if ( !is_ignorable?( line ) ) - a = line.split( column_delimiter ) - l = a.length - if ( ( l < 4 ) || ( e_value_threshold >= 0.0 && l < 5 ) ) - error_msg = "unexpected format at line: " + line - raise IOError, error_msg - end - protein_name = a[ 0 ] - domain_name = a[ 1 ] - seq_from = -1 - seq_to = -1 - begin - seq_from = a[ 2 ].to_i - rescue Exception - error_msg = "failed to parse seq from from \"" + a[ 2 ] + "\" [line: " + line + "]" - raise IOError, error_msg - end - begin - seq_to = a[ 3 ].to_i - rescue Exception - error_msg = "failed to parse seq to from \"" + a[ 3 ] + "\" [line: " + line + "]" - raise IOError, error_msg - end - - e_value = -1 - if ( l > 4 ) - begin - e_value = a[ 4 ].to_f - rescue Exception - error_msg = "failed to parse E-value from \"" + a[ 4 ] + "\" [line: " + line + "]" - raise IOError, error_msg - end - end - - seq = original_seqs.get_by_name( protein_name, true, false ) - - total_length = seq.get_length - - if ( ( ( e_value_threshold < 0.0 ) || ( e_value <= e_value_threshold ) ) ) - pd = ProteinDomain.new( domain_name, seq_from, seq_to, "", e_value ) - ds = nil - if ( domain_structures.has_key?( protein_name ) ) - ds = domain_structures[ protein_name ] - else - ds = DomainStructure.new( total_length ) - domain_structures[ protein_name ] = ds - end - ds.add_domain( pd, overwrite_if_same_from_to ) - end - - end - end - end - - out = File.open( outfile, "a" ) - ds = domain_structures.sort - for d in ds - protein_name = d[ 0 ] - domain_structure = d[ 1 ] - out.print( protein_name.to_s ) - out.print( ":" ) - out.print( domain_structure.to_NHX ) - out.print( Constants::LINE_DELIMITER ) - end - - out.flush() - out.close() - - end # parse - - - - - def run() - - Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC, - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) - + protein_name = a[ 0 ] + domain_name = a[ 1 ] + seq_from = -1 + seq_to = -1 begin - cla = CommandLineArguments.new( ARGV ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - - if ( cla.is_option_set?( HELP_OPTION_1 ) || - cla.is_option_set?( HELP_OPTION_2 ) ) - print_help - exit( 0 ) - end - - if ( cla.get_number_of_files != 3 ) - print_help - exit( -1 ) + seq_from = a[ 2 ].to_i + rescue Exception + error_msg = "failed to parse seq from from \"" + a[ 2 ] + "\" [line: " + line + "]" + raise IOError, error_msg end - - allowed_opts = Array.new - allowed_opts.push( E_VALUE_THRESHOLD_OPTION ) - allowed_opts.push( OVERWRITE_IF_SAME_FROM_TO_OPTION ) - - disallowed = cla.validate_allowed_options_as_str( allowed_opts ) - if ( disallowed.length > 0 ) - Util.fatal_error( PRG_NAME, - "unknown option(s): " + disallowed, - STDOUT ) + begin + seq_to = a[ 3 ].to_i + rescue Exception + error_msg = "failed to parse seq to from \"" + a[ 3 ] + "\" [line: " + line + "]" + raise IOError, error_msg end - domains_list_file = cla.get_file_name( 0 ) - original_sequences_file = cla.get_file_name( 1 ) - outfile = cla.get_file_name( 2 ) - - - e_value_threshold = -1.0 - if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) ) - begin - e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - if ( e_value_threshold < 0.0 ) - Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT ) - end - end - overwrite_if_same_from_to = false - if ( cla.is_option_set?( OVERWRITE_IF_SAME_FROM_TO_OPTION ) ) - overwrite_if_same_from_to = true + e_value = -1 + if l > 4 + begin + e_value = a[ 4 ].to_f + rescue Exception + error_msg = "failed to parse E-value from \"" + a[ 4 ] + "\" [line: " + line + "]" + raise IOError, error_msg + end end - puts() - puts( "Domains list file : " + domains_list_file ) - puts( "Fasta sequencefile (complete sequences): " + original_sequences_file ) - puts( "Outputfile : " + outfile ) - if ( e_value_threshold >= 0.0 ) - puts( "E-value threshold : " + e_value_threshold.to_s ) - else - puts( "E-value threshold : no threshold" ) - end - if ( overwrite_if_same_from_to ) - puts( "Overwrite if same from and to : true" ) - else - puts( "Overwrite if same from and to : false" ) - end + seq = original_seqs.get_by_name_start( protein_name ) - puts + total_length = seq.get_length - begin - parse( domains_list_file, - original_sequences_file, - outfile, - " ", - e_value_threshold, - overwrite_if_same_from_to ) - - rescue ArgumentError, IOError, StandardError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - rescue Exception => e - Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT ) + if ( ( ( e_value_threshold < 0.0 ) || ( e_value <= e_value_threshold ) ) ) + pd = ProteinDomain.new( domain_name, seq_from, seq_to, "", e_value ) + ds = nil + if ( domain_structures.has_key?( protein_name ) ) + ds = domain_structures[ protein_name ] + else + ds = DomainStructure.new( total_length ) + domain_structures[ protein_name ] = ds + end + ds.add_domain( pd, overwrite_if_same_from_to ) end - - puts - Util.print_message( PRG_NAME, 'OK' ) - puts - + end end - - private - - def print_help() - puts() - puts( "Usage:" ) - puts() - puts( " " + PRG_NAME + ".rb [options] " ) - puts() - puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "= : E-value threshold, default is no threshold" ) - puts( " -" + OVERWRITE_IF_SAME_FROM_TO_OPTION + " : overwrite domain with same start and end with domain with better E-value" ) - puts() + end + + out = File.open( outfile, "a" ) + ds = domain_structures.sort + for d in ds + protein_name = d[ 0 ] + domain_structure = d[ 1 ] + out.print( protein_name.to_s ) + out.print( ":" ) + out.print( domain_structure.to_NHX ) + out.print( Constants::LINE_DELIMITER ) + end + + out.flush() + out.close() + + end # parse + + + + + def run() + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC, + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + + if ( cla.is_option_set?( HELP_OPTION_1 ) || + cla.is_option_set?( HELP_OPTION_2 ) ) + print_help + exit( 0 ) + end + + if cla.get_number_of_files != 3 + print_help + exit( -1 ) + end + + allowed_opts = Array.new + allowed_opts.push( E_VALUE_THRESHOLD_OPTION ) + allowed_opts.push( OVERWRITE_IF_SAME_FROM_TO_OPTION ) + + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, + "unknown option(s): " + disallowed, + STDOUT ) + end + + domains_list_file = cla.get_file_name( 0 ) + original_sequences_file = cla.get_file_name( 1 ) + outfile = cla.get_file_name( 2 ) + + + e_value_threshold = -1.0 + if cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) + begin + e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end - - - - def is_ignorable?( line ) - return ( line !~ /[A-Za-z0-9-]/ || line =~ /^\s*#/) + if ( e_value_threshold < 0.0 ) + Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT ) end - - - end # class DomainsToForester + end + overwrite_if_same_from_to = false + if ( cla.is_option_set?( OVERWRITE_IF_SAME_FROM_TO_OPTION ) ) + overwrite_if_same_from_to = true + end + + puts + puts( "Domains list file : " + domains_list_file ) + puts( "Fasta sequencefile (complete sequences): " + original_sequences_file ) + puts( "Outputfile : " + outfile ) + if ( e_value_threshold >= 0.0 ) + puts( "E-value threshold : " + e_value_threshold.to_s ) + else + puts( "E-value threshold : no threshold" ) + end + if ( overwrite_if_same_from_to ) + puts( "Overwrite if same from and to : true" ) + else + puts( "Overwrite if same from and to : false" ) + end + + puts + + begin + parse( domains_list_file, + original_sequences_file, + outfile, + " ", + e_value_threshold, + overwrite_if_same_from_to ) + + rescue ArgumentError, IOError, StandardError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT ) + end + + + puts + Util.print_message( PRG_NAME, 'OK' ) + puts + + end + + private + + def print_help() + puts + puts( "Usage:" ) + puts + puts( " " + PRG_NAME + ".rb [options] " ) + puts() + puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "= : E-value threshold, default is no threshold" ) + puts( " -" + OVERWRITE_IF_SAME_FROM_TO_OPTION + " : overwrite domain with same start and end with domain with better E-value" ) + puts + end + + + + def is_ignorable?( line ) + return ( line !~ /[A-Za-z0-9-]/ || line =~ /^\s*#/) + end + + + end # class DomainsToForester end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb new file mode 100644 index 0000000..bd3ae8b --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb @@ -0,0 +1,307 @@ +# +# = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanDomainExtractor class +# +# Copyright:: Copyright (C) 2012 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: $ + + +require 'lib/evo/util/constants' +require 'lib/evo/msa/msa_factory' +require 'lib/evo/io/msa_io' +require 'lib/evo/io/writer/fasta_writer' +require 'lib/evo/io/parser/fasta_parser' + + +module Evoruby + + class HmmscanDomainExtractor + + TRIM_BY = 2 + + def initialize + end + + # raises ArgumentError, IOError, StandardError + def parse( domain_id, + hmmsearch_output, + fasta_sequence_file, + outfile, + passed_seqs_outfile, + failed_seqs_outfile, + e_value_threshold, + length_threshold, + add_position, + add_domain_number, + add_domain_number_as_digit, + add_domain_number_as_letter, + trim_name, + log ) + + Util.check_file_for_readability( hmmsearch_output ) + Util.check_file_for_readability( fasta_sequence_file ) + Util.check_file_for_writability( outfile ) + Util.check_file_for_writability( passed_seqs_outfile ) + Util.check_file_for_writability( failed_seqs_outfile ) + + in_msa = nil + factory = MsaFactory.new() + in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() ) + + if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 ) + error_msg = "could not find fasta sequences in " + fasta_sequence_file + raise IOError, error_msg + end + + out_msa = Msa.new + failed_seqs = Msa.new + passed_seqs = Msa.new + + ld = Constants::LINE_DELIMITER + + domain_pass_counter = 0 + domain_fail_counter = 0 + proteins_with_passing_domains = 0 + proteins_with_failing_domains = 0 + max_domain_copy_number_per_protein = -1 + max_domain_copy_number_sequence = '' + failed_species_counts = Hash.new + passed_species_counts = Hash.new + + File.open( hmmsearch_output ) do | file | + while line = file.gets + if !is_ignorable?( line ) && line =~ /^\S+\s+/ + + # tn acc tlen query acc qlen Evalue score bias # of c-E i-E score bias hf ht af at ef et acc desc + # 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 + line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/ + + target_name = $1 + if domain_id != target_name + next + end + + + sequence = $4 + number = $10.to_i + out_of = $11.to_i + env_from = $20.to_i + env_to = $21.to_i + i_e_value = $13.to_f + if ( number > max_domain_copy_number_per_protein ) + max_domain_copy_number_sequence = sequence + max_domain_copy_number_per_protein = number + end + if ( ( ( e_value_threshold.to_f < 0.0 ) || ( i_e_value <= e_value_threshold ) ) && + ( ( length_threshold.to_f <= 0 ) || ( env_to - env_from + 1 ) >= length_threshold.to_f ) ) + extract_domain( sequence, + number, + out_of, + env_from, + env_to, + in_msa, + out_msa, + add_position, + add_domain_number, + add_domain_number_as_digit, + add_domain_number_as_letter, + trim_name ) + domain_pass_counter += 1 + count_species( sequence, passed_species_counts ) + if passed_seqs.find_by_name_start( sequence, true ).length < 1 + add_sequence( sequence, in_msa, passed_seqs ) + proteins_with_passing_domains += 1 + end + else + print( domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" ) + log << domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" + if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) ) + print( " iE=" + i_e_value.to_s ) + log << " iE=" + i_e_value.to_s + end + if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f ) + le = env_to - env_from + 1 + print( " l=" + le.to_s ) + log << " l=" + le.to_s + end + print( Constants::LINE_DELIMITER ) + log << Constants::LINE_DELIMITER + domain_fail_counter += 1 + count_species( sequence, failed_species_counts ) + if failed_seqs.find_by_name_start( sequence, true ).length < 1 + add_sequence( sequence, in_msa, failed_seqs ) + proteins_with_failing_domains += 1 + end + end + end + end + end + + if domain_pass_counter < 1 + error_msg = "no domain sequences were extracted" + raise StandardError, error_msg + end + + log << Constants::LINE_DELIMITER + puts( "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s ) + log << "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s + log << Constants::LINE_DELIMITER + + if ( max_domain_copy_number_per_protein > 1 ) + puts( "First protein with this copy number: " + max_domain_copy_number_sequence ) + log << "First protein with this copy number: " + max_domain_copy_number_sequence + log << Constants::LINE_DELIMITER + end + + io = MsaIO.new() + w = FastaWriter.new() + w.set_line_width( 60 ) + w.clean( true ) + + begin + io.write_to_file( out_msa, outfile, w ) + rescue Exception + error_msg = "could not write to \"" + outfile + "\"" + raise IOError, error_msg + end + + begin + io.write_to_file( passed_seqs, passed_seqs_outfile, w ) + rescue Exception + error_msg = "could not write to \"" + passed_seqs_outfile + "\"" + raise IOError, error_msg + end + + begin + io.write_to_file( failed_seqs, failed_seqs_outfile, w ) + rescue Exception + error_msg = "could not write to \"" + failed_seqs_outfile + "\"" + raise IOError, error_msg + end + + log << ld + log << "passing domains : " + domain_pass_counter.to_s + ld + log << "failing domains : " + domain_fail_counter.to_s + ld + log << "proteins with passing domains: " + proteins_with_passing_domains.to_s + ld + log << "proteins with failing domains: " + proteins_with_failing_domains.to_s + ld + log << ld + log << 'passing domains counts per species: ' << ld + passed_species_counts.each_pair { | species, count | log << "#{species}: #{count}" << ld } + log << ld + log << 'failing domains counts per species: ' << ld + failed_species_counts.each_pair { | species, count | log << "#{species}: #{count}" << ld } + log << ld + return domain_pass_counter + + end # parse + + private + + + def add_sequence( sequence_name, in_msa, add_to_msa ) + seqs = in_msa.find_by_name_start( sequence_name, true ) + if ( seqs.length < 1 ) + error_msg = "sequence \"" + sequence_name + "\" not found in sequence file" + raise StandardError, error_msg + end + if ( seqs.length > 1 ) + error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file" + raise StandardError, error_msg + end + seq = in_msa.get_sequence( seqs[ 0 ] ) + add_to_msa.add_sequence( seq ) + end + + # raises ArgumentError, StandardError + def extract_domain( sequence, + number, + out_of, + seq_from, + seq_to, + in_msa, + out_msa, + add_position, + add_domain_number, + add_domain_number_as_digit, + add_domain_number_as_letter, + trim_name ) + if ( number < 1 || out_of < 1 || number > out_of ) + error_msg = "impossible: number=" + number.to_s + ", out of=" + out_of.to_s + raise ArgumentError, error_msg + end + if ( seq_from < 1 || seq_to < 1 || seq_from >= seq_to ) + error_msg = "impossible: seq-f=" + seq_from.to_s + ", seq-t=" + seq_to.to_s + raise ArgumentError, error_msg + end + seqs = in_msa.find_by_name_start( sequence, true ) + if seqs.length < 1 + error_msg = "sequence \"" + sequence + "\" not found in sequence file" + raise StandardError, error_msg + end + if seqs.length > 1 + error_msg = "sequence \"" + sequence + "\" not unique in sequence file" + raise StandardError, error_msg + end + # hmmsearch is 1 based, wheres sequences are 0 bases in this package. + seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 ) + + seq.set_name( seq.get_name.split[ 0 ] ) + + if add_position + seq.set_name( seq.get_name + "_" + seq_from.to_s + "-" + seq_to.to_s ) + end + + if trim_name + seq.set_name( seq.get_name[ 0, seq.get_name.length - TRIM_BY ] ) + end + + if out_of != 1 + if ( add_domain_number_as_digit ) + seq.set_name( seq.get_name + number.to_s ) + elsif ( add_domain_number_as_letter ) + if number > 25 + error_msg = 'too many identical domains per sequence, cannot use letters to distinguish them' + raise StandardError, error_msg + end + seq.set_name( seq.get_name + ( number + 96 ).chr ) + elsif ( add_domain_number ) + seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s ) + end + end + + # if ( seq.get_name.length > 10 ) + # error_msg = "sequence name [" + seq.get_name + "] is longer than 10 characters" + # raise StandardError, error_msg + # end + + out_msa.add_sequence( seq ) + end + + def count_species( sequence, species_counts_map ) + species = get_species( sequence ) + if species != nil + if !species_counts_map.has_key?( species ) + species_counts_map[ species ] = 1 + else + species_counts_map[ species ] = species_counts_map[ species ] + 1 + end + end + end + + def get_species( sequence_name ) + if sequence_name =~ /^.+_(.+)$/ + return $1 + else + return nil + end + end + + def is_ignorable?( line ) + return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ ) + end + + end # class HmmscanDomainExtractor + +end # module Evoruby + diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmsearch_domain_extractor.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmsearch_domain_extractor.rb index d228024..238d13c 100644 --- a/forester/ruby/evoruby/lib/evo/io/parser/hmmsearch_domain_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/io/parser/hmmsearch_domain_extractor.rb @@ -76,7 +76,6 @@ module Evoruby # 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/ - # line =~ /^(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)/ sequence = $1 number = $10.to_i out_of = $11.to_i