+++ /dev/null
-#
-# = lib/evo/apps/domain_sequence_extractor.rb - DomainSequenceExtractor class
-#
-# Copyright:: Copyright (C) 2012 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
-#
-# $Id:Exp $
-
-
-require 'lib/evo/util/constants'
-require 'lib/evo/util/util'
-require 'lib/evo/util/command_line_arguments'
-require 'lib/evo/io/parser/hmmscan_domain_extractor'
-
-module Evoruby
-
- class DomainSequenceExtractor
-
- PRG_NAME = "dsx"
- PRG_VERSION = "2.000"
- PRG_DESC = "extraction of domain sequences from hmmscan output"
- PRG_DATE = "20121001"
- COPYRIGHT = "2012 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
-
- E_VALUE_THRESHOLD_OPTION = 'e'
- LENGTH_THRESHOLD_OPTION = 'l'
- ADD_POSITION_OPTION = 'p'
- ADD_DOMAIN_NUMBER_OPTION = 'd'
- ADD_SPECIES = 's'
- MIN_LINKER_OPT = 'ml'
- LOG_FILE_SUFFIX = '_domain_seq_extr.log'
- PASSED_SEQS_SUFFIX = '_with_passing_domains.fasta'
- FAILED_SEQS_SUFFIX = '_with_no_passing_domains.fasta'
- HELP_OPTION_1 = 'help'
- HELP_OPTION_2 = 'h'
-
- def run()
-
- Util.print_program_information( PRG_NAME,
- PRG_VERSION,
- PRG_DESC ,
- PRG_DATE,
- COPYRIGHT,
- CONTACT,
- WWW,
- STDOUT )
-
- ld = Constants::LINE_DELIMITER
-
- begin
- cla = CommandLineArguments.new( ARGV )
- rescue ArgumentError
- Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT )
- end
-
- if ( cla.is_option_set?( HELP_OPTION_1 ) ||
- cla.is_option_set?( HELP_OPTION_2 ) )
- print_help
- exit( 0 )
- end
-
- if ( cla.get_number_of_files != 4 )
- print_help
- exit( -1 )
- end
-
- allowed_opts = Array.new
- allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
- allowed_opts.push( ADD_POSITION_OPTION )
- allowed_opts.push( ADD_DOMAIN_NUMBER_OPTION )
- allowed_opts.push( LENGTH_THRESHOLD_OPTION )
- allowed_opts.push( ADD_SPECIES )
- allowed_opts.push( MIN_LINKER_OPT )
-
- disallowed = cla.validate_allowed_options_as_str( allowed_opts )
- if ( disallowed.length > 0 )
- Util.fatal_error( PRG_NAME,
- "unknown option(s): " + disallowed,
- STDOUT )
- end
-
- domain_id = cla.get_file_name( 0 )
- hmmsearch_output = cla.get_file_name( 1 )
- fasta_sequence_file = cla.get_file_name( 2 )
- outfile = cla.get_file_name( 3 )
-
- if outfile.downcase.end_with?( ".fasta" )
- outfile = outfile[ 0 .. outfile.length - 7 ]
- elsif outfile.downcase.end_with?( ".fsa" )
- outfile = outfile[ 0 .. outfile.length - 5 ]
- end
-
-
- add_position = false
- if ( cla.is_option_set?( ADD_POSITION_OPTION ) )
- add_position = true
- end
-
- add_domain_number = false
- if ( cla.is_option_set?( ADD_DOMAIN_NUMBER_OPTION ) )
- add_domain_number = true
- end
-
- add_species = false
- if cla.is_option_set? ADD_SPECIES
- add_species = true
- end
-
- e_value_threshold = -1.0
- if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
- begin
- e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
- rescue ArgumentError => e
- Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- if ( e_value_threshold < 0.0 )
- Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
- end
- end
-
- length_threshold = -1
- if ( cla.is_option_set?( LENGTH_THRESHOLD_OPTION ) )
- begin
- length_threshold = cla.get_option_value_as_int( LENGTH_THRESHOLD_OPTION )
- rescue ArgumentError => e
- Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- if ( length_threshold < 0)
- Forester::Util.fatal_error( PRG_NAME, "attempt to use a negative length threshold", STDOUT )
- end
- end
-
-
- min_linker = nil
- if ( cla.is_option_set?( MIN_LINKER_OPT ) )
- begin
- min_linker = cla.get_option_value_as_int( MIN_LINKER_OPT )
- rescue ArgumentError => e
- Forester::Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- if ( !min_linker || min_linker > 100 || min_linker < -100 )
- Forester::Util.fatal_error( PRG_NAME, "unexpected value for min linker " + min_linker.to_s, STDOUT )
- end
- end
-
-
- log = String.new
-
- puts()
- puts( "Domain : " + domain_id )
- log << "Domain : " + domain_id + ld
- puts( "Hmmscan outputfile : " + hmmsearch_output )
- log << "Hmmscan outputfile : " + hmmsearch_output + ld
- puts( "Fasta sequencefile (complete sequences): " + fasta_sequence_file )
- log << "Fasta sequencefile (complete sequences): " + fasta_sequence_file + ld
- puts( "Outputfile : " + outfile + ".fasta" )
- log << "Outputfile : " + outfile + ld
- puts( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX )
- log << "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX + ld
- puts( "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX )
- log << "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX + ld
- puts( "Logfile : " + outfile + LOG_FILE_SUFFIX )
- log << "Logfile : " + outfile + LOG_FILE_SUFFIX + ld
- if ( e_value_threshold >= 0.0 )
- puts( "E-value threshold : " + e_value_threshold.to_s )
- log << "E-value threshold : " + e_value_threshold.to_s + ld
- else
- puts( "E-value threshold : no threshold" )
- log << "E-value threshold : no threshold" + ld
- end
- if ( length_threshold > 0 )
- puts( "Length threshold : " + length_threshold.to_s )
- log << "Length threshold : " + length_threshold.to_s + ld
- else
- puts( "Length threshold : no threshold" )
- log << "Length threshold : no threshold" + ld
- end
- if ( min_linker )
- puts( "Min linker : " + min_linker.to_s )
- log << "Min linker : " + min_linker.to_s + ld
-
- end
-
-
- if ( add_position )
- puts( "Add positions (rel to complete seq) to extracted domains: true" )
- log << "Add positions (rel to complete seq) to extracted domains: true" + ld
- else
- puts( "Add positions (rel to complete seq) to extracted domains: false" )
- log << "Add positions (rel to complete seq) to extracted domains: false" + ld
- end
-
- if ( add_domain_number )
- puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): true" )
- log << "Add numbers to extracted domains (in case of more than one domain per complete seq): true" + ld
- else
- puts( "Add numbers to extracted domains (in case of more than one domain per complete seq): false" )
- log << "Add numbers to extracted domains (in case of more than one domain per complete seq): false" + ld
- end
-
- puts
-
- domain_count = 0
- begin
- parser = HmmscanDomainExtractor.new()
- domain_count = parser.parse( domain_id,
- hmmsearch_output,
- fasta_sequence_file,
- outfile,
- outfile + PASSED_SEQS_SUFFIX,
- outfile + FAILED_SEQS_SUFFIX,
- e_value_threshold,
- length_threshold,
- add_position,
- add_domain_number,
- add_species,
- min_linker,
- log )
- rescue ArgumentError, IOError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
-
- rescue Exception => e
- puts e.backtrace
- Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT )
-
- end
-
- puts
- Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" )
- Util.print_message( PRG_NAME, "wrote; " + outfile + ".fasta")
- Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX )
- Util.print_message( PRG_NAME, "wrote: " + outfile + PASSED_SEQS_SUFFIX )
- Util.print_message( PRG_NAME, "wrote: " + outfile + FAILED_SEQS_SUFFIX )
-
- begin
- f = File.open( outfile + LOG_FILE_SUFFIX, 'a' )
- f.print( log )
- f.close
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
-
- puts
- Util.print_message( PRG_NAME, "OK" )
- puts
-
- end
-
- def print_help()
- puts()
- puts( "Usage:" )
- puts()
- puts( " " + PRG_NAME + ".rb [options] <domain> <hmmscan outputfile> <file containing complete sequences in fasta format> <outputfile>" )
- puts()
- puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=<f>: E-value threshold, default is no threshold" )
- puts( " -" + LENGTH_THRESHOLD_OPTION + "=<i>: length threshold, default is no threshold" )
- puts( " -" + ADD_POSITION_OPTION + ": to add positions (rel to complete seq) to extracted domains" )
- puts( " -" + ADD_DOMAIN_NUMBER_OPTION + ": to add numbers to extracted domains (in case of more than one domain per complete seq) (example \"domain~2-3\")" )
- puts( " -" + ADD_SPECIES + ": to add species [in brackets]" )
- puts( " -" + MIN_LINKER_OPT + "=<i>: to extract pairs of same domains with a distance inbetween shorter than a given value" )
- puts()
- end
-
- end # class DomainSequenceExtractor
-
-end # module Evoruby
+++ /dev/null
-#
-# = lib/evo/apps/domains_to_forester - DomainsToForester class
-#
-# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
-#
-# $Id: Exp $
-#
-# last modified: 06/11/2007
-
-require 'lib/evo/util/constants'
-require 'lib/evo/util/util'
-require 'lib/evo/util/command_line_arguments'
-require 'lib/evo/msa/msa_factory'
-require 'lib/evo/io/parser/fasta_parser'
-require 'lib/evo/sequence/protein_domain'
-require 'lib/evo/sequence/domain_structure'
-
-module Evoruby
-
- class DomainsToForester
-
- PRG_NAME = "d2f"
- PRG_DESC = "parsed hmmpfam output to forester format"
- PRG_VERSION = "1.001"
- PRG_DATE = "20120807"
- COPYRIGHT = "2012 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
-
- E_VALUE_THRESHOLD_OPTION = "e"
- OVERWRITE_IF_SAME_FROM_TO_OPTION = "o"
- HELP_OPTION_1 = "help"
- HELP_OPTION_2 = "h"
-
- def parse( domains_list_file,
- original_seqs_file,
- outfile,
- column_delimiter,
- e_value_threshold,
- overwrite_if_same_from_to )
- Util.check_file_for_readability( domains_list_file )
- Util.check_file_for_readability( original_seqs_file )
- Util.check_file_for_writability( outfile )
-
- domain_structures = Hash.new() # protein name is key, domain structure is value
-
- f = MsaFactory.new
-
- original_seqs = f.create_msa_from_file( original_seqs_file, FastaParser.new )
- if ( original_seqs.get_number_of_seqs < 1 )
- error_msg = "\"" + original_seqs_file + "\" appears devoid of sequences in fasta-format"
- raise ArgumentError, error_msg
- end
-
- File.open( domains_list_file ) do | file |
- while line = file.gets
- if !is_ignorable?( line )
-
- a = line.split( column_delimiter )
- l = a.length
- if ( ( l < 4 ) || ( e_value_threshold >= 0.0 && l < 5 ) )
- error_msg = "unexpected format at line: " + line
- raise IOError, error_msg
- end
- protein_name = a[ 0 ]
- domain_name = a[ 1 ]
- seq_from = -1
- seq_to = -1
- ##########################################
- if domain_name =~ /RRM_\d/
- puts "ignoring " + line
- next
- end
- ##########################################
-
-
- begin
- seq_from = a[ 2 ].to_i
- rescue Exception
- error_msg = "failed to parse seq from from \"" + a[ 2 ] + "\" [line: " + line + "]"
- raise IOError, error_msg
- end
- begin
- seq_to = a[ 3 ].to_i
- rescue Exception
- error_msg = "failed to parse seq to from \"" + a[ 3 ] + "\" [line: " + line + "]"
- raise IOError, error_msg
- end
-
- e_value = -1
- if l > 4
- begin
- e_value = a[ 4 ].to_f
- rescue Exception
- error_msg = "failed to parse E-value from \"" + a[ 4 ] + "\" [line: " + line + "]"
- raise IOError, error_msg
- end
- end
-
- seq = original_seqs.get_by_name_start( protein_name )
-
- total_length = seq.get_length
-
- if ( ( ( e_value_threshold < 0.0 ) || ( e_value <= e_value_threshold ) ) )
- pd = ProteinDomain.new( domain_name, seq_from, seq_to, "", e_value )
- ds = nil
- if ( domain_structures.has_key?( protein_name ) )
- ds = domain_structures[ protein_name ]
- else
- ds = DomainStructure.new( total_length )
- domain_structures[ protein_name ] = ds
- end
- ds.add_domain( pd, overwrite_if_same_from_to )
- end
-
- end
- end
- end
-
- out = File.open( outfile, "a" )
- ds = domain_structures.sort
- for d in ds
- protein_name = d[ 0 ]
- domain_structure = d[ 1 ]
- out.print( protein_name.to_s )
- out.print( ": " )
- out.print( domain_structure.to_NHX )
- out.print( Constants::LINE_DELIMITER )
- end
-
- out.flush()
- out.close()
-
- end # parse
-
-
-
-
- def run()
-
- Util.print_program_information( PRG_NAME,
- PRG_VERSION,
- PRG_DESC,
- PRG_DATE,
- COPYRIGHT,
- CONTACT,
- WWW,
- STDOUT )
-
- begin
- cla = CommandLineArguments.new( ARGV )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
-
- if ( cla.is_option_set?( HELP_OPTION_1 ) ||
- cla.is_option_set?( HELP_OPTION_2 ) )
- print_help
- exit( 0 )
- end
-
- if cla.get_number_of_files != 3
- print_help
- exit( -1 )
- end
-
- allowed_opts = Array.new
- allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
- allowed_opts.push( OVERWRITE_IF_SAME_FROM_TO_OPTION )
-
- disallowed = cla.validate_allowed_options_as_str( allowed_opts )
- if ( disallowed.length > 0 )
- Util.fatal_error( PRG_NAME,
- "unknown option(s): " + disallowed,
- STDOUT )
- end
-
- domains_list_file = cla.get_file_name( 0 )
- original_sequences_file = cla.get_file_name( 1 )
- outfile = cla.get_file_name( 2 )
-
-
- e_value_threshold = -1.0
- if cla.is_option_set?( E_VALUE_THRESHOLD_OPTION )
- begin
- e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- if ( e_value_threshold < 0.0 )
- Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
- end
- end
- overwrite_if_same_from_to = false
- if ( cla.is_option_set?( OVERWRITE_IF_SAME_FROM_TO_OPTION ) )
- overwrite_if_same_from_to = true
- end
-
- puts
- puts( "Domains list file : " + domains_list_file )
- puts( "Fasta sequencefile (complete sequences): " + original_sequences_file )
- puts( "Outputfile : " + outfile )
- if ( e_value_threshold >= 0.0 )
- puts( "E-value threshold : " + e_value_threshold.to_s )
- else
- puts( "E-value threshold : no threshold" )
- end
- if ( overwrite_if_same_from_to )
- puts( "Overwrite if same from and to : true" )
- else
- puts( "Overwrite if same from and to : false" )
- end
-
- puts
-
- begin
- parse( domains_list_file,
- original_sequences_file,
- outfile,
- " ",
- e_value_threshold,
- overwrite_if_same_from_to )
-
- rescue ArgumentError, IOError, StandardError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT )
- end
-
-
- puts
- Util.print_message( PRG_NAME, 'OK' )
- puts
-
- end
-
- private
-
- def print_help()
- puts
- puts( "Usage:" )
- puts
- puts( " " + PRG_NAME + ".rb [options] <domains list file (parsed hmmpfam output)> <file containing complete sequences in fasta format> <outputfile>" )
- puts()
- puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=<f> : E-value threshold, default is no threshold" )
- puts( " -" + OVERWRITE_IF_SAME_FROM_TO_OPTION + " : overwrite domain with same start and end with domain with better E-value" )
- puts
- end
-
-
-
- def is_ignorable?( line )
- return ( line !~ /[A-Za-z0-9-]/ || line =~ /^\s*#/)
- end
-
-
- end # class DomainsToForester
-
-
-end # module Evoruby
+++ /dev/null
-#
-# = lib/evo/apps/evo_nursery.rb - EvoNursery class
-#
-# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
-#
-# $Id: evo_nursery.rb,v 1.11 2010/12/13 19:00:11 cmzmasek Exp $
-
-
-
-require 'lib/evo/soft/fastme'
-require 'lib/evo/soft/tree_puzzle'
-require 'lib/evo/util/constants'
-require 'lib/evo/util/util'
-require 'lib/evo/util/command_line_arguments'
-require 'lib/evo/msa/msa_factory'
-require 'lib/evo/io/msa_io'
-require 'lib/evo/io/writer/phylip_sequential_writer'
-require 'lib/evo/io/parser/general_msa_parser'
-require 'lib/evo/io/writer/msa_writer'
-
-require 'iconv'
-
-module Evoruby
-
- class EvoNursery
- GAP_RATIO = 0.50
- GAP_RATIO_FOR_SEQS = 0.75
- MIN_LENGTH = 30
- MIN_SEQS = 4
- MAX_SEQS = 3000
- MAX_ALN_FILE_SIZE = 5000000
- MODEL = :auto
- RATES = :uniform
- FASTME_INITIAL_TREE = :GME
- ALN_NAME = '_align_'
- TREE_PUZZLE_OUTDIST = TreePuzzle::OUTDIST
- TREE_PUZZLE_OUTFILE = TreePuzzle::OUTFILE
- FASTME_OUTTREE = FastMe::OUTTREE
- FASTME_OUTPUT_D = FastMe::OUTPUT_D
-
- PRG_NAME = "evo_nursery"
- PRG_DATE = "2012.03.21"
- PRG_DESC = "pfam alignments to evolutionary trees"
- PRG_VERSION = "0.20"
- COPYRIGHT = "2009-2012 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
-
- HELP_OPTION_1 = "help"
- HELP_OPTION_2 = "h"
-
- def run
-
- Util.print_program_information( PRG_NAME,
- PRG_VERSION,
- PRG_DESC,
- PRG_DATE,
- COPYRIGHT,
- CONTACT,
- WWW,
- STDOUT )
-
- if RUBY_VERSION !~ /1.9/
- puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " )
- exit( -1 )
- end
-
- forester_home = Util.get_env_variable_value( Constants::FORESTER_HOME_ENV_VARIABLE )
- java_home = Util.get_env_variable_value( Constants::JAVA_HOME_ENV_VARIABLE )
- decorator = java_home + '/bin/java -cp ' + forester_home + '/java/forester.jar org.forester.application.decorator'
-
- if ( ARGV == nil || ARGV.length != 1 )
- help
- exit( -1 )
- end
-
- begin
- cla = CommandLineArguments.new( ARGV )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
-
- if ( cla.is_option_set?( HELP_OPTION_1 ) ||
- cla.is_option_set?( HELP_OPTION_2 ) )
- help
- exit( 0 )
- end
-
- output_dir = cla.get_file_name( 0 )
-
- if output_dir !~ /\/$/
- output_dir = output_dir + '/'
- end
-
- if !File.exists?( output_dir )
- Util.fatal_error( PRG_NAME, output_dir.to_s + " does not exist", STDOUT )
- end
- ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' )
- files = Dir.entries( "." )
- skipped = Array.new
- counter = 1
- analyzed = 0;
- begin
- files.each { |pfam_aln_file|
- if ( !File.directory?( pfam_aln_file ) &&
- pfam_aln_file !~ /^\./ &&
- pfam_aln_file !~ /.+\.tre$/ )
-
- tree_out_file = output_dir + File.basename( pfam_aln_file ) + ".xml"
-
- if File.exists?( tree_out_file )
- puts
- puts
- puts "***** skipping " + File.basename( pfam_aln_file ) + ", already exists"
- puts
- skipped.push( File.basename( pfam_aln_file ) + " [already exists]" )
- next
- end
-
- puts
- puts counter.to_s + ": " + pfam_aln_file.to_str
- counter += 1
- if File.size( pfam_aln_file ) > MAX_ALN_FILE_SIZE
- puts "***** skipping, file size: " + File.size( pfam_aln_file ).to_s
- skipped.push( File.basename( pfam_aln_file ) + " [file size: " + File.size( pfam_aln_file ).to_s + "]" )
- next
- end
-
- f = MsaFactory.new()
- msa = f.create_msa_from_file( pfam_aln_file, GeneralMsaParser.new() )
-
- if msa.get_number_of_seqs < MIN_SEQS || msa.get_number_of_seqs > MAX_SEQS
- puts "***** skipping, seqs: " + msa.get_number_of_seqs.to_s
- skipped.push( File.basename( pfam_aln_file ) + " [seqs: " + msa.get_number_of_seqs.to_s + "]" )
- next
- end
-
- msa.remove_gap_columns_w_gap_ratio!( GAP_RATIO )
-
- length = msa.get_length
- if length < MIN_LENGTH
- puts "***** skipping, length: " + length.to_s
- skipped.push( File.basename( pfam_aln_file ) + " [length: " + length.to_s + "]" )
- next
- end
-
- msa.remove_sequences_by_gap_ratio!( GAP_RATIO_FOR_SEQS )
-
- if msa.get_number_of_seqs < MIN_SEQS
- puts "***** skipping, seqs: " + msa.get_number_of_seqs.to_s
- skipped.push( File.basename( pfam_aln_file ) + " [seqs: " + msa.get_number_of_seqs.to_s + "]" )
- next
- end
-
- map_file = output_dir + File.basename( pfam_aln_file ) + ".map"
- f = File.open( map_file, 'a' )
- for i in 0 ... msa.get_number_of_seqs
- name = msa.get_sequence( i ).get_name()
- name =~ /(.+)_(.+)\/.+/
- acc = $1
- tax_code = $2
-
- mapping_str = i.to_s
- mapping_str << "\t"
- mapping_str << 'TAXONOMY_CODE:'
- mapping_str << tax_code
- mapping_str << "\t"
- mapping_str << 'SEQ_SYMBOL:'
- mapping_str << ( acc + '_' + tax_code )
- mapping_str << "\t"
- if ( acc.length < 6 )
- acc = acc + '_' + tax_code
- end
- mapping_str << 'SEQ_ACCESSION:'
- mapping_str << acc
- mapping_str << "\t"
- mapping_str << 'SEQ_ACCESSION_SOURCE:UniProtKB'
- mapping_str << "\t"
- mapping_str << 'NODE_NAME:'
- mapping_str << name
- f.print( mapping_str )
- f.print( "\n" )
- name = msa.get_sequence( i ).set_name( i.to_s )
- end
- f.close
-
- io = MsaIO.new()
- w = MsaWriter
- w = PhylipSequentialWriter.new()
- w.clean( true )
- w.set_max_name_length( 10 )
- if File.exists?( output_dir + ALN_NAME )
- File.unlink( output_dir + ALN_NAME )
- end
- io.write_to_file( msa, output_dir + ALN_NAME, w )
-
- tp = TreePuzzle.new()
- tp.run( output_dir + ALN_NAME,
- MODEL,
- RATES,
- msa.get_number_of_seqs )
-
- File.rename( output_dir + ALN_NAME, output_dir + File.basename( pfam_aln_file ) + ".aln" )
-
- fastme = FastMe.new()
- fastme.run( TREE_PUZZLE_OUTDIST, 0, FASTME_INITIAL_TREE )
-
- pfam_acc = nil
- pfam_de = nil
- File.open( pfam_aln_file ) do |file|
- while line = file.gets
- line = ic.iconv( line )
- if line =~ /^#=AC\s+(.+)/
- pfam_acc = $1
- end
- if line =~ /^#=DE\s+(.+)/
- pfam_de = $1
- end
- if pfam_acc && pfam_de
- break
- end
- end
- end
- if !pfam_acc || !pfam_de
- Util.fatal_error( PRG_NAME, "problem with " + pfam_aln_file.to_s, STDOUT )
- end
-
- puzzle_model = nil
- File.open( TREE_PUZZLE_OUTFILE ) do |file|
- while line = file.gets
- line = ic.iconv( line )
- if line =~ /^Model\s+of\s+substitution:\s+(.+)/
- puzzle_model = $1
- break
- end
- end
- end
- if !puzzle_model
- Util.fatal_error( PRG_NAME, "problem with puzzle outfile: " + TREE_PUZZLE_OUTFILE.to_s, STDOUT )
- end
-
- desc = pfam_de
- desc << ' | '
- desc << 'ML pwd estimation by TREE-PUZZLE version '
- desc << TreePuzzle::VERSION
- desc << ', model: '
- desc << puzzle_model
- desc << ', rates: '
- desc << RATES.to_s
- desc << '; tree estimation by FastME version '
- desc << FastMe::VERSION
- desc << ', initial tree: '
- desc << FASTME_INITIAL_TREE.to_s
- desc << '; aln length: '
- desc << msa.get_length.to_s
-
- cmd = decorator + " -table -p -pn=\"" + pfam_aln_file +
- "\" -pi=pfam:" + pfam_acc +
- " -pd=\"" + desc + "\" " +
- FASTME_OUTTREE + ' ' +
- map_file + ' ' + tree_out_file
-
- IO.popen( cmd , 'r+' ) do | pipe |
- pipe.close_write
- end
- analyzed += 1
-
- File.unlink( map_file )
- File.unlink(TREE_PUZZLE_OUTDIST)
- File.unlink( TREE_PUZZLE_OUTFILE )
- File.unlink( FASTME_OUTPUT_D )
- end
- }
- rescue ArgumentError, IOError, StandardError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
-
- puts()
- puts( 'Skipped:' )
- puts()
- for i in 0 ... skipped.size
- puts i.to_s + ": " + skipped[ i ]
- end
-
- puts()
- puts( 'Skipped : ' + skipped.size.to_s + ' alignments' )
- puts( 'Analyzed: ' + analyzed.to_s + ' alignments' )
-
- puts( 'Min gap ratio for col del : ' + GAP_RATIO.to_s )
- puts( 'Min gap ratio for seq del : ' + GAP_RATIO_FOR_SEQS.to_s )
- puts( 'Minimal aln length : ' + MIN_LENGTH.to_s )
- puts( 'Minimal number of sequences: ' + MIN_SEQS.to_s )
- puts( 'Maximal number of sequences: ' + MAX_SEQS.to_s )
- puts( 'Maximal aln file size : ' + MAX_ALN_FILE_SIZE.to_s )
- puts( 'Model : ' + MODEL.to_s )
- puts( 'FastME initial tree: ' + FASTME_INITIAL_TREE.to_s )
-
- puts()
- puts( '[' + PRG_NAME + '] > OK' )
- puts()
-
- end # run
-
- private
-
- def help
- puts( "Usage:" )
- puts()
- puts( " " + PRG_NAME + ".rb <output dir> " )
- puts()
- end
-
-
- end # class EvoNursery
-
-end # module Evoruby
\ No newline at end of file
+++ /dev/null
-#
-# = lib/evo/apps/fasta_extractor.rb - FastaExtractor class
-#
-# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
-#
-# $Id: fasta_extractor.rb,v 1.2 2010/12/13 19:00:11 cmzmasek Exp $
-
-
-require 'lib/evo/util/util'
-require 'lib/evo/util/constants'
-require 'lib/evo/util/command_line_arguments'
-
-
-module Evoruby
-
- class FastaExtractor
-
- PRG_NAME = "fae"
- PRG_VERSION = "1.0.0"
- PRG_DESC = "extraction of nucleotide sequences from a fasta file by names from wublast search"
- PRG_DATE = "2008.08.09"
- COPYRIGHT = "2008-2009 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
- HELP_OPTION_1 = 'help'
- HELP_OPTION_2 = 'h'
-
-
- def run()
-
- Util.print_program_information( PRG_NAME,
- PRG_VERSION,
- PRG_DESC ,
- PRG_DATE,
- COPYRIGHT,
- CONTACT,
- WWW,
- STDOUT )
-
- ld = Constants::LINE_DELIMITER
-
- begin
- cla = CommandLineArguments.new( ARGV )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
-
- if ( cla.is_option_set?( HELP_OPTION_1 ) ||
- cla.is_option_set?( HELP_OPTION_2 ) )
- print_help
- exit( 0 )
- end
-
- if ( cla.get_number_of_files != 3 )
- print_help
- exit( -1 )
- end
-
- allowed_opts = Array.new
-
- disallowed = cla.validate_allowed_options_as_str( allowed_opts )
- if ( disallowed.length > 0 )
- Util.fatal_error( PRG_NAME,
- "unknown option(s): " + disallowed,
- STDOUT )
- end
-
- input_file = cla.get_file_name( 0 )
- names_file = cla.get_file_name( 1 )
- output_file = cla.get_file_name( 2 )
-
- if !File.exist?( input_file )
- Util.fatal_error( PRG_NAME, "error: input file [#{input_file}] does not exist" )
- end
- if !File.exist?( names_file )
- Util.fatal_error( PRG_NAME, "error: names file [#{names_file}] does not exist" )
- end
- if File.exist?( output_file )
- Util.fatal_error( PRG_NAME, "error: [#{output_file }] already exists" )
- end
-
- names = extract_names_with_frames( names_file )
-
- extract_sequences( names, input_file, output_file )
-
- puts
- Util.print_message( PRG_NAME, "OK" )
- puts
-
- end
-
-
- def extract_names_with_frames( names_file )
- names = Hash.new()
- File.open( names_file ) do | file |
- while line = file.gets
- if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
- if ( line =~ /(\S+)\s+([+|-]\d)\s+\d+\s+(\S+)/ )
- name = $1
- frame = $2
- e = $3
- names[ name ] = "[" + frame + "] [" + e + "]"
- end
- end
- end
- end
- names
- end
-
- def extract_sequences( names, fasta_file, output_file )
- output = File.open( output_file, "a" )
- matching_state = false
- counter = 0
- File.open( fasta_file ) do | file |
- while line = file.gets
- if !Util.is_string_empty?( line )
- if ( line =~ /\s*>\s*(.+)/ )
- name = $1
- if names.has_key?( name )
- matching_state = true
- counter += 1
- puts counter.to_s + ". " +name + " " + names[ name ]
- output.print( ">" + name + " " + names[ name ] )
- output.print( Evoruby::Constants::LINE_DELIMITER )
- else
- matching_state = false
- end
- elsif matching_state
- output.print( line )
- end
- end
- end
- end
- output.close()
- end
-
- def print_help()
- puts( "Usage:" )
- puts()
- puts( " " + PRG_NAME + ".rb <input fasta file> <names file based on blast output> <output file>" )
- puts()
- end
-
- end # class FastaExtractor
-end
\ No newline at end of file
+++ /dev/null
-#
-# = lib/evo/apps/fasta_taxonomy_processor - FastaTaxonomyProcessor class
-#
-# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
-#
-# $Id: fasta_taxonomy_processor.rb,v 1.4 2010/12/13 19:00:11 cmzmasek Exp $
-
-
-require 'lib/evo/util/util'
-require 'lib/evo/msa/msa_factory'
-require 'lib/evo/msa/msa'
-require 'lib/evo/io/msa_io'
-require 'lib/evo/io/parser/sp_taxonomy_parser'
-require 'lib/evo/io/parser/fasta_parser'
-require 'lib/evo/io/writer/fasta_writer'
-require 'lib/evo/io/writer/phylip_sequential_writer'
-require 'lib/evo/util/command_line_arguments'
-require 'lib/evo/apps/tseq_taxonomy_processor'
-
-module Evoruby
-
- class FastaTaxonomyProcessor
-
- PRG_NAME = "fasta_tap"
- PRG_DATE = "2009.01.20"
- PRG_DESC = "preprocessing of multiple sequence files in ncbi fasta format"
- PRG_VERSION = "1.00"
- COPYRIGHT = "2009 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
-
- def initialize()
- @tax_ids_to_sp_taxonomies = Hash.new()
- end
-
- def run()
-
- Util.print_program_information( PRG_NAME,
- PRG_VERSION,
- PRG_DESC,
- PRG_DATE,
- COPYRIGHT,
- CONTACT,
- WWW,
- STDOUT )
-
- if ARGV == nil || ARGV.length != 4
- puts( "Usage: #{PRG_NAME}.rb <sp taxonomy file> <sequences in ncbi fasta format> <name for fasta outfile> <name for map outfile>" )
- puts()
- exit( -1 )
- end
-
- begin
- cla = CommandLineArguments.new( ARGV )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
- allowed_opts = Array.new
- disallowed = cla.validate_allowed_options_as_str( allowed_opts )
- if ( disallowed.length > 0 )
- Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
- end
-
- sp_taxonomy_infile = cla.get_file_name( 0 )
- sequences_infile = cla.get_file_name( 1 )
- sequences_outfile = cla.get_file_name( 2 )
- mapping_outfile = cla.get_file_name( 3 )
-
- Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile )
- Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile )
- Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile )
- Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile )
-
- sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile )
-
- Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile )
-
- fasta_parser = FastaParser.new
- msa_fac = MsaFactory.new
-
- seqs = msa_fac.create_msa_from_file( sequences_infile, fasta_parser )
-
- Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile )
-
- removed = seqs.remove_redundant_sequences!( true, true )
-
- if removed.size > 0
- Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
- removed.each { | seq_name |
- puts seq_name
- }
- Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" )
- end
-
- mapping_out = File.open( mapping_outfile, "a" )
-
- for i in 0 ... seqs.get_number_of_seqs
- seq = seqs.get_sequence( i )
- seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) )
- end
-
- io = MsaIO.new()
-
- w = FastaWriter.new()
-
- w.set_max_name_length( 10 )
- w.clean( true )
- begin
- io.write_to_file( seqs, sequences_outfile, w )
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
- end
- mapping_out.close()
-
- Util.print_message( PRG_NAME, "wrote: " + mapping_outfile )
- Util.print_message( PRG_NAME, "wrote: " + sequences_outfile )
- Util.print_message( PRG_NAME, "OK" )
-
- end
-
- private
-
- def modify_name( seq, i, sp_taxonomies, mapping_outfile )
-
- #i = i + 1792
-
- seq_desc = seq.get_name
-
- taxonomy_sn = nil
-
- if seq_desc =~ /\[(.+)\]/
- taxonomy_sn = $1
- else
- Util.fatal_error( PRG_NAME, "no taxonomy in [" + seq_desc + "]" )
- end
-
- matching_sp_taxonomy = nil
-
- sp_taxonomies.each { |sp_taxonomy|
- if ( sp_taxonomy.scientific_name == taxonomy_sn )
- matching_sp_taxonomy = sp_taxonomy
- end
- }
-
- if matching_sp_taxonomy == nil
- Util.fatal_error( PRG_NAME, "taxonomy [" + taxonomy_sn + "] for [" + seq_desc + "] not found" )
- end
-
- new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code
-
- gi = nil
- if seq_desc =~ /gi\|(.+?)\|/
- gi = $1
- else
- Util.fatal_error( PRG_NAME, "no gi in [" + seq_desc + "]" )
- end
-
- seq_name = ""
-
- if seq_desc =~ /\|\s*([^|]+?)\s*\[/
- seq_name = $1
- end
-
- if seq_name =~ /\[.+\]$/
- # Redundant taxonomy information hides here.
- seq_name = seq_name.sub(/\[.+\]$/, '')
- end
- if seq_name =~ /^\s*hypothetical\s+protein\s*/i
- # Pointless information.
- seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' )
- end
- if seq_name =~ /^\s*conserved\s+hypothetical\s+protein\s*/i
- # Pointless information.
- seq_name = seq_name.sub( /^\s*conserved\s+hypothetical\s+protein\s*/i, '' )
- end
-
- if gi != nil
- mapping_outfile.print( new_name + "\t" +
- TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
- TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" +
- TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" +
- TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
- TseqTaxonomyProcessor::SEQ_ACCESSION + gi.to_s + "\t" +
- TseqTaxonomyProcessor::SEQ_ACCESSION_SOURCE + "gi" + "\t" +
- TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" +
- TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string +
- Constants::LINE_DELIMITER )
- else
- mapping_outfile.print( new_name + "\t" +
- TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
- TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" +
- TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" +
- TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
- TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" +
- TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string +
- Constants::LINE_DELIMITER )
-
- end
- new_name
- end
-
- end
-
-end # module Evoruby
+++ /dev/null
-#
-# = lib/evo/apps/hmmscan_parser.rb - HmmscanParser class
-#
-# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
-#
-# $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
-#
-# last modified: 11/24/2009
-
-require 'lib/evo/util/constants'
-require 'lib/evo/util/util'
-require 'lib/evo/util/command_line_arguments'
-
-module Evoruby
-
- class HmmscanParser
-
- PRG_NAME = "hsp"
- PRG_VERSION = "1.0.1"
- PRG_DESC = "hmmscan parser"
- PRG_DATE = "2009.11.24"
- COPYRIGHT = "2009 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
-
- DELIMITER_OPTION = "d"
- E_VALUE_THRESHOLD_OPTION = "e"
- IGNORE_DUF_OPTION = "i"
- PARSE_OUT_DESCRIPITION_OPTION = "a"
- HELP_OPTION_1 = "help"
- HELP_OPTION_2 = "h"
-
- def initialize
- @domain_counts = Hash.new
- end
-
- # raises ArgumentError, IOError
- def parse( inpath,
- outpath,
- column_delimiter,
- e_value_threshold,
- ignore_dufs,
- get_descriptions )
- Util.check_file_for_readability( inpath )
- Util.check_file_for_writability( outpath )
-
- outfile = File.open( outpath, "a" )
-
- query = String.new
- desc = String.new
- model = String.new
- env_from = String.new
- env_to = String.new
- i_e_value = String.new
-
- queries_count = 0
-
- nl = Constants::LINE_DELIMITER
-
- File.open( inpath ) do | file |
- while line = file.gets
- if !HmmscanParser.is_ignorable?( line ) && line =~ /^\S+\s+\S/
-
- # tn acc tlen query acc qlen Evalue score bias # of c-E i-E score bias hf ht af at ef et acc desc
- # 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
- line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/
-
- model = $1
- query = $4
- i_e_value = $13.to_f
- env_from = $20.to_i
- env_to = $21.to_i
-
- if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
- ( !ignore_dufs || ( model !~ /^DUF\d+/ ) ) )
- count_model( model )
- outfile.print( query +
- column_delimiter )
- if ( get_descriptions )
- outfile.print( desc +
- column_delimiter )
- end
- outfile.print( model +
- column_delimiter +
- env_from.to_s +
- column_delimiter +
- env_to.to_s +
- column_delimiter +
- i_e_value.to_s )
- outfile.print( nl )
- end
- end
- end # while line = file.gets
- end
- outfile.flush()
- outfile.close()
-
- return queries_count
-
- end # def parse
-
- def count_model( model )
- if ( @domain_counts.has_key?( model ) )
- count = @domain_counts[ model ].to_i
- count += 1
- @domain_counts[ model ] = count
- else
- @domain_counts[ model ] = 1
- end
- end
-
-
- def get_domain_counts()
- return @domain_counts
- end
-
- def run()
-
- Util.print_program_information( PRG_NAME,
- PRG_VERSION,
- PRG_DESC,
- PRG_DATE,
- COPYRIGHT,
- CONTACT,
- WWW,
- STDOUT )
-
- begin
- cla = CommandLineArguments.new( ARGV )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
-
- if ( cla.is_option_set?( HELP_OPTION_1 ) ||
- cla.is_option_set?( HELP_OPTION_2 ) )
- print_help
- exit( 0 )
- end
-
- if ( cla.get_number_of_files != 2 )
- print_help
- exit( -1 )
- end
-
- allowed_opts = Array.new
- allowed_opts.push( DELIMITER_OPTION )
- allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
- allowed_opts.push( IGNORE_DUF_OPTION )
- allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
-
- disallowed = cla.validate_allowed_options_as_str( allowed_opts )
- if ( disallowed.length > 0 )
- Util.fatal_error( PRG_NAME,
- "unknown option(s): " + disallowed,
- STDOUT )
- end
-
- inpath = cla.get_file_name( 0 )
- outpath = cla.get_file_name( 1 )
-
- column_delimiter = "\t"
- if ( cla.is_option_set?( DELIMITER_OPTION ) )
- begin
- column_delimiter = cla.get_option_value( DELIMITER_OPTION )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- end
-
- e_value_threshold = -1.0
- if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
- begin
- e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- if ( e_value_threshold < 0.0 )
- Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
- end
- end
-
- ignore_dufs = false
- if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
- ignore_dufs = true
- end
-
- parse_descriptions = false
- if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
- parse_descriptions = true
- end
-
- puts()
- puts( "hmmpfam outputfile: " + inpath )
- puts( "outputfile : " + outpath )
- if ( e_value_threshold >= 0.0 )
- puts( "E-value threshold : " + e_value_threshold.to_s )
- else
- puts( "E-value threshold : no threshold" )
- end
- if ( parse_descriptions )
- puts( "parse descriptions: true" )
- else
- puts( "parse descriptions: false" )
- end
- if ( ignore_dufs )
- puts( "ignore DUFs : true" )
- else
- puts( "ignore DUFs : false" )
- end
- if ( column_delimiter == "\t" )
- puts( "column delimiter : TAB" )
- else
- puts( "column delimiter : " + column_delimiter )
- end
- puts()
-
- begin
- queries_count = parse( inpath,
- outpath,
- column_delimiter,
- e_value_threshold,
- ignore_dufs,
- parse_descriptions )
- rescue ArgumentError, IOError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- domain_counts = get_domain_counts()
-
- puts
- puts( "read output for a total of " + queries_count.to_s + " query sequences" )
- puts
- puts( "domain counts (considering potential E-value threshold and ignoring of DUFs):" )
- puts( "(number of different domains: " + domain_counts.length.to_s + ")" )
- puts
- puts( Util.draw_histogram( domain_counts, "#" ) )
- puts
- Util.print_message( PRG_NAME, 'OK' )
- puts
-
- end # def run()
-
- def print_help()
- puts( "Usage:" )
- puts()
- puts( " " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
- puts()
- puts( " options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
- puts( " -" + E_VALUE_THRESHOLD_OPTION + ": E-value threshold, default is no threshold" )
- puts( " -" + PARSE_OUT_DESCRIPITION_OPTION + ": parse query description (in addition to query name)" )
- puts( " -" + IGNORE_DUF_OPTION + ": ignore DUFs" )
- puts()
- end
-
-
- private
-
-
- def HmmscanParser.is_ignorable?( line )
- return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
- end
-
- end # class HmmscanParser
-
-end # module Evoruby
\ No newline at end of file
+++ /dev/null
-#
-# = lib/evo/apps/msa_processor.rb - MsaProcessor class
-#
-# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
-#
-# $Id: msa_processor.rb,v 1.33 2010/12/13 19:00:10 cmzmasek Exp $
-#
-
-require 'date'
-require 'set'
-
-require 'lib/evo/util/constants'
-require 'lib/evo/util/util'
-require 'lib/evo/util/command_line_arguments'
-require 'lib/evo/msa/msa_factory'
-require 'lib/evo/io/msa_io'
-require 'lib/evo/io/writer/phylip_sequential_writer'
-require 'lib/evo/io/writer/nexus_writer'
-require 'lib/evo/io/writer/fasta_writer'
-require 'lib/evo/io/parser/fasta_parser'
-require 'lib/evo/io/parser/general_msa_parser'
-require 'lib/evo/io/writer/msa_writer'
-
-module Evoruby
-
- class MsaProcessor
-
- PRG_NAME = "msa_pro"
- PRG_DATE = "2012.05.11"
- PRG_DESC = "processing of multiple sequence alignments"
- PRG_VERSION = "1.06"
- COPYRIGHT = "2008-2010 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
-
-
- NAME_LENGTH_DEFAULT = 10
- WIDTH_DEFAULT_FASTA = 60
- INPUT_TYPE_OPTION = "i"
- OUTPUT_TYPE_OPTION = "o"
- MAXIMAL_NAME_LENGTH_OPTION = "n"
- WIDTH_OPTION = "w"
- CLEAN_UP_SEQ_OPTION = "c"
- REM_RED_OPTION = "rem_red"
- REMOVE_GAP_COLUMNS_OPTION = "rgc"
- REMOVE_GAP_ONLY_COLUMNS = "rgoc"
- REMOVE_COLUMNS_GAP_RATIO_OPTION = "rr"
- REMOVE_ALL_GAP_CHARACTERS_OPTION = "rg"
- REMOVE_ALL_SEQUENCES_LISTED_OPTION = "r"
- KEEP_ONLY_SEQUENCES_LISTED_OPTION = "k"
-
- KEEP_MATCHING_SEQUENCES_OPTION = "mk"
- REMOVE_MATCHING_SEQUENCES_OPTION = "mr"
-
- TRIM_OPTION = "t"
- REMOVE_SEQS_GAP_RATIO_OPTION = "rsgr"
- REMOVE_SEQS_NON_GAP_LENGTH_OPTION = "rsl"
- SPLIT = "split"
- LOG_SUFFIX = "_msa_pro.log"
- HELP_OPTION_1 = "help"
- HELP_OPTION_2 = "h"
-
-
- def initialize()
- @input_format_set = false
- @output_format_set = false
- @fasta_input = false
- @phylip_input = true
- @name_length = NAME_LENGTH_DEFAULT
- @name_length_set = false
- @width = WIDTH_DEFAULT_FASTA # fasta only
- @pi_output = true
- @fasta_output = false
- @nexus_output = false
- @clean = false # phylip only
- @rgc = false
- @rgoc = false
- @rg = false # fasta only
- @rem_red = false
- @rgr = -1
- @rsgr = -1
- @rsl = -1
- @remove_matching = nil
- @keep_matching = nil
-
- @seqs_name_file = nil
- @remove_seqs = false
- @keep_seqs = false
- @trim = false
- @split = -1
- @first = -1
- @last = -1
- end
-
-
- def run()
-
- Util.print_program_information( PRG_NAME,
- PRG_VERSION,
- PRG_DESC,
- PRG_DATE,
- COPYRIGHT,
- CONTACT,
- WWW,
- STDOUT )
-
- if ( ARGV == nil || ARGV.length < 1 )
- Util.print_message( PRG_NAME, "Illegal number of arguments" )
- print_help
- exit( -1 )
- end
-
- begin
- cla = CommandLineArguments.new( ARGV )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "Error: " + e.to_s, STDOUT )
- end
-
- if ( cla.is_option_set?( HELP_OPTION_1 ) ||
- cla.is_option_set?( HELP_OPTION_2 ) )
- print_help
- exit( 0 )
- end
-
- if ( cla.get_number_of_files != 2 || ARGV.length < 2 )
- Util.print_message( PRG_NAME, "Illegal number of arguments" )
- print_help
- exit( -1 )
- end
-
- allowed_opts = Array.new
- allowed_opts.push( INPUT_TYPE_OPTION )
- allowed_opts.push( OUTPUT_TYPE_OPTION )
- allowed_opts.push( MAXIMAL_NAME_LENGTH_OPTION )
- allowed_opts.push( WIDTH_OPTION )
- allowed_opts.push( CLEAN_UP_SEQ_OPTION )
- allowed_opts.push( REMOVE_GAP_COLUMNS_OPTION )
- allowed_opts.push( REMOVE_GAP_ONLY_COLUMNS )
- allowed_opts.push( REMOVE_COLUMNS_GAP_RATIO_OPTION )
- allowed_opts.push( REMOVE_ALL_GAP_CHARACTERS_OPTION )
- allowed_opts.push( REMOVE_ALL_SEQUENCES_LISTED_OPTION )
- allowed_opts.push( KEEP_ONLY_SEQUENCES_LISTED_OPTION )
- allowed_opts.push( TRIM_OPTION )
- allowed_opts.push( REMOVE_SEQS_GAP_RATIO_OPTION )
- allowed_opts.push( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
- allowed_opts.push( SPLIT )
- allowed_opts.push( REM_RED_OPTION )
- allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION )
- allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION )
-
- disallowed = cla.validate_allowed_options_as_str( allowed_opts )
- if ( disallowed.length > 0 )
- Util.fatal_error( PRG_NAME,
- "unknown option(s): " + disallowed )
- end
-
- input = cla.get_file_name( 0 )
- output = cla.get_file_name( 1 )
-
- analyze_command_line( cla )
-
- begin
- Util.check_file_for_readability( input )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
-
- begin
- Util.check_file_for_writability( output )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
-
- if ( @rg )
- set_pi_output( false )
- set_fasta_output( true )
- set_nexus_output( false )
- end
-
- if ( !@input_format_set )
- fasta_like = false
- begin
- fasta_like = Util.looks_like_fasta?( input )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
- @fasta_input = fasta_like
- @phylip_input = !fasta_like
- if ( !@output_format_set )
- @fasta_output = fasta_like
- @pi_output = !fasta_like
- @nexus_output = false
- end
- end
-
- ld = Constants::LINE_DELIMITER
- log = PRG_NAME + " " + PRG_VERSION + " [" + PRG_DATE + "]" + " LOG" + ld
- now = DateTime.now
- log << "Date/time: " + now.to_s + ld
-
- puts()
- puts( "Input alignment : " + input )
- log << "Input alignment : " + input + ld
- puts( "Output alignment : " + output )
- log << "Output alignment : " + output + ld
- if ( @phylip_input )
- puts( "Input is : Phylip, or something like it" )
- log << "Input is : Phylip, or something like it" + ld
- elsif ( @fasta_input )
- puts( "Input is : Fasta" )
- log << "Input is : Fasta" + ld
- end
- if( @rgr >= 0 )
- puts( "Max col gap ratio: " + @rgr.to_s )
- log << "Max col gap ratio: " + @rgr.to_s + ld
- elsif ( @rgc )
- puts( "Remove gap colums" )
- log << "Remove gap colums" + ld
- elsif( @rgoc )
- puts( "Remove gap only colums" )
- log << "Remove gap only colums" + ld
- end
- if ( @clean )
- puts( "Clean up : true" )
- log << "Clean up : true" + ld
- end
-
- if ( @pi_output )
- puts( "Output is : Phylip interleaved" )
- log << "Output is : Phylip interleaved" + ld
- elsif ( @fasta_output )
- puts( "Output is : Fasta" )
- log << "Output is : Fasta" + ld
- if ( @width )
- puts( "Width : " + @width.to_s )
- log << "Width : " + @width.to_s + ld
- end
- if ( @rg )
- puts( "Remove all gap characters (alignment is destroyed)" )
- log << "Remove all gap characters (alignment is destroyed)" + ld
- end
- elsif ( @nexus_output )
- puts( "Output is : Nexus" )
- log << "Output is : Nexus" + ld
- end
- if ( @name_length_set || !@fasta_output )
- puts( "Max name length : " + @name_length.to_s )
- log << "Max name length : " + @name_length.to_s + ld
- end
- if( @rsgr >= 0 )
- puts( "Remove sequences for which the gap ratio > " + @rsgr.to_s )
- log << "Remove sequences for which the gap ratio > " + @rsgr.to_s + ld
- end
- if( @rsl >= 0 )
- puts( "Remove sequences with less than " + @rsl.to_s + " non-gap characters" )
- log << "Remove sequences with less than " + @rsl.to_s + " non-gap characters" + ld
- end
- if ( @remove_seqs )
- puts( "Remove sequences listed in: " + @seqs_name_file )
- log << "Remove sequences listed in: " + @seqs_name_file + ld
- elsif ( @keep_seqs )
- puts( "Keep only sequences listed in: " + @seqs_name_file )
- log << "Keep only sequences listed in: " + @seqs_name_file + ld
- end
- if ( @trim )
- puts( "Keep only columns from: "+ @first.to_s + " to " + @last.to_s )
- log << "Keep only columns from: "+ @first.to_s + " to " + @last.to_s + ld
- end
- if ( @rem_red )
- puts( "Remove redundant sequences: true" )
- log << "Remove redundant sequences: true" + ld
- end
- if ( @split > 0 )
- puts( "Split : " + @split.to_s )
- log << "Split : " + @split.to_s + ld
- end
- puts()
-
- f = MsaFactory.new()
-
- msa = nil
-
- begin
- if ( @phylip_input )
- msa = f.create_msa_from_file( input, GeneralMsaParser.new() )
- elsif ( @fasta_input )
- msa = f.create_msa_from_file( input, FastaParser.new() )
- end
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
-
- if ( msa.is_aligned() )
- Util.print_message( PRG_NAME, "Length of original alignment : " + msa.get_length.to_s )
- log << "Length of original alignment : " + msa.get_length.to_s + ld
- gp = msa.calculate_gap_proportion
- Util.print_message( PRG_NAME, "Gap-proportion of original alignment : " + gp.to_s )
- log << "Gap-proportion of original alignment : " + gp.to_s + ld
- else
- Util.print_message( PRG_NAME, "the input is not aligned" )
- log << "The input is not aligned" + ld
- end
-
- all_names = Set.new()
- for i in 0 ... msa.get_number_of_seqs()
- current_name = msa.get_sequence( i ).get_name
- if all_names.include?( current_name )
- Util.print_warning_message( PRG_NAME, "sequence name [" + current_name + "] is not unique" )
- else
- all_names.add( current_name )
- end
- end
-
- begin
-
- if ( @remove_seqs || @keep_seqs )
- names = Util.file2array( @seqs_name_file, true )
- if ( names == nil || names.length() < 1 )
- error_msg = "file \"" + @seqs_name_file.to_s + "\" appears empty"
- Util.fatal_error( PRG_NAME, error_msg )
- end
-
- if ( @remove_seqs )
- c = 0
- for i in 0 ... names.length()
- to_delete = msa.find_by_name( names[ i ], true, false )
- if ( to_delete.length() < 1 )
- error_msg = "sequence name \"" + names[ i ] + "\" not found"
- Util.fatal_error( PRG_NAME, error_msg )
- elsif ( to_delete.length() > 1 )
- error_msg = "sequence name \"" + names[ i ] + "\" is not unique"
- Util.fatal_error( PRG_NAME, error_msg )
- else
- msa.remove_sequence!( to_delete[ 0 ] )
- c += 1
- end
- end
- Util.print_message( PRG_NAME, "Removed " + c.to_s + " sequences" )
- log << "Removed " + c.to_s + " sequences" + ld
- elsif ( @keep_seqs )
- msa_new = Msa.new()
- r = 0
- k = 0
- for j in 0 ... msa.get_number_of_seqs()
- if ( names.include?( msa.get_sequence( j ).get_name() ) )
- msa_new.add_sequence( msa.get_sequence( j ) )
- k += 1
- else
- r += 1
- end
- end
- msa = msa_new
- Util.print_message( PRG_NAME, "Kept " + k.to_s + " sequences" )
- log << "Kept " + k.to_s + " sequences" + ld
- Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" )
- log << "removed " + r.to_s + " sequences" + ld
- end
- end
-
- if ( @trim )
- msa.trim!( @first, @last )
- end
- if( @rgr >= 0 )
- msa.remove_gap_columns_w_gap_ratio!( @rgr )
- elsif ( @rgc )
- msa.remove_gap_columns!()
- elsif( @rgoc )
- msa.remove_gap_only_columns!()
- end
- if( @rsgr >= 0 )
- n = msa.get_number_of_seqs()
- removed = msa.remove_sequences_by_gap_ratio!( @rsgr )
- k = msa.get_number_of_seqs()
- r = n - k
- Util.print_message( PRG_NAME, "Kept " + k.to_s + " sequences" )
- log << "Kept " + k.to_s + " sequences" + ld
- Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" )
- log << "Removed " + r.to_s + " sequences:" + ld
- removed.each { | seq_name |
- log << " " + seq_name + ld
- }
- end
- if( @rsl >= 0 )
- n = msa.get_number_of_seqs()
- removed = msa.remove_sequences_by_non_gap_length!( @rsl )
- k = msa.get_number_of_seqs()
- r = n - k
- Util.print_message( PRG_NAME, "Kept " + k.to_s + " sequences" )
- log << "Kept " + k.to_s + " sequences" + ld
- Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" )
- log << "Removed " + r.to_s + " sequences:" + ld
- removed.each { | seq_name |
- log << " " + seq_name + ld
- }
- end
- if ( @keep_matching )
- n = msa.get_number_of_seqs
- to_be_removed = Set.new
- for ii in 0 ... n
- seq = msa.get_sequence( ii )
- if !seq.get_name.downcase.index( @keep_matching.downcase )
- to_be_removed.add( ii )
- end
- end
- to_be_removed_ary = to_be_removed.to_a.sort.reverse
- to_be_removed_ary.each { | index |
- msa.remove_sequence!( index )
- }
- # msa = sort( msa )
- end
- if ( @remove_matching )
- n = msa.get_number_of_seqs
- to_be_removed = Set.new
- for iii in 0 ... n
-
- seq = msa.get_sequence( iii )
-
- if seq.get_name.downcase.index( @remove_matching.downcase )
- to_be_removed.add( iii )
- end
- end
- to_be_removed_ary = to_be_removed.to_a.sort.reverse
- to_be_removed_ary.each { | index |
- msa.remove_sequence!( index )
- }
- msa = sort( msa )
- end
-
-
-
- if ( @split > 0 )
- begin
- msas = msa.split( @split, true )
- io = MsaIO.new()
- w = MsaWriter
- if ( @pi_output )
- w = PhylipSequentialWriter.new()
- w.clean( @clean )
- w.set_max_name_length( @name_length )
- elsif( @fasta_output )
- w = FastaWriter.new()
- w.set_line_width( @width )
- if ( @rg )
- w.remove_gap_chars( true )
- Util.print_warning_message( PRG_NAME, "removing gap character, the output is likely to become unaligned" )
- log << "removing gap character, the output is likely to become unaligned" + ld
- end
- w.clean( @clean )
- if ( @name_length_set )
- w.set_max_name_length( @name_length )
- end
- elsif( @nexus_output )
- w = NexusWriter.new()
- w.clean( @clean )
- w.set_max_name_length( @name_length )
- end
- i = 0
- for m in msas
- i = i + 1
- io.write_to_file( m, output + "_" + i.to_s, w )
- end
- Util.print_message( PRG_NAME, "wrote " + msas.length.to_s + " files" )
- log << "wrote " + msas.length.to_s + " files" + ld
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
-
- end
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
-
- if ( @split <= 0 )
-
- unless ( @rg )
- if ( msa.is_aligned() )
- Util.print_message( PRG_NAME, "Length of processed alignment : " + msa.get_length.to_s )
- log << "Length of processed alignment : " + msa.get_length.to_s + ld
- gp = msa.calculate_gap_proportion
- Util.print_message( PRG_NAME, "Gap-proportion of processed alignment: " + gp.to_s )
- log << "Gap-proportion of processed alignment: " + gp.to_s + ld
- else
- Util.print_warning_message( PRG_NAME, "output is not aligned" )
- log << "output is not aligned" + ld
- end
- end
-
- if @rem_red
- removed = msa.remove_redundant_sequences!( true, false )
- if removed.size > 0
- identicals = msa.get_identical_seqs_detected
- log << "the following " + identicals.size.to_s + " sequences are identical:" + ld
- identicals.each { | s |
- log << s + ld
- }
- log << "ignoring the following " + removed.size.to_s + " redundant sequences:" + ld
- removed.each { | seq_name |
- log << seq_name + ld
- }
- Util.print_message( PRG_NAME, "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" )
- log << "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" + ld
- end
- end
-
- io = MsaIO.new()
-
- w = MsaWriter
-
- if ( @pi_output )
- w = PhylipSequentialWriter.new()
- w.clean( @clean )
- w.set_max_name_length( @name_length )
- elsif( @fasta_output )
- w = FastaWriter.new()
- w.set_line_width( @width )
- if ( @rg )
- w.remove_gap_chars( true )
- Util.print_warning_message( PRG_NAME, "removing gap characters, the output is likely to become unaligned" )
- log << "removing gap character, the output is likely to become unaligned" + ld
- end
- w.clean( @clean )
- if ( @name_length_set )
- w.set_max_name_length( @name_length )
- end
- elsif( @nexus_output )
- w = NexusWriter.new()
- w.clean( @clean )
- w.set_max_name_length( @name_length )
- end
-
-
- begin
- io.write_to_file( msa, output, w )
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
-
- begin
- f = File.open( output + LOG_SUFFIX, 'a' )
- f.print( log )
- f.close
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
-
-
- end
- Util.print_message( PRG_NAME, "OK" )
- puts
- end
-
-
- private
-
- def sort( msa )
- names = Set.new
- for i in 0 ... msa.get_number_of_seqs
- name = msa.get_sequence( i ).get_name
- names.add( name )
- end
- sorted_ary = names.to_a.sort
- new_msa = Msa.new
- sorted_ary.each { | seq_name |
- seq = msa.get_sequence( msa.find_by_name( seq_name, true, false )[ 0 ] )
- new_msa.add_sequence( seq )
- }
- new_msa
- end
-
- def set_fasta_input( fi = true )
- @fasta_input = fi
- @input_format_set = true
- end
- def set_phylip_input( pi = true )
- @phylip_input = pi
- @input_format_set = true
- end
- def set_name_length( i )
- @name_length = i
- @name_length_set = true
- end
- def set_width( i )
- @width = i
- end
- def set_fasta_output( fo = true )
- @fasta_output = fo
- @output_format_set = true
- end
- def set_pi_output( pso = true )
- @pi_output = pso
- @output_format_set = true
- end
- def set_nexus_output( nexus = true )
- @nexus_output = nexus
- @output_format_set = true
- end
- def set_clean( c = true )
- @clean = c
- end
- def set_remove_gap_columns( rgc = true )
- @rgc = rgc
- end
- def set_remove_gap_only_columns( rgoc = true )
- @rgoc = rgoc
- end
- def set_remove_gaps( rg = true )
- @rg = rg
- end
- def set_remove_gap_ratio( rgr )
- @rgr = rgr
- end
- def set_remove_seqs_gap_ratio( rsgr )
- @rsgr = rsgr
- end
- def set_remove_seqs_min_non_gap_length( rsl )
- @rsl = rsl
- end
- def set_remove_seqs( file )
- @seqs_name_file = file
- @remove_seqs = true
- @keep_seqs = false
- end
- def set_keep_seqs( file )
- @seqs_name_file = file
- @keep_seqs = true
- @remove_seqs = false
- end
- def set_trim( first, last )
- @trim = true
- @first = first
- @last = last
- end
- def set_remove_matching( remove )
- @remove_matching = remove
- end
- def set_keep_matching( keep )
- @keep_matching = keep
- end
- def set_rem_red( rr )
- @rem_red = rr
- end
-
-
-
- def set_split( s )
- if ( s > 0 )
- @split = s
- @clean = false # phylip only
- @rgc = false
- @rgoc = false
- @rg = false # fasta only
- @rgr = -1
- @rsgr = -1
- @rsl = -1
- @seqs_name_file = nil
- @remove_seqs = false
- @keep_seqs = false
- @trim = false
- @first = -1
- @last = -1
- end
- end
- def analyze_command_line( cla )
- if ( cla.is_option_set?( INPUT_TYPE_OPTION ) )
- begin
- type = cla.get_option_value( INPUT_TYPE_OPTION )
- if ( type == "p" )
- set_phylip_input( true )
- set_fasta_input( false )
- elsif ( type == "f" )
- set_fasta_input( true )
- set_phylip_input( false )
- end
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- end
- if ( cla.is_option_set?( OUTPUT_TYPE_OPTION ) )
- begin
- type = cla.get_option_value( OUTPUT_TYPE_OPTION )
- if ( type == "p" )
- set_pi_output( true )
- set_fasta_output( false )
- set_nexus_output( false )
- elsif ( type == "f" )
- set_pi_output( false )
- set_fasta_output( true )
- set_nexus_output( false )
- elsif ( type == "n" )
- set_pi_output( false )
- set_fasta_output( false )
- set_nexus_output( true )
- end
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- end
- if ( cla.is_option_set?( MAXIMAL_NAME_LENGTH_OPTION ) )
- begin
- l = cla.get_option_value_as_int( MAXIMAL_NAME_LENGTH_OPTION )
- set_name_length( l )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- end
- if ( cla.is_option_set?( WIDTH_OPTION ) )
- begin
- w = cla.get_option_value_as_int( WIDTH_OPTION )
- set_width( w )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- end
- if ( cla.is_option_set?( CLEAN_UP_SEQ_OPTION ) )
- set_clean( true )
- end
- if ( cla.is_option_set?( REMOVE_GAP_COLUMNS_OPTION ) )
- set_remove_gap_columns( true )
- end
- if ( cla.is_option_set?( REM_RED_OPTION ) )
- set_rem_red( true )
- end
- if ( cla.is_option_set?( REMOVE_GAP_ONLY_COLUMNS ) )
- set_remove_gap_only_columns( true )
- end
- if ( cla.is_option_set?( REMOVE_ALL_GAP_CHARACTERS_OPTION ) )
- set_remove_gaps( true )
- end
- if ( cla.is_option_set?( REMOVE_COLUMNS_GAP_RATIO_OPTION ) )
- begin
- f = cla.get_option_value_as_float( REMOVE_COLUMNS_GAP_RATIO_OPTION )
- set_remove_gap_ratio( f )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- end
- if ( cla.is_option_set?( REMOVE_ALL_SEQUENCES_LISTED_OPTION ) )
- begin
- s = cla.get_option_value( REMOVE_ALL_SEQUENCES_LISTED_OPTION )
- set_remove_seqs( s )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- end
- if ( cla.is_option_set?( KEEP_ONLY_SEQUENCES_LISTED_OPTION ) )
- begin
- s = cla.get_option_value( KEEP_ONLY_SEQUENCES_LISTED_OPTION )
- set_keep_seqs( s )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- end
- if ( cla.is_option_set?( TRIM_OPTION ) )
- begin
- s = cla.get_option_value( TRIM_OPTION )
- if ( s =~ /(\d+)-(\d+)/ )
- set_trim( $1.to_i(), $2.to_i() )
- else
- puts( "illegal argument" )
- print_help
- exit( -1 )
- end
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- end
- if ( cla.is_option_set?( REMOVE_SEQS_GAP_RATIO_OPTION ) )
- begin
- f = cla.get_option_value_as_float( REMOVE_SEQS_GAP_RATIO_OPTION )
- set_remove_seqs_gap_ratio( f )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- end
- if ( cla.is_option_set?( REMOVE_SEQS_NON_GAP_LENGTH_OPTION ) )
- begin
- f = cla.get_option_value_as_int( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
- set_remove_seqs_min_non_gap_length( f )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- end
- if ( cla.is_option_set?( SPLIT ) )
- begin
- s = cla.get_option_value_as_int( SPLIT )
- set_split( s )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
-
- end
- if ( cla.is_option_set?( REMOVE_MATCHING_SEQUENCES_OPTION ) )
- begin
- s = cla.get_option_value( REMOVE_MATCHING_SEQUENCES_OPTION )
- set_remove_matching( s )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- end
- if ( cla.is_option_set?( KEEP_MATCHING_SEQUENCES_OPTION ) )
- begin
- s = cla.get_option_value( KEEP_MATCHING_SEQUENCES_OPTION )
- set_keep_matching( s )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
- end
-
-
- end
-
- def print_help()
- puts()
- puts( "Usage:" )
- puts()
- puts( " " + PRG_NAME + ".rb [options] <input alignment> <output>" )
- puts()
- puts( " options: -" + INPUT_TYPE_OPTION + "=<input type>: f for fasta, p for phylip selex type" )
- puts( " -" + OUTPUT_TYPE_OPTION + "=<output type>: f for fasta, n for nexus, p for phylip sequential (default)" )
- puts( " -" + MAXIMAL_NAME_LENGTH_OPTION + "=<n>: n=maximal name length (default for phylip 10, for fasta: unlimited )" )
- puts( " -" + WIDTH_OPTION + "=<n>: n=width (fasta output only, default is 60)" )
- puts( " -" + CLEAN_UP_SEQ_OPTION + ": clean up sequences" )
- puts( " -" + REMOVE_GAP_COLUMNS_OPTION + ": remove gap columns" )
- puts( " -" + REMOVE_GAP_ONLY_COLUMNS + ": remove gap-only columns" )
- puts( " -" + REMOVE_COLUMNS_GAP_RATIO_OPTION + "=<n>: remove columns for which ( seqs with gap / number of sequences > n )" )
- puts( " -" + REMOVE_ALL_GAP_CHARACTERS_OPTION + ": remove all gap characters (destroys alignment, fasta output only)" )
- puts( " -" + REMOVE_ALL_SEQUENCES_LISTED_OPTION + "=<file>: remove all sequences listed in file" )
- puts( " -" + KEEP_ONLY_SEQUENCES_LISTED_OPTION + "=<file>: keep only sequences listed in file" )
- puts( " -" + TRIM_OPTION + "=<first>-<last>: remove columns before first and after last" )
- puts( " -" + REMOVE_SEQS_GAP_RATIO_OPTION + "=<n>: remove sequences for which the gap ratio > n (after column operations)" )
- puts( " -" + REMOVE_SEQS_NON_GAP_LENGTH_OPTION + "=<n> remove sequences with less than n non-gap characters (after column operations)" )
- puts( " -" + REMOVE_MATCHING_SEQUENCES_OPTION + "=<s> remove all sequences with names containing s" )
- puts( " -" + KEEP_MATCHING_SEQUENCES_OPTION + "=<s> keep only sequences with names containing s" )
- puts( " -" + SPLIT + "=<n> split a fasta file into n files of equal number of sequences (expect for " )
- puts( " last one), cannot be used with other options" )
- puts( " -" + REM_RED_OPTION + ": remove redundant sequences" )
- puts()
- end
-
-
-
-
-
- end # class MsaProcessor
-
-
-end # module Evoruby
+++ /dev/null
-#
-# = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class
-#
-# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
-#
-# $Id: multi_sequence_extractor.rb,v 1.10 2010/12/13 19:00:11 cmzmasek Exp $
-
-
-require 'lib/evo/util/constants'
-require 'lib/evo/util/util'
-require 'lib/evo/msa/msa'
-require 'lib/evo/msa/msa_factory'
-require 'lib/evo/io/msa_io'
-require 'lib/evo/io/parser/fasta_parser'
-require 'lib/evo/io/writer/fasta_writer'
-require 'lib/evo/util/command_line_arguments'
-
-
-
-module Evoruby
-
- class MultiSequenceExtractor
-
- PRG_NAME = "mse"
- PRG_VERSION = "1.02"
- PRG_DESC = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
- PRG_DATE = "2012.07.20"
- COPYRIGHT = "2008-2012 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
- HELP_OPTION_1 = 'help'
- HELP_OPTION_2 = 'h'
-
- EXT_OPTION = 'e'
- EXTRACT_LINKERS_OPTION = 'l'
- LOG_SUFFIX = ".mse_log"
- LINKERS_SUFFIX = ".linkers"
- FASTA_SUFFIX = ".fasta"
- FASTA_WITH_NORMALIZED_IDS_SUFFIX = ".ni.fasta"
- NORMALIZED_IDS_MAP_SUFFIX = ".nim"
- PROTEINS_LIST_FILE_SEPARATOR = "\t"
-
-
- def run()
-
- Util.print_program_information( PRG_NAME,
- PRG_VERSION,
- PRG_DESC ,
- PRG_DATE,
- COPYRIGHT,
- CONTACT,
- WWW,
- STDOUT )
-
- ld = Constants::LINE_DELIMITER
-
- begin
- cla = CommandLineArguments.new( ARGV )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
-
- if ( cla.is_option_set?( HELP_OPTION_1 ) ||
- cla.is_option_set?( HELP_OPTION_2 ) )
- print_help
- exit( 0 )
- end
-
- if ( cla.get_number_of_files != 4 && cla.get_number_of_files != 5 )
- print_help
- exit( -1 )
- end
-
- allowed_opts = Array.new
- allowed_opts.push(EXT_OPTION)
- allowed_opts.push(EXTRACT_LINKERS_OPTION)
-
- disallowed = cla.validate_allowed_options_as_str( allowed_opts )
- if ( disallowed.length > 0 )
- Util.fatal_error( PRG_NAME,
- "unknown option(s): " + disallowed,
- STDOUT )
- end
-
- seq_names_files_suffix = cla.get_file_name( 0 )
- input_dir = cla.get_file_name( 1 )
- out_dir = cla.get_file_name( 2 )
- out_dir_doms = cla.get_file_name( 3 )
- mapping_file = nil
-
- if ( cla.get_number_of_files == 5 )
- mapping_file = cla.get_file_name( 4 )
- begin
- Util.check_file_for_readability( mapping_file )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
- end
-
- extension = 0
- if cla.is_option_set?(EXT_OPTION)
- extension = cla.get_option_value_as_int(EXT_OPTION)
- if extension < 0
- extension = 0
- end
- end
-
- extract_linkers = false
- if cla.is_option_set?(EXTRACT_LINKERS_OPTION)
- extract_linkers = true
- end
-
- if !File.exist?( input_dir )
- Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
- end
- if !File.exist?( out_dir )
- Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" )
- end
- if !File.exist?( out_dir_doms )
- Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" )
- end
- if !File.directory?( input_dir )
- Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" )
- end
- if !File.directory?( out_dir )
- Util.fatal_error( PRG_NAME, "error: [#{out_dir}] is not a directory" )
- end
- if !File.directory?( out_dir_doms )
- Util.fatal_error( PRG_NAME, "error: [#{out_dir_doms}] is not a directory" )
- end
-
-
- log = String.new
-
- log << "Program : " + PRG_NAME + ld
- log << "Version : " + PRG_VERSION + ld
- log << "Program date : " + PRG_DATE + ld
-
- puts()
- puts( "Sequence names files suffix: " + seq_names_files_suffix )
- log << "Sequence names files suffix: " + seq_names_files_suffix + ld
- puts( "Input dir : " + input_dir )
- log << "Input dir : " + input_dir + ld
- puts( "Output dir : " + out_dir )
- log << "Output dir : " + out_dir + ld
- puts( "Output dir domains : " + out_dir_doms )
- log << "Output dir domains : " + out_dir_doms + ld
- if ( mapping_file != nil )
- puts( "Mapping file : " + mapping_file )
- log << "Mapping file : " + mapping_file + ld
- end
- if extension > 0
- puts( "Extension : " + extension.to_s )
- log << "Extension : " + extension.to_s + ld
- end
- if extract_linkers
- puts( "Extract linkers : true" )
- log << "Extract linkers : true" + ld
- end
- log << "Date : " + Time.now.to_s + ld
- puts
-
- if ( mapping_file != nil )
- species_codes_to_paths = extract_mappings( mapping_file )
- end
-
- input_files = obtain_inputfiles( input_dir, seq_names_files_suffix )
-
- counter = 0
-
- input_files.each { |input_file|
- counter += 1
- puts
- puts
- puts counter.to_s + "/" + input_files.size.to_s
- read_seq_family_file( input_file,
- seq_names_files_suffix,
- input_dir,
- species_codes_to_paths,
- log,
- out_dir,
- out_dir_doms,
- mapping_file,
- extension,
- extract_linkers )
- }
- puts
- Util.print_message( PRG_NAME, "OK" )
- puts
-
- end
-
-
- def read_seq_family_file( input_file,
- seq_names_files_suffix,
- input_dir,
- species_codes_to_paths,
- log,
- out_dir,
- out_dir_doms,
- mapping_file,
- extension,
- extract_linkers )
-
- begin
- Util.check_file_for_readability( input_file )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
- basename = File.basename( input_file, seq_names_files_suffix )
- out_file_path_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_SUFFIX
- out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
- out_file_path_ids_map = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX
- doms_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains" + FASTA_SUFFIX
- doms_ext_out_file_path_fasta_file = nil
- if extension > 0
- doms_ext_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains_ext_" + extension.to_s + FASTA_SUFFIX
- end
- begin
- Util.check_file_for_writability( out_file_path_fasta_file )
- Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
- Util.check_file_for_writability( out_file_path_ids_map )
- Util.check_file_for_writability( doms_out_file_path_fasta_file )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
-
- ids_map_writer = nil
- begin
- ids_map_writer = File.open( out_file_path_ids_map, 'a' )
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
-
- current_species = ""
- current_msa = nil
- new_msa = Msa.new
- new_msa_normalized_ids = Msa.new
- new_msa_domains = Msa.new
- new_msa_domains_extended = Msa.new
- per_species_counter = 0
- linkers = ""
-
- puts basename
-
- File.open( input_file ) do | file |
- while line = file.gets
- line.strip!
- if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
- values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
- mod_line = nil
- if ( values.length < 2 )
- Util.fatal_error( PRG_NAME, "unexpected format: " + line )
- end
- species = values[ 0 ]
- if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA" || species == "EIMTE"
- next
- end
- seq_name = values[ 1 ]
- domain_ranges = nil
- if ( values.length > 3 )
- domain_ranges_block = values[ 3 ]
- domain_ranges = domain_ranges_block.split( "/" )
- end
- if ( species != current_species )
- current_species = species
- my_file = input_dir + Constants::FILE_SEPARATOR + current_species
-
- if ( !File.exist?( my_file ) )
- if species_codes_to_paths == nil
- Util.fatal_error( PRG_NAME, "error: [#{my_file}] not found and no mapping file provided" )
- elsif ( !species_codes_to_paths.has_key?( current_species ) )
- Util.fatal_error( PRG_NAME, "error: species [#{current_species}] not found in mapping file [#{mapping_file}]" )
- end
- my_file = species_codes_to_paths[ current_species ]
- end
- my_path = File.expand_path( my_file )
- my_readlink = my_path
- if ( File.symlink?( my_path ) )
- my_readlink = File.readlink( my_path )
- end
- current_msa = read_fasta_file( my_file )
-
- if ( per_species_counter > 0 )
- print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
- per_species_counter = 0
- end
- puts " " + current_species + " [" + my_readlink + "]"
- log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER
- end
- puts " " + seq_name
- log << " " + seq_name + Constants::LINE_DELIMITER
- per_species_counter = per_species_counter + 1
- seq = nil
-
- if current_msa.find_by_name_start( seq_name, true ).size > 0
- begin
- seq = current_msa.get_by_name_start( seq_name, true ).copy
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
- else
- # Not found, try finding by partial match.
- begin
- seq = current_msa.get_by_name( seq_name, true, true )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
- end
-
- normalized_id = per_species_counter.to_s( 16 ).upcase +
- "_" + current_species
-
- per_species_counter.to_i
-
- ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER )
-
- orig_name = nil
- if seq != nil
- orig_name = seq.get_name
- seq.set_name( seq.get_name + " [" + current_species + "]" )
- new_msa.add_sequence( seq )
- else
- Util.fatal_error( PRG_NAME, "unexected error: seq is nil" )
- end
-
- if domain_ranges != nil
- first = true
- prev_to = -1
-
- domain_ranges.each { |range|
- if range != nil && range.length > 0
- s = range.split("-")
- from = s[ 0 ].to_i
- to = s[ 1 ].to_i
- new_msa_domains.add_sequence( Sequence.new( orig_name +
- " [" + from.to_s +
- "-" + to.to_s +
- "] [" + basename + "] [" +
- current_species + "]",
- seq.get_sequence_as_string[from..to] ) )
- if extension > 0
- from_e = from - extension
- if from_e < 0
- from_e = 0
- end
- to_e = to + extension
- if to_e > seq.get_sequence_as_string.length - 1
- to_e = seq.get_sequence_as_string.length - 1
- end
- new_msa_domains_extended.add_sequence( Sequence.new( orig_name +
- " [" + from.to_s +
- "-" + to.to_s +
- "] [extended by " +
- extension.to_s +
- "] [" + basename + "] [" +
- current_species + "]",
- seq.get_sequence_as_string[ from_e..to_e ] ) )
- end # extension > 0
- if extract_linkers
- if first
- first = false
- f = 0
- t = from - 1
- if extension > 0
- f = from - extension
- end
- mod_line = line + "\t[" + get_linker_sequence( f, t, seq ) + "|"
- else
- mod_line += get_linker_sequence( prev_to + 1, from - 1, seq ) + "|"
- end
- prev_to = to
- end
- end # range != nil && range.length > 0
- }
- if extract_linkers && prev_to > 0
- f = prev_to + 1
- t = seq.get_sequence_as_string.length - 1
- if extension > 0
- t = prev_to + extension
- end
- mod_line += get_linker_sequence( f, t, seq ) + "]"
- end
- end
-
- new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
- if mod_line
- linkers << mod_line + Constants::LINE_DELIMITER
- end
- end # !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
- end # while line = file.gets
-
- end
-
- ids_map_writer.close
-
- if ( per_species_counter > 0 )
- print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
- end
-
- io = MsaIO.new()
-
- fasta_writer = FastaWriter.new()
- fasta_writer.remove_gap_chars
- fasta_writer.clean
-
- begin
- io.write_to_file( new_msa, out_file_path_fasta_file, fasta_writer )
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
-
- if new_msa_domains != nil
- begin
- io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer )
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
- end
-
- if extension > 0 && new_msa_domains_extended != nil
- begin
- io.write_to_file( new_msa_domains_extended, doms_ext_out_file_path_fasta_file, fasta_writer )
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
- end
-
- begin
- io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
-
- begin
- f = File.open( out_dir + Constants::FILE_SEPARATOR + basename + LOG_SUFFIX , 'a' )
- f.print( log )
- f.close
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
-
- if extract_linkers && linkers != nil
- begin
- f = File.open( out_dir + Constants::FILE_SEPARATOR + basename + LINKERS_SUFFIX , 'a' )
- f.print( linkers )
- f.close
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
- end
-
-
- end
-
-
- def get_linker_sequence( from, to, seq )
- if from < 0
- from = 0
- end
- if to > seq.get_sequence_as_string.length - 1
- to = seq.get_sequence_as_string.length - 1
- end
- if from > to
- return ""
- else
- return from.to_s + "-" + to.to_s + ":" + seq.get_sequence_as_string[ from..to ]
- end
- end
-
- def obtain_inputfiles( input_dir, seq_names_files_suffix )
- input_files = Array.new()
- Dir.foreach( input_dir ) { |file_name|
- if file_name.index( seq_names_files_suffix ) == ( file_name.size - seq_names_files_suffix.size )
- input_files.push( input_dir + Constants::FILE_SEPARATOR + file_name )
- end
- }
- input_files
- end
-
- def extract_mappings( mapping_file )
- species_code_to_path = Hash.new()
- File.open( mapping_file ) do | file |
- while line = file.gets
- if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
- if ( line =~ /(\S+)\s+(\S+)/ )
- species = $1
- path = $2
- if ( species_code_to_path.has_key?( species ) )
- Util.fatal_error( PRG_NAME, "error: species code [#{species}] is not unique" )
- end
- if ( species_code_to_path.has_value?( path ) )
- Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" )
- end
- if ( !File.exist?( path ) )
- Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" )
- end
- if ( !File.file?( path ) )
- Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" )
- end
- if ( !File.readable?( path ) )
- Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" )
- end
- if ( File.size( path ) < 10000 )
- Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" )
- end
- if ( !Util.looks_like_fasta?( path ) )
- Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" )
- end
- species_code_to_path[ species ] = path
- puts species + " -> " + path
- end
- end
- end
- end
- species_code_to_path
- end
-
- def print_counts( per_species_counter, log, ld )
- puts " [sum: " + per_species_counter.to_s + "]"
- log << " [sum: " + per_species_counter.to_s + "]" + ld
- end
-
- def read_fasta_file( input )
- f = MsaFactory.new()
- msa = nil
- begin
- msa = f.create_msa_from_file( input, FastaParser.new() )
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
- msa
- end
-
- def print_help()
- puts( "Usage:" )
- puts()
- puts( " " + PRG_NAME + ".rb <sequence names file suffix> <input dir containing sequence names files " +
- "and possibly genome multiple-sequence ('fasta') files> <output directory for sequences> <output directory for domains> [mapping file for " +
- "genome multiple-sequence ('fasta') files not in input dir]" )
- puts()
- puts( " option: -" + EXT_OPTION + "=<int>: to extend extracted domains" )
- puts( " -" + EXTRACT_LINKERS_OPTION + " : to extract linkers" )
- puts()
- puts( " " + "Example: \"mse.rb .prot . protein_seqs domain_seqs ../genome_locations.txt\"" )
- puts()
- end
-
- end # class MultiSequenceExtractor
-end
\ No newline at end of file
+++ /dev/null
-#
-# = lib/evo/apps/ - class
-#
-# Copyright:: Copyright (C) 2009 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
-#
-# $Id: new_tap.rb,v 1.4 2010/12/13 19:00:11 cmzmasek Exp $
-
-
-require 'lib/evo/util/util'
-require 'lib/evo/msa/msa_factory'
-require 'lib/evo/msa/msa'
-require 'lib/evo/io/msa_io'
-require 'lib/evo/io/parser/fasta_parser'
-require 'lib/evo/io/parser/general_msa_parser'
-require 'lib/evo/io/writer/fasta_writer'
-require 'lib/evo/io/writer/phylip_sequential_writer'
-require 'lib/evo/util/command_line_arguments'
-
-module Evoruby
-
- class TaxonomyProcessor
-
- PRG_NAME = ""
- PRG_DATE = "2009.10.09"
- PRG_DESC = "replacement of labels in multiple sequence files"
- PRG_VERSION = "1.00"
- COPYRIGHT = "2009 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
-
- REMOVE_REDUNDANT_SEQS_OPTION = "rr"
-
- def initialize()
- @taxonomies = Hash.new()
- end
-
- def run()
-
- Util.print_program_information( PRG_NAME,
- PRG_VERSION,
- PRG_DESC,
- PRG_DATE,
- COPYRIGHT,
- CONTACT,
- WWW,
- STDOUT )
-
- if ( ARGV == nil || ( ARGV.length != 3 && ARGV.length != 4 ) )
- puts( "Usage: #{PRG_NAME}.rb <input sequences> <output sequences> <output map>" )
- puts()
- puts( " options: -" + REMOVE_REDUNDANT_SEQS_OPTION + ": to remove redundant sequences" )
- puts()
- exit( -1 )
- end
-
- begin
- cla = CommandLineArguments.new( ARGV )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
-
- input = cla.get_file_name( 0 )
- output = cla.get_file_name( 1 )
- map_file = cla.get_file_name( 2 )
-
- allowed_opts = Array.new
- allowed_opts.push( REMOVE_REDUNDANT_SEQS_OPTION )
-
- disallowed = cla.validate_allowed_options_as_str( allowed_opts )
- if ( disallowed.length > 0 )
- Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
- end
-
-
- remove_redudant = false
- if ( cla.is_option_set?( REMOVE_REDUNDANT_SEQS_OPTION ) )
- remove_redudant = true
- end
-
- if ( File.exists?( output ) )
- Util.fatal_error( PRG_NAME, "outfile [" + output + "] already exists" )
- end
- if ( File.exists?( map_file ) )
- Util.fatal_error( PRG_NAME, "map file [" + map_file + "] already exists" )
- end
- if ( !File.exists?( input) )
- Util.fatal_error( PRG_NAME, "infile [" + input + "] does not exist" )
- end
-
- fasta_like = Util.looks_like_fasta?( input )
-
- puts()
- puts( "Input alignment : " + input )
- puts( "Output alignment: " + output )
- puts( "Output map : " + map_file )
- if ( fasta_like )
- puts( "Format : Fasta" )
- else
- puts( "Format : Phylip like" )
- end
- puts()
-
- species_map = Hash.new
-
- f = MsaFactory.new()
- begin
- if ( fasta_like )
- msa = f.create_msa_from_file( input, FastaParser.new() )
- else
- msa = f.create_msa_from_file( input, GeneralMsaParser.new() )
- end
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "failed to read file: " + e.to_s )
- end
-
- if ( msa == nil || msa.get_number_of_seqs() < 1 )
- Util.fatal_error( PRG_NAME, "failed to read MSA" )
- end
- begin
- Util.check_file_for_writability( map_file )
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_, STDOUT )
- end
-
- if ( remove_redudant )
- removed = msa.remove_redundant_sequences!( true )
- if removed.size > 0
- Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
- removed.each { | seq_name |
- puts seq_name
- }
- Util.print_message( PRG_NAME, "will process " + msa.get_number_of_seqs.to_s + " non redundant sequences" )
- end
- end
-
- lf = File.open( map_file, "a" )
- for i in 0 ... msa.get_number_of_seqs
- seq = msa.get_sequence( i )
- end
-
- io = MsaIO.new()
- w = nil
- if ( fasta_like )
- w = FastaWriter.new()
- else
- w = PhylipSequentialWriter.new()
- end
- w.set_max_name_length( 10 )
- w.clean( true )
- begin
- io.write_to_file( msa, output, w )
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
- end
- lf.close()
- if ( @taxonomies.length > 0 )
- Util.print_message( PRG_NAME, "number of unique taxonomies: " + @taxonomies.length.to_s )
- end
- Util.print_message( PRG_NAME, "wrote: " + map_file )
- Util.print_message( PRG_NAME, "wrote: " + output )
- Util.print_message( PRG_NAME, "OK" )
- end
-
- end # class
-
-end # module Evoruby
\ No newline at end of file
+++ /dev/null
-#!/usr/local/bin/ruby -w
-#
-# = lib/evo/apps/phylogenies_decorator
-#
-# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
-#
-# decoration of phylogenies with sequence/species names and domain architectures
-#
-# $Id: phylogenies_decorator.rb,v 1.34 2010/12/13 19:00:11 cmzmasek Exp $
-#
-# Environment variable FORESTER_HOME needs to point to the appropriate
-# directory (e.g. setenv FORESTER_HOME $HOME/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester-atv/)
-
-require 'lib/evo/util/constants'
-require 'lib/evo/util/util'
-require 'lib/evo/util/command_line_arguments'
-
-require 'date'
-
-module Evoruby
-
- class PhylogeniesDecorator
-
- DECORATOR_OPTIONS_SEQ_NAMES = '-r=1 -mdn'
- # -mdn is a hidden expert option to rename e.g. "6_ORYLA3" to "6_[3]_ORYLA"
- #DECORATOR_OPTIONS_SEQ_NAMES = '-sn -r=1'
- DECORATOR_OPTIONS_DOMAINS = '-r=1'
- IDS_MAPFILE_SUFFIX = '.nim'
- DOMAINS_MAPFILE_SUFFIX = '.dff'
- SLEEP_TIME = 0.1
- REMOVE_NI = true
- TMP_FILE = '___PD___'
- LOG_FILE = '00_phylogenies_decorator.log'
- FORESTER_HOME = ENV[Constants::FORESTER_HOME_ENV_VARIABLE]
- JAVA_HOME = ENV[Constants::JAVA_HOME_ENV_VARIABLE]
-
- PRG_NAME = "phylogenies_decorator"
- PRG_DATE = "2008.09.02"
- PRG_DESC = "decoration of phylogenies with sequence/species names and domain architectures"
- PRG_VERSION = "1.0.1"
- COPYRIGHT = "2008-2009 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
-
- IDS_ONLY_OPTION = "n"
- DOMAINS_ONLY_OPTION = "d"
- HELP_OPTION_1 = "help"
- HELP_OPTION_2 = "h"
-
- NL = Constants::LINE_DELIMITER
-
- def run
-
- Util.print_program_information( PRG_NAME,
- PRG_VERSION,
- PRG_DESC,
- PRG_DATE,
- COPYRIGHT,
- CONTACT,
- WWW,
- STDOUT )
-
- if ( ARGV == nil || ARGV.length > 3 || ARGV.length < 2 )
- print_help
- exit( -1 )
- end
-
- if FORESTER_HOME == nil || FORESTER_HOME.length < 1
- Util.fatal_error( PRG_NAME, "apparently environment variable #{Constants::FORESTER_HOME_ENV_VARIABLE} has not been set" )
- end
- if JAVA_HOME == nil || JAVA_HOME.length < 1
- Util.fatal_error( PRG_NAME, "apparently environment variable #{Constants::JAVA_HOME_ENV_VARIABLE} has not been set" )
- end
-
- if !File.exist?( FORESTER_HOME )
- Util.fatal_error( PRG_NAME, '[' + FORESTER_HOME + '] does not exist' )
- end
- if !File.exist?( JAVA_HOME )
- Util.fatal_error( PRG_NAME, '[' + JAVA_HOME + '] does not exist' )
- end
-
- decorator = JAVA_HOME + '/bin/java -cp ' + FORESTER_HOME + '/java/forester.jar org.forester.application.decorator'
-
- begin
- cla = CommandLineArguments.new( ARGV )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
-
- if ( cla.is_option_set?( HELP_OPTION_1 ) ||
- cla.is_option_set?( HELP_OPTION_2 ) )
- print_help
- exit( 0 )
- end
-
- if File.exist?( LOG_FILE )
- Util.fatal_error( PRG_NAME, 'logfile [' + LOG_FILE + '] already exists' )
- end
-
- allowed_opts = Array.new
- allowed_opts.push( IDS_ONLY_OPTION )
- allowed_opts.push( DOMAINS_ONLY_OPTION )
-
- disallowed = cla.validate_allowed_options_as_str( allowed_opts )
- if ( disallowed.length > 0 )
- Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
- end
-
- ids_only = false
- domains_only = false
-
- in_suffix = cla.get_file_name( 0 )
- out_suffix = cla.get_file_name( 1 )
-
- if cla.is_option_set?( IDS_ONLY_OPTION )
- ids_only = true
- end
- if cla.is_option_set?( DOMAINS_ONLY_OPTION )
- domains_only = true
- end
-
- if ( ids_only && domains_only )
- Util.fatal_error( PRG_NAME, 'attempt to use ids only and domains only at the same time' )
- end
-
- log = String.new
-
- now = DateTime.now
- log << "Program : " + PRG_NAME + NL
- log << "Version : " + PRG_VERSION + NL
- log << "Program date : " + PRG_DATE + NL
- log << "Options for seq names: " + DECORATOR_OPTIONS_SEQ_NAMES + NL
- log << "Options for domains : " + DECORATOR_OPTIONS_DOMAINS + NL
- log << "FORESTER_HOME : " + FORESTER_HOME + NL
- log << "JAVA_HOME : " + JAVA_HOME + NL + NL
- log << "Date/time: " + now.to_s + NL
- log << "Directory: " + Dir.getwd + NL + NL
-
- Util.print_message( PRG_NAME, 'input suffix : ' + in_suffix )
- Util.print_message( PRG_NAME, 'output suffix : ' + out_suffix )
-
- log << 'input suffix : ' + in_suffix + NL
- log << 'output suffix : ' + out_suffix + NL
-
- if ( File.exists?( TMP_FILE ) )
- File.delete( TMP_FILE )
- end
-
- files = Dir.entries( "." )
-
- counter = 0
-
- files.each { | phylogeny_file |
- if ( !File.directory?( phylogeny_file ) &&
- phylogeny_file !~ /^\./ &&
- phylogeny_file !~ /^00/ &&
- phylogeny_file !~ /#{out_suffix}$/ &&
- phylogeny_file =~ /#{in_suffix}$/ )
- begin
- Util.check_file_for_readability( phylogeny_file )
- rescue ArgumentError
- Util.fatal_error( PRG_NAME, 'can not read from: ' + phylogeny_file + ': '+ $! )
- end
-
- counter += 1
-
- outfile = phylogeny_file.sub( /#{in_suffix}$/, out_suffix )
-
- if REMOVE_NI
- outfile = outfile.sub( /_ni_/, '_' )
- end
-
- if File.exists?( outfile )
- msg = counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile +
- ' : already exists, skipping'
- Util.print_message( PRG_NAME, msg )
- log << msg + NL
- next
- end
-
- Util.print_message( PRG_NAME, counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile )
- log << counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile + NL
-
- phylogeny_id = get_id( phylogeny_file )
-
- ids_mapfile_name = nil
- domains_mapfile_name = nil
-
- if ids_only
- ids_mapfile_name = get_file( files, phylogeny_id, IDS_MAPFILE_SUFFIX )
- elsif domains_only
- domains_mapfile_name = get_file( files, phylogeny_id, DOMAINS_MAPFILE_SUFFIX )
- else
- ids_mapfile_name = get_file( files, phylogeny_id, IDS_MAPFILE_SUFFIX )
- domains_mapfile_name = get_file( files, phylogeny_id, DOMAINS_MAPFILE_SUFFIX )
- end
-
- if domains_mapfile_name != nil
- begin
- Util.check_file_for_readability( domains_mapfile_name )
- rescue ArgumentError
- Util.fatal_error( PRG_NAME, 'failed to read from [#{domains_mapfile_name}]: ' + $! )
- end
- end
-
- if ids_mapfile_name != nil
- begin
- Util.check_file_for_readability( ids_mapfile_name )
- rescue ArgumentError
- Util.fatal_error( PRG_NAME, 'failed to read from [#{ids_mapfile_name}]: ' + $! )
- end
- end
-
- if domains_mapfile_name != nil
- if ids_mapfile_name != nil
- my_outfile = TMP_FILE
- else
- my_outfile = outfile
- end
- cmd = decorator + ' ' + DECORATOR_OPTIONS_DOMAINS + ' ' +
- '-f=d ' + phylogeny_file + ' ' +
- domains_mapfile_name + ' ' + my_outfile
- execute_cmd( cmd, log )
- end
-
- if ids_mapfile_name != nil
- if domains_mapfile_name != nil
- my_infile = TMP_FILE
- else
- my_infile = phylogeny_file
- end
- cmd = decorator + ' ' + DECORATOR_OPTIONS_SEQ_NAMES + ' ' +
- '-f=s ' + my_infile + ' ' +
- ids_mapfile_name + ' ' + outfile
- execute_cmd( cmd, log )
- end
-
- if ( File.exists?( TMP_FILE ) )
- File.delete( TMP_FILE )
- end
- end
- }
- open( LOG_FILE, 'w' ) do | f |
- f.write( log )
- end
- puts
- Util.print_message( PRG_NAME, 'OK' )
- puts
- end # def run
-
- def execute_cmd( cmd, log )
- log << 'excuting ' + cmd + NL
- IO.popen( cmd , 'r+' ) do | pipe |
- pipe.close_write
- log << pipe.read + NL + NL
- end
- sleep( SLEEP_TIME )
- end
-
-
- def get_id( phylogeny_file_name )
- phylogeny_file_name =~ /^([^_]+)/
- $1
- end
-
- def get_file( files_in_dir, phylogeny_id, suffix_pattern )
- matching_files = Array.new
- files_in_dir.each { | file |
-
- if ( !File.directory?( file ) &&
- file !~ /^\./ &&
- file !~ /^00/ &&
- file =~ /^#{phylogeny_id}.*#{suffix_pattern}$/ )
- matching_files << file
- end
- }
- if matching_files.length < 1
- Util.fatal_error( PRG_NAME, 'no file matching [' + phylogeny_id +
- '_] [' + suffix_pattern + '] present in current directory' )
- elsif matching_files.length > 1
- Util.fatal_error( PRG_NAME, 'more than one file matching [' + phylogeny_id +
- '_] [' + suffix_pattern + '] present in current directory' )
- end
- matching_files[ 0 ]
- end
-
- def print_help()
- puts( "Usage:" )
- puts()
- puts( " " + PRG_NAME + ".rb [options] <suffix of intrees to be decorated> <suffix for decorated outtrees> " )
- puts()
- puts( " options: -" + IDS_ONLY_OPTION + ": decorate with sequence/species names only" )
- puts( " -" + DOMAINS_ONLY_OPTION + ": decorate with domain structures" )
- puts()
- end
- end # class PhylogenyiesDecorator
-
-end # module Evoruby
+++ /dev/null
-#
-# = lib/evo/apps/phylogeny_factory - PhylogenyFactory class
-#
-# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
-#
-# $Id: phylogeny_factory.rb,v 1.32 2010/12/13 19:00:11 cmzmasek Exp $
-
-require 'lib/evo/util/constants'
-require 'lib/evo/util/util'
-require 'lib/evo/util/command_line_arguments'
-
-require 'set'
-require 'date'
-
-module Evoruby
-
- class PhylogenyFactory
-
- PRG_NAME = "phylogeny_factory"
- PRG_DATE = "2010.05.26"
- PRG_DESC = "automated phylogeny reconstruction using queing system"
- PRG_VERSION = "1.1"
- COPYRIGHT = "2010 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
-
- USE_JOB_SUBMISSION_SYSTEM_OPTION = 's'
- LOG_FILE = '00_phylogeny_factory.log'
- TEMPLATE_FILE = '00_phylogeny_factory.template'
- PBS_O_WORKDIR = '$PBS_O_WORKDIR/'
- MIN_LENGTH_DEFAULT = 40
- WALLTIME = '100:00:00'
- QUEUE = 'default'
-
- TMP_CMD_FILE_SUFFIX = '_QSUB'
-
- HMM = 'HMM'
- RSL = 'RSL'
-
- OPTION_OPEN = '%['
- OPTION_CLOSE = ']%'
-
- WAIT = 1.0
-
- NL = Constants::LINE_DELIMITER
-
- def run
-
- Util.print_program_information( PRG_NAME,
- PRG_VERSION,
- PRG_DESC,
- PRG_DATE,
- COPYRIGHT,
- CONTACT,
- WWW,
- STDOUT )
-
- begin
- cla = CommandLineArguments.new( ARGV )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
-
- allowed_opts = Array.new
- allowed_opts.push( USE_JOB_SUBMISSION_SYSTEM_OPTION )
-
- disallowed = cla.validate_allowed_options_as_str( allowed_opts )
- if ( disallowed.length > 0 )
- Util.fatal_error( PRG_NAME,
- "unknown option(s): " + disallowed,
- STDOUT )
- end
-
- if File.exists?( LOG_FILE )
- puts( '[' + PRG_NAME + '] > log file [' + LOG_FILE + '] already exists' )
- exit( -1 )
- end
-
- if !File.exists?( TEMPLATE_FILE )
- puts( '[' + PRG_NAME + '] > template file [' + TEMPLATE_FILE + '] not found' )
- exit( -1 )
- end
-
- use_job_submission_system = false
- if cla.is_option_set?( USE_JOB_SUBMISSION_SYSTEM_OPTION )
- use_job_submission_system = true
- end
-
- log = String.new
-
- now = DateTime.now
- log << "Program : " + PRG_NAME + NL
- log << "Version : " + PRG_VERSION + NL
- log << "Program date: " + PRG_DATE + NL + NL
- log << "Date/time : " + now.to_s + NL
- log << "Directory : " + Dir.getwd + NL + NL
-
- puts( '[' + PRG_NAME + '] > reading ' + TEMPLATE_FILE )
-
- paths = Hash.new # path placeholder -> full path
- min_lengths = Hash.new # alignment id -> minimal length
- hmms = Hash.new # alignment id -> hmm
- options = Hash.new # option placeholder -> option
- ids = Set.new
-
- commands = Array.new
-
- log << "////////////////////////////////////////////////////////////////// #{NL}"
- log << "Template file [" + TEMPLATE_FILE + "]:#{NL}"
-
- command = String.new
-
- open( TEMPLATE_FILE ).each { | line |
- log << line
- if ( line =~ /^#/ )
- elsif ( line =~ /^\$\s*(\S+)\s*=\s*(\S+)/ )
- paths[ $1 ] = $2
- puts( '[' + PRG_NAME + '] > paths : ' + $1 + ' => ' + $2 )
-
- elsif ( line =~ /^%\s*#{HMM}\s*(\S+)\s*=\s*(\S+)/ )
- hmms[ $1 ] = $2
- puts( '[' + PRG_NAME + '] > hmms : ' + $1 + ' => ' + $2 )
-
- elsif ( line =~ /^%\s*#{RSL}\s*(\S+)\s*=\s*(\S+)/ )
- min_lengths[ $1 ] = $2
- puts( '[' + PRG_NAME + '] > min lengths: ' + $1 + ' => ' + $2 )
-
- elsif ( line =~ /^%\s*(\S+)\s*=\s*(\S+)/ )
- options[ $1 ] = $2
- puts( '[' + PRG_NAME + '] > options : ' + $1 + ' => ' + $2 )
-
- elsif ( line =~ /^>\s*(.+)/ )
- command = command + $1 + ";#{NL}"
-
- elsif ( line =~ /^-/ )
- commands << prepare( command, paths )
- command = String.new
- end
- }
- log << "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ #{NL}#{NL}"
-
- files = Dir.entries( "." )
-
- files.each { | file |
- if ( !File.directory?( file ) &&
- file !~ /^\./ &&
- file !~ /#{TEMPLATE_FILE}/ &&
- file !~ /.bck$/ &&
- file !~ /.log$/ &&
- file !~ /nohup/ &&
- file !~ /^00/ )
- aln_name = file.to_str
- id = get_id( aln_name )
- if !ids.include?( id )
- ids.add( id )
- end
- puts( '[' + PRG_NAME + '] > file [id] : ' + aln_name + ' [' + id + ']' )
- commands.each do | cmd |
-
- cmd = subst_hmm( cmd, aln_name, hmms )
- cmd = subst_min_length( cmd, aln_name, min_lengths )
- cmd = subst_options( cmd, options )
- if use_job_submission_system
- cmd = subst_aln_name( cmd, PBS_O_WORKDIR + aln_name )
- else
- cmd = subst_aln_name( cmd, aln_name )
- end
-
- if ( cmd =~ /%/ )
- cmd =~ /(%.*?%)/
- problem = $1
- puts( '[' + PRG_NAME + '] > WARNING : [' + id + '] command still contains placeholder: ' + problem )
- log << "WARNING: command still contains placeholder: " + cmd + NL
- else
- tmp_cmd_file = file.to_str[ 0..4 ] + TMP_CMD_FILE_SUFFIX
- if ( File.exists?( tmp_cmd_file ) )
- File.delete( tmp_cmd_file )
- end
- if use_job_submission_system
- open( tmp_cmd_file, 'w' ) do |f|
- f.write( cmd )
- end
- end
-
- log << cmd + NL
-
- if use_job_submission_system
- IO.popen( 'qsub -q ' + QUEUE + ' -l walltime=' + WALLTIME + ' ' + tmp_cmd_file , 'r+' ) do | pipe |
- pipe.close_write
- end
- else
- spawn( 'nohup ' + cmd + ' &', STDERR => "/dev/null" )
- end
-
- sleep( WAIT )
- if ( File.exists?( tmp_cmd_file ) )
- File.delete( tmp_cmd_file )
- end
- end
- end
- end
- }
-
- open( LOG_FILE, 'w' ) do | f |
- f.write( log )
- end
-
- puts()
- puts( '[' + PRG_NAME + '] > OK' )
- puts()
-
- end # def run
-
- def prepare( command, paths )
- paths.each_pair{ | name, full |
- command = command.gsub( name, full )
- }
- command
- end
-
- def subst_options( command, options )
- opt_placeholders = command.scan( /%\[\S+\]%/ )
- opt_placeholders.each { | opt_placeholder |
- opt_placeholder = opt_placeholder.gsub( OPTION_OPEN , '' )
- opt_placeholder = opt_placeholder.gsub( OPTION_CLOSE, '' )
- opt_value = options[ opt_placeholder ]
- if ( opt_value != nil && opt_value.size > 0 )
- command = command.gsub( OPTION_OPEN + opt_placeholder + OPTION_CLOSE, opt_value )
- end
- }
- command
- end
-
- def subst_aln_name( command, aln_name )
- command = command.gsub( '$', aln_name )
- command
- end
-
- def subst_hmm( command, aln_name, hmms )
- id = get_id( aln_name )
- hmm = hmms[ id ]
- if ( hmm != nil && hmm.size > 0 )
- command = command.gsub( OPTION_OPEN + HMM + OPTION_CLOSE, hmm )
- end
- command
- end
-
- def subst_min_length( command, aln_name, min_lengths )
- id = get_id( aln_name )
- min_length = min_lengths[ id ]
- if ( min_length != nil && min_length.size > 0 )
- command = command.gsub( OPTION_OPEN + RSL + OPTION_CLOSE, min_length )
- else
- command = command.gsub( OPTION_OPEN + RSL + OPTION_CLOSE, MIN_LENGTH_DEFAULT.to_s )
- end
- command
- end
-
- def get_id( aln_name )
- aln_name =~ /^([^_]+)/
- $1
- end
-
- end # class PhylogenyFactory
-
-end # module Evoruby
+++ /dev/null
-#
-# = lib/evo/apps/taxonomy_processor - TaxonomyProcessor class
-#
-# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
-#
-# $Id: taxonomy_processor.rb,v 1.26 2010/12/13 19:00:11 cmzmasek Exp $
-
-
-require 'lib/evo/util/util'
-require 'lib/evo/msa/msa_factory'
-require 'lib/evo/msa/msa'
-require 'lib/evo/io/msa_io'
-require 'lib/evo/io/parser/fasta_parser'
-require 'lib/evo/io/parser/general_msa_parser'
-require 'lib/evo/io/writer/fasta_writer'
-require 'lib/evo/io/writer/phylip_sequential_writer'
-require 'lib/evo/util/command_line_arguments'
-
-module Evoruby
-
- class TaxonomyProcessor
-
- PRG_NAME = "tap"
- PRG_DATE = "2012.09.27"
- PRG_DESC = "replacement of species names in multiple sequence files"
- PRG_VERSION = "1.02"
- COPYRIGHT = "2012 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
-
- SIMPLE = true
-
- EXTRACT_TAXONOMY_OPTION = "t"
-
- def initialize()
- @taxonomies = Hash.new()
- end
-
- def run()
-
- Util.print_program_information( PRG_NAME,
- PRG_VERSION,
- PRG_DESC,
- PRG_DATE,
- COPYRIGHT,
- CONTACT,
- WWW,
- STDOUT )
-
- if ( ARGV == nil || ( ARGV.length != 1 && ARGV.length != 3 && ARGV.length != 4 && ARGV.length != 5 && ARGV.length != 6 ) )
- puts( "Usage: #{PRG_NAME}.rb [options] [input map file] <input sequences> [output sequences] [output id list]" )
- puts()
- puts( " options: -" + EXTRACT_TAXONOMY_OPTION + ": to extract taxonomy information from bracketed expression" )
- puts()
- exit( -1 )
- end
-
- begin
- cla = CommandLineArguments.new( ARGV )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
-
-
- mapfile = nil
- input = nil
- output = nil
- list_file = nil
-
-
-
- if cla.get_number_of_files == 4
- mapfile = cla.get_file_name( 0 )
- input = cla.get_file_name( 1 )
- output = cla.get_file_name( 2 )
- list_file = cla.get_file_name( 3 )
- elsif cla.get_number_of_files == 3
- input = cla.get_file_name( 0 )
- output = cla.get_file_name( 1 )
- list_file = cla.get_file_name( 2 )
- elsif cla.get_number_of_files == 1
- input = cla.get_file_name( 0 )
- i = nil
- if input.downcase.end_with?( ".fasta" )
- i = input[ 0 .. input.length - 7 ]
- elsif input.downcase.end_with?( ".fsa" )
- i = input[ 0 .. input.length - 5 ]
- else
- i = input
- end
- output = i + "_ni.fasta"
- list_file = i + ".nim"
- end
-
-
- allowed_opts = Array.new
- allowed_opts.push( EXTRACT_TAXONOMY_OPTION )
-
- disallowed = cla.validate_allowed_options_as_str( allowed_opts )
- if ( disallowed.length > 0 )
- Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
- end
-
- extract_taxonomy = false
- if ( cla.is_option_set?( EXTRACT_TAXONOMY_OPTION ) )
- extract_taxonomy = true
- end
-
- if ( File.exists?( output ) )
- Util.fatal_error( PRG_NAME, "outfile [" + output + "] already exists" )
- end
- if ( File.exists?( list_file ) )
- Util.fatal_error( PRG_NAME, "list file [" + list_file + "] already exists" )
- end
- if ( !File.exists?( input) )
- Util.fatal_error( PRG_NAME, "infile [" + input + "] does not exist" )
- end
- if ( mapfile != nil && !File.exists?( mapfile ) )
- Util.fatal_error( PRG_NAME, "mapfile [" + mapfile + "] does not exist" )
- end
-
- fasta_like = Util.looks_like_fasta?( input )
-
- puts()
- if mapfile != nil
- puts( "Map file : " + mapfile )
- end
- puts( "Input alignment : " + input )
- puts( "Output alignment: " + output )
- puts( "Name list : " + list_file )
- if ( fasta_like )
- puts( "Format : Fasta" )
- else
- puts( "Format : Phylip like" )
- end
- if ( extract_taxonomy )
- puts( "Extract taxonomy: true" )
- end
- puts()
-
- species_map = Hash.new
- if mapfile != nil
- File.open( mapfile ) do | file |
- while line = file.gets
- if ( line =~/(.+)#(.+)/ || line =~/(.+)\s+(.+)/ )
- species_map[ $1 ] = $2
- Util.print_message( PRG_NAME, "mapping: " + $1 + ' => ' + $2 )
- end
- end
- end
- end
-
- f = MsaFactory.new()
- begin
- if ( fasta_like )
- msa = f.create_msa_from_file( input, FastaParser.new() )
- else
- msa = f.create_msa_from_file( input, GeneralMsaParser.new() )
- end
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "failed to read file: " + e.to_s )
- end
-
- if ( msa == nil || msa.get_number_of_seqs() < 1 )
- Util.fatal_error( PRG_NAME, "failed to read MSA" )
- end
- begin
- Util.check_file_for_writability( list_file )
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_, STDOUT )
- end
-
- #removed = msa.remove_redundant_sequences!( true )
- #if removed.size > 0
- # Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
- # removed.each { | seq_name |
- # puts seq_name
- # }
- # Util.print_message( PRG_NAME, "will process " + msa.get_number_of_seqs.to_s + " non redundant sequences" )
- #end
-
- lf = File.open( list_file, "a" )
- for i in 0 ... msa.get_number_of_seqs
- seq = msa.get_sequence( i )
- seq.set_name( Util::normalize_seq_name( modify_name( seq.get_name(), i, lf, species_map, extract_taxonomy ), 10 ) )
- end
-
- io = MsaIO.new()
- w = nil
- if ( fasta_like )
- w = FastaWriter.new()
- else
- w = PhylipSequentialWriter.new()
- end
- w.set_max_name_length( 10 )
- w.clean( true )
- begin
- io.write_to_file( msa, output, w )
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
- end
- lf.close()
- if ( @taxonomies.length > 0 )
- Util.print_message( PRG_NAME, "number of unique taxonomies: " + @taxonomies.length.to_s )
- end
- Util.print_message( PRG_NAME, "wrote: " + list_file )
- Util.print_message( PRG_NAME, "wrote: " + output )
- Util.print_message( PRG_NAME, "OK" )
- end
-
- private
-
- def modify_name( desc, counter, file, species_map, extract_taxonomy )
- new_desc = nil
- my_species = nil
- # if desc =~ /^>?\s*\S{1,10}_([0-9A-Z]{3,5})/
- if desc =~ /^>?\s*\S{1,10}_([A-Z]{3,5})/
- new_desc = counter.to_s( 16 ) + "_" + $1
- elsif SIMPLE
- new_desc = counter.to_s( 16 )
- elsif extract_taxonomy
- if ( desc.count( "[" ) != desc.count( "]" ) )
- Util.fatal_error( PRG_NAME, "illegal bracket count in: " + desc )
- end
- species = nil
- species_map.each_key do | key |
- if desc =~ /[\b|_]#{key}\b/ # Added boundaries to prevent e.g. RAT matching ARATH.
- species = species_map[ key ]
- new_desc = counter.to_s( 16 ) + "_" + species
- break
- end
- end
- if species == nil
- if desc =~/.*\[(\S{3,}?)\]/
- species = $1
- species.strip!
- species.upcase!
- species.gsub!( /\s+/, " " )
- species.gsub!( /-/, "" )
- species.gsub!( /\)/, "" )
- species.gsub!( /\(/, "" )
- species.gsub!( /\'/, "" )
- if species =~ /\S+\s\S+/ || species =~ /\S{3,5}/
- if species =~ /(\S+)\s(\S+)/
- code = $1[ 0..2 ] + $2[ 0..1 ]
- elsif species =~ /\S{3,5}/
- code = species
- elsif species.count( " " ) > 2
- species =~ /(\S+)\s+(\S+)\s+(\S+)$/
- third_last = $1
- second_last = $2
- last = $3
- code = code[ 0 ] + third_last[ 0 ] + second_last[ 0 ] + last[ 0 ] + last[ last.size - 1 ]
- elsif species.count( " " ) > 1
- species =~ /(\S+)\s+(\S+)$/
- second_last = $1
- last = $2
- code = code[ 0..1 ] + second_last[ 0 ] + last[ 0 ] + last[ last.size - 1 ]
- end
- new_desc = counter.to_s( 16 ) + "_" + code
- if @taxonomies.has_key?( code )
- if ( !@taxonomies.has_value?( species ) )
- Util.fatal_error( PRG_NAME, "code [#{code}] is not unique in [#{desc}]" )
- end
- else
- if ( @taxonomies.has_value?( species ) )
- Util.fatal_error( PRG_NAME, "genome [#{species}] is not unique in [#{desc}]" )
- else
- @taxonomies[ code ] = species
- end
- end
- else
- Util.fatal_error( PRG_NAME, "illegal format [#{species}] in: " + desc )
- end
- else
- Util.fatal_error( PRG_NAME, "illegal format in: " + desc )
- end
- end
- else
- species = nil
- my_species = nil
- species_map.each_key do | key |
- if desc =~ /#{key}/
- species = species_map[ key ]
- species = species.gsub( /\s+/, "" )
- species = species.gsub( /_/, " " )
- my_species = species
- if species =~ /(\S+)\s+(\S+)/
- species = $1[0..2] + $2[0..1]
- end
- species = species.gsub( /\s+/, "" )
- species = species.slice(0, 5)
- species.upcase!
- break
- end
- end
- if species == nil
- Util.fatal_error( PRG_NAME, "species not found in: " + desc )
- else
- new_desc = counter.to_s( 16 ) + "_" + species
- end
- end
- if new_desc == nil
- Util.fatal_error( PRG_NAME, "failed to extract species from: " + desc )
- end
- if my_species != nil
- file.print( new_desc + ": " + desc + " [" + my_species + "]" + "\n" )
- else
- file.print( new_desc + ": " + desc + "\n" )
- end
- new_desc
- end
-
- end # class TaxonomyProcessor
-
-end # module Evoruby
+++ /dev/null
-#
-# = lib/evo/apps/tseq_taxonomy_processor - TseqTaxonomyProcessor class
-#
-# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
-#
-# $Id: tseq_taxonomy_processor.rb,v 1.6 2010/12/13 19:00:11 cmzmasek Exp $
-
-
-require 'lib/evo/util/util'
-require 'lib/evo/msa/msa_factory'
-require 'lib/evo/msa/msa'
-require 'lib/evo/io/msa_io'
-require 'lib/evo/io/parser/sp_taxonomy_parser'
-require 'lib/evo/io/parser/ncbi_tseq_parser'
-require 'lib/evo/io/writer/fasta_writer'
-require 'lib/evo/io/writer/phylip_sequential_writer'
-require 'lib/evo/util/command_line_arguments'
-
-module Evoruby
-
- class TseqTaxonomyProcessor
-
- PRG_NAME = "tseq_tap"
- PRG_DATE = "2009.01.06"
- PRG_DESC = "preprocessing of multiple sequence files in ncbi tseq xml format"
- PRG_VERSION = "1.02"
- COPYRIGHT = "2009 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
-
- TAXONOMY_CODE = "TAXONOMY_CODE:"
- TAXONOMY_ID = "TAXONOMY_ID:"
- TAXONOMY_ID_TYPE = "TAXONOMY_ID_TYPE:"
- TAXONOMY_SN = "TAXONOMY_SN:"
- TAXONOMY_CN = "TAXONOMY_CN:"
- SEQ_ACCESSION = "SEQ_ACCESSION:"
- SEQ_ACCESSION_SOURCE = "SEQ_ACCESSION_SOURCE:"
- SEQ_SECONDARY_ACCESSION = "SEQ_SECONDARY_ACCESSION:"
- SEQ_SYMBOL = "SEQ_SYMBOL:"
- SEQ_NAME = "SEQ_NAME:"
- SEQ_MOL_SEQ = "SEQ_MOL_SEQ:"
-
- def initialize()
- @tax_ids_to_sp_taxonomies = Hash.new()
- end
-
- def run()
-
- Util.print_program_information( PRG_NAME,
- PRG_VERSION,
- PRG_DESC,
- PRG_DATE,
- COPYRIGHT,
- CONTACT,
- WWW,
- STDOUT )
-
- if ARGV == nil || ARGV.length != 4
- puts( "Usage: #{PRG_NAME}.rb <sp taxonomy file> <sequences in tseq xml format> <name for fasta outfile> <name for map outfile>" )
- puts()
-
- exit( -1 )
- end
-
- begin
- cla = CommandLineArguments.new( ARGV )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
- allowed_opts = Array.new
- disallowed = cla.validate_allowed_options_as_str( allowed_opts )
- if ( disallowed.length > 0 )
- Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
- end
-
- sp_taxonomy_infile = cla.get_file_name( 0 )
- sequences_infile = cla.get_file_name( 1 )
- sequences_outfile = cla.get_file_name( 2 )
- mapping_outfile = cla.get_file_name( 3 )
-
- Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile )
- Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile )
- Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile )
- Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile )
-
- sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile )
-
- Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile )
-
- tseq_parser = NcbiTSeqParser.new
- msa_fac = MsaFactory.new
-
- seqs = msa_fac.create_msa_from_file( sequences_infile, tseq_parser )
-
- Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile )
-
- removed = seqs.remove_redundant_sequences!( true, true )
-
- if removed.size > 0
- Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
- removed.each { | seq_name |
- puts seq_name
- }
- Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" )
- end
-
- mapping_out = File.open( mapping_outfile, "a" )
-
- for i in 0 ... seqs.get_number_of_seqs
- seq = seqs.get_sequence( i )
- if seq.get_taxonomy == nil
- Util.fatal_error( PRG_NAME, "sequence [" + seq.get_name + "] has no taxonomy information" )
- end
- seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) )
- end
-
- io = MsaIO.new()
-
- w = FastaWriter.new()
-
- w.set_max_name_length( 10 )
- w.clean( true )
- begin
- io.write_to_file( seqs, sequences_outfile, w )
- rescue Exception => e
- Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
- end
- mapping_out.close()
-
- Util.print_message( PRG_NAME, "wrote: " + mapping_outfile )
- Util.print_message( PRG_NAME, "wrote: " + sequences_outfile )
- Util.print_message( PRG_NAME, "OK" )
-
- end
-
- private
-
- def modify_name( seq, i, sp_taxonomies, mapping_outfile )
-
- tax_id = seq.get_taxonomy.get_id
- matching_sp_taxonomy = nil
-
- if @tax_ids_to_sp_taxonomies.has_key?( tax_id )
- # This is so that a second lookup will be much faster.
- matching_sp_taxonomy = @tax_ids_to_sp_taxonomies[ tax_id ]
- else
- sp_taxonomies.each { |sp_taxonomy|
- if ( sp_taxonomy.id == tax_id )
- if matching_sp_taxonomy != nil
- Util.fatal_error( PRG_NAME, "taxonomy id [" + tax_id.to_s + "] is not unique" )
- end
- matching_sp_taxonomy = sp_taxonomy
- @tax_ids_to_sp_taxonomies[ tax_id ] = sp_taxonomy
- end
- }
- end
- if matching_sp_taxonomy == nil
- Util.fatal_error( PRG_NAME, "taxonomy id [" + tax_id.to_s + "] for [" + seq.get_taxonomy.get_name + "] not found" )
- end
-
- new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code
-
- seq_name = seq.get_name
- if seq_name =~ /\[.+\]$/
- # Redundant taxonomy information hides here.
- seq_name = seq_name.sub(/\[.+\]$/, '')
- end
- if seq_name =~ /^\s*hypothetical\s+protein\s*/i
- # Pointless information.
- seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' )
- end
-
- mapping_outfile.print( new_name + "\t" +
- TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
- TAXONOMY_ID + tax_id + "\t" +
- TAXONOMY_ID_TYPE + seq.get_taxonomy.get_id_source + "\t" +
- TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
- SEQ_ACCESSION + seq.get_accession + "\t" +
- SEQ_ACCESSION_SOURCE + seq.get_accession_source + "\t" +
- SEQ_SYMBOL + seq.get_symbol + "\t" +
- SEQ_NAME + seq_name + "\t" +
- SEQ_MOL_SEQ + seq.get_sequence_as_string +
- Constants::LINE_DELIMITER )
- new_name
- end
-
- end
-
-end # module Evoruby
\ No newline at end of file