#
# = lib/evo/apps/phylogenies_decorator
#
-# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
+# Copyright:: Copyright (C) 2017 Christian M. Zmasek
+# License:: GNU Lesser General Public License (LGPL)
#
-# decoration of phylogenies with sequence/species names and domain architectures
+# Last modified: 2017/02/09
#
-# $Id: phylogenies_decorator.rb,v 1.34 2010/12/13 19:00:11 cmzmasek Exp $
+# decoration of phylogenies with sequence/species names and domain architectures
#
# Environment variable FORESTER_HOME needs to point to the appropriate
-# directory (e.g. setenv FORESTER_HOME $HOME/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester-atv/)
+# directory (e.g. setenv FORESTER_HOME $HOME/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester/)
require 'lib/evo/util/constants'
require 'lib/evo/util/util'
require 'lib/evo/util/command_line_arguments'
-
require 'date'
module Evoruby
-
class PhylogeniesDecorator
# Options passed to the decorator command in the name-mapping ('-f=n') step of run.
# The commented-out assignments are earlier variants kept for reference.
#DECORATOR_OPTIONS_SEQ_NAMES = '-r=1 -mdn'
- DECORATOR_OPTIONS_SEQ_NAMES = '-p -t -sn'
+ #DECORATOR_OPTIONS_SEQ_NAMES = '-p -t -sn'
+ #DECORATOR_OPTIONS_SEQ_NAMES = '-p -t -tc -mp -or'
+ DECORATOR_OPTIONS_SEQ_NAMES = '-p -t -mp -or'
# -mdn is a hidden expert option to rename e.g. "6_ORYLA3" to "6_[3]_ORYLA"
#DECORATOR_OPTIONS_SEQ_NAMES = '-sn -r=1'
# Options passed to the decorator command in the domain ('-f=d') step of run.
#DECORATOR_OPTIONS_DOMAINS = '-r=1'
DECORATOR_OPTIONS_DOMAINS = '-p -t'
# Suffix used to look up a per-phylogeny id map file (only consulted when
# FIXED_NIM_FILE is nil).
IDS_MAPFILE_SUFFIX = '.nim'
# Suffix used to look up the per-phylogeny domains map file.
- DOMAINS_MAPFILE_SUFFIX = '.dff'
- SLEEP_TIME = 0.1
+ DOMAINS_MAPFILE_SUFFIX = '_hmmscan_10.dff'
# Seconds passed to sleep in execute_cmd.
+ SLEEP_TIME = 0.05
# NOTE(review): the use of REMOVE_NI is not visible in this listing; presumably it
# guards the '_ni_' removal from the output file name in run -- confirm.
REMOVE_NI = true
- TMP_FILE = '___PD___'
# When false, run also decorates with sequences and domains, not only names.
+ IDS_ONLY = false #TODO this should be a command line option
# When non-nil, this single id map file is used for every phylogeny.
+ FIXED_NIM_FILE = 'all.nim' #TODO this should be a command line option
# Intermediate output of the first ('-f=m') and second ('-f=d') decorator passes.
+ TMP_FILE_1 = '___PD1___'
+ TMP_FILE_2 = '___PD2___'
# Name of the log file; run aborts if it already exists.
LOG_FILE = '00_phylogenies_decorator.log'
# Installation directories, read from the environment when the class is loaded.
FORESTER_HOME = ENV[Constants::FORESTER_HOME_ENV_VARIABLE]
JAVA_HOME = ENV[Constants::JAVA_HOME_ENV_VARIABLE]
# Program metadata handed to Util.print_program_information.
PRG_NAME = "phylogenies_decorator"
- PRG_DATE = "2012.10.11"
+ PRG_DATE = "170209"
PRG_DESC = "decoration of phylogenies with sequence/species names and domain architectures"
PRG_VERSION = "1.02"
- COPYRIGHT = "2012 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
+ WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
- IDS_ONLY_OPTION = "n"
- DOMAINS_ONLY_OPTION = "d"
# Command line options that trigger the usage message.
HELP_OPTION_1 = "help"
HELP_OPTION_2 = "h"
# Line delimiter used when building log text.
NL = Constants::LINE_DELIMITER
-
# Program entry point.
#
# Prints the program banner, checks the argument count and the help options,
# then walks every entry of the current directory. For each regular file whose
# name ends with the input suffix (and neither starts with "." or "00" nor ends
# with the output suffix) it shells out to the decorator command to produce the
# decorated output tree, appending progress to +log+.
#
# NOTE(review): this listing is a diff with elided context; +cla+, +counter+,
# +outfile+ and +decorator+ are assigned in lines that are not shown here, and
# some if/end pairs appear unbalanced for the same reason.
def run
Util.print_program_information( PRG_NAME,
- PRG_VERSION,
- PRG_DESC,
- PRG_DATE,
- COPYRIGHT,
- CONTACT,
- WWW,
- STDOUT )
+ PRG_VERSION,
+ PRG_DESC,
+ PRG_DATE,
+ WWW,
+ STDOUT )
# NOTE(review): print_help only prints; unless an exit follows in the elided
# lines, execution continues after a wrong argument count -- confirm.
if ( ARGV == nil || ARGV.length > 3 || ARGV.length < 2 )
print_help
end
if ( cla.is_option_set?( HELP_OPTION_1 ) ||
- cla.is_option_set?( HELP_OPTION_2 ) )
+ cla.is_option_set?( HELP_OPTION_2 ) )
print_help
exit( 0 )
end
Util.fatal_error( PRG_NAME, 'logfile [' + LOG_FILE + '] already exists' )
end
- allowed_opts = Array.new
- allowed_opts.push( IDS_ONLY_OPTION )
- allowed_opts.push( DOMAINS_ONLY_OPTION )
-
- disallowed = cla.validate_allowed_options_as_str( allowed_opts )
- if ( disallowed.length > 0 )
- Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
- end
-
- ids_only = false
- domains_only = false
-
# The two positional arguments are the input and output file name suffixes.
in_suffix = cla.get_file_name( 0 )
out_suffix = cla.get_file_name( 1 )
- if cla.is_option_set?( IDS_ONLY_OPTION )
- ids_only = true
- end
- if cla.is_option_set?( DOMAINS_ONLY_OPTION )
- domains_only = true
- end
-
- if ( ids_only && domains_only )
- Util.fatal_error( PRG_NAME, 'attempt to use ids only and domains only at the same time' )
- end
-
log = String.new
now = DateTime.now
log << 'input suffix : ' + in_suffix + NL
log << 'output suffix : ' + out_suffix + NL
# Delete any pre-existing temp files before processing starts.
- if ( File.exists?( TMP_FILE ) )
- File.delete( TMP_FILE )
+ if ( File.exist?( TMP_FILE_1 ) )
+ File.delete( TMP_FILE_1 )
+ end
+ if ( File.exist?( TMP_FILE_2 ) )
+ File.delete( TMP_FILE_2 )
end
files = Dir.entries( "." )
# NOTE(review): in_suffix/out_suffix are interpolated into regexes unescaped,
# so regex metacharacters in a suffix (e.g. ".") are treated as patterns.
files.each { | phylogeny_file |
if ( !File.directory?( phylogeny_file ) &&
- phylogeny_file !~ /^\./ &&
- phylogeny_file !~ /^00/ &&
- phylogeny_file !~ /#{out_suffix}$/ &&
- phylogeny_file =~ /#{in_suffix}$/ )
+ phylogeny_file !~ /^\./ &&
+ phylogeny_file !~ /^00/ &&
+ phylogeny_file !~ /#{out_suffix}$/ &&
+ phylogeny_file =~ /#{in_suffix}$/ )
begin
Util.check_file_for_readability( phylogeny_file )
rescue ArgumentError
outfile = outfile.sub( /_ni_/, '_' )
end
# Existing output files are never overwritten; the input is skipped instead.
- if File.exists?( outfile )
+ if File.exist?( outfile )
msg = counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile +
- ' : already exists, skipping'
+ ' : already exists, skipping'
Util.print_message( PRG_NAME, msg )
log << msg + NL
next
log << counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile + NL
# The id is the leading part of the file name (see get_id); it is used to
# locate the companion map and sequence files.
phylogeny_id = get_id( phylogeny_file )
+ if phylogeny_id == nil || phylogeny_id.size < 1
+ Util.fatal_error( PRG_NAME, 'could not get id from ' + phylogeny_file.to_s )
+ end
+ puts
+ Util.print_message( PRG_NAME, "id: " + phylogeny_id )
+ log << "id: " + phylogeny_id + NL
ids_mapfile_name = nil
domains_mapfile_name = nil
+ seqs_file_name = nil
# FIXED_NIM_FILE is a non-nil constant in this file, so as the code stands the
# else branch is always taken.
- if ids_only
+ if ( FIXED_NIM_FILE == nil )
ids_mapfile_name = get_file( files, phylogeny_id, IDS_MAPFILE_SUFFIX )
- elsif domains_only
- domains_mapfile_name = get_file( files, phylogeny_id, DOMAINS_MAPFILE_SUFFIX )
else
- ids_mapfile_name = get_file( files, phylogeny_id, IDS_MAPFILE_SUFFIX )
+ ids_mapfile_name = FIXED_NIM_FILE
+ end
+
# IDS_ONLY is the constant false, so the domains and sequence files are
# always looked up as the code stands.
+ unless IDS_ONLY
domains_mapfile_name = get_file( files, phylogeny_id, DOMAINS_MAPFILE_SUFFIX )
+ seqs_file_name = get_seq_file( files, phylogeny_id )
end
# NOTE(review): the messages below are single-quoted, so '#{...}' is emitted
# literally instead of being interpolated; also String + $! (an exception
# object, not a String) raises TypeError. Both apply to all three
# 'failed to read from' calls -- confirm and fix.
- if domains_mapfile_name != nil
+ unless IDS_ONLY
begin
Util.check_file_for_readability( domains_mapfile_name )
rescue ArgumentError
Util.fatal_error( PRG_NAME, 'failed to read from [#{domains_mapfile_name}]: ' + $! )
end
- end
-
- if ids_mapfile_name != nil
begin
- Util.check_file_for_readability( ids_mapfile_name )
+ Util.check_file_for_readability( seqs_file_name )
rescue ArgumentError
- Util.fatal_error( PRG_NAME, 'failed to read from [#{ids_mapfile_name}]: ' + $! )
+ Util.fatal_error( PRG_NAME, 'failed to read from [#{seqs_file_name }]: ' + $! )
end
end
- if domains_mapfile_name != nil
- if ids_mapfile_name != nil
- my_outfile = TMP_FILE
- else
- my_outfile = outfile
+ begin
+ Util.check_file_for_readability( ids_mapfile_name )
+ rescue ArgumentError
+ Util.fatal_error( PRG_NAME, 'failed to read from [#{ids_mapfile_name}]: ' + $! )
+ end
+
# Full decoration is a three-pass pipeline:
#   pass 1 ('-f=m'): phylogeny_file + sequence file  -> TMP_FILE_1
#   pass 2 ('-f=d'): TMP_FILE_1 + domains map file   -> TMP_FILE_2
#   pass 3 ('-f=n'): TMP_FILE_2 + id map file        -> outfile
# NOTE(review): 'Error' is not a Ruby core constant; unless it is defined
# elsewhere in the project, 'rescue Error' raises NameError as soon as an
# exception reaches it. 'error: ' + $! has the same TypeError problem as above.
+ unless IDS_ONLY
+ cmd = decorator +
+ ' -t -p -f=m ' + phylogeny_file + ' ' +
+ seqs_file_name + ' ' + TMP_FILE_1
+ puts cmd
+ begin
+ execute_cmd( cmd, log )
+ rescue Error
+ Util.fatal_error( PRG_NAME, 'error: ' + $! )
end
+
cmd = decorator + ' ' + DECORATOR_OPTIONS_DOMAINS + ' ' +
- '-f=d ' + phylogeny_file + ' ' +
- domains_mapfile_name + ' ' + my_outfile
- execute_cmd( cmd, log )
+ '-f=d ' + TMP_FILE_1 + ' ' +
+ domains_mapfile_name + ' ' + TMP_FILE_2
+ puts cmd
+ begin
+ execute_cmd( cmd, log )
+ rescue Error
+ Util.fatal_error( PRG_NAME, 'error: ' + $! )
+ end
end
# Name mapping: applied directly to the input tree when IDS_ONLY, otherwise to
# the output of pass 2.
- if ids_mapfile_name != nil
- if domains_mapfile_name != nil
- my_infile = TMP_FILE
- else
- my_infile = phylogeny_file
+ if IDS_ONLY
+ cmd = decorator + ' ' + DECORATOR_OPTIONS_SEQ_NAMES + ' ' +
+ '-f=n ' + phylogeny_file + ' ' +
+ ids_mapfile_name + ' ' + outfile
+ puts cmd
+ begin
+ execute_cmd( cmd, log )
+ rescue Error
+ Util.fatal_error( PRG_NAME, 'error: ' + $! )
end
+ else
cmd = decorator + ' ' + DECORATOR_OPTIONS_SEQ_NAMES + ' ' +
- '-f=s ' + my_infile + ' ' +
- ids_mapfile_name + ' ' + outfile
- execute_cmd( cmd, log )
- end
-
- if ( File.exists?( TMP_FILE ) )
- File.delete( TMP_FILE )
+ '-f=n ' + TMP_FILE_2 + ' ' +
+ ids_mapfile_name + ' ' + outfile
+ puts cmd
+ begin
+ execute_cmd( cmd, log )
+ rescue Error
+ Util.fatal_error( PRG_NAME, 'error: ' + $! )
+ end
# NOTE(review): File.delete raises Errno::ENOENT if a pass did not create its
# temp file; the removed code guarded this with an existence check.
+ File.delete( TMP_FILE_1 )
+ File.delete( TMP_FILE_2 )
end
end
}
end # def run
# Runs +cmd+ as a child process and appends what it prints to +log+.
#
# cmd:: the complete command line, as one String
# log:: String buffer; receives an 'executing ...' line and then the output
#       read from the child's pipe
#
# The write end of the pipe is closed immediately, so the child gets no input.
# NOTE(review): cmd is given to IO.popen as a single String, which Ruby hands
# to the shell; file names containing spaces or shell metacharacters would
# break the command -- confirm inputs are safe.
# NOTE(review): only one 'end' is visible in this listing for both the do-block
# and the method; the other is presumably elided, so it cannot be told from
# here whether the sleep happens inside or after the popen block.
def execute_cmd( cmd, log )
- log << 'excuting ' + cmd + NL
+ log << 'executing ' + cmd + NL
IO.popen( cmd , 'r+' ) do | pipe |
pipe.close_write
log << pipe.read + NL + NL
sleep( SLEEP_TIME )
end
-
# Derives the phylogeny id from the leading part of a file name.
#
# The following patterns are tried in order and the first capture wins:
#   1. the shortest prefix of the form "<a>_<b>" that is followed by "_"
#   2. everything before the first "__"
#   3. everything before the first "_"
#
# phylogeny_file_name:: file name to inspect; non-String values are converted
#                       with to_s (so nil yields nil)
#
# Returns the id as a String, or nil when the name contains no usable "_".
def get_id( phylogeny_file_name )
  name = phylogeny_file_name.to_s
  # A capture group here can never be empty, so chaining with || is
  # equivalent to an if/elsif cascade over the three patterns.
  name[ /^(.+?_.+?)_/, 1 ] ||
    name[ /^(.+?)__/, 1 ] ||
    name[ /^(.+?)_/, 1 ]
end
# Finds the one file that belongs to +phylogeny_id+ and carries the given suffix.
#
# files_in_dir::   names of the entries of the current directory
# phylogeny_id::   id as returned by get_id
# suffix_pattern:: suffix (pattern) the wanted file must end with
#
# The matching itself is delegated to Util.get_matching_files. Calls
# Util.fatal_error when no file, or more than one file, matches; otherwise
# returns the name of the single match.
def get_file( files_in_dir, phylogeny_id, suffix_pattern )
  matching_files = Util.get_matching_files( files_in_dir, phylogeny_id, suffix_pattern )
  if matching_files.length < 1
    Util.fatal_error( PRG_NAME,
      "no file matching [#{phylogeny_id}...#{suffix_pattern}] present in current directory" )
  elsif matching_files.length > 1
    Util.fatal_error( PRG_NAME,
      "more than one file matching [#{phylogeny_id}...#{suffix_pattern}] present in current directory" )
  end
  matching_files[ 0 ]
end
+
# Finds the one sequence file that belongs to +phylogeny_id+.
#
# files_in_dir:: names of the entries of the current directory
# phylogeny_id:: id as returned by get_id
#
# A sequence file is a regular file, not starting with "." or "00", whose name
# is either "<id>__<anything>" ending in a digit, or "<id>_<anything>.fasta".
#
# Calls Util.fatal_error when no file, or more than one file, matches;
# otherwise returns the name of the single match.
def get_seq_file( files_in_dir, phylogeny_id )
  # The id is matched literally: without escaping, an id such as "A.B" would
  # also match "AxB_..." because "." is a regex metacharacter.
  id = Regexp.escape( phylogeny_id.to_s )
  numbered_name = /^#{id}__.+\d$/
  fasta_name = /^#{id}_.*\.fasta$/

  matching_files = files_in_dir.select { | file |
    !File.directory?( file ) &&
      file !~ /^\./ &&
      file !~ /^00/ &&
      ( file =~ numbered_name || file =~ fasta_name )
  }

  if matching_files.length < 1
    Util.fatal_error( PRG_NAME, 'no seq file matching [' +
      phylogeny_id + '_] present in current directory' )
  end
  if matching_files.length > 1
    Util.fatal_error( PRG_NAME, 'more than one seq file matching [' +
      phylogeny_id + '_] present in current directory' )
  end
  matching_files[ 0 ]
end
# Prints the command-line usage summary to standard output.
#
# Takes no arguments and does not terminate the program; callers decide
# whether to exit afterwards.
def print_help
  puts "Usage:"
  puts
  puts " #{PRG_NAME}.rb <suffix of intrees to be decorated> <suffix for decorated outtrees> "
  puts
  puts
end
end # class PhylogeniesDecorator