class MsaProcessor
PRG_NAME = "msa_pro"
- PRG_DATE = "170215"
+ PRG_DATE = "170609"
PRG_DESC = "processing of multiple sequence alignments"
- PRG_VERSION = "1.08"
+ PRG_VERSION = "1.09"
WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
NAME_LENGTH_DEFAULT = 10
REMOVE_MATCHING_SEQUENCES_OPTION = "mr"
TRIM_OPTION = "t"
+ SLIDING_EXTRACTION_OPTION = "se"
REMOVE_SEQS_GAP_RATIO_OPTION = "rsgr"
REMOVE_SEQS_NON_GAP_LENGTH_OPTION = "rsl"
SPLIT = "split"
@split_by_os = false
@first = -1
@last = -1
+ @window = false
+ @step = -1
+ @size = -1
end
def run()
allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION )
allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION )
allowed_opts.push( DIE_IF_NAME_TOO_LONG )
+ allowed_opts.push( SLIDING_EXTRACTION_OPTION )
disallowed = cla.validate_allowed_options_as_str( allowed_opts )
if ( disallowed.length > 0 )
puts( "Split : " + @split.to_s )
log << "Split : " + @split.to_s + ld
end
+ if @window
+ puts( "Sliding window extraction: true" )
+ log << "Sliding window extraction: true" + ld
+ puts( "Sliding window step : " + @step.to_s )
+ log << "Sliding window step : " + @step.to_s + ld
+ puts( "Sliding window size : " + @size.to_s )
+ log << "Sliding window size : " + @size.to_s + ld
+ end
+
puts()
f = MsaFactory.new()
end
if ( @trim )
- msa.trim!( @first, @last )
+ msa.trim!( @first, @last, '_S' )
+ end
+
+ if @window
+ msas = msa.sliding_extraction( @step, @size )
+ begin
+ io = MsaIO.new()
+ w = MsaWriter
+ if ( @pi_output )
+ w = PhylipSequentialWriter.new()
+ w.clean( @clean )
+ w.set_max_name_length( @name_length )
+ elsif( @fasta_output )
+ w = FastaWriter.new()
+ w.set_line_width( @width )
+ w.clean( @clean )
+ if ( @name_length_set )
+ w.set_max_name_length( @name_length )
+ end
+ elsif( @nexus_output )
+ w = NexusWriter.new()
+ w.clean( @clean )
+ w.set_max_name_length( @name_length )
+ end
+ i = 0
+ for m in msas
+ i = i + 1
+ name = output + "_" + m.get_name
+ if @fasta_output
+ name += ".fasta"
+ elsif @nexus_output
+ name += ".nex"
+ end
+ io.write_to_file( m, name, w )
+ end
+ Util.print_message( PRG_NAME, "wrote " + msas.length.to_s + " files" )
+ log << "wrote " + msas.length.to_s + " files" + ld
+ rescue Exception => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ end
end
+
if( @rgr >= 0 )
msa.remove_gap_columns_w_gap_ratio!( @rgr )
elsif ( @rgc )
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
- if (@split <= 0) && (!@split_by_os)
+ if (@split <= 0) && (!@split_by_os) && (!@window)
unless ( @rg )
if ( msa.is_aligned() )
@trim = false
@first = -1
@last = -1
+ @window = false
+ end
+
+ def set_window()
+ @split = -1
+ @split_by_os = false
+ @rgc = false
+ @rgoc = false
+ @rg = false # fasta only
+ @rgr = -1
+ @rsgr = -1
+ @rsl = -1
+ @seqs_name_file = nil
+ @remove_seqs = false
+ @keep_seqs = false
+ @trim = false
+ @first = -1
+ @last = -1
+ @window = true
end
def analyze_command_line( cla )
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
end
+ if ( cla.is_option_set?( SLIDING_EXTRACTION_OPTION ) )
+ begin
+ s = cla.get_option_value( SLIDING_EXTRACTION_OPTION )
+ if ( s =~ /(\d+)\/(\d+)/ )
+ set_window
+ @window = true
+ @step = $1.to_i()
+ @size = $2.to_i()
+ else
+ puts( "illegal argument" )
+ print_help
+ exit( -1 )
+ end
+ if (@step <= 0) || (@size <= 0)
+ puts( "illegal argument" )
+ print_help
+ exit( -1 )
+ end
+ rescue ArgumentError => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ end
+ end
+
if ( cla.is_option_set?( REMOVE_SEQS_GAP_RATIO_OPTION ) )
begin
f = cla.get_option_value_as_float( REMOVE_SEQS_GAP_RATIO_OPTION )
puts()
puts( " " + PRG_NAME + ".rb [options] <input alignment> <output>" )
puts()
- puts( " options: -" + INPUT_TYPE_OPTION + "=<input type>: f for fasta, p for phylip selex type" )
- puts( " -" + OUTPUT_TYPE_OPTION + "=<output type>: f for fasta, n for nexus, p for phylip sequential (default)" )
+ puts( " options: -" + INPUT_TYPE_OPTION + "=<input type>: f for fasta (default), p for phylip/selex type" )
+ puts( " -" + OUTPUT_TYPE_OPTION + "=<output type>: f for fasta (default), n for nexus, p for phylip sequential" )
puts( " -" + MAXIMAL_NAME_LENGTH_OPTION + "=<n>: n=maximal name length (default for phylip 10, for fasta: unlimited )" )
puts( " -" + DIE_IF_NAME_TOO_LONG + ": die if sequence name too long" )
puts( " -" + WIDTH_OPTION + "=<n>: n=width (fasta output only, default is 60)" )
puts( " -" + KEEP_MATCHING_SEQUENCES_OPTION + "=<s> keep only sequences with names containing s" )
puts( " -" + SPLIT + "=<n> split a fasta file into n files of equal number of sequences (expect for " )
puts( " last one), cannot be used with other options" )
+ puts( " -" + SLIDING_EXTRACTION_OPTION + "=<step>/<window size>: sliding window extraction, cannot be used with other options" )
puts( " -" + REM_RED_OPTION + ": remove redundant sequences" )
puts()
end
DECORATOR_OPTIONS_SEQ_NAMES = '-p -t -mp -or'
DECORATOR_OPTIONS_DOMAINS = '-p -t'
- DOMAINS_MAPFILE_SUFFIX = '.dff'
SLEEP_TIME = 0.01
REMOVE_NI = true
TMP_FILE_1 = '___PD1___'
JAVA_HOME = ENV[Constants::JAVA_HOME_ENV_VARIABLE]
PRG_NAME = "phylogenies_decorator"
- PRG_DATE = "170427"
+ PRG_DATE = "170428"
PRG_DESC = "decoration of phylogenies with sequence/species names and domain architectures"
- PRG_VERSION = "1.04"
+ PRG_VERSION = "1.05"
WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
HELP_OPTION_1 = "help"
exit( 0 )
end
- if ( cla.get_number_of_files != 2 )
+ if ( cla.get_number_of_files != 2 && cla.get_number_of_files != 3 )
print_help
exit( -1 )
end
verbose = true
end
- if File.exist?( LOG_FILE )
+ if File.exist? LOG_FILE
Util.fatal_error( PRG_NAME, 'logfile [' + LOG_FILE + '] already exists' )
end
in_suffix = cla.get_file_name( 0 )
out_suffix = cla.get_file_name( 1 )
+ mapping_files_dir = nil
+
+ if cla.get_number_of_files == 3
+ mapping_files_dir = cla.get_file_name( 2 )
+ else
+ mapping_files_dir = Dir.getwd
+ end
+ unless File.exist? mapping_files_dir
+ Util.fatal_error( PRG_NAME, 'mapping files directory [' + mapping_files_dir + '] does not exist' )
+ end
+ unless File.directory? mapping_files_dir
+ Util.fatal_error( PRG_NAME, '[' + mapping_files_dir + '] is not a directory' )
+ end
+ if Dir.entries(mapping_files_dir).length <= 2
+ Util.fatal_error( PRG_NAME, 'mapping files directory [' + mapping_files_dir + '] is empty' )
+ end
+
+ mapping_files_dir = Util.canonical_path( mapping_files_dir.to_s )
+
log = String.new
now = DateTime.now
log << "Program : " + PRG_NAME + NL
log << "Version : " + PRG_VERSION + NL
log << "Program date : " + PRG_DATE + NL
+ log << "Input/Output dir : " + Dir.getwd + NL
+ log << "Mappings file dir : " + mapping_files_dir + NL
+ log << "Input suffix : " + in_suffix + NL
+ log << "Output suffix : " + out_suffix + NL
log << "No domains data : " + no_domains.to_s + NL
log << "No mol seq data : " + no_seqs_files.to_s + NL
log << "Extract tax codes : " + extr_bracketed_tc.to_s + NL
- log << "Date/time: " + now.to_s + NL
- log << "Directory: " + Dir.getwd + NL + NL
-
- Util.print_message( PRG_NAME, 'input suffix : ' + in_suffix )
- Util.print_message( PRG_NAME, 'output suffix : ' + out_suffix )
+ log << "Date/time: " + now.to_s + NL + NL
- log << 'input suffix : ' + in_suffix + NL
- log << 'output suffix : ' + out_suffix + NL
+ Util.print_message( PRG_NAME, 'Input/Output dir : ' + Dir.getwd )
+ Util.print_message( PRG_NAME, 'Mappings file dir: ' + mapping_files_dir )
+ Util.print_message( PRG_NAME, 'Input suffix : ' + in_suffix )
+ Util.print_message( PRG_NAME, 'Output suffix : ' + out_suffix )
+ Util.print_message( PRG_NAME, 'No domains data : ' + no_domains.to_s )
+ Util.print_message( PRG_NAME, 'No mol seq data : ' + no_seqs_files.to_s )
+ Util.print_message( PRG_NAME, 'Extract tax codes: ' + extr_bracketed_tc.to_s )
if ( File.exist?( TMP_FILE_1 ) )
File.delete( TMP_FILE_1 )
begin
Util.check_file_for_readability( phylogeny_file )
rescue ArgumentError
- Util.fatal_error( PRG_NAME, 'can not read from: ' + phylogeny_file + ': '+ $! )
+ Util.fatal_error( PRG_NAME, 'can not read from: ' + phylogeny_file + ': '+ $!.to_s )
end
counter += 1
Util.print_message( PRG_NAME, counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile )
log << counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile + NL
- phylogeny_id = get_id( phylogeny_file )
+ phylogeny_id = phylogeny_file
if phylogeny_id == nil || phylogeny_id.size < 1
Util.fatal_error( PRG_NAME, 'could not get id from ' + phylogeny_file.to_s )
end
end
log << "Id: " + phylogeny_id + NL
- ids_mapfile_name = nil
- domains_mapfile_name = nil
- seqs_file_name = nil
+ ids_mapfile_path = nil
+ domains_mapfile_path = nil
+ seqs_file_path = nil
- ids_mapfile_name = get_file( ".", phylogeny_id, Constants::ID_MAP_FILE_SUFFIX )
+ ids_mapfile_name = get_file( mapping_files_dir, phylogeny_id, Constants::ID_MAP_FILE_SUFFIX )
+ ids_mapfile_path = Util.canonical_path(mapping_files_dir, ids_mapfile_name)
begin
- Util.check_file_for_readability( ids_mapfile_name )
+ Util.check_file_for_readability( ids_mapfile_path)
rescue IOError
- Util.fatal_error( PRG_NAME, 'failed to read from [#{ids_mapfile_name}]: ' + $! )
+ Util.fatal_error( PRG_NAME, "failed to read from [#{ids_mapfile_path}]: " + $!.to_s )
end
if verbose
- Util.print_message( PRG_NAME, "Ids mapfile: " + ids_mapfile_name )
+ Util.print_message( PRG_NAME, "Ids mapfile: " + ids_mapfile_path )
end
- log << "Ids mapfile: " + ids_mapfile_name + NL
+ log << "Ids mapfile: " + ids_mapfile_path + NL
unless no_seqs_files
- seqs_file_name = get_file( ".", phylogeny_id, Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX )
+ seqs_file_name = get_file( mapping_files_dir, phylogeny_id, Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX )
+ seqs_file_path = Util.canonical_path(mapping_files_dir, seqs_file_name)
begin
- Util.check_file_for_readability( seqs_file_name )
+ Util.check_file_for_readability( seqs_file_path )
rescue IOError
- Util.fatal_error( PRG_NAME, 'failed to read from [#{seqs_file_name }]: ' + $! )
+ Util.fatal_error( PRG_NAME, "failed to read from [#{seqs_file_path}]: " + $!.to_s )
end
if verbose
- Util.print_message( PRG_NAME, "Seq file: " + seqs_file_name )
+ Util.print_message( PRG_NAME, "Seq file: " + seqs_file_path )
end
- log << "Seq file: " + seqs_file_name + NL
+ log << "Seq file: " + seqs_file_path + NL
end
unless no_domains
- domains_mapfile_name = get_file( ".", phylogeny_id, Constants::DOMAINS_TO_FORESTER_OUTFILE_SUFFIX )
+ domains_mapfile_name = get_file( mapping_files_dir , phylogeny_id, Constants::DOMAINS_TO_FORESTER_OUTFILE_SUFFIX )
+ domains_mapfile_path = Util.canonical_path(mapping_files_dir, domains_mapfile_name)
begin
- Util.check_file_for_readability( domains_mapfile_name )
+ Util.check_file_for_readability( domains_mapfile_path )
rescue IOError
- Util.fatal_error( PRG_NAME, 'failed to read from [#{domains_mapfile_name}]: ' + $! )
+ Util.fatal_error( PRG_NAME, "failed to read from [#{domains_mapfile_path}]: " + $!.to_s )
end
if verbose
- Util.print_message( PRG_NAME, "Domains file: " + domains_mapfile_name )
+ Util.print_message( PRG_NAME, "Domains file: " + domains_mapfile_path )
end
- log << "Domains file: " + domains_mapfile_name + NL
+ log << "Domains file: " + domains_mapfile_path + NL
end
+ log << NL + NL
+
if no_seqs_files
FileUtils.cp(phylogeny_file, TMP_FILE_1)
else
cmd = decorator +
' -t -p -f=m ' + phylogeny_file + ' ' +
- seqs_file_name + ' ' + TMP_FILE_1
+ seqs_file_path + ' ' + TMP_FILE_1
if verbose
puts cmd
end
begin
execute_cmd( cmd, log )
- rescue Error
- Util.fatal_error( PRG_NAME, 'error: ' + $! )
+ rescue Exception
+ Util.fatal_error( PRG_NAME, 'error: ' + $!.to_s )
end
end
unless no_domains
cmd = decorator + ' ' + DECORATOR_OPTIONS_DOMAINS + ' ' +
'-f=d ' + TMP_FILE_1 + ' ' +
- domains_mapfile_name + ' ' + TMP_FILE_2
+ domains_mapfile_path + ' ' + TMP_FILE_2
if verbose
puts cmd
end
begin
execute_cmd( cmd, log )
- rescue Error
- Util.fatal_error( PRG_NAME, 'error: ' + $! )
+ rescue Exception
+ Util.fatal_error( PRG_NAME, 'error: ' + $!.to_s )
end
end
if no_domains
cmd = decorator + ' ' + opts + ' -f=n ' + TMP_FILE_1 + ' ' +
- ids_mapfile_name + ' ' + outfile
+ ids_mapfile_path + ' ' + outfile
if verbose
puts cmd
end
begin
execute_cmd( cmd, log )
- rescue Error
- Util.fatal_error( PRG_NAME, 'error: ' + $! )
+ rescue Exception
+ Util.fatal_error( PRG_NAME, 'error: ' + $!.to_s )
end
File.delete( TMP_FILE_1 )
else
cmd = decorator + ' ' + opts + ' -f=n ' + TMP_FILE_2 + ' ' +
- ids_mapfile_name + ' ' + outfile
+ ids_mapfile_path + ' ' + outfile
if verbose
puts cmd
end
begin
execute_cmd( cmd, log )
- rescue Error
- Util.fatal_error( PRG_NAME, 'error: ' + $! )
+ rescue Exception
+ Util.fatal_error( PRG_NAME, 'error: ' + $!.to_s )
end
File.delete( TMP_FILE_1 )
File.delete( TMP_FILE_2 )
pipe.close_write
log << pipe.read + NL + NL
end
+ if $?.to_i != 0
+ raise StandardError, "failed to execute " + cmd
+ end
sleep( SLEEP_TIME )
end
- def get_id( phylogeny_file_name )
- return phylogeny_file_name
- #if phylogeny_file_name =~ /^(.+?_DA)_/
- # return $1
- #elsif phylogeny_file_name =~ /^(.+?)_/
- # return $1
- #end
- #nil
- end
-
def get_file( files_in_dir, phylogeny_id, suffix_pattern )
- Util.get_matching_file( files_in_dir, phylogeny_id, suffix_pattern )
- # matching_files = Util.get_matching_files( files_in_dir, phylogeny_id, suffix_pattern )
- # if matching_files.length < 1
- # Util.fatal_error( PRG_NAME, 'no file matching [' + phylogeny_id +
- # '...' + suffix_pattern + '] present in current directory' )
- # end
- # if matching_files.length > 1
- # Util.fatal_error( PRG_NAME, 'more than one file matching [' +
- # phylogeny_id + '...' + suffix_pattern + '] present in current directory' )
- # end
- # matching_files[ 0 ]
- end
-
- def get_seq_file( files_in_dir, phylogeny_id )
- matching_files = Array.new
-
- files_in_dir.each { | file |
-
- if ( !File.directory?( file ) &&
- file !~ /^\./ &&
- file !~ /^00/ &&
- ( file =~ /^#{phylogeny_id}__.+\d$/ || file =~ /^#{phylogeny_id}_.*\.fasta$/ ) )
- matching_files << file
- end
- }
-
- if matching_files.length < 1
- Util.fatal_error( PRG_NAME, 'no seq file matching [' +
- phylogeny_id + '_] present in current directory' )
- end
- if matching_files.length > 1
- Util.fatal_error( PRG_NAME, 'more than one seq file matching [' +
- phylogeny_id + '_] present in current directory' )
+ begin
+ Util.get_matching_file( files_in_dir, phylogeny_id, suffix_pattern )
+ rescue Exception
+ Util.fatal_error( PRG_NAME, 'error: ' + $!.to_s )
end
- matching_files[ 0 ]
end
def print_help()
puts "Usage:"
puts
- puts " " + PRG_NAME + ".rb <suffix of in-trees to be decorated> <suffix for decorated out-trees> "
+ puts " " + PRG_NAME + ".rb [options] <suffix of in-trees to be decorated> <suffix for decorated out-trees> [mapping files directory, default: current dir]"
puts
- puts " required files (in this dir): " + "name mappings : .nim"
- puts " " + "sequences : _ni.fasta"
- puts " " + "domain architectures: .dff"
+ puts " " + PRG_NAME + ".rb [options] <input directory> <output directory> <mapping files directory>"
puts
- puts " options: -" + NO_DOMAINS_OPTION + ": to not add domain architecture information (.dff file)"
- puts " -" + NO_SEQS_OPTION + ": to not add molecular sequence information (_ni.fasta file)"
+ puts " required file (in mapping files directory): " + "name mappings : #{Constants::ID_MAP_FILE_SUFFIX}"
+ puts " optional files (in mapping files directory): " + "sequences : #{Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX}"
+ puts " " + "domain architectures: #{Constants::DOMAINS_TO_FORESTER_OUTFILE_SUFFIX}"
+ puts
+ puts " options: -" + NO_DOMAINS_OPTION + ": to not add domain architecture information (#{Constants::DOMAINS_TO_FORESTER_OUTFILE_SUFFIX} file)"
+ puts " -" + NO_SEQS_OPTION + ": to not add molecular sequence information (#{Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX} file)"
puts " -" + EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION + ": to extract bracketed taxonomic codes, e.g. [NEMVE]"
- puts " -" + VERBOSE_OPTION + ": verbose"
+ puts " -" + VERBOSE_OPTION + " : verbose"
puts
puts "Examples: " + PRG_NAME + ".rb .xml _d.xml"
puts " " + PRG_NAME + ".rb -#{NO_DOMAINS_OPTION} -#{NO_SEQS_OPTION} .xml _d.xml"
+ puts " " + PRG_NAME + ".rb -#{NO_DOMAINS_OPTION} -#{NO_SEQS_OPTION} .xml _d.xml mappings_dir"
+ puts
+ puts " " + PRG_NAME + ".rb in_trees_dir out_dir mappings_dir"
puts
end
end # class PhylogenyiesDecorator