require 'date'
require 'set'
-
require 'lib/evo/util/constants'
require 'lib/evo/util/util'
require 'lib/evo/util/command_line_arguments'
require 'lib/evo/io/writer/msa_writer'
module Evoruby
-
class MsaProcessor
# Program metadata; run passes PRG_NAME, PRG_VERSION, PRG_DESC, PRG_DATE and
# WWW to Util.print_program_information.
PRG_NAME = "msa_pro"
- PRG_DATE = "131112"
+ PRG_DATE = "170609"
PRG_DESC = "processing of multiple sequence alignments"
- PRG_VERSION = "1.08"
- COPYRIGHT = "2008-2010 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
-
+ PRG_VERSION = "1.09"
+ WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
# Defaults quoted in print_help: name length 10 (phylip), line width 60 (fasta).
NAME_LENGTH_DEFAULT = 10
WIDTH_DEFAULT_FASTA = 60
# Command-line option names (checked with cla.is_option_set? and listed in
# allowed_opts inside run).
REMOVE_MATCHING_SEQUENCES_OPTION = "mr"
TRIM_OPTION = "t"
# Value is expected as "<step>/<window size>" (see analyze_command_line).
+ SLIDING_EXTRACTION_OPTION = "se"
REMOVE_SEQS_GAP_RATIO_OPTION = "rsgr"
REMOVE_SEQS_NON_GAP_LENGTH_OPTION = "rsl"
SPLIT = "split"
+ SPLIT_BY_OS = "split_by_os"
# NOTE(review): presumably appended to the output name to build the log file
# name; the use is not visible in this excerpt -- confirm.
LOG_SUFFIX = "_msa_pro.log"
# Either of these options makes run print the help text and exit( 0 ).
HELP_OPTION_1 = "help"
HELP_OPTION_2 = "h"
-
-
# Sets the processing flags visible here to their "off" defaults: no explicit
# input/output format chosen, no keep-list, no trimming, no splitting and no
# sliding-window extraction.
def initialize()
@input_format_set = false
@output_format_set = false
@keep_seqs = false
@trim = false
# -1 means "not set"; run only splits when @split > 0.
@split = -1
+ @split_by_os = false
# Trim bounds; run passes them to msa.trim! when @trim is true.
@first = -1
@last = -1
+ @window = false
# Sliding-window step and size; analyze_command_line rejects values <= 0.
+ @step = -1
+ @size = -1
end
-
def run()
Util.print_program_information( PRG_NAME,
- PRG_VERSION,
- PRG_DESC,
- PRG_DATE,
- COPYRIGHT,
- CONTACT,
- WWW,
- STDOUT )
+ PRG_VERSION,
+ PRG_DESC,
+ PRG_DATE,
+ WWW,
+ STDOUT )
if ( ARGV == nil || ARGV.length < 1 )
Util.print_message( PRG_NAME, "Illegal number of arguments" )
end
if ( cla.is_option_set?( HELP_OPTION_1 ) ||
- cla.is_option_set?( HELP_OPTION_2 ) )
+ cla.is_option_set?( HELP_OPTION_2 ) )
print_help
exit( 0 )
end
allowed_opts.push( REMOVE_SEQS_GAP_RATIO_OPTION )
allowed_opts.push( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
allowed_opts.push( SPLIT )
+ allowed_opts.push( SPLIT_BY_OS )
allowed_opts.push( REM_RED_OPTION )
allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION )
allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION )
allowed_opts.push( DIE_IF_NAME_TOO_LONG )
+ allowed_opts.push( SLIDING_EXTRACTION_OPTION )
disallowed = cla.validate_allowed_options_as_str( allowed_opts )
if ( disallowed.length > 0 )
Util.fatal_error( PRG_NAME,
- "unknown option(s): " + disallowed )
+ "unknown option(s): " + disallowed )
end
input = cla.get_file_name( 0 )
puts( "Remove redundant sequences: true" )
log << "Remove redundant sequences: true" + ld
end
+ if ( @split_by_os )
+ puts( "Split by OS : true" )
+ log << "Split : true" + ld
+ end
if ( @split > 0 )
puts( "Split : " + @split.to_s )
log << "Split : " + @split.to_s + ld
end
+ if @window
+ puts( "Sliding window extraction: true" )
+ log << "Sliding window extraction: true" + ld
+ puts( "Sliding window step : " + @step.to_s )
+ log << "Sliding window step : " + @step.to_s + ld
+ puts( "Sliding window size : " + @size.to_s )
+ log << "Sliding window size : " + @size.to_s + ld
+ end
+
puts()
f = MsaFactory.new()
end
if ( @trim )
- msa.trim!( @first, @last )
+ msa.trim!( @first, @last, '_S' )
+ end
+
+ if @window
+ msas = msa.sliding_extraction( @step, @size, @size / 2, '_Q' )
+ begin
+ io = MsaIO.new()
+ w = MsaWriter
+ if ( @pi_output )
+ w = PhylipSequentialWriter.new()
+ w.clean( @clean )
+ w.set_max_name_length( @name_length )
+ elsif( @fasta_output )
+ w = FastaWriter.new()
+ w.set_line_width( @width )
+ w.clean( @clean )
+ if ( @name_length_set )
+ w.set_max_name_length( @name_length )
+ end
+ elsif( @nexus_output )
+ w = NexusWriter.new()
+ w.clean( @clean )
+ w.set_max_name_length( @name_length )
+ end
+ for m in msas
+ name = output + "_" + m.get_name
+ if @fasta_output
+ name += ".fasta"
+ elsif @nexus_output
+ name += ".nex"
+ end
+ io.write_to_file( m, name, w )
+ end
+ Util.print_message( PRG_NAME, "wrote " + msas.length.to_s + " files" )
+ log << "wrote " + msas.length.to_s + " files" + ld
+ rescue Exception => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ end
end
+
if( @rgr >= 0 )
msa.remove_gap_columns_w_gap_ratio!( @rgr )
elsif ( @rgc )
msa = sort( msa )
end
+ if ( @split_by_os )
+ begin
+ msa_hash = msa.split_by_os(true)
+ io = MsaIO.new()
+ w = MsaWriter
+ if ( @pi_output )
+ w = PhylipSequentialWriter.new()
+ w.clean( @clean )
+ w.set_max_name_length( @name_length )
+ elsif( @fasta_output )
+ w = FastaWriter.new()
+ w.set_line_width( @width )
+ if ( @rg )
+ w.remove_gap_chars( true )
+ Util.print_warning_message( PRG_NAME, "removing gap character, the output is likely to become unaligned" )
+ log << "removing gap character, the output is likely to become unaligned" + ld
+ end
+ w.clean( @clean )
+ if ( @name_length_set )
+ w.set_max_name_length( @name_length )
+ end
+ elsif( @nexus_output )
+ w = NexusWriter.new()
+ w.clean( @clean )
+ w.set_max_name_length( @name_length )
+ end
+ msa_hash.each do |os, m|
+ my_os = os.gsub(' ', '_').gsub('/', '_').gsub('(', '_').gsub(')', '_')
+ io.write_to_file( m, output + '_' + my_os, w )
+ end
+ Util.print_message( PRG_NAME, "wrote " + msa_hash.length.to_s + " files" )
+ log << "wrote " + msa_hash.length.to_s + " files" + ld
+ rescue Exception => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ end
- if ( @split > 0 )
+ elsif ( @split > 0 )
begin
msas = msa.split( @split, true )
io = MsaIO.new()
rescue Exception => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
-
end
rescue Exception => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
- if ( @split <= 0 )
+ if (@split <= 0) && (!@split_by_os) && (!@window)
unless ( @rg )
if ( msa.is_aligned() )
if removed.size > 0
identicals = msa.get_identical_seqs_detected
log << "the following " + identicals.size.to_s + " sequences are identical:" + ld
- identicals.each { | s |
- log << s + ld
+ identicals.each { | identical |
+ log << identical + ld
}
log << "ignoring the following " + removed.size.to_s + " redundant sequences:" + ld
removed.each { | seq_name |
Util.fatal_error( PRG_NAME, "error: " + e.to_s )
end
-
end
Util.print_message( PRG_NAME, "OK" )
puts
end
-
private
def sort( msa )
@fasta_input = fi
@input_format_set = true
end
+
# Stores the phylip-input flag and records that an input format was chosen.
#
# pi:: value kept in @phylip_input (true when omitted)
# Returns true, the value assigned to @input_format_set.
def set_phylip_input(pi = true)
  @phylip_input = pi
  @input_format_set = true
end
+
# Stores the maximal sequence-name length and remembers that it was given
# explicitly (the fasta writer branches in run only apply the limit when
# @name_length_set is true).
#
# i:: value kept in @name_length
# Returns true, the value assigned to @name_length_set.
def set_name_length(i)
  @name_length = i
  @name_length_set = true
end
+
# Stores the line width that run hands to FastaWriter#set_line_width.
#
# i:: value kept in @width
# Returns i.
def set_width(i)
  @width = i
end
+
# Stores the fasta-output flag (run picks FastaWriter when it is true) and
# records that an output format was chosen.
#
# fo:: value kept in @fasta_output (true when omitted)
# Returns true, the value assigned to @output_format_set.
def set_fasta_output(fo = true)
  @fasta_output = fo
  @output_format_set = true
end
+
# Stores the phylip-output flag (run picks PhylipSequentialWriter when it is
# true) and records that an output format was chosen.
#
# pso:: value kept in @pi_output (true when omitted)
# Returns true, the value assigned to @output_format_set.
def set_pi_output(pso = true)
  @pi_output = pso
  @output_format_set = true
end
+
# Stores the nexus-output flag (run picks NexusWriter when it is true) and
# records that an output format was chosen.
#
# nexus:: value kept in @nexus_output (true when omitted)
# Returns true, the value assigned to @output_format_set.
def set_nexus_output(nexus = true)
  @nexus_output = nexus
  @output_format_set = true
end
+
# Stores the flag that run forwards to the writers' clean method.
#
# c:: value kept in @clean (true when omitted)
# Returns c.
def set_clean(c = true)
  @clean = c
end
+
# Stores the remove-gap-columns flag in @rgc.
#
# rgc:: value kept in @rgc (true when omitted)
# Returns rgc.
def set_remove_gap_columns(rgc = true)
  @rgc = rgc
end
+
# Stores the remove-gap-only-columns flag in @rgoc.
#
# rgoc:: value kept in @rgoc (true when omitted)
# Returns rgoc.
def set_remove_gap_only_columns(rgoc = true)
  @rgoc = rgoc
end
+
# Stores the remove-gaps flag in @rg; when it is true, run asks the fasta
# writer to strip gap characters.
#
# rg:: value kept in @rg (true when omitted)
# Returns rg.
def set_remove_gaps(rg = true)
  @rg = rg
end
+
# Stores the gap ratio in @rgr; run passes it to
# msa.remove_gap_columns_w_gap_ratio! when it is >= 0.
#
# rgr:: value kept in @rgr
# Returns rgr.
def set_remove_gap_ratio(rgr)
  @rgr = rgr
end
+
# Stores the per-sequence gap ratio threshold in @rsgr.
#
# rsgr:: value kept in @rsgr
# Returns rsgr.
def set_remove_seqs_gap_ratio(rsgr)
  @rsgr = rsgr
end
+
# Stores the minimal non-gap sequence length in @rsl.
#
# rsl:: value kept in @rsl
# Returns rsl.
def set_remove_seqs_min_non_gap_length(rsl)
  @rsl = rsl
end
+
# Switches to "remove listed sequences" mode: remembers the name file and
# clears the opposite "keep" mode.
#
# file:: value kept in @seqs_name_file
# Returns false, the value assigned to @keep_seqs.
def set_remove_seqs(file)
  @remove_seqs = true
  @seqs_name_file = file
  @keep_seqs = false
end
+
# Switches to "keep listed sequences" mode: remembers the name file and
# clears the opposite "remove" mode.
#
# file:: value kept in @seqs_name_file
# Returns false, the value assigned to @remove_seqs.
def set_keep_seqs(file)
  @keep_seqs = true
  @seqs_name_file = file
  @remove_seqs = false
end
+
# Enables trimming and stores the bounds that run passes to msa.trim!.
#
# first:: value kept in @first
# last::  value kept in @last
# Returns last.
def set_trim(first, last)
  @first = first
  @trim = true
  @last = last
end
+
# Stores the remove-matching value in @remove_matching.
#
# remove:: value kept in @remove_matching
# Returns remove.
def set_remove_matching(remove)
  @remove_matching = remove
end
+
# Stores the keep-matching value in @keep_matching.
#
# keep:: value kept in @keep_matching
# Returns keep.
def set_keep_matching(keep)
  @keep_matching = keep
end
+
# Stores the remove-redundant flag in @rem_red.
#
# rr:: value kept in @rem_red
# Returns rr.
def set_rem_red(rr)
  @rem_red = rr
end
-
-
# Enables splitting with the count s. A non-positive s is ignored and leaves
# all state untouched. When applied, it also switches off split-by-OS and the
# clean / gap-column flags.
# NOTE(review): unlike set_split_by_os, this does not reset @window -- confirm
# whether combining split with sliding-window extraction is intended.
def set_split( s )
if ( s > 0 )
@split = s
+ @split_by_os = false
@clean = false # phylip only
@rgc = false
@rgoc = false
end
end
# Switches to split-by-OS mode and resets every other processing option
# assigned below (numeric split, cleaning, gap removal, sequence
# keep/remove lists, trimming, sliding window) to its "off" value.
+ def set_split_by_os()
# -1 disables the numeric split (run only splits when @split > 0).
+ @split = -1
+ @split_by_os = true
+ @clean = false # phylip only
+ @rgc = false
+ @rgoc = false
+ @rg = false # fasta only
# Negative ratios/lengths mean "not set" (run tests @rgr >= 0).
+ @rgr = -1
+ @rsgr = -1
+ @rsl = -1
+ @seqs_name_file = nil
+ @remove_seqs = false
+ @keep_seqs = false
+ @trim = false
+ @first = -1
+ @last = -1
+ @window = false
+ end
+
# Switches to sliding-window extraction mode and resets the other processing
# options assigned below (both split modes, gap removal, sequence keep/remove
# lists, trimming) to their "off" values. @step and @size are not touched
# here; analyze_command_line assigns them after calling this method.
# NOTE(review): unlike set_split_by_os, @clean is left unchanged -- confirm
# that this is intended.
+ def set_window()
+ @split = -1
+ @split_by_os = false
+ @rgc = false
+ @rgoc = false
+ @rg = false # fasta only
# Negative ratios/lengths mean "not set" (run tests @rgr >= 0).
+ @rgr = -1
+ @rsgr = -1
+ @rsl = -1
+ @seqs_name_file = nil
+ @remove_seqs = false
+ @keep_seqs = false
+ @trim = false
+ @first = -1
+ @last = -1
+ @window = true
+ end
+
def analyze_command_line( cla )
if ( cla.is_option_set?( INPUT_TYPE_OPTION ) )
begin
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
end
+ if ( cla.is_option_set?( SLIDING_EXTRACTION_OPTION ) )
+ begin
+ s = cla.get_option_value( SLIDING_EXTRACTION_OPTION )
+ if ( s =~ /(\d+)\/(\d+)/ )
+ set_window
+ @window = true
+ @step = $1.to_i()
+ @size = $2.to_i()
+ else
+ puts( "illegal argument" )
+ print_help
+ exit( -1 )
+ end
+ if (@step <= 0) || (@size <= 0)
+ puts( "illegal argument" )
+ print_help
+ exit( -1 )
+ end
+ rescue ArgumentError => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ end
+ end
+
if ( cla.is_option_set?( REMOVE_SEQS_GAP_RATIO_OPTION ) )
begin
f = cla.get_option_value_as_float( REMOVE_SEQS_GAP_RATIO_OPTION )
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
end
- if ( cla.is_option_set?( SPLIT ) )
+ if cla.is_option_set?( SPLIT_BY_OS )
+ begin
+ set_split_by_os()
+ rescue ArgumentError => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ end
+ elsif ( cla.is_option_set?( SPLIT ) )
begin
s = cla.get_option_value_as_int( SPLIT )
set_split( s )
rescue ArgumentError => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
-
end
if ( cla.is_option_set?( REMOVE_MATCHING_SEQUENCES_OPTION ) )
begin
@die_if_name_too_long = true
end
-
end
# Prints the usage line and the option summary with puts.
# NOTE(review): "expect for" in the split description below looks like a typo
# for "except for"; it is a runtime string, so it is left unchanged here.
def print_help()
puts()
puts( " " + PRG_NAME + ".rb [options] <input alignment> <output>" )
puts()
- puts( " options: -" + INPUT_TYPE_OPTION + "=<input type>: f for fasta, p for phylip selex type" )
- puts( " -" + OUTPUT_TYPE_OPTION + "=<output type>: f for fasta, n for nexus, p for phylip sequential (default)" )
+ puts( " options: -" + INPUT_TYPE_OPTION + "=<input type>: f for fasta (default), p for phylip/selex type" )
+ puts( " -" + OUTPUT_TYPE_OPTION + "=<output type>: f for fasta (default), n for nexus, p for phylip sequential" )
puts( " -" + MAXIMAL_NAME_LENGTH_OPTION + "=<n>: n=maximal name length (default for phylip 10, for fasta: unlimited )" )
puts( " -" + DIE_IF_NAME_TOO_LONG + ": die if sequence name too long" )
puts( " -" + WIDTH_OPTION + "=<n>: n=width (fasta output only, default is 60)" )
puts( " -" + KEEP_MATCHING_SEQUENCES_OPTION + "=<s> keep only sequences with names containing s" )
puts( " -" + SPLIT + "=<n> split a fasta file into n files of equal number of sequences (expect for " )
puts( " last one), cannot be used with other options" )
+ puts( " -" + SLIDING_EXTRACTION_OPTION + "=<step>/<window size>: sliding window extraction, cannot be used with other options" )
puts( " -" + REM_RED_OPTION + ": remove redundant sequences" )
puts()
end
-
-
-
-
end # class MsaProcessor
-
end # module Evoruby