class MsaProcessor
PRG_NAME = "msa_pro"
- PRG_DATE = "2012.05.11"
+ PRG_DATE = "131112"
PRG_DESC = "processing of multiple sequence alignments"
- PRG_VERSION = "1.06"
+ PRG_VERSION = "1.08"
COPYRIGHT = "2008-2010 Christian M Zmasek"
CONTACT = "phylosoft@gmail.com"
WWW = "www.phylosoft.org"
INPUT_TYPE_OPTION = "i"
OUTPUT_TYPE_OPTION = "o"
MAXIMAL_NAME_LENGTH_OPTION = "n"
+ DIE_IF_NAME_TOO_LONG = "d"
WIDTH_OPTION = "w"
CLEAN_UP_SEQ_OPTION = "c"
REM_RED_OPTION = "rem_red"
@rgoc = false
@rg = false # fasta only
@rem_red = false
+ @die_if_name_too_long = false
@rgr = -1
@rsgr = -1
@rsl = -1
allowed_opts.push( REM_RED_OPTION )
allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION )
allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION )
+ allowed_opts.push( DIE_IF_NAME_TOO_LONG )
disallowed = cla.validate_allowed_options_as_str( allowed_opts )
if ( disallowed.length > 0 )
begin
Util.check_file_for_readability( input )
- rescue ArgumentError => e
+ rescue IOError => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s )
end
begin
Util.check_file_for_writability( output )
- rescue ArgumentError => e
+ rescue IOError => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s )
end
Util.print_message( PRG_NAME, "Gap-proportion of original alignment : " + gp.to_s )
log << "Gap-proportion of original alignment : " + gp.to_s + ld
else
- Util.print_message( PRG_NAME, "the input is not aligned" )
- log << "The input is not aligned" + ld
+ Util.print_message( PRG_NAME, "Input is not aligned" )
+ log << "Input is not aligned" + ld
end
all_names = Set.new()
Util.print_message( PRG_NAME, "Gap-proportion of processed alignment: " + gp.to_s )
log << "Gap-proportion of processed alignment: " + gp.to_s + ld
else
- Util.print_warning_message( PRG_NAME, "output is not aligned" )
- log << "output is not aligned" + ld
+ min = 0
+ max = 0
+ sum = 0
+ first = true
+ for s in 0 ... msa.get_number_of_seqs
+ seq = msa.get_sequence( s )
+ l = seq.get_length
+ sum += l
+ if l > max
+ max = l
+ end
+ if first || l < min
+ min = l
+ end
+ first = false
+ end
+ avg = sum / msa.get_number_of_seqs
+ Util.print_message( PRG_NAME, "Output is not aligned" )
+ log << "Output is not aligned" + ld
+ Util.print_message( PRG_NAME, "Shortest sequence : " + min.to_s )
+ log << "Shortest sequence : " + min.to_s + ld
+ Util.print_message( PRG_NAME, "Longest sequence : " + max.to_s )
+ log << "Longest sequence : " + max.to_s + ld
+ Util.print_message( PRG_NAME, "Average length : " + avg.to_s )
+ log << "Average length : " + avg.to_s + ld
+
end
end
w = PhylipSequentialWriter.new()
w.clean( @clean )
w.set_max_name_length( @name_length )
+ w.set_exception_if_name_too_long( @die_if_name_too_long )
elsif( @fasta_output )
w = FastaWriter.new()
w.set_line_width( @width )
w.clean( @clean )
if ( @name_length_set )
w.set_max_name_length( @name_length )
+ w.set_exception_if_name_too_long( @die_if_name_too_long )
end
elsif( @nexus_output )
w = NexusWriter.new()
w.clean( @clean )
w.set_max_name_length( @name_length )
+ w.set_exception_if_name_too_long( @die_if_name_too_long )
end
-
begin
io.write_to_file( msa, output, w )
rescue Exception => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s )
end
+ Util.print_message( PRG_NAME, "Number of sequences in output : " + msa.get_number_of_seqs.to_s )
+ log << "Number of sequences in output : " + msa.get_number_of_seqs.to_s + ld
+
begin
f = File.open( output + LOG_SUFFIX, 'a' )
f.print( log )
@last = -1
end
end
+
def analyze_command_line( cla )
if ( cla.is_option_set?( INPUT_TYPE_OPTION ) )
begin
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
end
+ if ( cla.is_option_set?( DIE_IF_NAME_TOO_LONG ) )
+ @die_if_name_too_long = true
+ end
end
puts( " options: -" + INPUT_TYPE_OPTION + "=<input type>: f for fasta, p for phylip selex type" )
puts( " -" + OUTPUT_TYPE_OPTION + "=<output type>: f for fasta, n for nexus, p for phylip sequential (default)" )
puts( " -" + MAXIMAL_NAME_LENGTH_OPTION + "=<n>: n=maximal name length (default for phylip 10, for fasta: unlimited )" )
+ puts( " -" + DIE_IF_NAME_TOO_LONG + ": die if sequence name too long" )
puts( " -" + WIDTH_OPTION + "=<n>: n=width (fasta output only, default is 60)" )
puts( " -" + CLEAN_UP_SEQ_OPTION + ": clean up sequences" )
puts( " -" + REMOVE_GAP_COLUMNS_OPTION + ": remove gap columns" )