2 # = lib/evo/apps/msa_processor.rb - MsaProcessor class
4 # Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: msa_processor.rb,v 1.33 2010/12/13 19:00:10 cmzmasek Exp $
12 require 'lib/evo/util/constants'
13 require 'lib/evo/util/util'
14 require 'lib/evo/util/command_line_arguments'
15 require 'lib/evo/msa/msa_factory'
16 require 'lib/evo/io/msa_io'
17 require 'lib/evo/io/writer/phylip_sequential_writer'
18 require 'lib/evo/io/writer/nexus_writer'
19 require 'lib/evo/io/writer/fasta_writer'
20 require 'lib/evo/io/parser/fasta_parser'
21 require 'lib/evo/io/parser/general_msa_parser'
22 require 'lib/evo/io/writer/msa_writer'
29 PRG_DESC = "processing of multiple sequence alignments" # human-readable program description
31 WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" # project web page
33 NAME_LENGTH_DEFAULT = 10 # initial value of @name_length; the help text gives 10 as the phylip default
34 WIDTH_DEFAULT_FASTA = 60 # initial value of @width (fasta output only)
35 INPUT_TYPE_OPTION = "i" # -i=<input type>: f for fasta, p for phylip/selex type
36 OUTPUT_TYPE_OPTION = "o" # -o=<output type>: f for fasta, n for nexus, p for phylip sequential
37 MAXIMAL_NAME_LENGTH_OPTION = "n" # -n=<n>: maximal sequence name length
38 DIE_IF_NAME_TOO_LONG = "d" # -d: die if a sequence name is too long
40 CLEAN_UP_SEQ_OPTION = "c" # -c: clean up sequences
41 REM_RED_OPTION = "rem_red" # -rem_red: remove redundant sequences
42 REMOVE_GAP_COLUMNS_OPTION = "rgc" # -rgc: remove gap columns
43 REMOVE_GAP_ONLY_COLUMNS = "rgoc" # -rgoc: remove gap-only columns
44 REMOVE_COLUMNS_GAP_RATIO_OPTION = "rr" # -rr=<n>: remove columns whose gap ratio exceeds n
45 REMOVE_ALL_GAP_CHARACTERS_OPTION = "rg" # -rg: remove all gap characters (destroys alignment, fasta output only)
46 REMOVE_ALL_SEQUENCES_LISTED_OPTION = "r" # -r=<file>: remove all sequences listed in file
47 KEEP_ONLY_SEQUENCES_LISTED_OPTION = "k" # -k=<file>: keep only sequences listed in file
49 KEEP_MATCHING_SEQUENCES_OPTION = "mk" # -mk=<s>: keep only sequences with names containing s
50 REMOVE_MATCHING_SEQUENCES_OPTION = "mr" # -mr=<s>: remove all sequences with names containing s
53 SLIDING_EXTRACTION_OPTION = "se" # -se=<step>/<window size>: sliding window extraction
54 REMOVE_SEQS_GAP_RATIO_OPTION = "rsgr" # -rsgr=<n>: remove sequences whose gap ratio exceeds n
55 REMOVE_SEQS_NON_GAP_LENGTH_OPTION = "rsl" # -rsl=<n>: remove sequences with fewer than n non-gap characters
57 SPLIT_BY_OS = "split_by_os" # enables the split-by-OS mode; not listed in the help text visible in this file
58 LOG_SUFFIX = "_msa_pro.log" # appended to the output file name to form the log file path
59 HELP_OPTION_1 = "help" # first of the two help option names tested right after argument parsing
62 @input_format_set = false
63 @output_format_set = false
66 @name_length = NAME_LENGTH_DEFAULT
67 @name_length_set = false
68 @width = WIDTH_DEFAULT_FASTA # fasta only
72 @clean = false # phylip only
75 @rg = false # fasta only
77 @die_if_name_too_long = false
81 @remove_matching = nil
99 Util.print_program_information( PRG_NAME,
106 if ( ARGV == nil || ARGV.length < 1 )
107 Util.print_message( PRG_NAME, "Illegal number of arguments" )
113 cla = CommandLineArguments.new( ARGV )
114 rescue ArgumentError => e
115 Util.fatal_error( PRG_NAME, "Error: " + e.to_s, STDOUT )
118 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
119 cla.is_option_set?( HELP_OPTION_2 ) )
124 if ( cla.get_number_of_files != 2 || ARGV.length < 2 )
125 Util.print_message( PRG_NAME, "Illegal number of arguments" )
130 allowed_opts = Array.new # whitelist of every option name this program accepts
131 allowed_opts.push( INPUT_TYPE_OPTION )
132 allowed_opts.push( OUTPUT_TYPE_OPTION )
133 allowed_opts.push( MAXIMAL_NAME_LENGTH_OPTION )
134 allowed_opts.push( WIDTH_OPTION )
135 allowed_opts.push( CLEAN_UP_SEQ_OPTION )
136 allowed_opts.push( REMOVE_GAP_COLUMNS_OPTION )
137 allowed_opts.push( REMOVE_GAP_ONLY_COLUMNS )
138 allowed_opts.push( REMOVE_COLUMNS_GAP_RATIO_OPTION )
139 allowed_opts.push( REMOVE_ALL_GAP_CHARACTERS_OPTION )
140 allowed_opts.push( REMOVE_ALL_SEQUENCES_LISTED_OPTION )
141 allowed_opts.push( KEEP_ONLY_SEQUENCES_LISTED_OPTION )
142 allowed_opts.push( TRIM_OPTION )
143 allowed_opts.push( REMOVE_SEQS_GAP_RATIO_OPTION )
144 allowed_opts.push( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
145 allowed_opts.push( SPLIT )
146 allowed_opts.push( SPLIT_BY_OS )
147 allowed_opts.push( REM_RED_OPTION )
148 allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION )
149 allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION )
150 allowed_opts.push( DIE_IF_NAME_TOO_LONG )
151 allowed_opts.push( SLIDING_EXTRACTION_OPTION )
153 disallowed = cla.validate_allowed_options_as_str( allowed_opts ) # a non-empty result is reported as unknown option(s) and is fatal
154 if ( disallowed.length > 0 )
155 Util.fatal_error( PRG_NAME,
156 "unknown option(s): " + disallowed )
159 input = cla.get_file_name( 0 )
160 output = cla.get_file_name( 1 )
162 analyze_command_line( cla )
165 Util.check_file_for_readability( input )
167 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
171 Util.check_file_for_writability( output )
173 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
177 set_pi_output( false )
178 set_fasta_output( true )
179 set_nexus_output( false )
182 if ( !@input_format_set )
185 fasta_like = Util.looks_like_fasta?( input )
186 rescue ArgumentError => e
187 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
189 @fasta_input = fasta_like
190 @phylip_input = !fasta_like
191 if ( !@output_format_set )
192 @fasta_output = fasta_like
193 @pi_output = !fasta_like
194 @nexus_output = false
198 ld = Constants::LINE_DELIMITER
199 log = PRG_NAME + " " + PRG_VERSION + " [" + PRG_DATE + "]" + " LOG" + ld
201 log << "Date/time: " + now.to_s + ld
204 puts( "Input alignment : " + input )
205 log << "Input alignment : " + input + ld
206 puts( "Output alignment : " + output )
207 log << "Output alignment : " + output + ld
209 puts( "Input is : Phylip, or something like it" )
210 log << "Input is : Phylip, or something like it" + ld
211 elsif ( @fasta_input )
212 puts( "Input is : Fasta" )
213 log << "Input is : Fasta" + ld
216 puts( "Max col gap ratio: " + @rgr.to_s )
217 log << "Max col gap ratio: " + @rgr.to_s + ld
219 puts( "Remove gap colums" )
220 log << "Remove gap colums" + ld
222 puts( "Remove gap only colums" )
223 log << "Remove gap only colums" + ld
226 puts( "Clean up : true" )
227 log << "Clean up : true" + ld
231 puts( "Output is : Phylip interleaved" )
232 log << "Output is : Phylip interleaved" + ld
233 elsif ( @fasta_output )
234 puts( "Output is : Fasta" )
235 log << "Output is : Fasta" + ld
237 puts( "Width : " + @width.to_s )
238 log << "Width : " + @width.to_s + ld
241 puts( "Remove all gap characters (alignment is destroyed)" )
242 log << "Remove all gap characters (alignment is destroyed)" + ld
244 elsif ( @nexus_output )
245 puts( "Output is : Nexus" )
246 log << "Output is : Nexus" + ld
248 if ( @name_length_set || !@fasta_output )
249 puts( "Max name length : " + @name_length.to_s )
250 log << "Max name length : " + @name_length.to_s + ld
253 puts( "Remove sequences for which the gap ratio > " + @rsgr.to_s )
254 log << "Remove sequences for which the gap ratio > " + @rsgr.to_s + ld
257 puts( "Remove sequences with less than " + @rsl.to_s + " non-gap characters" )
258 log << "Remove sequences with less than " + @rsl.to_s + " non-gap characters" + ld
261 puts( "Remove sequences listed in: " + @seqs_name_file )
262 log << "Remove sequences listed in: " + @seqs_name_file + ld
264 puts( "Keep only sequences listed in: " + @seqs_name_file )
265 log << "Keep only sequences listed in: " + @seqs_name_file + ld
268 puts( "Keep only columns from: "+ @first.to_s + " to " + @last.to_s )
269 log << "Keep only columns from: "+ @first.to_s + " to " + @last.to_s + ld
272 puts( "Remove redundant sequences: true" )
273 log << "Remove redundant sequences: true" + ld
276 puts( "Split by OS : true" )
277 log << "Split : true" + ld
280 puts( "Split : " + @split.to_s )
281 log << "Split : " + @split.to_s + ld
284 puts( "Sliding window extraction: true" )
285 log << "Sliding window extraction: true" + ld
286 puts( "Sliding window step : " + @step.to_s )
287 log << "Sliding window step : " + @step.to_s + ld
288 puts( "Sliding window size : " + @size.to_s )
289 log << "Sliding window size : " + @size.to_s + ld
300 msa = f.create_msa_from_file( input, GeneralMsaParser.new() )
301 elsif ( @fasta_input )
302 msa = f.create_msa_from_file( input, FastaParser.new() )
304 rescue Exception => e
305 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
308 if ( msa.is_aligned() )
309 Util.print_message( PRG_NAME, "Length of original alignment : " + msa.get_length.to_s )
310 log << "Length of original alignment : " + msa.get_length.to_s + ld
311 gp = msa.calculate_gap_proportion
312 Util.print_message( PRG_NAME, "Gap-proportion of original alignment : " + gp.to_s )
313 log << "Gap-proportion of original alignment : " + gp.to_s + ld
315 Util.print_message( PRG_NAME, "Input is not aligned" )
316 log << "Input is not aligned" + ld
319 all_names = Set.new()
320 for i in 0 ... msa.get_number_of_seqs()
321 current_name = msa.get_sequence( i ).get_name
322 if all_names.include?( current_name )
323 Util.print_warning_message( PRG_NAME, "sequence name [" + current_name + "] is not unique" )
325 all_names.add( current_name )
331 if ( @remove_seqs || @keep_seqs )
332 names = Util.file2array( @seqs_name_file, true )
333 if ( names == nil || names.length() < 1 )
334 error_msg = "file \"" + @seqs_name_file.to_s + "\" appears empty"
335 Util.fatal_error( PRG_NAME, error_msg )
340 for i in 0 ... names.length()
341 to_delete = msa.find_by_name( names[ i ], true, false )
342 if ( to_delete.length() < 1 )
343 error_msg = "sequence name \"" + names[ i ] + "\" not found"
344 Util.fatal_error( PRG_NAME, error_msg )
345 elsif ( to_delete.length() > 1 )
346 error_msg = "sequence name \"" + names[ i ] + "\" is not unique"
347 Util.fatal_error( PRG_NAME, error_msg )
349 msa.remove_sequence!( to_delete[ 0 ] )
353 Util.print_message( PRG_NAME, "Removed " + c.to_s + " sequences" )
354 log << "Removed " + c.to_s + " sequences" + ld
359 for j in 0 ... msa.get_number_of_seqs()
360 if ( names.include?( msa.get_sequence( j ).get_name() ) )
361 msa_new.add_sequence( msa.get_sequence( j ) )
368 Util.print_message( PRG_NAME, "Kept " + k.to_s + " sequences" )
369 log << "Kept " + k.to_s + " sequences" + ld
370 Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" )
371 log << "removed " + r.to_s + " sequences" + ld
376 msa.trim!( @first, @last, '_S' )
380 msas = msa.sliding_extraction( @step, @size, @size / 2, '_Q' )
385 w = PhylipSequentialWriter.new()
387 w.set_max_name_length( @name_length )
388 elsif( @fasta_output )
389 w = FastaWriter.new()
390 w.set_line_width( @width )
392 if ( @name_length_set )
393 w.set_max_name_length( @name_length )
395 elsif( @nexus_output )
396 w = NexusWriter.new()
398 w.set_max_name_length( @name_length )
401 name = output + "_" + m.get_name
407 io.write_to_file( m, name, w )
409 Util.print_message( PRG_NAME, "wrote " + msas.length.to_s + " files" )
410 log << "wrote " + msas.length.to_s + " files" + ld
411 rescue Exception => e
412 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
417 msa.remove_gap_columns_w_gap_ratio!( @rgr )
419 msa.remove_gap_columns!()
421 msa.remove_gap_only_columns!()
424 n = msa.get_number_of_seqs()
425 removed = msa.remove_sequences_by_gap_ratio!( @rsgr )
426 k = msa.get_number_of_seqs()
428 Util.print_message( PRG_NAME, "Kept " + k.to_s + " sequences" )
429 log << "Kept " + k.to_s + " sequences" + ld
430 Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" )
431 log << "Removed " + r.to_s + " sequences:" + ld
432 removed.each { | seq_name |
433 log << " " + seq_name + ld
437 n = msa.get_number_of_seqs()
438 removed = msa.remove_sequences_by_non_gap_length!( @rsl )
439 k = msa.get_number_of_seqs()
441 Util.print_message( PRG_NAME, "Kept " + k.to_s + " sequences" )
442 log << "Kept " + k.to_s + " sequences" + ld
443 Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" )
444 log << "Removed " + r.to_s + " sequences:" + ld
445 removed.each { | seq_name |
446 log << " " + seq_name + ld
449 if ( @keep_matching )
450 n = msa.get_number_of_seqs
451 to_be_removed = Set.new
453 seq = msa.get_sequence( ii )
454 if !seq.get_name.downcase.index( @keep_matching.downcase )
455 to_be_removed.add( ii )
458 to_be_removed_ary = to_be_removed.to_a.sort.reverse
459 to_be_removed_ary.each { | index |
460 msa.remove_sequence!( index )
464 if ( @remove_matching )
465 n = msa.get_number_of_seqs
466 to_be_removed = Set.new
469 seq = msa.get_sequence( iii )
471 if seq.get_name.downcase.index( @remove_matching.downcase )
472 to_be_removed.add( iii )
475 to_be_removed_ary = to_be_removed.to_a.sort.reverse
476 to_be_removed_ary.each { | index |
477 msa.remove_sequence!( index )
484 msa_hash = msa.split_by_os(true)
488 w = PhylipSequentialWriter.new()
490 w.set_max_name_length( @name_length )
491 elsif( @fasta_output )
492 w = FastaWriter.new()
493 w.set_line_width( @width )
495 w.remove_gap_chars( true )
496 Util.print_warning_message( PRG_NAME, "removing gap character, the output is likely to become unaligned" )
497 log << "removing gap character, the output is likely to become unaligned" + ld
500 if ( @name_length_set )
501 w.set_max_name_length( @name_length )
503 elsif( @nexus_output )
504 w = NexusWriter.new()
506 w.set_max_name_length( @name_length )
508 msa_hash.each do |os, m|
509 my_os = os.gsub(' ', '_').gsub('/', '_').gsub('(', '_').gsub(')', '_')
510 io.write_to_file( m, output + '_' + my_os, w )
513 Util.print_message( PRG_NAME, "wrote " + msa_hash.length.to_s + " files" )
514 log << "wrote " + msa_hash.length.to_s + " files" + ld
515 rescue Exception => e
516 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
521 msas = msa.split( @split, true )
525 w = PhylipSequentialWriter.new()
527 w.set_max_name_length( @name_length )
528 elsif( @fasta_output )
529 w = FastaWriter.new()
530 w.set_line_width( @width )
532 w.remove_gap_chars( true )
533 Util.print_warning_message( PRG_NAME, "removing gap character, the output is likely to become unaligned" )
534 log << "removing gap character, the output is likely to become unaligned" + ld
537 if ( @name_length_set )
538 w.set_max_name_length( @name_length )
540 elsif( @nexus_output )
541 w = NexusWriter.new()
543 w.set_max_name_length( @name_length )
548 io.write_to_file( m, output + "_" + i.to_s, w )
550 Util.print_message( PRG_NAME, "wrote " + msas.length.to_s + " files" )
551 log << "wrote " + msas.length.to_s + " files" + ld
552 rescue Exception => e
553 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
556 rescue Exception => e
557 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
560 if (@split <= 0) && (!@split_by_os) && (!@window)
563 if ( msa.is_aligned() )
564 Util.print_message( PRG_NAME, "Length of processed alignment : " + msa.get_length.to_s )
565 log << "Length of processed alignment : " + msa.get_length.to_s + ld
566 gp = msa.calculate_gap_proportion
567 Util.print_message( PRG_NAME, "Gap-proportion of processed alignment: " + gp.to_s )
568 log << "Gap-proportion of processed alignment: " + gp.to_s + ld
574 for s in 0 ... msa.get_number_of_seqs
575 seq = msa.get_sequence( s )
586 avg = sum / msa.get_number_of_seqs
587 Util.print_message( PRG_NAME, "Output is not aligned" )
588 log << "Output is not aligned" + ld
589 Util.print_message( PRG_NAME, "Shortest sequence : " + min.to_s )
590 log << "Shortest sequence : " + min.to_s + ld
591 Util.print_message( PRG_NAME, "Longest sequence : " + max.to_s )
592 log << "Longest sequence : " + max.to_s + ld
593 Util.print_message( PRG_NAME, "Average length : " + avg.to_s )
594 log << "Average length : " + avg.to_s + ld
600 removed = msa.remove_redundant_sequences!( true, false )
602 identicals = msa.get_identical_seqs_detected
603 log << "the following " + identicals.size.to_s + " sequences are identical:" + ld
604 identicals.each { | identical |
605 log << identical + ld
607 log << "ignoring the following " + removed.size.to_s + " redundant sequences:" + ld
608 removed.each { | seq_name |
611 Util.print_message( PRG_NAME, "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" )
612 log << "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" + ld
621 w = PhylipSequentialWriter.new()
623 w.set_max_name_length( @name_length )
624 w.set_exception_if_name_too_long( @die_if_name_too_long )
625 elsif( @fasta_output )
626 w = FastaWriter.new()
627 w.set_line_width( @width )
629 w.remove_gap_chars( true )
630 Util.print_warning_message( PRG_NAME, "removing gap characters, the output is likely to become unaligned" )
631 log << "removing gap character, the output is likely to become unaligned" + ld
634 if ( @name_length_set )
635 w.set_max_name_length( @name_length )
636 w.set_exception_if_name_too_long( @die_if_name_too_long )
638 elsif( @nexus_output )
639 w = NexusWriter.new()
641 w.set_max_name_length( @name_length )
642 w.set_exception_if_name_too_long( @die_if_name_too_long )
646 io.write_to_file( msa, output, w )
647 rescue Exception => e
648 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
651 Util.print_message( PRG_NAME, "Number of sequences in output : " + msa.get_number_of_seqs.to_s )
652 log << "Number of sequences in output : " + msa.get_number_of_seqs.to_s + ld
655 f = File.open( output + LOG_SUFFIX, 'a' )
658 rescue Exception => e
659 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
663 Util.print_message( PRG_NAME, "OK" )
671 for i in 0 ... msa.get_number_of_seqs
672 name = msa.get_sequence( i ).get_name
675 sorted_ary = names.to_a.sort
677 sorted_ary.each { | seq_name |
678 seq = msa.get_sequence( msa.find_by_name( seq_name, true, false )[ 0 ] )
679 new_msa.add_sequence( seq )
684 def set_fasta_input( fi = true )
686 @input_format_set = true
689 def set_phylip_input( pi = true )
691 @input_format_set = true
694 def set_name_length( i )
696 @name_length_set = true
703 def set_fasta_output( fo = true )
705 @output_format_set = true
708 def set_pi_output( pso = true )
710 @output_format_set = true
713 def set_nexus_output( nexus = true )
714 @nexus_output = nexus
715 @output_format_set = true
718 def set_clean( c = true )
722 def set_remove_gap_columns( rgc = true )
726 def set_remove_gap_only_columns( rgoc = true )
730 def set_remove_gaps( rg = true )
734 def set_remove_gap_ratio( rgr )
738 def set_remove_seqs_gap_ratio( rsgr )
742 def set_remove_seqs_min_non_gap_length( rsl )
746 def set_remove_seqs( file )
747 @seqs_name_file = file
752 def set_keep_seqs( file )
753 @seqs_name_file = file
758 def set_trim( first, last )
764 def set_remove_matching( remove )
765 @remove_matching = remove
768 def set_keep_matching( keep )
769 @keep_matching = keep
772 def set_rem_red( rr )
780 @clean = false # phylip only
783 @rg = false # fasta only
787 @seqs_name_file = nil
796 def set_split_by_os()
799 @clean = false # phylip only
802 @rg = false # fasta only
806 @seqs_name_file = nil
820 @rg = false # fasta only
824 @seqs_name_file = nil
833 def analyze_command_line( cla )
834 if ( cla.is_option_set?( INPUT_TYPE_OPTION ) )
836 type = cla.get_option_value( INPUT_TYPE_OPTION )
838 set_phylip_input( true )
839 set_fasta_input( false )
840 elsif ( type == "f" )
841 set_fasta_input( true )
842 set_phylip_input( false )
844 rescue ArgumentError => e
845 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
848 if ( cla.is_option_set?( OUTPUT_TYPE_OPTION ) )
850 type = cla.get_option_value( OUTPUT_TYPE_OPTION )
852 set_pi_output( true )
853 set_fasta_output( false )
854 set_nexus_output( false )
855 elsif ( type == "f" )
856 set_pi_output( false )
857 set_fasta_output( true )
858 set_nexus_output( false )
859 elsif ( type == "n" )
860 set_pi_output( false )
861 set_fasta_output( false )
862 set_nexus_output( true )
864 rescue ArgumentError => e
865 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
868 if ( cla.is_option_set?( MAXIMAL_NAME_LENGTH_OPTION ) )
870 l = cla.get_option_value_as_int( MAXIMAL_NAME_LENGTH_OPTION )
872 rescue ArgumentError => e
873 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
876 if ( cla.is_option_set?( WIDTH_OPTION ) )
878 w = cla.get_option_value_as_int( WIDTH_OPTION )
880 rescue ArgumentError => e
881 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
884 if ( cla.is_option_set?( CLEAN_UP_SEQ_OPTION ) )
887 if ( cla.is_option_set?( REMOVE_GAP_COLUMNS_OPTION ) )
888 set_remove_gap_columns( true )
890 if ( cla.is_option_set?( REM_RED_OPTION ) )
893 if ( cla.is_option_set?( REMOVE_GAP_ONLY_COLUMNS ) )
894 set_remove_gap_only_columns( true )
896 if ( cla.is_option_set?( REMOVE_ALL_GAP_CHARACTERS_OPTION ) )
897 set_remove_gaps( true )
899 if ( cla.is_option_set?( REMOVE_COLUMNS_GAP_RATIO_OPTION ) )
901 f = cla.get_option_value_as_float( REMOVE_COLUMNS_GAP_RATIO_OPTION )
902 set_remove_gap_ratio( f )
903 rescue ArgumentError => e
904 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
907 if ( cla.is_option_set?( REMOVE_ALL_SEQUENCES_LISTED_OPTION ) )
909 s = cla.get_option_value( REMOVE_ALL_SEQUENCES_LISTED_OPTION )
911 rescue ArgumentError => e
912 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
915 if ( cla.is_option_set?( KEEP_ONLY_SEQUENCES_LISTED_OPTION ) )
917 s = cla.get_option_value( KEEP_ONLY_SEQUENCES_LISTED_OPTION )
919 rescue ArgumentError => e
920 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
923 if ( cla.is_option_set?( TRIM_OPTION ) )
925 s = cla.get_option_value( TRIM_OPTION )
926 if ( s =~ /(\d+)-(\d+)/ )
927 set_trim( $1.to_i(), $2.to_i() )
929 puts( "illegal argument" )
933 rescue ArgumentError => e
934 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
937 if ( cla.is_option_set?( SLIDING_EXTRACTION_OPTION ) )
939 s = cla.get_option_value( SLIDING_EXTRACTION_OPTION )
940 if ( s =~ /(\d+)\/(\d+)/ )
946 puts( "illegal argument" )
950 if (@step <= 0) || (@size <= 0)
951 puts( "illegal argument" )
955 rescue ArgumentError => e
956 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
960 if ( cla.is_option_set?( REMOVE_SEQS_GAP_RATIO_OPTION ) )
962 f = cla.get_option_value_as_float( REMOVE_SEQS_GAP_RATIO_OPTION )
963 set_remove_seqs_gap_ratio( f )
964 rescue ArgumentError => e
965 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
968 if ( cla.is_option_set?( REMOVE_SEQS_NON_GAP_LENGTH_OPTION ) )
970 f = cla.get_option_value_as_int( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
971 set_remove_seqs_min_non_gap_length( f )
972 rescue ArgumentError => e
973 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
976 if cla.is_option_set?( SPLIT_BY_OS )
979 rescue ArgumentError => e
980 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
982 elsif ( cla.is_option_set?( SPLIT ) )
984 s = cla.get_option_value_as_int( SPLIT )
986 rescue ArgumentError => e
987 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
990 if ( cla.is_option_set?( REMOVE_MATCHING_SEQUENCES_OPTION ) )
992 s = cla.get_option_value( REMOVE_MATCHING_SEQUENCES_OPTION )
993 set_remove_matching( s )
994 rescue ArgumentError => e
995 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
998 if ( cla.is_option_set?( KEEP_MATCHING_SEQUENCES_OPTION ) )
1000 s = cla.get_option_value( KEEP_MATCHING_SEQUENCES_OPTION )
1001 set_keep_matching( s )
1002 rescue ArgumentError => e
1003 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
1006 if ( cla.is_option_set?( DIE_IF_NAME_TOO_LONG ) )
1007 @die_if_name_too_long = true
1016 puts( " " + PRG_NAME + ".rb [options] <input alignment> <output>" )
1018 puts( " options: -" + INPUT_TYPE_OPTION + "=<input type>: f for fasta (default), p for phylip/selex type" )
1019 puts( " -" + OUTPUT_TYPE_OPTION + "=<output type>: f for fasta (default), n for nexus, p for phylip sequential" )
1020 puts( " -" + MAXIMAL_NAME_LENGTH_OPTION + "=<n>: n=maximal name length (default for phylip 10, for fasta: unlimited )" )
1021 puts( " -" + DIE_IF_NAME_TOO_LONG + ": die if sequence name too long" )
1022 puts( " -" + WIDTH_OPTION + "=<n>: n=width (fasta output only, default is 60)" )
1023 puts( " -" + CLEAN_UP_SEQ_OPTION + ": clean up sequences" )
1024 puts( " -" + REMOVE_GAP_COLUMNS_OPTION + ": remove gap columns" )
1025 puts( " -" + REMOVE_GAP_ONLY_COLUMNS + ": remove gap-only columns" )
1026 puts( " -" + REMOVE_COLUMNS_GAP_RATIO_OPTION + "=<n>: remove columns for which ( seqs with gap / number of sequences > n )" )
1027 puts( " -" + REMOVE_ALL_GAP_CHARACTERS_OPTION + ": remove all gap characters (destroys alignment, fasta output only)" )
1028 puts( " -" + REMOVE_ALL_SEQUENCES_LISTED_OPTION + "=<file>: remove all sequences listed in file" )
1029 puts( " -" + KEEP_ONLY_SEQUENCES_LISTED_OPTION + "=<file>: keep only sequences listed in file" )
1030 puts( " -" + TRIM_OPTION + "=<first>-<last>: remove columns before first and after last" )
1031 puts( " -" + REMOVE_SEQS_GAP_RATIO_OPTION + "=<n>: remove sequences for which the gap ratio > n (after column operations)" )
1032 puts( " -" + REMOVE_SEQS_NON_GAP_LENGTH_OPTION + "=<n> remove sequences with less than n non-gap characters (after column operations)" )
1033 puts( " -" + REMOVE_MATCHING_SEQUENCES_OPTION + "=<s> remove all sequences with names containing s" )
1034 puts( " -" + KEEP_MATCHING_SEQUENCES_OPTION + "=<s> keep only sequences with names containing s" )
1035 puts( " -" + SPLIT + "=<n> split a fasta file into n files of equal number of sequences (expect for " )
1036 puts( " last one), cannot be used with other options" )
1037 puts( " -" + SLIDING_EXTRACTION_OPTION + "=<step>/<window size>: sliding window extraction, cannot be used with other options" )
1038 puts( " -" + REM_RED_OPTION + ": remove redundant sequences" )
1042 end # class MsaProcessor
1044 end # module Evoruby