2 # = lib/evo/apps/msa_processor.rb - MsaProcessor class
4 # Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: msa_processor.rb,v 1.33 2010/12/13 19:00:10 cmzmasek Exp $
12 require 'lib/evo/util/constants'
13 require 'lib/evo/util/util'
14 require 'lib/evo/util/command_line_arguments'
15 require 'lib/evo/msa/msa_factory'
16 require 'lib/evo/io/msa_io'
17 require 'lib/evo/io/writer/phylip_sequential_writer'
18 require 'lib/evo/io/writer/nexus_writer'
19 require 'lib/evo/io/writer/fasta_writer'
20 require 'lib/evo/io/parser/fasta_parser'
21 require 'lib/evo/io/parser/general_msa_parser'
22 require 'lib/evo/io/writer/msa_writer'
# One-line description of the program and the address of its home page.
29 PRG_DESC = "processing of multiple sequence alignments"
31 WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
# Defaults: initial maximal sequence-name length and initial line width for
# fasta output (the instance variables @name_length and @width start from these).
33 NAME_LENGTH_DEFAULT = 10
34 WIDTH_DEFAULT_FASTA = 60
# Command-line option names. They are stored without the leading dash; the
# help text prints each one as "-<name>".
# Input type / output type selection.
35 INPUT_TYPE_OPTION = "i"
36 OUTPUT_TYPE_OPTION = "o"
# Maximal sequence-name length, and "die if sequence name too long".
37 MAXIMAL_NAME_LENGTH_OPTION = "n"
38 DIE_IF_NAME_TOO_LONG = "d"
# Clean up sequences.
40 CLEAN_UP_SEQ_OPTION = "c"
# Remove redundant sequences.
41 REM_RED_OPTION = "rem_red"
# Column and gap handling: remove gap columns, remove gap-only columns,
# remove columns above a gap ratio, remove all gap characters.
42 REMOVE_GAP_COLUMNS_OPTION = "rgc"
43 REMOVE_GAP_ONLY_COLUMNS = "rgoc"
44 REMOVE_COLUMNS_GAP_RATIO_OPTION = "rr"
45 REMOVE_ALL_GAP_CHARACTERS_OPTION = "rg"
# Remove / keep the sequences listed in a file (the option value is a file name).
46 REMOVE_ALL_SEQUENCES_LISTED_OPTION = "r"
47 KEEP_ONLY_SEQUENCES_LISTED_OPTION = "k"
# Keep / remove sequences whose name contains a given string.
49 KEEP_MATCHING_SEQUENCES_OPTION = "mk"
50 REMOVE_MATCHING_SEQUENCES_OPTION = "mr"
# Remove sequences by gap ratio, or by number of non-gap characters.
53 REMOVE_SEQS_GAP_RATIO_OPTION = "rsgr"
54 REMOVE_SEQS_NON_GAP_LENGTH_OPTION = "rsl"
# Appended to the output file name to form the name of the log file.
56 LOG_SUFFIX = "_msa_pro.log"
# One of the option names that requests the help text.
57 HELP_OPTION_1 = "help"
60 @input_format_set = false
61 @output_format_set = false
64 @name_length = NAME_LENGTH_DEFAULT
65 @name_length_set = false
66 @width = WIDTH_DEFAULT_FASTA # fasta only
70 @clean = false # phylip only
73 @rg = false # fasta only
75 @die_if_name_too_long = false
79 @remove_matching = nil
93 Util.print_program_information( PRG_NAME,
100 if ( ARGV == nil || ARGV.length < 1 )
101 Util.print_message( PRG_NAME, "Illegal number of arguments" )
107 cla = CommandLineArguments.new( ARGV )
108 rescue ArgumentError => e
109 Util.fatal_error( PRG_NAME, "Error: " + e.to_s, STDOUT )
112 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
113 cla.is_option_set?( HELP_OPTION_2 ) )
118 if ( cla.get_number_of_files != 2 || ARGV.length < 2 )
119 Util.print_message( PRG_NAME, "Illegal number of arguments" )
124 allowed_opts = Array.new
125 allowed_opts.push( INPUT_TYPE_OPTION )
126 allowed_opts.push( OUTPUT_TYPE_OPTION )
127 allowed_opts.push( MAXIMAL_NAME_LENGTH_OPTION )
128 allowed_opts.push( WIDTH_OPTION )
129 allowed_opts.push( CLEAN_UP_SEQ_OPTION )
130 allowed_opts.push( REMOVE_GAP_COLUMNS_OPTION )
131 allowed_opts.push( REMOVE_GAP_ONLY_COLUMNS )
132 allowed_opts.push( REMOVE_COLUMNS_GAP_RATIO_OPTION )
133 allowed_opts.push( REMOVE_ALL_GAP_CHARACTERS_OPTION )
134 allowed_opts.push( REMOVE_ALL_SEQUENCES_LISTED_OPTION )
135 allowed_opts.push( KEEP_ONLY_SEQUENCES_LISTED_OPTION )
136 allowed_opts.push( TRIM_OPTION )
137 allowed_opts.push( REMOVE_SEQS_GAP_RATIO_OPTION )
138 allowed_opts.push( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
139 allowed_opts.push( SPLIT )
140 allowed_opts.push( REM_RED_OPTION )
141 allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION )
142 allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION )
143 allowed_opts.push( DIE_IF_NAME_TOO_LONG )
145 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
146 if ( disallowed.length > 0 )
147 Util.fatal_error( PRG_NAME,
148 "unknown option(s): " + disallowed )
151 input = cla.get_file_name( 0 )
152 output = cla.get_file_name( 1 )
154 analyze_command_line( cla )
157 Util.check_file_for_readability( input )
159 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
163 Util.check_file_for_writability( output )
165 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
169 set_pi_output( false )
170 set_fasta_output( true )
171 set_nexus_output( false )
174 if ( !@input_format_set )
177 fasta_like = Util.looks_like_fasta?( input )
178 rescue ArgumentError => e
179 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
181 @fasta_input = fasta_like
182 @phylip_input = !fasta_like
183 if ( !@output_format_set )
184 @fasta_output = fasta_like
185 @pi_output = !fasta_like
186 @nexus_output = false
190 ld = Constants::LINE_DELIMITER
191 log = PRG_NAME + " " + PRG_VERSION + " [" + PRG_DATE + "]" + " LOG" + ld
193 log << "Date/time: " + now.to_s + ld
196 puts( "Input alignment : " + input )
197 log << "Input alignment : " + input + ld
198 puts( "Output alignment : " + output )
199 log << "Output alignment : " + output + ld
201 puts( "Input is : Phylip, or something like it" )
202 log << "Input is : Phylip, or something like it" + ld
203 elsif ( @fasta_input )
204 puts( "Input is : Fasta" )
205 log << "Input is : Fasta" + ld
208 puts( "Max col gap ratio: " + @rgr.to_s )
209 log << "Max col gap ratio: " + @rgr.to_s + ld
211 puts( "Remove gap colums" )
212 log << "Remove gap colums" + ld
214 puts( "Remove gap only colums" )
215 log << "Remove gap only colums" + ld
218 puts( "Clean up : true" )
219 log << "Clean up : true" + ld
223 puts( "Output is : Phylip interleaved" )
224 log << "Output is : Phylip interleaved" + ld
225 elsif ( @fasta_output )
226 puts( "Output is : Fasta" )
227 log << "Output is : Fasta" + ld
229 puts( "Width : " + @width.to_s )
230 log << "Width : " + @width.to_s + ld
233 puts( "Remove all gap characters (alignment is destroyed)" )
234 log << "Remove all gap characters (alignment is destroyed)" + ld
236 elsif ( @nexus_output )
237 puts( "Output is : Nexus" )
238 log << "Output is : Nexus" + ld
240 if ( @name_length_set || !@fasta_output )
241 puts( "Max name length : " + @name_length.to_s )
242 log << "Max name length : " + @name_length.to_s + ld
245 puts( "Remove sequences for which the gap ratio > " + @rsgr.to_s )
246 log << "Remove sequences for which the gap ratio > " + @rsgr.to_s + ld
249 puts( "Remove sequences with less than " + @rsl.to_s + " non-gap characters" )
250 log << "Remove sequences with less than " + @rsl.to_s + " non-gap characters" + ld
253 puts( "Remove sequences listed in: " + @seqs_name_file )
254 log << "Remove sequences listed in: " + @seqs_name_file + ld
256 puts( "Keep only sequences listed in: " + @seqs_name_file )
257 log << "Keep only sequences listed in: " + @seqs_name_file + ld
260 puts( "Keep only columns from: "+ @first.to_s + " to " + @last.to_s )
261 log << "Keep only columns from: "+ @first.to_s + " to " + @last.to_s + ld
264 puts( "Remove redundant sequences: true" )
265 log << "Remove redundant sequences: true" + ld
268 puts( "Split : " + @split.to_s )
269 log << "Split : " + @split.to_s + ld
279 msa = f.create_msa_from_file( input, GeneralMsaParser.new() )
280 elsif ( @fasta_input )
281 msa = f.create_msa_from_file( input, FastaParser.new() )
283 rescue Exception => e
284 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
287 if ( msa.is_aligned() )
288 Util.print_message( PRG_NAME, "Length of original alignment : " + msa.get_length.to_s )
289 log << "Length of original alignment : " + msa.get_length.to_s + ld
290 gp = msa.calculate_gap_proportion
291 Util.print_message( PRG_NAME, "Gap-proportion of original alignment : " + gp.to_s )
292 log << "Gap-proportion of original alignment : " + gp.to_s + ld
294 Util.print_message( PRG_NAME, "Input is not aligned" )
295 log << "Input is not aligned" + ld
298 all_names = Set.new()
299 for i in 0 ... msa.get_number_of_seqs()
300 current_name = msa.get_sequence( i ).get_name
301 if all_names.include?( current_name )
302 Util.print_warning_message( PRG_NAME, "sequence name [" + current_name + "] is not unique" )
304 all_names.add( current_name )
310 if ( @remove_seqs || @keep_seqs )
311 names = Util.file2array( @seqs_name_file, true )
312 if ( names == nil || names.length() < 1 )
313 error_msg = "file \"" + @seqs_name_file.to_s + "\" appears empty"
314 Util.fatal_error( PRG_NAME, error_msg )
319 for i in 0 ... names.length()
320 to_delete = msa.find_by_name( names[ i ], true, false )
321 if ( to_delete.length() < 1 )
322 error_msg = "sequence name \"" + names[ i ] + "\" not found"
323 Util.fatal_error( PRG_NAME, error_msg )
324 elsif ( to_delete.length() > 1 )
325 error_msg = "sequence name \"" + names[ i ] + "\" is not unique"
326 Util.fatal_error( PRG_NAME, error_msg )
328 msa.remove_sequence!( to_delete[ 0 ] )
332 Util.print_message( PRG_NAME, "Removed " + c.to_s + " sequences" )
333 log << "Removed " + c.to_s + " sequences" + ld
338 for j in 0 ... msa.get_number_of_seqs()
339 if ( names.include?( msa.get_sequence( j ).get_name() ) )
340 msa_new.add_sequence( msa.get_sequence( j ) )
347 Util.print_message( PRG_NAME, "Kept " + k.to_s + " sequences" )
348 log << "Kept " + k.to_s + " sequences" + ld
349 Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" )
350 log << "removed " + r.to_s + " sequences" + ld
355 msa.trim!( @first, @last )
358 msa.remove_gap_columns_w_gap_ratio!( @rgr )
360 msa.remove_gap_columns!()
362 msa.remove_gap_only_columns!()
365 n = msa.get_number_of_seqs()
366 removed = msa.remove_sequences_by_gap_ratio!( @rsgr )
367 k = msa.get_number_of_seqs()
369 Util.print_message( PRG_NAME, "Kept " + k.to_s + " sequences" )
370 log << "Kept " + k.to_s + " sequences" + ld
371 Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" )
372 log << "Removed " + r.to_s + " sequences:" + ld
373 removed.each { | seq_name |
374 log << " " + seq_name + ld
378 n = msa.get_number_of_seqs()
379 removed = msa.remove_sequences_by_non_gap_length!( @rsl )
380 k = msa.get_number_of_seqs()
382 Util.print_message( PRG_NAME, "Kept " + k.to_s + " sequences" )
383 log << "Kept " + k.to_s + " sequences" + ld
384 Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" )
385 log << "Removed " + r.to_s + " sequences:" + ld
386 removed.each { | seq_name |
387 log << " " + seq_name + ld
390 if ( @keep_matching )
391 n = msa.get_number_of_seqs
392 to_be_removed = Set.new
394 seq = msa.get_sequence( ii )
395 if !seq.get_name.downcase.index( @keep_matching.downcase )
396 to_be_removed.add( ii )
399 to_be_removed_ary = to_be_removed.to_a.sort.reverse
400 to_be_removed_ary.each { | index |
401 msa.remove_sequence!( index )
405 if ( @remove_matching )
406 n = msa.get_number_of_seqs
407 to_be_removed = Set.new
410 seq = msa.get_sequence( iii )
412 if seq.get_name.downcase.index( @remove_matching.downcase )
413 to_be_removed.add( iii )
416 to_be_removed_ary = to_be_removed.to_a.sort.reverse
417 to_be_removed_ary.each { | index |
418 msa.remove_sequence!( index )
425 msas = msa.split( @split, true )
429 w = PhylipSequentialWriter.new()
431 w.set_max_name_length( @name_length )
432 elsif( @fasta_output )
433 w = FastaWriter.new()
434 w.set_line_width( @width )
436 w.remove_gap_chars( true )
437 Util.print_warning_message( PRG_NAME, "removing gap character, the output is likely to become unaligned" )
438 log << "removing gap character, the output is likely to become unaligned" + ld
441 if ( @name_length_set )
442 w.set_max_name_length( @name_length )
444 elsif( @nexus_output )
445 w = NexusWriter.new()
447 w.set_max_name_length( @name_length )
452 io.write_to_file( m, output + "_" + i.to_s, w )
454 Util.print_message( PRG_NAME, "wrote " + msas.length.to_s + " files" )
455 log << "wrote " + msas.length.to_s + " files" + ld
456 rescue Exception => e
457 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
461 rescue Exception => e
462 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
468 if ( msa.is_aligned() )
469 Util.print_message( PRG_NAME, "Length of processed alignment : " + msa.get_length.to_s )
470 log << "Length of processed alignment : " + msa.get_length.to_s + ld
471 gp = msa.calculate_gap_proportion
472 Util.print_message( PRG_NAME, "Gap-proportion of processed alignment: " + gp.to_s )
473 log << "Gap-proportion of processed alignment: " + gp.to_s + ld
479 for s in 0 ... msa.get_number_of_seqs
480 seq = msa.get_sequence( s )
491 avg = sum / msa.get_number_of_seqs
492 Util.print_message( PRG_NAME, "Output is not aligned" )
493 log << "Output is not aligned" + ld
494 Util.print_message( PRG_NAME, "Shortest sequence : " + min.to_s )
495 log << "Shortest sequence : " + min.to_s + ld
496 Util.print_message( PRG_NAME, "Longest sequence : " + max.to_s )
497 log << "Longest sequence : " + max.to_s + ld
498 Util.print_message( PRG_NAME, "Average length : " + avg.to_s )
499 log << "Average length : " + avg.to_s + ld
505 removed = msa.remove_redundant_sequences!( true, false )
507 identicals = msa.get_identical_seqs_detected
508 log << "the following " + identicals.size.to_s + " sequences are identical:" + ld
509 identicals.each { | identical |
510 log << identical + ld
512 log << "ignoring the following " + removed.size.to_s + " redundant sequences:" + ld
513 removed.each { | seq_name |
516 Util.print_message( PRG_NAME, "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" )
517 log << "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" + ld
526 w = PhylipSequentialWriter.new()
528 w.set_max_name_length( @name_length )
529 w.set_exception_if_name_too_long( @die_if_name_too_long )
530 elsif( @fasta_output )
531 w = FastaWriter.new()
532 w.set_line_width( @width )
534 w.remove_gap_chars( true )
535 Util.print_warning_message( PRG_NAME, "removing gap characters, the output is likely to become unaligned" )
536 log << "removing gap character, the output is likely to become unaligned" + ld
539 if ( @name_length_set )
540 w.set_max_name_length( @name_length )
541 w.set_exception_if_name_too_long( @die_if_name_too_long )
543 elsif( @nexus_output )
544 w = NexusWriter.new()
546 w.set_max_name_length( @name_length )
547 w.set_exception_if_name_too_long( @die_if_name_too_long )
551 io.write_to_file( msa, output, w )
552 rescue Exception => e
553 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
556 Util.print_message( PRG_NAME, "Number of sequences in output : " + msa.get_number_of_seqs.to_s )
557 log << "Number of sequences in output : " + msa.get_number_of_seqs.to_s + ld
560 f = File.open( output + LOG_SUFFIX, 'a' )
563 rescue Exception => e
564 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
568 Util.print_message( PRG_NAME, "OK" )
576 for i in 0 ... msa.get_number_of_seqs
577 name = msa.get_sequence( i ).get_name
580 sorted_ary = names.to_a.sort
582 sorted_ary.each { | seq_name |
583 seq = msa.get_sequence( msa.find_by_name( seq_name, true, false )[ 0 ] )
584 new_msa.add_sequence( seq )
# Setter for the fasta-input choice; +fi+ defaults to true.
# Marks the input format as explicitly chosen (@input_format_set); that flag
# is tested before the input file is inspected with Util.looks_like_fasta?.
# NOTE(review): presumably +fi+ is also stored in @fasta_input - confirm.
589 def set_fasta_input( fi = true )
591 @input_format_set = true
# Setter for the phylip-input choice; +pi+ defaults to true.
# Marks the input format as explicitly chosen (@input_format_set); that flag
# is tested before the input file is inspected with Util.looks_like_fasta?.
# NOTE(review): presumably +pi+ is also stored in @phylip_input - confirm.
594 def set_phylip_input( pi = true )
596 @input_format_set = true
# Setter for the maximal sequence-name length +i+.
# Records that a name length was requested (@name_length_set). For fasta
# output the writer's maximal name length is only applied when this flag is set.
# NOTE(review): presumably +i+ is also stored in @name_length - confirm.
599 def set_name_length( i )
601 @name_length_set = true
# Setter for the fasta-output choice; +fo+ defaults to true.
# Marks the output format as explicitly chosen (@output_format_set). When that
# flag is not set, the output format is derived from the detected input format.
# NOTE(review): presumably +fo+ is also stored in @fasta_output - confirm.
608 def set_fasta_output( fo = true )
610 @output_format_set = true
# Setter for the phylip-output choice; +pso+ defaults to true.
# Marks the output format as explicitly chosen (@output_format_set). When that
# flag is not set, the output format is derived from the detected input format.
# NOTE(review): presumably +pso+ is also stored in @pi_output - confirm.
613 def set_pi_output( pso = true )
615 @output_format_set = true
# Setter for the nexus-output choice; +nexus+ defaults to true.
# Stores +nexus+ in @nexus_output and marks the output format as explicitly
# chosen. Note that @output_format_set becomes true even when +nexus+ is false.
618 def set_nexus_output( nexus = true )
619 @nexus_output = nexus
620 @output_format_set = true
623 def set_clean( c = true )
627 def set_remove_gap_columns( rgc = true )
631 def set_remove_gap_only_columns( rgoc = true )
635 def set_remove_gaps( rg = true )
639 def set_remove_gap_ratio( rgr )
643 def set_remove_seqs_gap_ratio( rsgr )
647 def set_remove_seqs_min_non_gap_length( rsl )
# Requests removal of the sequences named in +file+.
# Stores +file+ in @seqs_name_file. set_keep_seqs writes to the same variable,
# so whichever of the two is called last determines the file that is read.
# NOTE(review): presumably the @remove_seqs flag is set here as well - confirm.
651 def set_remove_seqs( file )
652 @seqs_name_file = file
# Requests that only the sequences named in +file+ be kept.
# Stores +file+ in @seqs_name_file. set_remove_seqs writes to the same variable,
# so whichever of the two is called last determines the file that is read.
# NOTE(review): presumably the @keep_seqs flag is set here as well - confirm.
657 def set_keep_seqs( file )
658 @seqs_name_file = file
663 def set_trim( first, last )
# Stores +remove+ in @remove_matching.
# When @remove_matching is set, sequences whose name contains it are removed;
# the comparison is done on the downcased name and the downcased value.
669 def set_remove_matching( remove )
670 @remove_matching = remove
# Stores +keep+ in @keep_matching.
# When @keep_matching is set, sequences whose name does not contain it are
# removed; the comparison is done on the downcased name and the downcased value.
673 def set_keep_matching( keep )
674 @keep_matching = keep
677 def set_rem_red( rr )
684 @clean = false # phylip only
687 @rg = false # fasta only
691 @seqs_name_file = nil
# Transfers the options found on the command line into this object's settings.
#
# cla:: the parsed command line. It is queried through is_option_set?,
#       get_option_value, get_option_value_as_int and get_option_value_as_float.
#
# Each option is handled on its own. An ArgumentError raised while an option
# value is read is passed to Util.fatal_error together with PRG_NAME and STDOUT.
700 def analyze_command_line( cla )
# Input type: the value "f" selects fasta input and deselects phylip input;
# the other branch does the opposite.
701 if ( cla.is_option_set?( INPUT_TYPE_OPTION ) )
703 type = cla.get_option_value( INPUT_TYPE_OPTION )
705 set_phylip_input( true )
706 set_fasta_input( false )
707 elsif ( type == "f" )
708 set_fasta_input( true )
709 set_phylip_input( false )
711 rescue ArgumentError => e
712 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
# Output type: exactly one of phylip / fasta ("f") / nexus ("n") is switched
# on and the other two are switched off.
715 if ( cla.is_option_set?( OUTPUT_TYPE_OPTION ) )
717 type = cla.get_option_value( OUTPUT_TYPE_OPTION )
719 set_pi_output( true )
720 set_fasta_output( false )
721 set_nexus_output( false )
722 elsif ( type == "f" )
723 set_pi_output( false )
724 set_fasta_output( true )
725 set_nexus_output( false )
726 elsif ( type == "n" )
727 set_pi_output( false )
728 set_fasta_output( false )
729 set_nexus_output( true )
731 rescue ArgumentError => e
732 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
# Maximal name length, read as an integer.
735 if ( cla.is_option_set?( MAXIMAL_NAME_LENGTH_OPTION ) )
737 l = cla.get_option_value_as_int( MAXIMAL_NAME_LENGTH_OPTION )
739 rescue ArgumentError => e
740 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
# Line width, read as an integer.
743 if ( cla.is_option_set?( WIDTH_OPTION ) )
745 w = cla.get_option_value_as_int( WIDTH_OPTION )
747 rescue ArgumentError => e
748 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
# Value-less switches: only their presence is tested.
751 if ( cla.is_option_set?( CLEAN_UP_SEQ_OPTION ) )
754 if ( cla.is_option_set?( REMOVE_GAP_COLUMNS_OPTION ) )
755 set_remove_gap_columns( true )
757 if ( cla.is_option_set?( REM_RED_OPTION ) )
760 if ( cla.is_option_set?( REMOVE_GAP_ONLY_COLUMNS ) )
761 set_remove_gap_only_columns( true )
763 if ( cla.is_option_set?( REMOVE_ALL_GAP_CHARACTERS_OPTION ) )
764 set_remove_gaps( true )
# Column gap ratio, read as a float.
766 if ( cla.is_option_set?( REMOVE_COLUMNS_GAP_RATIO_OPTION ) )
768 f = cla.get_option_value_as_float( REMOVE_COLUMNS_GAP_RATIO_OPTION )
769 set_remove_gap_ratio( f )
770 rescue ArgumentError => e
771 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
# Remove-listed / keep-listed sequences: the option value is read as a plain string.
774 if ( cla.is_option_set?( REMOVE_ALL_SEQUENCES_LISTED_OPTION ) )
776 s = cla.get_option_value( REMOVE_ALL_SEQUENCES_LISTED_OPTION )
778 rescue ArgumentError => e
779 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
782 if ( cla.is_option_set?( KEEP_ONLY_SEQUENCES_LISTED_OPTION ) )
784 s = cla.get_option_value( KEEP_ONLY_SEQUENCES_LISTED_OPTION )
786 rescue ArgumentError => e
787 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
# Trim: the value must contain "<digits>-<digits>"; the two numbers are passed
# to set_trim as integers. The pattern is not anchored, so it also matches
# when the value has extra text around the range.
790 if ( cla.is_option_set?( TRIM_OPTION ) )
792 s = cla.get_option_value( TRIM_OPTION )
793 if ( s =~ /(\d+)-(\d+)/ )
794 set_trim( $1.to_i(), $2.to_i() )
# NOTE(review): presumably printed when the pattern above does not match - confirm.
796 puts( "illegal argument" )
800 rescue ArgumentError => e
801 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
# Sequence gap ratio, read as a float.
804 if ( cla.is_option_set?( REMOVE_SEQS_GAP_RATIO_OPTION ) )
806 f = cla.get_option_value_as_float( REMOVE_SEQS_GAP_RATIO_OPTION )
807 set_remove_seqs_gap_ratio( f )
808 rescue ArgumentError => e
809 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
# Minimal non-gap length: read as an integer, although the local is named f.
812 if ( cla.is_option_set?( REMOVE_SEQS_NON_GAP_LENGTH_OPTION ) )
814 f = cla.get_option_value_as_int( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
815 set_remove_seqs_min_non_gap_length( f )
816 rescue ArgumentError => e
817 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
# Split, read as an integer.
820 if ( cla.is_option_set?( SPLIT ) )
822 s = cla.get_option_value_as_int( SPLIT )
824 rescue ArgumentError => e
825 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
# Remove / keep sequences by name substring: the string is handed to the setter unchanged.
829 if ( cla.is_option_set?( REMOVE_MATCHING_SEQUENCES_OPTION ) )
831 s = cla.get_option_value( REMOVE_MATCHING_SEQUENCES_OPTION )
832 set_remove_matching( s )
833 rescue ArgumentError => e
834 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
837 if ( cla.is_option_set?( KEEP_MATCHING_SEQUENCES_OPTION ) )
839 s = cla.get_option_value( KEEP_MATCHING_SEQUENCES_OPTION )
840 set_keep_matching( s )
841 rescue ArgumentError => e
842 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
# This switch has no setter method; the instance variable is assigned directly.
845 if ( cla.is_option_set?( DIE_IF_NAME_TOO_LONG ) )
846 @die_if_name_too_long = true
855 puts( " " + PRG_NAME + ".rb [options] <input alignment> <output>" )
857 puts( " options: -" + INPUT_TYPE_OPTION + "=<input type>: f for fasta, p for phylip selex type" )
858 puts( " -" + OUTPUT_TYPE_OPTION + "=<output type>: f for fasta, n for nexus, p for phylip sequential (default)" )
859 puts( " -" + MAXIMAL_NAME_LENGTH_OPTION + "=<n>: n=maximal name length (default for phylip 10, for fasta: unlimited )" )
860 puts( " -" + DIE_IF_NAME_TOO_LONG + ": die if sequence name too long" )
861 puts( " -" + WIDTH_OPTION + "=<n>: n=width (fasta output only, default is 60)" )
862 puts( " -" + CLEAN_UP_SEQ_OPTION + ": clean up sequences" )
863 puts( " -" + REMOVE_GAP_COLUMNS_OPTION + ": remove gap columns" )
864 puts( " -" + REMOVE_GAP_ONLY_COLUMNS + ": remove gap-only columns" )
865 puts( " -" + REMOVE_COLUMNS_GAP_RATIO_OPTION + "=<n>: remove columns for which ( seqs with gap / number of sequences > n )" )
866 puts( " -" + REMOVE_ALL_GAP_CHARACTERS_OPTION + ": remove all gap characters (destroys alignment, fasta output only)" )
867 puts( " -" + REMOVE_ALL_SEQUENCES_LISTED_OPTION + "=<file>: remove all sequences listed in file" )
868 puts( " -" + KEEP_ONLY_SEQUENCES_LISTED_OPTION + "=<file>: keep only sequences listed in file" )
869 puts( " -" + TRIM_OPTION + "=<first>-<last>: remove columns before first and after last" )
870 puts( " -" + REMOVE_SEQS_GAP_RATIO_OPTION + "=<n>: remove sequences for which the gap ratio > n (after column operations)" )
871 puts( " -" + REMOVE_SEQS_NON_GAP_LENGTH_OPTION + "=<n> remove sequences with less than n non-gap characters (after column operations)" )
872 puts( " -" + REMOVE_MATCHING_SEQUENCES_OPTION + "=<s> remove all sequences with names containing s" )
873 puts( " -" + KEEP_MATCHING_SEQUENCES_OPTION + "=<s> keep only sequences with names containing s" )
874 puts( " -" + SPLIT + "=<n> split a fasta file into n files of equal number of sequences (expect for " )
875 puts( " last one), cannot be used with other options" )
876 puts( " -" + REM_RED_OPTION + ": remove redundant sequences" )
880 end # class MsaProcessor