forester/ruby/evoruby/lib/evo/tool/msa_processor.rb

   1 #
   2 # = lib/evo/apps/msa_processor.rb - MsaProcessor class
   3 #
   4 # Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
   5 # License::    GNU Lesser General Public License (LGPL)
   6 #
   7 # $Id: msa_processor.rb,v 1.33 2010/12/13 19:00:10 cmzmasek Exp $
   8 #
   9
  10 require 'date'
  11 require 'set'
  12 require 'lib/evo/util/constants'
  13 require 'lib/evo/util/util'
  14 require 'lib/evo/util/command_line_arguments'
  15 require 'lib/evo/msa/msa_factory'
  16 require 'lib/evo/io/msa_io'
  17 require 'lib/evo/io/writer/phylip_sequential_writer'
  18 require 'lib/evo/io/writer/nexus_writer'
  19 require 'lib/evo/io/writer/fasta_writer'
  20 require 'lib/evo/io/parser/fasta_parser'
  21 require 'lib/evo/io/parser/general_msa_parser'
  22 require 'lib/evo/io/writer/msa_writer'
  23
  24 module Evoruby
  25   class MsaProcessor
  26
  27     PRG_NAME       = "msa_pro"
  28     PRG_DATE       = "170215"
  29     PRG_DESC       = "processing of multiple sequence alignments"
  30     PRG_VERSION    = "1.08"
  31     WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
  32
  33     NAME_LENGTH_DEFAULT                = 10
  34     WIDTH_DEFAULT_FASTA                = 60
  35     INPUT_TYPE_OPTION                  = "i"
  36     OUTPUT_TYPE_OPTION                 = "o"
  37     MAXIMAL_NAME_LENGTH_OPTION         = "n"
  38     DIE_IF_NAME_TOO_LONG               = "d"
  39     WIDTH_OPTION                       = "w"
  40     CLEAN_UP_SEQ_OPTION                = "c"
  41     REM_RED_OPTION                     = "rem_red"
  42     REMOVE_GAP_COLUMNS_OPTION          = "rgc"
  43     REMOVE_GAP_ONLY_COLUMNS            = "rgoc"
  44     REMOVE_COLUMNS_GAP_RATIO_OPTION    = "rr"
  45     REMOVE_ALL_GAP_CHARACTERS_OPTION   = "rg"
  46     REMOVE_ALL_SEQUENCES_LISTED_OPTION = "r"
  47     KEEP_ONLY_SEQUENCES_LISTED_OPTION  = "k"
  48
  49     KEEP_MATCHING_SEQUENCES_OPTION     = "mk"
  50     REMOVE_MATCHING_SEQUENCES_OPTION   = "mr"
  51
  52     TRIM_OPTION                        = "t"
  53     REMOVE_SEQS_GAP_RATIO_OPTION       = "rsgr"
  54     REMOVE_SEQS_NON_GAP_LENGTH_OPTION  = "rsl"
  55     SPLIT                              = "split"
  56     LOG_SUFFIX                         = "_msa_pro.log"
  57     HELP_OPTION_1                      = "help"
  58     HELP_OPTION_2                      = "h"
  59     def initialize()
  60       @input_format_set = false
  61       @output_format_set = false
  62       @fasta_input      = false
  63       @phylip_input     = true
  64       @name_length      = NAME_LENGTH_DEFAULT
  65       @name_length_set  = false
  66       @width            = WIDTH_DEFAULT_FASTA     # fasta only
  67       @pi_output        = true
  68       @fasta_output     = false
  69       @nexus_output     = false
  70       @clean            = false  # phylip only
  71       @rgc              = false
  72       @rgoc             = false
  73       @rg               = false  # fasta only
  74       @rem_red          = false
  75       @die_if_name_too_long  = false
  76       @rgr              = -1
  77       @rsgr             = -1
  78       @rsl              = -1
  79       @remove_matching  = nil
  80       @keep_matching    = nil
  81
  82       @seqs_name_file   = nil
  83       @remove_seqs      = false
  84       @keep_seqs        = false
  85       @trim             = false
  86       @split            = -1
  87       @first            = -1
  88       @last             = -1
  89     end
  90
  91     def run()
  92
  93       Util.print_program_information( PRG_NAME,
  94       PRG_VERSION,
  95       PRG_DESC,
  96       PRG_DATE,
  97       WWW,
  98       STDOUT )
  99
 100       if ( ARGV == nil || ARGV.length < 1 )
 101         Util.print_message( PRG_NAME, "Illegal number of arguments" )
 102         print_help
 103         exit( -1 )
 104       end
 105
 106       begin
 107         cla = CommandLineArguments.new( ARGV )
 108       rescue ArgumentError => e
 109         Util.fatal_error( PRG_NAME, "Error: " + e.to_s, STDOUT )
 110       end
 111
 112       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
 113       cla.is_option_set?( HELP_OPTION_2 ) )
 114         print_help
 115         exit( 0 )
 116       end
 117
 118       if ( cla.get_number_of_files != 2 || ARGV.length < 2 )
 119         Util.print_message( PRG_NAME, "Illegal number of arguments" )
 120         print_help
 121         exit( -1 )
 122       end
 123
 124       allowed_opts = Array.new
 125       allowed_opts.push( INPUT_TYPE_OPTION )
 126       allowed_opts.push( OUTPUT_TYPE_OPTION )
 127       allowed_opts.push( MAXIMAL_NAME_LENGTH_OPTION )
 128       allowed_opts.push( WIDTH_OPTION )
 129       allowed_opts.push( CLEAN_UP_SEQ_OPTION )
 130       allowed_opts.push( REMOVE_GAP_COLUMNS_OPTION )
 131       allowed_opts.push( REMOVE_GAP_ONLY_COLUMNS )
 132       allowed_opts.push( REMOVE_COLUMNS_GAP_RATIO_OPTION )
 133       allowed_opts.push( REMOVE_ALL_GAP_CHARACTERS_OPTION )
 134       allowed_opts.push( REMOVE_ALL_SEQUENCES_LISTED_OPTION )
 135       allowed_opts.push( KEEP_ONLY_SEQUENCES_LISTED_OPTION )
 136       allowed_opts.push( TRIM_OPTION )
 137       allowed_opts.push( REMOVE_SEQS_GAP_RATIO_OPTION )
 138       allowed_opts.push( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
 139       allowed_opts.push( SPLIT )
 140       allowed_opts.push( REM_RED_OPTION )
 141       allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION )
 142       allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION )
 143       allowed_opts.push( DIE_IF_NAME_TOO_LONG )
 144
 145       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
 146       if ( disallowed.length > 0 )
 147         Util.fatal_error( PRG_NAME,
 148         "unknown option(s): " + disallowed )
 149       end
 150
 151       input = cla.get_file_name( 0 )
 152       output = cla.get_file_name( 1 )
 153
 154       analyze_command_line( cla )
 155
 156       begin
 157         Util.check_file_for_readability( input )
 158       rescue IOError => e
 159         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
 160       end
 161
 162       begin
 163         Util.check_file_for_writability( output )
 164       rescue IOError => e
 165         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
 166       end
 167
 168       if ( @rg )
 169         set_pi_output( false )
 170         set_fasta_output( true )
 171         set_nexus_output( false )
 172       end
 173
 174       if ( !@input_format_set )
 175         fasta_like = false
 176         begin
 177           fasta_like = Util.looks_like_fasta?( input )
 178         rescue ArgumentError => e
 179           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
 180         end
 181         @fasta_input = fasta_like
 182         @phylip_input = !fasta_like
 183         if ( !@output_format_set )
 184           @fasta_output = fasta_like
 185           @pi_output = !fasta_like
 186           @nexus_output = false
 187         end
 188       end
 189
 190       ld = Constants::LINE_DELIMITER
 191       log = PRG_NAME + " " + PRG_VERSION + " [" + PRG_DATE + "]" + " LOG" + ld
 192       now = DateTime.now
 193       log << "Date/time: " + now.to_s + ld
 194
 195       puts()
 196       puts( "Input alignment  : " + input )
 197       log << "Input alignment  : " + input + ld
 198       puts( "Output alignment : " + output )
 199       log << "Output alignment : " + output + ld
 200       if ( @phylip_input )
 201         puts( "Input is         : Phylip, or something like it" )
 202         log << "Input is         : Phylip, or something like it" + ld
 203       elsif ( @fasta_input )
 204         puts( "Input is         : Fasta" )
 205         log << "Input is         : Fasta" + ld
 206       end
 207       if( @rgr >= 0 )
 208         puts( "Max col gap ratio: " + @rgr.to_s )
 209         log << "Max col gap ratio: " + @rgr.to_s + ld
 210       elsif ( @rgc )
 211         puts( "Remove gap colums" )
 212         log << "Remove gap colums" + ld
 213       elsif( @rgoc )
 214         puts( "Remove gap only colums" )
 215         log << "Remove gap only colums" + ld
 216       end
 217       if ( @clean )
 218         puts( "Clean up         : true" )
 219         log << "Clean up         : true" + ld
 220       end
 221
 222       if ( @pi_output )
 223         puts( "Output is        : Phylip interleaved" )
 224         log << "Output is        : Phylip interleaved" + ld
 225       elsif ( @fasta_output )
 226         puts( "Output is        : Fasta" )
 227         log << "Output is        : Fasta" + ld
 228         if ( @width )
 229           puts( "Width            : " + @width.to_s )
 230           log << "Width            : " + @width.to_s + ld
 231         end
 232         if ( @rg )
 233           puts( "Remove all gap characters (alignment is destroyed)" )
 234           log << "Remove all gap characters (alignment is destroyed)" + ld
 235         end
 236       elsif ( @nexus_output )
 237         puts( "Output is        : Nexus" )
 238         log << "Output is        : Nexus" + ld
 239       end
 240       if ( @name_length_set || !@fasta_output )
 241         puts( "Max name length  : " + @name_length.to_s )
 242         log << "Max name length  : " + @name_length.to_s + ld
 243       end
 244       if( @rsgr >= 0 )
 245         puts( "Remove sequences for which the gap ratio > " + @rsgr.to_s )
 246         log << "Remove sequences for which the gap ratio > " + @rsgr.to_s + ld
 247       end
 248       if( @rsl >= 0 )
 249         puts( "Remove sequences with less than "  + @rsl.to_s + " non-gap characters" )
 250         log << "Remove sequences with less than "  + @rsl.to_s + " non-gap characters" + ld
 251       end
 252       if ( @remove_seqs )
 253         puts( "Remove sequences listed in: " + @seqs_name_file )
 254         log << "Remove sequences listed in: " + @seqs_name_file + ld
 255       elsif ( @keep_seqs )
 256         puts( "Keep only sequences listed in: " + @seqs_name_file )
 257         log << "Keep only sequences listed in: " + @seqs_name_file + ld
 258       end
 259       if ( @trim )
 260         puts( "Keep only columns from: "+ @first.to_s + " to " + @last.to_s )
 261         log << "Keep only columns from: "+ @first.to_s + " to " + @last.to_s + ld
 262       end
 263       if ( @rem_red )
 264         puts( "Remove redundant sequences: true" )
 265         log << "Remove redundant sequences: true" + ld
 266       end
 267       if ( @split > 0 )
 268         puts( "Split            : " + @split.to_s )
 269         log << "Split            : " + @split.to_s + ld
 270       end
 271       puts()
 272
 273       f = MsaFactory.new()
 274
 275       msa = nil
 276
 277       begin
 278         if ( @phylip_input )
 279           msa = f.create_msa_from_file( input, GeneralMsaParser.new() )
 280         elsif ( @fasta_input )
 281           msa = f.create_msa_from_file( input, FastaParser.new() )
 282         end
 283       rescue Exception => e
 284         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 285       end
 286
 287       if ( msa.is_aligned() )
 288         Util.print_message( PRG_NAME, "Length of original alignment         : " + msa.get_length.to_s )
 289         log << "Length of original alignment         : " + msa.get_length.to_s + ld
 290         gp = msa.calculate_gap_proportion
 291         Util.print_message( PRG_NAME, "Gap-proportion of original alignment : " + gp.to_s )
 292         log << "Gap-proportion of original alignment : " +  gp.to_s + ld
 293       else
 294         Util.print_message( PRG_NAME, "Input is not aligned" )
 295         log << "Input is not aligned" + ld
 296       end
 297
 298       all_names = Set.new()
 299       for i in 0 ... msa.get_number_of_seqs()
 300         current_name = msa.get_sequence( i ).get_name
 301         if all_names.include?( current_name )
 302           Util.print_warning_message( PRG_NAME, "sequence name [" + current_name + "] is not unique" )
 303         else
 304           all_names.add( current_name )
 305         end
 306       end
 307
 308       begin
 309
 310         if ( @remove_seqs || @keep_seqs )
 311           names = Util.file2array( @seqs_name_file, true )
 312           if ( names == nil ||  names.length() < 1 )
 313             error_msg = "file \"" + @seqs_name_file.to_s + "\" appears empty"
 314             Util.fatal_error( PRG_NAME, error_msg )
 315           end
 316
 317           if ( @remove_seqs )
 318             c = 0
 319             for i in 0 ... names.length()
 320               to_delete = msa.find_by_name( names[ i ], true, false )
 321               if ( to_delete.length() < 1 )
 322                 error_msg = "sequence name \"" + names[ i ] + "\" not found"
 323                 Util.fatal_error( PRG_NAME, error_msg )
 324               elsif ( to_delete.length() > 1 )
 325                 error_msg = "sequence name \"" + names[ i ] + "\" is not unique"
 326                 Util.fatal_error( PRG_NAME, error_msg )
 327               else
 328                 msa.remove_sequence!( to_delete[ 0 ] )
 329                 c += 1
 330               end
 331             end
 332             Util.print_message( PRG_NAME, "Removed " + c.to_s + " sequences" )
 333             log <<  "Removed " + c.to_s + " sequences" + ld
 334           elsif ( @keep_seqs )
 335             msa_new = Msa.new()
 336             r = 0
 337             k = 0
 338             for j in 0 ... msa.get_number_of_seqs()
 339               if ( names.include?( msa.get_sequence( j ).get_name() ) )
 340                 msa_new.add_sequence( msa.get_sequence( j ) )
 341                 k += 1
 342               else
 343                 r += 1
 344               end
 345             end
 346             msa = msa_new
 347             Util.print_message( PRG_NAME, "Kept    " + k.to_s + " sequences" )
 348             log << "Kept    " + k.to_s + " sequences" + ld
 349             Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" )
 350             log << "removed " + r.to_s + " sequences" + ld
 351           end
 352         end
 353
 354         if ( @trim )
 355           msa.trim!( @first, @last )
 356         end
 357         if( @rgr >= 0 )
 358           msa.remove_gap_columns_w_gap_ratio!( @rgr )
 359         elsif ( @rgc )
 360           msa.remove_gap_columns!()
 361         elsif( @rgoc )
 362           msa.remove_gap_only_columns!()
 363         end
 364         if( @rsgr >= 0 )
 365           n = msa.get_number_of_seqs()
 366           removed = msa.remove_sequences_by_gap_ratio!( @rsgr )
 367           k = msa.get_number_of_seqs()
 368           r = n - k
 369           Util.print_message( PRG_NAME, "Kept    " + k.to_s + " sequences" )
 370           log << "Kept    " + k.to_s + " sequences" + ld
 371           Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences"  )
 372           log << "Removed " + r.to_s + " sequences:" + ld
 373           removed.each { | seq_name |
 374             log << "         " + seq_name  + ld
 375           }
 376         end
 377         if( @rsl >= 0 )
 378           n = msa.get_number_of_seqs()
 379           removed = msa.remove_sequences_by_non_gap_length!( @rsl )
 380           k = msa.get_number_of_seqs()
 381           r = n - k
 382           Util.print_message( PRG_NAME, "Kept    " + k.to_s + " sequences" )
 383           log << "Kept    " + k.to_s + " sequences" + ld
 384           Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" )
 385           log << "Removed " + r.to_s + " sequences:" + ld
 386           removed.each { | seq_name |
 387             log << "         " + seq_name  + ld
 388           }
 389         end
 390         if ( @keep_matching )
 391           n = msa.get_number_of_seqs
 392           to_be_removed = Set.new
 393           for ii in 0 ...  n
 394             seq = msa.get_sequence( ii )
 395             if !seq.get_name.downcase.index( @keep_matching.downcase )
 396               to_be_removed.add( ii )
 397             end
 398           end
 399           to_be_removed_ary = to_be_removed.to_a.sort.reverse
 400           to_be_removed_ary.each { | index |
 401             msa.remove_sequence!( index )
 402           }
 403           # msa = sort( msa )
 404         end
 405         if ( @remove_matching )
 406           n = msa.get_number_of_seqs
 407           to_be_removed = Set.new
 408           for iii in 0 ... n
 409
 410             seq = msa.get_sequence( iii )
 411
 412             if seq.get_name.downcase.index( @remove_matching.downcase )
 413               to_be_removed.add( iii )
 414             end
 415           end
 416           to_be_removed_ary = to_be_removed.to_a.sort.reverse
 417           to_be_removed_ary.each { | index |
 418             msa.remove_sequence!( index )
 419           }
 420           msa = sort( msa )
 421         end
 422
 423         if ( @split > 0 )
 424           begin
 425             msas = msa.split( @split, true )
 426             io = MsaIO.new()
 427             w = MsaWriter
 428             if ( @pi_output )
 429               w = PhylipSequentialWriter.new()
 430               w.clean( @clean )
 431               w.set_max_name_length( @name_length )
 432             elsif( @fasta_output )
 433               w = FastaWriter.new()
 434               w.set_line_width( @width )
 435               if ( @rg )
 436                 w.remove_gap_chars( true )
 437                 Util.print_warning_message( PRG_NAME, "removing gap character, the output is likely to become unaligned" )
 438                 log << "removing gap character, the output is likely to become unaligned" + ld
 439               end
 440               w.clean( @clean )
 441               if ( @name_length_set )
 442                 w.set_max_name_length( @name_length )
 443               end
 444             elsif( @nexus_output )
 445               w = NexusWriter.new()
 446               w.clean( @clean )
 447               w.set_max_name_length( @name_length )
 448             end
 449             i = 0
 450             for m in msas
 451               i = i + 1
 452               io.write_to_file( m, output + "_" + i.to_s, w )
 453             end
 454             Util.print_message( PRG_NAME, "wrote " + msas.length.to_s + " files"  )
 455             log << "wrote " + msas.length.to_s + " files" + ld
 456           rescue Exception => e
 457             Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 458           end
 459
 460         end
 461       rescue Exception => e
 462         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 463       end
 464
 465       if ( @split <= 0 )
 466
 467         unless ( @rg )
 468           if ( msa.is_aligned() )
 469             Util.print_message( PRG_NAME, "Length of processed alignment        : " + msa.get_length.to_s )
 470             log <<  "Length of processed alignment        : " + msa.get_length.to_s + ld
 471             gp = msa.calculate_gap_proportion
 472             Util.print_message( PRG_NAME, "Gap-proportion of processed alignment: " + gp.to_s )
 473             log << "Gap-proportion of processed alignment: " +  gp.to_s + ld
 474           else
 475             min = 0
 476             max = 0
 477             sum = 0
 478             first = true
 479             for s in 0 ... msa.get_number_of_seqs
 480               seq = msa.get_sequence( s )
 481               l = seq.get_length
 482               sum += l
 483               if l > max
 484                 max = l
 485               end
 486               if first || l < min
 487                 min = l
 488               end
 489               first = false
 490             end
 491             avg = sum / msa.get_number_of_seqs
 492             Util.print_message( PRG_NAME, "Output is not aligned" )
 493             log << "Output is not aligned" + ld
 494             Util.print_message( PRG_NAME, "Shortest sequence                    : " + min.to_s )
 495             log <<  "Shortest sequence                    : " + min.to_s + ld
 496             Util.print_message( PRG_NAME, "Longest sequence                     : " + max.to_s )
 497             log <<  "Longest sequence                     : " + max.to_s + ld
 498             Util.print_message( PRG_NAME, "Average length                       : " + avg.to_s )
 499             log <<  "Average length                       : " + avg.to_s + ld
 500
 501           end
 502         end
 503
 504         if @rem_red
 505           removed = msa.remove_redundant_sequences!( true, false )
 506           if removed.size > 0
 507             identicals = msa.get_identical_seqs_detected
 508             log << "the following " + identicals.size.to_s + " sequences are identical:" + ld
 509             identicals.each { | identical |
 510               log << identical + ld
 511             }
 512             log << "ignoring the following " + removed.size.to_s + " redundant sequences:" + ld
 513             removed.each { | seq_name |
 514               log << seq_name + ld
 515             }
 516             Util.print_message( PRG_NAME, "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" )
 517             log << "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" + ld
 518           end
 519         end
 520
 521         io = MsaIO.new()
 522
 523         w = MsaWriter
 524
 525         if ( @pi_output )
 526           w = PhylipSequentialWriter.new()
 527           w.clean( @clean )
 528           w.set_max_name_length( @name_length )
 529           w.set_exception_if_name_too_long( @die_if_name_too_long )
 530         elsif( @fasta_output )
 531           w = FastaWriter.new()
 532           w.set_line_width( @width )
 533           if ( @rg )
 534             w.remove_gap_chars( true )
 535             Util.print_warning_message( PRG_NAME, "removing gap characters, the output is likely to become unaligned"  )
 536             log << "removing gap character, the output is likely to become unaligned" + ld
 537           end
 538           w.clean( @clean )
 539           if ( @name_length_set )
 540             w.set_max_name_length( @name_length )
 541             w.set_exception_if_name_too_long( @die_if_name_too_long )
 542           end
 543         elsif( @nexus_output )
 544           w = NexusWriter.new()
 545           w.clean( @clean )
 546           w.set_max_name_length( @name_length )
 547           w.set_exception_if_name_too_long( @die_if_name_too_long )
 548         end
 549
 550         begin
 551           io.write_to_file( msa, output, w )
 552         rescue Exception => e
 553           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
 554         end
 555
 556         Util.print_message( PRG_NAME, "Number of sequences in output        : " + msa.get_number_of_seqs.to_s )
 557         log << "Number of sequences in output        : " + msa.get_number_of_seqs.to_s + ld
 558
 559         begin
 560           f = File.open( output + LOG_SUFFIX, 'a' )
 561           f.print( log )
 562           f.close
 563         rescue Exception => e
 564           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
 565         end
 566
 567       end
 568       Util.print_message( PRG_NAME, "OK" )
 569       puts
 570     end
 571
 572     private
 573
 574     def sort( msa )
 575       names = Set.new
 576       for i in 0 ... msa.get_number_of_seqs
 577         name = msa.get_sequence( i ).get_name
 578         names.add( name )
 579       end
 580       sorted_ary = names.to_a.sort
 581       new_msa = Msa.new
 582       sorted_ary.each { | seq_name |
 583         seq = msa.get_sequence( msa.find_by_name( seq_name, true, false )[ 0 ] )
 584         new_msa.add_sequence( seq )
 585       }
 586       new_msa
 587     end
 588
 589     def set_fasta_input( fi = true )
 590       @fasta_input = fi
 591       @input_format_set = true
 592     end
 593
 594     def set_phylip_input( pi = true )
 595       @phylip_input = pi
 596       @input_format_set = true
 597     end
 598
 599     def set_name_length( i )
 600       @name_length = i
 601       @name_length_set = true
 602     end
 603
 604     def set_width( i )
 605       @width = i
 606     end
 607
 608     def set_fasta_output( fo = true )
 609       @fasta_output = fo
 610       @output_format_set = true
 611     end
 612
 613     def set_pi_output( pso = true )
 614       @pi_output = pso
 615       @output_format_set = true
 616     end
 617
 618     def set_nexus_output( nexus = true )
 619       @nexus_output = nexus
 620       @output_format_set = true
 621     end
 622
 623     def set_clean( c = true )
 624       @clean = c
 625     end
 626
 627     def set_remove_gap_columns( rgc = true )
 628       @rgc = rgc
 629     end
 630
 631     def set_remove_gap_only_columns( rgoc = true )
 632       @rgoc = rgoc
 633     end
 634
 635     def set_remove_gaps( rg = true )
 636       @rg = rg
 637     end
 638
 639     def set_remove_gap_ratio( rgr )
 640       @rgr = rgr
 641     end
 642
 643     def set_remove_seqs_gap_ratio( rsgr )
 644       @rsgr = rsgr
 645     end
 646
 647     def set_remove_seqs_min_non_gap_length( rsl )
 648       @rsl = rsl
 649     end
 650
 651     def set_remove_seqs( file )
 652       @seqs_name_file = file
 653       @remove_seqs    = true
 654       @keep_seqs      = false
 655     end
 656
 657     def set_keep_seqs( file )
 658       @seqs_name_file = file
 659       @keep_seqs      = true
 660       @remove_seqs    = false
 661     end
 662
 663     def set_trim( first, last )
 664       @trim            = true
 665       @first           = first
 666       @last            = last
 667     end
 668
 669     def set_remove_matching( remove )
 670       @remove_matching  = remove
 671     end
 672
 673     def set_keep_matching( keep )
 674       @keep_matching = keep
 675     end
 676
 677     def set_rem_red( rr )
 678       @rem_red = rr
 679     end
 680
 681     def set_split( s )
 682       if ( s > 0 )
 683         @split            = s
 684         @clean            = false  # phylip only
 685         @rgc              = false
 686         @rgoc             = false
 687         @rg               = false  # fasta only
 688         @rgr              = -1
 689         @rsgr             = -1
 690         @rsl              = -1
 691         @seqs_name_file   = nil
 692         @remove_seqs      = false
 693         @keep_seqs        = false
 694         @trim             = false
 695         @first            = -1
 696         @last             = -1
 697       end
 698     end
 699
 700     def analyze_command_line( cla )
 701       if ( cla.is_option_set?( INPUT_TYPE_OPTION ) )
 702         begin
 703           type = cla.get_option_value( INPUT_TYPE_OPTION )
 704           if ( type == "p" )
 705             set_phylip_input( true )
 706             set_fasta_input( false )
 707           elsif ( type == "f" )
 708             set_fasta_input( true )
 709             set_phylip_input( false )
 710           end
 711         rescue ArgumentError => e
 712           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 713         end
 714       end
 715       if ( cla.is_option_set?( OUTPUT_TYPE_OPTION ) )
 716         begin
 717           type = cla.get_option_value( OUTPUT_TYPE_OPTION )
 718           if ( type == "p" )
 719             set_pi_output( true )
 720             set_fasta_output( false )
 721             set_nexus_output( false )
 722           elsif ( type == "f" )
 723             set_pi_output( false )
 724             set_fasta_output( true )
 725             set_nexus_output( false )
 726           elsif ( type == "n" )
 727             set_pi_output( false )
 728             set_fasta_output( false )
 729             set_nexus_output( true )
 730           end
 731         rescue ArgumentError => e
 732           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 733         end
 734       end
 735       if ( cla.is_option_set?( MAXIMAL_NAME_LENGTH_OPTION ) )
 736         begin
 737           l = cla.get_option_value_as_int( MAXIMAL_NAME_LENGTH_OPTION )
 738           set_name_length( l )
 739         rescue ArgumentError => e
 740           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 741         end
 742       end
 743       if ( cla.is_option_set?( WIDTH_OPTION ) )
 744         begin
 745           w = cla.get_option_value_as_int( WIDTH_OPTION )
 746           set_width( w )
 747         rescue ArgumentError => e
 748           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 749         end
 750       end
 751       if ( cla.is_option_set?( CLEAN_UP_SEQ_OPTION ) )
 752         set_clean( true )
 753       end
 754       if ( cla.is_option_set?( REMOVE_GAP_COLUMNS_OPTION ) )
 755         set_remove_gap_columns( true )
 756       end
 757       if ( cla.is_option_set?( REM_RED_OPTION ) )
 758         set_rem_red( true )
 759       end
 760       if ( cla.is_option_set?( REMOVE_GAP_ONLY_COLUMNS ) )
 761         set_remove_gap_only_columns( true )
 762       end
 763       if ( cla.is_option_set?( REMOVE_ALL_GAP_CHARACTERS_OPTION ) )
 764         set_remove_gaps( true )
 765       end
 766       if ( cla.is_option_set?( REMOVE_COLUMNS_GAP_RATIO_OPTION ) )
 767         begin
 768           f = cla.get_option_value_as_float( REMOVE_COLUMNS_GAP_RATIO_OPTION )
 769           set_remove_gap_ratio( f )
 770         rescue ArgumentError => e
 771           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 772         end
 773       end
 774       if ( cla.is_option_set?( REMOVE_ALL_SEQUENCES_LISTED_OPTION ) )
 775         begin
 776           s = cla.get_option_value( REMOVE_ALL_SEQUENCES_LISTED_OPTION )
 777           set_remove_seqs( s )
 778         rescue ArgumentError => e
 779           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 780         end
 781       end
 782       if ( cla.is_option_set?( KEEP_ONLY_SEQUENCES_LISTED_OPTION ) )
 783         begin
 784           s = cla.get_option_value( KEEP_ONLY_SEQUENCES_LISTED_OPTION )
 785           set_keep_seqs( s )
 786         rescue ArgumentError => e
 787           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 788         end
 789       end
 790       if ( cla.is_option_set?( TRIM_OPTION ) )
 791         begin
 792           s = cla.get_option_value( TRIM_OPTION )
 793           if ( s =~ /(\d+)-(\d+)/ )
 794             set_trim( $1.to_i(), $2.to_i() )
 795           else
 796             puts( "illegal argument" )
 797             print_help
 798             exit( -1 )
 799           end
 800         rescue ArgumentError => e
 801           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 802         end
 803       end
 804       if ( cla.is_option_set?( REMOVE_SEQS_GAP_RATIO_OPTION ) )
 805         begin
 806           f = cla.get_option_value_as_float( REMOVE_SEQS_GAP_RATIO_OPTION )
 807           set_remove_seqs_gap_ratio( f )
 808         rescue ArgumentError => e
 809           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 810         end
 811       end
 812       if ( cla.is_option_set?( REMOVE_SEQS_NON_GAP_LENGTH_OPTION ) )
 813         begin
 814           f = cla.get_option_value_as_int( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
 815           set_remove_seqs_min_non_gap_length( f )
 816         rescue ArgumentError => e
 817           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 818         end
 819       end
 820       if ( cla.is_option_set?( SPLIT ) )
 821         begin
 822           s = cla.get_option_value_as_int( SPLIT )
 823           set_split( s )
 824         rescue ArgumentError => e
 825           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 826         end
 827
 828       end
 829       if ( cla.is_option_set?( REMOVE_MATCHING_SEQUENCES_OPTION ) )
 830         begin
 831           s = cla.get_option_value( REMOVE_MATCHING_SEQUENCES_OPTION )
 832           set_remove_matching( s )
 833         rescue ArgumentError => e
 834           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 835         end
 836       end
 837       if ( cla.is_option_set?( KEEP_MATCHING_SEQUENCES_OPTION ) )
 838         begin
 839           s = cla.get_option_value( KEEP_MATCHING_SEQUENCES_OPTION )
 840           set_keep_matching( s )
 841         rescue ArgumentError => e
 842           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 843         end
 844       end
 845       if ( cla.is_option_set?( DIE_IF_NAME_TOO_LONG ) )
 846         @die_if_name_too_long = true
 847       end
 848
 849     end
 850
 851     def print_help()
 852       puts()
 853       puts( "Usage:" )
 854       puts()
 855       puts( "  " + PRG_NAME + ".rb [options] <input alignment> <output>" )
 856       puts()
 857       puts( "  options: -" + INPUT_TYPE_OPTION + "=<input type>: f for fasta, p for phylip selex type" )
 858       puts( "           -" + OUTPUT_TYPE_OPTION + "=<output type>: f for fasta, n for nexus, p for phylip sequential (default)" )
 859       puts( "           -" + MAXIMAL_NAME_LENGTH_OPTION + "=<n>: n=maximal name length (default for phylip 10, for fasta: unlimited )" )
 860       puts( "           -" + DIE_IF_NAME_TOO_LONG + ": die if sequence name too long" )
 861       puts( "           -" + WIDTH_OPTION + "=<n>: n=width (fasta output only, default is 60)" )
 862       puts( "           -" + CLEAN_UP_SEQ_OPTION + ": clean up sequences" )
 863       puts( "           -" + REMOVE_GAP_COLUMNS_OPTION + ": remove gap columns" )
 864       puts( "           -" + REMOVE_GAP_ONLY_COLUMNS + ": remove gap-only columns" )
 865       puts( "           -" + REMOVE_COLUMNS_GAP_RATIO_OPTION + "=<n>: remove columns for which ( seqs with gap / number of sequences > n )" )
 866       puts( "           -" + REMOVE_ALL_GAP_CHARACTERS_OPTION + ": remove all gap characters (destroys alignment, fasta output only)" )
 867       puts( "           -" + REMOVE_ALL_SEQUENCES_LISTED_OPTION + "=<file>: remove all sequences listed in file" )
 868       puts( "           -" + KEEP_ONLY_SEQUENCES_LISTED_OPTION + "=<file>: keep only sequences listed in file" )
 869       puts( "           -" + TRIM_OPTION + "=<first>-<last>: remove columns before first and after last" )
 870       puts( "           -" + REMOVE_SEQS_GAP_RATIO_OPTION + "=<n>: remove sequences for which the gap ratio > n (after column operations)" )
 871       puts( "           -" + REMOVE_SEQS_NON_GAP_LENGTH_OPTION + "=<n> remove sequences with less than n non-gap characters (after column operations)" )
 872       puts( "           -" + REMOVE_MATCHING_SEQUENCES_OPTION + "=<s> remove all sequences with names containing s" )
 873       puts( "           -" + KEEP_MATCHING_SEQUENCES_OPTION + "=<s> keep only sequences with names containing s" )
 874       puts( "           -" + SPLIT + "=<n> split a fasta file into n files of equal number of sequences (expect for " )
 875       puts( "            last one), cannot be used with other options" )
 876       puts( "           -" + REM_RED_OPTION + ": remove redundant sequences" )
 877       puts()
 878     end
 879
 880   end # class MsaProcessor
 881
 882 end # module Evoruby