forester/ruby/evoruby/lib/evo/tool/msa_processor.rb

   1 #
   2 # = lib/evo/apps/msa_processor.rb - MsaProcessor class
   3 #
   4 # Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
   5 # License::    GNU Lesser General Public License (LGPL)
   6 #
   7 # $Id: msa_processor.rb,v 1.33 2010/12/13 19:00:10 cmzmasek Exp $
   8 #
   9
  10 require 'date'
  11 require 'set'
  12
  13 require 'lib/evo/util/constants'
  14 require 'lib/evo/util/util'
  15 require 'lib/evo/util/command_line_arguments'
  16 require 'lib/evo/msa/msa_factory'
  17 require 'lib/evo/io/msa_io'
  18 require 'lib/evo/io/writer/phylip_sequential_writer'
  19 require 'lib/evo/io/writer/nexus_writer'
  20 require 'lib/evo/io/writer/fasta_writer'
  21 require 'lib/evo/io/parser/fasta_parser'
  22 require 'lib/evo/io/parser/general_msa_parser'
  23 require 'lib/evo/io/writer/msa_writer'
  24
  25 module Evoruby
  26
  27   class MsaProcessor
  28
  29     PRG_NAME       = "msa_pro"
  30     PRG_DATE       = "131112"
  31     PRG_DESC       = "processing of multiple sequence alignments"
  32     PRG_VERSION    = "1.08"
  33     COPYRIGHT      = "2008-2010 Christian M Zmasek"
  34     CONTACT        = "phylosoft@gmail.com"
  35     WWW            = "www.phylosoft.org"
  36
  37
  38     NAME_LENGTH_DEFAULT                = 10
  39     WIDTH_DEFAULT_FASTA                = 60
  40     INPUT_TYPE_OPTION                  = "i"
  41     OUTPUT_TYPE_OPTION                 = "o"
  42     MAXIMAL_NAME_LENGTH_OPTION         = "n"
  43     DIE_IF_NAME_TOO_LONG               = "d"
  44     WIDTH_OPTION                       = "w"
  45     CLEAN_UP_SEQ_OPTION                = "c"
  46     REM_RED_OPTION                     = "rem_red"
  47     REMOVE_GAP_COLUMNS_OPTION          = "rgc"
  48     REMOVE_GAP_ONLY_COLUMNS            = "rgoc"
  49     REMOVE_COLUMNS_GAP_RATIO_OPTION    = "rr"
  50     REMOVE_ALL_GAP_CHARACTERS_OPTION   = "rg"
  51     REMOVE_ALL_SEQUENCES_LISTED_OPTION = "r"
  52     KEEP_ONLY_SEQUENCES_LISTED_OPTION  = "k"
  53
  54     KEEP_MATCHING_SEQUENCES_OPTION     = "mk"
  55     REMOVE_MATCHING_SEQUENCES_OPTION   = "mr"
  56
  57     TRIM_OPTION                        = "t"
  58     REMOVE_SEQS_GAP_RATIO_OPTION       = "rsgr"
  59     REMOVE_SEQS_NON_GAP_LENGTH_OPTION  = "rsl"
  60     SPLIT                              = "split"
  61     LOG_SUFFIX                         = "_msa_pro.log"
  62     HELP_OPTION_1                      = "help"
  63     HELP_OPTION_2                      = "h"
  64
  65
  66     def initialize()
  67       @input_format_set = false
  68       @output_format_set = false
  69       @fasta_input      = false
  70       @phylip_input     = true
  71       @name_length      = NAME_LENGTH_DEFAULT
  72       @name_length_set  = false
  73       @width            = WIDTH_DEFAULT_FASTA     # fasta only
  74       @pi_output        = true
  75       @fasta_output     = false
  76       @nexus_output     = false
  77       @clean            = false  # phylip only
  78       @rgc              = false
  79       @rgoc             = false
  80       @rg               = false  # fasta only
  81       @rem_red          = false
  82       @die_if_name_too_long  = false
  83       @rgr              = -1
  84       @rsgr             = -1
  85       @rsl              = -1
  86       @remove_matching  = nil
  87       @keep_matching    = nil
  88
  89       @seqs_name_file   = nil
  90       @remove_seqs      = false
  91       @keep_seqs        = false
  92       @trim             = false
  93       @split            = -1
  94       @first            = -1
  95       @last             = -1
  96     end
  97
  98
  99     def run()
 100
 101       Util.print_program_information( PRG_NAME,
 102         PRG_VERSION,
 103         PRG_DESC,
 104         PRG_DATE,
 105         COPYRIGHT,
 106         CONTACT,
 107         WWW,
 108         STDOUT )
 109
 110       if ( ARGV == nil || ARGV.length < 1 )
 111         Util.print_message( PRG_NAME, "Illegal number of arguments" )
 112         print_help
 113         exit( -1 )
 114       end
 115
 116       begin
 117         cla = CommandLineArguments.new( ARGV )
 118       rescue ArgumentError => e
 119         Util.fatal_error( PRG_NAME, "Error: " + e.to_s, STDOUT )
 120       end
 121
 122       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
 123            cla.is_option_set?( HELP_OPTION_2 ) )
 124         print_help
 125         exit( 0 )
 126       end
 127
 128       if ( cla.get_number_of_files != 2 || ARGV.length < 2 )
 129         Util.print_message( PRG_NAME, "Illegal number of arguments" )
 130         print_help
 131         exit( -1 )
 132       end
 133
 134       allowed_opts = Array.new
 135       allowed_opts.push( INPUT_TYPE_OPTION )
 136       allowed_opts.push( OUTPUT_TYPE_OPTION )
 137       allowed_opts.push( MAXIMAL_NAME_LENGTH_OPTION )
 138       allowed_opts.push( WIDTH_OPTION )
 139       allowed_opts.push( CLEAN_UP_SEQ_OPTION )
 140       allowed_opts.push( REMOVE_GAP_COLUMNS_OPTION )
 141       allowed_opts.push( REMOVE_GAP_ONLY_COLUMNS )
 142       allowed_opts.push( REMOVE_COLUMNS_GAP_RATIO_OPTION )
 143       allowed_opts.push( REMOVE_ALL_GAP_CHARACTERS_OPTION )
 144       allowed_opts.push( REMOVE_ALL_SEQUENCES_LISTED_OPTION )
 145       allowed_opts.push( KEEP_ONLY_SEQUENCES_LISTED_OPTION )
 146       allowed_opts.push( TRIM_OPTION )
 147       allowed_opts.push( REMOVE_SEQS_GAP_RATIO_OPTION )
 148       allowed_opts.push( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
 149       allowed_opts.push( SPLIT )
 150       allowed_opts.push( REM_RED_OPTION )
 151       allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION )
 152       allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION )
 153       allowed_opts.push( DIE_IF_NAME_TOO_LONG )
 154
 155       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
 156       if ( disallowed.length > 0 )
 157         Util.fatal_error( PRG_NAME,
 158           "unknown option(s): " + disallowed )
 159       end
 160
 161       input = cla.get_file_name( 0 )
 162       output = cla.get_file_name( 1 )
 163
 164       analyze_command_line( cla )
 165
 166       begin
 167         Util.check_file_for_readability( input )
 168       rescue IOError => e
 169         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
 170       end
 171
 172       begin
 173         Util.check_file_for_writability( output )
 174       rescue IOError => e
 175         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
 176       end
 177
 178       if ( @rg )
 179         set_pi_output( false )
 180         set_fasta_output( true )
 181         set_nexus_output( false )
 182       end
 183
 184       if ( !@input_format_set )
 185         fasta_like = false
 186         begin
 187           fasta_like = Util.looks_like_fasta?( input )
 188         rescue ArgumentError => e
 189           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
 190         end
 191         @fasta_input = fasta_like
 192         @phylip_input = !fasta_like
 193         if ( !@output_format_set )
 194           @fasta_output = fasta_like
 195           @pi_output = !fasta_like
 196           @nexus_output = false
 197         end
 198       end
 199
 200       ld = Constants::LINE_DELIMITER
 201       log = PRG_NAME + " " + PRG_VERSION + " [" + PRG_DATE + "]" + " LOG" + ld
 202       now = DateTime.now
 203       log << "Date/time: " + now.to_s + ld
 204
 205       puts()
 206       puts( "Input alignment  : " + input )
 207       log << "Input alignment  : " + input + ld
 208       puts( "Output alignment : " + output )
 209       log << "Output alignment : " + output + ld
 210       if ( @phylip_input )
 211         puts( "Input is         : Phylip, or something like it" )
 212         log << "Input is         : Phylip, or something like it" + ld
 213       elsif ( @fasta_input )
 214         puts( "Input is         : Fasta" )
 215         log << "Input is         : Fasta" + ld
 216       end
 217       if( @rgr >= 0 )
 218         puts( "Max col gap ratio: " + @rgr.to_s )
 219         log << "Max col gap ratio: " + @rgr.to_s + ld
 220       elsif ( @rgc )
 221         puts( "Remove gap colums" )
 222         log << "Remove gap colums" + ld
 223       elsif( @rgoc )
 224         puts( "Remove gap only colums" )
 225         log << "Remove gap only colums" + ld
 226       end
 227       if ( @clean )
 228         puts( "Clean up         : true" )
 229         log << "Clean up         : true" + ld
 230       end
 231
 232       if ( @pi_output )
 233         puts( "Output is        : Phylip interleaved" )
 234         log << "Output is        : Phylip interleaved" + ld
 235       elsif ( @fasta_output )
 236         puts( "Output is        : Fasta" )
 237         log << "Output is        : Fasta" + ld
 238         if ( @width )
 239           puts( "Width            : " + @width.to_s )
 240           log << "Width            : " + @width.to_s + ld
 241         end
 242         if ( @rg )
 243           puts( "Remove all gap characters (alignment is destroyed)" )
 244           log << "Remove all gap characters (alignment is destroyed)" + ld
 245         end
 246       elsif ( @nexus_output )
 247         puts( "Output is        : Nexus" )
 248         log << "Output is        : Nexus" + ld
 249       end
 250       if ( @name_length_set || !@fasta_output )
 251         puts( "Max name length  : " + @name_length.to_s )
 252         log << "Max name length  : " + @name_length.to_s + ld
 253       end
 254       if( @rsgr >= 0 )
 255         puts( "Remove sequences for which the gap ratio > " + @rsgr.to_s )
 256         log << "Remove sequences for which the gap ratio > " + @rsgr.to_s + ld
 257       end
 258       if( @rsl >= 0 )
 259         puts( "Remove sequences with less than "  + @rsl.to_s + " non-gap characters" )
 260         log << "Remove sequences with less than "  + @rsl.to_s + " non-gap characters" + ld
 261       end
 262       if ( @remove_seqs )
 263         puts( "Remove sequences listed in: " + @seqs_name_file )
 264         log << "Remove sequences listed in: " + @seqs_name_file + ld
 265       elsif ( @keep_seqs )
 266         puts( "Keep only sequences listed in: " + @seqs_name_file )
 267         log << "Keep only sequences listed in: " + @seqs_name_file + ld
 268       end
 269       if ( @trim )
 270         puts( "Keep only columns from: "+ @first.to_s + " to " + @last.to_s )
 271         log << "Keep only columns from: "+ @first.to_s + " to " + @last.to_s + ld
 272       end
 273       if ( @rem_red )
 274         puts( "Remove redundant sequences: true" )
 275         log << "Remove redundant sequences: true" + ld
 276       end
 277       if ( @split > 0 )
 278         puts( "Split            : " + @split.to_s )
 279         log << "Split            : " + @split.to_s + ld
 280       end
 281       puts()
 282
 283       f = MsaFactory.new()
 284
 285       msa = nil
 286
 287       begin
 288         if ( @phylip_input )
 289           msa = f.create_msa_from_file( input, GeneralMsaParser.new() )
 290         elsif ( @fasta_input )
 291           msa = f.create_msa_from_file( input, FastaParser.new() )
 292         end
 293       rescue Exception => e
 294         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 295       end
 296
 297       if ( msa.is_aligned() )
 298         Util.print_message( PRG_NAME, "Length of original alignment         : " + msa.get_length.to_s )
 299         log << "Length of original alignment         : " + msa.get_length.to_s + ld
 300         gp = msa.calculate_gap_proportion
 301         Util.print_message( PRG_NAME, "Gap-proportion of original alignment : " + gp.to_s )
 302         log << "Gap-proportion of original alignment : " +  gp.to_s + ld
 303       else
 304         Util.print_message( PRG_NAME, "Input is not aligned" )
 305         log << "Input is not aligned" + ld
 306       end
 307
 308       all_names = Set.new()
 309       for i in 0 ... msa.get_number_of_seqs()
 310         current_name = msa.get_sequence( i ).get_name
 311         if all_names.include?( current_name )
 312           Util.print_warning_message( PRG_NAME, "sequence name [" + current_name + "] is not unique" )
 313         else
 314           all_names.add( current_name )
 315         end
 316       end
 317
 318       begin
 319
 320         if ( @remove_seqs || @keep_seqs )
 321           names = Util.file2array( @seqs_name_file, true )
 322           if ( names == nil ||  names.length() < 1 )
 323             error_msg = "file \"" + @seqs_name_file.to_s + "\" appears empty"
 324             Util.fatal_error( PRG_NAME, error_msg )
 325           end
 326
 327           if ( @remove_seqs )
 328             c = 0
 329             for i in 0 ... names.length()
 330               to_delete = msa.find_by_name( names[ i ], true, false )
 331               if ( to_delete.length() < 1 )
 332                 error_msg = "sequence name \"" + names[ i ] + "\" not found"
 333                 Util.fatal_error( PRG_NAME, error_msg )
 334               elsif ( to_delete.length() > 1 )
 335                 error_msg = "sequence name \"" + names[ i ] + "\" is not unique"
 336                 Util.fatal_error( PRG_NAME, error_msg )
 337               else
 338                 msa.remove_sequence!( to_delete[ 0 ] )
 339                 c += 1
 340               end
 341             end
 342             Util.print_message( PRG_NAME, "Removed " + c.to_s + " sequences" )
 343             log <<  "Removed " + c.to_s + " sequences" + ld
 344           elsif ( @keep_seqs )
 345             msa_new = Msa.new()
 346             r = 0
 347             k = 0
 348             for j in 0 ... msa.get_number_of_seqs()
 349               if ( names.include?( msa.get_sequence( j ).get_name() ) )
 350                 msa_new.add_sequence( msa.get_sequence( j ) )
 351                 k += 1
 352               else
 353                 r += 1
 354               end
 355             end
 356             msa = msa_new
 357             Util.print_message( PRG_NAME, "Kept    " + k.to_s + " sequences" )
 358             log << "Kept    " + k.to_s + " sequences" + ld
 359             Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" )
 360             log << "removed " + r.to_s + " sequences" + ld
 361           end
 362         end
 363
 364         if ( @trim )
 365           msa.trim!( @first, @last )
 366         end
 367         if( @rgr >= 0 )
 368           msa.remove_gap_columns_w_gap_ratio!( @rgr )
 369         elsif ( @rgc )
 370           msa.remove_gap_columns!()
 371         elsif( @rgoc )
 372           msa.remove_gap_only_columns!()
 373         end
 374         if( @rsgr >= 0 )
 375           n = msa.get_number_of_seqs()
 376           removed = msa.remove_sequences_by_gap_ratio!( @rsgr )
 377           k = msa.get_number_of_seqs()
 378           r = n - k
 379           Util.print_message( PRG_NAME, "Kept    " + k.to_s + " sequences" )
 380           log << "Kept    " + k.to_s + " sequences" + ld
 381           Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences"  )
 382           log << "Removed " + r.to_s + " sequences:" + ld
 383           removed.each { | seq_name |
 384             log << "         " + seq_name  + ld
 385           }
 386         end
 387         if( @rsl >= 0 )
 388           n = msa.get_number_of_seqs()
 389           removed = msa.remove_sequences_by_non_gap_length!( @rsl )
 390           k = msa.get_number_of_seqs()
 391           r = n - k
 392           Util.print_message( PRG_NAME, "Kept    " + k.to_s + " sequences" )
 393           log << "Kept    " + k.to_s + " sequences" + ld
 394           Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" )
 395           log << "Removed " + r.to_s + " sequences:" + ld
 396           removed.each { | seq_name |
 397             log << "         " + seq_name  + ld
 398           }
 399         end
 400         if ( @keep_matching )
 401           n = msa.get_number_of_seqs
 402           to_be_removed = Set.new
 403           for ii in 0 ...  n
 404             seq = msa.get_sequence( ii )
 405             if !seq.get_name.downcase.index( @keep_matching.downcase )
 406               to_be_removed.add( ii )
 407             end
 408           end
 409           to_be_removed_ary = to_be_removed.to_a.sort.reverse
 410           to_be_removed_ary.each { | index |
 411             msa.remove_sequence!( index )
 412           }
 413           # msa = sort( msa )
 414         end
 415         if ( @remove_matching )
 416           n = msa.get_number_of_seqs
 417           to_be_removed = Set.new
 418           for iii in 0 ... n
 419
 420             seq = msa.get_sequence( iii )
 421
 422             if seq.get_name.downcase.index( @remove_matching.downcase )
 423               to_be_removed.add( iii )
 424             end
 425           end
 426           to_be_removed_ary = to_be_removed.to_a.sort.reverse
 427           to_be_removed_ary.each { | index |
 428             msa.remove_sequence!( index )
 429           }
 430           msa = sort( msa )
 431         end
 432
 433
 434
 435         if ( @split > 0 )
 436           begin
 437             msas = msa.split( @split, true )
 438             io = MsaIO.new()
 439             w = MsaWriter
 440             if ( @pi_output )
 441               w = PhylipSequentialWriter.new()
 442               w.clean( @clean )
 443               w.set_max_name_length( @name_length )
 444             elsif( @fasta_output )
 445               w = FastaWriter.new()
 446               w.set_line_width( @width )
 447               if ( @rg )
 448                 w.remove_gap_chars( true )
 449                 Util.print_warning_message( PRG_NAME, "removing gap character, the output is likely to become unaligned" )
 450                 log << "removing gap character, the output is likely to become unaligned" + ld
 451               end
 452               w.clean( @clean )
 453               if ( @name_length_set )
 454                 w.set_max_name_length( @name_length )
 455               end
 456             elsif( @nexus_output )
 457               w = NexusWriter.new()
 458               w.clean( @clean )
 459               w.set_max_name_length( @name_length )
 460             end
 461             i = 0
 462             for m in msas
 463               i = i + 1
 464               io.write_to_file( m, output + "_" + i.to_s, w )
 465             end
 466             Util.print_message( PRG_NAME, "wrote " + msas.length.to_s + " files"  )
 467             log << "wrote " + msas.length.to_s + " files" + ld
 468           rescue Exception => e
 469             Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 470           end
 471
 472         end
 473       rescue Exception => e
 474         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 475       end
 476
 477       if ( @split <= 0 )
 478
 479         unless ( @rg )
 480           if ( msa.is_aligned() )
 481             Util.print_message( PRG_NAME, "Length of processed alignment        : " + msa.get_length.to_s )
 482             log <<  "Length of processed alignment        : " + msa.get_length.to_s + ld
 483             gp = msa.calculate_gap_proportion
 484             Util.print_message( PRG_NAME, "Gap-proportion of processed alignment: " + gp.to_s )
 485             log << "Gap-proportion of processed alignment: " +  gp.to_s + ld
 486           else
 487             min = 0
 488             max = 0
 489             sum = 0
 490             first = true
 491             for s in 0 ... msa.get_number_of_seqs
 492               seq = msa.get_sequence( s )
 493               l = seq.get_length
 494               sum += l
 495               if l > max
 496                 max = l
 497               end
 498               if first || l < min
 499                 min = l
 500               end
 501               first = false
 502             end
 503             avg = sum / msa.get_number_of_seqs
 504             Util.print_message( PRG_NAME, "Output is not aligned" )
 505             log << "Output is not aligned" + ld
 506             Util.print_message( PRG_NAME, "Shortest sequence                    : " + min.to_s )
 507             log <<  "Shortest sequence                    : " + min.to_s + ld
 508             Util.print_message( PRG_NAME, "Longest sequence                     : " + max.to_s )
 509             log <<  "Longest sequence                     : " + max.to_s + ld
 510             Util.print_message( PRG_NAME, "Average length                       : " + avg.to_s )
 511             log <<  "Average length                       : " + avg.to_s + ld
 512
 513           end
 514         end
 515
 516         if @rem_red
 517           removed = msa.remove_redundant_sequences!( true, false )
 518           if removed.size > 0
 519             identicals = msa.get_identical_seqs_detected
 520             log << "the following " + identicals.size.to_s + " sequences are identical:" + ld
 521             identicals.each { | s |
 522               log << s + ld
 523             }
 524             log << "ignoring the following " + removed.size.to_s + " redundant sequences:" + ld
 525             removed.each { | seq_name |
 526               log << seq_name + ld
 527             }
 528             Util.print_message( PRG_NAME, "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" )
 529             log << "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" + ld
 530           end
 531         end
 532
 533         io = MsaIO.new()
 534
 535         w = MsaWriter
 536
 537         if ( @pi_output )
 538           w = PhylipSequentialWriter.new()
 539           w.clean( @clean )
 540           w.set_max_name_length( @name_length )
 541           w.set_exception_if_name_too_long( @die_if_name_too_long )
 542         elsif( @fasta_output )
 543           w = FastaWriter.new()
 544           w.set_line_width( @width )
 545           if ( @rg )
 546             w.remove_gap_chars( true )
 547             Util.print_warning_message( PRG_NAME, "removing gap characters, the output is likely to become unaligned"  )
 548             log << "removing gap character, the output is likely to become unaligned" + ld
 549           end
 550           w.clean( @clean )
 551           if ( @name_length_set )
 552             w.set_max_name_length( @name_length )
 553             w.set_exception_if_name_too_long( @die_if_name_too_long )
 554           end
 555         elsif( @nexus_output )
 556           w = NexusWriter.new()
 557           w.clean( @clean )
 558           w.set_max_name_length( @name_length )
 559           w.set_exception_if_name_too_long( @die_if_name_too_long )
 560         end
 561
 562         begin
 563           io.write_to_file( msa, output, w )
 564         rescue Exception => e
 565           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
 566         end
 567
 568         Util.print_message( PRG_NAME, "Number of sequences in output        : " + msa.get_number_of_seqs.to_s )
 569         log << "Number of sequences in output        : " + msa.get_number_of_seqs.to_s + ld
 570
 571         begin
 572           f = File.open( output + LOG_SUFFIX, 'a' )
 573           f.print( log )
 574           f.close
 575         rescue Exception => e
 576           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
 577         end
 578
 579
 580       end
 581       Util.print_message( PRG_NAME, "OK" )
 582       puts
 583     end
 584
 585
 586     private
 587
 588     def sort( msa )
 589       names = Set.new
 590       for i in 0 ... msa.get_number_of_seqs
 591         name = msa.get_sequence( i ).get_name
 592         names.add( name )
 593       end
 594       sorted_ary = names.to_a.sort
 595       new_msa = Msa.new
 596       sorted_ary.each { | seq_name |
 597         seq = msa.get_sequence( msa.find_by_name( seq_name, true, false )[ 0 ] )
 598         new_msa.add_sequence( seq )
 599       }
 600       new_msa
 601     end
 602
 603     def set_fasta_input( fi = true )
 604       @fasta_input = fi
 605       @input_format_set = true
 606     end
 607     def set_phylip_input( pi = true )
 608       @phylip_input = pi
 609       @input_format_set = true
 610     end
 611     def set_name_length( i )
 612       @name_length = i
 613       @name_length_set = true
 614     end
 615     def set_width( i )
 616       @width = i
 617     end
 618     def set_fasta_output( fo = true )
 619       @fasta_output = fo
 620       @output_format_set = true
 621     end
 622     def set_pi_output( pso = true )
 623       @pi_output = pso
 624       @output_format_set = true
 625     end
 626     def set_nexus_output( nexus = true )
 627       @nexus_output = nexus
 628       @output_format_set = true
 629     end
 630     def set_clean( c = true )
 631       @clean = c
 632     end
 633     def set_remove_gap_columns( rgc = true )
 634       @rgc = rgc
 635     end
 636     def set_remove_gap_only_columns( rgoc = true )
 637       @rgoc = rgoc
 638     end
 639     def set_remove_gaps( rg = true )
 640       @rg = rg
 641     end
 642     def set_remove_gap_ratio( rgr )
 643       @rgr = rgr
 644     end
 645     def set_remove_seqs_gap_ratio( rsgr )
 646       @rsgr = rsgr
 647     end
 648     def set_remove_seqs_min_non_gap_length( rsl )
 649       @rsl = rsl
 650     end
 651     def set_remove_seqs( file )
 652       @seqs_name_file = file
 653       @remove_seqs    = true
 654       @keep_seqs      = false
 655     end
 656     def set_keep_seqs( file )
 657       @seqs_name_file = file
 658       @keep_seqs      = true
 659       @remove_seqs    = false
 660     end
 661     def set_trim( first, last )
 662       @trim            = true
 663       @first           = first
 664       @last            = last
 665     end
 666     def set_remove_matching( remove )
 667       @remove_matching  = remove
 668     end
 669     def set_keep_matching( keep )
 670       @keep_matching = keep
 671     end
 672     def set_rem_red( rr )
 673       @rem_red = rr
 674     end
 675
 676
 677
 678     def set_split( s )
 679       if ( s > 0 )
 680         @split            = s
 681         @clean            = false  # phylip only
 682         @rgc              = false
 683         @rgoc             = false
 684         @rg               = false  # fasta only
 685         @rgr              = -1
 686         @rsgr             = -1
 687         @rsl              = -1
 688         @seqs_name_file   = nil
 689         @remove_seqs      = false
 690         @keep_seqs        = false
 691         @trim             = false
 692         @first            = -1
 693         @last             = -1
 694       end
 695     end
 696
 697     def analyze_command_line( cla )
 698       if ( cla.is_option_set?( INPUT_TYPE_OPTION ) )
 699         begin
 700           type = cla.get_option_value( INPUT_TYPE_OPTION )
 701           if ( type == "p" )
 702             set_phylip_input( true )
 703             set_fasta_input( false )
 704           elsif ( type == "f" )
 705             set_fasta_input( true )
 706             set_phylip_input( false )
 707           end
 708         rescue ArgumentError => e
 709           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 710         end
 711       end
 712       if ( cla.is_option_set?( OUTPUT_TYPE_OPTION ) )
 713         begin
 714           type = cla.get_option_value( OUTPUT_TYPE_OPTION )
 715           if ( type == "p" )
 716             set_pi_output( true )
 717             set_fasta_output( false )
 718             set_nexus_output( false )
 719           elsif ( type == "f" )
 720             set_pi_output( false )
 721             set_fasta_output( true )
 722             set_nexus_output( false )
 723           elsif ( type == "n" )
 724             set_pi_output( false )
 725             set_fasta_output( false )
 726             set_nexus_output( true )
 727           end
 728         rescue ArgumentError => e
 729           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 730         end
 731       end
 732       if ( cla.is_option_set?( MAXIMAL_NAME_LENGTH_OPTION ) )
 733         begin
 734           l = cla.get_option_value_as_int( MAXIMAL_NAME_LENGTH_OPTION )
 735           set_name_length( l )
 736         rescue ArgumentError => e
 737           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 738         end
 739       end
 740       if ( cla.is_option_set?( WIDTH_OPTION ) )
 741         begin
 742           w = cla.get_option_value_as_int( WIDTH_OPTION )
 743           set_width( w )
 744         rescue ArgumentError => e
 745           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 746         end
 747       end
 748       if ( cla.is_option_set?( CLEAN_UP_SEQ_OPTION ) )
 749         set_clean( true )
 750       end
 751       if ( cla.is_option_set?( REMOVE_GAP_COLUMNS_OPTION ) )
 752         set_remove_gap_columns( true )
 753       end
 754       if ( cla.is_option_set?( REM_RED_OPTION ) )
 755         set_rem_red( true )
 756       end
 757       if ( cla.is_option_set?( REMOVE_GAP_ONLY_COLUMNS ) )
 758         set_remove_gap_only_columns( true )
 759       end
 760       if ( cla.is_option_set?( REMOVE_ALL_GAP_CHARACTERS_OPTION ) )
 761         set_remove_gaps( true )
 762       end
 763       if ( cla.is_option_set?( REMOVE_COLUMNS_GAP_RATIO_OPTION ) )
 764         begin
 765           f = cla.get_option_value_as_float( REMOVE_COLUMNS_GAP_RATIO_OPTION )
 766           set_remove_gap_ratio( f )
 767         rescue ArgumentError => e
 768           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 769         end
 770       end
 771       if ( cla.is_option_set?( REMOVE_ALL_SEQUENCES_LISTED_OPTION ) )
 772         begin
 773           s = cla.get_option_value( REMOVE_ALL_SEQUENCES_LISTED_OPTION )
 774           set_remove_seqs( s )
 775         rescue ArgumentError => e
 776           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 777         end
 778       end
 779       if ( cla.is_option_set?( KEEP_ONLY_SEQUENCES_LISTED_OPTION ) )
 780         begin
 781           s = cla.get_option_value( KEEP_ONLY_SEQUENCES_LISTED_OPTION )
 782           set_keep_seqs( s )
 783         rescue ArgumentError => e
 784           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 785         end
 786       end
 787       if ( cla.is_option_set?( TRIM_OPTION ) )
 788         begin
 789           s = cla.get_option_value( TRIM_OPTION )
 790           if ( s =~ /(\d+)-(\d+)/ )
 791             set_trim( $1.to_i(), $2.to_i() )
 792           else
 793             puts( "illegal argument" )
 794             print_help
 795             exit( -1 )
 796           end
 797         rescue ArgumentError => e
 798           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 799         end
 800       end
 801       if ( cla.is_option_set?( REMOVE_SEQS_GAP_RATIO_OPTION ) )
 802         begin
 803           f = cla.get_option_value_as_float( REMOVE_SEQS_GAP_RATIO_OPTION )
 804           set_remove_seqs_gap_ratio( f )
 805         rescue ArgumentError => e
 806           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 807         end
 808       end
 809       if ( cla.is_option_set?( REMOVE_SEQS_NON_GAP_LENGTH_OPTION ) )
 810         begin
 811           f = cla.get_option_value_as_int( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
 812           set_remove_seqs_min_non_gap_length( f )
 813         rescue ArgumentError => e
 814           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 815         end
 816       end
 817       if ( cla.is_option_set?( SPLIT ) )
 818         begin
 819           s = cla.get_option_value_as_int( SPLIT )
 820           set_split( s )
 821         rescue ArgumentError => e
 822           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 823         end
 824
 825       end
 826       if ( cla.is_option_set?( REMOVE_MATCHING_SEQUENCES_OPTION ) )
 827         begin
 828           s = cla.get_option_value( REMOVE_MATCHING_SEQUENCES_OPTION )
 829           set_remove_matching( s )
 830         rescue ArgumentError => e
 831           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 832         end
 833       end
 834       if ( cla.is_option_set?( KEEP_MATCHING_SEQUENCES_OPTION ) )
 835         begin
 836           s = cla.get_option_value( KEEP_MATCHING_SEQUENCES_OPTION )
 837           set_keep_matching( s )
 838         rescue ArgumentError => e
 839           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 840         end
 841       end
 842       if ( cla.is_option_set?( DIE_IF_NAME_TOO_LONG ) )
 843         @die_if_name_too_long = true
 844       end
 845
 846
 847     end
 848
 849     def print_help()
 850       puts()
 851       puts( "Usage:" )
 852       puts()
 853       puts( "  " + PRG_NAME + ".rb [options] <input alignment> <output>" )
 854       puts()
 855       puts( "  options: -" + INPUT_TYPE_OPTION + "=<input type>: f for fasta, p for phylip selex type" )
 856       puts( "           -" + OUTPUT_TYPE_OPTION + "=<output type>: f for fasta, n for nexus, p for phylip sequential (default)" )
 857       puts( "           -" + MAXIMAL_NAME_LENGTH_OPTION + "=<n>: n=maximal name length (default for phylip 10, for fasta: unlimited )" )
 858       puts( "           -" + DIE_IF_NAME_TOO_LONG + ": die if sequence name too long" )
 859       puts( "           -" + WIDTH_OPTION + "=<n>: n=width (fasta output only, default is 60)" )
 860       puts( "           -" + CLEAN_UP_SEQ_OPTION + ": clean up sequences" )
 861       puts( "           -" + REMOVE_GAP_COLUMNS_OPTION + ": remove gap columns" )
 862       puts( "           -" + REMOVE_GAP_ONLY_COLUMNS + ": remove gap-only columns" )
 863       puts( "           -" + REMOVE_COLUMNS_GAP_RATIO_OPTION + "=<n>: remove columns for which ( seqs with gap / number of sequences > n )" )
 864       puts( "           -" + REMOVE_ALL_GAP_CHARACTERS_OPTION + ": remove all gap characters (destroys alignment, fasta output only)" )
 865       puts( "           -" + REMOVE_ALL_SEQUENCES_LISTED_OPTION + "=<file>: remove all sequences listed in file" )
 866       puts( "           -" + KEEP_ONLY_SEQUENCES_LISTED_OPTION + "=<file>: keep only sequences listed in file" )
 867       puts( "           -" + TRIM_OPTION + "=<first>-<last>: remove columns before first and after last" )
 868       puts( "           -" + REMOVE_SEQS_GAP_RATIO_OPTION + "=<n>: remove sequences for which the gap ratio > n (after column operations)" )
 869       puts( "           -" + REMOVE_SEQS_NON_GAP_LENGTH_OPTION + "=<n> remove sequences with less than n non-gap characters (after column operations)" )
 870       puts( "           -" + REMOVE_MATCHING_SEQUENCES_OPTION + "=<s> remove all sequences with names containing s" )
 871       puts( "           -" + KEEP_MATCHING_SEQUENCES_OPTION + "=<s> keep only sequences with names containing s" )
 872       puts( "           -" + SPLIT + "=<n> split a fasta file into n files of equal number of sequences (expect for " )
 873       puts( "            last one), cannot be used with other options" )
 874       puts( "           -" + REM_RED_OPTION + ": remove redundant sequences" )
 875       puts()
 876     end
 877
 878
 879
 880
 881
 882   end # class MsaProcessor
 883
 884
 885 end # module Evoruby