forester/ruby/evoruby/lib/evo/tool/msa_processor.rb

   1 #
   2 # = lib/evo/tool/msa_processor.rb - MsaProcessor class
   3 #
   4 # Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
   5 # License::    GNU Lesser General Public License (LGPL)
   6 #
   7 # $Id: msa_processor.rb,v 1.33 2010/12/13 19:00:10 cmzmasek Exp $
   8 #
   9
  10 require 'date'
  11 require 'set'
  12
  13 require 'lib/evo/util/constants'
  14 require 'lib/evo/util/util'
  15 require 'lib/evo/util/command_line_arguments'
  16 require 'lib/evo/msa/msa_factory'
  17 require 'lib/evo/io/msa_io'
  18 require 'lib/evo/io/writer/phylip_sequential_writer'
  19 require 'lib/evo/io/writer/nexus_writer'
  20 require 'lib/evo/io/writer/fasta_writer'
  21 require 'lib/evo/io/parser/fasta_parser'
  22 require 'lib/evo/io/parser/general_msa_parser'
  23 require 'lib/evo/io/writer/msa_writer'
  24
  25 module Evoruby
  26
  27   class MsaProcessor
  28
  29     PRG_NAME       = "msa_pro"
  30     PRG_DATE       = "130411"
  31     PRG_DESC       = "processing of multiple sequence alignments"
  32     PRG_VERSION    = "1.07"
  33     COPYRIGHT      = "2008-2010 Christian M Zmasek"
  34     CONTACT        = "phylosoft@gmail.com"
  35     WWW            = "www.phylosoft.org"
  36
  37
  38     NAME_LENGTH_DEFAULT                = 10
  39     WIDTH_DEFAULT_FASTA                = 60
  40     INPUT_TYPE_OPTION                  = "i"
  41     OUTPUT_TYPE_OPTION                 = "o"
  42     MAXIMAL_NAME_LENGTH_OPTION         = "n"
  43     DIE_IF_NAME_TOO_LONG               = "d"
  44     WIDTH_OPTION                       = "w"
  45     CLEAN_UP_SEQ_OPTION                = "c"
  46     REM_RED_OPTION                     = "rem_red"
  47     REMOVE_GAP_COLUMNS_OPTION          = "rgc"
  48     REMOVE_GAP_ONLY_COLUMNS            = "rgoc"
  49     REMOVE_COLUMNS_GAP_RATIO_OPTION    = "rr"
  50     REMOVE_ALL_GAP_CHARACTERS_OPTION   = "rg"
  51     REMOVE_ALL_SEQUENCES_LISTED_OPTION = "r"
  52     KEEP_ONLY_SEQUENCES_LISTED_OPTION  = "k"
  53
  54     KEEP_MATCHING_SEQUENCES_OPTION     = "mk"
  55     REMOVE_MATCHING_SEQUENCES_OPTION   = "mr"
  56
  57     TRIM_OPTION                        = "t"
  58     REMOVE_SEQS_GAP_RATIO_OPTION       = "rsgr"
  59     REMOVE_SEQS_NON_GAP_LENGTH_OPTION  = "rsl"
  60     SPLIT                              = "split"
  61     LOG_SUFFIX                         = "_msa_pro.log"
  62     HELP_OPTION_1                      = "help"
  63     HELP_OPTION_2                      = "h"
  64
  65
  66     def initialize()
  67       @input_format_set = false
  68       @output_format_set = false
  69       @fasta_input      = false
  70       @phylip_input     = true
  71       @name_length      = NAME_LENGTH_DEFAULT
  72       @name_length_set  = false
  73       @width            = WIDTH_DEFAULT_FASTA     # fasta only
  74       @pi_output        = true
  75       @fasta_output     = false
  76       @nexus_output     = false
  77       @clean            = false  # phylip only
  78       @rgc              = false
  79       @rgoc             = false
  80       @rg               = false  # fasta only
  81       @rem_red          = false
  82       @die_if_name_too_long  = false
  83       @rgr              = -1
  84       @rsgr             = -1
  85       @rsl              = -1
  86       @remove_matching  = nil
  87       @keep_matching    = nil
  88
  89       @seqs_name_file   = nil
  90       @remove_seqs      = false
  91       @keep_seqs        = false
  92       @trim             = false
  93       @split            = -1
  94       @first            = -1
  95       @last             = -1
  96     end
  97
  98
  99     def run()
 100
 101       Util.print_program_information( PRG_NAME,
 102         PRG_VERSION,
 103         PRG_DESC,
 104         PRG_DATE,
 105         COPYRIGHT,
 106         CONTACT,
 107         WWW,
 108         STDOUT )
 109
 110       if ( ARGV == nil || ARGV.length < 1 )
 111         Util.print_message( PRG_NAME, "Illegal number of arguments" )
 112         print_help
 113         exit( -1 )
 114       end
 115
 116       begin
 117         cla = CommandLineArguments.new( ARGV )
 118       rescue ArgumentError => e
 119         Util.fatal_error( PRG_NAME, "Error: " + e.to_s, STDOUT )
 120       end
 121
 122       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
 123            cla.is_option_set?( HELP_OPTION_2 ) )
 124         print_help
 125         exit( 0 )
 126       end
 127
 128       if ( cla.get_number_of_files != 2 || ARGV.length < 2 )
 129         Util.print_message( PRG_NAME, "Illegal number of arguments" )
 130         print_help
 131         exit( -1 )
 132       end
 133
 134       allowed_opts = Array.new
 135       allowed_opts.push( INPUT_TYPE_OPTION )
 136       allowed_opts.push( OUTPUT_TYPE_OPTION )
 137       allowed_opts.push( MAXIMAL_NAME_LENGTH_OPTION )
 138       allowed_opts.push( WIDTH_OPTION )
 139       allowed_opts.push( CLEAN_UP_SEQ_OPTION )
 140       allowed_opts.push( REMOVE_GAP_COLUMNS_OPTION )
 141       allowed_opts.push( REMOVE_GAP_ONLY_COLUMNS )
 142       allowed_opts.push( REMOVE_COLUMNS_GAP_RATIO_OPTION )
 143       allowed_opts.push( REMOVE_ALL_GAP_CHARACTERS_OPTION )
 144       allowed_opts.push( REMOVE_ALL_SEQUENCES_LISTED_OPTION )
 145       allowed_opts.push( KEEP_ONLY_SEQUENCES_LISTED_OPTION )
 146       allowed_opts.push( TRIM_OPTION )
 147       allowed_opts.push( REMOVE_SEQS_GAP_RATIO_OPTION )
 148       allowed_opts.push( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
 149       allowed_opts.push( SPLIT )
 150       allowed_opts.push( REM_RED_OPTION )
 151       allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION )
 152       allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION )
 153       allowed_opts.push( DIE_IF_NAME_TOO_LONG )
 154
 155       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
 156       if ( disallowed.length > 0 )
 157         Util.fatal_error( PRG_NAME,
 158           "unknown option(s): " + disallowed )
 159       end
 160
 161       input = cla.get_file_name( 0 )
 162       output = cla.get_file_name( 1 )
 163
 164       analyze_command_line( cla )
 165
 166       begin
 167         Util.check_file_for_readability( input )
 168       rescue IOError => e
 169         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
 170       end
 171
 172       begin
 173         Util.check_file_for_writability( output )
 174       rescue IOError => e
 175         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
 176       end
 177
 178       if ( @rg )
 179         set_pi_output( false )
 180         set_fasta_output( true )
 181         set_nexus_output( false )
 182       end
 183
 184       if ( !@input_format_set )
 185         fasta_like = false
 186         begin
 187           fasta_like = Util.looks_like_fasta?( input )
 188         rescue ArgumentError => e
 189           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
 190         end
 191         @fasta_input = fasta_like
 192         @phylip_input = !fasta_like
 193         if ( !@output_format_set )
 194           @fasta_output = fasta_like
 195           @pi_output = !fasta_like
 196           @nexus_output = false
 197         end
 198       end
 199
 200       ld = Constants::LINE_DELIMITER
 201       log = PRG_NAME + " " + PRG_VERSION + " [" + PRG_DATE + "]" + " LOG" + ld
 202       now = DateTime.now
 203       log << "Date/time: " + now.to_s + ld
 204
 205       puts()
 206       puts( "Input alignment  : " + input )
 207       log << "Input alignment  : " + input + ld
 208       puts( "Output alignment : " + output )
 209       log << "Output alignment : " + output + ld
 210       if ( @phylip_input )
 211         puts( "Input is         : Phylip, or something like it" )
 212         log << "Input is         : Phylip, or something like it" + ld
 213       elsif ( @fasta_input )
 214         puts( "Input is         : Fasta" )
 215         log << "Input is         : Fasta" + ld
 216       end
 217       if( @rgr >= 0 )
 218         puts( "Max col gap ratio: " + @rgr.to_s )
 219         log << "Max col gap ratio: " + @rgr.to_s + ld
 220       elsif ( @rgc )
 221         puts( "Remove gap colums" )
 222         log << "Remove gap colums" + ld
 223       elsif( @rgoc )
 224         puts( "Remove gap only colums" )
 225         log << "Remove gap only colums" + ld
 226       end
 227       if ( @clean )
 228         puts( "Clean up         : true" )
 229         log << "Clean up         : true" + ld
 230       end
 231
 232       if ( @pi_output )
 233         puts( "Output is        : Phylip interleaved" )
 234         log << "Output is        : Phylip interleaved" + ld
 235       elsif ( @fasta_output )
 236         puts( "Output is        : Fasta" )
 237         log << "Output is        : Fasta" + ld
 238         if ( @width )
 239           puts( "Width            : " + @width.to_s )
 240           log << "Width            : " + @width.to_s + ld
 241         end
 242         if ( @rg )
 243           puts( "Remove all gap characters (alignment is destroyed)" )
 244           log << "Remove all gap characters (alignment is destroyed)" + ld
 245         end
 246       elsif ( @nexus_output )
 247         puts( "Output is        : Nexus" )
 248         log << "Output is        : Nexus" + ld
 249       end
 250       if ( @name_length_set || !@fasta_output )
 251         puts( "Max name length  : " + @name_length.to_s )
 252         log << "Max name length  : " + @name_length.to_s + ld
 253       end
 254       if( @rsgr >= 0 )
 255         puts( "Remove sequences for which the gap ratio > " + @rsgr.to_s )
 256         log << "Remove sequences for which the gap ratio > " + @rsgr.to_s + ld
 257       end
 258       if( @rsl >= 0 )
 259         puts( "Remove sequences with less than "  + @rsl.to_s + " non-gap characters" )
 260         log << "Remove sequences with less than "  + @rsl.to_s + " non-gap characters" + ld
 261       end
 262       if ( @remove_seqs )
 263         puts( "Remove sequences listed in: " + @seqs_name_file )
 264         log << "Remove sequences listed in: " + @seqs_name_file + ld
 265       elsif ( @keep_seqs )
 266         puts( "Keep only sequences listed in: " + @seqs_name_file )
 267         log << "Keep only sequences listed in: " + @seqs_name_file + ld
 268       end
 269       if ( @trim )
 270         puts( "Keep only columns from: "+ @first.to_s + " to " + @last.to_s )
 271         log << "Keep only columns from: "+ @first.to_s + " to " + @last.to_s + ld
 272       end
 273       if ( @rem_red )
 274         puts( "Remove redundant sequences: true" )
 275         log << "Remove redundant sequences: true" + ld
 276       end
 277       if ( @split > 0 )
 278         puts( "Split            : " + @split.to_s )
 279         log << "Split            : " + @split.to_s + ld
 280       end
 281       puts()
 282
 283       f = MsaFactory.new()
 284
 285       msa = nil
 286
 287       begin
 288         if ( @phylip_input )
 289           msa = f.create_msa_from_file( input, GeneralMsaParser.new() )
 290         elsif ( @fasta_input )
 291           msa = f.create_msa_from_file( input, FastaParser.new() )
 292         end
 293       rescue Exception => e
 294         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 295       end
 296
 297       if ( msa.is_aligned() )
 298         Util.print_message( PRG_NAME, "Length of original alignment         : " + msa.get_length.to_s )
 299         log << "Length of original alignment         : " + msa.get_length.to_s + ld
 300         gp = msa.calculate_gap_proportion
 301         Util.print_message( PRG_NAME, "Gap-proportion of original alignment : " + gp.to_s )
 302         log << "Gap-proportion of original alignment : " +  gp.to_s + ld
 303       else
 304         Util.print_message( PRG_NAME, "the input is not aligned" )
 305         log << "The input is not aligned" + ld
 306       end
 307
 308       all_names = Set.new()
 309       for i in 0 ... msa.get_number_of_seqs()
 310         current_name = msa.get_sequence( i ).get_name
 311         if all_names.include?( current_name )
 312           Util.print_warning_message( PRG_NAME, "sequence name [" + current_name + "] is not unique" )
 313         else
 314           all_names.add( current_name )
 315         end
 316       end
 317
 318       begin
 319
 320         if ( @remove_seqs || @keep_seqs )
 321           names = Util.file2array( @seqs_name_file, true )
 322           if ( names == nil ||  names.length() < 1 )
 323             error_msg = "file \"" + @seqs_name_file.to_s + "\" appears empty"
 324             Util.fatal_error( PRG_NAME, error_msg )
 325           end
 326
 327           if ( @remove_seqs )
 328             c = 0
 329             for i in 0 ... names.length()
 330               to_delete = msa.find_by_name( names[ i ], true, false )
 331               if ( to_delete.length() < 1 )
 332                 error_msg = "sequence name \"" + names[ i ] + "\" not found"
 333                 Util.fatal_error( PRG_NAME, error_msg )
 334               elsif ( to_delete.length() > 1 )
 335                 error_msg = "sequence name \"" + names[ i ] + "\" is not unique"
 336                 Util.fatal_error( PRG_NAME, error_msg )
 337               else
 338                 msa.remove_sequence!( to_delete[ 0 ] )
 339                 c += 1
 340               end
 341             end
 342             Util.print_message( PRG_NAME, "Removed " + c.to_s + " sequences" )
 343             log <<  "Removed " + c.to_s + " sequences" + ld
 344           elsif ( @keep_seqs )
 345             msa_new = Msa.new()
 346             r = 0
 347             k = 0
 348             for j in 0 ... msa.get_number_of_seqs()
 349               if ( names.include?( msa.get_sequence( j ).get_name() ) )
 350                 msa_new.add_sequence( msa.get_sequence( j ) )
 351                 k += 1
 352               else
 353                 r += 1
 354               end
 355             end
 356             msa = msa_new
 357             Util.print_message( PRG_NAME, "Kept    " + k.to_s + " sequences" )
 358             log << "Kept    " + k.to_s + " sequences" + ld
 359             Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" )
 360             log << "removed " + r.to_s + " sequences" + ld
 361           end
 362         end
 363
 364         if ( @trim )
 365           msa.trim!( @first, @last )
 366         end
 367         if( @rgr >= 0 )
 368           msa.remove_gap_columns_w_gap_ratio!( @rgr )
 369         elsif ( @rgc )
 370           msa.remove_gap_columns!()
 371         elsif( @rgoc )
 372           msa.remove_gap_only_columns!()
 373         end
 374         if( @rsgr >= 0 )
 375           n = msa.get_number_of_seqs()
 376           removed = msa.remove_sequences_by_gap_ratio!( @rsgr )
 377           k = msa.get_number_of_seqs()
 378           r = n - k
 379           Util.print_message( PRG_NAME, "Kept    " + k.to_s + " sequences" )
 380           log << "Kept    " + k.to_s + " sequences" + ld
 381           Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences"  )
 382           log << "Removed " + r.to_s + " sequences:" + ld
 383           removed.each { | seq_name |
 384             log << "         " + seq_name  + ld
 385           }
 386         end
 387         if( @rsl >= 0 )
 388           n = msa.get_number_of_seqs()
 389           removed = msa.remove_sequences_by_non_gap_length!( @rsl )
 390           k = msa.get_number_of_seqs()
 391           r = n - k
 392           Util.print_message( PRG_NAME, "Kept    " + k.to_s + " sequences" )
 393           log << "Kept    " + k.to_s + " sequences" + ld
 394           Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" )
 395           log << "Removed " + r.to_s + " sequences:" + ld
 396           removed.each { | seq_name |
 397             log << "         " + seq_name  + ld
 398           }
 399         end
 400         if ( @keep_matching )
 401           n = msa.get_number_of_seqs
 402           to_be_removed = Set.new
 403           for ii in 0 ...  n
 404             seq = msa.get_sequence( ii )
 405             if !seq.get_name.downcase.index( @keep_matching.downcase )
 406               to_be_removed.add( ii )
 407             end
 408           end
 409           to_be_removed_ary = to_be_removed.to_a.sort.reverse
 410           to_be_removed_ary.each { | index |
 411             msa.remove_sequence!( index )
 412           }
 413           # msa = sort( msa )
 414         end
 415         if ( @remove_matching )
 416           n = msa.get_number_of_seqs
 417           to_be_removed = Set.new
 418           for iii in 0 ... n
 419
 420             seq = msa.get_sequence( iii )
 421
 422             if seq.get_name.downcase.index( @remove_matching.downcase )
 423               to_be_removed.add( iii )
 424             end
 425           end
 426           to_be_removed_ary = to_be_removed.to_a.sort.reverse
 427           to_be_removed_ary.each { | index |
 428             msa.remove_sequence!( index )
 429           }
 430           msa = sort( msa )
 431         end
 432
 433
 434
 435         if ( @split > 0 )
 436           begin
 437             msas = msa.split( @split, true )
 438             io = MsaIO.new()
 439             w = MsaWriter
 440             if ( @pi_output )
 441               w = PhylipSequentialWriter.new()
 442               w.clean( @clean )
 443               w.set_max_name_length( @name_length )
 444             elsif( @fasta_output )
 445               w = FastaWriter.new()
 446               w.set_line_width( @width )
 447               if ( @rg )
 448                 w.remove_gap_chars( true )
 449                 Util.print_warning_message( PRG_NAME, "removing gap character, the output is likely to become unaligned" )
 450                 log << "removing gap character, the output is likely to become unaligned" + ld
 451               end
 452               w.clean( @clean )
 453               if ( @name_length_set )
 454                 w.set_max_name_length( @name_length )
 455               end
 456             elsif( @nexus_output )
 457               w = NexusWriter.new()
 458               w.clean( @clean )
 459               w.set_max_name_length( @name_length )
 460             end
 461             i = 0
 462             for m in msas
 463               i = i + 1
 464               io.write_to_file( m, output + "_" + i.to_s, w )
 465             end
 466             Util.print_message( PRG_NAME, "wrote " + msas.length.to_s + " files"  )
 467             log << "wrote " + msas.length.to_s + " files" + ld
 468           rescue Exception => e
 469             Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 470           end
 471
 472         end
 473       rescue Exception => e
 474         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 475       end
 476
 477       if ( @split <= 0 )
 478
 479         unless ( @rg )
 480           if ( msa.is_aligned() )
 481             Util.print_message( PRG_NAME, "Length of processed alignment        : " + msa.get_length.to_s )
 482             log <<  "Length of processed alignment        : " + msa.get_length.to_s + ld
 483             gp = msa.calculate_gap_proportion
 484             Util.print_message( PRG_NAME, "Gap-proportion of processed alignment: " + gp.to_s )
 485             log << "Gap-proportion of processed alignment: " +  gp.to_s + ld
 486           else
 487             Util.print_warning_message( PRG_NAME, "output is not aligned" )
 488             log << "output is not aligned" + ld
 489           end
 490         end
 491
 492         if @rem_red
 493           removed = msa.remove_redundant_sequences!( true, false )
 494           if removed.size > 0
 495             identicals = msa.get_identical_seqs_detected
 496             log << "the following " + identicals.size.to_s + " sequences are identical:" + ld
 497             identicals.each { | s |
 498               log << s + ld
 499             }
 500             log << "ignoring the following " + removed.size.to_s + " redundant sequences:" + ld
 501             removed.each { | seq_name |
 502               log << seq_name + ld
 503             }
 504             Util.print_message( PRG_NAME, "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" )
 505             log << "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" + ld
 506           end
 507         end
 508
 509         io = MsaIO.new()
 510
 511         w = MsaWriter
 512
 513         if ( @pi_output )
 514           w = PhylipSequentialWriter.new()
 515           w.clean( @clean )
 516           w.set_max_name_length( @name_length )
 517           w.set_exception_if_name_too_long( @die_if_name_too_long )
 518         elsif( @fasta_output )
 519           w = FastaWriter.new()
 520           w.set_line_width( @width )
 521           if ( @rg )
 522             w.remove_gap_chars( true )
 523             Util.print_warning_message( PRG_NAME, "removing gap characters, the output is likely to become unaligned"  )
 524             log << "removing gap character, the output is likely to become unaligned" + ld
 525           end
 526           w.clean( @clean )
 527           if ( @name_length_set )
 528             w.set_max_name_length( @name_length )
 529             w.set_exception_if_name_too_long( @die_if_name_too_long )
 530           end
 531         elsif( @nexus_output )
 532           w = NexusWriter.new()
 533           w.clean( @clean )
 534           w.set_max_name_length( @name_length )
 535           w.set_exception_if_name_too_long( @die_if_name_too_long )
 536         end
 537
 538
 539         begin
 540           io.write_to_file( msa, output, w )
 541         rescue Exception => e
 542           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
 543         end
 544
 545         begin
 546           f = File.open( output + LOG_SUFFIX, 'a' )
 547           f.print( log )
 548           f.close
 549         rescue Exception => e
 550           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
 551         end
 552
 553
 554       end
 555       Util.print_message( PRG_NAME, "OK" )
 556       puts
 557     end
 558
 559
 560     private
 561
 562     def sort( msa )
 563       names = Set.new
 564       for i in 0 ... msa.get_number_of_seqs
 565         name = msa.get_sequence( i ).get_name
 566         names.add( name )
 567       end
 568       sorted_ary = names.to_a.sort
 569       new_msa = Msa.new
 570       sorted_ary.each { | seq_name |
 571         seq = msa.get_sequence( msa.find_by_name( seq_name, true, false )[ 0 ] )
 572         new_msa.add_sequence( seq )
 573       }
 574       new_msa
 575     end
 576
 577     def set_fasta_input( fi = true )
 578       @fasta_input = fi
 579       @input_format_set = true
 580     end
 581     def set_phylip_input( pi = true )
 582       @phylip_input = pi
 583       @input_format_set = true
 584     end
 585     def set_name_length( i )
 586       @name_length = i
 587       @name_length_set = true
 588     end
 589     def set_width( i )
 590       @width = i
 591     end
 592     def set_fasta_output( fo = true )
 593       @fasta_output = fo
 594       @output_format_set = true
 595     end
 596     def set_pi_output( pso = true )
 597       @pi_output = pso
 598       @output_format_set = true
 599     end
 600     def set_nexus_output( nexus = true )
 601       @nexus_output = nexus
 602       @output_format_set = true
 603     end
 604     def set_clean( c = true )
 605       @clean = c
 606     end
 607     def set_remove_gap_columns( rgc = true )
 608       @rgc = rgc
 609     end
 610     def set_remove_gap_only_columns( rgoc = true )
 611       @rgoc = rgoc
 612     end
 613     def set_remove_gaps( rg = true )
 614       @rg = rg
 615     end
 616     def set_remove_gap_ratio( rgr )
 617       @rgr = rgr
 618     end
 619     def set_remove_seqs_gap_ratio( rsgr )
 620       @rsgr = rsgr
 621     end
 622     def set_remove_seqs_min_non_gap_length( rsl )
 623       @rsl = rsl
 624     end
 625     def set_remove_seqs( file )
 626       @seqs_name_file = file
 627       @remove_seqs    = true
 628       @keep_seqs      = false
 629     end
 630     def set_keep_seqs( file )
 631       @seqs_name_file = file
 632       @keep_seqs      = true
 633       @remove_seqs    = false
 634     end
 635     def set_trim( first, last )
 636       @trim            = true
 637       @first           = first
 638       @last            = last
 639     end
 640     def set_remove_matching( remove )
 641       @remove_matching  = remove
 642     end
 643     def set_keep_matching( keep )
 644       @keep_matching = keep
 645     end
 646     def set_rem_red( rr )
 647       @rem_red = rr
 648     end
 649
 650
 651
 652     def set_split( s )
 653       if ( s > 0 )
 654         @split            = s
 655         @clean            = false  # phylip only
 656         @rgc              = false
 657         @rgoc             = false
 658         @rg               = false  # fasta only
 659         @rgr              = -1
 660         @rsgr             = -1
 661         @rsl              = -1
 662         @seqs_name_file   = nil
 663         @remove_seqs      = false
 664         @keep_seqs        = false
 665         @trim             = false
 666         @first            = -1
 667         @last             = -1
 668       end
 669     end
 670
 671     def analyze_command_line( cla )
 672       if ( cla.is_option_set?( INPUT_TYPE_OPTION ) )
 673         begin
 674           type = cla.get_option_value( INPUT_TYPE_OPTION )
 675           if ( type == "p" )
 676             set_phylip_input( true )
 677             set_fasta_input( false )
 678           elsif ( type == "f" )
 679             set_fasta_input( true )
 680             set_phylip_input( false )
 681           end
 682         rescue ArgumentError => e
 683           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 684         end
 685       end
 686       if ( cla.is_option_set?( OUTPUT_TYPE_OPTION ) )
 687         begin
 688           type = cla.get_option_value( OUTPUT_TYPE_OPTION )
 689           if ( type == "p" )
 690             set_pi_output( true )
 691             set_fasta_output( false )
 692             set_nexus_output( false )
 693           elsif ( type == "f" )
 694             set_pi_output( false )
 695             set_fasta_output( true )
 696             set_nexus_output( false )
 697           elsif ( type == "n" )
 698             set_pi_output( false )
 699             set_fasta_output( false )
 700             set_nexus_output( true )
 701           end
 702         rescue ArgumentError => e
 703           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 704         end
 705       end
 706       if ( cla.is_option_set?( MAXIMAL_NAME_LENGTH_OPTION ) )
 707         begin
 708           l = cla.get_option_value_as_int( MAXIMAL_NAME_LENGTH_OPTION )
 709           set_name_length( l )
 710         rescue ArgumentError => e
 711           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 712         end
 713       end
 714       if ( cla.is_option_set?( WIDTH_OPTION ) )
 715         begin
 716           w = cla.get_option_value_as_int( WIDTH_OPTION )
 717           set_width( w )
 718         rescue ArgumentError => e
 719           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 720         end
 721       end
 722       if ( cla.is_option_set?( CLEAN_UP_SEQ_OPTION ) )
 723         set_clean( true )
 724       end
 725       if ( cla.is_option_set?( REMOVE_GAP_COLUMNS_OPTION ) )
 726         set_remove_gap_columns( true )
 727       end
 728       if ( cla.is_option_set?( REM_RED_OPTION ) )
 729         set_rem_red( true )
 730       end
 731       if ( cla.is_option_set?( REMOVE_GAP_ONLY_COLUMNS ) )
 732         set_remove_gap_only_columns( true )
 733       end
 734       if ( cla.is_option_set?( REMOVE_ALL_GAP_CHARACTERS_OPTION ) )
 735         set_remove_gaps( true )
 736       end
 737       if ( cla.is_option_set?( REMOVE_COLUMNS_GAP_RATIO_OPTION ) )
 738         begin
 739           f = cla.get_option_value_as_float( REMOVE_COLUMNS_GAP_RATIO_OPTION )
 740           set_remove_gap_ratio( f )
 741         rescue ArgumentError => e
 742           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 743         end
 744       end
 745       if ( cla.is_option_set?( REMOVE_ALL_SEQUENCES_LISTED_OPTION ) )
 746         begin
 747           s = cla.get_option_value( REMOVE_ALL_SEQUENCES_LISTED_OPTION )
 748           set_remove_seqs( s )
 749         rescue ArgumentError => e
 750           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 751         end
 752       end
 753       if ( cla.is_option_set?( KEEP_ONLY_SEQUENCES_LISTED_OPTION ) )
 754         begin
 755           s = cla.get_option_value( KEEP_ONLY_SEQUENCES_LISTED_OPTION )
 756           set_keep_seqs( s )
 757         rescue ArgumentError => e
 758           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 759         end
 760       end
 761       if ( cla.is_option_set?( TRIM_OPTION ) )
 762         begin
 763           s = cla.get_option_value( TRIM_OPTION )
 764           if ( s =~ /(\d+)-(\d+)/ )
 765             set_trim( $1.to_i(), $2.to_i() )
 766           else
 767             puts( "illegal argument" )
 768             print_help
 769             exit( -1 )
 770           end
 771         rescue ArgumentError => e
 772           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 773         end
 774       end
 775       if ( cla.is_option_set?( REMOVE_SEQS_GAP_RATIO_OPTION ) )
 776         begin
 777           f = cla.get_option_value_as_float( REMOVE_SEQS_GAP_RATIO_OPTION )
 778           set_remove_seqs_gap_ratio( f )
 779         rescue ArgumentError => e
 780           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 781         end
 782       end
 783       if ( cla.is_option_set?( REMOVE_SEQS_NON_GAP_LENGTH_OPTION ) )
 784         begin
 785           f = cla.get_option_value_as_int( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
 786           set_remove_seqs_min_non_gap_length( f )
 787         rescue ArgumentError => e
 788           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 789         end
 790       end
 791       if ( cla.is_option_set?( SPLIT ) )
 792         begin
 793           s = cla.get_option_value_as_int( SPLIT )
 794           set_split( s )
 795         rescue ArgumentError => e
 796           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 797         end
 798
 799       end
 800       if ( cla.is_option_set?( REMOVE_MATCHING_SEQUENCES_OPTION ) )
 801         begin
 802           s = cla.get_option_value( REMOVE_MATCHING_SEQUENCES_OPTION )
 803           set_remove_matching( s )
 804         rescue ArgumentError => e
 805           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 806         end
 807       end
 808       if ( cla.is_option_set?( KEEP_MATCHING_SEQUENCES_OPTION ) )
 809         begin
 810           s = cla.get_option_value( KEEP_MATCHING_SEQUENCES_OPTION )
 811           set_keep_matching( s )
 812         rescue ArgumentError => e
 813           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 814         end
 815       end
 816       if ( cla.is_option_set?( DIE_IF_NAME_TOO_LONG ) )
 817         @die_if_name_too_long = true
 818       end
 819
 820
 821     end
 822
 823     def print_help()
 824       puts()
 825       puts( "Usage:" )
 826       puts()
 827       puts( "  " + PRG_NAME + ".rb [options] <input alignment> <output>" )
 828       puts()
 829       puts( "  options: -" + INPUT_TYPE_OPTION + "=<input type>: f for fasta, p for phylip selex type" )
 830       puts( "           -" + OUTPUT_TYPE_OPTION + "=<output type>: f for fasta, n for nexus, p for phylip sequential (default)" )
 831       puts( "           -" + MAXIMAL_NAME_LENGTH_OPTION + "=<n>: n=maximal name length (default for phylip 10, for fasta: unlimited )" )
 832       puts( "           -" + DIE_IF_NAME_TOO_LONG + ": die if sequence name too long" )
 833       puts( "           -" + WIDTH_OPTION + "=<n>: n=width (fasta output only, default is 60)" )
 834       puts( "           -" + CLEAN_UP_SEQ_OPTION + ": clean up sequences" )
 835       puts( "           -" + REMOVE_GAP_COLUMNS_OPTION + ": remove gap columns" )
 836       puts( "           -" + REMOVE_GAP_ONLY_COLUMNS + ": remove gap-only columns" )
 837       puts( "           -" + REMOVE_COLUMNS_GAP_RATIO_OPTION + "=<n>: remove columns for which ( seqs with gap / number of sequences > n )" )
 838       puts( "           -" + REMOVE_ALL_GAP_CHARACTERS_OPTION + ": remove all gap characters (destroys alignment, fasta output only)" )
 839       puts( "           -" + REMOVE_ALL_SEQUENCES_LISTED_OPTION + "=<file>: remove all sequences listed in file" )
 840       puts( "           -" + KEEP_ONLY_SEQUENCES_LISTED_OPTION + "=<file>: keep only sequences listed in file" )
 841       puts( "           -" + TRIM_OPTION + "=<first>-<last>: remove columns before first and after last" )
 842       puts( "           -" + REMOVE_SEQS_GAP_RATIO_OPTION + "=<n>: remove sequences for which the gap ratio > n (after column operations)" )
 843       puts( "           -" + REMOVE_SEQS_NON_GAP_LENGTH_OPTION + "=<n> remove sequences with less than n non-gap characters (after column operations)" )
 844       puts( "           -" + REMOVE_MATCHING_SEQUENCES_OPTION + "=<s> remove all sequences with names containing s" )
 845       puts( "           -" + KEEP_MATCHING_SEQUENCES_OPTION + "=<s> keep only sequences with names containing s" )
 846       puts( "           -" + SPLIT + "=<n> split a fasta file into n files of equal number of sequences (expect for " )
 847       puts( "            last one), cannot be used with other options" )
 848       puts( "           -" + REM_RED_OPTION + ": remove redundant sequences" )
 849       puts()
 850     end
 851
 852
 853
 854
 855
 856   end # class MsaProcessor
 857
 858
 859 end # module Evoruby