forester/ruby/evoruby/lib/evo/tool/msa_processor.rb

   1 #
   2 # = lib/evo/apps/msa_processor.rb - MsaProcessor class
   3 #
   4 # Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
   5 # License::    GNU Lesser General Public License (LGPL)
   6 #
   7 # $Id: msa_processor.rb,v 1.33 2010/12/13 19:00:10 cmzmasek Exp $
   8 #
   9
  10 require 'date'
  11 require 'set'
  12 require 'lib/evo/util/constants'
  13 require 'lib/evo/util/util'
  14 require 'lib/evo/util/command_line_arguments'
  15 require 'lib/evo/msa/msa_factory'
  16 require 'lib/evo/io/msa_io'
  17 require 'lib/evo/io/writer/phylip_sequential_writer'
  18 require 'lib/evo/io/writer/nexus_writer'
  19 require 'lib/evo/io/writer/fasta_writer'
  20 require 'lib/evo/io/parser/fasta_parser'
  21 require 'lib/evo/io/parser/general_msa_parser'
  22 require 'lib/evo/io/writer/msa_writer'
  23
  24 module Evoruby
  25   class MsaProcessor
  26
  27     PRG_NAME       = "msa_pro"
  28     PRG_DATE       = "170215"
  29     PRG_DESC       = "processing of multiple sequence alignments"
  30     PRG_VERSION    = "1.08"
  31     WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
  32
  33     NAME_LENGTH_DEFAULT                = 10
  34     WIDTH_DEFAULT_FASTA                = 60
  35     INPUT_TYPE_OPTION                  = "i"
  36     OUTPUT_TYPE_OPTION                 = "o"
  37     MAXIMAL_NAME_LENGTH_OPTION         = "n"
  38     DIE_IF_NAME_TOO_LONG               = "d"
  39     WIDTH_OPTION                       = "w"
  40     CLEAN_UP_SEQ_OPTION                = "c"
  41     REM_RED_OPTION                     = "rem_red"
  42     REMOVE_GAP_COLUMNS_OPTION          = "rgc"
  43     REMOVE_GAP_ONLY_COLUMNS            = "rgoc"
  44     REMOVE_COLUMNS_GAP_RATIO_OPTION    = "rr"
  45     REMOVE_ALL_GAP_CHARACTERS_OPTION   = "rg"
  46     REMOVE_ALL_SEQUENCES_LISTED_OPTION = "r"
  47     KEEP_ONLY_SEQUENCES_LISTED_OPTION  = "k"
  48
  49     KEEP_MATCHING_SEQUENCES_OPTION     = "mk"
  50     REMOVE_MATCHING_SEQUENCES_OPTION   = "mr"
  51
  52     TRIM_OPTION                        = "t"
  53     REMOVE_SEQS_GAP_RATIO_OPTION       = "rsgr"
  54     REMOVE_SEQS_NON_GAP_LENGTH_OPTION  = "rsl"
  55     SPLIT                              = "split"
  56     SPLIT_BY_OS                        = "split_by_os"
  57     LOG_SUFFIX                         = "_msa_pro.log"
  58     HELP_OPTION_1                      = "help"
  59     HELP_OPTION_2                      = "h"
  60     def initialize()
  61       @input_format_set = false
  62       @output_format_set = false
  63       @fasta_input      = false
  64       @phylip_input     = true
  65       @name_length      = NAME_LENGTH_DEFAULT
  66       @name_length_set  = false
  67       @width            = WIDTH_DEFAULT_FASTA     # fasta only
  68       @pi_output        = true
  69       @fasta_output     = false
  70       @nexus_output     = false
  71       @clean            = false  # phylip only
  72       @rgc              = false
  73       @rgoc             = false
  74       @rg               = false  # fasta only
  75       @rem_red          = false
  76       @die_if_name_too_long  = false
  77       @rgr              = -1
  78       @rsgr             = -1
  79       @rsl              = -1
  80       @remove_matching  = nil
  81       @keep_matching    = nil
  82
  83       @seqs_name_file   = nil
  84       @remove_seqs      = false
  85       @keep_seqs        = false
  86       @trim             = false
  87       @split            = -1
  88       @split_by_os      = false
  89       @first            = -1
  90       @last             = -1
  91     end
  92
  93     def run()
  94
  95       Util.print_program_information( PRG_NAME,
  96       PRG_VERSION,
  97       PRG_DESC,
  98       PRG_DATE,
  99       WWW,
 100       STDOUT )
 101
 102       if ( ARGV == nil || ARGV.length < 1 )
 103         Util.print_message( PRG_NAME, "Illegal number of arguments" )
 104         print_help
 105         exit( -1 )
 106       end
 107
 108       begin
 109         cla = CommandLineArguments.new( ARGV )
 110       rescue ArgumentError => e
 111         Util.fatal_error( PRG_NAME, "Error: " + e.to_s, STDOUT )
 112       end
 113
 114       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
 115       cla.is_option_set?( HELP_OPTION_2 ) )
 116         print_help
 117         exit( 0 )
 118       end
 119
 120       if ( cla.get_number_of_files != 2 || ARGV.length < 2 )
 121         Util.print_message( PRG_NAME, "Illegal number of arguments" )
 122         print_help
 123         exit( -1 )
 124       end
 125
 126       allowed_opts = Array.new
 127       allowed_opts.push( INPUT_TYPE_OPTION )
 128       allowed_opts.push( OUTPUT_TYPE_OPTION )
 129       allowed_opts.push( MAXIMAL_NAME_LENGTH_OPTION )
 130       allowed_opts.push( WIDTH_OPTION )
 131       allowed_opts.push( CLEAN_UP_SEQ_OPTION )
 132       allowed_opts.push( REMOVE_GAP_COLUMNS_OPTION )
 133       allowed_opts.push( REMOVE_GAP_ONLY_COLUMNS )
 134       allowed_opts.push( REMOVE_COLUMNS_GAP_RATIO_OPTION )
 135       allowed_opts.push( REMOVE_ALL_GAP_CHARACTERS_OPTION )
 136       allowed_opts.push( REMOVE_ALL_SEQUENCES_LISTED_OPTION )
 137       allowed_opts.push( KEEP_ONLY_SEQUENCES_LISTED_OPTION )
 138       allowed_opts.push( TRIM_OPTION )
 139       allowed_opts.push( REMOVE_SEQS_GAP_RATIO_OPTION )
 140       allowed_opts.push( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
 141       allowed_opts.push( SPLIT )
 142       allowed_opts.push( SPLIT_BY_OS )
 143       allowed_opts.push( REM_RED_OPTION )
 144       allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION )
 145       allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION )
 146       allowed_opts.push( DIE_IF_NAME_TOO_LONG )
 147
 148       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
 149       if ( disallowed.length > 0 )
 150         Util.fatal_error( PRG_NAME,
 151         "unknown option(s): " + disallowed )
 152       end
 153
 154       input = cla.get_file_name( 0 )
 155       output = cla.get_file_name( 1 )
 156
 157       analyze_command_line( cla )
 158
 159       begin
 160         Util.check_file_for_readability( input )
 161       rescue IOError => e
 162         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
 163       end
 164
 165       begin
 166         Util.check_file_for_writability( output )
 167       rescue IOError => e
 168         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
 169       end
 170
 171       if ( @rg )
 172         set_pi_output( false )
 173         set_fasta_output( true )
 174         set_nexus_output( false )
 175       end
 176
 177       if ( !@input_format_set )
 178         fasta_like = false
 179         begin
 180           fasta_like = Util.looks_like_fasta?( input )
 181         rescue ArgumentError => e
 182           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
 183         end
 184         @fasta_input = fasta_like
 185         @phylip_input = !fasta_like
 186         if ( !@output_format_set )
 187           @fasta_output = fasta_like
 188           @pi_output = !fasta_like
 189           @nexus_output = false
 190         end
 191       end
 192
 193       ld = Constants::LINE_DELIMITER
 194       log = PRG_NAME + " " + PRG_VERSION + " [" + PRG_DATE + "]" + " LOG" + ld
 195       now = DateTime.now
 196       log << "Date/time: " + now.to_s + ld
 197
 198       puts()
 199       puts( "Input alignment  : " + input )
 200       log << "Input alignment  : " + input + ld
 201       puts( "Output alignment : " + output )
 202       log << "Output alignment : " + output + ld
 203       if ( @phylip_input )
 204         puts( "Input is         : Phylip, or something like it" )
 205         log << "Input is         : Phylip, or something like it" + ld
 206       elsif ( @fasta_input )
 207         puts( "Input is         : Fasta" )
 208         log << "Input is         : Fasta" + ld
 209       end
 210       if( @rgr >= 0 )
 211         puts( "Max col gap ratio: " + @rgr.to_s )
 212         log << "Max col gap ratio: " + @rgr.to_s + ld
 213       elsif ( @rgc )
 214         puts( "Remove gap colums" )
 215         log << "Remove gap colums" + ld
 216       elsif( @rgoc )
 217         puts( "Remove gap only colums" )
 218         log << "Remove gap only colums" + ld
 219       end
 220       if ( @clean )
 221         puts( "Clean up         : true" )
 222         log << "Clean up         : true" + ld
 223       end
 224
 225       if ( @pi_output )
 226         puts( "Output is        : Phylip interleaved" )
 227         log << "Output is        : Phylip interleaved" + ld
 228       elsif ( @fasta_output )
 229         puts( "Output is        : Fasta" )
 230         log << "Output is        : Fasta" + ld
 231         if ( @width )
 232           puts( "Width            : " + @width.to_s )
 233           log << "Width            : " + @width.to_s + ld
 234         end
 235         if ( @rg )
 236           puts( "Remove all gap characters (alignment is destroyed)" )
 237           log << "Remove all gap characters (alignment is destroyed)" + ld
 238         end
 239       elsif ( @nexus_output )
 240         puts( "Output is        : Nexus" )
 241         log << "Output is        : Nexus" + ld
 242       end
 243       if ( @name_length_set || !@fasta_output )
 244         puts( "Max name length  : " + @name_length.to_s )
 245         log << "Max name length  : " + @name_length.to_s + ld
 246       end
 247       if( @rsgr >= 0 )
 248         puts( "Remove sequences for which the gap ratio > " + @rsgr.to_s )
 249         log << "Remove sequences for which the gap ratio > " + @rsgr.to_s + ld
 250       end
 251       if( @rsl >= 0 )
 252         puts( "Remove sequences with less than "  + @rsl.to_s + " non-gap characters" )
 253         log << "Remove sequences with less than "  + @rsl.to_s + " non-gap characters" + ld
 254       end
 255       if ( @remove_seqs )
 256         puts( "Remove sequences listed in: " + @seqs_name_file )
 257         log << "Remove sequences listed in: " + @seqs_name_file + ld
 258       elsif ( @keep_seqs )
 259         puts( "Keep only sequences listed in: " + @seqs_name_file )
 260         log << "Keep only sequences listed in: " + @seqs_name_file + ld
 261       end
 262       if ( @trim )
 263         puts( "Keep only columns from: "+ @first.to_s + " to " + @last.to_s )
 264         log << "Keep only columns from: "+ @first.to_s + " to " + @last.to_s + ld
 265       end
 266       if ( @rem_red )
 267         puts( "Remove redundant sequences: true" )
 268         log << "Remove redundant sequences: true" + ld
 269       end
 270       if ( @split_by_os )
 271         puts( "Split by OS      : true"  )
 272         log << "Split            : true" + ld
 273       end
 274       if ( @split > 0 )
 275         puts( "Split            : " + @split.to_s )
 276         log << "Split            : " + @split.to_s + ld
 277       end
 278       puts()
 279
 280       f = MsaFactory.new()
 281
 282       msa = nil
 283
 284       begin
 285         if ( @phylip_input )
 286           msa = f.create_msa_from_file( input, GeneralMsaParser.new() )
 287         elsif ( @fasta_input )
 288           msa = f.create_msa_from_file( input, FastaParser.new() )
 289         end
 290       rescue Exception => e
 291         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 292       end
 293
 294       if ( msa.is_aligned() )
 295         Util.print_message( PRG_NAME, "Length of original alignment         : " + msa.get_length.to_s )
 296         log << "Length of original alignment         : " + msa.get_length.to_s + ld
 297         gp = msa.calculate_gap_proportion
 298         Util.print_message( PRG_NAME, "Gap-proportion of original alignment : " + gp.to_s )
 299         log << "Gap-proportion of original alignment : " +  gp.to_s + ld
 300       else
 301         Util.print_message( PRG_NAME, "Input is not aligned" )
 302         log << "Input is not aligned" + ld
 303       end
 304
 305       all_names = Set.new()
 306       for i in 0 ... msa.get_number_of_seqs()
 307         current_name = msa.get_sequence( i ).get_name
 308         if all_names.include?( current_name )
 309           Util.print_warning_message( PRG_NAME, "sequence name [" + current_name + "] is not unique" )
 310         else
 311           all_names.add( current_name )
 312         end
 313       end
 314
 315       begin
 316
 317         if ( @remove_seqs || @keep_seqs )
 318           names = Util.file2array( @seqs_name_file, true )
 319           if ( names == nil ||  names.length() < 1 )
 320             error_msg = "file \"" + @seqs_name_file.to_s + "\" appears empty"
 321             Util.fatal_error( PRG_NAME, error_msg )
 322           end
 323
 324           if ( @remove_seqs )
 325             c = 0
 326             for i in 0 ... names.length()
 327               to_delete = msa.find_by_name( names[ i ], true, false )
 328               if ( to_delete.length() < 1 )
 329                 error_msg = "sequence name \"" + names[ i ] + "\" not found"
 330                 Util.fatal_error( PRG_NAME, error_msg )
 331               elsif ( to_delete.length() > 1 )
 332                 error_msg = "sequence name \"" + names[ i ] + "\" is not unique"
 333                 Util.fatal_error( PRG_NAME, error_msg )
 334               else
 335                 msa.remove_sequence!( to_delete[ 0 ] )
 336                 c += 1
 337               end
 338             end
 339             Util.print_message( PRG_NAME, "Removed " + c.to_s + " sequences" )
 340             log <<  "Removed " + c.to_s + " sequences" + ld
 341           elsif ( @keep_seqs )
 342             msa_new = Msa.new()
 343             r = 0
 344             k = 0
 345             for j in 0 ... msa.get_number_of_seqs()
 346               if ( names.include?( msa.get_sequence( j ).get_name() ) )
 347                 msa_new.add_sequence( msa.get_sequence( j ) )
 348                 k += 1
 349               else
 350                 r += 1
 351               end
 352             end
 353             msa = msa_new
 354             Util.print_message( PRG_NAME, "Kept    " + k.to_s + " sequences" )
 355             log << "Kept    " + k.to_s + " sequences" + ld
 356             Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" )
 357             log << "removed " + r.to_s + " sequences" + ld
 358           end
 359         end
 360
 361         if ( @trim )
 362           msa.trim!( @first, @last )
 363         end
 364         if( @rgr >= 0 )
 365           msa.remove_gap_columns_w_gap_ratio!( @rgr )
 366         elsif ( @rgc )
 367           msa.remove_gap_columns!()
 368         elsif( @rgoc )
 369           msa.remove_gap_only_columns!()
 370         end
 371         if( @rsgr >= 0 )
 372           n = msa.get_number_of_seqs()
 373           removed = msa.remove_sequences_by_gap_ratio!( @rsgr )
 374           k = msa.get_number_of_seqs()
 375           r = n - k
 376           Util.print_message( PRG_NAME, "Kept    " + k.to_s + " sequences" )
 377           log << "Kept    " + k.to_s + " sequences" + ld
 378           Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences"  )
 379           log << "Removed " + r.to_s + " sequences:" + ld
 380           removed.each { | seq_name |
 381             log << "         " + seq_name  + ld
 382           }
 383         end
 384         if( @rsl >= 0 )
 385           n = msa.get_number_of_seqs()
 386           removed = msa.remove_sequences_by_non_gap_length!( @rsl )
 387           k = msa.get_number_of_seqs()
 388           r = n - k
 389           Util.print_message( PRG_NAME, "Kept    " + k.to_s + " sequences" )
 390           log << "Kept    " + k.to_s + " sequences" + ld
 391           Util.print_message( PRG_NAME, "Removed " + r.to_s + " sequences" )
 392           log << "Removed " + r.to_s + " sequences:" + ld
 393           removed.each { | seq_name |
 394             log << "         " + seq_name  + ld
 395           }
 396         end
 397         if ( @keep_matching )
 398           n = msa.get_number_of_seqs
 399           to_be_removed = Set.new
 400           for ii in 0 ...  n
 401             seq = msa.get_sequence( ii )
 402             if !seq.get_name.downcase.index( @keep_matching.downcase )
 403               to_be_removed.add( ii )
 404             end
 405           end
 406           to_be_removed_ary = to_be_removed.to_a.sort.reverse
 407           to_be_removed_ary.each { | index |
 408             msa.remove_sequence!( index )
 409           }
 410           # msa = sort( msa )
 411         end
 412         if ( @remove_matching )
 413           n = msa.get_number_of_seqs
 414           to_be_removed = Set.new
 415           for iii in 0 ... n
 416
 417             seq = msa.get_sequence( iii )
 418
 419             if seq.get_name.downcase.index( @remove_matching.downcase )
 420               to_be_removed.add( iii )
 421             end
 422           end
 423           to_be_removed_ary = to_be_removed.to_a.sort.reverse
 424           to_be_removed_ary.each { | index |
 425             msa.remove_sequence!( index )
 426           }
 427           msa = sort( msa )
 428         end
 429
 430         if ( @split_by_os )
 431           begin
 432             msa_hash = msa.split_by_os(true)
 433             io = MsaIO.new()
 434             w = MsaWriter
 435             if ( @pi_output )
 436               w = PhylipSequentialWriter.new()
 437               w.clean( @clean )
 438               w.set_max_name_length( @name_length )
 439             elsif( @fasta_output )
 440               w = FastaWriter.new()
 441               w.set_line_width( @width )
 442               if ( @rg )
 443                 w.remove_gap_chars( true )
 444                 Util.print_warning_message( PRG_NAME, "removing gap character, the output is likely to become unaligned" )
 445                 log << "removing gap character, the output is likely to become unaligned" + ld
 446               end
 447               w.clean( @clean )
 448               if ( @name_length_set )
 449                 w.set_max_name_length( @name_length )
 450               end
 451             elsif( @nexus_output )
 452               w = NexusWriter.new()
 453               w.clean( @clean )
 454               w.set_max_name_length( @name_length )
 455             end
 456             msa_hash.each do |os, msa|
 457               my_os = os.gsub(' ', '_').gsub('/', '_').gsub('(', '_').gsub(')', '_')
 458               io.write_to_file( msa, output + '_' + my_os, w )
 459             end
 460
 461             Util.print_message( PRG_NAME, "wrote " + msa_hash.length.to_s + " files"  )
 462             log << "wrote " + msa_hash.length.to_s + " files" + ld
 463           rescue Exception => e
 464             Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 465           end
 466
 467         elsif ( @split > 0 )
 468           begin
 469             msas = msa.split( @split, true )
 470             io = MsaIO.new()
 471             w = MsaWriter
 472             if ( @pi_output )
 473               w = PhylipSequentialWriter.new()
 474               w.clean( @clean )
 475               w.set_max_name_length( @name_length )
 476             elsif( @fasta_output )
 477               w = FastaWriter.new()
 478               w.set_line_width( @width )
 479               if ( @rg )
 480                 w.remove_gap_chars( true )
 481                 Util.print_warning_message( PRG_NAME, "removing gap character, the output is likely to become unaligned" )
 482                 log << "removing gap character, the output is likely to become unaligned" + ld
 483               end
 484               w.clean( @clean )
 485               if ( @name_length_set )
 486                 w.set_max_name_length( @name_length )
 487               end
 488             elsif( @nexus_output )
 489               w = NexusWriter.new()
 490               w.clean( @clean )
 491               w.set_max_name_length( @name_length )
 492             end
 493             i = 0
 494             for m in msas
 495               i = i + 1
 496               io.write_to_file( m, output + "_" + i.to_s, w )
 497             end
 498             Util.print_message( PRG_NAME, "wrote " + msas.length.to_s + " files"  )
 499             log << "wrote " + msas.length.to_s + " files" + ld
 500           rescue Exception => e
 501             Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 502           end
 503         end
 504       rescue Exception => e
 505         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 506       end
 507
 508       if (@split <= 0) && (!@split_by_os)
 509
 510         unless ( @rg )
 511           if ( msa.is_aligned() )
 512             Util.print_message( PRG_NAME, "Length of processed alignment        : " + msa.get_length.to_s )
 513             log <<  "Length of processed alignment        : " + msa.get_length.to_s + ld
 514             gp = msa.calculate_gap_proportion
 515             Util.print_message( PRG_NAME, "Gap-proportion of processed alignment: " + gp.to_s )
 516             log << "Gap-proportion of processed alignment: " +  gp.to_s + ld
 517           else
 518             min = 0
 519             max = 0
 520             sum = 0
 521             first = true
 522             for s in 0 ... msa.get_number_of_seqs
 523               seq = msa.get_sequence( s )
 524               l = seq.get_length
 525               sum += l
 526               if l > max
 527                 max = l
 528               end
 529               if first || l < min
 530                 min = l
 531               end
 532               first = false
 533             end
 534             avg = sum / msa.get_number_of_seqs
 535             Util.print_message( PRG_NAME, "Output is not aligned" )
 536             log << "Output is not aligned" + ld
 537             Util.print_message( PRG_NAME, "Shortest sequence                    : " + min.to_s )
 538             log <<  "Shortest sequence                    : " + min.to_s + ld
 539             Util.print_message( PRG_NAME, "Longest sequence                     : " + max.to_s )
 540             log <<  "Longest sequence                     : " + max.to_s + ld
 541             Util.print_message( PRG_NAME, "Average length                       : " + avg.to_s )
 542             log <<  "Average length                       : " + avg.to_s + ld
 543
 544           end
 545         end
 546
 547         if @rem_red
 548           removed = msa.remove_redundant_sequences!( true, false )
 549           if removed.size > 0
 550             identicals = msa.get_identical_seqs_detected
 551             log << "the following " + identicals.size.to_s + " sequences are identical:" + ld
 552             identicals.each { | identical |
 553               log << identical + ld
 554             }
 555             log << "ignoring the following " + removed.size.to_s + " redundant sequences:" + ld
 556             removed.each { | seq_name |
 557               log << seq_name + ld
 558             }
 559             Util.print_message( PRG_NAME, "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" )
 560             log << "will store " + msa.get_number_of_seqs.to_s + " non-redundant sequences" + ld
 561           end
 562         end
 563
 564         io = MsaIO.new()
 565
 566         w = MsaWriter
 567
 568         if ( @pi_output )
 569           w = PhylipSequentialWriter.new()
 570           w.clean( @clean )
 571           w.set_max_name_length( @name_length )
 572           w.set_exception_if_name_too_long( @die_if_name_too_long )
 573         elsif( @fasta_output )
 574           w = FastaWriter.new()
 575           w.set_line_width( @width )
 576           if ( @rg )
 577             w.remove_gap_chars( true )
 578             Util.print_warning_message( PRG_NAME, "removing gap characters, the output is likely to become unaligned"  )
 579             log << "removing gap character, the output is likely to become unaligned" + ld
 580           end
 581           w.clean( @clean )
 582           if ( @name_length_set )
 583             w.set_max_name_length( @name_length )
 584             w.set_exception_if_name_too_long( @die_if_name_too_long )
 585           end
 586         elsif( @nexus_output )
 587           w = NexusWriter.new()
 588           w.clean( @clean )
 589           w.set_max_name_length( @name_length )
 590           w.set_exception_if_name_too_long( @die_if_name_too_long )
 591         end
 592
 593         begin
 594           io.write_to_file( msa, output, w )
 595         rescue Exception => e
 596           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
 597         end
 598
 599         Util.print_message( PRG_NAME, "Number of sequences in output        : " + msa.get_number_of_seqs.to_s )
 600         log << "Number of sequences in output        : " + msa.get_number_of_seqs.to_s + ld
 601
 602         begin
 603           f = File.open( output + LOG_SUFFIX, 'a' )
 604           f.print( log )
 605           f.close
 606         rescue Exception => e
 607           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
 608         end
 609
 610       end
 611       Util.print_message( PRG_NAME, "OK" )
 612       puts
 613     end
 614
 615     private
 616
 617     def sort( msa )
 618       names = Set.new
 619       for i in 0 ... msa.get_number_of_seqs
 620         name = msa.get_sequence( i ).get_name
 621         names.add( name )
 622       end
 623       sorted_ary = names.to_a.sort
 624       new_msa = Msa.new
 625       sorted_ary.each { | seq_name |
 626         seq = msa.get_sequence( msa.find_by_name( seq_name, true, false )[ 0 ] )
 627         new_msa.add_sequence( seq )
 628       }
 629       new_msa
 630     end
 631
 632     def set_fasta_input( fi = true )
 633       @fasta_input = fi
 634       @input_format_set = true
 635     end
 636
 637     def set_phylip_input( pi = true )
 638       @phylip_input = pi
 639       @input_format_set = true
 640     end
 641
 642     def set_name_length( i )
 643       @name_length = i
 644       @name_length_set = true
 645     end
 646
 647     def set_width( i )
 648       @width = i
 649     end
 650
 651     def set_fasta_output( fo = true )
 652       @fasta_output = fo
 653       @output_format_set = true
 654     end
 655
 656     def set_pi_output( pso = true )
 657       @pi_output = pso
 658       @output_format_set = true
 659     end
 660
 661     def set_nexus_output( nexus = true )
 662       @nexus_output = nexus
 663       @output_format_set = true
 664     end
 665
 666     def set_clean( c = true )
 667       @clean = c
 668     end
 669
 670     def set_remove_gap_columns( rgc = true )
 671       @rgc = rgc
 672     end
 673
 674     def set_remove_gap_only_columns( rgoc = true )
 675       @rgoc = rgoc
 676     end
 677
 678     def set_remove_gaps( rg = true )
 679       @rg = rg
 680     end
 681
 682     def set_remove_gap_ratio( rgr )
 683       @rgr = rgr
 684     end
 685
 686     def set_remove_seqs_gap_ratio( rsgr )
 687       @rsgr = rsgr
 688     end
 689
 690     def set_remove_seqs_min_non_gap_length( rsl )
 691       @rsl = rsl
 692     end
 693
 694     def set_remove_seqs( file )
 695       @seqs_name_file = file
 696       @remove_seqs    = true
 697       @keep_seqs      = false
 698     end
 699
 700     def set_keep_seqs( file )
 701       @seqs_name_file = file
 702       @keep_seqs      = true
 703       @remove_seqs    = false
 704     end
 705
 706     def set_trim( first, last )
 707       @trim            = true
 708       @first           = first
 709       @last            = last
 710     end
 711
 712     def set_remove_matching( remove )
 713       @remove_matching  = remove
 714     end
 715
 716     def set_keep_matching( keep )
 717       @keep_matching = keep
 718     end
 719
 720     def set_rem_red( rr )
 721       @rem_red = rr
 722     end
 723
 724     def set_split( s )
 725       if ( s > 0 )
 726         @split            = s
 727         @split_by_os      = false
 728         @clean            = false  # phylip only
 729         @rgc              = false
 730         @rgoc             = false
 731         @rg               = false  # fasta only
 732         @rgr              = -1
 733         @rsgr             = -1
 734         @rsl              = -1
 735         @seqs_name_file   = nil
 736         @remove_seqs      = false
 737         @keep_seqs        = false
 738         @trim             = false
 739         @first            = -1
 740         @last             = -1
 741       end
 742     end
 743
 744     def set_split_by_os()
 745       @split            = -1
 746       @split_by_os      = true
 747       @clean            = false  # phylip only
 748       @rgc              = false
 749       @rgoc             = false
 750       @rg               = false  # fasta only
 751       @rgr              = -1
 752       @rsgr             = -1
 753       @rsl              = -1
 754       @seqs_name_file   = nil
 755       @remove_seqs      = false
 756       @keep_seqs        = false
 757       @trim             = false
 758       @first            = -1
 759       @last             = -1
 760     end
 761
 762     def analyze_command_line( cla )
 763       if ( cla.is_option_set?( INPUT_TYPE_OPTION ) )
 764         begin
 765           type = cla.get_option_value( INPUT_TYPE_OPTION )
 766           if ( type == "p" )
 767             set_phylip_input( true )
 768             set_fasta_input( false )
 769           elsif ( type == "f" )
 770             set_fasta_input( true )
 771             set_phylip_input( false )
 772           end
 773         rescue ArgumentError => e
 774           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 775         end
 776       end
 777       if ( cla.is_option_set?( OUTPUT_TYPE_OPTION ) )
 778         begin
 779           type = cla.get_option_value( OUTPUT_TYPE_OPTION )
 780           if ( type == "p" )
 781             set_pi_output( true )
 782             set_fasta_output( false )
 783             set_nexus_output( false )
 784           elsif ( type == "f" )
 785             set_pi_output( false )
 786             set_fasta_output( true )
 787             set_nexus_output( false )
 788           elsif ( type == "n" )
 789             set_pi_output( false )
 790             set_fasta_output( false )
 791             set_nexus_output( true )
 792           end
 793         rescue ArgumentError => e
 794           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 795         end
 796       end
 797       if ( cla.is_option_set?( MAXIMAL_NAME_LENGTH_OPTION ) )
 798         begin
 799           l = cla.get_option_value_as_int( MAXIMAL_NAME_LENGTH_OPTION )
 800           set_name_length( l )
 801         rescue ArgumentError => e
 802           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 803         end
 804       end
 805       if ( cla.is_option_set?( WIDTH_OPTION ) )
 806         begin
 807           w = cla.get_option_value_as_int( WIDTH_OPTION )
 808           set_width( w )
 809         rescue ArgumentError => e
 810           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 811         end
 812       end
 813       if ( cla.is_option_set?( CLEAN_UP_SEQ_OPTION ) )
 814         set_clean( true )
 815       end
 816       if ( cla.is_option_set?( REMOVE_GAP_COLUMNS_OPTION ) )
 817         set_remove_gap_columns( true )
 818       end
 819       if ( cla.is_option_set?( REM_RED_OPTION ) )
 820         set_rem_red( true )
 821       end
 822       if ( cla.is_option_set?( REMOVE_GAP_ONLY_COLUMNS ) )
 823         set_remove_gap_only_columns( true )
 824       end
 825       if ( cla.is_option_set?( REMOVE_ALL_GAP_CHARACTERS_OPTION ) )
 826         set_remove_gaps( true )
 827       end
 828       if ( cla.is_option_set?( REMOVE_COLUMNS_GAP_RATIO_OPTION ) )
 829         begin
 830           f = cla.get_option_value_as_float( REMOVE_COLUMNS_GAP_RATIO_OPTION )
 831           set_remove_gap_ratio( f )
 832         rescue ArgumentError => e
 833           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 834         end
 835       end
 836       if ( cla.is_option_set?( REMOVE_ALL_SEQUENCES_LISTED_OPTION ) )
 837         begin
 838           s = cla.get_option_value( REMOVE_ALL_SEQUENCES_LISTED_OPTION )
 839           set_remove_seqs( s )
 840         rescue ArgumentError => e
 841           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 842         end
 843       end
 844       if ( cla.is_option_set?( KEEP_ONLY_SEQUENCES_LISTED_OPTION ) )
 845         begin
 846           s = cla.get_option_value( KEEP_ONLY_SEQUENCES_LISTED_OPTION )
 847           set_keep_seqs( s )
 848         rescue ArgumentError => e
 849           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 850         end
 851       end
 852       if ( cla.is_option_set?( TRIM_OPTION ) )
 853         begin
 854           s = cla.get_option_value( TRIM_OPTION )
 855           if ( s =~ /(\d+)-(\d+)/ )
 856             set_trim( $1.to_i(), $2.to_i() )
 857           else
 858             puts( "illegal argument" )
 859             print_help
 860             exit( -1 )
 861           end
 862         rescue ArgumentError => e
 863           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 864         end
 865       end
 866       if ( cla.is_option_set?( REMOVE_SEQS_GAP_RATIO_OPTION ) )
 867         begin
 868           f = cla.get_option_value_as_float( REMOVE_SEQS_GAP_RATIO_OPTION )
 869           set_remove_seqs_gap_ratio( f )
 870         rescue ArgumentError => e
 871           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 872         end
 873       end
 874       if ( cla.is_option_set?( REMOVE_SEQS_NON_GAP_LENGTH_OPTION ) )
 875         begin
 876           f = cla.get_option_value_as_int( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
 877           set_remove_seqs_min_non_gap_length( f )
 878         rescue ArgumentError => e
 879           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 880         end
 881       end
 882       if cla.is_option_set?( SPLIT_BY_OS )
 883         begin
 884           set_split_by_os()
 885         rescue ArgumentError => e
 886           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 887         end
 888       elsif ( cla.is_option_set?( SPLIT ) )
 889         begin
 890           s = cla.get_option_value_as_int( SPLIT )
 891           set_split( s )
 892         rescue ArgumentError => e
 893           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 894         end
 895       end
 896       if ( cla.is_option_set?( REMOVE_MATCHING_SEQUENCES_OPTION ) )
 897         begin
 898           s = cla.get_option_value( REMOVE_MATCHING_SEQUENCES_OPTION )
 899           set_remove_matching( s )
 900         rescue ArgumentError => e
 901           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 902         end
 903       end
 904       if ( cla.is_option_set?( KEEP_MATCHING_SEQUENCES_OPTION ) )
 905         begin
 906           s = cla.get_option_value( KEEP_MATCHING_SEQUENCES_OPTION )
 907           set_keep_matching( s )
 908         rescue ArgumentError => e
 909           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
 910         end
 911       end
 912       if ( cla.is_option_set?( DIE_IF_NAME_TOO_LONG ) )
 913         @die_if_name_too_long = true
 914       end
 915
 916     end
 917
 918     def print_help()
 919       puts()
 920       puts( "Usage:" )
 921       puts()
 922       puts( "  " + PRG_NAME + ".rb [options] <input alignment> <output>" )
 923       puts()
 924       puts( "  options: -" + INPUT_TYPE_OPTION + "=<input type>: f for fasta, p for phylip selex type" )
 925       puts( "           -" + OUTPUT_TYPE_OPTION + "=<output type>: f for fasta, n for nexus, p for phylip sequential (default)" )
 926       puts( "           -" + MAXIMAL_NAME_LENGTH_OPTION + "=<n>: n=maximal name length (default for phylip 10, for fasta: unlimited )" )
 927       puts( "           -" + DIE_IF_NAME_TOO_LONG + ": die if sequence name too long" )
 928       puts( "           -" + WIDTH_OPTION + "=<n>: n=width (fasta output only, default is 60)" )
 929       puts( "           -" + CLEAN_UP_SEQ_OPTION + ": clean up sequences" )
 930       puts( "           -" + REMOVE_GAP_COLUMNS_OPTION + ": remove gap columns" )
 931       puts( "           -" + REMOVE_GAP_ONLY_COLUMNS + ": remove gap-only columns" )
 932       puts( "           -" + REMOVE_COLUMNS_GAP_RATIO_OPTION + "=<n>: remove columns for which ( seqs with gap / number of sequences > n )" )
 933       puts( "           -" + REMOVE_ALL_GAP_CHARACTERS_OPTION + ": remove all gap characters (destroys alignment, fasta output only)" )
 934       puts( "           -" + REMOVE_ALL_SEQUENCES_LISTED_OPTION + "=<file>: remove all sequences listed in file" )
 935       puts( "           -" + KEEP_ONLY_SEQUENCES_LISTED_OPTION + "=<file>: keep only sequences listed in file" )
 936       puts( "           -" + TRIM_OPTION + "=<first>-<last>: remove columns before first and after last" )
 937       puts( "           -" + REMOVE_SEQS_GAP_RATIO_OPTION + "=<n>: remove sequences for which the gap ratio > n (after column operations)" )
 938       puts( "           -" + REMOVE_SEQS_NON_GAP_LENGTH_OPTION + "=<n> remove sequences with less than n non-gap characters (after column operations)" )
 939       puts( "           -" + REMOVE_MATCHING_SEQUENCES_OPTION + "=<s> remove all sequences with names containing s" )
 940       puts( "           -" + KEEP_MATCHING_SEQUENCES_OPTION + "=<s> keep only sequences with names containing s" )
 941       puts( "           -" + SPLIT + "=<n> split a fasta file into n files of equal number of sequences (expect for " )
 942       puts( "            last one), cannot be used with other options" )
 943       puts( "           -" + REM_RED_OPTION + ": remove redundant sequences" )
 944       puts()
 945     end
 946
 947   end # class MsaProcessor
 948
 949 end # module Evoruby