in progress...
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / msa_processor.rb
index d952277..eebf8f6 100644 (file)
@@ -9,7 +9,6 @@
 
 require 'date'
 require 'set'
-
 require 'lib/evo/util/constants'
 require 'lib/evo/util/util'
 require 'lib/evo/util/command_line_arguments'
@@ -23,17 +22,13 @@ require 'lib/evo/io/parser/general_msa_parser'
 require 'lib/evo/io/writer/msa_writer'
 
 module Evoruby
-
   class MsaProcessor
 
     PRG_NAME       = "msa_pro"
-    PRG_DATE       = "131112"
+    PRG_DATE       = "170609"
     PRG_DESC       = "processing of multiple sequence alignments"
-    PRG_VERSION    = "1.08"
-    COPYRIGHT      = "2008-2010 Christian M Zmasek"
-    CONTACT        = "phylosoft@gmail.com"
-    WWW            = "www.phylosoft.org"
-
+    PRG_VERSION    = "1.09"
+    WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
 
     NAME_LENGTH_DEFAULT                = 10
     WIDTH_DEFAULT_FASTA                = 60
@@ -55,14 +50,14 @@ module Evoruby
     REMOVE_MATCHING_SEQUENCES_OPTION   = "mr"
 
     TRIM_OPTION                        = "t"
+    SLIDING_EXTRACTION_OPTION          = "se"
     REMOVE_SEQS_GAP_RATIO_OPTION       = "rsgr"
     REMOVE_SEQS_NON_GAP_LENGTH_OPTION  = "rsl"
     SPLIT                              = "split"
+    SPLIT_BY_OS                        = "split_by_os"
     LOG_SUFFIX                         = "_msa_pro.log"
     HELP_OPTION_1                      = "help"
     HELP_OPTION_2                      = "h"
-
-
     def initialize()
       @input_format_set = false
       @output_format_set = false
@@ -91,21 +86,22 @@ module Evoruby
       @keep_seqs        = false
       @trim             = false
       @split            = -1
+      @split_by_os      = false
       @first            = -1
       @last             = -1
+      @window = false
+      @step = -1
+      @size =  -1
     end
 
-
     def run()
 
       Util.print_program_information( PRG_NAME,
-        PRG_VERSION,
-        PRG_DESC,
-        PRG_DATE,
-        COPYRIGHT,
-        CONTACT,
-        WWW,
-        STDOUT )
+      PRG_VERSION,
+      PRG_DESC,
+      PRG_DATE,
+      WWW,
+      STDOUT )
 
       if ( ARGV == nil || ARGV.length < 1 )
         Util.print_message( PRG_NAME, "Illegal number of arguments" )
@@ -120,7 +116,7 @@ module Evoruby
       end
 
       if ( cla.is_option_set?( HELP_OPTION_1 ) ||
-           cla.is_option_set?( HELP_OPTION_2 ) )
+      cla.is_option_set?( HELP_OPTION_2 ) )
         print_help
         exit( 0 )
       end
@@ -147,15 +143,17 @@ module Evoruby
       allowed_opts.push( REMOVE_SEQS_GAP_RATIO_OPTION )
       allowed_opts.push( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
       allowed_opts.push( SPLIT )
+      allowed_opts.push( SPLIT_BY_OS )
       allowed_opts.push( REM_RED_OPTION )
       allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION )
       allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION )
       allowed_opts.push( DIE_IF_NAME_TOO_LONG )
+      allowed_opts.push( SLIDING_EXTRACTION_OPTION )
 
       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
       if ( disallowed.length > 0 )
         Util.fatal_error( PRG_NAME,
-          "unknown option(s): " + disallowed )
+        "unknown option(s): " + disallowed )
       end
 
       input = cla.get_file_name( 0 )
@@ -274,10 +272,23 @@ module Evoruby
         puts( "Remove redundant sequences: true" )
         log << "Remove redundant sequences: true" + ld
       end
+      if ( @split_by_os )  # echo the split-by-OS setting to the console and, with the same label, to the log
+        puts( "Split by OS      : true"  )
+        log << "Split by OS      : true" + ld
+      end
       if ( @split > 0 )
         puts( "Split            : " + @split.to_s )
         log << "Split            : " + @split.to_s + ld
       end
+      if @window
+        puts( "Sliding window extraction: true"  )
+        log << "Sliding window extraction: true" + ld
+        puts( "Sliding window step      : " + @step.to_s )
+        log << "Sliding window step      : " + @step.to_s + ld
+        puts( "Sliding window size      : " +  @size.to_s )
+        log << "Sliding window size      : " +  @size.to_s + ld
+      end
+
       puts()
 
       f = MsaFactory.new()
@@ -362,8 +373,46 @@ module Evoruby
         end
 
         if ( @trim )
-          msa.trim!( @first, @last )
+          msa.trim!( @first, @last, '_S' )
+        end
+
+        if @window
+          msas = msa.sliding_extraction( @step, @size, @size / 2, '_Q' )
+          begin
+            io = MsaIO.new()
+            w = MsaWriter
+            if ( @pi_output )
+              w = PhylipSequentialWriter.new()
+              w.clean( @clean )
+              w.set_max_name_length( @name_length )
+            elsif( @fasta_output )
+              w = FastaWriter.new()
+              w.set_line_width( @width )
+              w.clean( @clean )
+              if ( @name_length_set )
+                w.set_max_name_length( @name_length )
+              end
+            elsif( @nexus_output )
+              w = NexusWriter.new()
+              w.clean( @clean )
+              w.set_max_name_length( @name_length )
+            end
+            for m in msas
+              name = output + "_" + m.get_name
+              if @fasta_output
+                name += ".fasta"
+              elsif @nexus_output
+                name += ".nex"
+              end
+              io.write_to_file( m, name, w )
+            end
+            Util.print_message( PRG_NAME, "wrote " + msas.length.to_s + " files"  )
+            log << "wrote " + msas.length.to_s + " files" + ld
+          rescue Exception => e
+            Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+          end
         end
+
         if( @rgr >= 0 )
           msa.remove_gap_columns_w_gap_ratio!( @rgr )
         elsif ( @rgc )
@@ -430,9 +479,44 @@ module Evoruby
           msa = sort( msa )
         end
 
+        if ( @split_by_os )
+          begin
+            msa_hash = msa.split_by_os(true)
+            io = MsaIO.new()
+            w = MsaWriter
+            if ( @pi_output )
+              w = PhylipSequentialWriter.new()
+              w.clean( @clean )
+              w.set_max_name_length( @name_length )
+            elsif( @fasta_output )
+              w = FastaWriter.new()
+              w.set_line_width( @width )
+              if ( @rg )
+                w.remove_gap_chars( true )
+                Util.print_warning_message( PRG_NAME, "removing gap character, the output is likely to become unaligned" )
+                log << "removing gap character, the output is likely to become unaligned" + ld
+              end
+              w.clean( @clean )
+              if ( @name_length_set )
+                w.set_max_name_length( @name_length )
+              end
+            elsif( @nexus_output )
+              w = NexusWriter.new()
+              w.clean( @clean )
+              w.set_max_name_length( @name_length )
+            end
+            msa_hash.each do |os, m|
+              my_os = os.gsub(' ', '_').gsub('/', '_').gsub('(', '_').gsub(')', '_')
+              io.write_to_file( m, output + '_' + my_os, w )
+            end
 
+            Util.print_message( PRG_NAME, "wrote " + msa_hash.length.to_s + " files"  )
+            log << "wrote " + msa_hash.length.to_s + " files" + ld
+          rescue Exception => e
+            Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+          end
 
-        if ( @split > 0 )
+        elsif ( @split > 0 )
           begin
             msas = msa.split( @split, true )
             io = MsaIO.new()
@@ -468,13 +552,12 @@ module Evoruby
           rescue Exception => e
             Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
           end
-
         end
       rescue Exception => e
         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
       end
 
-      if ( @split <= 0 )
+      if (@split <= 0) && (!@split_by_os) && (!@window)
 
         unless ( @rg )
           if ( msa.is_aligned() )
@@ -518,8 +601,8 @@ module Evoruby
           if removed.size > 0
             identicals = msa.get_identical_seqs_detected
             log << "the following " + identicals.size.to_s + " sequences are identical:" + ld
-            identicals.each { | s |
-              log << s + ld
+            identicals.each { | identical |
+              log << identical + ld
             }
             log << "ignoring the following " + removed.size.to_s + " redundant sequences:" + ld
             removed.each { | seq_name |
@@ -576,13 +659,11 @@ module Evoruby
           Util.fatal_error( PRG_NAME, "error: " + e.to_s )
         end
 
-
       end
       Util.print_message( PRG_NAME, "OK" )
       puts
     end
 
-
     private
 
     def sort( msa )
@@ -604,80 +685,98 @@ module Evoruby
       @fasta_input = fi
       @input_format_set = true
     end
+
     def set_phylip_input( pi = true )
       @phylip_input = pi
       @input_format_set = true
     end
+
     def set_name_length( i )
       @name_length = i
       @name_length_set = true
     end
+
     def set_width( i )
       @width = i
     end
+
     def set_fasta_output( fo = true )
       @fasta_output = fo
       @output_format_set = true
     end
+
     def set_pi_output( pso = true )
       @pi_output = pso
       @output_format_set = true
     end
+
     def set_nexus_output( nexus = true )
       @nexus_output = nexus
       @output_format_set = true
     end
+
     def set_clean( c = true )
       @clean = c
     end
+
     def set_remove_gap_columns( rgc = true )
       @rgc = rgc
     end
+
     def set_remove_gap_only_columns( rgoc = true )
       @rgoc = rgoc
     end
+
     def set_remove_gaps( rg = true )
       @rg = rg
     end
+
     def set_remove_gap_ratio( rgr )
       @rgr = rgr
     end
+
     def set_remove_seqs_gap_ratio( rsgr )
       @rsgr = rsgr
     end
+
     def set_remove_seqs_min_non_gap_length( rsl )
       @rsl = rsl
     end
+
     def set_remove_seqs( file )
       @seqs_name_file = file
       @remove_seqs    = true
       @keep_seqs      = false
     end
+
     def set_keep_seqs( file )
       @seqs_name_file = file
       @keep_seqs      = true
       @remove_seqs    = false
     end
+
     def set_trim( first, last )
       @trim            = true
       @first           = first
       @last            = last
     end
+
     def set_remove_matching( remove )
       @remove_matching  = remove
     end
+
     def set_keep_matching( keep )
       @keep_matching = keep
     end
+
     def set_rem_red( rr )
       @rem_red = rr
     end
 
-
-
     def set_split( s )
       if ( s > 0 )
         @split            = s
+        @split_by_os      = false
         @clean            = false  # phylip only
         @rgc              = false
         @rgoc             = false
@@ -694,6 +793,43 @@ module Evoruby
       end
     end
 
+    def set_split_by_os()  # switches on split-by-OS output and resets the other processing options listed below
+      @split            = -1  # count-based split off; run only acts on @split > 0
+      @split_by_os      = true  # run then writes one file per entry returned by msa.split_by_os
+      @clean            = false  # phylip only
+      @rgc              = false
+      @rgoc             = false
+      @rg               = false  # fasta only
+      @rgr              = -1  # run applies the gap-ratio column removal only when @rgr >= 0
+      @rsgr             = -1  # NOTE(review): presumably -1 means "not set" here as well; the check is not visible in this diff
+      @rsl              = -1  # NOTE(review): same assumption as for @rsgr
+      @seqs_name_file   = nil
+      @remove_seqs      = false
+      @keep_seqs        = false
+      @trim             = false  # @first/@last are only passed to msa.trim! when @trim is set
+      @first            = -1
+      @last             = -1
+      @window           = false  # sliding window extraction off
+    end
+
+    def set_window()  # switches on sliding window extraction and resets the split, gap-removal, sequence-filter and trim options; @clean, @step and @size are not touched here
+      @split            = -1  # count-based split off; run only acts on @split > 0
+      @split_by_os      = false
+      @rgc              = false
+      @rgoc             = false
+      @rg               = false  # fasta only
+      @rgr              = -1  # run applies the gap-ratio column removal only when @rgr >= 0
+      @rsgr             = -1  # NOTE(review): presumably -1 means "not set" here as well; the check is not visible in this diff
+      @rsl              = -1  # NOTE(review): same assumption as for @rsgr
+      @seqs_name_file   = nil
+      @remove_seqs      = false
+      @keep_seqs        = false
+      @trim             = false  # @first/@last are only passed to msa.trim! when @trim is set
+      @first            = -1
+      @last             = -1
+      @window           = true  # run then calls msa.sliding_extraction with @step and @size
+    end
+
     def analyze_command_line( cla )
       if ( cla.is_option_set?( INPUT_TYPE_OPTION ) )
         begin
@@ -798,6 +934,29 @@ module Evoruby
           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
         end
       end
+      if ( cla.is_option_set?( SLIDING_EXTRACTION_OPTION ) )  # sliding window extraction: -se=<step>/<window size>
+        begin
+          s = cla.get_option_value( SLIDING_EXTRACTION_OPTION )
+          if ( s =~ /\A(\d+)\/(\d+)\z/ )  # anchored: values such as "-5/10" or "1/2/3" are rejected instead of being partially matched
+            set_window
+            @window = true
+            @step = $1.to_i()
+            @size = $2.to_i()
+          else
+            puts( "illegal argument" )
+            print_help
+            exit( -1 )
+          end
+          if (@step <= 0) || (@size <= 0)  # \d+ also matches "0", which is not a usable step or size
+            puts( "illegal argument" )
+            print_help
+            exit( -1 )
+          end
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      end
+
       if ( cla.is_option_set?( REMOVE_SEQS_GAP_RATIO_OPTION ) )
         begin
           f = cla.get_option_value_as_float( REMOVE_SEQS_GAP_RATIO_OPTION )
@@ -814,14 +973,19 @@ module Evoruby
           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
         end
       end
-      if ( cla.is_option_set?( SPLIT ) )
+      if cla.is_option_set?( SPLIT_BY_OS )
+        begin
+          set_split_by_os()
+        rescue ArgumentError => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+        end
+      elsif ( cla.is_option_set?( SPLIT ) )
         begin
           s = cla.get_option_value_as_int( SPLIT )
           set_split( s )
         rescue ArgumentError => e
           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
         end
-
       end
       if ( cla.is_option_set?( REMOVE_MATCHING_SEQUENCES_OPTION ) )
         begin
@@ -843,7 +1007,6 @@ module Evoruby
         @die_if_name_too_long = true
       end
 
-
     end
 
     def print_help()
@@ -852,8 +1015,8 @@ module Evoruby
       puts()
       puts( "  " + PRG_NAME + ".rb [options] <input alignment> <output>" )
       puts()
-      puts( "  options: -" + INPUT_TYPE_OPTION + "=<input type>: f for fasta, p for phylip selex type" )
-      puts( "           -" + OUTPUT_TYPE_OPTION + "=<output type>: f for fasta, n for nexus, p for phylip sequential (default)" )
+      puts( "  options: -" + INPUT_TYPE_OPTION + "=<input type>: f for fasta (default), p for phylip/selex type" )
+      puts( "           -" + OUTPUT_TYPE_OPTION + "=<output type>: f for fasta (default), n for nexus, p for phylip sequential" )
       puts( "           -" + MAXIMAL_NAME_LENGTH_OPTION + "=<n>: n=maximal name length (default for phylip 10, for fasta: unlimited )" )
       puts( "           -" + DIE_IF_NAME_TOO_LONG + ": die if sequence name too long" )
       puts( "           -" + WIDTH_OPTION + "=<n>: n=width (fasta output only, default is 60)" )
@@ -871,15 +1034,11 @@ module Evoruby
       puts( "           -" + KEEP_MATCHING_SEQUENCES_OPTION + "=<s> keep only sequences with names containing s" )
       puts( "           -" + SPLIT + "=<n> split a fasta file into n files of equal number of sequences (expect for " )
       puts( "            last one), cannot be used with other options" )
+      puts( "           -" + SLIDING_EXTRACTION_OPTION + "=<step>/<window size>: sliding window extraction, cannot be used with other options" )
       puts( "           -" + REM_RED_OPTION + ": remove redundant sequences" )
       puts()
     end
 
-
-
-
-
   end # class MsaProcessor
 
-
 end # module Evoruby