X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fruby%2Fevoruby%2Flib%2Fevo%2Ftool%2Fmsa_processor.rb;h=eebf8f653bcfb322d441705b05db4af1e97830a7;hb=9fe0c828e4a8e1f61a1acba1e7b8439dd5edc598;hp=d9522779393e558a33478c6a591d916757ec76e5;hpb=73a72373e1e807ea755fbeb072ad979034e6d05a;p=jalview.git
diff --git a/forester/ruby/evoruby/lib/evo/tool/msa_processor.rb b/forester/ruby/evoruby/lib/evo/tool/msa_processor.rb
index d952277..eebf8f6 100644
--- a/forester/ruby/evoruby/lib/evo/tool/msa_processor.rb
+++ b/forester/ruby/evoruby/lib/evo/tool/msa_processor.rb
@@ -9,7 +9,6 @@
require 'date'
require 'set'
-
require 'lib/evo/util/constants'
require 'lib/evo/util/util'
require 'lib/evo/util/command_line_arguments'
@@ -23,17 +22,13 @@ require 'lib/evo/io/parser/general_msa_parser'
require 'lib/evo/io/writer/msa_writer'
module Evoruby
-
class MsaProcessor
PRG_NAME = "msa_pro"
- PRG_DATE = "131112"
+ PRG_DATE = "170609"
PRG_DESC = "processing of multiple sequence alignments"
- PRG_VERSION = "1.08"
- COPYRIGHT = "2008-2010 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
-
+ PRG_VERSION = "1.09"
+ WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
NAME_LENGTH_DEFAULT = 10
WIDTH_DEFAULT_FASTA = 60
@@ -55,14 +50,14 @@ module Evoruby
REMOVE_MATCHING_SEQUENCES_OPTION = "mr"
TRIM_OPTION = "t"
+ SLIDING_EXTRACTION_OPTION = "se"
REMOVE_SEQS_GAP_RATIO_OPTION = "rsgr"
REMOVE_SEQS_NON_GAP_LENGTH_OPTION = "rsl"
SPLIT = "split"
+ SPLIT_BY_OS = "split_by_os"
LOG_SUFFIX = "_msa_pro.log"
HELP_OPTION_1 = "help"
HELP_OPTION_2 = "h"
-
-
def initialize()
@input_format_set = false
@output_format_set = false
@@ -91,21 +86,22 @@ module Evoruby
@keep_seqs = false
@trim = false
@split = -1
+ @split_by_os = false
@first = -1
@last = -1
+ @window = false
+ @step = -1
+ @size = -1
end
-
def run()
Util.print_program_information( PRG_NAME,
- PRG_VERSION,
- PRG_DESC,
- PRG_DATE,
- COPYRIGHT,
- CONTACT,
- WWW,
- STDOUT )
+ PRG_VERSION,
+ PRG_DESC,
+ PRG_DATE,
+ WWW,
+ STDOUT )
if ( ARGV == nil || ARGV.length < 1 )
Util.print_message( PRG_NAME, "Illegal number of arguments" )
@@ -120,7 +116,7 @@ module Evoruby
end
if ( cla.is_option_set?( HELP_OPTION_1 ) ||
- cla.is_option_set?( HELP_OPTION_2 ) )
+ cla.is_option_set?( HELP_OPTION_2 ) )
print_help
exit( 0 )
end
@@ -147,15 +143,17 @@ module Evoruby
allowed_opts.push( REMOVE_SEQS_GAP_RATIO_OPTION )
allowed_opts.push( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
allowed_opts.push( SPLIT )
+ allowed_opts.push( SPLIT_BY_OS )
allowed_opts.push( REM_RED_OPTION )
allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION )
allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION )
allowed_opts.push( DIE_IF_NAME_TOO_LONG )
+ allowed_opts.push( SLIDING_EXTRACTION_OPTION )
disallowed = cla.validate_allowed_options_as_str( allowed_opts )
if ( disallowed.length > 0 )
Util.fatal_error( PRG_NAME,
- "unknown option(s): " + disallowed )
+ "unknown option(s): " + disallowed )
end
input = cla.get_file_name( 0 )
@@ -274,10 +272,23 @@ module Evoruby
puts( "Remove redundant sequences: true" )
log << "Remove redundant sequences: true" + ld
end
+ if ( @split_by_os )
+ puts( "Split by OS : true" )
+ log << "Split : true" + ld
+ end
if ( @split > 0 )
puts( "Split : " + @split.to_s )
log << "Split : " + @split.to_s + ld
end
+ if @window
+ puts( "Sliding window extraction: true" )
+ log << "Sliding window extraction: true" + ld
+ puts( "Sliding window step : " + @step.to_s )
+ log << "Sliding window step : " + @step.to_s + ld
+ puts( "Sliding window size : " + @size.to_s )
+ log << "Sliding window size : " + @size.to_s + ld
+ end
+
puts()
f = MsaFactory.new()
@@ -362,8 +373,46 @@ module Evoruby
end
if ( @trim )
- msa.trim!( @first, @last )
+ msa.trim!( @first, @last, '_S' )
+ end
+
+ if @window
+ msas = msa.sliding_extraction( @step, @size, @size / 2, '_Q' )
+ begin
+ io = MsaIO.new()
+ w = MsaWriter
+ if ( @pi_output )
+ w = PhylipSequentialWriter.new()
+ w.clean( @clean )
+ w.set_max_name_length( @name_length )
+ elsif( @fasta_output )
+ w = FastaWriter.new()
+ w.set_line_width( @width )
+ w.clean( @clean )
+ if ( @name_length_set )
+ w.set_max_name_length( @name_length )
+ end
+ elsif( @nexus_output )
+ w = NexusWriter.new()
+ w.clean( @clean )
+ w.set_max_name_length( @name_length )
+ end
+ for m in msas
+ name = output + "_" + m.get_name
+ if @fasta_output
+ name += ".fasta"
+ elsif @nexus_output
+ name += ".nex"
+ end
+ io.write_to_file( m, name, w )
+ end
+ Util.print_message( PRG_NAME, "wrote " + msas.length.to_s + " files" )
+ log << "wrote " + msas.length.to_s + " files" + ld
+ rescue Exception => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ end
end
+
if( @rgr >= 0 )
msa.remove_gap_columns_w_gap_ratio!( @rgr )
elsif ( @rgc )
@@ -430,9 +479,44 @@ module Evoruby
msa = sort( msa )
end
+ if ( @split_by_os )
+ begin
+ msa_hash = msa.split_by_os(true)
+ io = MsaIO.new()
+ w = MsaWriter
+ if ( @pi_output )
+ w = PhylipSequentialWriter.new()
+ w.clean( @clean )
+ w.set_max_name_length( @name_length )
+ elsif( @fasta_output )
+ w = FastaWriter.new()
+ w.set_line_width( @width )
+ if ( @rg )
+ w.remove_gap_chars( true )
+ Util.print_warning_message( PRG_NAME, "removing gap character, the output is likely to become unaligned" )
+ log << "removing gap character, the output is likely to become unaligned" + ld
+ end
+ w.clean( @clean )
+ if ( @name_length_set )
+ w.set_max_name_length( @name_length )
+ end
+ elsif( @nexus_output )
+ w = NexusWriter.new()
+ w.clean( @clean )
+ w.set_max_name_length( @name_length )
+ end
+ msa_hash.each do |os, m|
+ my_os = os.gsub(' ', '_').gsub('/', '_').gsub('(', '_').gsub(')', '_')
+ io.write_to_file( m, output + '_' + my_os, w )
+ end
+ Util.print_message( PRG_NAME, "wrote " + msa_hash.length.to_s + " files" )
+ log << "wrote " + msa_hash.length.to_s + " files" + ld
+ rescue Exception => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ end
- if ( @split > 0 )
+ elsif ( @split > 0 )
begin
msas = msa.split( @split, true )
io = MsaIO.new()
@@ -468,13 +552,12 @@ module Evoruby
rescue Exception => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
-
end
rescue Exception => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
- if ( @split <= 0 )
+ if (@split <= 0) && (!@split_by_os) && (!@window)
unless ( @rg )
if ( msa.is_aligned() )
@@ -518,8 +601,8 @@ module Evoruby
if removed.size > 0
identicals = msa.get_identical_seqs_detected
log << "the following " + identicals.size.to_s + " sequences are identical:" + ld
- identicals.each { | s |
- log << s + ld
+ identicals.each { | identical |
+ log << identical + ld
}
log << "ignoring the following " + removed.size.to_s + " redundant sequences:" + ld
removed.each { | seq_name |
@@ -576,13 +659,11 @@ module Evoruby
Util.fatal_error( PRG_NAME, "error: " + e.to_s )
end
-
end
Util.print_message( PRG_NAME, "OK" )
puts
end
-
private
def sort( msa )
@@ -604,80 +685,98 @@ module Evoruby
@fasta_input = fi
@input_format_set = true
end
+
# Records whether the input is to be treated as PHYLIP and marks the
# input format as explicitly chosen.
#
# pi - flag stored in @phylip_input (defaults to true).
#
# Returns true (the value assigned to @input_format_set).
def set_phylip_input(pi = true)
  @phylip_input = pi
  @input_format_set = true
end
+
# Stores the maximum sequence-name length and remembers that it was set
# explicitly (@name_length_set is consulted before applying it to the
# FASTA writer).
#
# i - value stored in @name_length.
#
# Returns true (the value assigned to @name_length_set).
def set_name_length(i)
  @name_length = i
  @name_length_set = true
end
+
# Stores the line width later handed to the FASTA writer via
# set_line_width.
#
# i - value stored in @width.
#
# Returns i.
def set_width(i)
  @width = i
end
+
# Records whether FASTA output is requested and marks the output format
# as explicitly chosen.
#
# fo - flag stored in @fasta_output (defaults to true).
#
# Returns true (the value assigned to @output_format_set).
def set_fasta_output(fo = true)
  @fasta_output = fo
  @output_format_set = true
end
+
# Records whether the @pi_output format (written with
# PhylipSequentialWriter in run) is requested and marks the output format
# as explicitly chosen.
#
# pso - flag stored in @pi_output (defaults to true).
#
# Returns true (the value assigned to @output_format_set).
def set_pi_output(pso = true)
  @pi_output = pso
  @output_format_set = true
end
+
# Records whether NEXUS output is requested and marks the output format
# as explicitly chosen.
#
# nexus - flag stored in @nexus_output (defaults to true).
#
# Returns true (the value assigned to @output_format_set).
def set_nexus_output(nexus = true)
  @nexus_output = nexus
  @output_format_set = true
end
+
# Stores the flag that run forwards to the writers' clean setting.
#
# c - flag stored in @clean (defaults to true).
#
# Returns c.
def set_clean(c = true)
  @clean = c
end
+
# Stores the "remove gap columns" switch checked by run (@rgc).
#
# rgc - flag stored in @rgc (defaults to true).
#
# Returns rgc.
def set_remove_gap_columns(rgc = true)
  @rgc = rgc
end
+
# Stores the "remove gap-only columns" switch (@rgoc).
#
# rgoc - flag stored in @rgoc (defaults to true).
#
# Returns rgoc.
def set_remove_gap_only_columns(rgoc = true)
  @rgoc = rgoc
end
+
# Stores the "remove gap characters" switch (@rg); when set, run asks the
# FASTA writer to drop gap characters.
#
# rg - flag stored in @rg (defaults to true).
#
# Returns rg.
def set_remove_gaps(rg = true)
  @rg = rg
end
+
# Stores the gap ratio that run passes to
# remove_gap_columns_w_gap_ratio! whenever the stored value is >= 0.
#
# rgr - value stored in @rgr.
#
# Returns rgr.
def set_remove_gap_ratio(rgr)
  @rgr = rgr
end
+
# Stores the per-sequence gap-ratio threshold (@rsgr).
#
# rsgr - value stored in @rsgr.
#
# Returns rsgr.
def set_remove_seqs_gap_ratio(rsgr)
  @rsgr = rsgr
end
+
# Stores the minimum non-gap length threshold for sequences (@rsl).
#
# rsl - value stored in @rsl.
#
# Returns rsl.
def set_remove_seqs_min_non_gap_length(rsl)
  @rsl = rsl
end
+
# Selects "remove listed sequences" mode: remembers the name file, turns
# @remove_seqs on and turns the mutually exclusive @keep_seqs off.
#
# file - value stored in @seqs_name_file.
#
# Returns false (the value assigned to @keep_seqs).
def set_remove_seqs(file)
  @remove_seqs = true
  @seqs_name_file = file
  @keep_seqs = false
end
+
# Selects "keep listed sequences" mode: remembers the name file, turns
# @keep_seqs on and turns the mutually exclusive @remove_seqs off.
#
# file - value stored in @seqs_name_file.
#
# Returns false (the value assigned to @remove_seqs).
def set_keep_seqs(file)
  @keep_seqs = true
  @seqs_name_file = file
  @remove_seqs = false
end
+
# Enables trimming and stores the bounds that run hands to msa.trim!.
#
# first - value stored in @first.
# last  - value stored in @last.
#
# Returns last.
def set_trim(first, last)
  @first = first
  @trim = true
  @last = last
end
+
# Stores the value used for removing matching sequences
# (@remove_matching).
#
# remove - value stored in @remove_matching.
#
# Returns remove.
def set_remove_matching(remove)
  @remove_matching = remove
end
+
# Stores the value used for keeping matching sequences (@keep_matching).
#
# keep - value stored in @keep_matching.
#
# Returns keep.
def set_keep_matching(keep)
  @keep_matching = keep
end
+
# Stores the "remove redundant sequences" switch (@rem_red).
#
# rr - value stored in @rem_red.
#
# Returns rr.
def set_rem_red(rr)
  @rem_red = rr
end
-
-
def set_split( s )
if ( s > 0 )
@split = s
+ @split_by_os = false
@clean = false # phylip only
@rgc = false
@rgoc = false
@@ -694,6 +793,43 @@ module Evoruby
end
end
# Switches the processor into "split by OS" mode.
# Sets @split_by_os to true, disables the numeric split (@split = -1) and
# the sliding window (@window = false), and resets the clean, gap-removal,
# sequence-filter and trim settings to false / -1 / nil so that none of
# them is active alongside the split.
# Takes no arguments.
+ def set_split_by_os()
+ @split = -1
+ @split_by_os = true
+ @clean = false # phylip only
+ @rgc = false
+ @rgoc = false
+ @rg = false # fasta only
+ @rgr = -1
+ @rsgr = -1
+ @rsl = -1
+ @seqs_name_file = nil
+ @remove_seqs = false
+ @keep_seqs = false
+ @trim = false
+ @first = -1
+ @last = -1
+ @window = false
+ end
+
# Switches the processor into sliding-window extraction mode.
# Sets @window to true, disables both split modes (@split = -1,
# @split_by_os = false), and resets the gap-removal, sequence-filter and
# trim settings to false / -1 / nil.
# @step and @size are not touched here; the caller in
# analyze_command_line assigns them after invoking this method.
# NOTE(review): unlike set_split_by_os, @clean is left unchanged here --
# confirm this difference is intended.
+ def set_window()
+ @split = -1
+ @split_by_os = false
+ @rgc = false
+ @rgoc = false
+ @rg = false # fasta only
+ @rgr = -1
+ @rsgr = -1
+ @rsl = -1
+ @seqs_name_file = nil
+ @remove_seqs = false
+ @keep_seqs = false
+ @trim = false
+ @first = -1
+ @last = -1
+ @window = true
+ end
+
def analyze_command_line( cla )
if ( cla.is_option_set?( INPUT_TYPE_OPTION ) )
begin
@@ -798,6 +934,29 @@ module Evoruby
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
end
+ if ( cla.is_option_set?( SLIDING_EXTRACTION_OPTION ) )
+ begin
+ s = cla.get_option_value( SLIDING_EXTRACTION_OPTION )
+ if ( s =~ /(\d+)\/(\d+)/ )
+ set_window
+ @window = true
+ @step = $1.to_i()
+ @size = $2.to_i()
+ else
+ puts( "illegal argument" )
+ print_help
+ exit( -1 )
+ end
+ if (@step <= 0) || (@size <= 0)
+ puts( "illegal argument" )
+ print_help
+ exit( -1 )
+ end
+ rescue ArgumentError => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ end
+ end
+
if ( cla.is_option_set?( REMOVE_SEQS_GAP_RATIO_OPTION ) )
begin
f = cla.get_option_value_as_float( REMOVE_SEQS_GAP_RATIO_OPTION )
@@ -814,14 +973,19 @@ module Evoruby
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
end
- if ( cla.is_option_set?( SPLIT ) )
+ if cla.is_option_set?( SPLIT_BY_OS )
+ begin
+ set_split_by_os()
+ rescue ArgumentError => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ end
+ elsif ( cla.is_option_set?( SPLIT ) )
begin
s = cla.get_option_value_as_int( SPLIT )
set_split( s )
rescue ArgumentError => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
-
end
if ( cla.is_option_set?( REMOVE_MATCHING_SEQUENCES_OPTION ) )
begin
@@ -843,7 +1007,6 @@ module Evoruby
@die_if_name_too_long = true
end
-
end
def print_help()
@@ -852,8 +1015,8 @@ module Evoruby
puts()
puts( " " + PRG_NAME + ".rb [options]