X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fruby%2Fevoruby%2Flib%2Fevo%2Ftool%2Fmsa_processor.rb;h=eebf8f653bcfb322d441705b05db4af1e97830a7;hb=9fe0c828e4a8e1f61a1acba1e7b8439dd5edc598;hp=d9522779393e558a33478c6a591d916757ec76e5;hpb=73a72373e1e807ea755fbeb072ad979034e6d05a;p=jalview.git diff --git a/forester/ruby/evoruby/lib/evo/tool/msa_processor.rb b/forester/ruby/evoruby/lib/evo/tool/msa_processor.rb index d952277..eebf8f6 100644 --- a/forester/ruby/evoruby/lib/evo/tool/msa_processor.rb +++ b/forester/ruby/evoruby/lib/evo/tool/msa_processor.rb @@ -9,7 +9,6 @@ require 'date' require 'set' - require 'lib/evo/util/constants' require 'lib/evo/util/util' require 'lib/evo/util/command_line_arguments' @@ -23,17 +22,13 @@ require 'lib/evo/io/parser/general_msa_parser' require 'lib/evo/io/writer/msa_writer' module Evoruby - class MsaProcessor PRG_NAME = "msa_pro" - PRG_DATE = "131112" + PRG_DATE = "170609" PRG_DESC = "processing of multiple sequence alignments" - PRG_VERSION = "1.08" - COPYRIGHT = "2008-2010 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" - + PRG_VERSION = "1.09" + WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" NAME_LENGTH_DEFAULT = 10 WIDTH_DEFAULT_FASTA = 60 @@ -55,14 +50,14 @@ module Evoruby REMOVE_MATCHING_SEQUENCES_OPTION = "mr" TRIM_OPTION = "t" + SLIDING_EXTRACTION_OPTION = "se" REMOVE_SEQS_GAP_RATIO_OPTION = "rsgr" REMOVE_SEQS_NON_GAP_LENGTH_OPTION = "rsl" SPLIT = "split" + SPLIT_BY_OS = "split_by_os" LOG_SUFFIX = "_msa_pro.log" HELP_OPTION_1 = "help" HELP_OPTION_2 = "h" - - def initialize() @input_format_set = false @output_format_set = false @@ -91,21 +86,22 @@ module Evoruby @keep_seqs = false @trim = false @split = -1 + @split_by_os = false @first = -1 @last = -1 + @window = false + @step = -1 + @size = -1 end - def run() Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC, - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) + PRG_VERSION, + PRG_DESC, + PRG_DATE, + WWW, + STDOUT ) if ( ARGV == nil || ARGV.length < 1 ) Util.print_message( PRG_NAME, "Illegal number of arguments" ) @@ -120,7 +116,7 @@ module Evoruby end if ( cla.is_option_set?( HELP_OPTION_1 ) || - cla.is_option_set?( HELP_OPTION_2 ) ) + cla.is_option_set?( HELP_OPTION_2 ) ) print_help exit( 0 ) end @@ -147,15 +143,17 @@ module Evoruby allowed_opts.push( REMOVE_SEQS_GAP_RATIO_OPTION ) allowed_opts.push( REMOVE_SEQS_NON_GAP_LENGTH_OPTION ) allowed_opts.push( SPLIT ) + allowed_opts.push( SPLIT_BY_OS ) allowed_opts.push( REM_RED_OPTION ) allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION ) allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION ) allowed_opts.push( DIE_IF_NAME_TOO_LONG ) + allowed_opts.push( SLIDING_EXTRACTION_OPTION ) disallowed = cla.validate_allowed_options_as_str( allowed_opts ) if ( disallowed.length > 0 ) Util.fatal_error( PRG_NAME, - "unknown option(s): " + disallowed ) + "unknown option(s): " + disallowed ) end input = cla.get_file_name( 0 ) @@ -274,10 +272,23 @@ module Evoruby puts( "Remove redundant sequences: true" ) log << "Remove redundant sequences: true" + ld end + if ( @split_by_os ) + puts( "Split by OS : true" ) + log << "Split : true" + ld + end if ( @split > 0 ) puts( "Split : " + @split.to_s ) log << "Split : " + @split.to_s + ld end + if @window + puts( "Sliding window extraction: true" ) + log << "Sliding window extraction: true" + ld + puts( "Sliding window step : " + @step.to_s ) + log << "Sliding window step : " + @step.to_s + ld + puts( "Sliding window size : " + @size.to_s ) + log << "Sliding window size : " + @size.to_s + ld + end + puts() f = MsaFactory.new() @@ -362,8 +373,46 @@ module Evoruby end if ( @trim ) - msa.trim!( @first, @last ) + msa.trim!( @first, @last, '_S' ) + end + + if @window + msas = msa.sliding_extraction( @step, @size, @size / 2, '_Q' ) + begin + io = MsaIO.new() + w = MsaWriter + if ( @pi_output ) + w = PhylipSequentialWriter.new() + w.clean( @clean ) + w.set_max_name_length( @name_length ) + elsif( @fasta_output ) + w = FastaWriter.new() + w.set_line_width( @width ) + w.clean( @clean ) + if ( @name_length_set ) + w.set_max_name_length( @name_length ) + end + elsif( @nexus_output ) + w = NexusWriter.new() + w.clean( @clean ) + w.set_max_name_length( @name_length ) + end + for m in msas + name = output + "_" + m.get_name + if @fasta_output + name += ".fasta" + elsif @nexus_output + name += ".nex" + end + io.write_to_file( m, name, w ) + end + Util.print_message( PRG_NAME, "wrote " + msas.length.to_s + " files" ) + log << "wrote " + msas.length.to_s + " files" + ld + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end end + if( @rgr >= 0 ) msa.remove_gap_columns_w_gap_ratio!( @rgr ) elsif ( @rgc ) @@ -430,9 +479,44 @@ module Evoruby msa = sort( msa ) end + if ( @split_by_os ) + begin + msa_hash = msa.split_by_os(true) + io = MsaIO.new() + w = MsaWriter + if ( @pi_output ) + w = PhylipSequentialWriter.new() + w.clean( @clean ) + w.set_max_name_length( @name_length ) + elsif( @fasta_output ) + w = FastaWriter.new() + w.set_line_width( @width ) + if ( @rg ) + w.remove_gap_chars( true ) + Util.print_warning_message( PRG_NAME, "removing gap character, the output is likely to become unaligned" ) + log << "removing gap character, the output is likely to become unaligned" + ld + end + w.clean( @clean ) + if ( @name_length_set ) + w.set_max_name_length( @name_length ) + end + elsif( @nexus_output ) + w = NexusWriter.new() + w.clean( @clean ) + w.set_max_name_length( @name_length ) + end + msa_hash.each do |os, m| + my_os = os.gsub(' ', '_').gsub('/', '_').gsub('(', '_').gsub(')', '_') + io.write_to_file( m, output + '_' + my_os, w ) + end + Util.print_message( PRG_NAME, "wrote " + msa_hash.length.to_s + " files" ) + log << "wrote " + msa_hash.length.to_s + " files" + ld + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end - if ( @split > 0 ) + elsif ( @split > 0 ) begin msas = msa.split( @split, true ) io = MsaIO.new() @@ -468,13 +552,12 @@ module Evoruby rescue Exception => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end - end rescue Exception => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end - if ( @split <= 0 ) + if (@split <= 0) && (!@split_by_os) && (!@window) unless ( @rg ) if ( msa.is_aligned() ) @@ -518,8 +601,8 @@ module Evoruby if removed.size > 0 identicals = msa.get_identical_seqs_detected log << "the following " + identicals.size.to_s + " sequences are identical:" + ld - identicals.each { | s | - log << s + ld + identicals.each { | identical | + log << identical + ld } log << "ignoring the following " + removed.size.to_s + " redundant sequences:" + ld removed.each { | seq_name | @@ -576,13 +659,11 @@ module Evoruby Util.fatal_error( PRG_NAME, "error: " + e.to_s ) end - end Util.print_message( PRG_NAME, "OK" ) puts end - private def sort( msa ) @@ -604,80 +685,98 @@ module Evoruby @fasta_input = fi @input_format_set = true end + def set_phylip_input( pi = true ) @phylip_input = pi @input_format_set = true end + def set_name_length( i ) @name_length = i @name_length_set = true end + def set_width( i ) @width = i end + def set_fasta_output( fo = true ) @fasta_output = fo @output_format_set = true end + def set_pi_output( pso = true ) @pi_output = pso @output_format_set = true end + def set_nexus_output( nexus = true ) @nexus_output = nexus @output_format_set = true end + def set_clean( c = true ) @clean = c end + def set_remove_gap_columns( rgc = true ) @rgc = rgc end + def set_remove_gap_only_columns( rgoc = true ) @rgoc = rgoc end + def set_remove_gaps( rg = true ) @rg = rg end + def set_remove_gap_ratio( rgr ) @rgr = rgr end + def set_remove_seqs_gap_ratio( rsgr ) @rsgr = rsgr end + def set_remove_seqs_min_non_gap_length( rsl ) @rsl = rsl end + def set_remove_seqs( file ) @seqs_name_file = file @remove_seqs = true @keep_seqs = false end + def set_keep_seqs( file ) @seqs_name_file = file @keep_seqs = true @remove_seqs = false end + def set_trim( first, last ) @trim = true @first = first @last = last end + def set_remove_matching( remove ) @remove_matching = remove end + def set_keep_matching( keep ) @keep_matching = keep end + def set_rem_red( rr ) @rem_red = rr end - - def set_split( s ) if ( s > 0 ) @split = s + @split_by_os = false @clean = false # phylip only @rgc = false @rgoc = false @@ -694,6 +793,43 @@ module Evoruby end end + def set_split_by_os() + @split = -1 + @split_by_os = true + @clean = false # phylip only + @rgc = false + @rgoc = false + @rg = false # fasta only + @rgr = -1 + @rsgr = -1 + @rsl = -1 + @seqs_name_file = nil + @remove_seqs = false + @keep_seqs = false + @trim = false + @first = -1 + @last = -1 + @window = false + end + + def set_window() + @split = -1 + @split_by_os = false + @rgc = false + @rgoc = false + @rg = false # fasta only + @rgr = -1 + @rsgr = -1 + @rsl = -1 + @seqs_name_file = nil + @remove_seqs = false + @keep_seqs = false + @trim = false + @first = -1 + @last = -1 + @window = true + end + def analyze_command_line( cla ) if ( cla.is_option_set?( INPUT_TYPE_OPTION ) ) begin @@ -798,6 +934,29 @@ module Evoruby Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end end + if ( cla.is_option_set?( SLIDING_EXTRACTION_OPTION ) ) + begin + s = cla.get_option_value( SLIDING_EXTRACTION_OPTION ) + if ( s =~ /(\d+)\/(\d+)/ ) + set_window + @window = true + @step = $1.to_i() + @size = $2.to_i() + else + puts( "illegal argument" ) + print_help + exit( -1 ) + end + if (@step <= 0) || (@size <= 0) + puts( "illegal argument" ) + print_help + exit( -1 ) + end + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + if ( cla.is_option_set?( REMOVE_SEQS_GAP_RATIO_OPTION ) ) begin f = cla.get_option_value_as_float( REMOVE_SEQS_GAP_RATIO_OPTION ) @@ -814,14 +973,19 @@ module Evoruby Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end end - if ( cla.is_option_set?( SPLIT ) ) + if cla.is_option_set?( SPLIT_BY_OS ) + begin + set_split_by_os() + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + elsif ( cla.is_option_set?( SPLIT ) ) begin s = cla.get_option_value_as_int( SPLIT ) set_split( s ) rescue ArgumentError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end - end if ( cla.is_option_set?( REMOVE_MATCHING_SEQUENCES_OPTION ) ) begin @@ -843,7 +1007,6 @@ module Evoruby @die_if_name_too_long = true end - end def print_help() @@ -852,8 +1015,8 @@ module Evoruby puts() puts( " " + PRG_NAME + ".rb [options] " ) puts() - puts( " options: -" + INPUT_TYPE_OPTION + "=: f for fasta, p for phylip selex type" ) - puts( " -" + OUTPUT_TYPE_OPTION + "=: f for fasta, n for nexus, p for phylip sequential (default)" ) + puts( " options: -" + INPUT_TYPE_OPTION + "=: f for fasta (default), p for phylip/selex type" ) + puts( " -" + OUTPUT_TYPE_OPTION + "=: f for fasta (default), n for nexus, p for phylip sequential" ) puts( " -" + MAXIMAL_NAME_LENGTH_OPTION + "=: n=maximal name length (default for phylip 10, for fasta: unlimited )" ) puts( " -" + DIE_IF_NAME_TOO_LONG + ": die if sequence name too long" ) puts( " -" + WIDTH_OPTION + "=: n=width (fasta output only, default is 60)" ) @@ -871,15 +1034,11 @@ module Evoruby puts( " -" + KEEP_MATCHING_SEQUENCES_OPTION + "= keep only sequences with names containing s" ) puts( " -" + SPLIT + "= split a fasta file into n files of equal number of sequences (expect for " ) puts( " last one), cannot be used with other options" ) + puts( " -" + SLIDING_EXTRACTION_OPTION + "=/: sliding window extraction, cannot be used with other options" ) puts( " -" + REM_RED_OPTION + ": remove redundant sequences" ) puts() end - - - - end # class MsaProcessor - end # module Evoruby