From cc48ac62e5e74f6fe69efa978a0b404df9451fe3 Mon Sep 17 00:00:00 2001 From: cmzmasek Date: Mon, 13 Mar 2017 21:45:29 -0700 Subject: [PATCH] in progress... --- forester/ruby/evoruby/lib/evo/msa/msa.rb | 33 ++++++++- .../ruby/evoruby/lib/evo/tool/msa_processor.rb | 77 ++++++++++++++++++-- 2 files changed, 103 insertions(+), 7 deletions(-) diff --git a/forester/ruby/evoruby/lib/evo/msa/msa.rb b/forester/ruby/evoruby/lib/evo/msa/msa.rb index 0dcad71..a355d19 100644 --- a/forester/ruby/evoruby/lib/evo/msa/msa.rb +++ b/forester/ruby/evoruby/lib/evo/msa/msa.rb @@ -4,7 +4,7 @@ # Copyright:: Copyright (C) 2017 Christian M. Zmasek # License:: GNU Lesser General Public License (LGPL) # -# Last modified: 2017/02/07 +# Last modified: 2017/03/13 require 'lib/evo/util/constants' require 'lib/evo/util/util' @@ -494,7 +494,7 @@ module Evoruby return cols end - # Split an alignment into n alignemnts of equal size, except last one + # Split an alignment into n alignmnts of equal size, except last one def split( n, verbose = false ) if ( n < 2 || n > get_number_of_seqs ) error_msg = "attempt to split into less than two or more than the number of sequences" @@ -527,6 +527,35 @@ module Evoruby msas end + def split_by_os(verbose = false) + msa_hash = Hash.new() + for i in 0 ... get_number_of_seqs() + seq = get_sequence(i) + name = seq.get_name() + # >sp|Q1HVE7|AN_EBVA8 Shutoff alkaline exonuclease OS=Epstein-Barr virus (strain AG876) GN=BGLF5 PE=3 SV=1 + # if name =~ /OS=(.+?)\s+[A-Z]{2}=/ + if name =~ /Organism:(.+?)(\|Protein|$)/ + os = $1 + unless msa_hash.has_key?(os) + msa_hash[os] = Msa.new + end + msa_hash[os].add_sequence seq + else + error_msg = "sequence name \"" + name +"\" is not in the expected format for splitting by OS" + raise IOError, error_msg, caller + end + end + msa_hash = msa_hash.sort{|a, b|a<=>b}.to_h + if verbose + c = 0 + msa_hash.each do |os, msa| + c += 1 + puts c.to_s + ': ' + os + end + end + msa_hash + end + private def get_overlaps( min_overlap = 1 ) diff --git a/forester/ruby/evoruby/lib/evo/tool/msa_processor.rb b/forester/ruby/evoruby/lib/evo/tool/msa_processor.rb index 57b5336..567298b 100644 --- a/forester/ruby/evoruby/lib/evo/tool/msa_processor.rb +++ b/forester/ruby/evoruby/lib/evo/tool/msa_processor.rb @@ -53,6 +53,7 @@ module Evoruby REMOVE_SEQS_GAP_RATIO_OPTION = "rsgr" REMOVE_SEQS_NON_GAP_LENGTH_OPTION = "rsl" SPLIT = "split" + SPLIT_BY_OS = "split_by_os" LOG_SUFFIX = "_msa_pro.log" HELP_OPTION_1 = "help" HELP_OPTION_2 = "h" @@ -84,6 +85,7 @@ module Evoruby @keep_seqs = false @trim = false @split = -1 + @split_by_os = false @first = -1 @last = -1 end @@ -137,6 +139,7 @@ module Evoruby allowed_opts.push( REMOVE_SEQS_GAP_RATIO_OPTION ) allowed_opts.push( REMOVE_SEQS_NON_GAP_LENGTH_OPTION ) allowed_opts.push( SPLIT ) + allowed_opts.push( SPLIT_BY_OS ) allowed_opts.push( REM_RED_OPTION ) allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION ) allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION ) @@ -264,6 +267,10 @@ module Evoruby puts( "Remove redundant sequences: true" ) log << "Remove redundant sequences: true" + ld end + if ( @split_by_os ) + puts( "Split by OS : true" ) + log << "Split : true" + ld + end if ( @split > 0 ) puts( "Split : " + @split.to_s ) log << "Split : " + @split.to_s + ld @@ -420,7 +427,44 @@ module Evoruby msa = sort( msa ) end - if ( @split > 0 ) + if ( @split_by_os ) + begin + msa_hash = msa.split_by_os(true) + io = MsaIO.new() + w = MsaWriter + if ( @pi_output ) + w = PhylipSequentialWriter.new() + w.clean( @clean ) + w.set_max_name_length( @name_length ) + elsif( @fasta_output ) + w = FastaWriter.new() + w.set_line_width( @width ) + if ( @rg ) + w.remove_gap_chars( true ) + Util.print_warning_message( PRG_NAME, "removing gap character, the output is likely to become unaligned" ) + log << "removing gap character, the output is likely to become unaligned" + ld + end + w.clean( @clean ) + if ( @name_length_set ) + w.set_max_name_length( @name_length ) + end + elsif( @nexus_output ) + w = NexusWriter.new() + w.clean( @clean ) + w.set_max_name_length( @name_length ) + end + msa_hash.each do |os, msa| + my_os = os.gsub(' ', '_').gsub('/', '_').gsub('(', '_').gsub(')', '_') + io.write_to_file( msa, output + '_' + my_os, w ) + end + + Util.print_message( PRG_NAME, "wrote " + msa_hash.length.to_s + " files" ) + log << "wrote " + msa_hash.length.to_s + " files" + ld + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + + elsif ( @split > 0 ) begin msas = msa.split( @split, true ) io = MsaIO.new() @@ -456,13 +500,12 @@ module Evoruby rescue Exception => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end - end rescue Exception => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end - if ( @split <= 0 ) + if (@split <= 0) && (!@split_by_os) unless ( @rg ) if ( msa.is_aligned() ) @@ -681,6 +724,7 @@ module Evoruby def set_split( s ) if ( s > 0 ) @split = s + @split_by_os = false @clean = false # phylip only @rgc = false @rgoc = false @@ -697,6 +741,24 @@ module Evoruby end end + def set_split_by_os() + @split = -1 + @split_by_os = true + @clean = false # phylip only + @rgc = false + @rgoc = false + @rg = false # fasta only + @rgr = -1 + @rsgr = -1 + @rsl = -1 + @seqs_name_file = nil + @remove_seqs = false + @keep_seqs = false + @trim = false + @first = -1 + @last = -1 + end + def analyze_command_line( cla ) if ( cla.is_option_set?( INPUT_TYPE_OPTION ) ) begin @@ -817,14 +879,19 @@ module Evoruby Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end end - if ( cla.is_option_set?( SPLIT ) ) + if cla.is_option_set?( SPLIT_BY_OS ) + begin + set_split_by_os() + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + elsif ( cla.is_option_set?( SPLIT ) ) begin s = cla.get_option_value_as_int( SPLIT ) set_split( s ) rescue ArgumentError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end - end if ( cla.is_option_set?( REMOVE_MATCHING_SEQUENCES_OPTION ) ) begin -- 1.7.10.2