# Copyright:: Copyright (C) 2017 Christian M. Zmasek
# License:: GNU Lesser General Public License (LGPL)
#
-# Last modified: 2017/02/07
+# Last modified: 2017/03/13
require 'lib/evo/util/constants'
require 'lib/evo/util/util'
return cols
end
- # Split an alignment into n alignemnts of equal size, except last one
+ # Split an alignment into n alignmnts of equal size, except last one
def split( n, verbose = false )
if ( n < 2 || n > get_number_of_seqs )
error_msg = "attempt to split into less than two or more than the number of sequences"
msas
end
+ def split_by_os(verbose = false)
+ msa_hash = Hash.new()
+ for i in 0 ... get_number_of_seqs()
+ seq = get_sequence(i)
+ name = seq.get_name()
+ # >sp|Q1HVE7|AN_EBVA8 Shutoff alkaline exonuclease OS=Epstein-Barr virus (strain AG876) GN=BGLF5 PE=3 SV=1
+ # if name =~ /OS=(.+?)\s+[A-Z]{2}=/
+ if name =~ /Organism:(.+?)(\|Protein|$)/
+ os = $1
+ unless msa_hash.has_key?(os)
+ msa_hash[os] = Msa.new
+ end
+ msa_hash[os].add_sequence seq
+ else
+ error_msg = "sequence name \"" + name +"\" is not in the expected format for splitting by OS"
+ raise IOError, error_msg, caller
+ end
+ end
+ msa_hash = msa_hash.sort{|a, b|a<=>b}.to_h
+ if verbose
+ c = 0
+ msa_hash.each do |os, msa|
+ c += 1
+ puts c.to_s + ': ' + os
+ end
+ end
+ msa_hash
+ end
+
private
def get_overlaps( min_overlap = 1 )
REMOVE_SEQS_GAP_RATIO_OPTION = "rsgr"
REMOVE_SEQS_NON_GAP_LENGTH_OPTION = "rsl"
SPLIT = "split"
+ SPLIT_BY_OS = "split_by_os"
LOG_SUFFIX = "_msa_pro.log"
HELP_OPTION_1 = "help"
HELP_OPTION_2 = "h"
@keep_seqs = false
@trim = false
@split = -1
+ @split_by_os = false
@first = -1
@last = -1
end
allowed_opts.push( REMOVE_SEQS_GAP_RATIO_OPTION )
allowed_opts.push( REMOVE_SEQS_NON_GAP_LENGTH_OPTION )
allowed_opts.push( SPLIT )
+ allowed_opts.push( SPLIT_BY_OS )
allowed_opts.push( REM_RED_OPTION )
allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION )
allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION )
puts( "Remove redundant sequences: true" )
log << "Remove redundant sequences: true" + ld
end
+ if ( @split_by_os )
+ puts( "Split by OS : true" )
+ log << "Split : true" + ld
+ end
if ( @split > 0 )
puts( "Split : " + @split.to_s )
log << "Split : " + @split.to_s + ld
msa = sort( msa )
end
- if ( @split > 0 )
+ if ( @split_by_os )
+ begin
+ msa_hash = msa.split_by_os(true)
+ io = MsaIO.new()
+ w = MsaWriter
+ if ( @pi_output )
+ w = PhylipSequentialWriter.new()
+ w.clean( @clean )
+ w.set_max_name_length( @name_length )
+ elsif( @fasta_output )
+ w = FastaWriter.new()
+ w.set_line_width( @width )
+ if ( @rg )
+ w.remove_gap_chars( true )
+ Util.print_warning_message( PRG_NAME, "removing gap character, the output is likely to become unaligned" )
+ log << "removing gap character, the output is likely to become unaligned" + ld
+ end
+ w.clean( @clean )
+ if ( @name_length_set )
+ w.set_max_name_length( @name_length )
+ end
+ elsif( @nexus_output )
+ w = NexusWriter.new()
+ w.clean( @clean )
+ w.set_max_name_length( @name_length )
+ end
+ msa_hash.each do |os, msa|
+ my_os = os.gsub(' ', '_').gsub('/', '_').gsub('(', '_').gsub(')', '_')
+ io.write_to_file( msa, output + '_' + my_os, w )
+ end
+
+ Util.print_message( PRG_NAME, "wrote " + msa_hash.length.to_s + " files" )
+ log << "wrote " + msa_hash.length.to_s + " files" + ld
+ rescue Exception => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ end
+
+ elsif ( @split > 0 )
begin
msas = msa.split( @split, true )
io = MsaIO.new()
rescue Exception => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
-
end
rescue Exception => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
- if ( @split <= 0 )
+ if (@split <= 0) && (!@split_by_os)
unless ( @rg )
if ( msa.is_aligned() )
def set_split( s )
if ( s > 0 )
@split = s
+ @split_by_os = false
@clean = false # phylip only
@rgc = false
@rgoc = false
end
end
+ def set_split_by_os()
+ @split = -1
+ @split_by_os = true
+ @clean = false # phylip only
+ @rgc = false
+ @rgoc = false
+ @rg = false # fasta only
+ @rgr = -1
+ @rsgr = -1
+ @rsl = -1
+ @seqs_name_file = nil
+ @remove_seqs = false
+ @keep_seqs = false
+ @trim = false
+ @first = -1
+ @last = -1
+ end
+
def analyze_command_line( cla )
if ( cla.is_option_set?( INPUT_TYPE_OPTION ) )
begin
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
end
- if ( cla.is_option_set?( SPLIT ) )
+ if cla.is_option_set?( SPLIT_BY_OS )
+ begin
+ set_split_by_os()
+ rescue ArgumentError => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
+ end
+ elsif ( cla.is_option_set?( SPLIT ) )
begin
s = cla.get_option_value_as_int( SPLIT )
set_split( s )
rescue ArgumentError => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
-
end
if ( cla.is_option_set?( REMOVE_MATCHING_SEQUENCES_OPTION ) )
begin