#
# = lib/evo/msa/msa.rb - Msa class
#
-# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
+# Copyright:: Copyright (C) 2017 Christian M. Zmasek
+# License:: GNU Lesser General Public License (LGPL)
#
-# $Id: msa.rb,v 1.11 2009/01/03 00:42:08 cmzmasek Exp $
-#
-
+# Last modified: 2017/03/13
require 'lib/evo/util/constants'
require 'lib/evo/util/util'
require 'lib/evo/sequence/sequence'
module Evoruby
-
class Msa
-
def initialize()
@sequences = Array.new
@identical_seqs_detected = Array.new
+ @name_to_seq_indices = Hash.new
+ @namestart_to_seq_indices = Hash.new
+ @name = nil
end
+ def get_name
+ @name
+ end
+
+ def set_name( name )
+ @name = name
+ end
def add_sequence( sequence )
@sequences.push( sequence )
def get_sequence( index )
if ( index < 0 || index > get_number_of_seqs() - 1 )
error_msg = "attempt to get sequence " <<
- index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
- " sequences"
+ index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
+ " sequences"
raise ArgumentError, error_msg
end
return @sequences[ index ]
def remove_sequence!( index )
if ( index < 0 || index > get_number_of_seqs() - 1 )
error_msg = "attempt to remove sequence " <<
- index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
- " sequences"
+ index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
+ " sequences"
raise ArgumentError, error_msg
end
+ @name_to_seq_indices.clear
+ @namestart_to_seq_indices.clear
@sequences.delete_at( index )
end
@identical_seqs_detected
end
-
def is_aligned()
if ( get_number_of_seqs < 1 )
return false
end
def find_by_name( name, case_sensitive, partial_match )
+ if case_sensitive && !partial_match && @name_to_seq_indices.has_key?( name )
+ return @name_to_seq_indices[ name ]
+ end
indices = Array.new()
for i in 0 ... get_number_of_seqs()
current_name = get_sequence( i ).get_name()
+ if case_sensitive && !partial_match
+ if !@name_to_seq_indices.has_key?( current_name )
+ @name_to_seq_indices[ current_name ] = []
+ end
+ @name_to_seq_indices[ current_name ].push( i )
+ end
if !case_sensitive
current_name = current_name.downcase
name = name.downcase
end
if current_name == name ||
- ( partial_match && current_name.include?( name ) )
+ ( partial_match && current_name.include?( name ) )
indices.push( i )
end
end
get_sequence( indices[ 0 ] )
end
- def find_by_name_start( name, case_sensitive )
+ def find_by_name_start(name, case_sensitive = true)
+ if case_sensitive && @namestart_to_seq_indices.has_key?( name )
+ return @namestart_to_seq_indices[ name ]
+ end
indices = []
- for i in 0 ... get_number_of_seqs()
- get_sequence( i ).get_name() =~ /^\s*(\S+)/
+ for i in 0 ... get_number_of_seqs
+ get_sequence(i).get_name =~ /^\s*(\S+)/
current_name = $1
+ if case_sensitive
+ unless @namestart_to_seq_indices.has_key?(current_name)
+ @namestart_to_seq_indices[current_name] = []
+ end
+ @namestart_to_seq_indices[current_name].push( i )
+ end
if !case_sensitive
current_name = current_name.downcase
name = name.downcase
end
- if ( current_name == name )
- indices.push( i )
+ if current_name == name
+ indices.push(i)
end
end
indices
name = name.downcase
end
if current_name == name ||
- ( partial_match && current_name.include?( name ) )
+ ( partial_match && current_name.include?( name ) )
return true
end
end
# throws ArgumentError
def get_by_name_start( name, case_sensitive = true )
indices = find_by_name_start( name, case_sensitive )
- if ( indices.length > 1 )
+ if indices.length > 1
error_msg = "\"" + name + "\" not unique"
raise ArgumentError, error_msg
- elsif ( indices.length < 1 )
+ elsif indices.length < 1
error_msg = "\"" + name + "\" not found"
raise ArgumentError, error_msg
end
get_sequence( indices[ 0 ] )
end
-
def get_sub_alignment( seq_numbers )
msa = Msa.new()
for i in 0 ... seq_numbers.length()
s
end
-
def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 )
if ( !is_aligned() )
error_msg = "attempt to get overlap diagram of unaligned msa"
overlap_count = 0
for i in 0...seq_1.get_length()
if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
- !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
+ !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
overlap_count += 1
if overlap_count >= min_overlap
return true
overlap_count = 0
for i in 0...seq_1.get_length
if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
- !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
+ !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
overlap_count += 1
end
end
identities_count = 0
for i in 0...seq_1.get_length
if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
- !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) &&
- seq_1.get_character_code( i ) != 63 &&
- ( seq_1.get_residue( i ).downcase() ==
- seq_2.get_residue( i ).downcase() )
+ !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) &&
+ seq_1.get_character_code( i ) != 63 &&
+ ( seq_1.get_residue( i ).downcase() ==
+ seq_2.get_residue( i ).downcase() )
identities_count += 1
end
end
remove_columns!( get_gap_columns_w_gap_ratio( gap_ratio ) )
end
-
def remove_sequences_by_gap_ratio!( gap_ratio )
if ( !is_aligned() )
error_msg = "attempt to remove sequences by gap ratio on unaligned msa"
removed
end
-
def remove_redundant_sequences!( consider_taxonomy = false, verbose = false )
n = get_number_of_seqs
removed = Array.new
for j in ( i + 1 ) ... n
if !to_be_removed.include?( i ) && !to_be_removed.include?( j )
if !consider_taxonomy ||
- ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) ||
- ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) )
+ ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) ||
+ ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) )
if Util.clean_seq_str( get_sequence( i ).get_sequence_as_string ) ==
- Util.clean_seq_str( get_sequence( j ).get_sequence_as_string )
+ Util.clean_seq_str( get_sequence( j ).get_sequence_as_string )
to_be_removed.add( j )
tax_i = ""
removed
end
-
def remove_sequences_by_non_gap_length!( min_non_gap_length )
- if ( !is_aligned() )
- error_msg = "attempt to remove sequences by non gap length on unaligned msa"
- raise StandardError, error_msg, caller
- end
n = get_number_of_seqs
- l = get_length
removed = Array.new
for s in 0 ... n
- if ( ( l - get_sequence( ( n - 1 ) - s ).get_gap_length ) < min_non_gap_length )
+ x = ( n - 1 ) - s
+ seq = get_sequence( x )
+ if ( ( seq.get_length - seq.get_gap_length ) < min_non_gap_length )
if ( Evoruby::Constants::VERBOSE )
- puts( "removed: " + get_sequence( ( n - 1 ) - s ).get_name )
+ puts( "removed: " + seq.get_name )
end
- removed << get_sequence( ( n - 1 ) - s ).get_name
- remove_sequence!( ( n - 1 ) - s )
+ removed << seq.get_name
+ remove_sequence!( x )
end
end
removed
end
- def trim!( first, last )
+ def trim!( first, last, name_suffix = nil )
cols = Array.new()
for i in 0 ... get_length()
if ( i < first || i > last )
end
end
remove_columns!( cols )
+ if name_suffix != nil
+ n = get_number_of_seqs
+ for s in 0 ... n
+ seq = get_sequence( s )
+ seq.set_name( seq.get_name() + name_suffix );
+ end
+ end
+ end
+
+ def extract( first, last, min_non_gap_length, suffix )
+ if !is_aligned()
+ error_msg = "attempt to extract from unaligned msa"
+ raise StandardError, error_msg, caller
+ end
+ if first < 0
+ error_msg = "first < 0"
+ raise StandardError, error_msg, caller
+ end
+ if last >= get_length()
+ error_msg = "last > length"
+ raise StandardError, error_msg, caller
+ end
+ if first >= last
+ error_msg = "first >= last"
+ raise StandardError, error_msg, caller
+ end
+ msa = Msa.new()
+ for i in 0 ... get_number_of_seqs
+ subseq = get_sequence( i ).get_subsequence( first, last )
+ unless ( ( subseq.get_length - subseq.get_gap_length ) < min_non_gap_length )
+ if suffix != nil
+ msa.add( subseq.get_name + suffix, subseq.get_sequence_as_string )
+ else
+ msa.add( subseq.get_name, subseq.get_sequence_as_string )
+ end
+ end
+ end
+
+ msa
+ end
+
+ def sliding_extraction( step, size, min_non_gap_length, suffix = nil )
+ counter = 0
+ done = false
+ msas = Array.new()
+ while !done
+ first = counter * step
+ last = first + size - 1
+ if last > get_length() - 1
+ last = get_length() - 1
+ done = true
+ end
+ unless first >= last
+ counter +=1
+ res = extract(first, last, min_non_gap_length, suffix)
+ res.set_name(first.to_s + "-" + last.to_s)
+ msas << res
+ end
+ end
+ msas
end
def get_gap_only_columns()
return cols
end
-
- # Split an alignment into n alignemnts of equal size, except last one
+ # Split an alignment into n alignmnts of equal size, except last one
def split( n, verbose = false )
if ( n < 2 || n > get_number_of_seqs )
error_msg = "attempt to split into less than two or more than the number of sequences"
x = get_number_of_seqs / n
for i in 0 ... n
msa = Msa.new()
- s = 0
-
+ #s = 0
if ( ( r > 0 ) && ( i == ( n - 1 ) ) )
y = x + r
if ( verbose )
msas
end
+ def split_by_os(verbose = false)
+ msa_hash = Hash.new()
+ for i in 0 ... get_number_of_seqs()
+ seq = get_sequence(i)
+ name = seq.get_name()
+ # >sp|Q1HVE7|AN_EBVA8 Shutoff alkaline exonuclease OS=Epstein-Barr virus (strain AG876) GN=BGLF5 PE=3 SV=1
+ # if name =~ /OS=(.+?)\s+[A-Z]{2}=/
+ if name =~ /Organism:(.+?)(\|Protein|$)/
+ os = $1
+ unless msa_hash.has_key?(os)
+ msa_hash[os] = Msa.new
+ end
+ msa_hash[os].add_sequence seq
+ else
+ error_msg = "sequence name \"" + name + "\" is not in the expected format for splitting by OS"
+ raise IOError, error_msg, caller
+ end
+ end
+ msa_hash = msa_hash.sort{|a, b|a<=>b}.to_h
+ if verbose
+ c = 0
+ msa_hash.each do |o, msa|
+ c += 1
+ puts c.to_s + ': ' + o
+ end
+ end
+ msa_hash
+ end
private
return self
end
-
end # class Msa
end # module Evoruby