2 # = lib/evo/msa/msa.rb - Msa class
4 # Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: msa.rb,v 1.11 2009/01/03 00:42:08 cmzmasek Exp $
11 require 'lib/evo/util/constants'
12 require 'lib/evo/util/util'
13 require 'lib/evo/sequence/sequence'
20 @sequences = Array.new()
24 def add_sequence( sequence )
25 @sequences.push( sequence )
28 def add( name, molecular_sequence_str )
29 add_sequence( Sequence.new( name, molecular_sequence_str ) )
32 def get_sequence( index )
33 if ( index < 0 || index > get_number_of_seqs() - 1 )
34 error_msg = "attempt to get sequence " <<
35 index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
37 raise ArgumentError, error_msg
39 return @sequences[ index ]
42 def remove_sequence!( index )
43 if ( index < 0 || index > get_number_of_seqs() - 1 )
44 error_msg = "attempt to remove sequence " <<
45 index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
47 raise ArgumentError, error_msg
49 @sequences.delete_at( index )
53 if ( get_number_of_seqs < 1 )
56 l = @sequences[ 0 ].get_length()
57 for i in 0 ... get_number_of_seqs()
58 if ( get_sequence( i ).get_length() != l )
66 def find_by_name( name, case_sensitive, partial_match )
68 for i in 0 ... get_number_of_seqs()
69 current_name = get_sequence( i ).get_name()
71 current_name = current_name.downcase
74 if current_name == name ||
75 ( partial_match && current_name.include?( name ) )
82 def find_by_name_start( name, case_sensitive )
84 for i in 0 ... get_number_of_seqs()
85 get_sequence( i ).get_name() =~ /^\s*(\S+)/
88 current_name = current_name.downcase
91 if ( current_name == name )
98 def has?( name, case_sensitive = true, partial_match = false )
99 for i in 0 ... get_number_of_seqs()
100 current_name = get_sequence( i ).get_name()
102 current_name = current_name.downcase
105 if current_name == name ||
106 ( partial_match && current_name.include?( name ) )
113 # throws ArgumentError
114 def get_by_name( name, case_sensitive = true, partial_match = false )
115 indices = find_by_name( name, case_sensitive, partial_match )
116 if ( indices.length > 1 )
117 error_msg = "\"" + name + "\" not unique"
118 raise ArgumentError, error_msg
119 elsif ( indices.length < 1 )
120 error_msg = "\"" + name + "\" not found"
121 raise ArgumentError, error_msg
123 get_sequence( indices[ 0 ] )
126 # throws ArgumentError
127 def get_by_name_start( name, case_sensitive = true )
128 indices = find_by_name_start( name, case_sensitive )
129 if ( indices.length > 1 )
130 error_msg = "\"" + name + "\" not unique"
131 raise ArgumentError, error_msg
132 elsif ( indices.length < 1 )
133 error_msg = "\"" + name + "\" not found"
134 raise ArgumentError, error_msg
136 get_sequence( indices[ 0 ] )
140 def get_sub_alignment( seq_numbers )
142 for i in 0 ... seq_numbers.length()
143 msa.add_sequence( get_sequence( seq_numbers[ i ] ).copy() )
148 def get_number_of_seqs()
154 error_msg = "attempt to get length of unaligned msa"
155 raise StandardError, error_msg, caller
157 if ( get_number_of_seqs() < 1 )
160 @sequences[ 0 ].get_length()
166 for i in 0...get_number_of_seqs()
167 s += @sequences[ i ].to_str + Constants::LINE_DELIMITER
172 def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 )
174 error_msg = "attempt to get overlap diagram of unaligned msa"
175 raise StandardError, error_msg, caller
177 for i in 0 ... get_number_of_seqs()
178 io.print( Util.normalize_seq_name( get_sequence( i ).get_name(), max_name_length ) )
179 for j in 0 ... get_number_of_seqs()
183 if overlap?( i, j, min_overlap )
190 io.print( Evoruby::Constants::LINE_DELIMITER )
194 #returns array of Msa with an overlap of min_overlap
195 def split_into_overlapping_msa( min_overlap = 1 )
197 error_msg = "attempt to split into overlapping msas of unaligned msa"
198 raise StandardError, error_msg, caller
201 bins = get_overlaps( min_overlap )
202 for i in 0 ... bins.length
203 msas.push( get_sub_alignment( bins[ i ] ) )
208 def overlap?( index_1, index_2, min_overlap = 1 )
209 seq_1 = get_sequence( index_1 )
210 seq_2 = get_sequence( index_2 )
212 for i in 0...seq_1.get_length()
213 if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
214 !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
216 if overlap_count >= min_overlap
224 def calculate_overlap( index_1, index_2 )
225 seq_1 = get_sequence( index_1 )
226 seq_2 = get_sequence( index_2 )
228 for i in 0...seq_1.get_length
229 if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
230 !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
237 def calculate_identities( index_1, index_2 )
238 seq_1 = get_sequence( index_1 )
239 seq_2 = get_sequence( index_2 )
241 for i in 0...seq_1.get_length
242 if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
243 !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) &&
244 seq_1.get_character_code( i ) != 63 &&
245 ( seq_1.get_residue( i ).downcase() ==
246 seq_2.get_residue( i ).downcase() )
247 identities_count += 1
253 def remove_gap_only_columns!()
254 remove_columns!( get_gap_only_columns() )
257 def remove_gap_columns!()
258 remove_columns!( get_gap_columns() )
261 # removes columns for which seqs with gap / number of sequences > gap_ratio
262 def remove_gap_columns_w_gap_ratio!( gap_ratio )
263 remove_columns!( get_gap_columns_w_gap_ratio( gap_ratio ) )
267 def remove_sequences_by_gap_ratio!( gap_ratio )
269 error_msg = "attempt to remove sequences by gap ratio on unaligned msa"
270 raise StandardError, error_msg, caller
272 n = get_number_of_seqs
275 if ( get_sequence( ( n - 1 ) - s ).get_gap_ratio() > gap_ratio )
276 if ( Evoruby::Constants::VERBOSE )
277 puts( "removed: " + get_sequence( ( n - 1 ) - s ).get_name )
279 removed << get_sequence( ( n - 1 ) - s ).get_name
280 remove_sequence!( ( n - 1 ) - s )
287 def remove_redundant_sequences!( consider_taxonomy = false, verbose = false )
288 n = get_number_of_seqs
290 to_be_removed = Set.new
291 for i in 0 ... ( n - 1 )
292 for j in ( i + 1 ) ... n
293 if !to_be_removed.include?( i ) && !to_be_removed.include?( j )
294 if !consider_taxonomy ||
295 ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) ||
296 ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) )
297 if Util.clean_seq_str( get_sequence( i ).get_sequence_as_string ) ==
298 Util.clean_seq_str( get_sequence( j ).get_sequence_as_string )
299 to_be_removed.add( j )
303 if get_sequence( i ).get_taxonomy != nil
304 tax_i = get_sequence( i ).get_taxonomy.get_name
306 if get_sequence( j ).get_taxonomy != nil
307 tax_j = get_sequence( j ).get_taxonomy.get_name
309 puts get_sequence( i ).get_name + " [#{tax_i}] == " + get_sequence( j ).get_name + " [#{tax_j}]"
316 to_be_removed_ary = to_be_removed.to_a.sort.reverse
318 to_be_removed_ary.each { | index |
319 removed.push( get_sequence( index ).get_name )
320 remove_sequence!( index )
326 def remove_sequences_by_non_gap_length!( min_non_gap_length )
328 error_msg = "attempt to remove sequences by non gap length on unaligned msa"
329 raise StandardError, error_msg, caller
331 n = get_number_of_seqs
335 if ( ( l - get_sequence( ( n - 1 ) - s ).get_gap_length ) < min_non_gap_length )
336 if ( Evoruby::Constants::VERBOSE )
337 puts( "removed: " + get_sequence( ( n - 1 ) - s ).get_name )
339 removed << get_sequence( ( n - 1 ) - s ).get_name
340 remove_sequence!( ( n - 1 ) - s )
346 def trim!( first, last )
348 for i in 0 ... get_length()
349 if ( i < first || i > last )
353 remove_columns!( cols )
356 def get_gap_only_columns()
358 error_msg = "attempt to get gap only columns of unaligned msa"
359 raise StandardError, error_msg, caller
362 for c in 0 ... get_length
363 nogap_char_found = false
364 for s in 0 ... get_number_of_seqs
365 unless Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
366 nogap_char_found = true
370 unless nogap_char_found
377 def calculate_gap_proportion()
379 error_msg = "attempt to get gap only columns of unaligned msa"
380 raise StandardError, error_msg, caller
384 for c in 0 ... get_length
385 for s in 0 ... get_number_of_seqs
386 total_sum = total_sum + 1
387 if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
388 gap_sum = gap_sum + 1
393 return gap_sum / total_sum
396 def get_gap_columns()
398 error_msg = "attempt to get gap columns of unaligned msa"
399 raise StandardError, error_msg, caller
402 for c in 0 ... get_length
403 gap_char_found = false
404 for s in 0 ... get_number_of_seqs
405 if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
406 gap_char_found = true
417 # gap_ratio = seqs with gap / number of sequences
418 # returns column indices for which seqs with gap / number of sequences > gap_ratio
419 def get_gap_columns_w_gap_ratio( gap_ratio )
421 error_msg = "attempt to get gap columns with gap_ratio of unaligned msa"
422 raise StandardError, error_msg, caller
424 if ( gap_ratio < 0 || gap_ratio > 1 )
425 error_msg = "gap ratio must be between 0 and 1 inclusive"
426 raise ArgumentError, error_msg, caller
429 for c in 0 ... get_length
431 for s in 0 ... get_number_of_seqs
432 if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
436 if ( ( gap_chars_found.to_f / get_number_of_seqs ) > gap_ratio )
444 # Split an alignment into n alignemnts of equal size, except last one
445 def split( n, verbose = false )
446 if ( n < 2 || n > get_number_of_seqs )
447 error_msg = "attempt to split into less than two or more than the number of sequences"
448 raise StandardError, error_msg, caller
451 r = get_number_of_seqs % n
452 x = get_number_of_seqs / n
457 if ( ( r > 0 ) && ( i == ( n - 1 ) ) )
460 puts( i.to_s + ": " + y.to_s )
463 msa.add_sequence( get_sequence( ( i * x ) + j ) )
467 puts( i.to_s + ": " + x.to_s )
470 msa.add_sequence( get_sequence( ( i * x ) + j ) )
481 def get_overlaps( min_overlap = 1 )
483 error_msg = "attempt to get overlaps of unaligned msa"
484 raise StandardError, error_msg, caller
487 for i in 0 ... get_number_of_seqs()
489 for j in 0 ... bins.length
490 current_bin = bins[ j ]
491 # does seq i overlap with all seqs in current_bin?
493 for z in 0 ... current_bin.length
494 unless overlap?( i, current_bin[ z ], min_overlap )
500 current_bin.push( i )
505 new_bin = Array.new()
513 def remove_columns!( cols )
515 error_msg = "attempt to remove columns of unaligned msa"
516 raise StandardError, error_msg, caller
519 for c in 0 ... cols.length()
521 for s in 0 ... get_number_of_seqs()
522 get_sequence( s ).delete_residue!( col )