2 # = lib/evo/msa/msa.rb - Msa class
4 # Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: msa.rb,v 1.11 2009/01/03 00:42:08 cmzmasek Exp $
11 require 'lib/evo/util/constants'
12 require 'lib/evo/util/util'
13 require 'lib/evo/sequence/sequence'
# Instance state of the alignment.
# NOTE(review): the enclosing method header is not part of this view (the
# embedded numbering starts at 20) - presumably the constructor; confirm.
# Backing array holding the sequences of this alignment.
20 @sequences = Array.new
# Reset and refilled by remove_redundant_sequences! with one text entry per
# pair of identical sequences it detects.
21 @identical_seqs_detected = Array.new
# Appends +sequence+ to the end of this alignment.
#
# sequence:: the object to store; it is pushed onto the backing array as-is
#
# Returns the backing array (the result of Array#push).
def add_sequence( sequence )
  @sequences.push( sequence )
end
# Creates a Sequence from +name+ and +molecular_sequence_str+ and appends it
# to this alignment through add_sequence.
#
# name::                   passed to Sequence.new as first argument
# molecular_sequence_str:: passed to Sequence.new as second argument
def add( name, molecular_sequence_str )
  add_sequence( Sequence.new( name, molecular_sequence_str ) )
end
# Returns the sequence stored at position +index+.
#
# index:: zero-based position; valid values are 0 ... get_number_of_seqs
#
# throws ArgumentError if +index+ is outside that range
def get_sequence( index )
  seq_count = get_number_of_seqs()
  if ( index < 0 || index > seq_count - 1 )
    # Interpolation builds a fresh string, so no string literal is appended to
    # in place (which fails once string literals are frozen).
    raise ArgumentError,
      "attempt to get sequence #{index} in alignment of #{seq_count} sequences"
  end
  @sequences[ index ]
end
# Deletes the sequence stored at position +index+ from this alignment.
#
# index:: zero-based position; valid values are 0 ... get_number_of_seqs
#
# Returns the deleted element (the result of Array#delete_at).
#
# throws ArgumentError if +index+ is outside the valid range
def remove_sequence!( index )
  seq_count = get_number_of_seqs()
  if ( index < 0 || index > seq_count - 1 )
    # Interpolation builds a fresh string, so no string literal is appended to
    # in place (which fails once string literals are frozen).
    raise ArgumentError,
      "attempt to remove sequence #{index} in alignment of #{seq_count} sequences"
  end
  @sequences.delete_at( index )
end
# Returns the array of identical-pair descriptions; it is reset and filled by
# remove_redundant_sequences!.
def get_identical_seqs_detected
  @identical_seqs_detected
end
# Fragment of a check that compares the lengths of all sequences.
# NOTE(review): the method header and the statements executed under both
# conditions are absent from this view (embedded numbering jumps 59 -> 62 and
# stops at 64) - presumably this is the "is aligned" predicate; confirm
# against the full file.
# Guard for an alignment that holds no sequences.
59 if ( get_number_of_seqs < 1 )
# The length of the first sequence serves as the reference length.
62 l = @sequences[ 0 ].get_length()
63 for i in 0 ... get_number_of_seqs()
# True for a sequence whose length differs from the reference length.
64 if ( get_sequence( i ).get_length() != l )
# Scans the names of all sequences for +name+.
# A sequence qualifies when its name equals +name+ or, if +partial_match+ is
# true, when its name contains +name+.
# get_by_name uses the result through +length+ and passes element 0 to
# get_sequence, i.e. treats it as a list of sequence indices.
# NOTE(review): several lines are absent from this view (embedded numbering
# jumps 72 -> 74, 75 -> 77, 77 -> 80 and stops at 81): the condition guarding
# the downcase, any matching treatment of +name+, the collection of hits and
# the return value cannot be confirmed here.
72 def find_by_name( name, case_sensitive, partial_match )
74 for i in 0 ... get_number_of_seqs()
75 current_name = get_sequence( i ).get_name()
# Lower-cases the stored name - presumably only when case_sensitive is false.
77 current_name = current_name.downcase
80 if current_name == name ||
81 ( partial_match && current_name.include?( name ) )
# Scans the names of all sequences for matches of +name_re+.
# name_re:: object responding to +match+ (e.g. a Regexp) applied to each name
# get_by_name_pattern uses the result through +length+ and passes element 0
# to get_sequence, i.e. treats it as a list of sequence indices.
# NOTE(review): the lines creating, filling and returning the result are
# absent from this view (embedded numbering jumps 88 -> 90 and stops at 91).
88 def find_by_name_pattern( name_re )
90 for i in 0 ... get_number_of_seqs()
91 if name_re.match( get_sequence( i ).get_name() )
# Returns the one sequence whose name matches +name_re+.
#
# name_re:: pattern handed to find_by_name_pattern
#
# throws ArgumentError if the pattern matches no name or more than one name
def get_by_name_pattern( name_re )
  indices = find_by_name_pattern( name_re )
  if ( indices.length > 1 )
    raise ArgumentError, "pattern \"#{name_re}\" not unique"
  elsif ( indices.length < 1 )
    raise ArgumentError, "pattern \"#{name_re}\" not found"
  end
  get_sequence( indices[ 0 ] )
end
# Scans the sequences for names whose leading token equals +name+.
# get_by_name_start uses the result through +length+ and passes element 0 to
# get_sequence, i.e. treats it as a list of sequence indices.
# NOTE(review): several lines are absent from this view (embedded numbering
# jumps 111 -> 113, 114 -> 117, 117 -> 120 and stops at 120): the assignment
# of current_name, the use of +case_sensitive+, the collection of hits and
# the return value cannot be confirmed here.
111 def find_by_name_start( name, case_sensitive )
113 for i in 0 ... get_number_of_seqs()
# Captures the first run of non-whitespace characters of the name, ignoring
# leading whitespace; presumably current_name is taken from this capture.
114 get_sequence( i ).get_name() =~ /^\s*(\S+)/
# Lower-cases the candidate - presumably only when case_sensitive is false.
117 current_name = current_name.downcase
120 if ( current_name == name )
# Tests the names of the sequences against +name+.
# name::           the name to look for
# case_sensitive:: defaults to true
# partial_match::  defaults to false; when true a name that merely contains
#                  +name+ also satisfies the condition
# NOTE(review): several lines are absent from this view (embedded numbering
# jumps 129 -> 131, 131 -> 134 and stops at 135): the condition guarding the
# downcase and the values returned on hit and miss cannot be confirmed here.
127 def has?( name, case_sensitive = true, partial_match = false )
128 for i in 0 ... get_number_of_seqs()
129 current_name = get_sequence( i ).get_name()
# Lower-cases the stored name - presumably only when case_sensitive is false.
131 current_name = current_name.downcase
134 if current_name == name ||
135 ( partial_match && current_name.include?( name ) )
# Returns the one sequence that find_by_name reports for +name+.
#
# name::           the name to look for
# case_sensitive:: forwarded to find_by_name (default: true)
# partial_match::  forwarded to find_by_name (default: false)
#
# throws ArgumentError if no sequence or more than one sequence is found
def get_by_name( name, case_sensitive = true, partial_match = false )
  indices = find_by_name( name, case_sensitive, partial_match )
  # Interpolation keeps message building from failing on a non-String name.
  if ( indices.length > 1 )
    raise ArgumentError, "\"#{name}\" not unique"
  elsif ( indices.length < 1 )
    raise ArgumentError, "\"#{name}\" not found"
  end
  get_sequence( indices[ 0 ] )
end
# Returns the one sequence that find_by_name_start reports for +name+.
#
# name::           the name start to look for
# case_sensitive:: forwarded to find_by_name_start (default: true)
#
# throws ArgumentError if no sequence or more than one sequence is found
def get_by_name_start( name, case_sensitive = true )
  indices = find_by_name_start( name, case_sensitive )
  # Interpolation keeps message building from failing on a non-String name.
  if ( indices.length > 1 )
    raise ArgumentError, "\"#{name}\" not unique"
  elsif ( indices.length < 1 )
    raise ArgumentError, "\"#{name}\" not found"
  end
  get_sequence( indices[ 0 ] )
end
# Collects copies of the sequences addressed by +seq_numbers+.
# seq_numbers:: indexable list of sequence indices, each handed to get_sequence
# NOTE(review): the creation of +msa+ and the return statement are absent
# from this view (embedded numbering jumps 169 -> 171 and stops at 172);
# split_into_overlapping_msa collects the result, so presumably the filled
# alignment is returned - confirm.
169 def get_sub_alignment( seq_numbers )
171 for i in 0 ... seq_numbers.length()
# copy() is called so the new alignment does not receive the very same
# sequence object.
172 msa.add_sequence( get_sequence( seq_numbers[ i ] ).copy() )
# Number of sequences in this alignment; the rest of the class uses it as the
# exclusive upper bound for sequence indices.
# NOTE(review): the method body is absent from this view - presumably the
# size of @sequences; confirm.
177 def get_number_of_seqs()
# Fragment of the alignment-length accessor.
# NOTE(review): the method header, the condition guarding the raise and the
# statement executed for an empty alignment are absent from this view
# (embedded numbering starts at 183 and jumps 184 -> 186 -> 189).
183 error_msg = "attempt to get length of unaligned msa"
# Passing +caller+ makes the backtrace start at the calling frame.
184 raise StandardError, error_msg, caller
186 if get_number_of_seqs() < 1
# The length of the first sequence stands for the length of the alignment.
189 @sequences[ 0 ].get_length()
# Fragment that concatenates the FASTA text of every sequence, each followed
# by Constants::LINE_DELIMITER.
# NOTE(review): the method header, the initialisation of +s+ and the return
# statement are absent from this view (embedded numbering covers 199-200 only).
199 for i in 0...get_number_of_seqs
200 s += @sequences[ i ].to_fasta + Constants::LINE_DELIMITER
# Writes an overlap diagram to +io+: one row per sequence, starting with its
# normalized name, followed by output that depends on overlap?( i, j ) for
# every sequence j.
# min_overlap::     forwarded to overlap? (default: 1)
# io::              output target (default: STDOUT)
# max_name_length:: forwarded to Util.normalize_seq_name (default: 10)
# NOTE(review): the condition guarding the raise and the characters printed
# per cell are absent from this view (embedded numbering jumps 206 -> 208,
# 213 -> 217 and 217 -> 224).
206 def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 )
208 error_msg = "attempt to get overlap diagram of unaligned msa"
209 raise StandardError, error_msg, caller
211 for i in 0 ... get_number_of_seqs()
# Row label.
212 io.print( Util.normalize_seq_name( get_sequence( i ).get_name(), max_name_length ) )
213 for j in 0 ... get_number_of_seqs()
217 if overlap?( i, j, min_overlap )
# Writes the line delimiter - presumably once per row; confirm placement.
224 io.print( Evoruby::Constants::LINE_DELIMITER )
# Builds one sub-alignment per bin reported by get_overlaps( min_overlap ).
# min_overlap:: forwarded to get_overlaps (default: 1)
# NOTE(review): the condition guarding the raise, the creation of +msas+ and
# the return statement are absent from this view (embedded numbering jumps
# 229 -> 231 and 232 -> 235).
228 #returns array of Msa with an overlap of min_overlap
229 def split_into_overlapping_msa( min_overlap = 1 )
231 error_msg = "attempt to split into overlapping msas of unaligned msa"
232 raise StandardError, error_msg, caller
# Each bin is a list of sequence indices.
235 bins = get_overlaps( min_overlap )
236 for i in 0 ... bins.length
237 msas.push( get_sub_alignment( bins[ i ] ) )
# Compares the sequences at +index_1+ and +index_2+ column by column and
# checks the count of shared non-gap columns against +min_overlap+.
# min_overlap:: threshold for the count (default: 1)
# NOTE(review): the initialisation and increment of overlap_count and the
# returned values are absent from this view (embedded numbering jumps
# 244 -> 246 and 248 -> 250).
242 def overlap?( index_1, index_2, min_overlap = 1 )
243 seq_1 = get_sequence( index_1 )
244 seq_2 = get_sequence( index_2 )
# Iterates over the columns of seq_1; seq_2 is read at the same positions.
246 for i in 0...seq_1.get_length()
# True for a column in which neither sequence has a gap character.
247 if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
248 !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
250 if overlap_count >= min_overlap
# Visits every column of the sequences at +index_1+ and +index_2+ and tests
# whether both hold a non-gap character there.
# NOTE(review): the counter initialisation, its increment and the return
# statement are absent from this view (embedded numbering jumps 260 -> 262
# and stops at 264) - presumably the number of such columns is returned.
258 def calculate_overlap( index_1, index_2 )
259 seq_1 = get_sequence( index_1 )
260 seq_2 = get_sequence( index_2 )
# Iterates over the columns of seq_1; seq_2 is read at the same positions.
262 for i in 0...seq_1.get_length
263 if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
264 !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
# Counts the columns in which the sequences at +index_1+ and +index_2+ carry
# the same residue, compared case-insensitively.
# A column is counted only if neither sequence has a gap character there and
# the character code of seq_1 is not 63.
# NOTE(review): the initialisation of identities_count and the return
# statement are absent from this view (embedded numbering jumps 273 -> 275
# and stops at 281).
271 def calculate_identities( index_1, index_2 )
272 seq_1 = get_sequence( index_1 )
273 seq_2 = get_sequence( index_2 )
# Iterates over the columns of seq_1; seq_2 is read at the same positions.
275 for i in 0...seq_1.get_length
276 if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
277 !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) &&
# 63 is the ASCII code of '?' - presumably an unknown-residue marker; only
# seq_1 is tested for it. TODO confirm.
278 seq_1.get_character_code( i ) != 63 &&
279 ( seq_1.get_residue( i ).downcase() ==
280 seq_2.get_residue( i ).downcase() )
281 identities_count += 1
# Removes, in place, every column reported by get_gap_only_columns.
def remove_gap_only_columns!()
  remove_columns!( get_gap_only_columns() )
end
# Removes, in place, every column reported by get_gap_columns.
def remove_gap_columns!()
  remove_columns!( get_gap_columns() )
end
# removes columns for which seqs with gap / number of sequences > gap_ratio
#
# gap_ratio:: threshold forwarded to get_gap_columns_w_gap_ratio, which
#             accepts values between 0 and 1 inclusive
def remove_gap_columns_w_gap_ratio!( gap_ratio )
  remove_columns!( get_gap_columns_w_gap_ratio( gap_ratio ) )
end
# Removes every sequence whose get_gap_ratio() exceeds +gap_ratio+ and
# records the names of the removed sequences in +removed+.
# NOTE(review): the condition guarding the raise, the creation of +removed+,
# the loop over +s+ and the return statement are absent from this view
# (embedded numbering jumps 301 -> 303, 306 -> 309 and stops at 314).
301 def remove_sequences_by_gap_ratio!( gap_ratio )
303 error_msg = "attempt to remove sequences by gap ratio on unaligned msa"
304 raise StandardError, error_msg, caller
# Sequence count taken once, before anything is removed.
306 n = get_number_of_seqs
# ( n - 1 ) - s addresses the sequences from the last one backwards -
# presumably so that a removal does not shift indices still to be visited.
309 if ( get_sequence( ( n - 1 ) - s ).get_gap_ratio() > gap_ratio )
310 if ( Evoruby::Constants::VERBOSE )
311 puts( "removed: " + get_sequence( ( n - 1 ) - s ).get_name )
313 removed << get_sequence( ( n - 1 ) - s ).get_name
314 remove_sequence!( ( n - 1 ) - s )
# Detects sequences whose cleaned sequence strings are equal and removes the
# later member of each such pair, keeping the earlier one.
# consider_taxonomy:: when true, two sequences are only compared if both have
#                     no taxonomy or their taxonomies are equal (default: false)
# verbose::           default: false; its use is not visible in this view
# Side effect: @identical_seqs_detected is reset and receives one text entry
# per identical pair.
# NOTE(review): several lines are absent from this view (embedded numbering
# jumps e.g. 322 -> 324, 334 -> 338, 345 -> 355, 359 -> 365), among them the
# defaults of tax_i / tax_j, the creation of +removed+ and the return value.
# NOTE(review): Set needs require 'set' on Ruby versions before 3.2; no such
# require is visible at the top of the file - confirm.
321 def remove_redundant_sequences!( consider_taxonomy = false, verbose = false )
322 n = get_number_of_seqs
# Indices scheduled for removal; the set keeps them unique.
324 to_be_removed = Set.new
325 @identical_seqs_detected = Array.new
# Visits every unordered pair ( i, j ) with i < j.
326 for i in 0 ... ( n - 1 )
327 for j in ( i + 1 ) ... n
# Pairs involving an already scheduled sequence are skipped.
328 if !to_be_removed.include?( i ) && !to_be_removed.include?( j )
329 if !consider_taxonomy ||
330 ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) ||
331 ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) )
# Comparison is made on the strings returned by Util.clean_seq_str.
332 if Util.clean_seq_str( get_sequence( i ).get_sequence_as_string ) ==
333 Util.clean_seq_str( get_sequence( j ).get_sequence_as_string )
334 to_be_removed.add( j )
338 if get_sequence( i ).get_taxonomy != nil
339 tax_i = get_sequence( i ).get_taxonomy.get_name
341 if get_sequence( j ).get_taxonomy != nil
342 tax_j = get_sequence( j ).get_taxonomy.get_name
344 identical_pair = get_sequence( i ).get_name + " [#{tax_i}] == " + get_sequence( j ).get_name + " [#{tax_j}]"
345 @identical_seqs_detected.push( identical_pair )
# Highest index first, so each removal leaves the smaller indices valid.
355 to_be_removed_ary = to_be_removed.to_a.sort.reverse
357 to_be_removed_ary.each { | index |
358 removed.push( get_sequence( index ).get_name )
359 remove_sequence!( index )
# Removes every sequence for which +l+ minus its get_gap_length is smaller
# than +min_non_gap_length+ and records the removed names in +removed+.
# NOTE(review): the condition guarding the raise, the assignment of +l+
# (presumably the alignment length), the creation of +removed+, the loop over
# +s+ and the return statement are absent from this view (embedded numbering
# jumps 365 -> 367, 370 -> 374 and stops at 379).
365 def remove_sequences_by_non_gap_length!( min_non_gap_length )
367 error_msg = "attempt to remove sequences by non gap length on unaligned msa"
368 raise StandardError, error_msg, caller
# Sequence count taken once, before anything is removed.
370 n = get_number_of_seqs
# ( n - 1 ) - s addresses the sequences from the last one backwards -
# presumably so that a removal does not shift indices still to be visited.
374 if ( ( l - get_sequence( ( n - 1 ) - s ).get_gap_length ) < min_non_gap_length )
375 if ( Evoruby::Constants::VERBOSE )
376 puts( "removed: " + get_sequence( ( n - 1 ) - s ).get_name )
378 removed << get_sequence( ( n - 1 ) - s ).get_name
379 remove_sequence!( ( n - 1 ) - s )
# Removes columns in place through remove_columns!( cols ); the visible test
# selects the columns lying outside the inclusive range first .. last.
# first:: index of the first column tested as inside the range
# last::  index of the last column tested as inside the range
# NOTE(review): the creation and filling of +cols+ are absent from this view
# (embedded numbering jumps 385 -> 387 and 388 -> 392).
385 def trim!( first, last )
387 for i in 0 ... get_length()
388 if ( i < first || i > last )
392 remove_columns!( cols )
# Examines every column and detects those in which no sequence holds a
# non-gap character.
# remove_gap_only_columns! hands the result to remove_columns!.
# NOTE(review): the condition guarding the raise, the result container, the
# statement run for a gap-only column and the return statement are absent
# from this view (embedded numbering jumps 395 -> 397, 398 -> 401, 405 -> 409).
395 def get_gap_only_columns()
397 error_msg = "attempt to get gap only columns of unaligned msa"
398 raise StandardError, error_msg, caller
401 for c in 0 ... get_length
# Reset per column; set as soon as one sequence has a non-gap character here.
402 nogap_char_found = false
403 for s in 0 ... get_number_of_seqs
404 unless Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
405 nogap_char_found = true
# Reached with nogap_char_found still false only for a gap-only column.
409 unless nogap_char_found
# Returns gap_sum / total_sum, where total_sum counts every cell (column x
# sequence) of the alignment and gap_sum counts the cells holding a gap
# character.
# NOTE(review): the condition guarding the raise and the initialisation of
# both counters are absent from this view (embedded numbering jumps
# 416 -> 418 and 419 -> 423). If both counters start as Integers, the final
# division is an integer division - confirm.
# NOTE(review): the error text below speaks of "gap only columns", the same
# wording as in get_gap_only_columns; it looks copied - confirm.
416 def calculate_gap_proportion()
418 error_msg = "attempt to get gap only columns of unaligned msa"
419 raise StandardError, error_msg, caller
423 for c in 0 ... get_length
424 for s in 0 ... get_number_of_seqs
425 total_sum = total_sum + 1
426 if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
427 gap_sum = gap_sum + 1
432 return gap_sum / total_sum
# Examines every column and detects those in which at least one sequence
# holds a gap character.
# remove_gap_columns! hands the result to remove_columns!.
# NOTE(review): the condition guarding the raise, the result container, the
# statement run for a column with a gap and the return statement are absent
# from this view (embedded numbering jumps 435 -> 437, 438 -> 441 and stops
# at 445).
435 def get_gap_columns()
437 error_msg = "attempt to get gap columns of unaligned msa"
438 raise StandardError, error_msg, caller
441 for c in 0 ... get_length
# Reset per column; set as soon as one sequence has a gap character here.
442 gap_char_found = false
443 for s in 0 ... get_number_of_seqs
444 if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
445 gap_char_found = true
# For every column, relates the value of gap_chars_found to the number of
# sequences and tests that ratio against +gap_ratio+.
# throws ArgumentError if gap_ratio is smaller than 0 or larger than 1
# NOTE(review): the condition guarding the StandardError, the result
# container, the counting statement and the return statement are absent from
# this view (embedded numbering jumps 458 -> 460, 465 -> 468, 468 -> 470 and
# 471 -> 475).
456 # gap_ratio = seqs with gap / number of sequences
457 # returns column indices for which seqs with gap / number of sequences > gap_ratio
458 def get_gap_columns_w_gap_ratio( gap_ratio )
460 error_msg = "attempt to get gap columns with gap_ratio of unaligned msa"
461 raise StandardError, error_msg, caller
463 if ( gap_ratio < 0 || gap_ratio > 1 )
464 error_msg = "gap ratio must be between 0 and 1 inclusive"
465 raise ArgumentError, error_msg, caller
468 for c in 0 ... get_length
470 for s in 0 ... get_number_of_seqs
471 if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
# to_f forces a floating point division.
475 if ( ( gap_chars_found.to_f / get_number_of_seqs ) > gap_ratio )
# n::       number of parts; must be at least 2 and at most the number of
#           sequences, otherwise StandardError is raised
# verbose:: default: false - presumably enables the puts output; confirm
# The added sequences are the original objects, no copy() is made.
# NOTE(review): the loops over +i+ and +j+, the assignment of +y+, the
# creation of +msa+ and the return statement are absent from this view
# (embedded numbering jumps e.g. 491 -> 496, 496 -> 499, 502 -> 506).
483 # Split an alignment into n alignments of equal size, except the last one
484 def split( n, verbose = false )
485 if ( n < 2 || n > get_number_of_seqs )
486 error_msg = "attempt to split into less than two or more than the number of sequences"
487 raise StandardError, error_msg, caller
# r: remainder of the sequence count divided by n.
490 r = get_number_of_seqs % n
# x: integer quotient of the sequence count divided by n.
491 x = get_number_of_seqs / n
# Special case: a remainder exists and this is the last part.
496 if ( ( r > 0 ) && ( i == ( n - 1 ) ) )
499 puts( i.to_s + ": " + y.to_s )
# Part i starts at sequence index i * x.
502 msa.add_sequence( get_sequence( ( i * x ) + j ) )
506 puts( i.to_s + ": " + x.to_s )
509 msa.add_sequence( get_sequence( ( i * x ) + j ) )
# Groups sequence indices into bins; the visible loop tests sequence i
# against every member of an existing bin with overlap?.
# min_overlap:: forwarded to overlap? (default: 1)
# split_into_overlapping_msa passes each bin to get_sub_alignment, i.e.
# treats a bin as a list of sequence indices.
# NOTE(review): the condition guarding the raise, the creation of +bins+,
# the flags steering the loops, the handling of new_bin and the return
# statement are absent from this view (embedded numbering jumps e.g.
# 520 -> 522, 523 -> 526, 533 -> 539, 539 -> 544).
520 def get_overlaps( min_overlap = 1 )
522 error_msg = "attempt to get overlaps of unaligned msa"
523 raise StandardError, error_msg, caller
526 for i in 0 ... get_number_of_seqs()
528 for j in 0 ... bins.length
529 current_bin = bins[ j ]
530 # does seq i overlap with all seqs in current_bin?
532 for z in 0 ... current_bin.length
533 unless overlap?( i, current_bin[ z ], min_overlap )
539 current_bin.push( i )
# Presumably opened for a sequence that fits no existing bin; confirm.
544 new_bin = Array.new()
# Deletes residues from every sequence of this alignment, one pass over all
# sequences per entry of +cols+.
# cols:: list of column indices; only its length() is read in the visible lines
# NOTE(review): the condition guarding the raise and the assignment of +col+
# are absent from this view (embedded numbering jumps 552 -> 554, 555 -> 558
# and 558 -> 560); how +col+ is derived from +cols+ and +c+, and thus how
# index shifts after a deletion are handled, cannot be confirmed here.
552 def remove_columns!( cols )
554 error_msg = "attempt to remove columns of unaligned msa"
555 raise StandardError, error_msg, caller
558 for c in 0 ... cols.length()
560 for s in 0 ... get_number_of_seqs()
561 get_sequence( s ).delete_residue!( col )