2 # = lib/evo/msa/msa.rb - Msa class
4 # Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: msa.rb,v 1.11 2009/01/03 00:42:08 cmzmasek Exp $
11 require 'lib/evo/util/constants'
12 require 'lib/evo/util/util'
13 require 'lib/evo/sequence/sequence'
20 @sequences = Array.new
21 @identical_seqs_detected = Array.new
22 @name_to_seq_indices = Hash.new
23 @namestart_to_seq_indices = Hash.new
27 def add_sequence( sequence )
28 @sequences.push( sequence )
31 def add( name, molecular_sequence_str )
32 add_sequence( Sequence.new( name, molecular_sequence_str ) )
35 def get_sequence( index )
36 if ( index < 0 || index > get_number_of_seqs() - 1 )
37 error_msg = "attempt to get sequence " <<
38 index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
40 raise ArgumentError, error_msg
42 return @sequences[ index ]
45 def remove_sequence!( index )
46 if ( index < 0 || index > get_number_of_seqs() - 1 )
47 error_msg = "attempt to remove sequence " <<
48 index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
50 raise ArgumentError, error_msg
52 @name_to_seq_indices.clear
53 @namestart_to_seq_indices.clear
54 @sequences.delete_at( index )
57 def get_identical_seqs_detected
58 @identical_seqs_detected
63 if ( get_number_of_seqs < 1 )
66 l = @sequences[ 0 ].get_length()
67 for i in 0 ... get_number_of_seqs()
68 if ( get_sequence( i ).get_length() != l )
76 def find_by_name( name, case_sensitive, partial_match )
77 if case_sensitive && !partial_match && @name_to_seq_indices.has_key?( name )
78 return @name_to_seq_indices[ name ]
81 for i in 0 ... get_number_of_seqs()
82 current_name = get_sequence( i ).get_name()
83 if case_sensitive && !partial_match
84 if !@name_to_seq_indices.has_key?( current_name )
85 @name_to_seq_indices[ current_name ] = []
87 @name_to_seq_indices[ current_name ].push( i )
90 current_name = current_name.downcase
93 if current_name == name ||
94 ( partial_match && current_name.include?( name ) )
101 def find_by_name_pattern( name_re, avoid_similar_to = true )
103 for i in 0 ... get_number_of_seqs()
105 m = name_re.match( get_sequence( i ).get_name() )
106 if m && !m.pre_match.downcase.include?( "similar to " )
110 if name_re.match( get_sequence( i ).get_name() )
118 # throws ArgumentError
119 def get_by_name_pattern( name_re , avoid_similar_to = true )
120 indices = find_by_name_pattern( name_re, avoid_similar_to )
121 if ( indices.length > 1 )
122 error_msg = "pattern " + name_re.to_s + " not unique"
123 raise ArgumentError, error_msg
124 elsif ( indices.length < 1 )
125 error_msg = "pattern " + name_re.to_s + " not found"
126 raise ArgumentError, error_msg
128 get_sequence( indices[ 0 ] )
131 def find_by_name_start( name, case_sensitive )
132 if case_sensitive && @namestart_to_seq_indices.has_key?( name )
133 return @namestart_to_seq_indices[ name ]
136 for i in 0 ... get_number_of_seqs()
137 get_sequence( i ).get_name() =~ /^\s*(\S+)/
140 if !@namestart_to_seq_indices.has_key?( current_name )
141 @namestart_to_seq_indices[ current_name ] = []
143 @namestart_to_seq_indices[ current_name ].push( i )
146 current_name = current_name.downcase
149 if current_name == name
156 def has?( name, case_sensitive = true, partial_match = false )
157 for i in 0 ... get_number_of_seqs()
158 current_name = get_sequence( i ).get_name()
160 current_name = current_name.downcase
163 if current_name == name ||
164 ( partial_match && current_name.include?( name ) )
171 # throws ArgumentError
172 def get_by_name( name, case_sensitive = true, partial_match = false )
173 indices = find_by_name( name, case_sensitive, partial_match )
174 if ( indices.length > 1 )
175 error_msg = "\"" + name + "\" not unique"
176 raise ArgumentError, error_msg
177 elsif ( indices.length < 1 )
178 error_msg = "\"" + name + "\" not found"
179 raise ArgumentError, error_msg
181 get_sequence( indices[ 0 ] )
184 # throws ArgumentError
185 def get_by_name_start( name, case_sensitive = true )
186 indices = find_by_name_start( name, case_sensitive )
187 if indices.length > 1
188 error_msg = "\"" + name + "\" not unique"
189 raise ArgumentError, error_msg
190 elsif indices.length < 1
191 error_msg = "\"" + name + "\" not found"
192 raise ArgumentError, error_msg
194 get_sequence( indices[ 0 ] )
198 def get_sub_alignment( seq_numbers )
200 for i in 0 ... seq_numbers.length()
201 msa.add_sequence( get_sequence( seq_numbers[ i ] ).copy() )
206 def get_number_of_seqs()
212 error_msg = "attempt to get length of unaligned msa"
213 raise StandardError, error_msg, caller
215 if get_number_of_seqs() < 1
218 @sequences[ 0 ].get_length()
228 for i in 0...get_number_of_seqs
229 s += @sequences[ i ].to_fasta + Constants::LINE_DELIMITER
235 def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 )
237 error_msg = "attempt to get overlap diagram of unaligned msa"
238 raise StandardError, error_msg, caller
240 for i in 0 ... get_number_of_seqs()
241 io.print( Util.normalize_seq_name( get_sequence( i ).get_name(), max_name_length ) )
242 for j in 0 ... get_number_of_seqs()
246 if overlap?( i, j, min_overlap )
253 io.print( Evoruby::Constants::LINE_DELIMITER )
257 #returns array of Msa with an overlap of min_overlap
258 def split_into_overlapping_msa( min_overlap = 1 )
260 error_msg = "attempt to split into overlapping msas of unaligned msa"
261 raise StandardError, error_msg, caller
264 bins = get_overlaps( min_overlap )
265 for i in 0 ... bins.length
266 msas.push( get_sub_alignment( bins[ i ] ) )
271 def overlap?( index_1, index_2, min_overlap = 1 )
272 seq_1 = get_sequence( index_1 )
273 seq_2 = get_sequence( index_2 )
275 for i in 0...seq_1.get_length()
276 if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
277 !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
279 if overlap_count >= min_overlap
287 def calculate_overlap( index_1, index_2 )
288 seq_1 = get_sequence( index_1 )
289 seq_2 = get_sequence( index_2 )
291 for i in 0...seq_1.get_length
292 if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
293 !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
300 def calculate_identities( index_1, index_2 )
301 seq_1 = get_sequence( index_1 )
302 seq_2 = get_sequence( index_2 )
304 for i in 0...seq_1.get_length
305 if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
306 !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) &&
307 seq_1.get_character_code( i ) != 63 &&
308 ( seq_1.get_residue( i ).downcase() ==
309 seq_2.get_residue( i ).downcase() )
310 identities_count += 1
316 def remove_gap_only_columns!()
317 remove_columns!( get_gap_only_columns() )
320 def remove_gap_columns!()
321 remove_columns!( get_gap_columns() )
324 # removes columns for which seqs with gap / number of sequences > gap_ratio
325 def remove_gap_columns_w_gap_ratio!( gap_ratio )
326 remove_columns!( get_gap_columns_w_gap_ratio( gap_ratio ) )
330 def remove_sequences_by_gap_ratio!( gap_ratio )
332 error_msg = "attempt to remove sequences by gap ratio on unaligned msa"
333 raise StandardError, error_msg, caller
335 n = get_number_of_seqs
338 if ( get_sequence( ( n - 1 ) - s ).get_gap_ratio() > gap_ratio )
339 if ( Evoruby::Constants::VERBOSE )
340 puts( "removed: " + get_sequence( ( n - 1 ) - s ).get_name )
342 removed << get_sequence( ( n - 1 ) - s ).get_name
343 remove_sequence!( ( n - 1 ) - s )
350 def remove_redundant_sequences!( consider_taxonomy = false, verbose = false )
351 n = get_number_of_seqs
353 to_be_removed = Set.new
354 @identical_seqs_detected = Array.new
355 for i in 0 ... ( n - 1 )
356 for j in ( i + 1 ) ... n
357 if !to_be_removed.include?( i ) && !to_be_removed.include?( j )
358 if !consider_taxonomy ||
359 ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) ||
360 ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) )
361 if Util.clean_seq_str( get_sequence( i ).get_sequence_as_string ) ==
362 Util.clean_seq_str( get_sequence( j ).get_sequence_as_string )
363 to_be_removed.add( j )
367 if get_sequence( i ).get_taxonomy != nil
368 tax_i = get_sequence( i ).get_taxonomy.get_name
370 if get_sequence( j ).get_taxonomy != nil
371 tax_j = get_sequence( j ).get_taxonomy.get_name
373 identical_pair = get_sequence( i ).get_name + " [#{tax_i}] == " + get_sequence( j ).get_name + " [#{tax_j}]"
374 @identical_seqs_detected.push( identical_pair )
384 to_be_removed_ary = to_be_removed.to_a.sort.reverse
386 to_be_removed_ary.each { | index |
387 removed.push( get_sequence( index ).get_name )
388 remove_sequence!( index )
394 def remove_sequences_by_non_gap_length!( min_non_gap_length )
395 n = get_number_of_seqs
399 seq = get_sequence( x )
400 if ( ( seq.get_length - seq.get_gap_length ) < min_non_gap_length )
401 if ( Evoruby::Constants::VERBOSE )
402 puts( "removed: " + seq.get_name )
404 removed << seq.get_name
405 remove_sequence!( x )
411 def trim!( first, last )
413 for i in 0 ... get_length()
414 if ( i < first || i > last )
418 remove_columns!( cols )
421 def get_gap_only_columns()
423 error_msg = "attempt to get gap only columns of unaligned msa"
424 raise StandardError, error_msg, caller
427 for c in 0 ... get_length
428 nogap_char_found = false
429 for s in 0 ... get_number_of_seqs
430 unless Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
431 nogap_char_found = true
435 unless nogap_char_found
442 def calculate_gap_proportion()
444 error_msg = "attempt to get gap only columns of unaligned msa"
445 raise StandardError, error_msg, caller
449 for c in 0 ... get_length
450 for s in 0 ... get_number_of_seqs
451 total_sum = total_sum + 1
452 if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
453 gap_sum = gap_sum + 1
458 return gap_sum / total_sum
461 def get_gap_columns()
463 error_msg = "attempt to get gap columns of unaligned msa"
464 raise StandardError, error_msg, caller
467 for c in 0 ... get_length
468 gap_char_found = false
469 for s in 0 ... get_number_of_seqs
470 if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
471 gap_char_found = true
482 # gap_ratio = seqs with gap / number of sequences
483 # returns column indices for which seqs with gap / number of sequences > gap_ratio
484 def get_gap_columns_w_gap_ratio( gap_ratio )
486 error_msg = "attempt to get gap columns with gap_ratio of unaligned msa"
487 raise StandardError, error_msg, caller
489 if ( gap_ratio < 0 || gap_ratio > 1 )
490 error_msg = "gap ratio must be between 0 and 1 inclusive"
491 raise ArgumentError, error_msg, caller
494 for c in 0 ... get_length
496 for s in 0 ... get_number_of_seqs
497 if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
501 if ( ( gap_chars_found.to_f / get_number_of_seqs ) > gap_ratio )
509 # Split an alignment into n alignemnts of equal size, except last one
510 def split( n, verbose = false )
511 if ( n < 2 || n > get_number_of_seqs )
512 error_msg = "attempt to split into less than two or more than the number of sequences"
513 raise StandardError, error_msg, caller
516 r = get_number_of_seqs % n
517 x = get_number_of_seqs / n
522 if ( ( r > 0 ) && ( i == ( n - 1 ) ) )
525 puts( i.to_s + ": " + y.to_s )
528 msa.add_sequence( get_sequence( ( i * x ) + j ) )
532 puts( i.to_s + ": " + x.to_s )
535 msa.add_sequence( get_sequence( ( i * x ) + j ) )
546 def get_overlaps( min_overlap = 1 )
548 error_msg = "attempt to get overlaps of unaligned msa"
549 raise StandardError, error_msg, caller
552 for i in 0 ... get_number_of_seqs()
554 for j in 0 ... bins.length
555 current_bin = bins[ j ]
556 # does seq i overlap with all seqs in current_bin?
558 for z in 0 ... current_bin.length
559 unless overlap?( i, current_bin[ z ], min_overlap )
565 current_bin.push( i )
570 new_bin = Array.new()
578 def remove_columns!( cols )
580 error_msg = "attempt to remove columns of unaligned msa"
581 raise StandardError, error_msg, caller
584 for c in 0 ... cols.length()
586 for s in 0 ... get_number_of_seqs()
587 get_sequence( s ).delete_residue!( col )