2 # = lib/evo/msa/msa.rb - Msa class
4 # Copyright:: Copyright (C) 2017 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # Last modified: 2017/02/07
10 require 'lib/evo/util/constants'
11 require 'lib/evo/util/util'
12 require 'lib/evo/sequence/sequence'
19 @sequences = Array.new
20 @identical_seqs_detected = Array.new
21 @name_to_seq_indices = Hash.new
22 @namestart_to_seq_indices = Hash.new
26 def add_sequence( sequence )
27 @sequences.push( sequence )
30 def add( name, molecular_sequence_str )
31 add_sequence( Sequence.new( name, molecular_sequence_str ) )
34 def get_sequence( index )
35 if ( index < 0 || index > get_number_of_seqs() - 1 )
36 error_msg = "attempt to get sequence " <<
37 index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
39 raise ArgumentError, error_msg
41 return @sequences[ index ]
44 def remove_sequence!( index )
45 if ( index < 0 || index > get_number_of_seqs() - 1 )
46 error_msg = "attempt to remove sequence " <<
47 index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
49 raise ArgumentError, error_msg
51 @name_to_seq_indices.clear
52 @namestart_to_seq_indices.clear
53 @sequences.delete_at( index )
56 def get_identical_seqs_detected
57 @identical_seqs_detected
62 if ( get_number_of_seqs < 1 )
65 l = @sequences[ 0 ].get_length()
66 for i in 0 ... get_number_of_seqs()
67 if ( get_sequence( i ).get_length() != l )
75 def find_by_name( name, case_sensitive, partial_match )
76 if case_sensitive && !partial_match && @name_to_seq_indices.has_key?( name )
77 return @name_to_seq_indices[ name ]
80 for i in 0 ... get_number_of_seqs()
81 current_name = get_sequence( i ).get_name()
82 if case_sensitive && !partial_match
83 if !@name_to_seq_indices.has_key?( current_name )
84 @name_to_seq_indices[ current_name ] = []
86 @name_to_seq_indices[ current_name ].push( i )
89 current_name = current_name.downcase
92 if current_name == name ||
93 ( partial_match && current_name.include?( name ) )
100 def find_by_name_pattern( name_re, avoid_similar_to = true )
102 for i in 0 ... get_number_of_seqs()
104 m = name_re.match( get_sequence( i ).get_name() )
105 if m && !m.pre_match.downcase.include?( "similar to " )
109 if name_re.match( get_sequence( i ).get_name() )
117 # throws ArgumentError
118 def get_by_name_pattern( name_re , avoid_similar_to = true )
119 indices = find_by_name_pattern( name_re, avoid_similar_to )
120 if ( indices.length > 1 )
121 error_msg = "pattern " + name_re.to_s + " not unique"
122 raise ArgumentError, error_msg
123 elsif ( indices.length < 1 )
124 error_msg = "pattern " + name_re.to_s + " not found"
125 raise ArgumentError, error_msg
127 get_sequence( indices[ 0 ] )
130 def find_by_name_start( name, case_sensitive )
131 if case_sensitive && @namestart_to_seq_indices.has_key?( name )
132 return @namestart_to_seq_indices[ name ]
135 for i in 0 ... get_number_of_seqs()
136 get_sequence( i ).get_name() =~ /^\s*(\S+)/
139 if !@namestart_to_seq_indices.has_key?( current_name )
140 @namestart_to_seq_indices[ current_name ] = []
142 @namestart_to_seq_indices[ current_name ].push( i )
145 current_name = current_name.downcase
148 if current_name == name
155 def has?( name, case_sensitive = true, partial_match = false )
156 for i in 0 ... get_number_of_seqs()
157 current_name = get_sequence( i ).get_name()
159 current_name = current_name.downcase
162 if current_name == name ||
163 ( partial_match && current_name.include?( name ) )
170 # throws ArgumentError
171 def get_by_name( name, case_sensitive = true, partial_match = false )
172 indices = find_by_name( name, case_sensitive, partial_match )
173 if ( indices.length > 1 )
174 error_msg = "\"" + name + "\" not unique"
175 raise ArgumentError, error_msg
176 elsif ( indices.length < 1 )
177 error_msg = "\"" + name + "\" not found"
178 raise ArgumentError, error_msg
180 get_sequence( indices[ 0 ] )
183 # throws ArgumentError
184 def get_by_name_start( name, case_sensitive = true )
185 indices = find_by_name_start( name, case_sensitive )
186 if indices.length > 1
187 error_msg = "\"" + name + "\" not unique"
188 raise ArgumentError, error_msg
189 elsif indices.length < 1
190 error_msg = "\"" + name + "\" not found"
191 raise ArgumentError, error_msg
193 get_sequence( indices[ 0 ] )
197 def get_sub_alignment( seq_numbers )
199 for i in 0 ... seq_numbers.length()
200 msa.add_sequence( get_sequence( seq_numbers[ i ] ).copy() )
205 def get_number_of_seqs()
211 error_msg = "attempt to get length of unaligned msa"
212 raise StandardError, error_msg, caller
214 if get_number_of_seqs() < 1
217 @sequences[ 0 ].get_length()
227 for i in 0...get_number_of_seqs
228 s += @sequences[ i ].to_fasta + Constants::LINE_DELIMITER
234 def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 )
236 error_msg = "attempt to get overlap diagram of unaligned msa"
237 raise StandardError, error_msg, caller
239 for i in 0 ... get_number_of_seqs()
240 io.print( Util.normalize_seq_name( get_sequence( i ).get_name(), max_name_length ) )
241 for j in 0 ... get_number_of_seqs()
245 if overlap?( i, j, min_overlap )
252 io.print( Evoruby::Constants::LINE_DELIMITER )
256 #returns array of Msa with an overlap of min_overlap
257 def split_into_overlapping_msa( min_overlap = 1 )
259 error_msg = "attempt to split into overlapping msas of unaligned msa"
260 raise StandardError, error_msg, caller
263 bins = get_overlaps( min_overlap )
264 for i in 0 ... bins.length
265 msas.push( get_sub_alignment( bins[ i ] ) )
270 def overlap?( index_1, index_2, min_overlap = 1 )
271 seq_1 = get_sequence( index_1 )
272 seq_2 = get_sequence( index_2 )
274 for i in 0...seq_1.get_length()
275 if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
276 !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
278 if overlap_count >= min_overlap
286 def calculate_overlap( index_1, index_2 )
287 seq_1 = get_sequence( index_1 )
288 seq_2 = get_sequence( index_2 )
290 for i in 0...seq_1.get_length
291 if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
292 !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
299 def calculate_identities( index_1, index_2 )
300 seq_1 = get_sequence( index_1 )
301 seq_2 = get_sequence( index_2 )
303 for i in 0...seq_1.get_length
304 if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
305 !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) &&
306 seq_1.get_character_code( i ) != 63 &&
307 ( seq_1.get_residue( i ).downcase() ==
308 seq_2.get_residue( i ).downcase() )
309 identities_count += 1
315 def remove_gap_only_columns!()
316 remove_columns!( get_gap_only_columns() )
319 def remove_gap_columns!()
320 remove_columns!( get_gap_columns() )
323 # removes columns for which seqs with gap / number of sequences > gap_ratio
324 def remove_gap_columns_w_gap_ratio!( gap_ratio )
325 remove_columns!( get_gap_columns_w_gap_ratio( gap_ratio ) )
329 def remove_sequences_by_gap_ratio!( gap_ratio )
331 error_msg = "attempt to remove sequences by gap ratio on unaligned msa"
332 raise StandardError, error_msg, caller
334 n = get_number_of_seqs
337 if ( get_sequence( ( n - 1 ) - s ).get_gap_ratio() > gap_ratio )
338 if ( Evoruby::Constants::VERBOSE )
339 puts( "removed: " + get_sequence( ( n - 1 ) - s ).get_name )
341 removed << get_sequence( ( n - 1 ) - s ).get_name
342 remove_sequence!( ( n - 1 ) - s )
349 def remove_redundant_sequences!( consider_taxonomy = false, verbose = false )
350 n = get_number_of_seqs
352 to_be_removed = Set.new
353 @identical_seqs_detected = Array.new
354 for i in 0 ... ( n - 1 )
355 for j in ( i + 1 ) ... n
356 if !to_be_removed.include?( i ) && !to_be_removed.include?( j )
357 if !consider_taxonomy ||
358 ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) ||
359 ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) )
360 if Util.clean_seq_str( get_sequence( i ).get_sequence_as_string ) ==
361 Util.clean_seq_str( get_sequence( j ).get_sequence_as_string )
362 to_be_removed.add( j )
366 if get_sequence( i ).get_taxonomy != nil
367 tax_i = get_sequence( i ).get_taxonomy.get_name
369 if get_sequence( j ).get_taxonomy != nil
370 tax_j = get_sequence( j ).get_taxonomy.get_name
372 identical_pair = get_sequence( i ).get_name + " [#{tax_i}] == " + get_sequence( j ).get_name + " [#{tax_j}]"
373 @identical_seqs_detected.push( identical_pair )
383 to_be_removed_ary = to_be_removed.to_a.sort.reverse
385 to_be_removed_ary.each { | index |
386 removed.push( get_sequence( index ).get_name )
387 remove_sequence!( index )
393 def remove_sequences_by_non_gap_length!( min_non_gap_length )
394 n = get_number_of_seqs
398 seq = get_sequence( x )
399 if ( ( seq.get_length - seq.get_gap_length ) < min_non_gap_length )
400 if ( Evoruby::Constants::VERBOSE )
401 puts( "removed: " + seq.get_name )
403 removed << seq.get_name
404 remove_sequence!( x )
410 def trim!( first, last )
412 for i in 0 ... get_length()
413 if ( i < first || i > last )
417 remove_columns!( cols )
420 def get_gap_only_columns()
422 error_msg = "attempt to get gap only columns of unaligned msa"
423 raise StandardError, error_msg, caller
426 for c in 0 ... get_length
427 nogap_char_found = false
428 for s in 0 ... get_number_of_seqs
429 unless Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
430 nogap_char_found = true
434 unless nogap_char_found
441 def calculate_gap_proportion()
443 error_msg = "attempt to get gap only columns of unaligned msa"
444 raise StandardError, error_msg, caller
448 for c in 0 ... get_length
449 for s in 0 ... get_number_of_seqs
450 total_sum = total_sum + 1
451 if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
452 gap_sum = gap_sum + 1
457 return gap_sum / total_sum
460 def get_gap_columns()
462 error_msg = "attempt to get gap columns of unaligned msa"
463 raise StandardError, error_msg, caller
466 for c in 0 ... get_length
467 gap_char_found = false
468 for s in 0 ... get_number_of_seqs
469 if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
470 gap_char_found = true
481 # gap_ratio = seqs with gap / number of sequences
482 # returns column indices for which seqs with gap / number of sequences > gap_ratio
483 def get_gap_columns_w_gap_ratio( gap_ratio )
485 error_msg = "attempt to get gap columns with gap_ratio of unaligned msa"
486 raise StandardError, error_msg, caller
488 if ( gap_ratio < 0 || gap_ratio > 1 )
489 error_msg = "gap ratio must be between 0 and 1 inclusive"
490 raise ArgumentError, error_msg, caller
493 for c in 0 ... get_length
495 for s in 0 ... get_number_of_seqs
496 if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
500 if ( ( gap_chars_found.to_f / get_number_of_seqs ) > gap_ratio )
508 # Split an alignment into n alignemnts of equal size, except last one
509 def split( n, verbose = false )
510 if ( n < 2 || n > get_number_of_seqs )
511 error_msg = "attempt to split into less than two or more than the number of sequences"
512 raise StandardError, error_msg, caller
515 r = get_number_of_seqs % n
516 x = get_number_of_seqs / n
520 if ( ( r > 0 ) && ( i == ( n - 1 ) ) )
523 puts( i.to_s + ": " + y.to_s )
526 msa.add_sequence( get_sequence( ( i * x ) + j ) )
530 puts( i.to_s + ": " + x.to_s )
533 msa.add_sequence( get_sequence( ( i * x ) + j ) )
544 def get_overlaps( min_overlap = 1 )
546 error_msg = "attempt to get overlaps of unaligned msa"
547 raise StandardError, error_msg, caller
550 for i in 0 ... get_number_of_seqs()
552 for j in 0 ... bins.length
553 current_bin = bins[ j ]
554 # does seq i overlap with all seqs in current_bin?
556 for z in 0 ... current_bin.length
557 unless overlap?( i, current_bin[ z ], min_overlap )
563 current_bin.push( i )
568 new_bin = Array.new()
576 def remove_columns!( cols )
578 error_msg = "attempt to remove columns of unaligned msa"
579 raise StandardError, error_msg, caller
582 for c in 0 ... cols.length()
584 for s in 0 ... get_number_of_seqs()
585 get_sequence( s ).delete_residue!( col )