2 # = lib/evo/msa/msa.rb - Msa class
4 # Copyright:: Copyright (C) 2017 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # Last modified: 2017/02/07
9 require 'lib/evo/util/constants'
10 require 'lib/evo/util/util'
11 require 'lib/evo/sequence/sequence'
16 @sequences = Array.new
17 @identical_seqs_detected = Array.new
18 @name_to_seq_indices = Hash.new
19 @namestart_to_seq_indices = Hash.new
22 def add_sequence( sequence )
23 @sequences.push( sequence )
26 def add( name, molecular_sequence_str )
27 add_sequence( Sequence.new( name, molecular_sequence_str ) )
30 def get_sequence( index )
31 if ( index < 0 || index > get_number_of_seqs() - 1 )
32 error_msg = "attempt to get sequence " <<
33 index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
35 raise ArgumentError, error_msg
37 return @sequences[ index ]
40 def remove_sequence!( index )
41 if ( index < 0 || index > get_number_of_seqs() - 1 )
42 error_msg = "attempt to remove sequence " <<
43 index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
45 raise ArgumentError, error_msg
47 @name_to_seq_indices.clear
48 @namestart_to_seq_indices.clear
49 @sequences.delete_at( index )
52 def get_identical_seqs_detected
53 @identical_seqs_detected
57 if ( get_number_of_seqs < 1 )
60 l = @sequences[ 0 ].get_length()
61 for i in 0 ... get_number_of_seqs()
62 if ( get_sequence( i ).get_length() != l )
70 def find_by_name( name, case_sensitive, partial_match )
71 if case_sensitive && !partial_match && @name_to_seq_indices.has_key?( name )
72 return @name_to_seq_indices[ name ]
75 for i in 0 ... get_number_of_seqs()
76 current_name = get_sequence( i ).get_name()
77 if case_sensitive && !partial_match
78 if !@name_to_seq_indices.has_key?( current_name )
79 @name_to_seq_indices[ current_name ] = []
81 @name_to_seq_indices[ current_name ].push( i )
84 current_name = current_name.downcase
87 if current_name == name ||
88 ( partial_match && current_name.include?( name ) )
95 def find_by_name_pattern( name_re, avoid_similar_to = true )
97 for i in 0 ... get_number_of_seqs()
99 m = name_re.match( get_sequence( i ).get_name() )
100 if m && !m.pre_match.downcase.include?( "similar to " )
104 if name_re.match( get_sequence( i ).get_name() )
112 # throws ArgumentError
113 def get_by_name_pattern( name_re , avoid_similar_to = true )
114 indices = find_by_name_pattern( name_re, avoid_similar_to )
115 if ( indices.length > 1 )
116 error_msg = "pattern " + name_re.to_s + " not unique"
117 raise ArgumentError, error_msg
118 elsif ( indices.length < 1 )
119 error_msg = "pattern " + name_re.to_s + " not found"
120 raise ArgumentError, error_msg
122 get_sequence( indices[ 0 ] )
125 def find_by_name_start(name, case_sensitive = true)
126 if case_sensitive && @namestart_to_seq_indices.has_key?( name )
127 return @namestart_to_seq_indices[ name ]
130 for i in 0 ... get_number_of_seqs
131 get_sequence(i).get_name =~ /^\s*(\S+)/
134 unless @namestart_to_seq_indices.has_key?(current_name)
135 @namestart_to_seq_indices[current_name] = []
137 @namestart_to_seq_indices[current_name].push( i )
140 current_name = current_name.downcase
143 if current_name == name
150 def has?( name, case_sensitive = true, partial_match = false )
151 for i in 0 ... get_number_of_seqs()
152 current_name = get_sequence( i ).get_name()
154 current_name = current_name.downcase
157 if current_name == name ||
158 ( partial_match && current_name.include?( name ) )
165 # throws ArgumentError
166 def get_by_name( name, case_sensitive = true, partial_match = false )
167 indices = find_by_name( name, case_sensitive, partial_match )
168 if ( indices.length > 1 )
169 error_msg = "\"" + name + "\" not unique"
170 raise ArgumentError, error_msg
171 elsif ( indices.length < 1 )
172 error_msg = "\"" + name + "\" not found"
173 raise ArgumentError, error_msg
175 get_sequence( indices[ 0 ] )
178 # throws ArgumentError
179 def get_by_name_start( name, case_sensitive = true )
180 indices = find_by_name_start( name, case_sensitive )
181 if indices.length > 1
182 error_msg = "\"" + name + "\" not unique"
183 raise ArgumentError, error_msg
184 elsif indices.length < 1
185 error_msg = "\"" + name + "\" not found"
186 raise ArgumentError, error_msg
188 get_sequence( indices[ 0 ] )
191 def get_sub_alignment( seq_numbers )
193 for i in 0 ... seq_numbers.length()
194 msa.add_sequence( get_sequence( seq_numbers[ i ] ).copy() )
199 def get_number_of_seqs()
205 error_msg = "attempt to get length of unaligned msa"
206 raise StandardError, error_msg, caller
208 if get_number_of_seqs() < 1
211 @sequences[ 0 ].get_length()
221 for i in 0...get_number_of_seqs
222 s += @sequences[ i ].to_fasta + Constants::LINE_DELIMITER
227 def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 )
229 error_msg = "attempt to get overlap diagram of unaligned msa"
230 raise StandardError, error_msg, caller
232 for i in 0 ... get_number_of_seqs()
233 io.print( Util.normalize_seq_name( get_sequence( i ).get_name(), max_name_length ) )
234 for j in 0 ... get_number_of_seqs()
238 if overlap?( i, j, min_overlap )
245 io.print( Evoruby::Constants::LINE_DELIMITER )
249 #returns array of Msa with an overlap of min_overlap
250 def split_into_overlapping_msa( min_overlap = 1 )
252 error_msg = "attempt to split into overlapping msas of unaligned msa"
253 raise StandardError, error_msg, caller
256 bins = get_overlaps( min_overlap )
257 for i in 0 ... bins.length
258 msas.push( get_sub_alignment( bins[ i ] ) )
263 def overlap?( index_1, index_2, min_overlap = 1 )
264 seq_1 = get_sequence( index_1 )
265 seq_2 = get_sequence( index_2 )
267 for i in 0...seq_1.get_length()
268 if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
269 !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
271 if overlap_count >= min_overlap
279 def calculate_overlap( index_1, index_2 )
280 seq_1 = get_sequence( index_1 )
281 seq_2 = get_sequence( index_2 )
283 for i in 0...seq_1.get_length
284 if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
285 !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
292 def calculate_identities( index_1, index_2 )
293 seq_1 = get_sequence( index_1 )
294 seq_2 = get_sequence( index_2 )
296 for i in 0...seq_1.get_length
297 if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
298 !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) &&
299 seq_1.get_character_code( i ) != 63 &&
300 ( seq_1.get_residue( i ).downcase() ==
301 seq_2.get_residue( i ).downcase() )
302 identities_count += 1
308 def remove_gap_only_columns!()
309 remove_columns!( get_gap_only_columns() )
312 def remove_gap_columns!()
313 remove_columns!( get_gap_columns() )
316 # removes columns for which seqs with gap / number of sequences > gap_ratio
317 def remove_gap_columns_w_gap_ratio!( gap_ratio )
318 remove_columns!( get_gap_columns_w_gap_ratio( gap_ratio ) )
321 def remove_sequences_by_gap_ratio!( gap_ratio )
323 error_msg = "attempt to remove sequences by gap ratio on unaligned msa"
324 raise StandardError, error_msg, caller
326 n = get_number_of_seqs
329 if ( get_sequence( ( n - 1 ) - s ).get_gap_ratio() > gap_ratio )
330 if ( Evoruby::Constants::VERBOSE )
331 puts( "removed: " + get_sequence( ( n - 1 ) - s ).get_name )
333 removed << get_sequence( ( n - 1 ) - s ).get_name
334 remove_sequence!( ( n - 1 ) - s )
340 def remove_redundant_sequences!( consider_taxonomy = false, verbose = false )
341 n = get_number_of_seqs
343 to_be_removed = Set.new
344 @identical_seqs_detected = Array.new
345 for i in 0 ... ( n - 1 )
346 for j in ( i + 1 ) ... n
347 if !to_be_removed.include?( i ) && !to_be_removed.include?( j )
348 if !consider_taxonomy ||
349 ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) ||
350 ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) )
351 if Util.clean_seq_str( get_sequence( i ).get_sequence_as_string ) ==
352 Util.clean_seq_str( get_sequence( j ).get_sequence_as_string )
353 to_be_removed.add( j )
357 if get_sequence( i ).get_taxonomy != nil
358 tax_i = get_sequence( i ).get_taxonomy.get_name
360 if get_sequence( j ).get_taxonomy != nil
361 tax_j = get_sequence( j ).get_taxonomy.get_name
363 identical_pair = get_sequence( i ).get_name + " [#{tax_i}] == " + get_sequence( j ).get_name + " [#{tax_j}]"
364 @identical_seqs_detected.push( identical_pair )
374 to_be_removed_ary = to_be_removed.to_a.sort.reverse
376 to_be_removed_ary.each { | index |
377 removed.push( get_sequence( index ).get_name )
378 remove_sequence!( index )
383 def remove_sequences_by_non_gap_length!( min_non_gap_length )
384 n = get_number_of_seqs
388 seq = get_sequence( x )
389 if ( ( seq.get_length - seq.get_gap_length ) < min_non_gap_length )
390 if ( Evoruby::Constants::VERBOSE )
391 puts( "removed: " + seq.get_name )
393 removed << seq.get_name
394 remove_sequence!( x )
400 def trim!( first, last )
402 for i in 0 ... get_length()
403 if ( i < first || i > last )
407 remove_columns!( cols )
410 def get_gap_only_columns()
412 error_msg = "attempt to get gap only columns of unaligned msa"
413 raise StandardError, error_msg, caller
416 for c in 0 ... get_length
417 nogap_char_found = false
418 for s in 0 ... get_number_of_seqs
419 unless Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
420 nogap_char_found = true
424 unless nogap_char_found
431 def calculate_gap_proportion()
433 error_msg = "attempt to get gap only columns of unaligned msa"
434 raise StandardError, error_msg, caller
438 for c in 0 ... get_length
439 for s in 0 ... get_number_of_seqs
440 total_sum = total_sum + 1
441 if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
442 gap_sum = gap_sum + 1
447 return gap_sum / total_sum
450 def get_gap_columns()
452 error_msg = "attempt to get gap columns of unaligned msa"
453 raise StandardError, error_msg, caller
456 for c in 0 ... get_length
457 gap_char_found = false
458 for s in 0 ... get_number_of_seqs
459 if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
460 gap_char_found = true
471 # gap_ratio = seqs with gap / number of sequences
472 # returns column indices for which seqs with gap / number of sequences > gap_ratio
473 def get_gap_columns_w_gap_ratio( gap_ratio )
475 error_msg = "attempt to get gap columns with gap_ratio of unaligned msa"
476 raise StandardError, error_msg, caller
478 if ( gap_ratio < 0 || gap_ratio > 1 )
479 error_msg = "gap ratio must be between 0 and 1 inclusive"
480 raise ArgumentError, error_msg, caller
483 for c in 0 ... get_length
485 for s in 0 ... get_number_of_seqs
486 if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
490 if ( ( gap_chars_found.to_f / get_number_of_seqs ) > gap_ratio )
497 # Split an alignment into n alignemnts of equal size, except last one
498 def split( n, verbose = false )
499 if ( n < 2 || n > get_number_of_seqs )
500 error_msg = "attempt to split into less than two or more than the number of sequences"
501 raise StandardError, error_msg, caller
504 r = get_number_of_seqs % n
505 x = get_number_of_seqs / n
509 if ( ( r > 0 ) && ( i == ( n - 1 ) ) )
512 puts( i.to_s + ": " + y.to_s )
515 msa.add_sequence( get_sequence( ( i * x ) + j ) )
519 puts( i.to_s + ": " + x.to_s )
522 msa.add_sequence( get_sequence( ( i * x ) + j ) )
532 def get_overlaps( min_overlap = 1 )
534 error_msg = "attempt to get overlaps of unaligned msa"
535 raise StandardError, error_msg, caller
538 for i in 0 ... get_number_of_seqs()
540 for j in 0 ... bins.length
541 current_bin = bins[ j ]
542 # does seq i overlap with all seqs in current_bin?
544 for z in 0 ... current_bin.length
545 unless overlap?( i, current_bin[ z ], min_overlap )
551 current_bin.push( i )
556 new_bin = Array.new()
564 def remove_columns!( cols )
566 error_msg = "attempt to remove columns of unaligned msa"
567 raise StandardError, error_msg, caller
570 for c in 0 ... cols.length()
572 for s in 0 ... get_number_of_seqs()
573 get_sequence( s ).delete_residue!( col )