2 # = lib/evo/msa/msa.rb - Msa class
4 # Copyright:: Copyright (C) 2017 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # Last modified: 2017/03/13
9 require 'lib/evo/util/constants'
10 require 'lib/evo/util/util'
11 require 'lib/evo/sequence/sequence'
16 @sequences = Array.new
17 @identical_seqs_detected = Array.new
18 @name_to_seq_indices = Hash.new
19 @namestart_to_seq_indices = Hash.new
31 def add_sequence( sequence )
32 @sequences.push( sequence )
35 def add( name, molecular_sequence_str )
36 add_sequence( Sequence.new( name, molecular_sequence_str ) )
39 def get_sequence( index )
40 if ( index < 0 || index > get_number_of_seqs() - 1 )
41 error_msg = "attempt to get sequence " <<
42 index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
44 raise ArgumentError, error_msg
46 return @sequences[ index ]
49 def remove_sequence!( index )
50 if ( index < 0 || index > get_number_of_seqs() - 1 )
51 error_msg = "attempt to remove sequence " <<
52 index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
54 raise ArgumentError, error_msg
56 @name_to_seq_indices.clear
57 @namestart_to_seq_indices.clear
58 @sequences.delete_at( index )
61 def get_identical_seqs_detected
62 @identical_seqs_detected
66 if ( get_number_of_seqs < 1 )
69 l = @sequences[ 0 ].get_length()
70 for i in 0 ... get_number_of_seqs()
71 if ( get_sequence( i ).get_length() != l )
79 def find_by_name( name, case_sensitive, partial_match )
80 if case_sensitive && !partial_match && @name_to_seq_indices.has_key?( name )
81 return @name_to_seq_indices[ name ]
84 for i in 0 ... get_number_of_seqs()
85 current_name = get_sequence( i ).get_name()
86 if case_sensitive && !partial_match
87 if !@name_to_seq_indices.has_key?( current_name )
88 @name_to_seq_indices[ current_name ] = []
90 @name_to_seq_indices[ current_name ].push( i )
93 current_name = current_name.downcase
96 if current_name == name ||
97 ( partial_match && current_name.include?( name ) )
104 def find_by_name_pattern( name_re, avoid_similar_to = true )
106 for i in 0 ... get_number_of_seqs()
108 m = name_re.match( get_sequence( i ).get_name() )
109 if m && !m.pre_match.downcase.include?( "similar to " )
113 if name_re.match( get_sequence( i ).get_name() )
121 # throws ArgumentError
122 def get_by_name_pattern( name_re , avoid_similar_to = true )
123 indices = find_by_name_pattern( name_re, avoid_similar_to )
124 if ( indices.length > 1 )
125 error_msg = "pattern " + name_re.to_s + " not unique"
126 raise ArgumentError, error_msg
127 elsif ( indices.length < 1 )
128 error_msg = "pattern " + name_re.to_s + " not found"
129 raise ArgumentError, error_msg
131 get_sequence( indices[ 0 ] )
134 def find_by_name_start(name, case_sensitive = true)
135 if case_sensitive && @namestart_to_seq_indices.has_key?( name )
136 return @namestart_to_seq_indices[ name ]
139 for i in 0 ... get_number_of_seqs
140 get_sequence(i).get_name =~ /^\s*(\S+)/
143 unless @namestart_to_seq_indices.has_key?(current_name)
144 @namestart_to_seq_indices[current_name] = []
146 @namestart_to_seq_indices[current_name].push( i )
149 current_name = current_name.downcase
152 if current_name == name
159 def has?( name, case_sensitive = true, partial_match = false )
160 for i in 0 ... get_number_of_seqs()
161 current_name = get_sequence( i ).get_name()
163 current_name = current_name.downcase
166 if current_name == name ||
167 ( partial_match && current_name.include?( name ) )
174 # throws ArgumentError
175 def get_by_name( name, case_sensitive = true, partial_match = false )
176 indices = find_by_name( name, case_sensitive, partial_match )
177 if ( indices.length > 1 )
178 error_msg = "\"" + name + "\" not unique"
179 raise ArgumentError, error_msg
180 elsif ( indices.length < 1 )
181 error_msg = "\"" + name + "\" not found"
182 raise ArgumentError, error_msg
184 get_sequence( indices[ 0 ] )
187 # throws ArgumentError
188 def get_by_name_start( name, case_sensitive = true )
189 indices = find_by_name_start( name, case_sensitive )
190 if indices.length > 1
191 error_msg = "\"" + name + "\" not unique"
192 raise ArgumentError, error_msg
193 elsif indices.length < 1
194 error_msg = "\"" + name + "\" not found"
195 raise ArgumentError, error_msg
197 get_sequence( indices[ 0 ] )
200 def get_sub_alignment( seq_numbers )
202 for i in 0 ... seq_numbers.length()
203 msa.add_sequence( get_sequence( seq_numbers[ i ] ).copy() )
208 def get_number_of_seqs()
214 error_msg = "attempt to get length of unaligned msa"
215 raise StandardError, error_msg, caller
217 if get_number_of_seqs() < 1
220 @sequences[ 0 ].get_length()
230 for i in 0...get_number_of_seqs
231 s += @sequences[ i ].to_fasta + Constants::LINE_DELIMITER
236 def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 )
238 error_msg = "attempt to get overlap diagram of unaligned msa"
239 raise StandardError, error_msg, caller
241 for i in 0 ... get_number_of_seqs()
242 io.print( Util.normalize_seq_name( get_sequence( i ).get_name(), max_name_length ) )
243 for j in 0 ... get_number_of_seqs()
247 if overlap?( i, j, min_overlap )
254 io.print( Evoruby::Constants::LINE_DELIMITER )
258 #returns array of Msa with an overlap of min_overlap
259 def split_into_overlapping_msa( min_overlap = 1 )
261 error_msg = "attempt to split into overlapping msas of unaligned msa"
262 raise StandardError, error_msg, caller
265 bins = get_overlaps( min_overlap )
266 for i in 0 ... bins.length
267 msas.push( get_sub_alignment( bins[ i ] ) )
272 def overlap?( index_1, index_2, min_overlap = 1 )
273 seq_1 = get_sequence( index_1 )
274 seq_2 = get_sequence( index_2 )
276 for i in 0...seq_1.get_length()
277 if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
278 !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
280 if overlap_count >= min_overlap
288 def calculate_overlap( index_1, index_2 )
289 seq_1 = get_sequence( index_1 )
290 seq_2 = get_sequence( index_2 )
292 for i in 0...seq_1.get_length
293 if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
294 !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
301 def calculate_identities( index_1, index_2 )
302 seq_1 = get_sequence( index_1 )
303 seq_2 = get_sequence( index_2 )
305 for i in 0...seq_1.get_length
306 if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
307 !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) &&
308 seq_1.get_character_code( i ) != 63 &&
309 ( seq_1.get_residue( i ).downcase() ==
310 seq_2.get_residue( i ).downcase() )
311 identities_count += 1
317 def remove_gap_only_columns!()
318 remove_columns!( get_gap_only_columns() )
321 def remove_gap_columns!()
322 remove_columns!( get_gap_columns() )
325 # removes columns for which seqs with gap / number of sequences > gap_ratio
326 def remove_gap_columns_w_gap_ratio!( gap_ratio )
327 remove_columns!( get_gap_columns_w_gap_ratio( gap_ratio ) )
330 def remove_sequences_by_gap_ratio!( gap_ratio )
332 error_msg = "attempt to remove sequences by gap ratio on unaligned msa"
333 raise StandardError, error_msg, caller
335 n = get_number_of_seqs
338 if ( get_sequence( ( n - 1 ) - s ).get_gap_ratio() > gap_ratio )
339 if ( Evoruby::Constants::VERBOSE )
340 puts( "removed: " + get_sequence( ( n - 1 ) - s ).get_name )
342 removed << get_sequence( ( n - 1 ) - s ).get_name
343 remove_sequence!( ( n - 1 ) - s )
349 def remove_redundant_sequences!( consider_taxonomy = false, verbose = false )
350 n = get_number_of_seqs
352 to_be_removed = Set.new
353 @identical_seqs_detected = Array.new
354 for i in 0 ... ( n - 1 )
355 for j in ( i + 1 ) ... n
356 if !to_be_removed.include?( i ) && !to_be_removed.include?( j )
357 if !consider_taxonomy ||
358 ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) ||
359 ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) )
360 if Util.clean_seq_str( get_sequence( i ).get_sequence_as_string ) ==
361 Util.clean_seq_str( get_sequence( j ).get_sequence_as_string )
362 to_be_removed.add( j )
366 if get_sequence( i ).get_taxonomy != nil
367 tax_i = get_sequence( i ).get_taxonomy.get_name
369 if get_sequence( j ).get_taxonomy != nil
370 tax_j = get_sequence( j ).get_taxonomy.get_name
372 identical_pair = get_sequence( i ).get_name + " [#{tax_i}] == " + get_sequence( j ).get_name + " [#{tax_j}]"
373 @identical_seqs_detected.push( identical_pair )
383 to_be_removed_ary = to_be_removed.to_a.sort.reverse
385 to_be_removed_ary.each { | index |
386 removed.push( get_sequence( index ).get_name )
387 remove_sequence!( index )
392 def remove_sequences_by_non_gap_length!( min_non_gap_length )
393 n = get_number_of_seqs
397 seq = get_sequence( x )
398 if ( ( seq.get_length - seq.get_gap_length ) < min_non_gap_length )
399 if ( Evoruby::Constants::VERBOSE )
400 puts( "removed: " + seq.get_name )
402 removed << seq.get_name
403 remove_sequence!( x )
409 def trim!( first, last, name_suffix = nil )
411 for i in 0 ... get_length()
412 if ( i < first || i > last )
416 remove_columns!( cols )
417 if name_suffix != nil
418 n = get_number_of_seqs
420 seq = get_sequence( s )
421 seq.set_name( seq.get_name() + name_suffix );
426 def extract( first, last )
428 error_msg = "attempt to extract from unaligned msa"
429 raise StandardError, error_msg, caller
432 error_msg = "first < 0"
433 raise StandardError, error_msg, caller
435 if last >= get_length()
436 error_msg = "last > length"
437 raise StandardError, error_msg, caller
440 error_msg = "first >= last"
441 raise StandardError, error_msg, caller
444 for i in 0 ... get_number_of_seqs
445 msa.add_sequence( get_sequence( i ).get_subsequence( first, last ) )
450 def sliding_extraction( step, size )
455 first = counter * step
456 last = first + size - 1
457 if last > get_length() - 1
458 last = get_length() - 1
463 res = extract( first, last)
464 res.set_name(first.to_s + "-" + last.to_s)
471 def get_gap_only_columns()
473 error_msg = "attempt to get gap only columns of unaligned msa"
474 raise StandardError, error_msg, caller
477 for c in 0 ... get_length
478 nogap_char_found = false
479 for s in 0 ... get_number_of_seqs
480 unless Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
481 nogap_char_found = true
485 unless nogap_char_found
492 def calculate_gap_proportion()
494 error_msg = "attempt to get gap only columns of unaligned msa"
495 raise StandardError, error_msg, caller
499 for c in 0 ... get_length
500 for s in 0 ... get_number_of_seqs
501 total_sum = total_sum + 1
502 if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
503 gap_sum = gap_sum + 1
508 return gap_sum / total_sum
511 def get_gap_columns()
513 error_msg = "attempt to get gap columns of unaligned msa"
514 raise StandardError, error_msg, caller
517 for c in 0 ... get_length
518 gap_char_found = false
519 for s in 0 ... get_number_of_seqs
520 if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
521 gap_char_found = true
532 # gap_ratio = seqs with gap / number of sequences
533 # returns column indices for which seqs with gap / number of sequences > gap_ratio
534 def get_gap_columns_w_gap_ratio( gap_ratio )
536 error_msg = "attempt to get gap columns with gap_ratio of unaligned msa"
537 raise StandardError, error_msg, caller
539 if ( gap_ratio < 0 || gap_ratio > 1 )
540 error_msg = "gap ratio must be between 0 and 1 inclusive"
541 raise ArgumentError, error_msg, caller
544 for c in 0 ... get_length
546 for s in 0 ... get_number_of_seqs
547 if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
551 if ( ( gap_chars_found.to_f / get_number_of_seqs ) > gap_ratio )
558 # Split an alignment into n alignmnts of equal size, except last one
559 def split( n, verbose = false )
560 if ( n < 2 || n > get_number_of_seqs )
561 error_msg = "attempt to split into less than two or more than the number of sequences"
562 raise StandardError, error_msg, caller
565 r = get_number_of_seqs % n
566 x = get_number_of_seqs / n
570 if ( ( r > 0 ) && ( i == ( n - 1 ) ) )
573 puts( i.to_s + ": " + y.to_s )
576 msa.add_sequence( get_sequence( ( i * x ) + j ) )
580 puts( i.to_s + ": " + x.to_s )
583 msa.add_sequence( get_sequence( ( i * x ) + j ) )
591 def split_by_os(verbose = false)
592 msa_hash = Hash.new()
593 for i in 0 ... get_number_of_seqs()
594 seq = get_sequence(i)
595 name = seq.get_name()
596 # >sp|Q1HVE7|AN_EBVA8 Shutoff alkaline exonuclease OS=Epstein-Barr virus (strain AG876) GN=BGLF5 PE=3 SV=1
597 # if name =~ /OS=(.+?)\s+[A-Z]{2}=/
598 if name =~ /Organism:(.+?)(\|Protein|$)/
600 unless msa_hash.has_key?(os)
601 msa_hash[os] = Msa.new
603 msa_hash[os].add_sequence seq
605 error_msg = "sequence name \"" + name + "\" is not in the expected format for splitting by OS"
606 raise IOError, error_msg, caller
609 msa_hash = msa_hash.sort{|a, b|a<=>b}.to_h
612 msa_hash.each do |o, msa|
614 puts c.to_s + ': ' + o
622 def get_overlaps( min_overlap = 1 )
624 error_msg = "attempt to get overlaps of unaligned msa"
625 raise StandardError, error_msg, caller
628 for i in 0 ... get_number_of_seqs()
630 for j in 0 ... bins.length
631 current_bin = bins[ j ]
632 # does seq i overlap with all seqs in current_bin?
634 for z in 0 ... current_bin.length
635 unless overlap?( i, current_bin[ z ], min_overlap )
641 current_bin.push( i )
646 new_bin = Array.new()
654 def remove_columns!( cols )
656 error_msg = "attempt to remove columns of unaligned msa"
657 raise StandardError, error_msg, caller
660 for c in 0 ... cols.length()
662 for s in 0 ... get_number_of_seqs()
663 get_sequence( s ).delete_residue!( col )