added calculation of gap-proportion
[jalview.git] / forester / ruby / evoruby / lib / evo / msa / msa.rb
1 #
2 # = lib/evo/msa/msa.rb - Msa class
3 #
4 # Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: msa.rb,v 1.11 2009/01/03 00:42:08 cmzmasek Exp $
8 #
9
10
11 require 'lib/evo/util/constants'
12 require 'lib/evo/util/util'
13 require 'lib/evo/sequence/sequence'
14
15 module Evoruby
16
17     class Msa
18
19         def initialize()
20             @sequences = Array.new()
21         end
22
23
24         def add_sequence( sequence )
25             @sequences.push( sequence )
26         end
27
28         def add( name, molecular_sequence_str )
29             add_sequence( Sequence.new( name, molecular_sequence_str ) )
30         end
31
32         def get_sequence( index )
33             if ( index < 0 || index > get_number_of_seqs() - 1 )
34                 error_msg = "attempt to get sequence " <<
35                  index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
36                  " sequences"
37                 raise ArgumentError, error_msg
38             end
39             return @sequences[ index ]
40         end
41
42         def remove_sequence!( index )
43             if ( index < 0 || index > get_number_of_seqs() - 1 )
44                 error_msg = "attempt to remove sequence " <<
45                  index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
46                  " sequences"
47                 raise ArgumentError, error_msg
48             end
49             @sequences.delete_at( index )
50         end
51
52         def is_aligned()
53             if ( get_number_of_seqs < 1 )
54                 return false
55             else
56                 l = @sequences[ 0 ].get_length()
57                 for i in 0 ... get_number_of_seqs()
58                     if ( get_sequence( i ).get_length() != l )
59                         return false
60                     end
61                 end
62             end
63             return true
64         end
65
66         def find_by_name( name, case_sensitive, partial_match )
67             indices = Array.new()
68             for i in 0 ... get_number_of_seqs()
69                 current_name = get_sequence( i ).get_name()
70                 if !case_sensitive
71                     current_name = current_name.downcase
72                     name = name.downcase
73                 end
74                 if current_name == name ||
75                      ( partial_match && current_name.include?( name ) )
76                     indices.push( i )
77                 end
78             end
79             indices
80         end
81
82         def find_by_name_start( name, case_sensitive )
83             indices = Array.new()
84             for i in 0 ... get_number_of_seqs()
85                 get_sequence( i ).get_name() =~ /^\s*(\S+)/
86                 current_name = $1
87                 if !case_sensitive
88                     current_name = current_name.downcase
89                     name = name.downcase
90                 end
91                 if  ( current_name == name )
92                     indices.push( i )
93                 end
94             end
95             indices
96         end
97
98         def has?( name, case_sensitive = true, partial_match = false )
99             for i in 0 ... get_number_of_seqs()
100                 current_name = get_sequence( i ).get_name()
101                 if !case_sensitive
102                     current_name = current_name.downcase
103                     name = name.downcase
104                 end
105                 if current_name == name ||
106                      ( partial_match && current_name.include?( name ) )
107                     return true
108                 end
109             end
110             false
111         end
112
113         # throws ArgumentError
114         def get_by_name( name, case_sensitive = true, partial_match = false )
115             indices = find_by_name( name, case_sensitive, partial_match )
116             if ( indices.length > 1 )
117                 error_msg = "\"" + name + "\" not unique"
118                 raise ArgumentError, error_msg
119             elsif ( indices.length < 1 )
120                 error_msg = "\"" + name + "\" not found"
121                 raise ArgumentError, error_msg
122             end
123             get_sequence( indices[ 0 ] )
124         end
125
126         # throws ArgumentError
127         def get_by_name_start( name, case_sensitive = true )
128             indices = find_by_name_start( name, case_sensitive )
129             if ( indices.length > 1 )
130                 error_msg = "\"" + name + "\" not unique"
131                 raise ArgumentError, error_msg
132             elsif ( indices.length < 1 )
133                 error_msg = "\"" + name + "\" not found"
134                 raise ArgumentError, error_msg
135             end
136             get_sequence( indices[ 0 ] )
137         end
138
139
140         def get_sub_alignment( seq_numbers )
141             msa = Msa.new()
142             for i in 0 ... seq_numbers.length()
143                 msa.add_sequence( get_sequence( seq_numbers[ i ] ).copy() )
144             end
145             return msa
146         end
147
148         def get_number_of_seqs()
149             @sequences.length
150         end
151
152         def get_length()
153             if ( !is_aligned() )
154                 error_msg = "attempt to get length of unaligned msa"
155                 raise StandardError, error_msg, caller
156             end
157             if ( get_number_of_seqs() < 1 )
158                 -1
159             else
160                 @sequences[ 0 ].get_length()
161             end
162         end
163
164         def to_str()
165             s = String.new()
166             for i in 0...get_number_of_seqs()
167                 s += @sequences[ i ].to_str + Constants::LINE_DELIMITER
168             end
169             s
170         end
171
172         def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 )
173             if ( !is_aligned() )
174                 error_msg = "attempt to get overlap diagram of unaligned msa"
175                 raise StandardError, error_msg, caller
176             end
177             for i in 0 ... get_number_of_seqs()
178                 io.print( Util.normalize_seq_name( get_sequence( i ).get_name(), max_name_length ) )
179                 for j in 0 ... get_number_of_seqs()
180                     if i == j
181                         io.print( " " )
182                     else
183                         if overlap?( i, j, min_overlap )
184                             io.print( "+" )
185                         else
186                             io.print( "-" )
187                         end
188                     end
189                 end
190                 io.print( Evoruby::Constants::LINE_DELIMITER )
191             end
192         end
193
194         #returns array of Msa with an overlap of min_overlap
195         def split_into_overlapping_msa( min_overlap = 1 )
196             if ( !is_aligned() )
197                 error_msg = "attempt to split into overlapping msas of unaligned msa"
198                 raise StandardError, error_msg, caller
199             end
200             msas = Array.new()
201             bins = get_overlaps( min_overlap )
202             for i in 0 ... bins.length
203                 msas.push( get_sub_alignment( bins[ i ] ) )
204             end
205             msas
206         end
207
208         def overlap?( index_1, index_2, min_overlap = 1 )
209             seq_1 = get_sequence( index_1 )
210             seq_2 = get_sequence( index_2 )
211             overlap_count = 0
212             for i in 0...seq_1.get_length()
213                 if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
214                      !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
215                     overlap_count += 1
216                     if overlap_count >= min_overlap
217                         return true
218                     end
219                 end
220             end
221             return false
222         end
223
224         def calculate_overlap( index_1, index_2 )
225             seq_1 = get_sequence( index_1 )
226             seq_2 = get_sequence( index_2 )
227             overlap_count = 0
228             for i in 0...seq_1.get_length
229                 if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
230                      !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
231                     overlap_count += 1
232                 end
233             end
234             overlap_count
235         end
236
237         def calculate_identities( index_1, index_2 )
238             seq_1 = get_sequence( index_1 )
239             seq_2 = get_sequence( index_2 )
240             identities_count = 0
241             for i in 0...seq_1.get_length
242                 if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
243                      !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) &&
244                      seq_1.get_character_code( i ) != 63 &&
245                      ( seq_1.get_residue( i ).downcase() ==
246                          seq_2.get_residue( i ).downcase() )
247                     identities_count += 1
248                 end
249             end
250             identities_count
251         end
252
253         def remove_gap_only_columns!()
254             remove_columns!( get_gap_only_columns() )
255         end
256
257         def remove_gap_columns!()
258             remove_columns!( get_gap_columns() )
259         end
260
261         # removes columns for which seqs with gap / number of sequences > gap_ratio
262         def remove_gap_columns_w_gap_ratio!( gap_ratio )
263             remove_columns!( get_gap_columns_w_gap_ratio( gap_ratio ) )
264         end
265
266
267         def remove_sequences_by_gap_ratio!( gap_ratio )
268             if ( !is_aligned() )
269                 error_msg = "attempt to remove sequences by gap ratio on unaligned msa"
270                 raise StandardError, error_msg, caller
271             end
272             n = get_number_of_seqs
273             removed = Array.new
274             for s in 0 ... n
275                 if ( get_sequence( ( n - 1 ) - s  ).get_gap_ratio() > gap_ratio )
276                     if ( Evoruby::Constants::VERBOSE )
277                         puts( "removed: " + get_sequence( ( n - 1 ) - s  ).get_name )
278                     end
279                     removed << get_sequence( ( n - 1 ) - s  ).get_name
280                     remove_sequence!( ( n - 1 ) - s  )
281                 end
282             end
283             removed
284         end
285
286
287         def remove_redundant_sequences!( consider_taxonomy = false, verbose = false )
288             n = get_number_of_seqs
289             removed = Array.new
290             to_be_removed = Set.new
291             for i in 0 ... ( n - 1 )
292                 for j in ( i + 1 ) ... n
293                     if !to_be_removed.include?( i ) && !to_be_removed.include?( j )
294                         if  !consider_taxonomy ||
295                              ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) ||
296                                  ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) )
297                             if Util.clean_seq_str( get_sequence( i ).get_sequence_as_string ) ==
298                                  Util.clean_seq_str( get_sequence( j ).get_sequence_as_string )
299                                 to_be_removed.add( j )
300                                 if verbose
301                                     tax_i = ""
302                                     tax_j = ""
303                                     if get_sequence( i ).get_taxonomy != nil
304                                         tax_i = get_sequence( i ).get_taxonomy.get_name
305                                     end
306                                     if get_sequence( j ).get_taxonomy != nil
307                                         tax_j = get_sequence( j ).get_taxonomy.get_name
308                                     end
309                                     puts get_sequence( i ).get_name + " [#{tax_i}] == " + get_sequence( j ).get_name + " [#{tax_j}]"
310                                 end
311                             end
312                         end
313                     end
314                 end
315             end
316             to_be_removed_ary = to_be_removed.to_a.sort.reverse
317
318             to_be_removed_ary.each { | index |
319                 removed.push( get_sequence( index ).get_name )
320                 remove_sequence!( index )
321             }
322             removed
323         end
324
325
326         def remove_sequences_by_non_gap_length!( min_non_gap_length )
327             if ( !is_aligned() )
328                 error_msg = "attempt to remove sequences by non gap length on unaligned msa"
329                 raise StandardError, error_msg, caller
330             end
331             n = get_number_of_seqs
332             l = get_length
333             removed = Array.new
334             for s in 0 ... n
335                 if ( ( l - get_sequence( ( n - 1 ) - s ).get_gap_length ) < min_non_gap_length )
336                     if ( Evoruby::Constants::VERBOSE )
337                         puts( "removed: " + get_sequence( ( n - 1 ) - s  ).get_name )
338                     end
339                     removed << get_sequence( ( n - 1 ) - s  ).get_name
340                     remove_sequence!( ( n - 1 ) - s )
341                 end
342             end
343             removed
344         end
345
346         def trim!( first, last )
347             cols = Array.new()
348             for i in 0 ... get_length()
349                 if ( i < first || i > last )
350                     cols.push( i )
351                 end
352             end
353             remove_columns!( cols )
354         end
355
356         def get_gap_only_columns()
357             if ( !is_aligned() )
358                 error_msg = "attempt to get gap only columns of unaligned msa"
359                 raise StandardError, error_msg, caller
360             end
361             cols = Array.new()
362             for c in 0 ... get_length
363                 nogap_char_found = false
364                 for s in 0 ... get_number_of_seqs
365                     unless Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
366                         nogap_char_found = true
367                         break
368                     end
369                 end
370                 unless nogap_char_found
371                     cols.push( c )
372                 end
373             end
374             return cols
375         end
376         
377         def calculate_gap_proportion()
378             if ( !is_aligned() )
379                 error_msg = "attempt to get gap only columns of unaligned msa"
380                 raise StandardError, error_msg, caller
381             end
382             total_sum = 0.0
383             gap_sum = 0.0
384             for c in 0 ... get_length
385                 for s in 0 ... get_number_of_seqs
386                     total_sum = total_sum + 1
387                     if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
388                         gap_sum = gap_sum  + 1
389                     end
390                 end
391                
392             end
393             return gap_sum / total_sum
394         end
395
396         def get_gap_columns()
397             if ( !is_aligned() )
398                 error_msg = "attempt to get gap columns of unaligned msa"
399                 raise StandardError, error_msg, caller
400             end
401             cols = Array.new()
402             for c in 0 ... get_length
403                 gap_char_found = false
404                 for s in 0 ... get_number_of_seqs
405                     if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
406                         gap_char_found = true
407                         break
408                     end
409                 end
410                 if gap_char_found
411                     cols.push( c )
412                 end
413             end
414             return cols
415         end
416
417         # gap_ratio = seqs with gap / number of sequences
418         # returns column indices for which seqs with gap / number of sequences > gap_ratio
419         def get_gap_columns_w_gap_ratio( gap_ratio )
420             if ( !is_aligned() )
421                 error_msg = "attempt to get gap columns with gap_ratio of unaligned msa"
422                 raise StandardError, error_msg, caller
423             end
424             if ( gap_ratio < 0 || gap_ratio > 1 )
425                 error_msg = "gap ratio must be between 0 and 1 inclusive"
426                 raise ArgumentError, error_msg, caller
427             end
428             cols = Array.new()
429             for c in 0 ... get_length
430                 gap_chars_found = 0
431                 for s in 0 ... get_number_of_seqs
432                     if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
433                         gap_chars_found += 1
434                     end
435                 end
436                 if ( ( gap_chars_found.to_f / get_number_of_seqs ) > gap_ratio )
437                     cols.push( c )
438                 end
439             end
440             return cols
441         end
442
443
444         # Split an alignment into n alignemnts of equal size, except last one
445         def split( n, verbose = false )
446             if ( n < 2 || n > get_number_of_seqs )
447                 error_msg = "attempt to split into less than two or more than the number of sequences"
448                 raise StandardError, error_msg, caller
449             end
450             msas = Array.new()
451             r = get_number_of_seqs % n
452             x = get_number_of_seqs / n
453             for i in 0 ... n
454                 msa = Msa.new()
455                 s = 0
456
457                 if ( ( r > 0 ) && ( i == ( n - 1 ) ) )
458                     y = x + r
459                     if ( verbose )
460                         puts( i.to_s + ": " + y.to_s )
461                     end
462                     for j in 0 ... y
463                         msa.add_sequence( get_sequence( ( i * x ) + j ) )
464                     end
465                 else
466                     if ( verbose )
467                         puts( i.to_s + ": " + x.to_s )
468                     end
469                     for j in 0 ... x
470                         msa.add_sequence( get_sequence( ( i * x ) + j ) )
471                     end
472                 end
473                 msas.push( msa )
474             end
475             msas
476         end
477
478
479         private
480
481         def get_overlaps( min_overlap = 1 )
482             if ( !is_aligned() )
483                 error_msg = "attempt to get overlaps of unaligned msa"
484                 raise StandardError, error_msg, caller
485             end
486             bins = Array.new()
487             for i in 0 ... get_number_of_seqs()
488                 found_bin = false
489                 for j in 0 ... bins.length
490                     current_bin = bins[ j ]
491                     # does seq i overlap with all seqs in current_bin?
492                     all_overlap = true
493                     for z in 0 ... current_bin.length
494                         unless overlap?( i, current_bin[ z ], min_overlap )
495                             all_overlap = false
496                             break
497                         end
498                     end
499                     if all_overlap
500                         current_bin.push( i )
501                         found_bin = true
502                     end
503                 end
504                 unless found_bin
505                     new_bin = Array.new()
506                     new_bin.push( i )
507                     bins.push( new_bin )
508                 end
509             end
510             return bins
511         end
512
513         def remove_columns!( cols )
514             if ( !is_aligned() )
515                 error_msg = "attempt to remove columns of unaligned msa"
516                 raise StandardError, error_msg, caller
517             end
518             cols.reverse!()
519             for c in 0 ... cols.length()
520                 col = cols[ c ]
521                 for s in 0 ... get_number_of_seqs()
522                     get_sequence( s ).delete_residue!( col )
523                 end
524             end
525             return self
526         end
527
528
529     end # class Msa
530
531 end # module Evoruby
532