forester/ruby/evoruby/lib/evo/msa/msa.rb

   1 #
   2 # = lib/evo/msa/msa.rb - Msa class
   3 #
   4 # Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
   5 # License::    GNU Lesser General Public License (LGPL)
   6 #
   7 # $Id: msa.rb,v 1.11 2009/01/03 00:42:08 cmzmasek Exp $
   8 #
   9
  10
  11 require 'lib/evo/util/constants'
  12 require 'lib/evo/util/util'
  13 require 'lib/evo/sequence/sequence'
  14
  15 module Evoruby
  16
  17   class Msa
  18
  19     def initialize()
  20       @sequences = Array.new
  21       @identical_seqs_detected = Array.new
  22       @name_to_seq_indices = Hash.new
  23       @namestart_to_seq_indices = Hash.new
  24     end
  25
  26
  27     def add_sequence( sequence )
  28       @sequences.push( sequence )
  29     end
  30
  31     def add( name, molecular_sequence_str )
  32       add_sequence( Sequence.new( name, molecular_sequence_str ) )
  33     end
  34
  35     def get_sequence( index )
  36       if ( index < 0 || index > get_number_of_seqs() - 1 )
  37         error_msg = "attempt to get sequence " <<
  38          index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
  39          " sequences"
  40         raise ArgumentError, error_msg
  41       end
  42       return @sequences[ index ]
  43     end
  44
  45     def remove_sequence!( index )
  46       if ( index < 0 || index > get_number_of_seqs() - 1 )
  47         error_msg = "attempt to remove sequence " <<
  48          index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
  49          " sequences"
  50         raise ArgumentError, error_msg
  51       end
  52       @name_to_seq_indices.clear
  53       @namestart_to_seq_indices.clear
  54       @sequences.delete_at( index )
  55     end
  56
  57     def get_identical_seqs_detected
  58       @identical_seqs_detected
  59     end
  60
  61
  62     def is_aligned()
  63       if ( get_number_of_seqs < 1 )
  64         return false
  65       else
  66         l = @sequences[ 0 ].get_length()
  67         for i in 0 ... get_number_of_seqs()
  68           if ( get_sequence( i ).get_length() != l )
  69             return false
  70           end
  71         end
  72       end
  73       return true
  74     end
  75
  76     def find_by_name( name, case_sensitive, partial_match )
  77       if case_sensitive && !partial_match && @name_to_seq_indices.has_key?( name )
  78         return @name_to_seq_indices[ name ]
  79       end
  80       indices = Array.new()
  81       for i in 0 ... get_number_of_seqs()
  82         current_name = get_sequence( i ).get_name()
  83         if !case_sensitive
  84           current_name = current_name.downcase
  85           name = name.downcase
  86         end
  87         if current_name == name ||
  88            ( partial_match && current_name.include?( name ) )
  89           indices.push( i )
  90         end
  91       end
  92       if case_sensitive && !partial_match
  93         @name_to_seq_indices[ name ] = indices
  94       end
  95       indices
  96     end
  97
  98     def find_by_name_pattern( name_re, avoid_similar_to = true )
  99       indices = []
 100       for i in 0 ... get_number_of_seqs()
 101         if avoid_similar_to
 102           m = name_re.match( get_sequence( i ).get_name() )
 103           if m && !m.pre_match.downcase.include?( "similar to " )
 104             indices.push( i )
 105           end
 106         else
 107           if name_re.match( get_sequence( i ).get_name() )
 108             indices.push( i )
 109           end
 110         end
 111       end
 112       indices
 113     end
 114
 115     # throws ArgumentError
 116     def get_by_name_pattern( name_re , avoid_similar_to = true )
 117       indices = find_by_name_pattern( name_re, avoid_similar_to  )
 118       if ( indices.length > 1 )
 119         error_msg = "pattern  " + name_re.to_s + " not unique"
 120         raise ArgumentError, error_msg
 121       elsif ( indices.length < 1 )
 122         error_msg = "pattern " + name_re.to_s + " not found"
 123         raise ArgumentError, error_msg
 124       end
 125       get_sequence( indices[ 0 ] )
 126     end
 127
 128     def find_by_name_start( name, case_sensitive )
 129       if case_sensitive && @namestart_to_seq_indices.has_key?( name )
 130         return @namestart_to_seq_indices[ name ]
 131       end
 132       indices = []
 133       for i in 0 ... get_number_of_seqs()
 134         get_sequence( i ).get_name() =~ /^\s*(\S+)/
 135         current_name = $1
 136         if !case_sensitive
 137           current_name = current_name.downcase
 138           name = name.downcase
 139         end
 140         if  current_name == name
 141           indices.push( i )
 142         end
 143       end
 144       if case_sensitive
 145         @namestart_to_seq_indices[ name ] = indices
 146       end
 147       indices
 148     end
 149
 150     def has?( name, case_sensitive = true, partial_match = false )
 151       for i in 0 ... get_number_of_seqs()
 152         current_name = get_sequence( i ).get_name()
 153         if !case_sensitive
 154           current_name = current_name.downcase
 155           name = name.downcase
 156         end
 157         if current_name == name ||
 158            ( partial_match && current_name.include?( name ) )
 159           return true
 160         end
 161       end
 162       false
 163     end
 164
 165     # throws ArgumentError
 166     def get_by_name( name, case_sensitive = true, partial_match = false )
 167       indices = find_by_name( name, case_sensitive, partial_match )
 168       if ( indices.length > 1 )
 169         error_msg = "\"" + name + "\" not unique"
 170         raise ArgumentError, error_msg
 171       elsif ( indices.length < 1 )
 172         error_msg = "\"" + name + "\" not found"
 173         raise ArgumentError, error_msg
 174       end
 175       get_sequence( indices[ 0 ] )
 176     end
 177
 178     # throws ArgumentError
 179     def get_by_name_start( name, case_sensitive = true )
 180       indices = find_by_name_start( name, case_sensitive )
 181       if indices.length > 1
 182         error_msg = "\"" + name + "\" not unique"
 183         raise ArgumentError, error_msg
 184       elsif  indices.length < 1
 185         error_msg = "\"" + name + "\" not found"
 186         raise ArgumentError, error_msg
 187       end
 188       get_sequence( indices[ 0 ] )
 189     end
 190
 191
 192     def get_sub_alignment( seq_numbers )
 193       msa = Msa.new()
 194       for i in 0 ... seq_numbers.length()
 195         msa.add_sequence( get_sequence( seq_numbers[ i ] ).copy() )
 196       end
 197       return msa
 198     end
 199
 200     def get_number_of_seqs()
 201       @sequences.length
 202     end
 203
 204     def get_length
 205       if !is_aligned()
 206         error_msg = "attempt to get length of unaligned msa"
 207         raise StandardError, error_msg, caller
 208       end
 209       if get_number_of_seqs() < 1
 210         -1
 211       else
 212         @sequences[ 0 ].get_length()
 213       end
 214     end
 215
 216     def to_str
 217       to_fasta
 218     end
 219
 220     def to_fasta
 221       s = String.new
 222       for i in 0...get_number_of_seqs
 223         s += @sequences[ i ].to_fasta + Constants::LINE_DELIMITER
 224       end
 225       s
 226     end
 227
 228
 229     def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 )
 230       if ( !is_aligned() )
 231         error_msg = "attempt to get overlap diagram of unaligned msa"
 232         raise StandardError, error_msg, caller
 233       end
 234       for i in 0 ... get_number_of_seqs()
 235         io.print( Util.normalize_seq_name( get_sequence( i ).get_name(), max_name_length ) )
 236         for j in 0 ... get_number_of_seqs()
 237           if i == j
 238             io.print( " " )
 239           else
 240             if overlap?( i, j, min_overlap )
 241               io.print( "+" )
 242             else
 243               io.print( "-" )
 244             end
 245           end
 246         end
 247         io.print( Evoruby::Constants::LINE_DELIMITER )
 248       end
 249     end
 250
 251     #returns array of Msa with an overlap of min_overlap
 252     def split_into_overlapping_msa( min_overlap = 1 )
 253       if ( !is_aligned() )
 254         error_msg = "attempt to split into overlapping msas of unaligned msa"
 255         raise StandardError, error_msg, caller
 256       end
 257       msas = Array.new()
 258       bins = get_overlaps( min_overlap )
 259       for i in 0 ... bins.length
 260         msas.push( get_sub_alignment( bins[ i ] ) )
 261       end
 262       msas
 263     end
 264
 265     def overlap?( index_1, index_2, min_overlap = 1 )
 266       seq_1 = get_sequence( index_1 )
 267       seq_2 = get_sequence( index_2 )
 268       overlap_count = 0
 269       for i in 0...seq_1.get_length()
 270         if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
 271            !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
 272           overlap_count += 1
 273           if overlap_count >= min_overlap
 274             return true
 275           end
 276         end
 277       end
 278       return false
 279     end
 280
 281     def calculate_overlap( index_1, index_2 )
 282       seq_1 = get_sequence( index_1 )
 283       seq_2 = get_sequence( index_2 )
 284       overlap_count = 0
 285       for i in 0...seq_1.get_length
 286         if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
 287            !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
 288           overlap_count += 1
 289         end
 290       end
 291       overlap_count
 292     end
 293
 294     def calculate_identities( index_1, index_2 )
 295       seq_1 = get_sequence( index_1 )
 296       seq_2 = get_sequence( index_2 )
 297       identities_count = 0
 298       for i in 0...seq_1.get_length
 299         if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
 300            !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) &&
 301            seq_1.get_character_code( i ) != 63 &&
 302            ( seq_1.get_residue( i ).downcase() ==
 303              seq_2.get_residue( i ).downcase() )
 304           identities_count += 1
 305         end
 306       end
 307       identities_count
 308     end
 309
 310     def remove_gap_only_columns!()
 311       remove_columns!( get_gap_only_columns() )
 312     end
 313
 314     def remove_gap_columns!()
 315       remove_columns!( get_gap_columns() )
 316     end
 317
 318     # removes columns for which seqs with gap / number of sequences > gap_ratio
 319     def remove_gap_columns_w_gap_ratio!( gap_ratio )
 320       remove_columns!( get_gap_columns_w_gap_ratio( gap_ratio ) )
 321     end
 322
 323
 324     def remove_sequences_by_gap_ratio!( gap_ratio )
 325       if ( !is_aligned() )
 326         error_msg = "attempt to remove sequences by gap ratio on unaligned msa"
 327         raise StandardError, error_msg, caller
 328       end
 329       n = get_number_of_seqs
 330       removed = Array.new
 331       for s in 0 ... n
 332         if ( get_sequence( ( n - 1 ) - s  ).get_gap_ratio() > gap_ratio )
 333           if ( Evoruby::Constants::VERBOSE )
 334             puts( "removed: " + get_sequence( ( n - 1 ) - s  ).get_name )
 335           end
 336           removed << get_sequence( ( n - 1 ) - s  ).get_name
 337           remove_sequence!( ( n - 1 ) - s  )
 338         end
 339       end
 340       removed
 341     end
 342
 343
 344     def remove_redundant_sequences!( consider_taxonomy = false, verbose = false )
 345       n = get_number_of_seqs
 346       removed = Array.new
 347       to_be_removed = Set.new
 348       @identical_seqs_detected = Array.new
 349       for i in 0 ... ( n - 1 )
 350         for j in ( i + 1 ) ... n
 351           if !to_be_removed.include?( i ) && !to_be_removed.include?( j )
 352             if  !consider_taxonomy ||
 353                ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) ||
 354                  ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) )
 355               if Util.clean_seq_str( get_sequence( i ).get_sequence_as_string ) ==
 356                  Util.clean_seq_str( get_sequence( j ).get_sequence_as_string )
 357                 to_be_removed.add( j )
 358
 359                 tax_i = ""
 360                 tax_j = ""
 361                 if get_sequence( i ).get_taxonomy != nil
 362                   tax_i = get_sequence( i ).get_taxonomy.get_name
 363                 end
 364                 if get_sequence( j ).get_taxonomy != nil
 365                   tax_j = get_sequence( j ).get_taxonomy.get_name
 366                 end
 367                 identical_pair = get_sequence( i ).get_name + " [#{tax_i}] == " + get_sequence( j ).get_name + " [#{tax_j}]"
 368                 @identical_seqs_detected.push( identical_pair )
 369                 if verbose
 370                   puts identical_pair
 371                 end
 372               end
 373             end
 374           end
 375         end
 376       end
 377
 378       to_be_removed_ary = to_be_removed.to_a.sort.reverse
 379
 380       to_be_removed_ary.each { | index |
 381         removed.push( get_sequence( index ).get_name )
 382         remove_sequence!( index )
 383       }
 384       removed
 385     end
 386
 387
 388     def remove_sequences_by_non_gap_length!( min_non_gap_length )
 389       n = get_number_of_seqs
 390       removed = Array.new
 391       for s in 0 ... n
 392         x = ( n - 1 ) - s
 393         seq = get_sequence( x )
 394         if ( ( seq.get_length - seq.get_gap_length ) < min_non_gap_length )
 395           if ( Evoruby::Constants::VERBOSE )
 396             puts( "removed: " + seq.get_name )
 397           end
 398           removed << seq.get_name
 399           remove_sequence!( x )
 400         end
 401       end
 402       removed
 403     end
 404
 405     def trim!( first, last )
 406       cols = Array.new()
 407       for i in 0 ... get_length()
 408         if ( i < first || i > last )
 409           cols.push( i )
 410         end
 411       end
 412       remove_columns!( cols )
 413     end
 414
 415     def get_gap_only_columns()
 416       if ( !is_aligned() )
 417         error_msg = "attempt to get gap only columns of unaligned msa"
 418         raise StandardError, error_msg, caller
 419       end
 420       cols = Array.new()
 421       for c in 0 ... get_length
 422         nogap_char_found = false
 423         for s in 0 ... get_number_of_seqs
 424           unless Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 425             nogap_char_found = true
 426             break
 427           end
 428         end
 429         unless nogap_char_found
 430           cols.push( c )
 431         end
 432       end
 433       return cols
 434     end
 435
 436     def calculate_gap_proportion()
 437       if ( !is_aligned() )
 438         error_msg = "attempt to get gap only columns of unaligned msa"
 439         raise StandardError, error_msg, caller
 440       end
 441       total_sum = 0.0
 442       gap_sum = 0.0
 443       for c in 0 ... get_length
 444         for s in 0 ... get_number_of_seqs
 445           total_sum = total_sum + 1
 446           if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 447             gap_sum = gap_sum  + 1
 448           end
 449         end
 450
 451       end
 452       return gap_sum / total_sum
 453     end
 454
 455     def get_gap_columns()
 456       if ( !is_aligned() )
 457         error_msg = "attempt to get gap columns of unaligned msa"
 458         raise StandardError, error_msg, caller
 459       end
 460       cols = Array.new()
 461       for c in 0 ... get_length
 462         gap_char_found = false
 463         for s in 0 ... get_number_of_seqs
 464           if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 465             gap_char_found = true
 466             break
 467           end
 468         end
 469         if gap_char_found
 470           cols.push( c )
 471         end
 472       end
 473       return cols
 474     end
 475
 476     # gap_ratio = seqs with gap / number of sequences
 477     # returns column indices for which seqs with gap / number of sequences > gap_ratio
 478     def get_gap_columns_w_gap_ratio( gap_ratio )
 479       if ( !is_aligned() )
 480         error_msg = "attempt to get gap columns with gap_ratio of unaligned msa"
 481         raise StandardError, error_msg, caller
 482       end
 483       if ( gap_ratio < 0 || gap_ratio > 1 )
 484         error_msg = "gap ratio must be between 0 and 1 inclusive"
 485         raise ArgumentError, error_msg, caller
 486       end
 487       cols = Array.new()
 488       for c in 0 ... get_length
 489         gap_chars_found = 0
 490         for s in 0 ... get_number_of_seqs
 491           if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 492             gap_chars_found += 1
 493           end
 494         end
 495         if ( ( gap_chars_found.to_f / get_number_of_seqs ) > gap_ratio )
 496           cols.push( c )
 497         end
 498       end
 499       return cols
 500     end
 501
 502
 503     # Split an alignment into n alignemnts of equal size, except last one
 504     def split( n, verbose = false )
 505       if ( n < 2 || n > get_number_of_seqs )
 506         error_msg = "attempt to split into less than two or more than the number of sequences"
 507         raise StandardError, error_msg, caller
 508       end
 509       msas = Array.new()
 510       r = get_number_of_seqs % n
 511       x = get_number_of_seqs / n
 512       for i in 0 ... n
 513         msa = Msa.new()
 514         s = 0
 515
 516         if ( ( r > 0 ) && ( i == ( n - 1 ) ) )
 517           y = x + r
 518           if ( verbose )
 519             puts( i.to_s + ": " + y.to_s )
 520           end
 521           for j in 0 ... y
 522             msa.add_sequence( get_sequence( ( i * x ) + j ) )
 523           end
 524         else
 525           if ( verbose )
 526             puts( i.to_s + ": " + x.to_s )
 527           end
 528           for j in 0 ... x
 529             msa.add_sequence( get_sequence( ( i * x ) + j ) )
 530           end
 531         end
 532         msas.push( msa )
 533       end
 534       msas
 535     end
 536
 537
 538     private
 539
 540     def get_overlaps( min_overlap = 1 )
 541       if ( !is_aligned() )
 542         error_msg = "attempt to get overlaps of unaligned msa"
 543         raise StandardError, error_msg, caller
 544       end
 545       bins = Array.new()
 546       for i in 0 ... get_number_of_seqs()
 547         found_bin = false
 548         for j in 0 ... bins.length
 549           current_bin = bins[ j ]
 550           # does seq i overlap with all seqs in current_bin?
 551           all_overlap = true
 552           for z in 0 ... current_bin.length
 553             unless overlap?( i, current_bin[ z ], min_overlap )
 554               all_overlap = false
 555               break
 556             end
 557           end
 558           if all_overlap
 559             current_bin.push( i )
 560             found_bin = true
 561           end
 562         end
 563         unless found_bin
 564           new_bin = Array.new()
 565           new_bin.push( i )
 566           bins.push( new_bin )
 567         end
 568       end
 569       return bins
 570     end
 571
 572     def remove_columns!( cols )
 573       if ( !is_aligned() )
 574         error_msg = "attempt to remove columns of unaligned msa"
 575         raise StandardError, error_msg, caller
 576       end
 577       cols.reverse!()
 578       for c in 0 ... cols.length()
 579         col = cols[ c ]
 580         for s in 0 ... get_number_of_seqs()
 581           get_sequence( s ).delete_residue!( col )
 582         end
 583       end
 584       return self
 585     end
 586
 587
 588   end # class Msa
 589
 590 end # module Evoruby
 591