forester/ruby/evoruby/lib/evo/msa/msa.rb

   1 #
   2 # = lib/evo/msa/msa.rb - Msa class
   3 #
   4 # Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
   5 # License::    GNU Lesser General Public License (LGPL)
   6 #
   7 # $Id: msa.rb,v 1.11 2009/01/03 00:42:08 cmzmasek Exp $
   8 #
   9
  10
  11 require 'lib/evo/util/constants'
  12 require 'lib/evo/util/util'
  13 require 'lib/evo/sequence/sequence'
  14
  15 module Evoruby
  16
  17   class Msa
  18
  19     def initialize()
  20       @sequences = Array.new
  21       @identical_seqs_detected = Array.new
  22       @name_to_seq_indices = Hash.new
  23       @namestart_to_seq_indices = Hash.new
  24     end
  25
  26
  27     def add_sequence( sequence )
  28       @sequences.push( sequence )
  29     end
  30
  31     def add( name, molecular_sequence_str )
  32       add_sequence( Sequence.new( name, molecular_sequence_str ) )
  33     end
  34
  35     def get_sequence( index )
  36       if ( index < 0 || index > get_number_of_seqs() - 1 )
  37         error_msg = "attempt to get sequence " <<
  38          index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
  39          " sequences"
  40         raise ArgumentError, error_msg
  41       end
  42       return @sequences[ index ]
  43     end
  44
  45     def remove_sequence!( index )
  46       if ( index < 0 || index > get_number_of_seqs() - 1 )
  47         error_msg = "attempt to remove sequence " <<
  48          index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
  49          " sequences"
  50         raise ArgumentError, error_msg
  51       end
  52       @name_to_seq_indices.clear
  53       @namestart_to_seq_indices.clear
  54       @sequences.delete_at( index )
  55     end
  56
  57     def get_identical_seqs_detected
  58       @identical_seqs_detected
  59     end
  60
  61
  62     def is_aligned()
  63       if ( get_number_of_seqs < 1 )
  64         return false
  65       else
  66         l = @sequences[ 0 ].get_length()
  67         for i in 0 ... get_number_of_seqs()
  68           if ( get_sequence( i ).get_length() != l )
  69             return false
  70           end
  71         end
  72       end
  73       return true
  74     end
  75
  76     def find_by_name( name, case_sensitive, partial_match )
  77       if case_sensitive && !partial_match && @name_to_seq_indices.has_key?( name )
  78         return @name_to_seq_indices[ name ]
  79       end
  80       indices = Array.new()
  81       for i in 0 ... get_number_of_seqs()
  82         current_name = get_sequence( i ).get_name()
  83         if case_sensitive && !partial_match
  84           if !@name_to_seq_indices.has_key?( current_name )
  85             @name_to_seq_indices[ current_name ] = []
  86           end
  87           @name_to_seq_indices[ current_name ].push( i )
  88         end
  89         if !case_sensitive
  90           current_name = current_name.downcase
  91           name = name.downcase
  92         end
  93         if current_name == name ||
  94            ( partial_match && current_name.include?( name ) )
  95           indices.push( i )
  96         end
  97       end
  98       indices
  99     end
 100
 101     def find_by_name_pattern( name_re, avoid_similar_to = true )
 102       indices = []
 103       for i in 0 ... get_number_of_seqs()
 104         if avoid_similar_to
 105           m = name_re.match( get_sequence( i ).get_name() )
 106           if m && !m.pre_match.downcase.include?( "similar to " )
 107             indices.push( i )
 108           end
 109         else
 110           if name_re.match( get_sequence( i ).get_name() )
 111             indices.push( i )
 112           end
 113         end
 114       end
 115       indices
 116     end
 117
 118     # throws ArgumentError
 119     def get_by_name_pattern( name_re , avoid_similar_to = true )
 120       indices = find_by_name_pattern( name_re, avoid_similar_to  )
 121       if ( indices.length > 1 )
 122         error_msg = "pattern  " + name_re.to_s + " not unique"
 123         raise ArgumentError, error_msg
 124       elsif ( indices.length < 1 )
 125         error_msg = "pattern " + name_re.to_s + " not found"
 126         raise ArgumentError, error_msg
 127       end
 128       get_sequence( indices[ 0 ] )
 129     end
 130
 131     def find_by_name_start( name, case_sensitive )
 132       if case_sensitive && @namestart_to_seq_indices.has_key?( name )
 133         return @namestart_to_seq_indices[ name ]
 134       end
 135       indices = []
 136       for i in 0 ... get_number_of_seqs()
 137         get_sequence( i ).get_name() =~ /^\s*(\S+)/
 138         current_name = $1
 139         if case_sensitive
 140           if !@namestart_to_seq_indices.has_key?( current_name )
 141             @namestart_to_seq_indices[ current_name ] = []
 142           end
 143           @namestart_to_seq_indices[ current_name ].push( i )
 144         end
 145         if !case_sensitive
 146           current_name = current_name.downcase
 147           name = name.downcase
 148         end
 149         if  current_name == name
 150           indices.push( i )
 151         end
 152       end
 153       indices
 154     end
 155
 156     def has?( name, case_sensitive = true, partial_match = false )
 157       for i in 0 ... get_number_of_seqs()
 158         current_name = get_sequence( i ).get_name()
 159         if !case_sensitive
 160           current_name = current_name.downcase
 161           name = name.downcase
 162         end
 163         if current_name == name ||
 164            ( partial_match && current_name.include?( name ) )
 165           return true
 166         end
 167       end
 168       false
 169     end
 170
 171     # throws ArgumentError
 172     def get_by_name( name, case_sensitive = true, partial_match = false )
 173       indices = find_by_name( name, case_sensitive, partial_match )
 174       if ( indices.length > 1 )
 175         error_msg = "\"" + name + "\" not unique"
 176         raise ArgumentError, error_msg
 177       elsif ( indices.length < 1 )
 178         error_msg = "\"" + name + "\" not found"
 179         raise ArgumentError, error_msg
 180       end
 181       get_sequence( indices[ 0 ] )
 182     end
 183
 184     # throws ArgumentError
 185     def get_by_name_start( name, case_sensitive = true )
 186       indices = find_by_name_start( name, case_sensitive )
 187       if indices.length > 1
 188         error_msg = "\"" + name + "\" not unique"
 189         raise ArgumentError, error_msg
 190       elsif  indices.length < 1
 191         error_msg = "\"" + name + "\" not found"
 192         raise ArgumentError, error_msg
 193       end
 194       get_sequence( indices[ 0 ] )
 195     end
 196
 197
 198     def get_sub_alignment( seq_numbers )
 199       msa = Msa.new()
 200       for i in 0 ... seq_numbers.length()
 201         msa.add_sequence( get_sequence( seq_numbers[ i ] ).copy() )
 202       end
 203       return msa
 204     end
 205
 206     def get_number_of_seqs()
 207       @sequences.length
 208     end
 209
 210     def get_length
 211       if !is_aligned()
 212         error_msg = "attempt to get length of unaligned msa"
 213         raise StandardError, error_msg, caller
 214       end
 215       if get_number_of_seqs() < 1
 216         -1
 217       else
 218         @sequences[ 0 ].get_length()
 219       end
 220     end
 221
 222     def to_str
 223       to_fasta
 224     end
 225
 226     def to_fasta
 227       s = String.new
 228       for i in 0...get_number_of_seqs
 229         s += @sequences[ i ].to_fasta + Constants::LINE_DELIMITER
 230       end
 231       s
 232     end
 233
 234
 235     def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 )
 236       if ( !is_aligned() )
 237         error_msg = "attempt to get overlap diagram of unaligned msa"
 238         raise StandardError, error_msg, caller
 239       end
 240       for i in 0 ... get_number_of_seqs()
 241         io.print( Util.normalize_seq_name( get_sequence( i ).get_name(), max_name_length ) )
 242         for j in 0 ... get_number_of_seqs()
 243           if i == j
 244             io.print( " " )
 245           else
 246             if overlap?( i, j, min_overlap )
 247               io.print( "+" )
 248             else
 249               io.print( "-" )
 250             end
 251           end
 252         end
 253         io.print( Evoruby::Constants::LINE_DELIMITER )
 254       end
 255     end
 256
 257     #returns array of Msa with an overlap of min_overlap
 258     def split_into_overlapping_msa( min_overlap = 1 )
 259       if ( !is_aligned() )
 260         error_msg = "attempt to split into overlapping msas of unaligned msa"
 261         raise StandardError, error_msg, caller
 262       end
 263       msas = Array.new()
 264       bins = get_overlaps( min_overlap )
 265       for i in 0 ... bins.length
 266         msas.push( get_sub_alignment( bins[ i ] ) )
 267       end
 268       msas
 269     end
 270
 271     def overlap?( index_1, index_2, min_overlap = 1 )
 272       seq_1 = get_sequence( index_1 )
 273       seq_2 = get_sequence( index_2 )
 274       overlap_count = 0
 275       for i in 0...seq_1.get_length()
 276         if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
 277            !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
 278           overlap_count += 1
 279           if overlap_count >= min_overlap
 280             return true
 281           end
 282         end
 283       end
 284       return false
 285     end
 286
 287     def calculate_overlap( index_1, index_2 )
 288       seq_1 = get_sequence( index_1 )
 289       seq_2 = get_sequence( index_2 )
 290       overlap_count = 0
 291       for i in 0...seq_1.get_length
 292         if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
 293            !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
 294           overlap_count += 1
 295         end
 296       end
 297       overlap_count
 298     end
 299
 300     def calculate_identities( index_1, index_2 )
 301       seq_1 = get_sequence( index_1 )
 302       seq_2 = get_sequence( index_2 )
 303       identities_count = 0
 304       for i in 0...seq_1.get_length
 305         if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
 306            !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) &&
 307            seq_1.get_character_code( i ) != 63 &&
 308            ( seq_1.get_residue( i ).downcase() ==
 309              seq_2.get_residue( i ).downcase() )
 310           identities_count += 1
 311         end
 312       end
 313       identities_count
 314     end
 315
 316     def remove_gap_only_columns!()
 317       remove_columns!( get_gap_only_columns() )
 318     end
 319
 320     def remove_gap_columns!()
 321       remove_columns!( get_gap_columns() )
 322     end
 323
 324     # removes columns for which seqs with gap / number of sequences > gap_ratio
 325     def remove_gap_columns_w_gap_ratio!( gap_ratio )
 326       remove_columns!( get_gap_columns_w_gap_ratio( gap_ratio ) )
 327     end
 328
 329
 330     def remove_sequences_by_gap_ratio!( gap_ratio )
 331       if ( !is_aligned() )
 332         error_msg = "attempt to remove sequences by gap ratio on unaligned msa"
 333         raise StandardError, error_msg, caller
 334       end
 335       n = get_number_of_seqs
 336       removed = Array.new
 337       for s in 0 ... n
 338         if ( get_sequence( ( n - 1 ) - s  ).get_gap_ratio() > gap_ratio )
 339           if ( Evoruby::Constants::VERBOSE )
 340             puts( "removed: " + get_sequence( ( n - 1 ) - s  ).get_name )
 341           end
 342           removed << get_sequence( ( n - 1 ) - s  ).get_name
 343           remove_sequence!( ( n - 1 ) - s  )
 344         end
 345       end
 346       removed
 347     end
 348
 349
 350     def remove_redundant_sequences!( consider_taxonomy = false, verbose = false )
 351       n = get_number_of_seqs
 352       removed = Array.new
 353       to_be_removed = Set.new
 354       @identical_seqs_detected = Array.new
 355       for i in 0 ... ( n - 1 )
 356         for j in ( i + 1 ) ... n
 357           if !to_be_removed.include?( i ) && !to_be_removed.include?( j )
 358             if  !consider_taxonomy ||
 359                ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) ||
 360                  ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) )
 361               if Util.clean_seq_str( get_sequence( i ).get_sequence_as_string ) ==
 362                  Util.clean_seq_str( get_sequence( j ).get_sequence_as_string )
 363                 to_be_removed.add( j )
 364
 365                 tax_i = ""
 366                 tax_j = ""
 367                 if get_sequence( i ).get_taxonomy != nil
 368                   tax_i = get_sequence( i ).get_taxonomy.get_name
 369                 end
 370                 if get_sequence( j ).get_taxonomy != nil
 371                   tax_j = get_sequence( j ).get_taxonomy.get_name
 372                 end
 373                 identical_pair = get_sequence( i ).get_name + " [#{tax_i}] == " + get_sequence( j ).get_name + " [#{tax_j}]"
 374                 @identical_seqs_detected.push( identical_pair )
 375                 if verbose
 376                   puts identical_pair
 377                 end
 378               end
 379             end
 380           end
 381         end
 382       end
 383
 384       to_be_removed_ary = to_be_removed.to_a.sort.reverse
 385
 386       to_be_removed_ary.each { | index |
 387         removed.push( get_sequence( index ).get_name )
 388         remove_sequence!( index )
 389       }
 390       removed
 391     end
 392
 393
 394     def remove_sequences_by_non_gap_length!( min_non_gap_length )
 395       n = get_number_of_seqs
 396       removed = Array.new
 397       for s in 0 ... n
 398         x = ( n - 1 ) - s
 399         seq = get_sequence( x )
 400         if ( ( seq.get_length - seq.get_gap_length ) < min_non_gap_length )
 401           if ( Evoruby::Constants::VERBOSE )
 402             puts( "removed: " + seq.get_name )
 403           end
 404           removed << seq.get_name
 405           remove_sequence!( x )
 406         end
 407       end
 408       removed
 409     end
 410
 411     def trim!( first, last )
 412       cols = Array.new()
 413       for i in 0 ... get_length()
 414         if ( i < first || i > last )
 415           cols.push( i )
 416         end
 417       end
 418       remove_columns!( cols )
 419     end
 420
 421     def get_gap_only_columns()
 422       if ( !is_aligned() )
 423         error_msg = "attempt to get gap only columns of unaligned msa"
 424         raise StandardError, error_msg, caller
 425       end
 426       cols = Array.new()
 427       for c in 0 ... get_length
 428         nogap_char_found = false
 429         for s in 0 ... get_number_of_seqs
 430           unless Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 431             nogap_char_found = true
 432             break
 433           end
 434         end
 435         unless nogap_char_found
 436           cols.push( c )
 437         end
 438       end
 439       return cols
 440     end
 441
 442     def calculate_gap_proportion()
 443       if ( !is_aligned() )
 444         error_msg = "attempt to get gap only columns of unaligned msa"
 445         raise StandardError, error_msg, caller
 446       end
 447       total_sum = 0.0
 448       gap_sum = 0.0
 449       for c in 0 ... get_length
 450         for s in 0 ... get_number_of_seqs
 451           total_sum = total_sum + 1
 452           if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 453             gap_sum = gap_sum  + 1
 454           end
 455         end
 456
 457       end
 458       return gap_sum / total_sum
 459     end
 460
 461     def get_gap_columns()
 462       if ( !is_aligned() )
 463         error_msg = "attempt to get gap columns of unaligned msa"
 464         raise StandardError, error_msg, caller
 465       end
 466       cols = Array.new()
 467       for c in 0 ... get_length
 468         gap_char_found = false
 469         for s in 0 ... get_number_of_seqs
 470           if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 471             gap_char_found = true
 472             break
 473           end
 474         end
 475         if gap_char_found
 476           cols.push( c )
 477         end
 478       end
 479       return cols
 480     end
 481
 482     # gap_ratio = seqs with gap / number of sequences
 483     # returns column indices for which seqs with gap / number of sequences > gap_ratio
 484     def get_gap_columns_w_gap_ratio( gap_ratio )
 485       if ( !is_aligned() )
 486         error_msg = "attempt to get gap columns with gap_ratio of unaligned msa"
 487         raise StandardError, error_msg, caller
 488       end
 489       if ( gap_ratio < 0 || gap_ratio > 1 )
 490         error_msg = "gap ratio must be between 0 and 1 inclusive"
 491         raise ArgumentError, error_msg, caller
 492       end
 493       cols = Array.new()
 494       for c in 0 ... get_length
 495         gap_chars_found = 0
 496         for s in 0 ... get_number_of_seqs
 497           if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 498             gap_chars_found += 1
 499           end
 500         end
 501         if ( ( gap_chars_found.to_f / get_number_of_seqs ) > gap_ratio )
 502           cols.push( c )
 503         end
 504       end
 505       return cols
 506     end
 507
 508
 509     # Split an alignment into n alignemnts of equal size, except last one
 510     def split( n, verbose = false )
 511       if ( n < 2 || n > get_number_of_seqs )
 512         error_msg = "attempt to split into less than two or more than the number of sequences"
 513         raise StandardError, error_msg, caller
 514       end
 515       msas = Array.new()
 516       r = get_number_of_seqs % n
 517       x = get_number_of_seqs / n
 518       for i in 0 ... n
 519         msa = Msa.new()
 520         s = 0
 521
 522         if ( ( r > 0 ) && ( i == ( n - 1 ) ) )
 523           y = x + r
 524           if ( verbose )
 525             puts( i.to_s + ": " + y.to_s )
 526           end
 527           for j in 0 ... y
 528             msa.add_sequence( get_sequence( ( i * x ) + j ) )
 529           end
 530         else
 531           if ( verbose )
 532             puts( i.to_s + ": " + x.to_s )
 533           end
 534           for j in 0 ... x
 535             msa.add_sequence( get_sequence( ( i * x ) + j ) )
 536           end
 537         end
 538         msas.push( msa )
 539       end
 540       msas
 541     end
 542
 543
 544     private
 545
 546     def get_overlaps( min_overlap = 1 )
 547       if ( !is_aligned() )
 548         error_msg = "attempt to get overlaps of unaligned msa"
 549         raise StandardError, error_msg, caller
 550       end
 551       bins = Array.new()
 552       for i in 0 ... get_number_of_seqs()
 553         found_bin = false
 554         for j in 0 ... bins.length
 555           current_bin = bins[ j ]
 556           # does seq i overlap with all seqs in current_bin?
 557           all_overlap = true
 558           for z in 0 ... current_bin.length
 559             unless overlap?( i, current_bin[ z ], min_overlap )
 560               all_overlap = false
 561               break
 562             end
 563           end
 564           if all_overlap
 565             current_bin.push( i )
 566             found_bin = true
 567           end
 568         end
 569         unless found_bin
 570           new_bin = Array.new()
 571           new_bin.push( i )
 572           bins.push( new_bin )
 573         end
 574       end
 575       return bins
 576     end
 577
 578     def remove_columns!( cols )
 579       if ( !is_aligned() )
 580         error_msg = "attempt to remove columns of unaligned msa"
 581         raise StandardError, error_msg, caller
 582       end
 583       cols.reverse!()
 584       for c in 0 ... cols.length()
 585         col = cols[ c ]
 586         for s in 0 ... get_number_of_seqs()
 587           get_sequence( s ).delete_residue!( col )
 588         end
 589       end
 590       return self
 591     end
 592
 593
 594   end # class Msa
 595
 596 end # module Evoruby
 597