forester/ruby/evoruby/lib/evo/msa/msa.rb

   1 #
   2 # = lib/evo/msa/msa.rb - Msa class
   3 #
   4 # Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
   5 # License::    GNU Lesser General Public License (LGPL)
   6 #
   7 # $Id: msa.rb,v 1.11 2009/01/03 00:42:08 cmzmasek Exp $
   8 #
   9
  10
  11 require 'lib/evo/util/constants'
  12 require 'lib/evo/util/util'
  13 require 'lib/evo/sequence/sequence'
  14
  15 module Evoruby
  16
  17   class Msa
  18
  19     def initialize()
  20       @sequences = Array.new
  21       @identical_seqs_detected = Array.new
  22     end
  23
  24
  25     def add_sequence( sequence )
  26       @sequences.push( sequence )
  27     end
  28
  29     def add( name, molecular_sequence_str )
  30       add_sequence( Sequence.new( name, molecular_sequence_str ) )
  31     end
  32
  33     def get_sequence( index )
  34       if ( index < 0 || index > get_number_of_seqs() - 1 )
  35         error_msg = "attempt to get sequence " <<
  36          index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
  37          " sequences"
  38         raise ArgumentError, error_msg
  39       end
  40       return @sequences[ index ]
  41     end
  42
  43     def remove_sequence!( index )
  44       if ( index < 0 || index > get_number_of_seqs() - 1 )
  45         error_msg = "attempt to remove sequence " <<
  46          index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
  47          " sequences"
  48         raise ArgumentError, error_msg
  49       end
  50       @sequences.delete_at( index )
  51     end
  52
  53     def get_identical_seqs_detected
  54       @identical_seqs_detected
  55     end
  56
  57
  58     def is_aligned()
  59       if ( get_number_of_seqs < 1 )
  60         return false
  61       else
  62         l = @sequences[ 0 ].get_length()
  63         for i in 0 ... get_number_of_seqs()
  64           if ( get_sequence( i ).get_length() != l )
  65             return false
  66           end
  67         end
  68       end
  69       return true
  70     end
  71
  72     def find_by_name( name, case_sensitive, partial_match )
  73       indices = Array.new()
  74       for i in 0 ... get_number_of_seqs()
  75         current_name = get_sequence( i ).get_name()
  76         if !case_sensitive
  77           current_name = current_name.downcase
  78           name = name.downcase
  79         end
  80         if current_name == name ||
  81            ( partial_match && current_name.include?( name ) )
  82           indices.push( i )
  83         end
  84       end
  85       indices
  86     end
  87
  88     def find_by_name_pattern( name_re, avoid_similar_to = true )
  89       indices = []
  90       for i in 0 ... get_number_of_seqs()
  91         if avoid_similar_to
  92           m = name_re.match( get_sequence( i ).get_name() )
  93           if m && !m.pre_match.downcase.include?( "similar to " )
  94             indices.push( i )
  95           end
  96         else
  97           if name_re.match( get_sequence( i ).get_name() )
  98             indices.push( i )
  99           end
 100         end
 101       end
 102       indices
 103     end
 104
 105     # throws ArgumentError
 106     def get_by_name_pattern( name_re , avoid_similar_to = true )
 107       indices = find_by_name_pattern( name_re, avoid_similar_to  )
 108       if ( indices.length > 1 )
 109         error_msg = "pattern  " + name_re.to_s + " not unique"
 110         raise ArgumentError, error_msg
 111       elsif ( indices.length < 1 )
 112         error_msg = "pattern " + name_re.to_s + " not found"
 113         raise ArgumentError, error_msg
 114       end
 115       get_sequence( indices[ 0 ] )
 116     end
 117
 118     def find_by_name_start( name, case_sensitive )
 119       indices = []
 120       for i in 0 ... get_number_of_seqs()
 121         get_sequence( i ).get_name() =~ /^\s*(\S+)/
 122         current_name = $1
 123         if !case_sensitive
 124           current_name = current_name.downcase
 125           name = name.downcase
 126         end
 127         if  ( current_name == name )
 128           indices.push( i )
 129         end
 130       end
 131       indices
 132     end
 133
 134     def has?( name, case_sensitive = true, partial_match = false )
 135       for i in 0 ... get_number_of_seqs()
 136         current_name = get_sequence( i ).get_name()
 137         if !case_sensitive
 138           current_name = current_name.downcase
 139           name = name.downcase
 140         end
 141         if current_name == name ||
 142            ( partial_match && current_name.include?( name ) )
 143           return true
 144         end
 145       end
 146       false
 147     end
 148
 149     # throws ArgumentError
 150     def get_by_name( name, case_sensitive = true, partial_match = false )
 151       indices = find_by_name( name, case_sensitive, partial_match )
 152       if ( indices.length > 1 )
 153         error_msg = "\"" + name + "\" not unique"
 154         raise ArgumentError, error_msg
 155       elsif ( indices.length < 1 )
 156         error_msg = "\"" + name + "\" not found"
 157         raise ArgumentError, error_msg
 158       end
 159       get_sequence( indices[ 0 ] )
 160     end
 161
 162     # throws ArgumentError
 163     def get_by_name_start( name, case_sensitive = true )
 164       indices = find_by_name_start( name, case_sensitive )
 165       if ( indices.length > 1 )
 166         error_msg = "\"" + name + "\" not unique"
 167         raise ArgumentError, error_msg
 168       elsif ( indices.length < 1 )
 169         error_msg = "\"" + name + "\" not found"
 170         raise ArgumentError, error_msg
 171       end
 172       get_sequence( indices[ 0 ] )
 173     end
 174
 175
 176     def get_sub_alignment( seq_numbers )
 177       msa = Msa.new()
 178       for i in 0 ... seq_numbers.length()
 179         msa.add_sequence( get_sequence( seq_numbers[ i ] ).copy() )
 180       end
 181       return msa
 182     end
 183
 184     def get_number_of_seqs()
 185       @sequences.length
 186     end
 187
 188     def get_length
 189       if !is_aligned()
 190         error_msg = "attempt to get length of unaligned msa"
 191         raise StandardError, error_msg, caller
 192       end
 193       if get_number_of_seqs() < 1
 194         -1
 195       else
 196         @sequences[ 0 ].get_length()
 197       end
 198     end
 199
 200     def to_str
 201       to_fasta
 202     end
 203
 204     def to_fasta
 205       s = String.new
 206       for i in 0...get_number_of_seqs
 207         s += @sequences[ i ].to_fasta + Constants::LINE_DELIMITER
 208       end
 209       s
 210     end
 211
 212
 213     def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 )
 214       if ( !is_aligned() )
 215         error_msg = "attempt to get overlap diagram of unaligned msa"
 216         raise StandardError, error_msg, caller
 217       end
 218       for i in 0 ... get_number_of_seqs()
 219         io.print( Util.normalize_seq_name( get_sequence( i ).get_name(), max_name_length ) )
 220         for j in 0 ... get_number_of_seqs()
 221           if i == j
 222             io.print( " " )
 223           else
 224             if overlap?( i, j, min_overlap )
 225               io.print( "+" )
 226             else
 227               io.print( "-" )
 228             end
 229           end
 230         end
 231         io.print( Evoruby::Constants::LINE_DELIMITER )
 232       end
 233     end
 234
 235     #returns array of Msa with an overlap of min_overlap
 236     def split_into_overlapping_msa( min_overlap = 1 )
 237       if ( !is_aligned() )
 238         error_msg = "attempt to split into overlapping msas of unaligned msa"
 239         raise StandardError, error_msg, caller
 240       end
 241       msas = Array.new()
 242       bins = get_overlaps( min_overlap )
 243       for i in 0 ... bins.length
 244         msas.push( get_sub_alignment( bins[ i ] ) )
 245       end
 246       msas
 247     end
 248
 249     def overlap?( index_1, index_2, min_overlap = 1 )
 250       seq_1 = get_sequence( index_1 )
 251       seq_2 = get_sequence( index_2 )
 252       overlap_count = 0
 253       for i in 0...seq_1.get_length()
 254         if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
 255            !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
 256           overlap_count += 1
 257           if overlap_count >= min_overlap
 258             return true
 259           end
 260         end
 261       end
 262       return false
 263     end
 264
 265     def calculate_overlap( index_1, index_2 )
 266       seq_1 = get_sequence( index_1 )
 267       seq_2 = get_sequence( index_2 )
 268       overlap_count = 0
 269       for i in 0...seq_1.get_length
 270         if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
 271            !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
 272           overlap_count += 1
 273         end
 274       end
 275       overlap_count
 276     end
 277
 278     def calculate_identities( index_1, index_2 )
 279       seq_1 = get_sequence( index_1 )
 280       seq_2 = get_sequence( index_2 )
 281       identities_count = 0
 282       for i in 0...seq_1.get_length
 283         if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
 284            !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) &&
 285            seq_1.get_character_code( i ) != 63 &&
 286            ( seq_1.get_residue( i ).downcase() ==
 287              seq_2.get_residue( i ).downcase() )
 288           identities_count += 1
 289         end
 290       end
 291       identities_count
 292     end
 293
 294     def remove_gap_only_columns!()
 295       remove_columns!( get_gap_only_columns() )
 296     end
 297
 298     def remove_gap_columns!()
 299       remove_columns!( get_gap_columns() )
 300     end
 301
 302     # removes columns for which seqs with gap / number of sequences > gap_ratio
 303     def remove_gap_columns_w_gap_ratio!( gap_ratio )
 304       remove_columns!( get_gap_columns_w_gap_ratio( gap_ratio ) )
 305     end
 306
 307
 308     def remove_sequences_by_gap_ratio!( gap_ratio )
 309       if ( !is_aligned() )
 310         error_msg = "attempt to remove sequences by gap ratio on unaligned msa"
 311         raise StandardError, error_msg, caller
 312       end
 313       n = get_number_of_seqs
 314       removed = Array.new
 315       for s in 0 ... n
 316         if ( get_sequence( ( n - 1 ) - s  ).get_gap_ratio() > gap_ratio )
 317           if ( Evoruby::Constants::VERBOSE )
 318             puts( "removed: " + get_sequence( ( n - 1 ) - s  ).get_name )
 319           end
 320           removed << get_sequence( ( n - 1 ) - s  ).get_name
 321           remove_sequence!( ( n - 1 ) - s  )
 322         end
 323       end
 324       removed
 325     end
 326
 327
 328     def remove_redundant_sequences!( consider_taxonomy = false, verbose = false )
 329       n = get_number_of_seqs
 330       removed = Array.new
 331       to_be_removed = Set.new
 332       @identical_seqs_detected = Array.new
 333       for i in 0 ... ( n - 1 )
 334         for j in ( i + 1 ) ... n
 335           if !to_be_removed.include?( i ) && !to_be_removed.include?( j )
 336             if  !consider_taxonomy ||
 337                ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) ||
 338                  ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) )
 339               if Util.clean_seq_str( get_sequence( i ).get_sequence_as_string ) ==
 340                  Util.clean_seq_str( get_sequence( j ).get_sequence_as_string )
 341                 to_be_removed.add( j )
 342
 343                 tax_i = ""
 344                 tax_j = ""
 345                 if get_sequence( i ).get_taxonomy != nil
 346                   tax_i = get_sequence( i ).get_taxonomy.get_name
 347                 end
 348                 if get_sequence( j ).get_taxonomy != nil
 349                   tax_j = get_sequence( j ).get_taxonomy.get_name
 350                 end
 351                 identical_pair = get_sequence( i ).get_name + " [#{tax_i}] == " + get_sequence( j ).get_name + " [#{tax_j}]"
 352                 @identical_seqs_detected.push( identical_pair )
 353                 if verbose
 354                   puts identical_pair
 355                 end
 356               end
 357             end
 358           end
 359         end
 360       end
 361
 362       to_be_removed_ary = to_be_removed.to_a.sort.reverse
 363
 364       to_be_removed_ary.each { | index |
 365         removed.push( get_sequence( index ).get_name )
 366         remove_sequence!( index )
 367       }
 368       removed
 369     end
 370
 371
 372     def remove_sequences_by_non_gap_length!( min_non_gap_length )
 373       if ( !is_aligned() )
 374         error_msg = "attempt to remove sequences by non gap length on unaligned msa"
 375         raise StandardError, error_msg, caller
 376       end
 377       n = get_number_of_seqs
 378       l = get_length
 379       removed = Array.new
 380       for s in 0 ... n
 381         if ( ( l - get_sequence( ( n - 1 ) - s ).get_gap_length ) < min_non_gap_length )
 382           if ( Evoruby::Constants::VERBOSE )
 383             puts( "removed: " + get_sequence( ( n - 1 ) - s  ).get_name )
 384           end
 385           removed << get_sequence( ( n - 1 ) - s  ).get_name
 386           remove_sequence!( ( n - 1 ) - s )
 387         end
 388       end
 389       removed
 390     end
 391
 392     def trim!( first, last )
 393       cols = Array.new()
 394       for i in 0 ... get_length()
 395         if ( i < first || i > last )
 396           cols.push( i )
 397         end
 398       end
 399       remove_columns!( cols )
 400     end
 401
 402     def get_gap_only_columns()
 403       if ( !is_aligned() )
 404         error_msg = "attempt to get gap only columns of unaligned msa"
 405         raise StandardError, error_msg, caller
 406       end
 407       cols = Array.new()
 408       for c in 0 ... get_length
 409         nogap_char_found = false
 410         for s in 0 ... get_number_of_seqs
 411           unless Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 412             nogap_char_found = true
 413             break
 414           end
 415         end
 416         unless nogap_char_found
 417           cols.push( c )
 418         end
 419       end
 420       return cols
 421     end
 422
 423     def calculate_gap_proportion()
 424       if ( !is_aligned() )
 425         error_msg = "attempt to get gap only columns of unaligned msa"
 426         raise StandardError, error_msg, caller
 427       end
 428       total_sum = 0.0
 429       gap_sum = 0.0
 430       for c in 0 ... get_length
 431         for s in 0 ... get_number_of_seqs
 432           total_sum = total_sum + 1
 433           if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 434             gap_sum = gap_sum  + 1
 435           end
 436         end
 437
 438       end
 439       return gap_sum / total_sum
 440     end
 441
 442     def get_gap_columns()
 443       if ( !is_aligned() )
 444         error_msg = "attempt to get gap columns of unaligned msa"
 445         raise StandardError, error_msg, caller
 446       end
 447       cols = Array.new()
 448       for c in 0 ... get_length
 449         gap_char_found = false
 450         for s in 0 ... get_number_of_seqs
 451           if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 452             gap_char_found = true
 453             break
 454           end
 455         end
 456         if gap_char_found
 457           cols.push( c )
 458         end
 459       end
 460       return cols
 461     end
 462
 463     # gap_ratio = seqs with gap / number of sequences
 464     # returns column indices for which seqs with gap / number of sequences > gap_ratio
 465     def get_gap_columns_w_gap_ratio( gap_ratio )
 466       if ( !is_aligned() )
 467         error_msg = "attempt to get gap columns with gap_ratio of unaligned msa"
 468         raise StandardError, error_msg, caller
 469       end
 470       if ( gap_ratio < 0 || gap_ratio > 1 )
 471         error_msg = "gap ratio must be between 0 and 1 inclusive"
 472         raise ArgumentError, error_msg, caller
 473       end
 474       cols = Array.new()
 475       for c in 0 ... get_length
 476         gap_chars_found = 0
 477         for s in 0 ... get_number_of_seqs
 478           if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 479             gap_chars_found += 1
 480           end
 481         end
 482         if ( ( gap_chars_found.to_f / get_number_of_seqs ) > gap_ratio )
 483           cols.push( c )
 484         end
 485       end
 486       return cols
 487     end
 488
 489
 490     # Split an alignment into n alignemnts of equal size, except last one
 491     def split( n, verbose = false )
 492       if ( n < 2 || n > get_number_of_seqs )
 493         error_msg = "attempt to split into less than two or more than the number of sequences"
 494         raise StandardError, error_msg, caller
 495       end
 496       msas = Array.new()
 497       r = get_number_of_seqs % n
 498       x = get_number_of_seqs / n
 499       for i in 0 ... n
 500         msa = Msa.new()
 501         s = 0
 502
 503         if ( ( r > 0 ) && ( i == ( n - 1 ) ) )
 504           y = x + r
 505           if ( verbose )
 506             puts( i.to_s + ": " + y.to_s )
 507           end
 508           for j in 0 ... y
 509             msa.add_sequence( get_sequence( ( i * x ) + j ) )
 510           end
 511         else
 512           if ( verbose )
 513             puts( i.to_s + ": " + x.to_s )
 514           end
 515           for j in 0 ... x
 516             msa.add_sequence( get_sequence( ( i * x ) + j ) )
 517           end
 518         end
 519         msas.push( msa )
 520       end
 521       msas
 522     end
 523
 524
 525     private
 526
 527     def get_overlaps( min_overlap = 1 )
 528       if ( !is_aligned() )
 529         error_msg = "attempt to get overlaps of unaligned msa"
 530         raise StandardError, error_msg, caller
 531       end
 532       bins = Array.new()
 533       for i in 0 ... get_number_of_seqs()
 534         found_bin = false
 535         for j in 0 ... bins.length
 536           current_bin = bins[ j ]
 537           # does seq i overlap with all seqs in current_bin?
 538           all_overlap = true
 539           for z in 0 ... current_bin.length
 540             unless overlap?( i, current_bin[ z ], min_overlap )
 541               all_overlap = false
 542               break
 543             end
 544           end
 545           if all_overlap
 546             current_bin.push( i )
 547             found_bin = true
 548           end
 549         end
 550         unless found_bin
 551           new_bin = Array.new()
 552           new_bin.push( i )
 553           bins.push( new_bin )
 554         end
 555       end
 556       return bins
 557     end
 558
 559     def remove_columns!( cols )
 560       if ( !is_aligned() )
 561         error_msg = "attempt to remove columns of unaligned msa"
 562         raise StandardError, error_msg, caller
 563       end
 564       cols.reverse!()
 565       for c in 0 ... cols.length()
 566         col = cols[ c ]
 567         for s in 0 ... get_number_of_seqs()
 568           get_sequence( s ).delete_residue!( col )
 569         end
 570       end
 571       return self
 572     end
 573
 574
 575   end # class Msa
 576
 577 end # module Evoruby
 578