forester/ruby/evoruby/lib/evo/msa/msa.rb

   1 #
   2 # = lib/evo/msa/msa.rb - Msa class
   3 #
   4 # Copyright::    Copyright (C) 2017 Christian M. Zmasek
   5 # License::      GNU Lesser General Public License (LGPL)
   6 #
   7 # Last modified: 2017/02/07
   8
   9
  10 require 'lib/evo/util/constants'
  11 require 'lib/evo/util/util'
  12 require 'lib/evo/sequence/sequence'
  13
  14 module Evoruby
  15
  16   class Msa
  17
  18     def initialize()
  19       @sequences = Array.new
  20       @identical_seqs_detected = Array.new
  21       @name_to_seq_indices = Hash.new
  22       @namestart_to_seq_indices = Hash.new
  23     end
  24
  25
  26     def add_sequence( sequence )
  27       @sequences.push( sequence )
  28     end
  29
  30     def add( name, molecular_sequence_str )
  31       add_sequence( Sequence.new( name, molecular_sequence_str ) )
  32     end
  33
  34     def get_sequence( index )
  35       if ( index < 0 || index > get_number_of_seqs() - 1 )
  36         error_msg = "attempt to get sequence " <<
  37          index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
  38          " sequences"
  39         raise ArgumentError, error_msg
  40       end
  41       return @sequences[ index ]
  42     end
  43
  44     def remove_sequence!( index )
  45       if ( index < 0 || index > get_number_of_seqs() - 1 )
  46         error_msg = "attempt to remove sequence " <<
  47          index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
  48          " sequences"
  49         raise ArgumentError, error_msg
  50       end
  51       @name_to_seq_indices.clear
  52       @namestart_to_seq_indices.clear
  53       @sequences.delete_at( index )
  54     end
  55
  56     def get_identical_seqs_detected
  57       @identical_seqs_detected
  58     end
  59
  60
  61     def is_aligned()
  62       if ( get_number_of_seqs < 1 )
  63         return false
  64       else
  65         l = @sequences[ 0 ].get_length()
  66         for i in 0 ... get_number_of_seqs()
  67           if ( get_sequence( i ).get_length() != l )
  68             return false
  69           end
  70         end
  71       end
  72       return true
  73     end
  74
  75     def find_by_name( name, case_sensitive, partial_match )
  76       if case_sensitive && !partial_match && @name_to_seq_indices.has_key?( name )
  77         return @name_to_seq_indices[ name ]
  78       end
  79       indices = Array.new()
  80       for i in 0 ... get_number_of_seqs()
  81         current_name = get_sequence( i ).get_name()
  82         if case_sensitive && !partial_match
  83           if !@name_to_seq_indices.has_key?( current_name )
  84             @name_to_seq_indices[ current_name ] = []
  85           end
  86           @name_to_seq_indices[ current_name ].push( i )
  87         end
  88         if !case_sensitive
  89           current_name = current_name.downcase
  90           name = name.downcase
  91         end
  92         if current_name == name ||
  93            ( partial_match && current_name.include?( name ) )
  94           indices.push( i )
  95         end
  96       end
  97       indices
  98     end
  99
 100     def find_by_name_pattern( name_re, avoid_similar_to = true )
 101       indices = []
 102       for i in 0 ... get_number_of_seqs()
 103         if avoid_similar_to
 104           m = name_re.match( get_sequence( i ).get_name() )
 105           if m && !m.pre_match.downcase.include?( "similar to " )
 106             indices.push( i )
 107           end
 108         else
 109           if name_re.match( get_sequence( i ).get_name() )
 110             indices.push( i )
 111           end
 112         end
 113       end
 114       indices
 115     end
 116
 117     # throws ArgumentError
 118     def get_by_name_pattern( name_re , avoid_similar_to = true )
 119       indices = find_by_name_pattern( name_re, avoid_similar_to  )
 120       if ( indices.length > 1 )
 121         error_msg = "pattern  " + name_re.to_s + " not unique"
 122         raise ArgumentError, error_msg
 123       elsif ( indices.length < 1 )
 124         error_msg = "pattern " + name_re.to_s + " not found"
 125         raise ArgumentError, error_msg
 126       end
 127       get_sequence( indices[ 0 ] )
 128     end
 129
 130     def find_by_name_start( name, case_sensitive )
 131       if case_sensitive && @namestart_to_seq_indices.has_key?( name )
 132         return @namestart_to_seq_indices[ name ]
 133       end
 134       indices = []
 135       for i in 0 ... get_number_of_seqs()
 136         get_sequence( i ).get_name() =~ /^\s*(\S+)/
 137         current_name = $1
 138         if case_sensitive
 139           if !@namestart_to_seq_indices.has_key?( current_name )
 140             @namestart_to_seq_indices[ current_name ] = []
 141           end
 142           @namestart_to_seq_indices[ current_name ].push( i )
 143         end
 144         if !case_sensitive
 145           current_name = current_name.downcase
 146           name = name.downcase
 147         end
 148         if  current_name == name
 149           indices.push( i )
 150         end
 151       end
 152       indices
 153     end
 154
 155     def has?( name, case_sensitive = true, partial_match = false )
 156       for i in 0 ... get_number_of_seqs()
 157         current_name = get_sequence( i ).get_name()
 158         if !case_sensitive
 159           current_name = current_name.downcase
 160           name = name.downcase
 161         end
 162         if current_name == name ||
 163            ( partial_match && current_name.include?( name ) )
 164           return true
 165         end
 166       end
 167       false
 168     end
 169
 170     # throws ArgumentError
 171     def get_by_name( name, case_sensitive = true, partial_match = false )
 172       indices = find_by_name( name, case_sensitive, partial_match )
 173       if ( indices.length > 1 )
 174         error_msg = "\"" + name + "\" not unique"
 175         raise ArgumentError, error_msg
 176       elsif ( indices.length < 1 )
 177         error_msg = "\"" + name + "\" not found"
 178         raise ArgumentError, error_msg
 179       end
 180       get_sequence( indices[ 0 ] )
 181     end
 182
 183     # throws ArgumentError
 184     def get_by_name_start( name, case_sensitive = true )
 185       indices = find_by_name_start( name, case_sensitive )
 186       if indices.length > 1
 187         error_msg = "\"" + name + "\" not unique"
 188         raise ArgumentError, error_msg
 189       elsif  indices.length < 1
 190         error_msg = "\"" + name + "\" not found"
 191         raise ArgumentError, error_msg
 192       end
 193       get_sequence( indices[ 0 ] )
 194     end
 195
 196
 197     def get_sub_alignment( seq_numbers )
 198       msa = Msa.new()
 199       for i in 0 ... seq_numbers.length()
 200         msa.add_sequence( get_sequence( seq_numbers[ i ] ).copy() )
 201       end
 202       return msa
 203     end
 204
 205     def get_number_of_seqs()
 206       @sequences.length
 207     end
 208
 209     def get_length
 210       if !is_aligned()
 211         error_msg = "attempt to get length of unaligned msa"
 212         raise StandardError, error_msg, caller
 213       end
 214       if get_number_of_seqs() < 1
 215         -1
 216       else
 217         @sequences[ 0 ].get_length()
 218       end
 219     end
 220
 221     def to_str
 222       to_fasta
 223     end
 224
 225     def to_fasta
 226       s = String.new
 227       for i in 0...get_number_of_seqs
 228         s += @sequences[ i ].to_fasta + Constants::LINE_DELIMITER
 229       end
 230       s
 231     end
 232
 233
 234     def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 )
 235       if ( !is_aligned() )
 236         error_msg = "attempt to get overlap diagram of unaligned msa"
 237         raise StandardError, error_msg, caller
 238       end
 239       for i in 0 ... get_number_of_seqs()
 240         io.print( Util.normalize_seq_name( get_sequence( i ).get_name(), max_name_length ) )
 241         for j in 0 ... get_number_of_seqs()
 242           if i == j
 243             io.print( " " )
 244           else
 245             if overlap?( i, j, min_overlap )
 246               io.print( "+" )
 247             else
 248               io.print( "-" )
 249             end
 250           end
 251         end
 252         io.print( Evoruby::Constants::LINE_DELIMITER )
 253       end
 254     end
 255
 256     #returns array of Msa with an overlap of min_overlap
 257     def split_into_overlapping_msa( min_overlap = 1 )
 258       if ( !is_aligned() )
 259         error_msg = "attempt to split into overlapping msas of unaligned msa"
 260         raise StandardError, error_msg, caller
 261       end
 262       msas = Array.new()
 263       bins = get_overlaps( min_overlap )
 264       for i in 0 ... bins.length
 265         msas.push( get_sub_alignment( bins[ i ] ) )
 266       end
 267       msas
 268     end
 269
 270     def overlap?( index_1, index_2, min_overlap = 1 )
 271       seq_1 = get_sequence( index_1 )
 272       seq_2 = get_sequence( index_2 )
 273       overlap_count = 0
 274       for i in 0...seq_1.get_length()
 275         if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
 276            !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
 277           overlap_count += 1
 278           if overlap_count >= min_overlap
 279             return true
 280           end
 281         end
 282       end
 283       return false
 284     end
 285
 286     def calculate_overlap( index_1, index_2 )
 287       seq_1 = get_sequence( index_1 )
 288       seq_2 = get_sequence( index_2 )
 289       overlap_count = 0
 290       for i in 0...seq_1.get_length
 291         if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
 292            !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
 293           overlap_count += 1
 294         end
 295       end
 296       overlap_count
 297     end
 298
 299     def calculate_identities( index_1, index_2 )
 300       seq_1 = get_sequence( index_1 )
 301       seq_2 = get_sequence( index_2 )
 302       identities_count = 0
 303       for i in 0...seq_1.get_length
 304         if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
 305            !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) &&
 306            seq_1.get_character_code( i ) != 63 &&
 307            ( seq_1.get_residue( i ).downcase() ==
 308              seq_2.get_residue( i ).downcase() )
 309           identities_count += 1
 310         end
 311       end
 312       identities_count
 313     end
 314
 315     def remove_gap_only_columns!()
 316       remove_columns!( get_gap_only_columns() )
 317     end
 318
 319     def remove_gap_columns!()
 320       remove_columns!( get_gap_columns() )
 321     end
 322
 323     # removes columns for which seqs with gap / number of sequences > gap_ratio
 324     def remove_gap_columns_w_gap_ratio!( gap_ratio )
 325       remove_columns!( get_gap_columns_w_gap_ratio( gap_ratio ) )
 326     end
 327
 328
 329     def remove_sequences_by_gap_ratio!( gap_ratio )
 330       if ( !is_aligned() )
 331         error_msg = "attempt to remove sequences by gap ratio on unaligned msa"
 332         raise StandardError, error_msg, caller
 333       end
 334       n = get_number_of_seqs
 335       removed = Array.new
 336       for s in 0 ... n
 337         if ( get_sequence( ( n - 1 ) - s  ).get_gap_ratio() > gap_ratio )
 338           if ( Evoruby::Constants::VERBOSE )
 339             puts( "removed: " + get_sequence( ( n - 1 ) - s  ).get_name )
 340           end
 341           removed << get_sequence( ( n - 1 ) - s  ).get_name
 342           remove_sequence!( ( n - 1 ) - s  )
 343         end
 344       end
 345       removed
 346     end
 347
 348
 349     def remove_redundant_sequences!( consider_taxonomy = false, verbose = false )
 350       n = get_number_of_seqs
 351       removed = Array.new
 352       to_be_removed = Set.new
 353       @identical_seqs_detected = Array.new
 354       for i in 0 ... ( n - 1 )
 355         for j in ( i + 1 ) ... n
 356           if !to_be_removed.include?( i ) && !to_be_removed.include?( j )
 357             if  !consider_taxonomy ||
 358                ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) ||
 359                  ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) )
 360               if Util.clean_seq_str( get_sequence( i ).get_sequence_as_string ) ==
 361                  Util.clean_seq_str( get_sequence( j ).get_sequence_as_string )
 362                 to_be_removed.add( j )
 363
 364                 tax_i = ""
 365                 tax_j = ""
 366                 if get_sequence( i ).get_taxonomy != nil
 367                   tax_i = get_sequence( i ).get_taxonomy.get_name
 368                 end
 369                 if get_sequence( j ).get_taxonomy != nil
 370                   tax_j = get_sequence( j ).get_taxonomy.get_name
 371                 end
 372                 identical_pair = get_sequence( i ).get_name + " [#{tax_i}] == " + get_sequence( j ).get_name + " [#{tax_j}]"
 373                 @identical_seqs_detected.push( identical_pair )
 374                 if verbose
 375                   puts identical_pair
 376                 end
 377               end
 378             end
 379           end
 380         end
 381       end
 382
 383       to_be_removed_ary = to_be_removed.to_a.sort.reverse
 384
 385       to_be_removed_ary.each { | index |
 386         removed.push( get_sequence( index ).get_name )
 387         remove_sequence!( index )
 388       }
 389       removed
 390     end
 391
 392
 393     def remove_sequences_by_non_gap_length!( min_non_gap_length )
 394       n = get_number_of_seqs
 395       removed = Array.new
 396       for s in 0 ... n
 397         x = ( n - 1 ) - s
 398         seq = get_sequence( x )
 399         if ( ( seq.get_length - seq.get_gap_length ) < min_non_gap_length )
 400           if ( Evoruby::Constants::VERBOSE )
 401             puts( "removed: " + seq.get_name )
 402           end
 403           removed << seq.get_name
 404           remove_sequence!( x )
 405         end
 406       end
 407       removed
 408     end
 409
 410     def trim!( first, last )
 411       cols = Array.new()
 412       for i in 0 ... get_length()
 413         if ( i < first || i > last )
 414           cols.push( i )
 415         end
 416       end
 417       remove_columns!( cols )
 418     end
 419
 420     def get_gap_only_columns()
 421       if ( !is_aligned() )
 422         error_msg = "attempt to get gap only columns of unaligned msa"
 423         raise StandardError, error_msg, caller
 424       end
 425       cols = Array.new()
 426       for c in 0 ... get_length
 427         nogap_char_found = false
 428         for s in 0 ... get_number_of_seqs
 429           unless Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 430             nogap_char_found = true
 431             break
 432           end
 433         end
 434         unless nogap_char_found
 435           cols.push( c )
 436         end
 437       end
 438       return cols
 439     end
 440
 441     def calculate_gap_proportion()
 442       if ( !is_aligned() )
 443         error_msg = "attempt to get gap only columns of unaligned msa"
 444         raise StandardError, error_msg, caller
 445       end
 446       total_sum = 0.0
 447       gap_sum = 0.0
 448       for c in 0 ... get_length
 449         for s in 0 ... get_number_of_seqs
 450           total_sum = total_sum + 1
 451           if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 452             gap_sum = gap_sum  + 1
 453           end
 454         end
 455
 456       end
 457       return gap_sum / total_sum
 458     end
 459
 460     def get_gap_columns()
 461       if ( !is_aligned() )
 462         error_msg = "attempt to get gap columns of unaligned msa"
 463         raise StandardError, error_msg, caller
 464       end
 465       cols = Array.new()
 466       for c in 0 ... get_length
 467         gap_char_found = false
 468         for s in 0 ... get_number_of_seqs
 469           if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 470             gap_char_found = true
 471             break
 472           end
 473         end
 474         if gap_char_found
 475           cols.push( c )
 476         end
 477       end
 478       return cols
 479     end
 480
 481     # gap_ratio = seqs with gap / number of sequences
 482     # returns column indices for which seqs with gap / number of sequences > gap_ratio
 483     def get_gap_columns_w_gap_ratio( gap_ratio )
 484       if ( !is_aligned() )
 485         error_msg = "attempt to get gap columns with gap_ratio of unaligned msa"
 486         raise StandardError, error_msg, caller
 487       end
 488       if ( gap_ratio < 0 || gap_ratio > 1 )
 489         error_msg = "gap ratio must be between 0 and 1 inclusive"
 490         raise ArgumentError, error_msg, caller
 491       end
 492       cols = Array.new()
 493       for c in 0 ... get_length
 494         gap_chars_found = 0
 495         for s in 0 ... get_number_of_seqs
 496           if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 497             gap_chars_found += 1
 498           end
 499         end
 500         if ( ( gap_chars_found.to_f / get_number_of_seqs ) > gap_ratio )
 501           cols.push( c )
 502         end
 503       end
 504       return cols
 505     end
 506
 507
 508     # Split an alignment into n alignemnts of equal size, except last one
 509     def split( n, verbose = false )
 510       if ( n < 2 || n > get_number_of_seqs )
 511         error_msg = "attempt to split into less than two or more than the number of sequences"
 512         raise StandardError, error_msg, caller
 513       end
 514       msas = Array.new()
 515       r = get_number_of_seqs % n
 516       x = get_number_of_seqs / n
 517       for i in 0 ... n
 518         msa = Msa.new()
 519         #s = 0
 520         if ( ( r > 0 ) && ( i == ( n - 1 ) ) )
 521           y = x + r
 522           if ( verbose )
 523             puts( i.to_s + ": " + y.to_s )
 524           end
 525           for j in 0 ... y
 526             msa.add_sequence( get_sequence( ( i * x ) + j ) )
 527           end
 528         else
 529           if ( verbose )
 530             puts( i.to_s + ": " + x.to_s )
 531           end
 532           for j in 0 ... x
 533             msa.add_sequence( get_sequence( ( i * x ) + j ) )
 534           end
 535         end
 536         msas.push( msa )
 537       end
 538       msas
 539     end
 540
 541
 542     private
 543
 544     def get_overlaps( min_overlap = 1 )
 545       if ( !is_aligned() )
 546         error_msg = "attempt to get overlaps of unaligned msa"
 547         raise StandardError, error_msg, caller
 548       end
 549       bins = Array.new()
 550       for i in 0 ... get_number_of_seqs()
 551         found_bin = false
 552         for j in 0 ... bins.length
 553           current_bin = bins[ j ]
 554           # does seq i overlap with all seqs in current_bin?
 555           all_overlap = true
 556           for z in 0 ... current_bin.length
 557             unless overlap?( i, current_bin[ z ], min_overlap )
 558               all_overlap = false
 559               break
 560             end
 561           end
 562           if all_overlap
 563             current_bin.push( i )
 564             found_bin = true
 565           end
 566         end
 567         unless found_bin
 568           new_bin = Array.new()
 569           new_bin.push( i )
 570           bins.push( new_bin )
 571         end
 572       end
 573       return bins
 574     end
 575
 576     def remove_columns!( cols )
 577       if ( !is_aligned() )
 578         error_msg = "attempt to remove columns of unaligned msa"
 579         raise StandardError, error_msg, caller
 580       end
 581       cols.reverse!()
 582       for c in 0 ... cols.length()
 583         col = cols[ c ]
 584         for s in 0 ... get_number_of_seqs()
 585           get_sequence( s ).delete_residue!( col )
 586         end
 587       end
 588       return self
 589     end
 590
 591
 592   end # class Msa
 593
 594 end # module Evoruby
 595