forester/ruby/evoruby/lib/evo/msa/msa.rb

   1 #
   2 # = lib/evo/msa/msa.rb - Msa class
   3 #
   4 # Copyright::    Copyright (C) 2017 Christian M. Zmasek
   5 # License::      GNU Lesser General Public License (LGPL)
   6 #
   7 # Last modified: 2017/03/13
   8
   9 require 'lib/evo/util/constants'
  10 require 'lib/evo/util/util'
  11 require 'lib/evo/sequence/sequence'
  12
  13 module Evoruby
  14   class Msa
  15     def initialize()
  16       @sequences = Array.new
  17       @identical_seqs_detected = Array.new
  18       @name_to_seq_indices = Hash.new
  19       @namestart_to_seq_indices = Hash.new
  20       @name = nil
  21     end
  22
  23     def get_name
  24       @name
  25     end
  26
  27     def set_name( name )
  28       @name = name
  29     end
  30
  31     def add_sequence( sequence )
  32       @sequences.push( sequence )
  33     end
  34
  35     def add( name, molecular_sequence_str )
  36       add_sequence( Sequence.new( name, molecular_sequence_str ) )
  37     end
  38
  39     def get_sequence( index )
  40       if ( index < 0 || index > get_number_of_seqs() - 1 )
  41         error_msg = "attempt to get sequence " <<
  42         index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
  43         " sequences"
  44         raise ArgumentError, error_msg
  45       end
  46       return @sequences[ index ]
  47     end
  48
  49     def remove_sequence!( index )
  50       if ( index < 0 || index > get_number_of_seqs() - 1 )
  51         error_msg = "attempt to remove sequence " <<
  52         index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
  53         " sequences"
  54         raise ArgumentError, error_msg
  55       end
  56       @name_to_seq_indices.clear
  57       @namestart_to_seq_indices.clear
  58       @sequences.delete_at( index )
  59     end
  60
  61     def get_identical_seqs_detected
  62       @identical_seqs_detected
  63     end
  64
  65     def is_aligned()
  66       if ( get_number_of_seqs < 1 )
  67         return false
  68       else
  69         l = @sequences[ 0 ].get_length()
  70         for i in 0 ... get_number_of_seqs()
  71           if ( get_sequence( i ).get_length() != l )
  72             return false
  73           end
  74         end
  75       end
  76       return true
  77     end
  78
  79     def find_by_name( name, case_sensitive, partial_match )
  80       if case_sensitive && !partial_match && @name_to_seq_indices.has_key?( name )
  81         return @name_to_seq_indices[ name ]
  82       end
  83       indices = Array.new()
  84       for i in 0 ... get_number_of_seqs()
  85         current_name = get_sequence( i ).get_name()
  86         if case_sensitive && !partial_match
  87           if !@name_to_seq_indices.has_key?( current_name )
  88             @name_to_seq_indices[ current_name ] = []
  89           end
  90           @name_to_seq_indices[ current_name ].push( i )
  91         end
  92         if !case_sensitive
  93           current_name = current_name.downcase
  94           name = name.downcase
  95         end
  96         if current_name == name ||
  97         ( partial_match && current_name.include?( name ) )
  98           indices.push( i )
  99         end
 100       end
 101       indices
 102     end
 103
 104     def find_by_name_pattern( name_re, avoid_similar_to = true )
 105       indices = []
 106       for i in 0 ... get_number_of_seqs()
 107         if avoid_similar_to
 108           m = name_re.match( get_sequence( i ).get_name() )
 109           if m && !m.pre_match.downcase.include?( "similar to " )
 110             indices.push( i )
 111           end
 112         else
 113           if name_re.match( get_sequence( i ).get_name() )
 114             indices.push( i )
 115           end
 116         end
 117       end
 118       indices
 119     end
 120
 121     # throws ArgumentError
 122     def get_by_name_pattern( name_re , avoid_similar_to = true )
 123       indices = find_by_name_pattern( name_re, avoid_similar_to  )
 124       if ( indices.length > 1 )
 125         error_msg = "pattern  " + name_re.to_s + " not unique"
 126         raise ArgumentError, error_msg
 127       elsif ( indices.length < 1 )
 128         error_msg = "pattern " + name_re.to_s + " not found"
 129         raise ArgumentError, error_msg
 130       end
 131       get_sequence( indices[ 0 ] )
 132     end
 133
 134     def find_by_name_start(name, case_sensitive = true)
 135       if case_sensitive && @namestart_to_seq_indices.has_key?( name )
 136         return @namestart_to_seq_indices[ name ]
 137       end
 138       indices = []
 139       for i in 0 ... get_number_of_seqs
 140         get_sequence(i).get_name =~ /^\s*(\S+)/
 141         current_name = $1
 142         if case_sensitive
 143           unless @namestart_to_seq_indices.has_key?(current_name)
 144             @namestart_to_seq_indices[current_name] = []
 145           end
 146           @namestart_to_seq_indices[current_name].push( i )
 147         end
 148         if !case_sensitive
 149           current_name = current_name.downcase
 150           name = name.downcase
 151         end
 152         if current_name == name
 153           indices.push(i)
 154         end
 155       end
 156       indices
 157     end
 158
 159     def has?( name, case_sensitive = true, partial_match = false )
 160       for i in 0 ... get_number_of_seqs()
 161         current_name = get_sequence( i ).get_name()
 162         if !case_sensitive
 163           current_name = current_name.downcase
 164           name = name.downcase
 165         end
 166         if current_name == name ||
 167         ( partial_match && current_name.include?( name ) )
 168           return true
 169         end
 170       end
 171       false
 172     end
 173
 174     # throws ArgumentError
 175     def get_by_name( name, case_sensitive = true, partial_match = false )
 176       indices = find_by_name( name, case_sensitive, partial_match )
 177       if ( indices.length > 1 )
 178         error_msg = "\"" + name + "\" not unique"
 179         raise ArgumentError, error_msg
 180       elsif ( indices.length < 1 )
 181         error_msg = "\"" + name + "\" not found"
 182         raise ArgumentError, error_msg
 183       end
 184       get_sequence( indices[ 0 ] )
 185     end
 186
 187     # throws ArgumentError
 188     def get_by_name_start( name, case_sensitive = true )
 189       indices = find_by_name_start( name, case_sensitive )
 190       if indices.length > 1
 191         error_msg = "\"" + name + "\" not unique"
 192         raise ArgumentError, error_msg
 193       elsif  indices.length < 1
 194         error_msg = "\"" + name + "\" not found"
 195         raise ArgumentError, error_msg
 196       end
 197       get_sequence( indices[ 0 ] )
 198     end
 199
 200     def get_sub_alignment( seq_numbers )
 201       msa = Msa.new()
 202       for i in 0 ... seq_numbers.length()
 203         msa.add_sequence( get_sequence( seq_numbers[ i ] ).copy() )
 204       end
 205       return msa
 206     end
 207
 208     def get_number_of_seqs()
 209       @sequences.length
 210     end
 211
 212     def get_length
 213       if !is_aligned()
 214         error_msg = "attempt to get length of unaligned msa"
 215         raise StandardError, error_msg, caller
 216       end
 217       if get_number_of_seqs() < 1
 218         -1
 219       else
 220         @sequences[ 0 ].get_length()
 221       end
 222     end
 223
 224     def to_str
 225       to_fasta
 226     end
 227
 228     def to_fasta
 229       s = String.new
 230       for i in 0...get_number_of_seqs
 231         s += @sequences[ i ].to_fasta + Constants::LINE_DELIMITER
 232       end
 233       s
 234     end
 235
 236     def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 )
 237       if ( !is_aligned() )
 238         error_msg = "attempt to get overlap diagram of unaligned msa"
 239         raise StandardError, error_msg, caller
 240       end
 241       for i in 0 ... get_number_of_seqs()
 242         io.print( Util.normalize_seq_name( get_sequence( i ).get_name(), max_name_length ) )
 243         for j in 0 ... get_number_of_seqs()
 244           if i == j
 245             io.print( " " )
 246           else
 247             if overlap?( i, j, min_overlap )
 248               io.print( "+" )
 249             else
 250               io.print( "-" )
 251             end
 252           end
 253         end
 254         io.print( Evoruby::Constants::LINE_DELIMITER )
 255       end
 256     end
 257
 258     #returns array of Msa with an overlap of min_overlap
 259     def split_into_overlapping_msa( min_overlap = 1 )
 260       if ( !is_aligned() )
 261         error_msg = "attempt to split into overlapping msas of unaligned msa"
 262         raise StandardError, error_msg, caller
 263       end
 264       msas = Array.new()
 265       bins = get_overlaps( min_overlap )
 266       for i in 0 ... bins.length
 267         msas.push( get_sub_alignment( bins[ i ] ) )
 268       end
 269       msas
 270     end
 271
 272     def overlap?( index_1, index_2, min_overlap = 1 )
 273       seq_1 = get_sequence( index_1 )
 274       seq_2 = get_sequence( index_2 )
 275       overlap_count = 0
 276       for i in 0...seq_1.get_length()
 277         if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
 278         !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
 279           overlap_count += 1
 280           if overlap_count >= min_overlap
 281             return true
 282           end
 283         end
 284       end
 285       return false
 286     end
 287
 288     def calculate_overlap( index_1, index_2 )
 289       seq_1 = get_sequence( index_1 )
 290       seq_2 = get_sequence( index_2 )
 291       overlap_count = 0
 292       for i in 0...seq_1.get_length
 293         if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
 294         !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
 295           overlap_count += 1
 296         end
 297       end
 298       overlap_count
 299     end
 300
 301     def calculate_identities( index_1, index_2 )
 302       seq_1 = get_sequence( index_1 )
 303       seq_2 = get_sequence( index_2 )
 304       identities_count = 0
 305       for i in 0...seq_1.get_length
 306         if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
 307         !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) &&
 308         seq_1.get_character_code( i ) != 63 &&
 309         ( seq_1.get_residue( i ).downcase() ==
 310         seq_2.get_residue( i ).downcase() )
 311           identities_count += 1
 312         end
 313       end
 314       identities_count
 315     end
 316
 317     def remove_gap_only_columns!()
 318       remove_columns!( get_gap_only_columns() )
 319     end
 320
 321     def remove_gap_columns!()
 322       remove_columns!( get_gap_columns() )
 323     end
 324
 325     # removes columns for which seqs with gap / number of sequences > gap_ratio
 326     def remove_gap_columns_w_gap_ratio!( gap_ratio )
 327       remove_columns!( get_gap_columns_w_gap_ratio( gap_ratio ) )
 328     end
 329
 330     def remove_sequences_by_gap_ratio!( gap_ratio )
 331       if ( !is_aligned() )
 332         error_msg = "attempt to remove sequences by gap ratio on unaligned msa"
 333         raise StandardError, error_msg, caller
 334       end
 335       n = get_number_of_seqs
 336       removed = Array.new
 337       for s in 0 ... n
 338         if ( get_sequence( ( n - 1 ) - s  ).get_gap_ratio() > gap_ratio )
 339           if ( Evoruby::Constants::VERBOSE )
 340             puts( "removed: " + get_sequence( ( n - 1 ) - s  ).get_name )
 341           end
 342           removed << get_sequence( ( n - 1 ) - s  ).get_name
 343           remove_sequence!( ( n - 1 ) - s  )
 344         end
 345       end
 346       removed
 347     end
 348
 349     def remove_redundant_sequences!( consider_taxonomy = false, verbose = false )
 350       n = get_number_of_seqs
 351       removed = Array.new
 352       to_be_removed = Set.new
 353       @identical_seqs_detected = Array.new
 354       for i in 0 ... ( n - 1 )
 355         for j in ( i + 1 ) ... n
 356           if !to_be_removed.include?( i ) && !to_be_removed.include?( j )
 357             if  !consider_taxonomy ||
 358             ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) ||
 359             ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) )
 360               if Util.clean_seq_str( get_sequence( i ).get_sequence_as_string ) ==
 361               Util.clean_seq_str( get_sequence( j ).get_sequence_as_string )
 362                 to_be_removed.add( j )
 363
 364                 tax_i = ""
 365                 tax_j = ""
 366                 if get_sequence( i ).get_taxonomy != nil
 367                   tax_i = get_sequence( i ).get_taxonomy.get_name
 368                 end
 369                 if get_sequence( j ).get_taxonomy != nil
 370                   tax_j = get_sequence( j ).get_taxonomy.get_name
 371                 end
 372                 identical_pair = get_sequence( i ).get_name + " [#{tax_i}] == " + get_sequence( j ).get_name + " [#{tax_j}]"
 373                 @identical_seqs_detected.push( identical_pair )
 374                 if verbose
 375                   puts identical_pair
 376                 end
 377               end
 378             end
 379           end
 380         end
 381       end
 382
 383       to_be_removed_ary = to_be_removed.to_a.sort.reverse
 384
 385       to_be_removed_ary.each { | index |
 386         removed.push( get_sequence( index ).get_name )
 387         remove_sequence!( index )
 388       }
 389       removed
 390     end
 391
 392     def remove_sequences_by_non_gap_length!( min_non_gap_length )
 393       n = get_number_of_seqs
 394       removed = Array.new
 395       for s in 0 ... n
 396         x = ( n - 1 ) - s
 397         seq = get_sequence( x )
 398         if ( ( seq.get_length - seq.get_gap_length ) < min_non_gap_length )
 399           if ( Evoruby::Constants::VERBOSE )
 400             puts( "removed: " + seq.get_name )
 401           end
 402           removed << seq.get_name
 403           remove_sequence!( x )
 404         end
 405       end
 406       removed
 407     end
 408
 409     def trim!( first, last, name_suffix = nil )
 410       cols = Array.new()
 411       for i in 0 ... get_length()
 412         if ( i < first || i > last )
 413           cols.push( i )
 414         end
 415       end
 416       remove_columns!( cols )
 417       if name_suffix != nil
 418         n = get_number_of_seqs
 419         for s in 0 ... n
 420           seq = get_sequence( s )
 421           seq.set_name( seq.get_name() + name_suffix );
 422         end
 423       end
 424     end
 425
 426     def extract( first, last, min_non_gap_length, suffix )
 427       if !is_aligned()
 428         error_msg = "attempt to extract from unaligned msa"
 429         raise StandardError, error_msg, caller
 430       end
 431       if first < 0
 432         error_msg = "first < 0"
 433         raise StandardError, error_msg, caller
 434       end
 435       if last >= get_length()
 436         error_msg = "last > length"
 437         raise StandardError, error_msg, caller
 438       end
 439       if first >= last
 440         error_msg = "first >= last"
 441         raise StandardError, error_msg, caller
 442       end
 443       msa = Msa.new()
 444       for i in 0 ... get_number_of_seqs
 445         subseq = get_sequence( i ).get_subsequence( first, last )
 446         unless ( ( subseq.get_length - subseq.get_gap_length ) < min_non_gap_length )
 447           if suffix != nil
 448             msa.add( subseq.get_name + suffix, subseq.get_sequence_as_string )
 449           else
 450             msa.add( subseq.get_name, subseq.get_sequence_as_string )
 451           end
 452         end
 453       end
 454
 455       msa
 456     end
 457
 458     def sliding_extraction( step, size, min_non_gap_length, suffix = nil )
 459       counter = 0
 460       done = false
 461       msas = Array.new()
 462       while !done
 463         first = counter * step
 464         last = first + size - 1
 465         if last > get_length() - 1
 466           last = get_length() - 1
 467           done = true
 468         end
 469         unless first >= last
 470           counter +=1
 471           res = extract(first, last, min_non_gap_length, suffix)
 472           res.set_name(first.to_s + "-" + last.to_s)
 473           msas << res
 474         end
 475       end
 476       msas
 477     end
 478
 479     def get_gap_only_columns()
 480       if ( !is_aligned() )
 481         error_msg = "attempt to get gap only columns of unaligned msa"
 482         raise StandardError, error_msg, caller
 483       end
 484       cols = Array.new()
 485       for c in 0 ... get_length
 486         nogap_char_found = false
 487         for s in 0 ... get_number_of_seqs
 488           unless Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 489             nogap_char_found = true
 490             break
 491           end
 492         end
 493         unless nogap_char_found
 494           cols.push( c )
 495         end
 496       end
 497       return cols
 498     end
 499
 500     def calculate_gap_proportion()
 501       if ( !is_aligned() )
 502         error_msg = "attempt to get gap only columns of unaligned msa"
 503         raise StandardError, error_msg, caller
 504       end
 505       total_sum = 0.0
 506       gap_sum = 0.0
 507       for c in 0 ... get_length
 508         for s in 0 ... get_number_of_seqs
 509           total_sum = total_sum + 1
 510           if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 511             gap_sum = gap_sum  + 1
 512           end
 513         end
 514
 515       end
 516       return gap_sum / total_sum
 517     end
 518
 519     def get_gap_columns()
 520       if ( !is_aligned() )
 521         error_msg = "attempt to get gap columns of unaligned msa"
 522         raise StandardError, error_msg, caller
 523       end
 524       cols = Array.new()
 525       for c in 0 ... get_length
 526         gap_char_found = false
 527         for s in 0 ... get_number_of_seqs
 528           if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 529             gap_char_found = true
 530             break
 531           end
 532         end
 533         if gap_char_found
 534           cols.push( c )
 535         end
 536       end
 537       return cols
 538     end
 539
 540     # gap_ratio = seqs with gap / number of sequences
 541     # returns column indices for which seqs with gap / number of sequences > gap_ratio
 542     def get_gap_columns_w_gap_ratio( gap_ratio )
 543       if ( !is_aligned() )
 544         error_msg = "attempt to get gap columns with gap_ratio of unaligned msa"
 545         raise StandardError, error_msg, caller
 546       end
 547       if ( gap_ratio < 0 || gap_ratio > 1 )
 548         error_msg = "gap ratio must be between 0 and 1 inclusive"
 549         raise ArgumentError, error_msg, caller
 550       end
 551       cols = Array.new()
 552       for c in 0 ... get_length
 553         gap_chars_found = 0
 554         for s in 0 ... get_number_of_seqs
 555           if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 556             gap_chars_found += 1
 557           end
 558         end
 559         if ( ( gap_chars_found.to_f / get_number_of_seqs ) > gap_ratio )
 560           cols.push( c )
 561         end
 562       end
 563       return cols
 564     end
 565
 566     # Split an alignment into n alignmnts of equal size, except last one
 567     def split( n, verbose = false )
 568       if ( n < 2 || n > get_number_of_seqs )
 569         error_msg = "attempt to split into less than two or more than the number of sequences"
 570         raise StandardError, error_msg, caller
 571       end
 572       msas = Array.new()
 573       r = get_number_of_seqs % n
 574       x = get_number_of_seqs / n
 575       for i in 0 ... n
 576         msa = Msa.new()
 577         #s = 0
 578         if ( ( r > 0 ) && ( i == ( n - 1 ) ) )
 579           y = x + r
 580           if ( verbose )
 581             puts( i.to_s + ": " + y.to_s )
 582           end
 583           for j in 0 ... y
 584             msa.add_sequence( get_sequence( ( i * x ) + j ) )
 585           end
 586         else
 587           if ( verbose )
 588             puts( i.to_s + ": " + x.to_s )
 589           end
 590           for j in 0 ... x
 591             msa.add_sequence( get_sequence( ( i * x ) + j ) )
 592           end
 593         end
 594         msas.push( msa )
 595       end
 596       msas
 597     end
 598
 599     def split_by_os(verbose = false)
 600       msa_hash = Hash.new()
 601       for i in 0 ... get_number_of_seqs()
 602         seq = get_sequence(i)
 603         name = seq.get_name()
 604         # >sp|Q1HVE7|AN_EBVA8 Shutoff alkaline exonuclease OS=Epstein-Barr virus (strain AG876) GN=BGLF5 PE=3 SV=1
 605         # if name =~ /OS=(.+?)\s+[A-Z]{2}=/
 606         if name =~ /Organism:(.+?)(\|Protein|$)/
 607           os = $1
 608           unless  msa_hash.has_key?(os)
 609             msa_hash[os] = Msa.new
 610           end
 611           msa_hash[os].add_sequence seq
 612         else
 613           error_msg = "sequence name \"" + name + "\" is not in the expected format for splitting by OS"
 614           raise IOError, error_msg, caller
 615         end
 616       end
 617       msa_hash = msa_hash.sort{|a, b|a<=>b}.to_h
 618       if verbose
 619         c = 0
 620         msa_hash.each do |o, msa|
 621           c += 1
 622           puts c.to_s + ': ' + o
 623         end
 624       end
 625       msa_hash
 626     end
 627
 628     private
 629
 630     def get_overlaps( min_overlap = 1 )
 631       if ( !is_aligned() )
 632         error_msg = "attempt to get overlaps of unaligned msa"
 633         raise StandardError, error_msg, caller
 634       end
 635       bins = Array.new()
 636       for i in 0 ... get_number_of_seqs()
 637         found_bin = false
 638         for j in 0 ... bins.length
 639           current_bin = bins[ j ]
 640           # does seq i overlap with all seqs in current_bin?
 641           all_overlap = true
 642           for z in 0 ... current_bin.length
 643             unless overlap?( i, current_bin[ z ], min_overlap )
 644               all_overlap = false
 645               break
 646             end
 647           end
 648           if all_overlap
 649             current_bin.push( i )
 650             found_bin = true
 651           end
 652         end
 653         unless found_bin
 654           new_bin = Array.new()
 655           new_bin.push( i )
 656           bins.push( new_bin )
 657         end
 658       end
 659       return bins
 660     end
 661
 662     def remove_columns!( cols )
 663       if ( !is_aligned() )
 664         error_msg = "attempt to remove columns of unaligned msa"
 665         raise StandardError, error_msg, caller
 666       end
 667       cols.reverse!()
 668       for c in 0 ... cols.length()
 669         col = cols[ c ]
 670         for s in 0 ... get_number_of_seqs()
 671           get_sequence( s ).delete_residue!( col )
 672         end
 673       end
 674       return self
 675     end
 676
 677   end # class Msa
 678
 679 end # module Evoruby
 680