forester/ruby/evoruby/lib/evo/msa/msa.rb

   1 #
   2 # = lib/evo/msa/msa.rb - Msa class
   3 #
   4 # Copyright::    Copyright (C) 2017 Christian M. Zmasek
   5 # License::      GNU Lesser General Public License (LGPL)
   6 #
   7 # Last modified: 2017/03/13
   8
   9 require 'lib/evo/util/constants'
  10 require 'lib/evo/util/util'
  11 require 'lib/evo/sequence/sequence'
  12
  13 module Evoruby
  14   class Msa
  15     def initialize()
  16       @sequences = Array.new
  17       @identical_seqs_detected = Array.new
  18       @name_to_seq_indices = Hash.new
  19       @namestart_to_seq_indices = Hash.new
  20     end
  21
  22     def add_sequence( sequence )
  23       @sequences.push( sequence )
  24     end
  25
  26     def add( name, molecular_sequence_str )
  27       add_sequence( Sequence.new( name, molecular_sequence_str ) )
  28     end
  29
  30     def get_sequence( index )
  31       if ( index < 0 || index > get_number_of_seqs() - 1 )
  32         error_msg = "attempt to get sequence " <<
  33         index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
  34         " sequences"
  35         raise ArgumentError, error_msg
  36       end
  37       return @sequences[ index ]
  38     end
  39
  40     def remove_sequence!( index )
  41       if ( index < 0 || index > get_number_of_seqs() - 1 )
  42         error_msg = "attempt to remove sequence " <<
  43         index.to_s << " in alignment of " << get_number_of_seqs().to_s <<
  44         " sequences"
  45         raise ArgumentError, error_msg
  46       end
  47       @name_to_seq_indices.clear
  48       @namestart_to_seq_indices.clear
  49       @sequences.delete_at( index )
  50     end
  51
  52     def get_identical_seqs_detected
  53       @identical_seqs_detected
  54     end
  55
  56     def is_aligned()
  57       if ( get_number_of_seqs < 1 )
  58         return false
  59       else
  60         l = @sequences[ 0 ].get_length()
  61         for i in 0 ... get_number_of_seqs()
  62           if ( get_sequence( i ).get_length() != l )
  63             return false
  64           end
  65         end
  66       end
  67       return true
  68     end
  69
  70     def find_by_name( name, case_sensitive, partial_match )
  71       if case_sensitive && !partial_match && @name_to_seq_indices.has_key?( name )
  72         return @name_to_seq_indices[ name ]
  73       end
  74       indices = Array.new()
  75       for i in 0 ... get_number_of_seqs()
  76         current_name = get_sequence( i ).get_name()
  77         if case_sensitive && !partial_match
  78           if !@name_to_seq_indices.has_key?( current_name )
  79             @name_to_seq_indices[ current_name ] = []
  80           end
  81           @name_to_seq_indices[ current_name ].push( i )
  82         end
  83         if !case_sensitive
  84           current_name = current_name.downcase
  85           name = name.downcase
  86         end
  87         if current_name == name ||
  88         ( partial_match && current_name.include?( name ) )
  89           indices.push( i )
  90         end
  91       end
  92       indices
  93     end
  94
  95     def find_by_name_pattern( name_re, avoid_similar_to = true )
  96       indices = []
  97       for i in 0 ... get_number_of_seqs()
  98         if avoid_similar_to
  99           m = name_re.match( get_sequence( i ).get_name() )
 100           if m && !m.pre_match.downcase.include?( "similar to " )
 101             indices.push( i )
 102           end
 103         else
 104           if name_re.match( get_sequence( i ).get_name() )
 105             indices.push( i )
 106           end
 107         end
 108       end
 109       indices
 110     end
 111
 112     # throws ArgumentError
 113     def get_by_name_pattern( name_re , avoid_similar_to = true )
 114       indices = find_by_name_pattern( name_re, avoid_similar_to  )
 115       if ( indices.length > 1 )
 116         error_msg = "pattern  " + name_re.to_s + " not unique"
 117         raise ArgumentError, error_msg
 118       elsif ( indices.length < 1 )
 119         error_msg = "pattern " + name_re.to_s + " not found"
 120         raise ArgumentError, error_msg
 121       end
 122       get_sequence( indices[ 0 ] )
 123     end
 124
 125     def find_by_name_start(name, case_sensitive = true)
 126       if case_sensitive && @namestart_to_seq_indices.has_key?( name )
 127         return @namestart_to_seq_indices[ name ]
 128       end
 129       indices = []
 130       for i in 0 ... get_number_of_seqs
 131         get_sequence(i).get_name =~ /^\s*(\S+)/
 132         current_name = $1
 133         if case_sensitive
 134           unless @namestart_to_seq_indices.has_key?(current_name)
 135             @namestart_to_seq_indices[current_name] = []
 136           end
 137           @namestart_to_seq_indices[current_name].push( i )
 138         end
 139         if !case_sensitive
 140           current_name = current_name.downcase
 141           name = name.downcase
 142         end
 143         if current_name == name
 144           indices.push(i)
 145         end
 146       end
 147       indices
 148     end
 149
 150     def has?( name, case_sensitive = true, partial_match = false )
 151       for i in 0 ... get_number_of_seqs()
 152         current_name = get_sequence( i ).get_name()
 153         if !case_sensitive
 154           current_name = current_name.downcase
 155           name = name.downcase
 156         end
 157         if current_name == name ||
 158         ( partial_match && current_name.include?( name ) )
 159           return true
 160         end
 161       end
 162       false
 163     end
 164
 165     # throws ArgumentError
 166     def get_by_name( name, case_sensitive = true, partial_match = false )
 167       indices = find_by_name( name, case_sensitive, partial_match )
 168       if ( indices.length > 1 )
 169         error_msg = "\"" + name + "\" not unique"
 170         raise ArgumentError, error_msg
 171       elsif ( indices.length < 1 )
 172         error_msg = "\"" + name + "\" not found"
 173         raise ArgumentError, error_msg
 174       end
 175       get_sequence( indices[ 0 ] )
 176     end
 177
 178     # throws ArgumentError
 179     def get_by_name_start( name, case_sensitive = true )
 180       indices = find_by_name_start( name, case_sensitive )
 181       if indices.length > 1
 182         error_msg = "\"" + name + "\" not unique"
 183         raise ArgumentError, error_msg
 184       elsif  indices.length < 1
 185         error_msg = "\"" + name + "\" not found"
 186         raise ArgumentError, error_msg
 187       end
 188       get_sequence( indices[ 0 ] )
 189     end
 190
 191     def get_sub_alignment( seq_numbers )
 192       msa = Msa.new()
 193       for i in 0 ... seq_numbers.length()
 194         msa.add_sequence( get_sequence( seq_numbers[ i ] ).copy() )
 195       end
 196       return msa
 197     end
 198
 199     def get_number_of_seqs()
 200       @sequences.length
 201     end
 202
 203     def get_length
 204       if !is_aligned()
 205         error_msg = "attempt to get length of unaligned msa"
 206         raise StandardError, error_msg, caller
 207       end
 208       if get_number_of_seqs() < 1
 209         -1
 210       else
 211         @sequences[ 0 ].get_length()
 212       end
 213     end
 214
 215     def to_str
 216       to_fasta
 217     end
 218
 219     def to_fasta
 220       s = String.new
 221       for i in 0...get_number_of_seqs
 222         s += @sequences[ i ].to_fasta + Constants::LINE_DELIMITER
 223       end
 224       s
 225     end
 226
 227     def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 )
 228       if ( !is_aligned() )
 229         error_msg = "attempt to get overlap diagram of unaligned msa"
 230         raise StandardError, error_msg, caller
 231       end
 232       for i in 0 ... get_number_of_seqs()
 233         io.print( Util.normalize_seq_name( get_sequence( i ).get_name(), max_name_length ) )
 234         for j in 0 ... get_number_of_seqs()
 235           if i == j
 236             io.print( " " )
 237           else
 238             if overlap?( i, j, min_overlap )
 239               io.print( "+" )
 240             else
 241               io.print( "-" )
 242             end
 243           end
 244         end
 245         io.print( Evoruby::Constants::LINE_DELIMITER )
 246       end
 247     end
 248
 249     #returns array of Msa with an overlap of min_overlap
 250     def split_into_overlapping_msa( min_overlap = 1 )
 251       if ( !is_aligned() )
 252         error_msg = "attempt to split into overlapping msas of unaligned msa"
 253         raise StandardError, error_msg, caller
 254       end
 255       msas = Array.new()
 256       bins = get_overlaps( min_overlap )
 257       for i in 0 ... bins.length
 258         msas.push( get_sub_alignment( bins[ i ] ) )
 259       end
 260       msas
 261     end
 262
 263     def overlap?( index_1, index_2, min_overlap = 1 )
 264       seq_1 = get_sequence( index_1 )
 265       seq_2 = get_sequence( index_2 )
 266       overlap_count = 0
 267       for i in 0...seq_1.get_length()
 268         if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
 269         !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
 270           overlap_count += 1
 271           if overlap_count >= min_overlap
 272             return true
 273           end
 274         end
 275       end
 276       return false
 277     end
 278
 279     def calculate_overlap( index_1, index_2 )
 280       seq_1 = get_sequence( index_1 )
 281       seq_2 = get_sequence( index_2 )
 282       overlap_count = 0
 283       for i in 0...seq_1.get_length
 284         if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
 285         !Util.is_aa_gap_character?( seq_2.get_character_code( i ) )
 286           overlap_count += 1
 287         end
 288       end
 289       overlap_count
 290     end
 291
 292     def calculate_identities( index_1, index_2 )
 293       seq_1 = get_sequence( index_1 )
 294       seq_2 = get_sequence( index_2 )
 295       identities_count = 0
 296       for i in 0...seq_1.get_length
 297         if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) &&
 298         !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) &&
 299         seq_1.get_character_code( i ) != 63 &&
 300         ( seq_1.get_residue( i ).downcase() ==
 301         seq_2.get_residue( i ).downcase() )
 302           identities_count += 1
 303         end
 304       end
 305       identities_count
 306     end
 307
 308     def remove_gap_only_columns!()
 309       remove_columns!( get_gap_only_columns() )
 310     end
 311
 312     def remove_gap_columns!()
 313       remove_columns!( get_gap_columns() )
 314     end
 315
 316     # removes columns for which seqs with gap / number of sequences > gap_ratio
 317     def remove_gap_columns_w_gap_ratio!( gap_ratio )
 318       remove_columns!( get_gap_columns_w_gap_ratio( gap_ratio ) )
 319     end
 320
 321     def remove_sequences_by_gap_ratio!( gap_ratio )
 322       if ( !is_aligned() )
 323         error_msg = "attempt to remove sequences by gap ratio on unaligned msa"
 324         raise StandardError, error_msg, caller
 325       end
 326       n = get_number_of_seqs
 327       removed = Array.new
 328       for s in 0 ... n
 329         if ( get_sequence( ( n - 1 ) - s  ).get_gap_ratio() > gap_ratio )
 330           if ( Evoruby::Constants::VERBOSE )
 331             puts( "removed: " + get_sequence( ( n - 1 ) - s  ).get_name )
 332           end
 333           removed << get_sequence( ( n - 1 ) - s  ).get_name
 334           remove_sequence!( ( n - 1 ) - s  )
 335         end
 336       end
 337       removed
 338     end
 339
 340     def remove_redundant_sequences!( consider_taxonomy = false, verbose = false )
 341       n = get_number_of_seqs
 342       removed = Array.new
 343       to_be_removed = Set.new
 344       @identical_seqs_detected = Array.new
 345       for i in 0 ... ( n - 1 )
 346         for j in ( i + 1 ) ... n
 347           if !to_be_removed.include?( i ) && !to_be_removed.include?( j )
 348             if  !consider_taxonomy ||
 349             ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) ||
 350             ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) )
 351               if Util.clean_seq_str( get_sequence( i ).get_sequence_as_string ) ==
 352               Util.clean_seq_str( get_sequence( j ).get_sequence_as_string )
 353                 to_be_removed.add( j )
 354
 355                 tax_i = ""
 356                 tax_j = ""
 357                 if get_sequence( i ).get_taxonomy != nil
 358                   tax_i = get_sequence( i ).get_taxonomy.get_name
 359                 end
 360                 if get_sequence( j ).get_taxonomy != nil
 361                   tax_j = get_sequence( j ).get_taxonomy.get_name
 362                 end
 363                 identical_pair = get_sequence( i ).get_name + " [#{tax_i}] == " + get_sequence( j ).get_name + " [#{tax_j}]"
 364                 @identical_seqs_detected.push( identical_pair )
 365                 if verbose
 366                   puts identical_pair
 367                 end
 368               end
 369             end
 370           end
 371         end
 372       end
 373
 374       to_be_removed_ary = to_be_removed.to_a.sort.reverse
 375
 376       to_be_removed_ary.each { | index |
 377         removed.push( get_sequence( index ).get_name )
 378         remove_sequence!( index )
 379       }
 380       removed
 381     end
 382
 383     def remove_sequences_by_non_gap_length!( min_non_gap_length )
 384       n = get_number_of_seqs
 385       removed = Array.new
 386       for s in 0 ... n
 387         x = ( n - 1 ) - s
 388         seq = get_sequence( x )
 389         if ( ( seq.get_length - seq.get_gap_length ) < min_non_gap_length )
 390           if ( Evoruby::Constants::VERBOSE )
 391             puts( "removed: " + seq.get_name )
 392           end
 393           removed << seq.get_name
 394           remove_sequence!( x )
 395         end
 396       end
 397       removed
 398     end
 399
 400     def trim!( first, last )
 401       cols = Array.new()
 402       for i in 0 ... get_length()
 403         if ( i < first || i > last )
 404           cols.push( i )
 405         end
 406       end
 407       remove_columns!( cols )
 408     end
 409
 410     def get_gap_only_columns()
 411       if ( !is_aligned() )
 412         error_msg = "attempt to get gap only columns of unaligned msa"
 413         raise StandardError, error_msg, caller
 414       end
 415       cols = Array.new()
 416       for c in 0 ... get_length
 417         nogap_char_found = false
 418         for s in 0 ... get_number_of_seqs
 419           unless Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 420             nogap_char_found = true
 421             break
 422           end
 423         end
 424         unless nogap_char_found
 425           cols.push( c )
 426         end
 427       end
 428       return cols
 429     end
 430
 431     def calculate_gap_proportion()
 432       if ( !is_aligned() )
 433         error_msg = "attempt to get gap only columns of unaligned msa"
 434         raise StandardError, error_msg, caller
 435       end
 436       total_sum = 0.0
 437       gap_sum = 0.0
 438       for c in 0 ... get_length
 439         for s in 0 ... get_number_of_seqs
 440           total_sum = total_sum + 1
 441           if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 442             gap_sum = gap_sum  + 1
 443           end
 444         end
 445
 446       end
 447       return gap_sum / total_sum
 448     end
 449
 450     def get_gap_columns()
 451       if ( !is_aligned() )
 452         error_msg = "attempt to get gap columns of unaligned msa"
 453         raise StandardError, error_msg, caller
 454       end
 455       cols = Array.new()
 456       for c in 0 ... get_length
 457         gap_char_found = false
 458         for s in 0 ... get_number_of_seqs
 459           if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 460             gap_char_found = true
 461             break
 462           end
 463         end
 464         if gap_char_found
 465           cols.push( c )
 466         end
 467       end
 468       return cols
 469     end
 470
 471     # gap_ratio = seqs with gap / number of sequences
 472     # returns column indices for which seqs with gap / number of sequences > gap_ratio
 473     def get_gap_columns_w_gap_ratio( gap_ratio )
 474       if ( !is_aligned() )
 475         error_msg = "attempt to get gap columns with gap_ratio of unaligned msa"
 476         raise StandardError, error_msg, caller
 477       end
 478       if ( gap_ratio < 0 || gap_ratio > 1 )
 479         error_msg = "gap ratio must be between 0 and 1 inclusive"
 480         raise ArgumentError, error_msg, caller
 481       end
 482       cols = Array.new()
 483       for c in 0 ... get_length
 484         gap_chars_found = 0
 485         for s in 0 ... get_number_of_seqs
 486           if Util.is_aa_gap_character?( get_sequence( s ).get_character_code( c ) )
 487             gap_chars_found += 1
 488           end
 489         end
 490         if ( ( gap_chars_found.to_f / get_number_of_seqs ) > gap_ratio )
 491           cols.push( c )
 492         end
 493       end
 494       return cols
 495     end
 496
 497     # Split an alignment into n alignmnts of equal size, except last one
 498     def split( n, verbose = false )
 499       if ( n < 2 || n > get_number_of_seqs )
 500         error_msg = "attempt to split into less than two or more than the number of sequences"
 501         raise StandardError, error_msg, caller
 502       end
 503       msas = Array.new()
 504       r = get_number_of_seqs % n
 505       x = get_number_of_seqs / n
 506       for i in 0 ... n
 507         msa = Msa.new()
 508         #s = 0
 509         if ( ( r > 0 ) && ( i == ( n - 1 ) ) )
 510           y = x + r
 511           if ( verbose )
 512             puts( i.to_s + ": " + y.to_s )
 513           end
 514           for j in 0 ... y
 515             msa.add_sequence( get_sequence( ( i * x ) + j ) )
 516           end
 517         else
 518           if ( verbose )
 519             puts( i.to_s + ": " + x.to_s )
 520           end
 521           for j in 0 ... x
 522             msa.add_sequence( get_sequence( ( i * x ) + j ) )
 523           end
 524         end
 525         msas.push( msa )
 526       end
 527       msas
 528     end
 529
 530     def split_by_os(verbose = false)
 531       msa_hash = Hash.new()
 532       for i in 0 ... get_number_of_seqs()
 533         seq = get_sequence(i)
 534         name = seq.get_name()
 535         # >sp|Q1HVE7|AN_EBVA8 Shutoff alkaline exonuclease OS=Epstein-Barr virus (strain AG876) GN=BGLF5 PE=3 SV=1
 536        # if name =~ /OS=(.+?)\s+[A-Z]{2}=/
 537         if name =~ /Organism:(.+?)(\|Protein|$)/
 538           os = $1
 539           unless  msa_hash.has_key?(os)
 540             msa_hash[os] = Msa.new
 541           end
 542           msa_hash[os].add_sequence seq
 543         else
 544           error_msg = "sequence name \"" + name +"\" is not in the expected format for splitting by OS"
 545           raise IOError, error_msg, caller
 546         end
 547       end
 548       msa_hash = msa_hash.sort{|a, b|a<=>b}.to_h
 549       if verbose
 550         c = 0
 551         msa_hash.each do |os, msa|
 552           c += 1
 553           puts c.to_s + ': ' + os
 554         end
 555       end
 556       msa_hash
 557     end
 558
 559     private
 560
 561     def get_overlaps( min_overlap = 1 )
 562       if ( !is_aligned() )
 563         error_msg = "attempt to get overlaps of unaligned msa"
 564         raise StandardError, error_msg, caller
 565       end
 566       bins = Array.new()
 567       for i in 0 ... get_number_of_seqs()
 568         found_bin = false
 569         for j in 0 ... bins.length
 570           current_bin = bins[ j ]
 571           # does seq i overlap with all seqs in current_bin?
 572           all_overlap = true
 573           for z in 0 ... current_bin.length
 574             unless overlap?( i, current_bin[ z ], min_overlap )
 575               all_overlap = false
 576               break
 577             end
 578           end
 579           if all_overlap
 580             current_bin.push( i )
 581             found_bin = true
 582           end
 583         end
 584         unless found_bin
 585           new_bin = Array.new()
 586           new_bin.push( i )
 587           bins.push( new_bin )
 588         end
 589       end
 590       return bins
 591     end
 592
 593     def remove_columns!( cols )
 594       if ( !is_aligned() )
 595         error_msg = "attempt to remove columns of unaligned msa"
 596         raise StandardError, error_msg, caller
 597       end
 598       cols.reverse!()
 599       for c in 0 ... cols.length()
 600         col = cols[ c ]
 601         for s in 0 ... get_number_of_seqs()
 602           get_sequence( s ).delete_residue!( col )
 603         end
 604       end
 605       return self
 606     end
 607
 608   end # class Msa
 609
 610 end # module Evoruby
 611