X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fruby%2Fevoruby%2Flib%2Fevo%2Fmsa%2Fmsa.rb;h=a61e62ab73889fdb693a94a1c7457b4ebb80a4dd;hb=9fe0c828e4a8e1f61a1acba1e7b8439dd5edc598;hp=69a1f30ebea84d26ba9a999f175629854a5e3bf5;hpb=2dd62272307b5ebdb33fabdc27dabc00e3ad601d;p=jalview.git diff --git a/forester/ruby/evoruby/lib/evo/msa/msa.rb b/forester/ruby/evoruby/lib/evo/msa/msa.rb index 69a1f30..a61e62a 100644 --- a/forester/ruby/evoruby/lib/evo/msa/msa.rb +++ b/forester/ruby/evoruby/lib/evo/msa/msa.rb @@ -1,26 +1,32 @@ # # = lib/evo/msa/msa.rb - Msa class # -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) # -# $Id: msa.rb,v 1.11 2009/01/03 00:42:08 cmzmasek Exp $ -# - +# Last modified: 2017/03/13 require 'lib/evo/util/constants' require 'lib/evo/util/util' require 'lib/evo/sequence/sequence' module Evoruby - class Msa - def initialize() @sequences = Array.new @identical_seqs_detected = Array.new + @name_to_seq_indices = Hash.new + @namestart_to_seq_indices = Hash.new + @name = nil end + def get_name + @name + end + + def set_name( name ) + @name = name + end def add_sequence( sequence ) @sequences.push( sequence ) @@ -33,8 +39,8 @@ module Evoruby def get_sequence( index ) if ( index < 0 || index > get_number_of_seqs() - 1 ) error_msg = "attempt to get sequence " << - index.to_s << " in alignment of " << get_number_of_seqs().to_s << - " sequences" + index.to_s << " in alignment of " << get_number_of_seqs().to_s << + " sequences" raise ArgumentError, error_msg end return @sequences[ index ] @@ -43,10 +49,12 @@ module Evoruby def remove_sequence!( index ) if ( index < 0 || index > get_number_of_seqs() - 1 ) error_msg = "attempt to remove sequence " << - index.to_s << " in alignment of " << get_number_of_seqs().to_s << - " sequences" + index.to_s << " in alignment of " << get_number_of_seqs().to_s << + " sequences" raise ArgumentError, error_msg end + @name_to_seq_indices.clear + @namestart_to_seq_indices.clear @sequences.delete_at( index ) end @@ -54,7 +62,6 @@ module Evoruby @identical_seqs_detected end - def is_aligned() if ( get_number_of_seqs < 1 ) return false @@ -70,15 +77,24 @@ module Evoruby end def find_by_name( name, case_sensitive, partial_match ) + if case_sensitive && !partial_match && @name_to_seq_indices.has_key?( name ) + return @name_to_seq_indices[ name ] + end indices = Array.new() for i in 0 ... get_number_of_seqs() current_name = get_sequence( i ).get_name() + if case_sensitive && !partial_match + if !@name_to_seq_indices.has_key?( current_name ) + @name_to_seq_indices[ current_name ] = [] + end + @name_to_seq_indices[ current_name ].push( i ) + end if !case_sensitive current_name = current_name.downcase name = name.downcase end if current_name == name || - ( partial_match && current_name.include?( name ) ) + ( partial_match && current_name.include?( name ) ) indices.push( i ) end end @@ -115,17 +131,26 @@ module Evoruby get_sequence( indices[ 0 ] ) end - def find_by_name_start( name, case_sensitive ) + def find_by_name_start(name, case_sensitive = true) + if case_sensitive && @namestart_to_seq_indices.has_key?( name ) + return @namestart_to_seq_indices[ name ] + end indices = [] - for i in 0 ... get_number_of_seqs() - get_sequence( i ).get_name() =~ /^\s*(\S+)/ + for i in 0 ... get_number_of_seqs + get_sequence(i).get_name =~ /^\s*(\S+)/ current_name = $1 + if case_sensitive + unless @namestart_to_seq_indices.has_key?(current_name) + @namestart_to_seq_indices[current_name] = [] + end + @namestart_to_seq_indices[current_name].push( i ) + end if !case_sensitive current_name = current_name.downcase name = name.downcase end - if ( current_name == name ) - indices.push( i ) + if current_name == name + indices.push(i) end end indices @@ -139,7 +164,7 @@ module Evoruby name = name.downcase end if current_name == name || - ( partial_match && current_name.include?( name ) ) + ( partial_match && current_name.include?( name ) ) return true end end @@ -162,17 +187,16 @@ module Evoruby # throws ArgumentError def get_by_name_start( name, case_sensitive = true ) indices = find_by_name_start( name, case_sensitive ) - if ( indices.length > 1 ) + if indices.length > 1 error_msg = "\"" + name + "\" not unique" raise ArgumentError, error_msg - elsif ( indices.length < 1 ) + elsif indices.length < 1 error_msg = "\"" + name + "\" not found" raise ArgumentError, error_msg end get_sequence( indices[ 0 ] ) end - def get_sub_alignment( seq_numbers ) msa = Msa.new() for i in 0 ... seq_numbers.length() @@ -209,7 +233,6 @@ module Evoruby s end - def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 ) if ( !is_aligned() ) error_msg = "attempt to get overlap diagram of unaligned msa" @@ -252,7 +275,7 @@ module Evoruby overlap_count = 0 for i in 0...seq_1.get_length() if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) && - !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) + !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) overlap_count += 1 if overlap_count >= min_overlap return true @@ -268,7 +291,7 @@ module Evoruby overlap_count = 0 for i in 0...seq_1.get_length if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) && - !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) + !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) overlap_count += 1 end end @@ -281,10 +304,10 @@ module Evoruby identities_count = 0 for i in 0...seq_1.get_length if !Util.is_aa_gap_character?( seq_1.get_character_code( i ) ) && - !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) && - seq_1.get_character_code( i ) != 63 && - ( seq_1.get_residue( i ).downcase() == - seq_2.get_residue( i ).downcase() ) + !Util.is_aa_gap_character?( seq_2.get_character_code( i ) ) && + seq_1.get_character_code( i ) != 63 && + ( seq_1.get_residue( i ).downcase() == + seq_2.get_residue( i ).downcase() ) identities_count += 1 end end @@ -304,7 +327,6 @@ module Evoruby remove_columns!( get_gap_columns_w_gap_ratio( gap_ratio ) ) end - def remove_sequences_by_gap_ratio!( gap_ratio ) if ( !is_aligned() ) error_msg = "attempt to remove sequences by gap ratio on unaligned msa" @@ -324,7 +346,6 @@ module Evoruby removed end - def remove_redundant_sequences!( consider_taxonomy = false, verbose = false ) n = get_number_of_seqs removed = Array.new @@ -334,10 +355,10 @@ module Evoruby for j in ( i + 1 ) ... n if !to_be_removed.include?( i ) && !to_be_removed.include?( j ) if !consider_taxonomy || - ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) || - ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) ) + ( ( get_sequence( i ).get_taxonomy == nil && get_sequence( j ).get_taxonomy == nil ) || + ( get_sequence( i ).get_taxonomy == get_sequence( j ).get_taxonomy ) ) if Util.clean_seq_str( get_sequence( i ).get_sequence_as_string ) == - Util.clean_seq_str( get_sequence( j ).get_sequence_as_string ) + Util.clean_seq_str( get_sequence( j ).get_sequence_as_string ) to_be_removed.add( j ) tax_i = "" @@ -368,28 +389,24 @@ module Evoruby removed end - def remove_sequences_by_non_gap_length!( min_non_gap_length ) - if ( !is_aligned() ) - error_msg = "attempt to remove sequences by non gap length on unaligned msa" - raise StandardError, error_msg, caller - end n = get_number_of_seqs - l = get_length removed = Array.new for s in 0 ... n - if ( ( l - get_sequence( ( n - 1 ) - s ).get_gap_length ) < min_non_gap_length ) + x = ( n - 1 ) - s + seq = get_sequence( x ) + if ( ( seq.get_length - seq.get_gap_length ) < min_non_gap_length ) if ( Evoruby::Constants::VERBOSE ) - puts( "removed: " + get_sequence( ( n - 1 ) - s ).get_name ) + puts( "removed: " + seq.get_name ) end - removed << get_sequence( ( n - 1 ) - s ).get_name - remove_sequence!( ( n - 1 ) - s ) + removed << seq.get_name + remove_sequence!( x ) end end removed end - def trim!( first, last ) + def trim!( first, last, name_suffix = nil ) cols = Array.new() for i in 0 ... get_length() if ( i < first || i > last ) @@ -397,6 +414,66 @@ module Evoruby end end remove_columns!( cols ) + if name_suffix != nil + n = get_number_of_seqs + for s in 0 ... n + seq = get_sequence( s ) + seq.set_name( seq.get_name() + name_suffix ); + end + end + end + + def extract( first, last, min_non_gap_length, suffix ) + if !is_aligned() + error_msg = "attempt to extract from unaligned msa" + raise StandardError, error_msg, caller + end + if first < 0 + error_msg = "first < 0" + raise StandardError, error_msg, caller + end + if last >= get_length() + error_msg = "last > length" + raise StandardError, error_msg, caller + end + if first >= last + error_msg = "first >= last" + raise StandardError, error_msg, caller + end + msa = Msa.new() + for i in 0 ... get_number_of_seqs + subseq = get_sequence( i ).get_subsequence( first, last ) + unless ( ( subseq.get_length - subseq.get_gap_length ) < min_non_gap_length ) + if suffix != nil + msa.add( subseq.get_name + suffix, subseq.get_sequence_as_string ) + else + msa.add( subseq.get_name, subseq.get_sequence_as_string ) + end + end + end + + msa + end + + def sliding_extraction( step, size, min_non_gap_length, suffix = nil ) + counter = 0 + done = false + msas = Array.new() + while !done + first = counter * step + last = first + size - 1 + if last > get_length() - 1 + last = get_length() - 1 + done = true + end + unless first >= last + counter +=1 + res = extract(first, last, min_non_gap_length, suffix) + res.set_name(first.to_s + "-" + last.to_s) + msas << res + end + end + msas end def get_gap_only_columns() @@ -486,8 +563,7 @@ module Evoruby return cols end - - # Split an alignment into n alignemnts of equal size, except last one + # Split an alignment into n alignmnts of equal size, except last one def split( n, verbose = false ) if ( n < 2 || n > get_number_of_seqs ) error_msg = "attempt to split into less than two or more than the number of sequences" @@ -498,8 +574,7 @@ module Evoruby x = get_number_of_seqs / n for i in 0 ... n msa = Msa.new() - s = 0 - + #s = 0 if ( ( r > 0 ) && ( i == ( n - 1 ) ) ) y = x + r if ( verbose ) @@ -521,6 +596,34 @@ module Evoruby msas end + def split_by_os(verbose = false) + msa_hash = Hash.new() + for i in 0 ... get_number_of_seqs() + seq = get_sequence(i) + name = seq.get_name() + # >sp|Q1HVE7|AN_EBVA8 Shutoff alkaline exonuclease OS=Epstein-Barr virus (strain AG876) GN=BGLF5 PE=3 SV=1 + # if name =~ /OS=(.+?)\s+[A-Z]{2}=/ + if name =~ /Organism:(.+?)(\|Protein|$)/ + os = $1 + unless msa_hash.has_key?(os) + msa_hash[os] = Msa.new + end + msa_hash[os].add_sequence seq + else + error_msg = "sequence name \"" + name + "\" is not in the expected format for splitting by OS" + raise IOError, error_msg, caller + end + end + msa_hash = msa_hash.sort{|a, b|a<=>b}.to_h + if verbose + c = 0 + msa_hash.each do |o, msa| + c += 1 + puts c.to_s + ': ' + o + end + end + msa_hash + end private @@ -571,7 +674,6 @@ module Evoruby return self end - end # class Msa end # module Evoruby