# Creates an empty container: no sequences, no recorded identical sequences,
# and two empty name-lookup caches.
def initialize()
@sequences = Array.new
@identical_seqs_detected = Array.new
# Cache used by find_by_name: name => Array of sequence indices. Only
# case-sensitive, non-partial lookups are stored in it.
+ @name_to_seq_indices = Hash.new
# Cache used by find_by_name_start: name => Array of sequence indices. Only
# case-sensitive lookups are stored in it.
+ @namestart_to_seq_indices = Hash.new
end
" sequences"
raise ArgumentError, error_msg
end
+ @name_to_seq_indices.clear
+ @namestart_to_seq_indices.clear
@sequences.delete_at( index )
end
end
# Returns an Array holding the indices (as accepted by get_sequence) that the
# loop below collects for +name+.
# NOTE(review): the condition guarding indices.push is not visible in this
# excerpt, so the exact effect of case_sensitive and partial_match on matching
# could not be confirmed here.
# Results of case-sensitive, non-partial lookups are memoized in
# @name_to_seq_indices. The Array handed back is the same object that is kept
# in the cache, so callers must not modify it.
# NOTE(review): both caches are cleared where a sequence is removed with
# @sequences.delete_at; confirm that every code path that adds or renames a
# sequence invalidates them as well.
def find_by_name( name, case_sensitive, partial_match )
# Fast path: this exact lookup was answered before.
+ if case_sensitive && !partial_match && @name_to_seq_indices.has_key?( name )
+ return @name_to_seq_indices[ name ]
+ end
indices = Array.new()
for i in 0 ... get_number_of_seqs()
current_name = get_sequence( i ).get_name()
indices.push( i )
end
end
# Remember the result, including an empty one, for later identical lookups.
+ if case_sensitive && !partial_match
+ @name_to_seq_indices[ name ] = indices
+ end
indices
end
end
# Returns an Array with the indices of all sequences for which current_name
# equals +name+.
# NOTE(review): the assignment of current_name is not visible in this excerpt;
# presumably it is the first capture group of the regexp below, i.e. the
# first run of non-whitespace characters of the sequence name - confirm.
# Results of case-sensitive lookups are memoized in @namestart_to_seq_indices.
# The Array handed back is the same object that is kept in the cache, so
# callers must not modify it.
def find_by_name_start( name, case_sensitive )
# Fast path: this exact lookup was answered before.
+ if case_sensitive && @namestart_to_seq_indices.has_key?( name )
+ return @namestart_to_seq_indices[ name ]
+ end
indices = []
for i in 0 ... get_number_of_seqs()
# Skips leading whitespace and captures the following non-whitespace run.
get_sequence( i ).get_name() =~ /^\s*(\S+)/
# NOTE(review): the guard around the two downcase lines is not visible here;
# presumably they only run when case_sensitive is false - confirm.
# String#downcase returns a new String, so the caller's object is untouched;
# only the local variable +name+ is rebound.
current_name = current_name.downcase
name = name.downcase
end
- if ( current_name == name )
+ if current_name == name
indices.push( i )
end
end
# Remember the result, including an empty one, for later identical lookups.
+ if case_sensitive
+ @namestart_to_seq_indices[ name ] = indices
+ end
indices
end
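# Raises ArgumentError when find_by_name_start yields more than one index
# ("not unique") or no index at all ("not found") for +name+.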
def get_by_name_start( name, case_sensitive = true )
indices = find_by_name_start( name, case_sensitive )
- if ( indices.length > 1 )
+ if indices.length > 1
error_msg = "\"" + name + "\" not unique"
raise ArgumentError, error_msg
- elsif ( indices.length < 1 )
+ elsif indices.length < 1
error_msg = "\"" + name + "\" not found"
raise ArgumentError, error_msg
end
class MultiSequenceExtractor
PRG_NAME = "mse"
- PRG_VERSION = "1.02"
+ PRG_VERSION = "1.03"
PRG_DESC = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
- PRG_DATE = "130322"
+ PRG_DATE = "131127"
COPYRIGHT = "2008-2013 Christian M Zmasek"
CONTACT = "phylosoft@gmail.com"
WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
per_species_counter = per_species_counter + 1
seq = nil
- if current_msa.find_by_name_start( seq_name, true ).size > 0
- begin
- seq = current_msa.get_by_name_start( seq_name, true ).copy
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
- else
+ indices = current_msa.find_by_name_start( seq_name, true )
+ if indices.size == 1
+ seq = current_msa.get_sequence( indices[ 0 ] )
+ elsif indices.size == 0
# Not found, try finding by partial match.
begin
seq = current_msa.get_by_name( seq_name, true, true )
rescue ArgumentError => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s )
end
+ else
+ Util.fatal_error( PRG_NAME, "error: seq name \"" + seq_name + "\" not unique" )
end
+ # if current_msa.find_by_name_start( seq_name, true ).size > 0
+ # begin
+ # seq = current_msa.get_by_name_start( seq_name, true ).copy
+ # rescue ArgumentError => e
+ # Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+ # end
+ # else
+ # # Not found, try finding by partial match.
+ # begin
+ # seq = current_msa.get_by_name( seq_name, true, true )
+ # rescue ArgumentError => e
+ # Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+ # end
+ # end
+
normalized_id = per_species_counter.to_s( 16 ).upcase +
"_" + current_species