class MultiSequenceExtractor
# Program metadata and file-format constants.
# This view is a diff rendering: a leading "-" marks the removed value and a
# leading "+" marks its replacement (PRG_VERSION 1.02 -> 1.03,
# PRG_DATE 130322 -> 131127).
# NOTE(review): PRG_DATE looks like YYMMDD -- confirm.
# NOTE(review): the code that uses NORMALIZED_IDS_MAP_SUFFIX and
# PROTEINS_LIST_FILE_SEPARATOR (a tab) is not visible in this chunk.
PRG_NAME = "mse"
- PRG_VERSION = "1.02"
+ PRG_VERSION = "1.03"
PRG_DESC = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
- PRG_DATE = "130322"
+ PRG_DATE = "131127"
COPYRIGHT = "2008-2013 Christian M Zmasek"
CONTACT = "phylosoft@gmail.com"
WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
NORMALIZED_IDS_MAP_SUFFIX = ".nim"
PROTEINS_LIST_FILE_SEPARATOR = "\t"
# Sets up the per-instance state used by read_fasta_file:
#   @file_to_msa - hash used as a cache, keyed by the argument passed to
#                  read_fasta_file and holding the msa object built for it.
#   @seqs        - running total of sequences held in that cache; grows by
#                  msa.get_number_of_seqs whenever an msa is cached.
+ def initialize()
+ @file_to_msa = Hash.new
+ @seqs = 0
+ end
# Reads input_file line by line and, for each sequence name, looks the
# sequence up in current_msa, logging progress as it goes.
# NOTE(review): this view is a diff rendering with unchanged context lines
# partly omitted, so several begin/if/end pairs below do not balance here and
# the definitions of basename, input_file, log, current_species, my_readlink,
# seq_name, current_msa and per_species_counter are not visible.
def run()
puts basename
File.open( input_file ) do | file |
# Running number printed in front of each species line; starts at 1.
+ species_counter = 1
while line = file.gets
line.strip!
# NOTE(review): /\s*#/ is not anchored, so the second test rejects any line
# that contains a '#' anywhere, not only lines that start with one -- confirm
# that this is intended.
if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
print_counts( per_species_counter, log, Constants::LINE_DELIMITER )
per_species_counter = 0
end
# The replacement lines prefix the species line with species_counter, both on
# stdout and in the log, and then advance the counter.
- puts " " + current_species + " [" + my_readlink + "]"
- log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER
+ puts " " + species_counter.to_s + ":" + current_species + " [" + my_readlink + "]"
+ log << species_counter.to_s + ": " + current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER
+ species_counter += 1
end
puts " " + seq_name
log << " " + seq_name + Constants::LINE_DELIMITER
per_species_counter = per_species_counter + 1
seq = nil
- if current_msa.find_by_name_start( seq_name, true ).size > 0
- begin
- seq = current_msa.get_by_name_start( seq_name, true ).copy
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s )
- end
- else
# New lookup: call find_by_name_start once and branch on the number of hits.
# Exactly one hit is fetched with get_sequence, zero hits fall back to
# get_by_name, and more than one hit is reported as a fatal "not unique" error.
# NOTE(review): the removed code called .copy on the looked-up sequence; the
# result of get_sequence is used as-is here -- confirm that sharing the object
# with current_msa is safe for the code that follows.
+ indices = current_msa.find_by_name_start( seq_name, true )
+ if indices.size == 1
+ seq = current_msa.get_sequence( indices[ 0 ] )
+ elsif indices.size == 0
# Not found, try finding by partial match.
begin
seq = current_msa.get_by_name( seq_name, true, true )
rescue ArgumentError => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s )
end
+ else
+ Util.fatal_error( PRG_NAME, "error: seq name \"" + seq_name + "\" not unique" )
end
# The commented-out block below repeats the lookup logic that this change
# removes (see the "-" lines above); it has no effect at runtime.
+ # if current_msa.find_by_name_start( seq_name, true ).size > 0
+ # begin
+ # seq = current_msa.get_by_name_start( seq_name, true ).copy
+ # rescue ArgumentError => e
+ # Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+ # end
+ # else
+ # # Not found, try finding by partial match.
+ # begin
+ # seq = current_msa.get_by_name( seq_name, true, true )
+ # rescue ArgumentError => e
+ # Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+ # end
+ # end
+
# normalized_id is per_species_counter in upper-case hexadecimal, followed by
# "_" and current_species.
normalized_id = per_species_counter.to_s( 16 ).upcase +
"_" + current_species
Util.fatal_error( PRG_NAME, "error: " + e.to_s )
end
end
-
-
end
end
# Returns the msa object for +input+, reusing a previously cached one when
# @file_to_msa already has an entry for that key.
# NOTE(review): the statements between begin and rescue are not shown in this
# view; presumably they use the MsaFactory instance f to build msa from
# +input+ -- confirm against the full file.
def read_fasta_file( input )
# Cache hit: hand back the stored msa without building it again.
+ if @file_to_msa.has_key?( input )
+ return @file_to_msa[ input ]
+ end
+
f = MsaFactory.new()
msa = nil
begin
# NOTE(review): rescuing Exception also catches non-StandardError exceptions
# such as Interrupt and SystemExit -- confirm that this is intended.
rescue Exception => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s )
end
# Cache the new msa only while the running total is at most 100000000
# sequences. The limit is tested before the new count is added, so the total
# can end up above it by the size of the last msa cached. Once the total is
# above the limit, further msa objects are returned without being cached.
+ if @seqs <= 100000000
+ @file_to_msa[ input ] = msa
+ @seqs += msa.get_number_of_seqs
+ puts " total seqs in memory: " + @seqs.to_s
+ end
msa
end