end
species = nil
+ ll = nil
if msa != nil
seq = get_sequence( query, msa )
species = get_species( seq )
raise StandardError, "could not get species" if species == nil || species.empty?
if x_models != nil && x_models.length > 0
- extract_linkers( hmmscan_results_per_protein_filtered, x_models, seq, extraction_output_file )
+ ll = extract_linker( hmmscan_results_per_protein_filtered, x_models, seq, extraction_output_file )
end
end
s << r.model + " "
end
s << "\t"
+ if msa != nil
+ if ll != nil
+ s << ll.to_s
+ end
+ s << "\t"
+ end
s << make_overview_da( hmmscan_results_per_protein_filtered )
s << "\t"
s << make_detailed_da( hmmscan_results_per_protein_filtered, qlen )
end
- def extract_linkers( hmmscan_results_per_protein_filtered, x_models, seq, extraction_output_file )
+ def extract_linker( hmmscan_results_per_protein_filtered, x_models, seq, extraction_output_file )
raise StandardError, "extraction output file is nil" if extraction_output_file == nil
prev_r = nil
hmmscan_results_per_protein_filtered.each do | r |
if prev_r != nil
if ( x_models.length == 2 && prev_r.model == x_models[ 0 ] && r.model == x_models[ 1 ] )
- extract_linker( prev_r.env_to, r.env_from, seq, extraction_output_file )
+ ll = output_linker( prev_r.env_to, r.env_from, seq, extraction_output_file )
+ return ll
end
end
prev_r = r
end
+ return nil
end
end
- def extract_linker( first, last , seq, output_file )
+ def output_linker( first, last , seq, output_file )
if ( last - first >= 1 )
output_file.print( ">" + seq.get_name + " [" + first.to_s + "-" + last.to_s + "]" + "\n")
output_file.print( seq.get_subsequence( first - 1 , last - 1 ).get_sequence_as_string + "\n" )
end
+ last - first + 1
+
end
class TaxonomyProcessor
PRG_NAME = "tap"
- PRG_DATE = "2013.01.18"
+ PRG_DATE = "2013.03.20"
PRG_DESC = "replacement of species names in multiple sequence files"
- PRG_VERSION = "1.02"
- COPYRIGHT = "2012 Christian M Zmasek"
+ PRG_VERSION = "2.00"
+ COPYRIGHT = "2013 Christian M Zmasek"
CONTACT = "phylosoft@gmail.com"
- WWW = "www.phylosoft.org"
-
- SIMPLE = true
+ WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
EXTRACT_TAXONOMY_OPTION = "t"
- def initialize()
- @taxonomies = Hash.new()
- end
-
def run()
Util.print_program_information( PRG_NAME,
STDOUT )
if ( ARGV == nil || ( ARGV.length != 1 && ARGV.length != 2 && ARGV.length != 3 && ARGV.length != 4 && ARGV.length != 5 && ARGV.length != 6 ) )
- puts( "Usage: #{PRG_NAME}.rb [options] [input map file] <input sequences> [output sequences] [output id list]" )
+ puts( "Usage: #{PRG_NAME}.rb [options] <input sequences> [output sequences] [output id list]" )
puts()
puts( " options: -" + EXTRACT_TAXONOMY_OPTION + ": to extract taxonomy information from bracketed expression" )
puts()
Util.fatal_error( PRG_NAME, "error: " + e.to_s )
end
-
- mapfile = nil
input = nil
output = nil
list_file = nil
-
-
- if cla.get_number_of_files == 4
- mapfile = cla.get_file_name( 0 )
- input = cla.get_file_name( 1 )
- output = cla.get_file_name( 2 )
- list_file = cla.get_file_name( 3 )
- elsif cla.get_number_of_files == 3
+ if cla.get_number_of_files == 3
input = cla.get_file_name( 0 )
output = cla.get_file_name( 1 )
list_file = cla.get_file_name( 2 )
if ( !File.exists?( input) )
Util.fatal_error( PRG_NAME, "infile [" + input + "] does not exist" )
end
- if ( mapfile != nil && !File.exists?( mapfile ) )
- Util.fatal_error( PRG_NAME, "mapfile [" + mapfile + "] does not exist" )
- end
fasta_like = Util.looks_like_fasta?( input )
puts()
- if mapfile != nil
- puts( "Map file : " + mapfile )
- end
puts( "Input alignment : " + input )
puts( "Output alignment: " + output )
puts( "Name list : " + list_file )
end
puts()
- species_map = Hash.new
- if mapfile != nil
- File.open( mapfile ) do | file |
- while line = file.gets
- if ( line =~/(.+)#(.+)/ || line =~/(.+)\s+(.+)/ )
- species_map[ $1 ] = $2
- Util.print_message( PRG_NAME, "mapping: " + $1 + ' => ' + $2 )
- end
- end
- end
- end
-
f = MsaFactory.new()
begin
if ( fasta_like )
Util.fatal_error( PRG_NAME, "error: " + e.to_, STDOUT )
end
- #removed = msa.remove_redundant_sequences!( true )
- #if removed.size > 0
- # Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
- # removed.each { | seq_name |
- # puts seq_name
- # }
- # Util.print_message( PRG_NAME, "will process " + msa.get_number_of_seqs.to_s + " non redundant sequences" )
- #end
-
lf = File.open( list_file, "a" )
for i in 0 ... msa.get_number_of_seqs
- seq = msa.get_sequence( i )
- seq.set_name( Util::normalize_seq_name( modify_name( seq.get_name(), i, lf, species_map, extract_taxonomy ), 10 ) )
+ seq = msa.get_sequence( i )
+ seq.set_name( modify_name( seq.get_name(), i, lf, extract_taxonomy ) )
end
-
io = MsaIO.new()
w = nil
if ( fasta_like )
private
- def modify_name( desc, counter, file, species_map, extract_taxonomy )
+ def modify_name( desc, counter, file, extract_taxonomy )
new_desc = nil
- my_species = nil
- # if desc =~ /^>?\s*\S{1,10}_([0-9A-Z]{3,5})/
- desc.gsub!( /:\s+/, ":" ) #new
- desc.gsub!( /\s+/, " " ) #new
+ desc.gsub!( /:\s+/, ":" )
+ desc.gsub!( /\s+/, " " )
if desc =~ /^>?\s*\S{1,10}_([A-Z]{3,5})/
new_desc = counter.to_s( 16 ) + "_" + $1
- elsif SIMPLE
- new_desc = counter.to_s( 16 )
elsif extract_taxonomy
-
- species = nil
- species_map.each_key do | key |
- if desc =~ /[\b|_]#{key}\b/ # Added boundaries to prevent e.g. RAT matching ARATH.
- species = species_map[ key ]
- new_desc = counter.to_s( 16 ) + "_" + species
- break
- end
- end
- if species == nil
- #if desc =~/.*\[(\S{3,}?)\]/
- if desc =~/\[([A-Z0-9]{3,6})\]\s*$/ #new
- species = $1
- species.strip!
- species.upcase!
- species.gsub!( /\s+/, " " )
- species.gsub!( /-/, "" )
- species.gsub!( /\)/, "" )
- species.gsub!( /\(/, "" )
- species.gsub!( /\'/, "" )
- if species =~ /\S+\s\S+/ || species =~ /\S{3,5}/
- if species =~ /(\S+)\s(\S+)/
- code = $1[ 0..2 ] + $2[ 0..1 ]
- elsif species =~ /\S{3,5}/
- code = species
- elsif species.count( " " ) > 2
- species =~ /(\S+)\s+(\S+)\s+(\S+)$/
- third_last = $1
- second_last = $2
- last = $3
- code = code[ 0 ] + third_last[ 0 ] + second_last[ 0 ] + last[ 0 ] + last[ last.size - 1 ]
- elsif species.count( " " ) > 1
- species =~ /(\S+)\s+(\S+)$/
- second_last = $1
- last = $2
- code = code[ 0..1 ] + second_last[ 0 ] + last[ 0 ] + last[ last.size - 1 ]
- end
- new_desc = counter.to_s( 16 ) + "_" + code
- if @taxonomies.has_key?( code )
- if ( !@taxonomies.has_value?( species ) )
- Util.fatal_error( PRG_NAME, "code [#{code}] is not unique in [#{desc}]" )
- end
- else
- if ( @taxonomies.has_value?( species ) )
- Util.fatal_error( PRG_NAME, "genome [#{species}] is not unique in [#{desc}]" )
- else
- @taxonomies[ code ] = species
- end
- end
- else
- Util.fatal_error( PRG_NAME, "illegal format [#{species}] in: " + desc )
- end
- else
- Util.fatal_error( PRG_NAME, "illegal format in: " + desc )
- end
- end
- else
- species = nil
- my_species = nil
- species_map.each_key do | key |
- if desc =~ /#{key}/
- species = species_map[ key ]
- species = species.gsub( /\s+/, "" )
- species = species.gsub( /_/, " " )
- my_species = species
- if species =~ /(\S+)\s+(\S+)/
- species = $1[0..2] + $2[0..1]
- end
- species = species.gsub( /\s+/, "" )
- species = species.slice(0, 5)
- species.upcase!
- break
- end
- end
- if species == nil
- Util.fatal_error( PRG_NAME, "species not found in: " + desc )
+ if desc =~/\s\[([A-Z0-9]{3,5})\]\b/
+ new_desc = counter.to_s( 16 ) + "_" + $1
else
- new_desc = counter.to_s( 16 ) + "_" + species
+ Util.fatal_error( PRG_NAME, "illegal format in: " + desc )
end
+ else
+ new_desc = counter.to_s( 16 )
end
if new_desc == nil
Util.fatal_error( PRG_NAME, "failed to extract species from: " + desc )
end
- if my_species != nil
- file.print( new_desc + ": " + desc + " [" + my_species + "]" + "\n" )
- else
- file.print( new_desc + ": " + desc + "\n" )
- end
+ file.print( new_desc + ": " + desc + "\n" )
new_desc
end