# # = lib/evo/apps/fasta_taxonomy_processor - FastaTaxonomyProcessor class # # Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek # License:: GNU Lesser General Public License (LGPL) require 'lib/evo/util/util' require 'lib/evo/msa/msa_factory' require 'lib/evo/msa/msa' require 'lib/evo/io/msa_io' require 'lib/evo/io/parser/sp_taxonomy_parser' require 'lib/evo/io/parser/fasta_parser' require 'lib/evo/io/writer/fasta_writer' require 'lib/evo/io/writer/phylip_sequential_writer' require 'lib/evo/util/command_line_arguments' require 'lib/evo/apps/tseq_taxonomy_processor' module Evoruby class FastaTaxonomyProcessor PRG_NAME = "fasta_tap" PRG_DATE = "2009.01.20" PRG_DESC = "preprocessing of multiple sequence files in ncbi fasta format" PRG_VERSION = "1.00" WWW = "www.phylosoft.org" def initialize() @tax_ids_to_sp_taxonomies = Hash.new() end def run() Util.print_program_information( PRG_NAME, PRG_VERSION, PRG_DESC, PRG_DATE, COPYRIGHT, CONTACT, WWW, STDOUT ) if ARGV == nil || ARGV.length != 4 puts( "Usage: #{PRG_NAME}.rb " ) puts() exit( -1 ) end begin cla = CommandLineArguments.new( ARGV ) rescue ArgumentError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s ) end allowed_opts = Array.new disallowed = cla.validate_allowed_options_as_str( allowed_opts ) if ( disallowed.length > 0 ) Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed ) end sp_taxonomy_infile = cla.get_file_name( 0 ) sequences_infile = cla.get_file_name( 1 ) sequences_outfile = cla.get_file_name( 2 ) mapping_outfile = cla.get_file_name( 3 ) Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile ) Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile ) Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile ) Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile ) sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile ) Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile ) fasta_parser = FastaParser.new msa_fac = MsaFactory.new seqs = msa_fac.create_msa_from_file( sequences_infile, fasta_parser ) Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile ) removed = seqs.remove_redundant_sequences!( true, true ) if removed.size > 0 Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" ) removed.each { | seq_name | puts seq_name } Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" ) end mapping_out = File.open( mapping_outfile, "a" ) for i in 0 ... seqs.get_number_of_seqs seq = seqs.get_sequence( i ) seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) ) end io = MsaIO.new() w = FastaWriter.new() w.set_max_name_length( 10 ) w.clean( true ) begin io.write_to_file( seqs, sequences_outfile, w ) rescue Exception => e Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s ) end mapping_out.close() Util.print_message( PRG_NAME, "wrote: " + mapping_outfile ) Util.print_message( PRG_NAME, "wrote: " + sequences_outfile ) Util.print_message( PRG_NAME, "OK" ) end private def modify_name( seq, i, sp_taxonomies, mapping_outfile ) #i = i + 1792 seq_desc = seq.get_name taxonomy_sn = nil if seq_desc =~ /\[(.+)\]/ taxonomy_sn = $1 else Util.fatal_error( PRG_NAME, "no taxonomy in [" + seq_desc + "]" ) end matching_sp_taxonomy = nil sp_taxonomies.each { |sp_taxonomy| if ( sp_taxonomy.scientific_name == taxonomy_sn ) matching_sp_taxonomy = sp_taxonomy end } if matching_sp_taxonomy == nil Util.fatal_error( PRG_NAME, "taxonomy [" + taxonomy_sn + "] for [" + seq_desc + "] not found" ) end new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code gi = nil if seq_desc =~ /gi\|(.+?)\|/ gi = $1 else Util.fatal_error( PRG_NAME, "no gi in [" + seq_desc + "]" ) end seq_name = "" if seq_desc =~ /\|\s*([^|]+?)\s*\[/ seq_name = $1 end if seq_name =~ /\[.+\]$/ # Redundant taxonomy information hides here. seq_name = seq_name.sub(/\[.+\]$/, '') end if seq_name =~ /^\s*hypothetical\s+protein\s*/i # Pointless information. seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' ) end if seq_name =~ /^\s*conserved\s+hypothetical\s+protein\s*/i # Pointless information. seq_name = seq_name.sub( /^\s*conserved\s+hypothetical\s+protein\s*/i, '' ) end if gi != nil mapping_outfile.print( new_name + "\t" + TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" + TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" + TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" + TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" + TseqTaxonomyProcessor::SEQ_ACCESSION + gi.to_s + "\t" + TseqTaxonomyProcessor::SEQ_ACCESSION_SOURCE + "gi" + "\t" + TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" + TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string + Constants::LINE_DELIMITER ) else mapping_outfile.print( new_name + "\t" + TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" + TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" + TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" + TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" + TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" + TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string + Constants::LINE_DELIMITER ) end new_name end end end # module Evoruby