2 # = lib/evo/apps/fasta_taxonomy_processor - FastaTaxonomyProcessor class
4 # Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 require 'lib/evo/util/util'
8 require 'lib/evo/msa/msa_factory'
9 require 'lib/evo/msa/msa'
10 require 'lib/evo/io/msa_io'
11 require 'lib/evo/io/parser/sp_taxonomy_parser'
12 require 'lib/evo/io/parser/fasta_parser'
13 require 'lib/evo/io/writer/fasta_writer'
14 require 'lib/evo/io/writer/phylip_sequential_writer'
15 require 'lib/evo/util/command_line_arguments'
16 require 'lib/evo/apps/tseq_taxonomy_processor'
19 class FastaTaxonomyProcessor
21 PRG_NAME = "fasta_tap"
22 PRG_DATE = "2009.01.20"
23 PRG_DESC = "preprocessing of multiple sequence files in ncbi fasta format"
25 WWW = "www.phylosoft.org"
27 @tax_ids_to_sp_taxonomies = Hash.new()
32 Util.print_program_information( PRG_NAME,
41 if ARGV == nil || ARGV.length != 4
42 puts( "Usage: #{PRG_NAME}.rb <sp taxonomy file> <sequences in ncbi fasta format> <name for fasta outfile> <name for map outfile>" )
48 cla = CommandLineArguments.new( ARGV )
49 rescue ArgumentError => e
50 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
52 allowed_opts = Array.new
53 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
54 if ( disallowed.length > 0 )
55 Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
58 sp_taxonomy_infile = cla.get_file_name( 0 )
59 sequences_infile = cla.get_file_name( 1 )
60 sequences_outfile = cla.get_file_name( 2 )
61 mapping_outfile = cla.get_file_name( 3 )
63 Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile )
64 Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile )
65 Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile )
66 Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile )
68 sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile )
70 Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile )
72 fasta_parser = FastaParser.new
73 msa_fac = MsaFactory.new
75 seqs = msa_fac.create_msa_from_file( sequences_infile, fasta_parser )
77 Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile )
79 removed = seqs.remove_redundant_sequences!( true, true )
82 Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
83 removed.each { | seq_name |
86 Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" )
89 mapping_out = File.open( mapping_outfile, "a" )
91 for i in 0 ... seqs.get_number_of_seqs
92 seq = seqs.get_sequence( i )
93 seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) )
100 w.set_max_name_length( 10 )
103 io.write_to_file( seqs, sequences_outfile, w )
104 rescue Exception => e
105 Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
109 Util.print_message( PRG_NAME, "wrote: " + mapping_outfile )
110 Util.print_message( PRG_NAME, "wrote: " + sequences_outfile )
111 Util.print_message( PRG_NAME, "OK" )
117 def modify_name( seq, i, sp_taxonomies, mapping_outfile )
121 seq_desc = seq.get_name
125 if seq_desc =~ /\[(.+)\]/
128 Util.fatal_error( PRG_NAME, "no taxonomy in [" + seq_desc + "]" )
131 matching_sp_taxonomy = nil
133 sp_taxonomies.each { |sp_taxonomy|
134 if ( sp_taxonomy.scientific_name == taxonomy_sn )
135 matching_sp_taxonomy = sp_taxonomy
139 if matching_sp_taxonomy == nil
140 Util.fatal_error( PRG_NAME, "taxonomy [" + taxonomy_sn + "] for [" + seq_desc + "] not found" )
143 new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code
146 if seq_desc =~ /gi\|(.+?)\|/
149 Util.fatal_error( PRG_NAME, "no gi in [" + seq_desc + "]" )
154 if seq_desc =~ /\|\s*([^|]+?)\s*\[/
158 if seq_name =~ /\[.+\]$/
159 # Redundant taxonomy information hides here.
160 seq_name = seq_name.sub(/\[.+\]$/, '')
162 if seq_name =~ /^\s*hypothetical\s+protein\s*/i
163 # Pointless information.
164 seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' )
166 if seq_name =~ /^\s*conserved\s+hypothetical\s+protein\s*/i
167 # Pointless information.
168 seq_name = seq_name.sub( /^\s*conserved\s+hypothetical\s+protein\s*/i, '' )
172 mapping_outfile.print( new_name + "\t" +
173 TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
174 TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" +
175 TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" +
176 TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
177 TseqTaxonomyProcessor::SEQ_ACCESSION + gi.to_s + "\t" +
178 TseqTaxonomyProcessor::SEQ_ACCESSION_SOURCE + "gi" + "\t" +
179 TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" +
180 TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string +
181 Constants::LINE_DELIMITER )
183 mapping_outfile.print( new_name + "\t" +
184 TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
185 TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" +
186 TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" +
187 TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
188 TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" +
189 TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string +
190 Constants::LINE_DELIMITER )