2 # = lib/evo/apps/tseq_taxonomy_processor - TseqTaxonomyProcessor class
4 # Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: tseq_taxonomy_processor.rb,v 1.6 2010/12/13 19:00:11 cmzmasek Exp $
10 require 'lib/evo/util/util'
11 require 'lib/evo/msa/msa_factory'
12 require 'lib/evo/msa/msa'
13 require 'lib/evo/io/msa_io'
14 require 'lib/evo/io/parser/sp_taxonomy_parser'
15 require 'lib/evo/io/parser/ncbi_tseq_parser'
16 require 'lib/evo/io/writer/fasta_writer'
17 require 'lib/evo/io/writer/phylip_sequential_writer'
18 require 'lib/evo/util/command_line_arguments'
22 class TseqTaxonomyProcessor
25 PRG_DATE = "2009.01.06"
26 PRG_DESC = "preprocessing of multiple sequence files in ncbi tseq xml format"
28 COPYRIGHT = "2009 Christian M Zmasek"
29 CONTACT = "phylosoft@gmail.com"
30 WWW = "www.phylosoft.org"
32 TAXONOMY_CODE = "TAXONOMY_CODE:"
33 TAXONOMY_ID = "TAXONOMY_ID:"
34 TAXONOMY_ID_TYPE = "TAXONOMY_ID_TYPE:"
35 TAXONOMY_SN = "TAXONOMY_SN:"
36 TAXONOMY_CN = "TAXONOMY_CN:"
37 SEQ_ACCESSION = "SEQ_ACCESSION:"
38 SEQ_ACCESSION_SOURCE = "SEQ_ACCESSION_SOURCE:"
39 SEQ_SECONDARY_ACCESSION = "SEQ_SECONDARY_ACCESSION:"
40 SEQ_SYMBOL = "SEQ_SYMBOL:"
41 SEQ_NAME = "SEQ_NAME:"
42 SEQ_MOL_SEQ = "SEQ_MOL_SEQ:"
45 @tax_ids_to_sp_taxonomies = Hash.new()
50 Util.print_program_information( PRG_NAME,
59 if ARGV == nil || ARGV.length != 4
60 puts( "Usage: #{PRG_NAME}.rb <sp taxonomy file> <sequences in tseq xml format> <name for fasta outfile> <name for map outfile>" )
67 cla = CommandLineArguments.new( ARGV )
68 rescue ArgumentError => e
69 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
71 allowed_opts = Array.new
72 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
73 if ( disallowed.length > 0 )
74 Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
77 sp_taxonomy_infile = cla.get_file_name( 0 )
78 sequences_infile = cla.get_file_name( 1 )
79 sequences_outfile = cla.get_file_name( 2 )
80 mapping_outfile = cla.get_file_name( 3 )
82 Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile )
83 Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile )
84 Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile )
85 Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile )
87 sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile )
89 Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile )
91 tseq_parser = NcbiTSeqParser.new
92 msa_fac = MsaFactory.new
94 seqs = msa_fac.create_msa_from_file( sequences_infile, tseq_parser )
96 Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile )
98 removed = seqs.remove_redundant_sequences!( true, true )
101 Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
102 removed.each { | seq_name |
105 Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" )
108 mapping_out = File.open( mapping_outfile, "a" )
110 for i in 0 ... seqs.get_number_of_seqs
111 seq = seqs.get_sequence( i )
112 if seq.get_taxonomy == nil
113 Util.fatal_error( PRG_NAME, "sequence [" + seq.get_name + "] has no taxonomy information" )
115 seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) )
120 w = FastaWriter.new()
122 w.set_max_name_length( 10 )
125 io.write_to_file( seqs, sequences_outfile, w )
126 rescue Exception => e
127 Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
131 Util.print_message( PRG_NAME, "wrote: " + mapping_outfile )
132 Util.print_message( PRG_NAME, "wrote: " + sequences_outfile )
133 Util.print_message( PRG_NAME, "OK" )
139 def modify_name( seq, i, sp_taxonomies, mapping_outfile )
141 tax_id = seq.get_taxonomy.get_id
142 matching_sp_taxonomy = nil
144 if @tax_ids_to_sp_taxonomies.has_key?( tax_id )
145 # This is so that a second lookup will be much faster.
146 matching_sp_taxonomy = @tax_ids_to_sp_taxonomies[ tax_id ]
148 sp_taxonomies.each { |sp_taxonomy|
149 if ( sp_taxonomy.id == tax_id )
150 if matching_sp_taxonomy != nil
151 Util.fatal_error( PRG_NAME, "taxonomy id [" + tax_id.to_s + "] is not unique" )
153 matching_sp_taxonomy = sp_taxonomy
154 @tax_ids_to_sp_taxonomies[ tax_id ] = sp_taxonomy
158 if matching_sp_taxonomy == nil
159 Util.fatal_error( PRG_NAME, "taxonomy id [" + tax_id.to_s + "] for [" + seq.get_taxonomy.get_name + "] not found" )
162 new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code
164 seq_name = seq.get_name
165 if seq_name =~ /\[.+\]$/
166 # Redundant taxonomy information hides here.
167 seq_name = seq_name.sub(/\[.+\]$/, '')
169 if seq_name =~ /^\s*hypothetical\s+protein\s*/i
170 # Pointless information.
171 seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' )
174 mapping_outfile.print( new_name + "\t" +
175 TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
176 TAXONOMY_ID + tax_id + "\t" +
177 TAXONOMY_ID_TYPE + seq.get_taxonomy.get_id_source + "\t" +
178 TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
179 SEQ_ACCESSION + seq.get_accession + "\t" +
180 SEQ_ACCESSION_SOURCE + seq.get_accession_source + "\t" +
181 SEQ_SYMBOL + seq.get_symbol + "\t" +
182 SEQ_NAME + seq_name + "\t" +
183 SEQ_MOL_SEQ + seq.get_sequence_as_string +
184 Constants::LINE_DELIMITER )