in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / tseq_taxonomy_processor.rb
1 #
2 # = lib/evo/apps/tseq_taxonomy_processor - TseqTaxonomyProcessor class
3 #
4 # Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: tseq_taxonomy_processor.rb,v 1.6 2010/12/13 19:00:11 cmzmasek Exp $
8
9
10 require 'lib/evo/util/util'
11 require 'lib/evo/msa/msa_factory'
12 require 'lib/evo/msa/msa'
13 require 'lib/evo/io/msa_io'
14 require 'lib/evo/io/parser/sp_taxonomy_parser'
15 require 'lib/evo/io/parser/ncbi_tseq_parser'
16 require 'lib/evo/io/writer/fasta_writer'
17 require 'lib/evo/io/writer/phylip_sequential_writer'
18 require 'lib/evo/util/command_line_arguments'
19
20 module Evoruby
21
22     class TseqTaxonomyProcessor
23
24         PRG_NAME       = "tseq_tap"
25         PRG_DATE       = "2009.01.06"
26         PRG_DESC       = "preprocessing of multiple sequence files in ncbi tseq xml format"
27         PRG_VERSION    = "1.02"
28         COPYRIGHT      = "2009 Christian M Zmasek"
29         CONTACT        = "phylosoft@gmail.com"
30         WWW            = "www.phylosoft.org"
31
32         TAXONOMY_CODE           = "TAXONOMY_CODE:"
33         TAXONOMY_ID             = "TAXONOMY_ID:"
34         TAXONOMY_ID_TYPE        = "TAXONOMY_ID_TYPE:"
35         TAXONOMY_SN             = "TAXONOMY_SN:"
36         TAXONOMY_CN             = "TAXONOMY_CN:"
37         SEQ_ACCESSION           = "SEQ_ACCESSION:"
38         SEQ_ACCESSION_SOURCE    = "SEQ_ACCESSION_SOURCE:"
39         SEQ_SECONDARY_ACCESSION = "SEQ_SECONDARY_ACCESSION:"
40         SEQ_SYMBOL              = "SEQ_SYMBOL:"
41         SEQ_NAME                = "SEQ_NAME:"
42         SEQ_MOL_SEQ             = "SEQ_MOL_SEQ:"
43
44         def initialize()
45             @tax_ids_to_sp_taxonomies = Hash.new()
46         end
47
48         def run()
49
50             Util.print_program_information( PRG_NAME,
51                 PRG_VERSION,
52                 PRG_DESC,
53                 PRG_DATE,
54                 COPYRIGHT,
55                 CONTACT,
56                 WWW,
57                 STDOUT )
58
59             if  ARGV == nil || ARGV.length != 4
60                 puts( "Usage: #{PRG_NAME}.rb <sp taxonomy file> <sequences in tseq xml format> <name for fasta outfile> <name for map outfile>" )
61                 puts()
62
63                 exit( -1 )
64             end
65
66             begin
67                 cla = CommandLineArguments.new( ARGV )
68             rescue ArgumentError => e
69                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
70             end
71             allowed_opts = Array.new
72             disallowed = cla.validate_allowed_options_as_str( allowed_opts )
73             if ( disallowed.length > 0 )
74                 Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
75             end
76
77             sp_taxonomy_infile = cla.get_file_name( 0 )
78             sequences_infile = cla.get_file_name( 1 )
79             sequences_outfile = cla.get_file_name( 2 )
80             mapping_outfile = cla.get_file_name( 3 )
81
82             Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile )
83             Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile )
84             Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile )
85             Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile )
86
87             sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile )
88
89             Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile )
90
91             tseq_parser = NcbiTSeqParser.new
92             msa_fac = MsaFactory.new
93
94             seqs = msa_fac.create_msa_from_file( sequences_infile, tseq_parser )
95
96             Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile )
97
98             removed = seqs.remove_redundant_sequences!( true, true )
99
100             if removed.size > 0
101                 Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
102                 removed.each { | seq_name |
103                     puts seq_name
104                 }
105                 Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" )
106             end
107
108             mapping_out = File.open( mapping_outfile, "a" )
109
110             for i in 0 ... seqs.get_number_of_seqs
111                 seq = seqs.get_sequence( i )
112                 if seq.get_taxonomy == nil
113                     Util.fatal_error( PRG_NAME, "sequence [" + seq.get_name + "] has no taxonomy information" )
114                 end
115                 seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) )
116             end
117
118             io = MsaIO.new()
119
120             w = FastaWriter.new()
121
122             w.set_max_name_length( 10 )
123             w.clean( true )
124             begin
125                 io.write_to_file( seqs, sequences_outfile, w )
126             rescue Exception => e
127                 Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
128             end
129             mapping_out.close()
130
131             Util.print_message( PRG_NAME, "wrote: " + mapping_outfile )
132             Util.print_message( PRG_NAME, "wrote: " + sequences_outfile )
133             Util.print_message( PRG_NAME, "OK" )
134
135         end
136
137         private
138
139         def modify_name( seq, i, sp_taxonomies, mapping_outfile )
140
141             tax_id = seq.get_taxonomy.get_id
142             matching_sp_taxonomy = nil
143
144             if @tax_ids_to_sp_taxonomies.has_key?( tax_id )
145                 # This is so that a second lookup will be much faster.
146                 matching_sp_taxonomy = @tax_ids_to_sp_taxonomies[ tax_id ]
147             else
148                 sp_taxonomies.each { |sp_taxonomy|
149                     if ( sp_taxonomy.id == tax_id )
150                         if  matching_sp_taxonomy != nil
151                             Util.fatal_error( PRG_NAME, "taxonomy id [" + tax_id.to_s + "] is not unique" )
152                         end
153                         matching_sp_taxonomy = sp_taxonomy
154                         @tax_ids_to_sp_taxonomies[ tax_id ] = sp_taxonomy
155                     end
156                 }
157             end
158             if  matching_sp_taxonomy == nil
159                 Util.fatal_error( PRG_NAME, "taxonomy id [" + tax_id.to_s + "] for [" +  seq.get_taxonomy.get_name + "] not found" )
160             end
161
162             new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code
163
164             seq_name = seq.get_name
165             if  seq_name =~ /\[.+\]$/
166                 # Redundant taxonomy information hides here.
167                 seq_name = seq_name.sub(/\[.+\]$/, '')
168             end
169             if  seq_name =~ /^\s*hypothetical\s+protein\s*/i
170                 # Pointless information.
171                 seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' )
172             end
173
174             mapping_outfile.print( new_name + "\t" +
175                  TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
176                  TAXONOMY_ID + tax_id + "\t" +
177                  TAXONOMY_ID_TYPE + seq.get_taxonomy.get_id_source + "\t" +
178                  TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
179                  SEQ_ACCESSION + seq.get_accession + "\t" +
180                  SEQ_ACCESSION_SOURCE + seq.get_accession_source + "\t" +
181                  SEQ_SYMBOL + seq.get_symbol + "\t" +
182                  SEQ_NAME + seq_name + "\t" +
183                  SEQ_MOL_SEQ + seq.get_sequence_as_string +
184                  Constants::LINE_DELIMITER )
185             new_name
186         end
187
188     end 
189
190 end # module Evoruby