refactored
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / fasta_taxonomy_processor.rb
1 #
2 # = lib/evo/apps/fasta_taxonomy_processor - FastaTaxonomyProcessor class
3 #
4 # Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: fasta_taxonomy_processor.rb,v 1.4 2010/12/13 19:00:11 cmzmasek Exp $
8
9
10 require 'lib/evo/util/util'
11 require 'lib/evo/msa/msa_factory'
12 require 'lib/evo/msa/msa'
13 require 'lib/evo/io/msa_io'
14 require 'lib/evo/io/parser/sp_taxonomy_parser'
15 require 'lib/evo/io/parser/fasta_parser'
16 require 'lib/evo/io/writer/fasta_writer'
17 require 'lib/evo/io/writer/phylip_sequential_writer'
18 require 'lib/evo/util/command_line_arguments'
19 require 'lib/evo/apps/tseq_taxonomy_processor'
20
21 module Evoruby
22
23     class FastaTaxonomyProcessor
24
25         PRG_NAME       = "fasta_tap"
26         PRG_DATE       = "2009.01.20"
27         PRG_DESC       = "preprocessing of multiple sequence files in ncbi fasta format"
28         PRG_VERSION    = "1.00"
29         COPYRIGHT      = "2009 Christian M Zmasek"
30         CONTACT        = "phylosoft@gmail.com"
31         WWW            = "www.phylosoft.org"
32
33         def initialize()
34             @tax_ids_to_sp_taxonomies = Hash.new()
35         end
36
37         def run()
38
39             Util.print_program_information( PRG_NAME,
40                 PRG_VERSION,
41                 PRG_DESC,
42                 PRG_DATE,
43                 COPYRIGHT,
44                 CONTACT,
45                 WWW,
46                 STDOUT )
47
48             if  ARGV == nil || ARGV.length != 4
49                 puts( "Usage: #{PRG_NAME}.rb <sp taxonomy file> <sequences in ncbi fasta format> <name for fasta outfile> <name for map outfile>" )
50                 puts()
51                 exit( -1 )
52             end
53
54             begin
55                 cla = CommandLineArguments.new( ARGV )
56             rescue ArgumentError => e
57                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
58             end
59             allowed_opts = Array.new
60             disallowed = cla.validate_allowed_options_as_str( allowed_opts )
61             if ( disallowed.length > 0 )
62                 Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
63             end
64
65             sp_taxonomy_infile = cla.get_file_name( 0 )
66             sequences_infile = cla.get_file_name( 1 )
67             sequences_outfile = cla.get_file_name( 2 )
68             mapping_outfile = cla.get_file_name( 3 )
69
70             Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile )
71             Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile )
72             Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile )
73             Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile )
74
75             sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile )
76
77             Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile )
78
79             fasta_parser = FastaParser.new
80             msa_fac = MsaFactory.new
81
82             seqs = msa_fac.create_msa_from_file( sequences_infile, fasta_parser )
83
84             Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile )
85
86             removed = seqs.remove_redundant_sequences!( true, true )
87
88             if removed.size > 0
89                 Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
90                 removed.each { | seq_name |
91                     puts seq_name
92                 }
93                 Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" )
94             end
95
96             mapping_out = File.open( mapping_outfile, "a" )
97
98             for i in 0 ... seqs.get_number_of_seqs
99                 seq = seqs.get_sequence( i )
100                 seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) )
101             end
102
103             io = MsaIO.new()
104
105             w = FastaWriter.new()
106
107             w.set_max_name_length( 10 )
108             w.clean( true )
109             begin
110                 io.write_to_file( seqs, sequences_outfile, w )
111             rescue Exception => e
112                 Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
113             end
114             mapping_out.close()
115
116             Util.print_message( PRG_NAME, "wrote: " + mapping_outfile )
117             Util.print_message( PRG_NAME, "wrote: " + sequences_outfile )
118             Util.print_message( PRG_NAME, "OK" )
119
120         end
121
122         private
123
124         def modify_name( seq, i, sp_taxonomies, mapping_outfile )
125
126             #i = i + 1792
127             
128             seq_desc = seq.get_name
129
130             taxonomy_sn = nil
131
132             if seq_desc =~ /\[(.+)\]/
133                 taxonomy_sn = $1
134             else
135                 Util.fatal_error( PRG_NAME, "no taxonomy in [" + seq_desc + "]"  )
136             end
137
138             matching_sp_taxonomy = nil
139
140             sp_taxonomies.each { |sp_taxonomy|
141                 if ( sp_taxonomy.scientific_name == taxonomy_sn )
142                     matching_sp_taxonomy = sp_taxonomy
143                 end
144             }
145
146             if  matching_sp_taxonomy == nil
147                 Util.fatal_error( PRG_NAME, "taxonomy [" + taxonomy_sn + "] for [" + seq_desc + "] not found" )
148             end
149
150             new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code
151
152             gi = nil
153             if seq_desc =~ /gi\|(.+?)\|/
154                 gi = $1
155             else
156               Util.fatal_error( PRG_NAME, "no gi in [" + seq_desc + "]"  )
157             end
158
159             seq_name = ""
160
161             if seq_desc =~ /\|\s*([^|]+?)\s*\[/
162                 seq_name = $1
163             end
164
165             if  seq_name =~ /\[.+\]$/
166                 # Redundant taxonomy information hides here.
167                 seq_name = seq_name.sub(/\[.+\]$/, '')
168             end
169             if  seq_name =~ /^\s*hypothetical\s+protein\s*/i
170                 # Pointless information.
171                 seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' )
172             end
173             if  seq_name =~ /^\s*conserved\s+hypothetical\s+protein\s*/i
174                 # Pointless information.
175                 seq_name = seq_name.sub( /^\s*conserved\s+hypothetical\s+protein\s*/i, '' )
176             end
177
178             if gi != nil
179             mapping_outfile.print( new_name + "\t" +
180                  TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
181                  TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" +
182                  TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" +
183                  TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
184                  TseqTaxonomyProcessor::SEQ_ACCESSION + gi.to_s + "\t" +
185                  TseqTaxonomyProcessor::SEQ_ACCESSION_SOURCE + "gi" + "\t" +
186                  TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" +
187                  TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string +
188                  Constants::LINE_DELIMITER )
189             else
190                  mapping_outfile.print( new_name + "\t" +
191                  TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
192                  TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" +
193                  TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" +
194                  TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
195                  TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" +
196                  TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string +
197                  Constants::LINE_DELIMITER )
198                 
199             end    
200             new_name
201         end
202
203     end
204
205 end # module Evoruby