in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / evo_nursery.rb
1 #
2 # = lib/evo/apps/evo_nursery.rb - EvoNursery class
3 #
4 # Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: evo_nursery.rb,v 1.11 2010/12/13 19:00:11 cmzmasek Exp $
8
9
10
11 require 'lib/evo/soft/fastme'
12 require 'lib/evo/soft/tree_puzzle'
13 require 'lib/evo/util/constants'
14 require 'lib/evo/util/util'
15 require 'lib/evo/util/command_line_arguments'
16 require 'lib/evo/msa/msa_factory'
17 require 'lib/evo/io/msa_io'
18 require 'lib/evo/io/writer/phylip_sequential_writer'
19 require 'lib/evo/io/parser/general_msa_parser'
20 require 'lib/evo/io/writer/msa_writer'
21
22 require 'iconv'
23
24 module Evoruby
25
26     class EvoNursery
27         GAP_RATIO           = 0.50
28         GAP_RATIO_FOR_SEQS  = 0.75
29         MIN_LENGTH          = 30
30         MIN_SEQS            = 4
31         MAX_SEQS            = 3000
32         MAX_ALN_FILE_SIZE   = 5000000
33         MODEL               = :auto
34         RATES               = :uniform
35         FASTME_INITIAL_TREE = :GME
36         ALN_NAME            = '_align_'
37         TREE_PUZZLE_OUTDIST = TreePuzzle::OUTDIST
38         TREE_PUZZLE_OUTFILE = TreePuzzle::OUTFILE
39         FASTME_OUTTREE      = FastMe::OUTTREE
40         FASTME_OUTPUT_D     = FastMe::OUTPUT_D
41
42         PRG_NAME       = "evo_nursery"
43         PRG_DATE       = "2012.03.21"
44         PRG_DESC       = "pfam alignments to evolutionary trees"
45         PRG_VERSION    = "0.20"
46         COPYRIGHT      = "2009-2012 Christian M Zmasek"
47         CONTACT        = "phylosoft@gmail.com"
48         WWW            = "www.phylosoft.org"
49
50         HELP_OPTION_1       = "help"
51         HELP_OPTION_2       = "h"
52
53         def run
54
55             Util.print_program_information( PRG_NAME,
56                 PRG_VERSION,
57                 PRG_DESC,
58                 PRG_DATE,
59                 COPYRIGHT,
60                 CONTACT,
61                 WWW,
62                 STDOUT )
63
64             if RUBY_VERSION !~ /1.9/
65                 puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " )
66                 exit( -1 )
67             end
68
69             forester_home = Util.get_env_variable_value( Constants::FORESTER_HOME_ENV_VARIABLE )
70             java_home = Util.get_env_variable_value( Constants::JAVA_HOME_ENV_VARIABLE )
71             decorator = java_home + '/bin/java -cp ' + forester_home + '/java/forester.jar org.forester.application.decorator'
72
73             if ( ARGV == nil || ARGV.length != 1 )
74                 help
75                 exit( -1 )
76             end
77
78             begin
79                 cla = CommandLineArguments.new( ARGV )
80             rescue ArgumentError => e
81                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
82             end
83
84             if ( cla.is_option_set?( HELP_OPTION_1 ) ||
85                      cla.is_option_set?( HELP_OPTION_2 ) )
86                 help
87                 exit( 0 )
88             end
89
90             output_dir = cla.get_file_name( 0 )
91
92             if output_dir !~ /\/$/
93                 output_dir = output_dir + '/'
94             end
95
96             if !File.exists?( output_dir )
97                 Util.fatal_error( PRG_NAME, output_dir.to_s + " does not exist", STDOUT )
98             end
99             ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' )
100             files = Dir.entries( "." )
101             skipped = Array.new
102             counter = 1
103             analyzed = 0;
104             begin
105                 files.each { |pfam_aln_file|
106                     if ( !File.directory?( pfam_aln_file ) &&
107                              pfam_aln_file !~ /^\./ &&
108                              pfam_aln_file !~ /.+\.tre$/  )
109
110                         tree_out_file = output_dir + File.basename( pfam_aln_file ) + ".xml"
111
112                         if File.exists?( tree_out_file )
113                             puts
114                             puts
115                             puts "***** skipping " + File.basename( pfam_aln_file ) + ", already exists"
116                             puts
117                             skipped.push( File.basename( pfam_aln_file ) + " [already exists]" )
118                             next
119                         end
120
121                         puts
122                         puts counter.to_s + ": " + pfam_aln_file.to_str
123                         counter += 1
124                         if File.size( pfam_aln_file ) > MAX_ALN_FILE_SIZE
125                             puts "***** skipping, file size: " +  File.size( pfam_aln_file ).to_s
126                             skipped.push( File.basename( pfam_aln_file ) + " [file size: " +  File.size( pfam_aln_file ).to_s + "]" )
127                             next
128                         end
129
130                         f = MsaFactory.new()
131                         msa = f.create_msa_from_file( pfam_aln_file, GeneralMsaParser.new() )
132
133                         if msa.get_number_of_seqs < MIN_SEQS || msa.get_number_of_seqs > MAX_SEQS
134                             puts "***** skipping, seqs: " + msa.get_number_of_seqs.to_s
135                             skipped.push( File.basename( pfam_aln_file ) + " [seqs: " +  msa.get_number_of_seqs.to_s + "]" )
136                             next
137                         end
138
139                         msa.remove_gap_columns_w_gap_ratio!( GAP_RATIO )
140
141                         length = msa.get_length
142                         if length < MIN_LENGTH
143                             puts "***** skipping, length: " + length.to_s
144                             skipped.push( File.basename( pfam_aln_file ) + " [length: " +  length.to_s + "]" )
145                             next
146                         end
147
148                         msa.remove_sequences_by_gap_ratio!( GAP_RATIO_FOR_SEQS )
149
150                         if msa.get_number_of_seqs < MIN_SEQS
151                             puts "***** skipping, seqs: " + msa.get_number_of_seqs.to_s
152                             skipped.push( File.basename( pfam_aln_file ) + " [seqs: " +  msa.get_number_of_seqs.to_s + "]" )
153                             next
154                         end
155
156                         map_file = output_dir + File.basename( pfam_aln_file ) + ".map"
157                         f = File.open( map_file, 'a' )
158                         for i in 0 ... msa.get_number_of_seqs
159                             name = msa.get_sequence( i ).get_name()
160                             name =~ /(.+)_(.+)\/.+/
161                             acc = $1
162                             tax_code = $2
163
164                             mapping_str = i.to_s
165                             mapping_str << "\t"
166                             mapping_str << 'TAXONOMY_CODE:'
167                             mapping_str << tax_code
168                             mapping_str << "\t"
169                             mapping_str << 'SEQ_SYMBOL:'
170                             mapping_str << ( acc + '_' + tax_code )
171                             mapping_str << "\t"
172                             if ( acc.length < 6 )
173                                 acc = acc + '_' + tax_code
174                             end
175                             mapping_str << 'SEQ_ACCESSION:'
176                             mapping_str << acc
177                             mapping_str << "\t"
178                             mapping_str << 'SEQ_ACCESSION_SOURCE:UniProtKB'
179                             mapping_str << "\t"
180                             mapping_str << 'NODE_NAME:'
181                             mapping_str << name
182                             f.print( mapping_str )
183                             f.print( "\n" )
184                             name = msa.get_sequence( i ).set_name( i.to_s )
185                         end
186                         f.close
187
188                         io = MsaIO.new()
189                         w = MsaWriter
190                         w = PhylipSequentialWriter.new()
191                         w.clean( true )
192                         w.set_max_name_length( 10 )
193                         if File.exists?( output_dir + ALN_NAME )
194                             File.unlink( output_dir + ALN_NAME )
195                         end
196                         io.write_to_file( msa, output_dir + ALN_NAME, w )
197
198                         tp = TreePuzzle.new()
199                         tp.run( output_dir + ALN_NAME,
200                             MODEL,
201                             RATES,
202                             msa.get_number_of_seqs )
203
204                         File.rename( output_dir + ALN_NAME, output_dir  + File.basename( pfam_aln_file ) + ".aln" )
205
206                         fastme = FastMe.new()
207                         fastme.run( TREE_PUZZLE_OUTDIST, 0, FASTME_INITIAL_TREE )
208
209                         pfam_acc = nil
210                         pfam_de = nil
211                         File.open( pfam_aln_file ) do |file|
212                             while line = file.gets
213                                 line = ic.iconv( line )
214                                 if line =~ /^#=AC\s+(.+)/
215                                     pfam_acc = $1
216                                 end
217                                 if line =~ /^#=DE\s+(.+)/
218                                     pfam_de = $1
219                                 end
220                                 if pfam_acc && pfam_de
221                                     break
222                                 end
223                             end
224                         end
225                         if !pfam_acc || !pfam_de
226                             Util.fatal_error( PRG_NAME, "problem with " + pfam_aln_file.to_s, STDOUT )
227                         end
228
229                         puzzle_model = nil
230                         File.open( TREE_PUZZLE_OUTFILE ) do |file|
231                             while line = file.gets
232                                 line = ic.iconv( line )
233                                 if line =~ /^Model\s+of\s+substitution:\s+(.+)/
234                                     puzzle_model = $1
235                                     break
236                                 end
237                             end
238                         end
239                         if !puzzle_model
240                             Util.fatal_error( PRG_NAME, "problem with puzzle outfile: " + TREE_PUZZLE_OUTFILE.to_s, STDOUT )
241                         end
242
243                         desc = pfam_de
244                         desc << ' | '
245                         desc << 'ML pwd estimation by TREE-PUZZLE version '
246                         desc << TreePuzzle::VERSION
247                         desc << ', model: '
248                         desc << puzzle_model
249                         desc << ', rates: '
250                         desc << RATES.to_s
251                         desc << '; tree estimation by FastME version '
252                         desc << FastMe::VERSION
253                         desc << ', initial tree: '
254                         desc << FASTME_INITIAL_TREE.to_s
255                         desc << '; aln length: '
256                         desc << msa.get_length.to_s
257
258                         cmd = decorator + " -table -p -pn=\"" + pfam_aln_file +
259                          "\" -pi=pfam:" + pfam_acc +
260                          " -pd=\"" + desc + "\" " +
261                          FASTME_OUTTREE + ' ' +
262                          map_file + ' ' + tree_out_file
263
264                         IO.popen( cmd , 'r+' ) do | pipe |
265                             pipe.close_write
266                         end
267                         analyzed += 1
268
269                         File.unlink( map_file )
270                         File.unlink(TREE_PUZZLE_OUTDIST)
271                         File.unlink( TREE_PUZZLE_OUTFILE )
272                         File.unlink( FASTME_OUTPUT_D )
273                     end
274                 }
275             rescue ArgumentError, IOError, StandardError => e
276                 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
277             end
278
279             puts()
280             puts( 'Skipped:' )
281             puts()
282             for i in 0 ... skipped.size
283                 puts i.to_s + ": " + skipped[ i ]
284             end
285
286             puts()
287             puts( 'Skipped : ' + skipped.size.to_s + ' alignments' )
288             puts( 'Analyzed: ' +  analyzed.to_s    + ' alignments' )
289
290             puts( 'Min gap ratio for col del  : ' + GAP_RATIO.to_s )
291             puts( 'Min gap ratio for seq del  : ' + GAP_RATIO_FOR_SEQS.to_s )
292             puts( 'Minimal aln length         : ' + MIN_LENGTH.to_s )
293             puts( 'Minimal number of sequences: ' + MIN_SEQS.to_s )
294             puts( 'Maximal number of sequences: ' + MAX_SEQS.to_s )
295             puts( 'Maximal aln file size      : ' + MAX_ALN_FILE_SIZE.to_s )
296             puts( 'Model              : ' + MODEL.to_s )
297             puts( 'FastME initial tree: ' + FASTME_INITIAL_TREE.to_s )
298
299             puts()
300             puts( '[' + PRG_NAME + '] > OK' )
301             puts()
302
303         end  # run
304
305         private
306
307         def help
308             puts( "Usage:" )
309             puts()
310             puts( "  " + PRG_NAME + ".rb <output dir> " )
311             puts()
312         end
313
314
315     end # class EvoNursery
316
317 end # module Evoruby