2 # = lib/evo/apps/evo_nursery.rb - EvoNursery class
4 # Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: evo_nursery.rb,v 1.11 2010/12/13 19:00:11 cmzmasek Exp $
11 require 'lib/evo/soft/fastme'
12 require 'lib/evo/soft/tree_puzzle'
13 require 'lib/evo/util/constants'
14 require 'lib/evo/util/util'
15 require 'lib/evo/util/command_line_arguments'
16 require 'lib/evo/msa/msa_factory'
17 require 'lib/evo/io/msa_io'
18 require 'lib/evo/io/writer/phylip_sequential_writer'
19 require 'lib/evo/io/parser/general_msa_parser'
20 require 'lib/evo/io/writer/msa_writer'
28 GAP_RATIO_FOR_SEQS = 0.75
32 MAX_ALN_FILE_SIZE = 5000000
35 FASTME_INITIAL_TREE = :GME
37 TREE_PUZZLE_OUTDIST = TreePuzzle::OUTDIST
38 TREE_PUZZLE_OUTFILE = TreePuzzle::OUTFILE
39 FASTME_OUTTREE = FastMe::OUTTREE
40 FASTME_OUTPUT_D = FastMe::OUTPUT_D
42 PRG_NAME = "evo_nursery"
43 PRG_DATE = "2012.03.21"
44 PRG_DESC = "pfam alignments to evolutionary trees"
46 COPYRIGHT = "2009-2012 Christian M Zmasek"
47 CONTACT = "phylosoft@gmail.com"
48 WWW = "www.phylosoft.org"
50 HELP_OPTION_1 = "help"
55 Util.print_program_information( PRG_NAME,
64 if RUBY_VERSION !~ /1.9/
65 puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " )
69 forester_home = Util.get_env_variable_value( Constants::FORESTER_HOME_ENV_VARIABLE )
70 java_home = Util.get_env_variable_value( Constants::JAVA_HOME_ENV_VARIABLE )
71 decorator = java_home + '/bin/java -cp ' + forester_home + '/java/forester.jar org.forester.application.decorator'
73 if ( ARGV == nil || ARGV.length != 1 )
79 cla = CommandLineArguments.new( ARGV )
80 rescue ArgumentError => e
81 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
84 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
85 cla.is_option_set?( HELP_OPTION_2 ) )
90 output_dir = cla.get_file_name( 0 )
92 if output_dir !~ /\/$/
93 output_dir = output_dir + '/'
96 if !File.exists?( output_dir )
97 Util.fatal_error( PRG_NAME, output_dir.to_s + " does not exist", STDOUT )
99 ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' )
100 files = Dir.entries( "." )
105 files.each { |pfam_aln_file|
106 if ( !File.directory?( pfam_aln_file ) &&
107 pfam_aln_file !~ /^\./ &&
108 pfam_aln_file !~ /.+\.tre$/ )
110 tree_out_file = output_dir + File.basename( pfam_aln_file ) + ".xml"
112 if File.exists?( tree_out_file )
115 puts "***** skipping " + File.basename( pfam_aln_file ) + ", already exists"
117 skipped.push( File.basename( pfam_aln_file ) + " [already exists]" )
122 puts counter.to_s + ": " + pfam_aln_file.to_str
124 if File.size( pfam_aln_file ) > MAX_ALN_FILE_SIZE
125 puts "***** skipping, file size: " + File.size( pfam_aln_file ).to_s
126 skipped.push( File.basename( pfam_aln_file ) + " [file size: " + File.size( pfam_aln_file ).to_s + "]" )
131 msa = f.create_msa_from_file( pfam_aln_file, GeneralMsaParser.new() )
133 if msa.get_number_of_seqs < MIN_SEQS || msa.get_number_of_seqs > MAX_SEQS
134 puts "***** skipping, seqs: " + msa.get_number_of_seqs.to_s
135 skipped.push( File.basename( pfam_aln_file ) + " [seqs: " + msa.get_number_of_seqs.to_s + "]" )
139 msa.remove_gap_columns_w_gap_ratio!( GAP_RATIO )
141 length = msa.get_length
142 if length < MIN_LENGTH
143 puts "***** skipping, length: " + length.to_s
144 skipped.push( File.basename( pfam_aln_file ) + " [length: " + length.to_s + "]" )
148 msa.remove_sequences_by_gap_ratio!( GAP_RATIO_FOR_SEQS )
150 if msa.get_number_of_seqs < MIN_SEQS
151 puts "***** skipping, seqs: " + msa.get_number_of_seqs.to_s
152 skipped.push( File.basename( pfam_aln_file ) + " [seqs: " + msa.get_number_of_seqs.to_s + "]" )
156 map_file = output_dir + File.basename( pfam_aln_file ) + ".map"
157 f = File.open( map_file, 'a' )
158 for i in 0 ... msa.get_number_of_seqs
159 name = msa.get_sequence( i ).get_name()
160 name =~ /(.+)_(.+)\/.+/
166 mapping_str << 'TAXONOMY_CODE:'
167 mapping_str << tax_code
169 mapping_str << 'SEQ_SYMBOL:'
170 mapping_str << ( acc + '_' + tax_code )
172 if ( acc.length < 6 )
173 acc = acc + '_' + tax_code
175 mapping_str << 'SEQ_ACCESSION:'
178 mapping_str << 'SEQ_ACCESSION_SOURCE:UniProtKB'
180 mapping_str << 'NODE_NAME:'
182 f.print( mapping_str )
184 name = msa.get_sequence( i ).set_name( i.to_s )
190 w = PhylipSequentialWriter.new()
192 w.set_max_name_length( 10 )
193 if File.exists?( output_dir + ALN_NAME )
194 File.unlink( output_dir + ALN_NAME )
196 io.write_to_file( msa, output_dir + ALN_NAME, w )
198 tp = TreePuzzle.new()
199 tp.run( output_dir + ALN_NAME,
202 msa.get_number_of_seqs )
204 File.rename( output_dir + ALN_NAME, output_dir + File.basename( pfam_aln_file ) + ".aln" )
206 fastme = FastMe.new()
207 fastme.run( TREE_PUZZLE_OUTDIST, 0, FASTME_INITIAL_TREE )
211 File.open( pfam_aln_file ) do |file|
212 while line = file.gets
213 line = ic.iconv( line )
214 if line =~ /^#=AC\s+(.+)/
217 if line =~ /^#=DE\s+(.+)/
220 if pfam_acc && pfam_de
225 if !pfam_acc || !pfam_de
226 Util.fatal_error( PRG_NAME, "problem with " + pfam_aln_file.to_s, STDOUT )
230 File.open( TREE_PUZZLE_OUTFILE ) do |file|
231 while line = file.gets
232 line = ic.iconv( line )
233 if line =~ /^Model\s+of\s+substitution:\s+(.+)/
240 Util.fatal_error( PRG_NAME, "problem with puzzle outfile: " + TREE_PUZZLE_OUTFILE.to_s, STDOUT )
245 desc << 'ML pwd estimation by TREE-PUZZLE version '
246 desc << TreePuzzle::VERSION
251 desc << '; tree estimation by FastME version '
252 desc << FastMe::VERSION
253 desc << ', initial tree: '
254 desc << FASTME_INITIAL_TREE.to_s
255 desc << '; aln length: '
256 desc << msa.get_length.to_s
258 cmd = decorator + " -table -p -pn=\"" + pfam_aln_file +
259 "\" -pi=pfam:" + pfam_acc +
260 " -pd=\"" + desc + "\" " +
261 FASTME_OUTTREE + ' ' +
262 map_file + ' ' + tree_out_file
264 IO.popen( cmd , 'r+' ) do | pipe |
269 File.unlink( map_file )
270 File.unlink(TREE_PUZZLE_OUTDIST)
271 File.unlink( TREE_PUZZLE_OUTFILE )
272 File.unlink( FASTME_OUTPUT_D )
275 rescue ArgumentError, IOError, StandardError => e
276 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
282 for i in 0 ... skipped.size
283 puts i.to_s + ": " + skipped[ i ]
287 puts( 'Skipped : ' + skipped.size.to_s + ' alignments' )
288 puts( 'Analyzed: ' + analyzed.to_s + ' alignments' )
290 puts( 'Min gap ratio for col del : ' + GAP_RATIO.to_s )
291 puts( 'Min gap ratio for seq del : ' + GAP_RATIO_FOR_SEQS.to_s )
292 puts( 'Minimal aln length : ' + MIN_LENGTH.to_s )
293 puts( 'Minimal number of sequences: ' + MIN_SEQS.to_s )
294 puts( 'Maximal number of sequences: ' + MAX_SEQS.to_s )
295 puts( 'Maximal aln file size : ' + MAX_ALN_FILE_SIZE.to_s )
296 puts( 'Model : ' + MODEL.to_s )
297 puts( 'FastME initial tree: ' + FASTME_INITIAL_TREE.to_s )
300 puts( '[' + PRG_NAME + '] > OK' )
310 puts( " " + PRG_NAME + ".rb <output dir> " )
315 end # class EvoNursery