2 # = lib/evo/apps/new_tap.rb - TaxonomyProcessor class
4 # Copyright:: Copyright (C) 2009 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: new_tap.rb,v 1.4 2010/12/13 19:00:11 cmzmasek Exp $
10 require 'lib/evo/util/util'
11 require 'lib/evo/msa/msa_factory'
12 require 'lib/evo/msa/msa'
13 require 'lib/evo/io/msa_io'
14 require 'lib/evo/io/parser/fasta_parser'
15 require 'lib/evo/io/parser/general_msa_parser'
16 require 'lib/evo/io/writer/fasta_writer'
17 require 'lib/evo/io/writer/phylip_sequential_writer'
18 require 'lib/evo/util/command_line_arguments'
22 class TaxonomyProcessor
25 PRG_DATE = "2009.10.09"
26 PRG_DESC = "replacement of labels in multiple sequence files"
28 COPYRIGHT = "2009 Christian M Zmasek"
29 CONTACT = "phylosoft@gmail.com"
30 WWW = "www.phylosoft.org"
32 REMOVE_REDUNDANT_SEQS_OPTION = "rr"
35 @taxonomies = Hash.new()
40 Util.print_program_information( PRG_NAME,
49 if ( ARGV == nil || ( ARGV.length != 3 && ARGV.length != 4 ) )
50 puts( "Usage: #{PRG_NAME}.rb <input sequences> <output sequences> <output map>" )
52 puts( " options: -" + REMOVE_REDUNDANT_SEQS_OPTION + ": to remove redundant sequences" )
58 cla = CommandLineArguments.new( ARGV )
59 rescue ArgumentError => e
60 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
63 input = cla.get_file_name( 0 )
64 output = cla.get_file_name( 1 )
65 map_file = cla.get_file_name( 2 )
67 allowed_opts = Array.new
68 allowed_opts.push( REMOVE_REDUNDANT_SEQS_OPTION )
70 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
71 if ( disallowed.length > 0 )
72 Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
76 remove_redudant = false
77 if ( cla.is_option_set?( REMOVE_REDUNDANT_SEQS_OPTION ) )
78 remove_redudant = true
81 if ( File.exists?( output ) )
82 Util.fatal_error( PRG_NAME, "outfile [" + output + "] already exists" )
84 if ( File.exists?( map_file ) )
85 Util.fatal_error( PRG_NAME, "map file [" + map_file + "] already exists" )
87 if ( !File.exists?( input) )
88 Util.fatal_error( PRG_NAME, "infile [" + input + "] does not exist" )
91 fasta_like = Util.looks_like_fasta?( input )
94 puts( "Input alignment : " + input )
95 puts( "Output alignment: " + output )
96 puts( "Output map : " + map_file )
98 puts( "Format : Fasta" )
100 puts( "Format : Phylip like" )
104 species_map = Hash.new
109 msa = f.create_msa_from_file( input, FastaParser.new() )
111 msa = f.create_msa_from_file( input, GeneralMsaParser.new() )
113 rescue Exception => e
114 Util.fatal_error( PRG_NAME, "failed to read file: " + e.to_s )
117 if ( msa == nil || msa.get_number_of_seqs() < 1 )
118 Util.fatal_error( PRG_NAME, "failed to read MSA" )
121 Util.check_file_for_writability( map_file )
122 rescue Exception => e
123 Util.fatal_error( PRG_NAME, "error: " + e.to_, STDOUT )
126 if ( remove_redudant )
127 removed = msa.remove_redundant_sequences!( true )
129 Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
130 removed.each { | seq_name |
133 Util.print_message( PRG_NAME, "will process " + msa.get_number_of_seqs.to_s + " non redundant sequences" )
137 lf = File.open( map_file, "a" )
138 for i in 0 ... msa.get_number_of_seqs
139 seq = msa.get_sequence( i )
145 w = FastaWriter.new()
147 w = PhylipSequentialWriter.new()
149 w.set_max_name_length( 10 )
152 io.write_to_file( msa, output, w )
153 rescue Exception => e
154 Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
157 if ( @taxonomies.length > 0 )
158 Util.print_message( PRG_NAME, "number of unique taxonomies: " + @taxonomies.length.to_s )
160 Util.print_message( PRG_NAME, "wrote: " + map_file )
161 Util.print_message( PRG_NAME, "wrote: " + output )
162 Util.print_message( PRG_NAME, "OK" )