2 # = lib/evo/apps/taxonomy_processor - TaxonomyProcessor class
4 # Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: taxonomy_processor.rb,v 1.26 2010/12/13 19:00:11 cmzmasek Exp $
10 require 'lib/evo/util/util'
11 require 'lib/evo/msa/msa_factory'
12 require 'lib/evo/msa/msa'
13 require 'lib/evo/io/msa_io'
14 require 'lib/evo/io/parser/fasta_parser'
15 require 'lib/evo/io/parser/general_msa_parser'
16 require 'lib/evo/io/writer/fasta_writer'
17 require 'lib/evo/io/writer/phylip_sequential_writer'
18 require 'lib/evo/util/command_line_arguments'
22 class TaxonomyProcessor
26 PRG_DESC = "replacement of species names in multiple sequence files"
28 COPYRIGHT = "2013 Christian M Zmasek"
29 CONTACT = "phylosoft@gmail.com"
30 WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
32 EXTRACT_TAXONOMY_OPTION = "t"
# Command-line entry point of the program.
#
# Reads a multiple sequence file (Fasta or Phylip-like, decided by
# Util.looks_like_fasta?), replaces every sequence name with a short
# generated one (see modify_name), writes the renamed sequences to the
# output file and appends one "new name<TAB>old name" line per sequence
# to the list file.
#
# File arguments: either <input> <output> <list file>, or <input> alone,
# in which case the output names are derived from the input name
# (a trailing ".fasta"/".fsa" is removed, then "_ni.fasta" and ".nim"
# are appended).
#
# Problems are reported through Util.fatal_error.
#
# NOTE(review): this method was rebuilt from a listing in which short
# lines (begin/else/end, blank puts, object construction such as
# MsaFactory.new / MsaIO.new, the usage exit) were missing.  Those parts
# are reconstructed from the surrounding statements -- confirm them, and
# any further writer configuration, against the original file.
def run
  # NOTE(review): only the first argument of this call was visible; the
  # remaining ones are assumed -- verify against the signature of
  # Util.print_program_information.
  Util.print_program_information( PRG_NAME,
    PRG_VERSION,
    PRG_DESC,
    PRG_DATE,
    COPYRIGHT,
    CONTACT,
    WWW,
    STDOUT )

  if ARGV.nil? || !( 1..6 ).cover?( ARGV.length )
    puts( "Usage: #{PRG_NAME}.rb [options] <input sequences> [output sequences] [output id list]" )
    puts
    puts( " options: -" + EXTRACT_TAXONOMY_OPTION + ": to extract taxonomy information from bracketed expression" )
    puts
    exit( -1 )
  end

  begin
    cla = CommandLineArguments.new( ARGV )
  rescue ArgumentError => e
    Util.fatal_error( PRG_NAME, "error: " + e.to_s )
  end

  input     = nil
  output    = nil
  list_file = nil

  case cla.get_number_of_files
  when 3
    input     = cla.get_file_name( 0 )
    output    = cla.get_file_name( 1 )
    list_file = cla.get_file_name( 2 )
  when 1
    input = cla.get_file_name( 0 )
    # Strip a known extension (case-insensitively).  Slicing by length
    # instead of "0 .. length - n" also works when the name consists of
    # the extension only (a negative range end would wrap around).
    suffix = [ ".fasta", ".fsa" ].find { |ext| input.downcase.end_with?( ext ) }
    base   = suffix ? input[ 0, input.length - suffix.length ] : input
    output    = base + "_ni.fasta"
    list_file = base + ".nim"
  else
    # Previously any other count left all three names nil and the program
    # died with a TypeError in the file checks below.
    Util.fatal_error( PRG_NAME, "expected either one or three file names" )
  end

  disallowed = cla.validate_allowed_options_as_str( [ EXTRACT_TAXONOMY_OPTION ] )
  if disallowed.length > 0
    Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
  end

  extract_taxonomy = cla.is_option_set?( EXTRACT_TAXONOMY_OPTION ) ? true : false

  # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2.
  if File.exist?( output )
    Util.fatal_error( PRG_NAME, "outfile [" + output + "] already exists" )
  end
  if File.exist?( list_file )
    Util.fatal_error( PRG_NAME, "list file [" + list_file + "] already exists" )
  end
  unless File.exist?( input )
    Util.fatal_error( PRG_NAME, "infile [" + input + "] does not exist" )
  end

  fasta_like = Util.looks_like_fasta?( input )

  puts
  puts( "Input alignment : " + input )
  puts( "Output alignment: " + output )
  puts( "Name list : " + list_file )
  if fasta_like
    puts( "Format : Fasta" )
  else
    puts( "Format : Phylip like" )
  end
  puts( "Extract taxonomy: true" ) if extract_taxonomy
  puts

  factory = MsaFactory.new
  msa = nil
  begin
    parser = fasta_like ? FastaParser.new : GeneralMsaParser.new
    msa = factory.create_msa_from_file( input, parser )
  rescue Exception => e
    # NOTE(review): the broad rescue is kept as in the original because the
    # exception hierarchy of the parser classes is not visible here.
    Util.fatal_error( PRG_NAME, "failed to read file: " + e.to_s )
  end

  if msa.nil? || msa.get_number_of_seqs < 1
    Util.fatal_error( PRG_NAME, "failed to read MSA" )
  end

  begin
    Util.check_file_for_writability( list_file )
  rescue Exception => e
    # Fixed: this read "e.to_", which raised NoMethodError instead of
    # reporting the actual problem.
    Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
  end

  # Block form closes the list file even if renaming a sequence fails.
  File.open( list_file, "a" ) do |lf|
    msa.get_number_of_seqs.times do |index|
      seq = msa.get_sequence( index )
      seq.set_name( modify_name( seq.get_name, index, lf, extract_taxonomy ) )
    end
  end

  io = MsaIO.new
  w = fasta_like ? FastaWriter.new : PhylipSequentialWriter.new
  w.set_max_name_length( 10 )
  begin
    io.write_to_file( msa, output, w )
  rescue Exception => e
    Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
  end

  Util.print_message( PRG_NAME, "wrote: " + list_file )
  Util.print_message( PRG_NAME, "wrote: " + output )
  Util.print_message( PRG_NAME, "OK" )
end
# Builds the replacement name for one sequence and records the mapping.
#
# The new name is +counter+ written in hexadecimal.  When
# +extract_taxonomy+ is set, an underscore and the taxonomy code found in
# a bracketed expression of +desc+ (for example " [HUMAN]") are appended.
# The line "<new name>\t<desc with whitespace runs collapsed>\n" is
# printed to +file+.
#
# desc::             original sequence name/description; not modified
# counter::          Integer index of the sequence
# file::             object responding to +print+ that receives the mapping line
# extract_taxonomy:: whether to append the bracketed taxonomy code
#
# Returns the new name.  Calls Util.fatal_error when a taxonomy code is
# requested but no bracketed code is found.
def modify_name( desc, counter, file, extract_taxonomy )
  new_desc = nil
  # Work on a copy: gsub! altered the caller's string in place and raised
  # FrozenError when that string was frozen.
  clean_desc = desc.gsub( /\s+/, ' ' )
  #if desc =~ /^>?\s*\S{1,10}_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP)/
  #  new_desc = counter.to_s( 16 ) + "_" + $1
  if extract_taxonomy
    if clean_desc =~ /\s\[(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP)\]/
      new_desc = counter.to_s( 16 ) + "_" + Regexp.last_match( 1 )
    else
      Util.fatal_error( PRG_NAME, "could not get taxonomy from: " + clean_desc )
    end
  else
    new_desc = counter.to_s( 16 )
  end
  file.print( new_desc + "\t" + clean_desc + "\n" )
  new_desc
end
186 end # class TaxonomyProcessor