2 # = lib/evo/apps/taxonomy_processor - TaxonomyProcessor class
4 # Copyright:: Copyright (C) 2017 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 require 'lib/evo/util/constants'
8 require 'lib/evo/util/util'
9 require 'lib/evo/msa/msa_factory'
10 require 'lib/evo/msa/msa'
11 require 'lib/evo/io/msa_io'
12 require 'lib/evo/io/parser/fasta_parser'
13 require 'lib/evo/io/parser/general_msa_parser'
14 require 'lib/evo/io/writer/fasta_writer'
15 require 'lib/evo/io/writer/phylip_sequential_writer'
16 require 'lib/evo/util/command_line_arguments'
# Command-line application that replaces the labels of the sequences in a
# multiple sequence file with short generated identifiers and records the
# mapping from new identifier to original label in a separate list file.
19 class TaxonomyProcessor
# One-line description of the program.
23 PRG_DESC = "Replacement of labels in multiple sequence files"
# Project home page.
25 WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
# Command-line option: extract a taxonomy code from a bracketed expression
# in each label and append it to the generated identifier.
27 EXTRACT_TAXONOMY_OPTION = "t"
# Command-line option (takes a value): annotation string appended to every
# generated identifier.
28 ANNOTATION_OPTION = "a"
# Command-line option that requests the help text.
29 HELP_OPTION_1 = "help"
33 Util.print_program_information( PRG_NAME,
40 if ( ARGV == nil || ( ARGV.length < 1 ) )
46 cla = CommandLineArguments.new( ARGV )
47 rescue ArgumentError => e
48 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
51 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
52 cla.is_option_set?( HELP_OPTION_2 ) )
61 if cla.get_number_of_files == 3
62 input = cla.get_file_name( 0 )
63 output = cla.get_file_name( 1 )
64 list_file = cla.get_file_name( 2 )
65 elsif cla.get_number_of_files == 1
66 input = cla.get_file_name( 0 )
68 if input.downcase.end_with?( ".fasta" )
69 i = input[ 0 .. input.length - 7 ]
70 elsif input.downcase.end_with?( ".fsa" )
71 i = input[ 0 .. input.length - 5 ]
75 output = i + Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX
76 list_file = i + Constants::ID_MAP_FILE_SUFFIX
82 allowed_opts = Array.new
83 allowed_opts.push( EXTRACT_TAXONOMY_OPTION )
84 allowed_opts.push( ANNOTATION_OPTION )
86 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
87 if ( disallowed.length > 0 )
88 Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
91 extract_taxonomy = false
92 if ( cla.is_option_set?( EXTRACT_TAXONOMY_OPTION ) )
93 extract_taxonomy = true
97 if ( cla.is_option_set?( ANNOTATION_OPTION ) )
98 annotation = cla.get_option_value( ANNOTATION_OPTION )
101 if ( File.exist?( output ) )
102 Util.fatal_error( PRG_NAME, "outfile [" + output + "] already exists" )
104 if ( File.exist?( list_file ) )
105 Util.fatal_error( PRG_NAME, "list file [" + list_file + "] already exists" )
107 if ( !File.exist?( input) )
108 Util.fatal_error( PRG_NAME, "infile [" + input + "] does not exist" )
111 fasta_like = Util.looks_like_fasta?( input )
114 puts( "Input alignment : " + input )
115 puts( "Output alignment: " + output )
116 puts( "Name list : " + list_file )
118 puts( "Format : Fasta" )
120 puts( "Format : Phylip like" )
122 if ( extract_taxonomy )
123 puts( "Extract taxonomy: true" )
125 if ( annotation != nil )
126 puts( "Annotation : " + annotation )
133 msa = f.create_msa_from_file( input, FastaParser.new() )
135 msa = f.create_msa_from_file( input, GeneralMsaParser.new() )
137 rescue Exception => e
138 Util.fatal_error( PRG_NAME, "failed to read file: " + e.to_s )
141 if ( msa == nil || msa.get_number_of_seqs() < 1 )
142 Util.fatal_error( PRG_NAME, "failed to read MSA" )
145 Util.check_file_for_writability( list_file )
146 rescue Exception => e
147 Util.fatal_error( PRG_NAME, "error: " + e.to_, STDOUT )
150 lf = File.open( list_file, "a" )
151 for i in 0 ... msa.get_number_of_seqs
152 seq = msa.get_sequence( i )
153 seq.set_name( modify_name( seq.get_name(), i, lf, extract_taxonomy, annotation ) )
158 w = FastaWriter.new()
160 w = PhylipSequentialWriter.new()
162 w.set_max_name_length( 9 )
165 io.write_to_file( msa, output, w )
166 rescue Exception => e
167 Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
170 Util.print_message( PRG_NAME, "wrote: " + list_file )
171 Util.print_message( PRG_NAME, "wrote: " + output )
172 Util.print_message( PRG_NAME, "next steps in standard analysis pipeline: hmmscan followed by hsp.rb")
173 Util.print_message( PRG_NAME, "OK" )
# Builds the short identifier that replaces a sequence label and writes the
# mapping "<identifier>\t<original label>" to the id-list file.
#
# The identifier is the counter in hexadecimal, optionally followed by "_" and
# a taxonomy code taken from a bracketed expression in the label, optionally
# followed by the annotation string.
#
# desc             - original sequence label; it is NOT modified (earlier
#                    versions normalized its whitespace in place, which also
#                    failed on frozen strings).
# counter          - Integer index of the sequence, rendered in base 16.
# file             - IO-like object responding to #print; receives one
#                    mapping line per call.
# extract_taxonomy - when true, the label must contain " [CODE]" and CODE is
#                    appended to the identifier; a label without it is a
#                    fatal error.
# annotation       - String appended to the identifier (and, after a space, to
#                    the mapping line), or nil for none.
#
# Returns the new identifier. Identifiers longer than 9 characters are reported
# through Util.fatal_error (presumably terminating the program - TODO confirm).
def modify_name( desc, counter, file, extract_taxonomy, annotation )
  # Collapse every whitespace run to one blank on a copy of the label.
  label = desc.gsub( /\s+/, ' ' )

  new_desc = nil
  if extract_taxonomy
    if label =~ /\s\[(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP)\]/
      new_desc = counter.to_s( 16 ) + "_" + Regexp.last_match( 1 )
    else
      Util.fatal_error( PRG_NAME, "could not get taxonomy from: " + label )
    end
  else
    new_desc = counter.to_s( 16 )
  end

  if annotation.nil?
    file.print( new_desc + "\t" + label + "\n" )
  else
    new_desc = new_desc + annotation
    file.print( new_desc + "\t" + label + " " + annotation + "\n" )
  end

  # Same limit as the maximum name length configured on the output writer.
  if new_desc.length > 9
    Util.fatal_error( PRG_NAME, "shortened identifier [" +
      new_desc + "] is too long (" + new_desc.length.to_s + " characters)" )
  end
  new_desc
end
# Usage text: command synopsis, the two supported options, a pointer to the
# next pipeline steps, and an example invocation (the enclosing `def` line is
# not part of this excerpt).
206 puts( " " + PRG_NAME + ".rb [options] <input sequences> [output sequences] [output id list]" )
208 puts( " options: -" + EXTRACT_TAXONOMY_OPTION + " : to extract taxonomy information from bracketed expressions" )
209 puts( " -" + ANNOTATION_OPTION + "=<s>: to add an annotation to all entries" )
211 puts( " [next steps in standard analysis pipeline: hmmscan followed by hsp.rb]")
215 puts( " " + PRG_NAME + ".rb P53.fasta" )
219 end # class TaxonomyProcessor