X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fruby%2Fevoruby%2Flib%2Fevo%2Ftool%2Ftaxonomy_processor.rb;h=1d8a25bde4b888983bb16adfae67dcba21ae141d;hb=971d9105afd46581e56d4c0221500f47b3979496;hp=862a91a006089c76aae13ec09c1693b574744bd1;hpb=610ca99f1de9956bbe4c964acc48ea33b5049e4b;p=jalview.git diff --git a/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb b/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb index 862a91a..1d8a25b 100644 --- a/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb +++ b/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb @@ -1,12 +1,10 @@ # # = lib/evo/apps/taxonomy_processor - TaxonomyProcessor class # -# Copyright:: Copyright (C) 20017 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# - - +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +require 'lib/evo/util/constants' require 'lib/evo/util/util' require 'lib/evo/msa/msa_factory' require 'lib/evo/msa/msa' @@ -18,35 +16,29 @@ require 'lib/evo/io/writer/phylip_sequential_writer' require 'lib/evo/util/command_line_arguments' module Evoruby - class TaxonomyProcessor PRG_NAME = "tap" - PRG_DATE = "170206" - PRG_DESC = "replacement of species names in multiple sequence files" - PRG_VERSION = "2.002" - COPYRIGHT = "2017 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "" + PRG_DATE = "170214" + PRG_DESC = "Replacement of labels in multiple sequence files" + PRG_VERSION = "2.004" + WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" EXTRACT_TAXONOMY_OPTION = "t" - + ANNOTATION_OPTION = "a" + HELP_OPTION_1 = "help" + HELP_OPTION_2 = "h" def run() Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC, - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) - - if ( ARGV == nil || ( ARGV.length != 1 && ARGV.length != 2 && ARGV.length != 3 && ARGV.length != 4 && ARGV.length != 5 && ARGV.length != 6 ) ) - puts( "Usage: #{PRG_NAME}.rb [options] [output sequences] [output id list]" ) - puts() - puts( " options: -" + EXTRACT_TAXONOMY_OPTION + ": to extract taxonomy information from bracketed expression" ) - puts() + PRG_VERSION, + PRG_DESC, + PRG_DATE, + WWW, + STDOUT ) + + if ( ARGV == nil || ( ARGV.length < 1 ) ) + print_help() exit( -1 ) end @@ -56,9 +48,15 @@ module Evoruby Util.fatal_error( PRG_NAME, "error: " + e.to_s ) end - input = nil - output = nil - list_file = nil + if ( cla.is_option_set?( HELP_OPTION_1 ) || + cla.is_option_set?( HELP_OPTION_2 ) ) + print_help + exit( 0 ) + end + + input = nil + output = nil + list_file = nil if cla.get_number_of_files == 3 input = cla.get_file_name( 0 ) @@ -74,13 +72,16 @@ module Evoruby else i = input end - output = i + "_ni.fasta" - list_file = i + ".nim" + output = i + Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX + list_file = i + Constants::ID_MAP_FILE_SUFFIX + else + print_help() + exit(-1) end - allowed_opts = Array.new allowed_opts.push( EXTRACT_TAXONOMY_OPTION ) + allowed_opts.push( ANNOTATION_OPTION ) disallowed = cla.validate_allowed_options_as_str( allowed_opts ) if ( disallowed.length > 0 ) @@ -92,13 +93,18 @@ module Evoruby extract_taxonomy = true end - if ( File.exists?( output ) ) + annotation = nil + if ( cla.is_option_set?( ANNOTATION_OPTION ) ) + annotation = cla.get_option_value( ANNOTATION_OPTION ) + end + + if ( File.exist?( output ) ) Util.fatal_error( PRG_NAME, "outfile [" + output + "] already exists" ) end - if ( File.exists?( list_file ) ) + if ( File.exist?( list_file ) ) Util.fatal_error( PRG_NAME, "list file [" + list_file + "] already exists" ) end - if ( !File.exists?( input) ) + if ( !File.exist?( input) ) Util.fatal_error( PRG_NAME, "infile [" + input + "] does not exist" ) end @@ -116,6 +122,9 @@ module Evoruby if ( extract_taxonomy ) puts( "Extract taxonomy: true" ) end + if ( annotation != nil ) + puts( "Annotation : " + annotation ) + end puts() f = MsaFactory.new() @@ -141,7 +150,7 @@ module Evoruby lf = File.open( list_file, "a" ) for i in 0 ... msa.get_number_of_seqs seq = msa.get_sequence( i ) - seq.set_name( modify_name( seq.get_name(), i, lf, extract_taxonomy ) ) + seq.set_name( modify_name( seq.get_name(), i, lf, extract_taxonomy, annotation ) ) end io = MsaIO.new() w = nil @@ -150,7 +159,7 @@ module Evoruby else w = PhylipSequentialWriter.new() end - w.set_max_name_length( 10 ) + w.set_max_name_length( 9 ) w.clean( true ) begin io.write_to_file( msa, output, w ) @@ -160,16 +169,15 @@ module Evoruby lf.close() Util.print_message( PRG_NAME, "wrote: " + list_file ) Util.print_message( PRG_NAME, "wrote: " + output ) + Util.print_message( PRG_NAME, "next steps in standard analysis pipeline: hmmscan followed by hsp.rb") Util.print_message( PRG_NAME, "OK" ) end private - def modify_name( desc, counter, file, extract_taxonomy ) + def modify_name( desc, counter, file, extract_taxonomy, annotation ) new_desc = nil desc.gsub!( /\s+/, ' ' ) - #if desc =~ /^>?\s*\S{1,10}_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP)/ - # new_desc = counter.to_s( 16 ) + "_" + $1 if extract_taxonomy if desc =~/\s\[(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP)\]/ new_desc = counter.to_s( 16 ) + "_" + $1 @@ -179,10 +187,35 @@ module Evoruby else new_desc = counter.to_s( 16 ) end - file.print( new_desc + "\t" + desc + "\n" ) + if (annotation != nil) + new_desc = new_desc + annotation + file.print( new_desc + "\t" + desc + " " + annotation + "\n" ) + else + file.print( new_desc + "\t" + desc + "\n" ) + end + if ( new_desc.length > 9) + Util.fatal_error( PRG_NAME, "shortened identifier [" + + new_desc + "] is too long (" + new_desc.length.to_s + " characters)" ) + end new_desc end + def print_help() + puts( "Usage:" ) + puts() + puts( " " + PRG_NAME + ".rb [options] [output sequences] [output id list]" ) + puts() + puts( " options: -" + EXTRACT_TAXONOMY_OPTION + " : to extract taxonomy information from bracketed expressions" ) + puts( " -" + ANNOTATION_OPTION + "=: to add an annotation to all entries" ) + puts() + puts( " [next steps in standard analysis pipeline: hmmscan followed by hsp.rb]") + puts() + puts( "Example:" ) + puts() + puts( " " + PRG_NAME + ".rb P53.fasta" ) + puts() + end + end # class TaxonomyProcessor end # module Evoruby