X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fruby%2Fevoruby%2Flib%2Fevo%2Ftool%2Ftaxonomy_processor.rb;h=1d8a25bde4b888983bb16adfae67dcba21ae141d;hb=971d9105afd46581e56d4c0221500f47b3979496;hp=862a91a006089c76aae13ec09c1693b574744bd1;hpb=610ca99f1de9956bbe4c964acc48ea33b5049e4b;p=jalview.git
diff --git a/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb b/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb
index 862a91a..1d8a25b 100644
--- a/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb
+++ b/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb
@@ -1,12 +1,10 @@
#
# = lib/evo/apps/taxonomy_processor - TaxonomyProcessor class
#
-# Copyright:: Copyright (C) 20017 Christian M. Zmasek
-# License:: GNU Lesser General Public License (LGPL)
-#
-
-
+# Copyright:: Copyright (C) 2017 Christian M. Zmasek
+# License:: GNU Lesser General Public License (LGPL)
+require 'lib/evo/util/constants'
require 'lib/evo/util/util'
require 'lib/evo/msa/msa_factory'
require 'lib/evo/msa/msa'
@@ -18,35 +16,29 @@ require 'lib/evo/io/writer/phylip_sequential_writer'
require 'lib/evo/util/command_line_arguments'
module Evoruby
-
class TaxonomyProcessor
PRG_NAME = "tap"
- PRG_DATE = "170206"
- PRG_DESC = "replacement of species names in multiple sequence files"
- PRG_VERSION = "2.002"
- COPYRIGHT = "2017 Christian M Zmasek"
- CONTACT = "phylosoft@gmail.com"
- WWW = ""
+ PRG_DATE = "170214"
+ PRG_DESC = "Replacement of labels in multiple sequence files"
+ PRG_VERSION = "2.004"
+ WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
EXTRACT_TAXONOMY_OPTION = "t"
-
+ ANNOTATION_OPTION = "a"
+ HELP_OPTION_1 = "help"
+ HELP_OPTION_2 = "h"
def run()
Util.print_program_information( PRG_NAME,
- PRG_VERSION,
- PRG_DESC,
- PRG_DATE,
- COPYRIGHT,
- CONTACT,
- WWW,
- STDOUT )
-
- if ( ARGV == nil || ( ARGV.length != 1 && ARGV.length != 2 && ARGV.length != 3 && ARGV.length != 4 && ARGV.length != 5 && ARGV.length != 6 ) )
- puts( "Usage: #{PRG_NAME}.rb [options] [output sequences] [output id list]" )
- puts()
- puts( " options: -" + EXTRACT_TAXONOMY_OPTION + ": to extract taxonomy information from bracketed expression" )
- puts()
+ PRG_VERSION,
+ PRG_DESC,
+ PRG_DATE,
+ WWW,
+ STDOUT )
+
+ if ( ARGV == nil || ( ARGV.length < 1 ) )
+ print_help()
exit( -1 )
end
@@ -56,9 +48,15 @@ module Evoruby
Util.fatal_error( PRG_NAME, "error: " + e.to_s )
end
- input = nil
- output = nil
- list_file = nil
+ if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+ cla.is_option_set?( HELP_OPTION_2 ) )
+ print_help
+ exit( 0 )
+ end
+
+ input = nil
+ output = nil
+ list_file = nil
if cla.get_number_of_files == 3
input = cla.get_file_name( 0 )
@@ -74,13 +72,16 @@ module Evoruby
else
i = input
end
- output = i + "_ni.fasta"
- list_file = i + ".nim"
+ output = i + Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX
+ list_file = i + Constants::ID_MAP_FILE_SUFFIX
+ else
+ print_help()
+ exit(-1)
end
-
allowed_opts = Array.new
allowed_opts.push( EXTRACT_TAXONOMY_OPTION )
+ allowed_opts.push( ANNOTATION_OPTION )
disallowed = cla.validate_allowed_options_as_str( allowed_opts )
if ( disallowed.length > 0 )
@@ -92,13 +93,18 @@ module Evoruby
extract_taxonomy = true
end
- if ( File.exists?( output ) )
+ annotation = nil
+ if ( cla.is_option_set?( ANNOTATION_OPTION ) )
+ annotation = cla.get_option_value( ANNOTATION_OPTION )
+ end
+
+ if ( File.exist?( output ) )
Util.fatal_error( PRG_NAME, "outfile [" + output + "] already exists" )
end
- if ( File.exists?( list_file ) )
+ if ( File.exist?( list_file ) )
Util.fatal_error( PRG_NAME, "list file [" + list_file + "] already exists" )
end
- if ( !File.exists?( input) )
+ if ( !File.exist?( input) )
Util.fatal_error( PRG_NAME, "infile [" + input + "] does not exist" )
end
@@ -116,6 +122,9 @@ module Evoruby
if ( extract_taxonomy )
puts( "Extract taxonomy: true" )
end
+ if ( annotation != nil )
+ puts( "Annotation : " + annotation )
+ end
puts()
f = MsaFactory.new()
@@ -141,7 +150,7 @@ module Evoruby
lf = File.open( list_file, "a" )
for i in 0 ... msa.get_number_of_seqs
seq = msa.get_sequence( i )
- seq.set_name( modify_name( seq.get_name(), i, lf, extract_taxonomy ) )
+ seq.set_name( modify_name( seq.get_name(), i, lf, extract_taxonomy, annotation ) )
end
io = MsaIO.new()
w = nil
@@ -150,7 +159,7 @@ module Evoruby
else
w = PhylipSequentialWriter.new()
end
- w.set_max_name_length( 10 )
+ w.set_max_name_length( 9 )
w.clean( true )
begin
io.write_to_file( msa, output, w )
@@ -160,16 +169,15 @@ module Evoruby
lf.close()
Util.print_message( PRG_NAME, "wrote: " + list_file )
Util.print_message( PRG_NAME, "wrote: " + output )
+ Util.print_message( PRG_NAME, "next steps in standard analysis pipeline: hmmscan followed by hsp.rb")
Util.print_message( PRG_NAME, "OK" )
end
private
- def modify_name( desc, counter, file, extract_taxonomy )
+ def modify_name( desc, counter, file, extract_taxonomy, annotation )
new_desc = nil
desc.gsub!( /\s+/, ' ' )
- #if desc =~ /^>?\s*\S{1,10}_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP)/
- # new_desc = counter.to_s( 16 ) + "_" + $1
if extract_taxonomy
if desc =~/\s\[(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP)\]/
new_desc = counter.to_s( 16 ) + "_" + $1
@@ -179,10 +187,35 @@ module Evoruby
else
new_desc = counter.to_s( 16 )
end
- file.print( new_desc + "\t" + desc + "\n" )
+ if (annotation != nil)
+ new_desc = new_desc + annotation
+ file.print( new_desc + "\t" + desc + " " + annotation + "\n" )
+ else
+ file.print( new_desc + "\t" + desc + "\n" )
+ end
+ if ( new_desc.length > 9)
+ Util.fatal_error( PRG_NAME, "shortened identifier [" +
+ new_desc + "] is too long (" + new_desc.length.to_s + " characters)" )
+ end
new_desc
end
+ def print_help()
+ puts( "Usage:" )
+ puts()
+ puts( " " + PRG_NAME + ".rb [options] [output sequences] [output id list]" )
+ puts()
+ puts( " options: -" + EXTRACT_TAXONOMY_OPTION + " : to extract taxonomy information from bracketed expressions" )
+ puts( " -" + ANNOTATION_OPTION + "=: to add an annotation to all entries" )
+ puts()
+ puts( " [next steps in standard analysis pipeline: hmmscan followed by hsp.rb]")
+ puts()
+ puts( "Example:" )
+ puts()
+ puts( " " + PRG_NAME + ".rb P53.fasta" )
+ puts()
+ end
+
end # class TaxonomyProcessor
end # module Evoruby