From 2044a46819969b8ac56d653aab5729fb782ec40c Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Fri, 12 Apr 2013 00:45:43 +0000 Subject: [PATCH] inprogress --- .../ruby/evoruby/lib/evo/io/writer/fasta_writer.rb | 15 ++-- .../ruby/evoruby/lib/evo/io/writer/msa_writer.rb | 21 +++-- .../ruby/evoruby/lib/evo/io/writer/nexus_writer.rb | 11 ++- .../lib/evo/io/writer/phylip_sequential_writer.rb | 91 +++++++++++--------- .../ruby/evoruby/lib/evo/tool/msa_processor.rb | 19 +++- .../ruby/evoruby/lib/evo/tool/phylogeny_factory.rb | 69 ++++++++++++--- .../evoruby/lib/evo/tool/taxonomy_processor.rb | 10 +-- forester/ruby/evoruby/lib/evo/util/util.rb | 6 +- 8 files changed, 158 insertions(+), 84 deletions(-) diff --git a/forester/ruby/evoruby/lib/evo/io/writer/fasta_writer.rb b/forester/ruby/evoruby/lib/evo/io/writer/fasta_writer.rb index 26f7461..0da0de3 100644 --- a/forester/ruby/evoruby/lib/evo/io/writer/fasta_writer.rb +++ b/forester/ruby/evoruby/lib/evo/io/writer/fasta_writer.rb @@ -18,10 +18,11 @@ module Evoruby MAX_NAME_LENGTH_DEFAULT = 0 def initialize() - @line_width = LINE_WIDTH_DEFAULT - @max_name_length = MAX_NAME_LENGTH_DEFAULT - @remove_gap_chars = false - @clean = false + @line_width = LINE_WIDTH_DEFAULT + @max_name_length = MAX_NAME_LENGTH_DEFAULT + @remove_gap_chars = false + @clean = false + @ex_if_name_too_long = false end @@ -47,6 +48,10 @@ module Evoruby @clean = clean end + def set_exception_if_name_too_long( exception_if_name_too_long ) + @ex_if_name_too_long = exception_if_name_too_long + end + def write( msa, path ) Util.check_file_for_writability( path ) f = File.open( path, "a" ) @@ -55,7 +60,7 @@ module Evoruby name = seq_obj.get_name() f.print( ">" ) if ( @max_name_length != MAX_NAME_LENGTH_DEFAULT ) - name = Util.normalize_seq_name( name, @max_name_length ) + name = Util.normalize_seq_name( name, @max_name_length, @ex_if_name_too_long ) end f.print( name ) counter = 0 diff --git a/forester/ruby/evoruby/lib/evo/io/writer/msa_writer.rb b/forester/ruby/evoruby/lib/evo/io/writer/msa_writer.rb index 36eec15..9ad5466 100644 --- a/forester/ruby/evoruby/lib/evo/io/writer/msa_writer.rb +++ b/forester/ruby/evoruby/lib/evo/io/writer/msa_writer.rb @@ -13,18 +13,21 @@ require 'lib/evo/util/util' module Evoruby - class MsaWriter + class MsaWriter - def initialize() - raise TypeError, "Cannot instanciate abstract class MsaWriter" - end + def initialize() + raise TypeError, "Cannot instanciate abstract class MsaWriter" + end - def set_max_name_length( length ) - end + def set_max_name_length( length ) + end - def write( msa, path ) - end + def set_exception_if_name_too_long( exception_if_name_too_long ) + end - end # class MsaWriter + def write( msa, path ) + end + + end # class MsaWriter end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/io/writer/nexus_writer.rb b/forester/ruby/evoruby/lib/evo/io/writer/nexus_writer.rb index 17a0c77..9326ae1 100644 --- a/forester/ruby/evoruby/lib/evo/io/writer/nexus_writer.rb +++ b/forester/ruby/evoruby/lib/evo/io/writer/nexus_writer.rb @@ -17,8 +17,9 @@ module Evoruby MAX_NAME_LENGTH_DEFAULT = 10 def initialize() - @max_name_length = MAX_NAME_LENGTH_DEFAULT - @clean = false + @max_name_length = MAX_NAME_LENGTH_DEFAULT + @clean = false + @ex_if_name_too_long = false end def set_max_name_length( length = MAX_NAME_LENGTH_DEFAULT ) @@ -32,6 +33,10 @@ module Evoruby @clean = clean end + def set_exception_if_name_too_long( exception_if_name_too_long ) + @ex_if_name_too_long = exception_if_name_too_long + end + def write( msa, path ) if ( !msa.is_aligned() ) error_msg = "attempt to write unaligned msa in nexus format" @@ -60,7 +65,7 @@ module Evoruby seq = seq_obj.get_sequence_as_string() name = name.gsub( /\s+$/, '') name = name.gsub( /\s+/, '_') - name = Util.normalize_seq_name( name, @max_name_length ) + name = Util.normalize_seq_name( name, @max_name_length, @ex_if_name_too_long ) f.print( " " ) f.print( name ) f.print( " " ) diff --git a/forester/ruby/evoruby/lib/evo/io/writer/phylip_sequential_writer.rb b/forester/ruby/evoruby/lib/evo/io/writer/phylip_sequential_writer.rb index 15f4c5c..42ad186 100644 --- a/forester/ruby/evoruby/lib/evo/io/writer/phylip_sequential_writer.rb +++ b/forester/ruby/evoruby/lib/evo/io/writer/phylip_sequential_writer.rb @@ -12,59 +12,64 @@ require 'lib/evo/io/writer/msa_writer' module Evoruby - class PhylipSequentialWriter < MsaWriter + class PhylipSequentialWriter < MsaWriter - MAX_NAME_LENGTH_DEFAULT = 10 + MAX_NAME_LENGTH_DEFAULT = 10 - def initialize() - @max_name_length = MAX_NAME_LENGTH_DEFAULT - @clean = false - end + def initialize() + @max_name_length = MAX_NAME_LENGTH_DEFAULT + @clean = false + @ex_if_name_too_long = false + end - def set_max_name_length( length = MAX_NAME_LENGTH_DEFAULT ) - if length < 1 - length = MAX_NAME_LENGTH_DEFAULT - end - @max_name_length = length - end + def set_max_name_length( length = MAX_NAME_LENGTH_DEFAULT ) + if length < 1 + length = MAX_NAME_LENGTH_DEFAULT + end + @max_name_length = length + end - def clean( clean = true ) - @clean = clean - end + def clean( clean = true ) + @clean = clean + end + + def set_exception_if_name_too_long( exception_if_name_too_long ) + @ex_if_name_too_long = exception_if_name_too_long + end - def write( msa, path ) - if ( !msa.is_aligned() ) - error_msg = "attempt to write unaligned msa in phylip sequential format" - raise StandardError, error_msg, caller - end + def write( msa, path ) + if ( !msa.is_aligned() ) + error_msg = "attempt to write unaligned msa in phylip sequential format" + raise StandardError, error_msg, caller + end - Util.check_file_for_writability( path ) + Util.check_file_for_writability( path ) - f = File.open( path, "a" ) + f = File.open( path, "a" ) - f.print( msa.get_number_of_seqs().to_s() ) - f.print( " " ) - f.print( msa.get_length().to_s() ) - f.print( Evoruby::Constants::LINE_DELIMITER ) - for i in 0 ... msa.get_number_of_seqs() - seq_obj = msa.get_sequence( i ) - name = seq_obj.get_name() - seq = seq_obj.get_sequence_as_string() - name = name.gsub( /\s+$/, '') - name = name.gsub( /\s+/, '_') - name = Util.normalize_seq_name( name, @max_name_length ) - f.print( name ) - f.print( " " ) - if ( @clean ) - seq = Util.clean_seq_str( seq ) - end - f.print( seq ) - f.print( Evoruby::Constants::LINE_DELIMITER ) - end - f.close() + f.print( msa.get_number_of_seqs().to_s() ) + f.print( " " ) + f.print( msa.get_length().to_s() ) + f.print( Evoruby::Constants::LINE_DELIMITER ) + for i in 0 ... msa.get_number_of_seqs() + seq_obj = msa.get_sequence( i ) + name = seq_obj.get_name() + seq = seq_obj.get_sequence_as_string() + name = name.gsub( /\s+$/, '') + name = name.gsub( /\s+/, '_') + name = Util.normalize_seq_name( name, @max_name_length, @ex_if_name_too_long ) + f.print( name ) + f.print( " " ) + if ( @clean ) + seq = Util.clean_seq_str( seq ) end + f.print( seq ) + f.print( Evoruby::Constants::LINE_DELIMITER ) + end + f.close() + end - end # class PhylipSequentialWriter + end # class PhylipSequentialWriter end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/tool/msa_processor.rb b/forester/ruby/evoruby/lib/evo/tool/msa_processor.rb index 6299417..738db56 100644 --- a/forester/ruby/evoruby/lib/evo/tool/msa_processor.rb +++ b/forester/ruby/evoruby/lib/evo/tool/msa_processor.rb @@ -27,9 +27,9 @@ module Evoruby class MsaProcessor PRG_NAME = "msa_pro" - PRG_DATE = "2012.05.11" + PRG_DATE = "130411" PRG_DESC = "processing of multiple sequence alignments" - PRG_VERSION = "1.06" + PRG_VERSION = "1.07" COPYRIGHT = "2008-2010 Christian M Zmasek" CONTACT = "phylosoft@gmail.com" WWW = "www.phylosoft.org" @@ -40,6 +40,7 @@ module Evoruby INPUT_TYPE_OPTION = "i" OUTPUT_TYPE_OPTION = "o" MAXIMAL_NAME_LENGTH_OPTION = "n" + DIE_IF_NAME_TOO_LONG = "d" WIDTH_OPTION = "w" CLEAN_UP_SEQ_OPTION = "c" REM_RED_OPTION = "rem_red" @@ -78,6 +79,7 @@ module Evoruby @rgoc = false @rg = false # fasta only @rem_red = false + @die_if_name_too_long = false @rgr = -1 @rsgr = -1 @rsl = -1 @@ -148,6 +150,7 @@ module Evoruby allowed_opts.push( REM_RED_OPTION ) allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION ) allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION ) + allowed_opts.push( DIE_IF_NAME_TOO_LONG ) disallowed = cla.validate_allowed_options_as_str( allowed_opts ) if ( disallowed.length > 0 ) @@ -162,13 +165,13 @@ module Evoruby begin Util.check_file_for_readability( input ) - rescue ArgumentError => e + rescue IOError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s ) end begin Util.check_file_for_writability( output ) - rescue ArgumentError => e + rescue IOError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s ) end @@ -511,6 +514,7 @@ module Evoruby w = PhylipSequentialWriter.new() w.clean( @clean ) w.set_max_name_length( @name_length ) + w.set_exception_if_name_too_long( @die_if_name_too_long ) elsif( @fasta_output ) w = FastaWriter.new() w.set_line_width( @width ) @@ -522,11 +526,13 @@ module Evoruby w.clean( @clean ) if ( @name_length_set ) w.set_max_name_length( @name_length ) + w.set_exception_if_name_too_long( @die_if_name_too_long ) end elsif( @nexus_output ) w = NexusWriter.new() w.clean( @clean ) w.set_max_name_length( @name_length ) + w.set_exception_if_name_too_long( @die_if_name_too_long ) end @@ -661,6 +667,7 @@ module Evoruby @last = -1 end end + def analyze_command_line( cla ) if ( cla.is_option_set?( INPUT_TYPE_OPTION ) ) begin @@ -806,6 +813,9 @@ module Evoruby Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end end + if ( cla.is_option_set?( DIE_IF_NAME_TOO_LONG ) ) + @die_if_name_too_long = true + end end @@ -819,6 +829,7 @@ module Evoruby puts( " options: -" + INPUT_TYPE_OPTION + "=: f for fasta, p for phylip selex type" ) puts( " -" + OUTPUT_TYPE_OPTION + "=: f for fasta, n for nexus, p for phylip sequential (default)" ) puts( " -" + MAXIMAL_NAME_LENGTH_OPTION + "=: n=maximal name length (default for phylip 10, for fasta: unlimited )" ) + puts( " -" + DIE_IF_NAME_TOO_LONG + ": die if sequence name too long" ) puts( " -" + WIDTH_OPTION + "=: n=width (fasta output only, default is 60)" ) puts( " -" + CLEAN_UP_SEQ_OPTION + ": clean up sequences" ) puts( " -" + REMOVE_GAP_COLUMNS_OPTION + ": remove gap columns" ) diff --git a/forester/ruby/evoruby/lib/evo/tool/phylogeny_factory.rb b/forester/ruby/evoruby/lib/evo/tool/phylogeny_factory.rb index cb72a02..6122f2b 100644 --- a/forester/ruby/evoruby/lib/evo/tool/phylogeny_factory.rb +++ b/forester/ruby/evoruby/lib/evo/tool/phylogeny_factory.rb @@ -274,10 +274,10 @@ end # module Evoruby # are to be used: # the substring between the first two double underscores is a # unique identifier and needs to match the identifiers -# in '% =' statements +# in '% =' statements # Example: # alignment name : 'x__bcl2__e1' -# parameter statments: '% RSL bcl2=60' +# parameter statments: '% RSL bcl2=60' $ PROBCONS=/home/czmasek/SOFTWARE/PROBCONS/probcons_v1_12/probcons $ DIALIGN_TX=/home/czmasek/SOFTWARE/DIALIGNTX/DIALIGN-TX_1.0.2/source/dialign-tx $ DIALIGN_CONF=/home/czmasek/SOFTWARE/DIALIGNTX/DIALIGN-TX_1.0.2/conf @@ -291,7 +291,7 @@ $ PHYLO_PL=/home/czmasek/SOFTWARE/FORESTER/DEV/forester/forester/archive/perl/ph % RSL Hormone_recep=60 -% +% % RSL Y_phosphatase=100 % RSL Y_phosphatase2=75 % RSL Y_phosphatase3=50 @@ -303,39 +303,80 @@ $ PHYLO_PL=/home/czmasek/SOFTWARE/FORESTER/DEV/forester/forester/archive/perl/ph > KALIGN $ > $_kalign -> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_kalign $_kalign_05_%[RSL]%.aln +> MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_kalign $_kalign_05_%[RSL]%.aln > PHYLO_PL %[PHYLO_OPT]% $_kalign_05_%[RSL]%.aln $_kalign_05_%[RSL]% %[TMP_DIR]% -- +- + +> KALIGN $ > $_kalign_ +> MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_kalign_ $_kalign_09_%[RSL]%.aln +> PHYLO_PL %[PHYLO_OPT]% $_kalign_09_%[RSL]%.aln $_kalign_09_%[RSL]% %[TMP_DIR]% +- + > HMMALIGN --amino --trim --outformat Pfam -o $_hmmalign %[HMM]% $ > /dev/null -> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_hmmalign $_hmmalign_05_%[RSL]%.aln +> MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_hmmalign $_hmmalign_05_%[RSL]%.aln > PHYLO_PL %[PHYLO_OPT]% $_hmmalign_05_%[RSL]%.aln $_hmmalign_05_%[RSL]% %[TMP_DIR]% -- +- + +> HMMALIGN --amino --trim --outformat Pfam -o $_hmmalign_ %[HMM]% $ > /dev/null +> MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_hmmalign_ $_hmmalign_09_%[RSL]%.aln +> PHYLO_PL %[PHYLO_OPT]% $_hmmalign_09_%[RSL]%.aln $_hmmalign_09_%[RSL]% %[TMP_DIR]% +- + > MAFFT --maxiterate 1000 --localpair $ > $_mafft -> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_mafft $_mafft_05_%[RSL]%.aln +> MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_mafft $_mafft_05_%[RSL]%.aln > PHYLO_PL %[PHYLO_OPT]% $_mafft_05_%[RSL]%.aln $_mafft_05_%[RSL]% %[TMP_DIR]% - +> MAFFT --maxiterate 1000 --localpair $ > $_mafft_ +> MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_mafft_ $_mafft_09_%[RSL]%.aln +> PHYLO_PL %[PHYLO_OPT]% $_mafft_09_%[RSL]%.aln $_mafft_09_%[RSL]% %[TMP_DIR]% +- + + > MUSCLE -maxiters 1000 -maxtrees 100 -in $ -out $_muscle -> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_muscle $_muscle_05_%[RSL]%.aln -> PHYLO_PL %[PHYLO_OPT]% $_muscle_05_%[RSL]%.aln $_muscle_05_%[RSL]% %[TMP_DIR]% +> MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_muscle $_muscle_05_%[RSL]%.aln +> PHYLO_PL %[PHYLO_OPT]% $_muscle_05_%[RSL]%.aln $_muscle_05_%[RSL]% %[TMP_DIR]% +- + +> MUSCLE -maxiters 1000 -maxtrees 100 -in $ -out $_muscle_ +> MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_muscle_ $_muscle_09_%[RSL]%.aln +> PHYLO_PL %[PHYLO_OPT]% $_muscle_09_%[RSL]%.aln $_muscle_09_%[RSL]% %[TMP_DIR]% - + > CLUSTALO --full --full-iter --iter=5 -i $ -o $_clustalo -> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_clustalo $_clustalo_05_%[RSL]%.aln +> MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_clustalo $_clustalo_05_%[RSL]%.aln > PHYLO_PL %[PHYLO_OPT]% $_clustalo_05_%[RSL]%.aln $_clustalo_05_%[RSL]% %[TMP_DIR]% - +> CLUSTALO --full --full-iter --iter=5 -i $ -o $_clustalo_ +> MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_clustalo_ $_clustalo_09_%[RSL]%.aln +> PHYLO_PL %[PHYLO_OPT]% $_clustalo_09_%[RSL]%.aln $_clustalo_09_%[RSL]% %[TMP_DIR]% +- + + > PROBCONS $ > $_probcons -> MSA_PRO -o=p -n=10 -rem_red -rr=0.5 -c -rsl=%[RSL]% $_probcons $_probcons_05_%[RSL]%.aln +> MSA_PRO -o=p -n=10 -d -rem_red -rr=0.5 -c -rsl=%[RSL]% $_probcons $_probcons_05_%[RSL]%.aln > PHYLO_PL %[PHYLO_OPT]% $_probcons_05_%[RSL]%.aln $_probcons_05_%[RSL]% %[TMP_DIR]% -- +- + +> PROBCONS $ > $_probcons_ +> MSA_PRO -o=p -n=10 -d -rem_red -rr=0.9 -c -rsl=%[RSL]% $_probcons_ $_probcons_09_%[RSL]%.aln +> PHYLO_PL %[PHYLO_OPT]% $_probcons_09_%[RSL]%.aln $_probcons_09_%[RSL]% %[TMP_DIR]% +- + > DIALIGN_TX DIALIGN_CONF $ $_dialigntx -> MSA_PRO -o=p -n=10 -rem_red -rr=0.5 -c -rsl=%[RSL]% $_dialigntx $_dialigntx_05_%[RSL]%.aln +> MSA_PRO -o=p -n=10 -d -rem_red -rr=0.5 -c -rsl=%[RSL]% $_dialigntx $_dialigntx_05_%[RSL]%.aln > PHYLO_PL %[PHYLO_OPT]% $_dialigntx_05_%[RSL]%.aln $_dialigntx_05_%[RSL]% %[TMP_DIR]% - +> DIALIGN_TX DIALIGN_CONF $ $_dialigntx_ +> MSA_PRO -o=p -n=10 -d -rem_red -rr=0.9 -c -rsl=%[RSL]% $_dialigntx_ $_dialigntx_09_%[RSL]%.aln +> PHYLO_PL %[PHYLO_OPT]% $_dialigntx_09_%[RSL]%.aln $_dialigntx_09_%[RSL]% %[TMP_DIR]% +- + =end diff --git a/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb b/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb index 9a77601..bf194c7 100644 --- a/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb +++ b/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb @@ -22,9 +22,9 @@ module Evoruby class TaxonomyProcessor PRG_NAME = "tap" - PRG_DATE = "2013.03.22" + PRG_DATE = "130411" PRG_DESC = "replacement of species names in multiple sequence files" - PRG_VERSION = "2.001" + PRG_VERSION = "2.002" COPYRIGHT = "2013 Christian M Zmasek" CONTACT = "phylosoft@gmail.com" WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" @@ -168,9 +168,9 @@ module Evoruby def modify_name( desc, counter, file, extract_taxonomy ) new_desc = nil desc.gsub!( /\s+/, ' ' ) - if desc =~ /^>?\s*\S{1,10}_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP)/ - new_desc = counter.to_s( 16 ) + "_" + $1 - elsif extract_taxonomy + #if desc =~ /^>?\s*\S{1,10}_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP)/ + # new_desc = counter.to_s( 16 ) + "_" + $1 + if extract_taxonomy if desc =~/\s\[(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP)\]/ new_desc = counter.to_s( 16 ) + "_" + $1 else diff --git a/forester/ruby/evoruby/lib/evo/util/util.rb b/forester/ruby/evoruby/lib/evo/util/util.rb index 79f92e0..e4173d3 100644 --- a/forester/ruby/evoruby/lib/evo/util/util.rb +++ b/forester/ruby/evoruby/lib/evo/util/util.rb @@ -14,8 +14,12 @@ module Evoruby class Util - def Util.normalize_seq_name( name, length ) + def Util.normalize_seq_name( name, length, exception_if_too_long = false ) if name.length > length + if exception_if_too_long + error_msg = "sequence name \"#{name}\" is too long (>#{length})" + raise StandardError, error_msg + end name = name[ 0, length ] elsif name.length < length for i in 0 ... length - name.length -- 1.7.10.2