MAX_NAME_LENGTH_DEFAULT = 0
def initialize()
- @line_width = LINE_WIDTH_DEFAULT
- @max_name_length = MAX_NAME_LENGTH_DEFAULT
- @remove_gap_chars = false
- @clean = false
+ @line_width = LINE_WIDTH_DEFAULT
+ @max_name_length = MAX_NAME_LENGTH_DEFAULT
+ @remove_gap_chars = false
+ @clean = false
+ @ex_if_name_too_long = false
end
@clean = clean
end
+ def set_exception_if_name_too_long( exception_if_name_too_long )
+ @ex_if_name_too_long = exception_if_name_too_long
+ end
+
def write( msa, path )
Util.check_file_for_writability( path )
f = File.open( path, "a" )
name = seq_obj.get_name()
f.print( ">" )
if ( @max_name_length != MAX_NAME_LENGTH_DEFAULT )
- name = Util.normalize_seq_name( name, @max_name_length )
+ name = Util.normalize_seq_name( name, @max_name_length, @ex_if_name_too_long )
end
f.print( name )
counter = 0
module Evoruby
- class MsaWriter
+ # Abstract base class for msa writers. It only declares the methods a
+ # writer is expected to provide; every method body here is empty, so a
+ # subclass that does not override one of them inherits a no-op.
+ class MsaWriter
- def initialize()
- raise TypeError, "Cannot instanciate abstract class MsaWriter"
- end
+ # Always raises TypeError, so MsaWriter itself cannot be created with
+ # MsaWriter.new. Subclasses define their own initialize.
+ def initialize()
+ raise TypeError, "Cannot instantiate abstract class MsaWriter"
+ end
- def set_max_name_length( length )
- end
+ # Placeholder: sets the maximal sequence name length. No-op here.
+ def set_max_name_length( length )
+ end
- def write( msa, path )
- end
+ # Placeholder: sets whether a too long sequence name is an error.
+ # No-op here.
+ def set_exception_if_name_too_long( exception_if_name_too_long )
+ end
- end # class MsaWriter
+ # Placeholder: writes msa to the file at path. No-op here.
+ def write( msa, path )
+ end
+
+ end # class MsaWriter
end # module Evoruby
MAX_NAME_LENGTH_DEFAULT = 10
def initialize()
- @max_name_length = MAX_NAME_LENGTH_DEFAULT
- @clean = false
+ @max_name_length = MAX_NAME_LENGTH_DEFAULT
+ @clean = false
+ @ex_if_name_too_long = false
end
def set_max_name_length( length = MAX_NAME_LENGTH_DEFAULT )
@clean = clean
end
+ def set_exception_if_name_too_long( exception_if_name_too_long )
+ @ex_if_name_too_long = exception_if_name_too_long
+ end
+
def write( msa, path )
if ( !msa.is_aligned() )
error_msg = "attempt to write unaligned msa in nexus format"
seq = seq_obj.get_sequence_as_string()
name = name.gsub( /\s+$/, '')
name = name.gsub( /\s+/, '_')
- name = Util.normalize_seq_name( name, @max_name_length )
+ name = Util.normalize_seq_name( name, @max_name_length, @ex_if_name_too_long )
f.print( " " )
f.print( name )
f.print( " " )
module Evoruby
- class PhylipSequentialWriter < MsaWriter
+ # MsaWriter that appends an aligned msa to a file in phylip sequential
+ # format: one "<number of seqs> <length>" header line, followed by one
+ # "<name> <sequence>" line per sequence.
+ class PhylipSequentialWriter < MsaWriter
- MAX_NAME_LENGTH_DEFAULT = 10
+ # Name length used when none is set, or when a length < 1 is given.
+ MAX_NAME_LENGTH_DEFAULT = 10
- def initialize()
- @max_name_length = MAX_NAME_LENGTH_DEFAULT
- @clean = false
- end
+ # Starts with the default name length, sequence cleaning off, and
+ # no exception for names that are too long.
+ def initialize()
+ @max_name_length = MAX_NAME_LENGTH_DEFAULT
+ @clean = false
+ @ex_if_name_too_long = false
+ end
- def set_max_name_length( length = MAX_NAME_LENGTH_DEFAULT )
- if length < 1
- length = MAX_NAME_LENGTH_DEFAULT
- end
- @max_name_length = length
- end
+ # Sets the name length passed to Util.normalize_seq_name.
+ # A length < 1 falls back to MAX_NAME_LENGTH_DEFAULT.
+ def set_max_name_length( length = MAX_NAME_LENGTH_DEFAULT )
+ if length < 1
+ length = MAX_NAME_LENGTH_DEFAULT
+ end
+ @max_name_length = length
+ end
- def clean( clean = true )
- @clean = clean
- end
+ # When set, each sequence string goes through Util.clean_seq_str
+ # before it is written.
+ def clean( clean = true )
+ @clean = clean
+ end
+
+ # Sets the flag handed to Util.normalize_seq_name as its third
+ # argument (exception_if_too_long).
+ def set_exception_if_name_too_long( exception_if_name_too_long )
+ @ex_if_name_too_long = exception_if_name_too_long
+ end
- def write( msa, path )
- if ( !msa.is_aligned() )
- error_msg = "attempt to write unaligned msa in phylip sequential format"
- raise StandardError, error_msg, caller
- end
+ # Appends msa to the file at path.
+ # Trailing whitespace is removed from each name and every remaining
+ # whitespace run is replaced by "_" before the name is normalized.
+ # Raises StandardError if msa is not aligned. Errors raised by
+ # Util.check_file_for_writability or Util.normalize_seq_name are
+ # passed on to the caller.
+ def write( msa, path )
+ if ( !msa.is_aligned() )
+ error_msg = "attempt to write unaligned msa in phylip sequential format"
+ raise StandardError, error_msg, caller
+ end
- Util.check_file_for_writability( path )
+ Util.check_file_for_writability( path )
- f = File.open( path, "a" )
+ # Block form: the file is closed even when normalize_seq_name raises
+ # part-way through; lines written before the error stay in the file.
+ File.open( path, "a" ) do | f |
- f.print( msa.get_number_of_seqs().to_s() )
- f.print( " " )
- f.print( msa.get_length().to_s() )
- f.print( Evoruby::Constants::LINE_DELIMITER )
- for i in 0 ... msa.get_number_of_seqs()
- seq_obj = msa.get_sequence( i )
- name = seq_obj.get_name()
- seq = seq_obj.get_sequence_as_string()
- name = name.gsub( /\s+$/, '')
- name = name.gsub( /\s+/, '_')
- name = Util.normalize_seq_name( name, @max_name_length )
- f.print( name )
- f.print( " " )
- if ( @clean )
- seq = Util.clean_seq_str( seq )
- end
- f.print( seq )
- f.print( Evoruby::Constants::LINE_DELIMITER )
- end
- f.close()
+ f.print( msa.get_number_of_seqs().to_s() )
+ f.print( " " )
+ f.print( msa.get_length().to_s() )
+ f.print( Evoruby::Constants::LINE_DELIMITER )
+ for i in 0 ... msa.get_number_of_seqs()
+ seq_obj = msa.get_sequence( i )
+ name = seq_obj.get_name()
+ seq = seq_obj.get_sequence_as_string()
+ name = name.gsub( /\s+$/, '')
+ name = name.gsub( /\s+/, '_')
+ name = Util.normalize_seq_name( name, @max_name_length, @ex_if_name_too_long )
+ f.print( name )
+ f.print( " " )
+ if ( @clean )
+ seq = Util.clean_seq_str( seq )
end
+ f.print( seq )
+ f.print( Evoruby::Constants::LINE_DELIMITER )
+ end
+ end
+ end
- end # class PhylipSequentialWriter
+ end # class PhylipSequentialWriter
end # module Evoruby
class MsaProcessor
PRG_NAME = "msa_pro"
- PRG_DATE = "2012.05.11"
+ PRG_DATE = "130411"
PRG_DESC = "processing of multiple sequence alignments"
- PRG_VERSION = "1.06"
+ PRG_VERSION = "1.07"
COPYRIGHT = "2008-2010 Christian M Zmasek"
CONTACT = "phylosoft@gmail.com"
WWW = "www.phylosoft.org"
INPUT_TYPE_OPTION = "i"
OUTPUT_TYPE_OPTION = "o"
MAXIMAL_NAME_LENGTH_OPTION = "n"
+ DIE_IF_NAME_TOO_LONG = "d"
WIDTH_OPTION = "w"
CLEAN_UP_SEQ_OPTION = "c"
REM_RED_OPTION = "rem_red"
@rgoc = false
@rg = false # fasta only
@rem_red = false
+ @die_if_name_too_long = false
@rgr = -1
@rsgr = -1
@rsl = -1
allowed_opts.push( REM_RED_OPTION )
allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION )
allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION )
+ allowed_opts.push( DIE_IF_NAME_TOO_LONG )
disallowed = cla.validate_allowed_options_as_str( allowed_opts )
if ( disallowed.length > 0 )
begin
Util.check_file_for_readability( input )
- rescue ArgumentError => e
+ rescue IOError => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s )
end
begin
Util.check_file_for_writability( output )
- rescue ArgumentError => e
+ rescue IOError => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s )
end
w = PhylipSequentialWriter.new()
w.clean( @clean )
w.set_max_name_length( @name_length )
+ w.set_exception_if_name_too_long( @die_if_name_too_long )
elsif( @fasta_output )
w = FastaWriter.new()
w.set_line_width( @width )
w.clean( @clean )
if ( @name_length_set )
w.set_max_name_length( @name_length )
+ w.set_exception_if_name_too_long( @die_if_name_too_long )
end
elsif( @nexus_output )
w = NexusWriter.new()
w.clean( @clean )
w.set_max_name_length( @name_length )
+ w.set_exception_if_name_too_long( @die_if_name_too_long )
end
@last = -1
end
end
+
def analyze_command_line( cla )
if ( cla.is_option_set?( INPUT_TYPE_OPTION ) )
begin
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
end
+ if ( cla.is_option_set?( DIE_IF_NAME_TOO_LONG ) )
+ @die_if_name_too_long = true
+ end
end
puts( " options: -" + INPUT_TYPE_OPTION + "=<input type>: f for fasta, p for phylip selex type" )
puts( " -" + OUTPUT_TYPE_OPTION + "=<output type>: f for fasta, n for nexus, p for phylip sequential (default)" )
puts( " -" + MAXIMAL_NAME_LENGTH_OPTION + "=<n>: n=maximal name length (default for phylip 10, for fasta: unlimited )" )
+ puts( " -" + DIE_IF_NAME_TOO_LONG + ": die if sequence name too long" )
puts( " -" + WIDTH_OPTION + "=<n>: n=width (fasta output only, default is 60)" )
puts( " -" + CLEAN_UP_SEQ_OPTION + ": clean up sequences" )
puts( " -" + REMOVE_GAP_COLUMNS_OPTION + ": remove gap columns" )
# are to be used:
# the substring between the first two double underscores is a
# unique identifier and needs to match the identifiers
-# in '% <parameter-type> <unique-id>=<value>' statements
+# in '% <parameter-type> <unique-id>=<value>' statements
# Example:
# alignment name : 'x__bcl2__e1'
-# parameter statments: '% RSL bcl2=60'
+# parameter statements: '% RSL bcl2=60'
$ PROBCONS=/home/czmasek/SOFTWARE/PROBCONS/probcons_v1_12/probcons
$ DIALIGN_TX=/home/czmasek/SOFTWARE/DIALIGNTX/DIALIGN-TX_1.0.2/source/dialign-tx
$ DIALIGN_CONF=/home/czmasek/SOFTWARE/DIALIGNTX/DIALIGN-TX_1.0.2/conf
% RSL Hormone_recep=60
-%
+%
% RSL Y_phosphatase=100
% RSL Y_phosphatase2=75
% RSL Y_phosphatase3=50
> KALIGN $ > $_kalign
-> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_kalign $_kalign_05_%[RSL]%.aln
+> MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_kalign $_kalign_05_%[RSL]%.aln
> PHYLO_PL %[PHYLO_OPT]% $_kalign_05_%[RSL]%.aln $_kalign_05_%[RSL]% %[TMP_DIR]%
--
+-
+
+> KALIGN $ > $_kalign_
+> MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_kalign_ $_kalign_09_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_kalign_09_%[RSL]%.aln $_kalign_09_%[RSL]% %[TMP_DIR]%
+-
+
> HMMALIGN --amino --trim --outformat Pfam -o $_hmmalign %[HMM]% $ > /dev/null
-> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_hmmalign $_hmmalign_05_%[RSL]%.aln
+> MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_hmmalign $_hmmalign_05_%[RSL]%.aln
> PHYLO_PL %[PHYLO_OPT]% $_hmmalign_05_%[RSL]%.aln $_hmmalign_05_%[RSL]% %[TMP_DIR]%
--
+-
+
+> HMMALIGN --amino --trim --outformat Pfam -o $_hmmalign_ %[HMM]% $ > /dev/null
+> MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_hmmalign_ $_hmmalign_09_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_hmmalign_09_%[RSL]%.aln $_hmmalign_09_%[RSL]% %[TMP_DIR]%
+-
+
> MAFFT --maxiterate 1000 --localpair $ > $_mafft
-> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_mafft $_mafft_05_%[RSL]%.aln
+> MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_mafft $_mafft_05_%[RSL]%.aln
> PHYLO_PL %[PHYLO_OPT]% $_mafft_05_%[RSL]%.aln $_mafft_05_%[RSL]% %[TMP_DIR]%
-
+> MAFFT --maxiterate 1000 --localpair $ > $_mafft_
+> MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_mafft_ $_mafft_09_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_mafft_09_%[RSL]%.aln $_mafft_09_%[RSL]% %[TMP_DIR]%
+-
+
+
> MUSCLE -maxiters 1000 -maxtrees 100 -in $ -out $_muscle
-> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_muscle $_muscle_05_%[RSL]%.aln
-> PHYLO_PL %[PHYLO_OPT]% $_muscle_05_%[RSL]%.aln $_muscle_05_%[RSL]% %[TMP_DIR]%
+> MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_muscle $_muscle_05_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_muscle_05_%[RSL]%.aln $_muscle_05_%[RSL]% %[TMP_DIR]%
+-
+
+> MUSCLE -maxiters 1000 -maxtrees 100 -in $ -out $_muscle_
+> MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_muscle_ $_muscle_09_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_muscle_09_%[RSL]%.aln $_muscle_09_%[RSL]% %[TMP_DIR]%
-
+
> CLUSTALO --full --full-iter --iter=5 -i $ -o $_clustalo
-> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_clustalo $_clustalo_05_%[RSL]%.aln
+> MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_clustalo $_clustalo_05_%[RSL]%.aln
> PHYLO_PL %[PHYLO_OPT]% $_clustalo_05_%[RSL]%.aln $_clustalo_05_%[RSL]% %[TMP_DIR]%
-
+> CLUSTALO --full --full-iter --iter=5 -i $ -o $_clustalo_
+> MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_clustalo_ $_clustalo_09_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_clustalo_09_%[RSL]%.aln $_clustalo_09_%[RSL]% %[TMP_DIR]%
+-
+
+
> PROBCONS $ > $_probcons
-> MSA_PRO -o=p -n=10 -rem_red -rr=0.5 -c -rsl=%[RSL]% $_probcons $_probcons_05_%[RSL]%.aln
+> MSA_PRO -o=p -n=10 -d -rem_red -rr=0.5 -c -rsl=%[RSL]% $_probcons $_probcons_05_%[RSL]%.aln
> PHYLO_PL %[PHYLO_OPT]% $_probcons_05_%[RSL]%.aln $_probcons_05_%[RSL]% %[TMP_DIR]%
--
+-
+
+> PROBCONS $ > $_probcons_
+> MSA_PRO -o=p -n=10 -d -rem_red -rr=0.9 -c -rsl=%[RSL]% $_probcons_ $_probcons_09_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_probcons_09_%[RSL]%.aln $_probcons_09_%[RSL]% %[TMP_DIR]%
+-
+
> DIALIGN_TX DIALIGN_CONF $ $_dialigntx
-> MSA_PRO -o=p -n=10 -rem_red -rr=0.5 -c -rsl=%[RSL]% $_dialigntx $_dialigntx_05_%[RSL]%.aln
+> MSA_PRO -o=p -n=10 -d -rem_red -rr=0.5 -c -rsl=%[RSL]% $_dialigntx $_dialigntx_05_%[RSL]%.aln
> PHYLO_PL %[PHYLO_OPT]% $_dialigntx_05_%[RSL]%.aln $_dialigntx_05_%[RSL]% %[TMP_DIR]%
-
+> DIALIGN_TX DIALIGN_CONF $ $_dialigntx_
+> MSA_PRO -o=p -n=10 -d -rem_red -rr=0.9 -c -rsl=%[RSL]% $_dialigntx_ $_dialigntx_09_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_dialigntx_09_%[RSL]%.aln $_dialigntx_09_%[RSL]% %[TMP_DIR]%
+-
+
=end
class TaxonomyProcessor
PRG_NAME = "tap"
- PRG_DATE = "2013.03.22"
+ PRG_DATE = "130411"
PRG_DESC = "replacement of species names in multiple sequence files"
- PRG_VERSION = "2.001"
+ PRG_VERSION = "2.002"
COPYRIGHT = "2013 Christian M Zmasek"
CONTACT = "phylosoft@gmail.com"
WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
def modify_name( desc, counter, file, extract_taxonomy )
new_desc = nil
desc.gsub!( /\s+/, ' ' )
- if desc =~ /^>?\s*\S{1,10}_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP)/
- new_desc = counter.to_s( 16 ) + "_" + $1
- elsif extract_taxonomy
+ #if desc =~ /^>?\s*\S{1,10}_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP)/
+ # new_desc = counter.to_s( 16 ) + "_" + $1
+ if extract_taxonomy
if desc =~/\s\[(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP)\]/
new_desc = counter.to_s( 16 ) + "_" + $1
else
class Util
- def Util.normalize_seq_name( name, length )
+ def Util.normalize_seq_name( name, length, exception_if_too_long = false )
if name.length > length
+ if exception_if_too_long
+ error_msg = "sequence name \"#{name}\" is too long (>#{length})"
+ raise StandardError, error_msg
+ end
name = name[ 0, length ]
elsif name.length < length
for i in 0 ... length - name.length