inprogress
author cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Fri, 12 Apr 2013 00:45:43 +0000 (00:45 +0000)
committer cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Fri, 12 Apr 2013 00:45:43 +0000 (00:45 +0000)
forester/ruby/evoruby/lib/evo/io/writer/fasta_writer.rb
forester/ruby/evoruby/lib/evo/io/writer/msa_writer.rb
forester/ruby/evoruby/lib/evo/io/writer/nexus_writer.rb
forester/ruby/evoruby/lib/evo/io/writer/phylip_sequential_writer.rb
forester/ruby/evoruby/lib/evo/tool/msa_processor.rb
forester/ruby/evoruby/lib/evo/tool/phylogeny_factory.rb
forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb
forester/ruby/evoruby/lib/evo/util/util.rb

index 26f7461..0da0de3 100644 (file)
@@ -18,10 +18,11 @@ module Evoruby
         MAX_NAME_LENGTH_DEFAULT = 0
 
         def initialize()
-            @line_width       = LINE_WIDTH_DEFAULT
-            @max_name_length  = MAX_NAME_LENGTH_DEFAULT
-            @remove_gap_chars = false
-            @clean            = false
+            @line_width          = LINE_WIDTH_DEFAULT
+            @max_name_length     = MAX_NAME_LENGTH_DEFAULT
+            @remove_gap_chars    = false
+            @clean               = false
+            @ex_if_name_too_long = false
         end
 
 
@@ -47,6 +48,10 @@ module Evoruby
             @clean = clean
         end
 
+        def set_exception_if_name_too_long( exception_if_name_too_long )
+          @ex_if_name_too_long = exception_if_name_too_long
+        end
+
         def write( msa, path )
             Util.check_file_for_writability( path )
             f = File.open( path, "a" )
@@ -55,7 +60,7 @@ module Evoruby
                 name = seq_obj.get_name()
                 f.print( ">" )
                 if ( @max_name_length != MAX_NAME_LENGTH_DEFAULT )
-                    name = Util.normalize_seq_name( name, @max_name_length )
+                    name = Util.normalize_seq_name( name, @max_name_length, @ex_if_name_too_long )
                 end
                 f.print( name )
                 counter = 0
index 36eec15..9ad5466 100644 (file)
@@ -13,18 +13,21 @@ require 'lib/evo/util/util'
 
 module Evoruby
 
-    class MsaWriter
+  class MsaWriter
 
-        def initialize()
-            raise TypeError, "Cannot instanciate abstract class MsaWriter"
-        end
+    def initialize()
+      raise TypeError, "Cannot instanciate abstract class MsaWriter"
+    end
 
-        def set_max_name_length( length )
-        end
+    def set_max_name_length( length )
+    end
 
-        def write( msa, path )
-        end
+    def set_exception_if_name_too_long( exception_if_name_too_long )
+    end
 
-    end # class MsaWriter
+    def write( msa, path )
+    end
+
+  end # class MsaWriter
 
 end # module Evoruby
index 17a0c77..9326ae1 100644 (file)
@@ -17,8 +17,9 @@ module Evoruby
         MAX_NAME_LENGTH_DEFAULT = 10
 
         def initialize()
-            @max_name_length = MAX_NAME_LENGTH_DEFAULT
-            @clean           = false
+            @max_name_length     = MAX_NAME_LENGTH_DEFAULT
+            @clean               = false
+            @ex_if_name_too_long = false
         end
 
         def set_max_name_length( length = MAX_NAME_LENGTH_DEFAULT )
@@ -32,6 +33,10 @@ module Evoruby
             @clean = clean
         end
 
+        def set_exception_if_name_too_long( exception_if_name_too_long )
+          @ex_if_name_too_long = exception_if_name_too_long
+        end
+
         def write( msa, path )
             if ( !msa.is_aligned() )
                 error_msg = "attempt to write unaligned msa in nexus format"
@@ -60,7 +65,7 @@ module Evoruby
                 seq  = seq_obj.get_sequence_as_string()
                 name = name.gsub( /\s+$/, '')
                 name = name.gsub( /\s+/, '_')
-                name = Util.normalize_seq_name( name, @max_name_length )
+                name = Util.normalize_seq_name( name, @max_name_length, @ex_if_name_too_long )
                 f.print( "      " )
                 f.print( name )
                 f.print( " " )
index 15f4c5c..42ad186 100644 (file)
@@ -12,59 +12,64 @@ require 'lib/evo/io/writer/msa_writer'
 
 module Evoruby
 
-    class PhylipSequentialWriter < MsaWriter
+  class PhylipSequentialWriter < MsaWriter
 
-        MAX_NAME_LENGTH_DEFAULT = 10
+    MAX_NAME_LENGTH_DEFAULT = 10
 
-        def initialize()
-            @max_name_length = MAX_NAME_LENGTH_DEFAULT
-            @clean           = false
-        end
+    def initialize()
+      @max_name_length     = MAX_NAME_LENGTH_DEFAULT
+      @clean               = false
+      @ex_if_name_too_long = false
+    end
 
-        def set_max_name_length( length = MAX_NAME_LENGTH_DEFAULT )
-            if length < 1
-                length = MAX_NAME_LENGTH_DEFAULT
-            end
-            @max_name_length = length
-        end
+    def set_max_name_length( length = MAX_NAME_LENGTH_DEFAULT )
+      if length < 1
+        length = MAX_NAME_LENGTH_DEFAULT
+      end
+      @max_name_length = length
+    end
 
-        def clean( clean = true )
-            @clean = clean
-        end
+    def clean( clean = true )
+      @clean = clean
+    end
+
+    def set_exception_if_name_too_long( exception_if_name_too_long )
+      @ex_if_name_too_long = exception_if_name_too_long
+    end
 
-        def write( msa, path )
-            if ( !msa.is_aligned() )
-                error_msg = "attempt to write unaligned msa in phylip sequential format"
-                raise StandardError, error_msg, caller
-            end
+    def write( msa, path )
+      if ( !msa.is_aligned() )
+        error_msg = "attempt to write unaligned msa in phylip sequential format"
+        raise StandardError, error_msg, caller
+      end
 
 
-            Util.check_file_for_writability( path )
+      Util.check_file_for_writability( path )
 
-            f = File.open( path, "a" )
+      f = File.open( path, "a" )
 
-            f.print( msa.get_number_of_seqs().to_s() )
-            f.print( " " )
-            f.print( msa.get_length().to_s() )
-            f.print( Evoruby::Constants::LINE_DELIMITER )
-            for i in 0 ... msa.get_number_of_seqs()
-                seq_obj = msa.get_sequence( i )
-                name = seq_obj.get_name()
-                seq  = seq_obj.get_sequence_as_string()
-                name = name.gsub( /\s+$/, '')
-                name = name.gsub( /\s+/, '_')
-                name = Util.normalize_seq_name( name, @max_name_length )
-                f.print( name )
-                f.print( " " )
-                if ( @clean )
-                    seq = Util.clean_seq_str( seq )
-                end
-                f.print( seq )
-                f.print( Evoruby::Constants::LINE_DELIMITER )
-            end
-            f.close()
+      f.print( msa.get_number_of_seqs().to_s() )
+      f.print( " " )
+      f.print( msa.get_length().to_s() )
+      f.print( Evoruby::Constants::LINE_DELIMITER )
+      for i in 0 ... msa.get_number_of_seqs()
+        seq_obj = msa.get_sequence( i )
+        name = seq_obj.get_name()
+        seq  = seq_obj.get_sequence_as_string()
+        name = name.gsub( /\s+$/, '')
+        name = name.gsub( /\s+/, '_')
+        name = Util.normalize_seq_name( name, @max_name_length, @ex_if_name_too_long )
+        f.print( name )
+        f.print( " " )
+        if ( @clean )
+          seq = Util.clean_seq_str( seq )
         end
+        f.print( seq )
+        f.print( Evoruby::Constants::LINE_DELIMITER )
+      end
+      f.close()
+    end
 
-    end # class PhylipSequentialWriter
+  end # class PhylipSequentialWriter
 
 end # module Evoruby
index 6299417..738db56 100644 (file)
@@ -27,9 +27,9 @@ module Evoruby
   class MsaProcessor
 
     PRG_NAME       = "msa_pro"
-    PRG_DATE       = "2012.05.11"
+    PRG_DATE       = "130411"
     PRG_DESC       = "processing of multiple sequence alignments"
-    PRG_VERSION    = "1.06"
+    PRG_VERSION    = "1.07"
     COPYRIGHT      = "2008-2010 Christian M Zmasek"
     CONTACT        = "phylosoft@gmail.com"
     WWW            = "www.phylosoft.org"
@@ -40,6 +40,7 @@ module Evoruby
     INPUT_TYPE_OPTION                  = "i"
     OUTPUT_TYPE_OPTION                 = "o"
     MAXIMAL_NAME_LENGTH_OPTION         = "n"
+    DIE_IF_NAME_TOO_LONG               = "d"
     WIDTH_OPTION                       = "w"
     CLEAN_UP_SEQ_OPTION                = "c"
     REM_RED_OPTION                     = "rem_red"
@@ -78,6 +79,7 @@ module Evoruby
       @rgoc             = false
       @rg               = false  # fasta only
       @rem_red          = false
+      @die_if_name_too_long  = false
       @rgr              = -1
       @rsgr             = -1
       @rsl              = -1
@@ -148,6 +150,7 @@ module Evoruby
       allowed_opts.push( REM_RED_OPTION )
       allowed_opts.push( KEEP_MATCHING_SEQUENCES_OPTION )
       allowed_opts.push( REMOVE_MATCHING_SEQUENCES_OPTION )
+      allowed_opts.push( DIE_IF_NAME_TOO_LONG )
 
       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
       if ( disallowed.length > 0 )
@@ -162,13 +165,13 @@ module Evoruby
 
       begin
         Util.check_file_for_readability( input )
-      rescue ArgumentError => e
+      rescue IOError => e
         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
       end
 
       begin
         Util.check_file_for_writability( output )
-      rescue ArgumentError => e
+      rescue IOError => e
         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
       end
 
@@ -511,6 +514,7 @@ module Evoruby
           w = PhylipSequentialWriter.new()
           w.clean( @clean )
           w.set_max_name_length( @name_length )
+          w.set_exception_if_name_too_long( @die_if_name_too_long )
         elsif( @fasta_output )
           w = FastaWriter.new()
           w.set_line_width( @width )
@@ -522,11 +526,13 @@ module Evoruby
           w.clean( @clean )
           if ( @name_length_set )
             w.set_max_name_length( @name_length )
+            w.set_exception_if_name_too_long( @die_if_name_too_long )
           end
         elsif( @nexus_output )
           w = NexusWriter.new()
           w.clean( @clean )
           w.set_max_name_length( @name_length )
+          w.set_exception_if_name_too_long( @die_if_name_too_long )
         end
 
 
@@ -661,6 +667,7 @@ module Evoruby
         @last             = -1
       end
     end
+
     def analyze_command_line( cla )
       if ( cla.is_option_set?( INPUT_TYPE_OPTION ) )
         begin
@@ -806,6 +813,9 @@ module Evoruby
           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
         end
       end
+      if ( cla.is_option_set?( DIE_IF_NAME_TOO_LONG ) )
+        @die_if_name_too_long = true
+      end
 
 
     end
@@ -819,6 +829,7 @@ module Evoruby
       puts( "  options: -" + INPUT_TYPE_OPTION + "=<input type>: f for fasta, p for phylip selex type" )
       puts( "           -" + OUTPUT_TYPE_OPTION + "=<output type>: f for fasta, n for nexus, p for phylip sequential (default)" )
       puts( "           -" + MAXIMAL_NAME_LENGTH_OPTION + "=<n>: n=maximal name length (default for phylip 10, for fasta: unlimited )" )
+      puts( "           -" + DIE_IF_NAME_TOO_LONG + ": die if sequence name too long" )
       puts( "           -" + WIDTH_OPTION + "=<n>: n=width (fasta output only, default is 60)" )
       puts( "           -" + CLEAN_UP_SEQ_OPTION + ": clean up sequences" )
       puts( "           -" + REMOVE_GAP_COLUMNS_OPTION + ": remove gap columns" )
index cb72a02..6122f2b 100644 (file)
@@ -274,10 +274,10 @@ end # module Evoruby
 # are to be used:
 #  the substring between the first two double underscores is a
 #  unique identifier and needs to match the identifiers
-#  in '% <parameter-type> <unique-id>=<value>' statements
+#  in '% <parameter-type> <unique-id>=<value>' statements 
 #  Example:
 #  alignment name     : 'x__bcl2__e1'
-#  parameter statments: '% RSL bcl2=60'
+#  parameter statments: '% RSL bcl2=60' 
 $ PROBCONS=/home/czmasek/SOFTWARE/PROBCONS/probcons_v1_12/probcons
 $ DIALIGN_TX=/home/czmasek/SOFTWARE/DIALIGNTX/DIALIGN-TX_1.0.2/source/dialign-tx
 $ DIALIGN_CONF=/home/czmasek/SOFTWARE/DIALIGNTX/DIALIGN-TX_1.0.2/conf
@@ -291,7 +291,7 @@ $ PHYLO_PL=/home/czmasek/SOFTWARE/FORESTER/DEV/forester/forester/archive/perl/ph
 
 
 % RSL Hormone_recep=60
-%
+% 
 % RSL Y_phosphatase=100
 % RSL Y_phosphatase2=75
 % RSL Y_phosphatase3=50
@@ -303,39 +303,80 @@ $ PHYLO_PL=/home/czmasek/SOFTWARE/FORESTER/DEV/forester/forester/archive/perl/ph
 
 
 > KALIGN $ > $_kalign
-> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_kalign $_kalign_05_%[RSL]%.aln
+> MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_kalign $_kalign_05_%[RSL]%.aln
 > PHYLO_PL %[PHYLO_OPT]% $_kalign_05_%[RSL]%.aln $_kalign_05_%[RSL]% %[TMP_DIR]%
--
+- 
+
+> KALIGN $ > $_kalign_
+> MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_kalign_ $_kalign_09_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_kalign_09_%[RSL]%.aln $_kalign_09_%[RSL]% %[TMP_DIR]%
+- 
+
 
 > HMMALIGN --amino --trim --outformat Pfam -o $_hmmalign %[HMM]% $ > /dev/null
-> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_hmmalign $_hmmalign_05_%[RSL]%.aln
+> MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_hmmalign $_hmmalign_05_%[RSL]%.aln
 > PHYLO_PL %[PHYLO_OPT]% $_hmmalign_05_%[RSL]%.aln $_hmmalign_05_%[RSL]% %[TMP_DIR]%
--
+- 
+
+> HMMALIGN --amino --trim --outformat Pfam -o $_hmmalign_ %[HMM]% $ > /dev/null
+> MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_hmmalign_ $_hmmalign_09_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_hmmalign_09_%[RSL]%.aln $_hmmalign_09_%[RSL]% %[TMP_DIR]%
+- 
+
 
 > MAFFT --maxiterate 1000 --localpair $ > $_mafft
-> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_mafft $_mafft_05_%[RSL]%.aln
+> MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_mafft $_mafft_05_%[RSL]%.aln
 > PHYLO_PL %[PHYLO_OPT]% $_mafft_05_%[RSL]%.aln $_mafft_05_%[RSL]% %[TMP_DIR]%
 -
 
+> MAFFT --maxiterate 1000 --localpair $ > $_mafft_
+> MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_mafft_ $_mafft_09_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_mafft_09_%[RSL]%.aln $_mafft_09_%[RSL]% %[TMP_DIR]%
+-
+
+        
 > MUSCLE  -maxiters 1000 -maxtrees 100 -in $ -out $_muscle
-> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_muscle $_muscle_05_%[RSL]%.aln
-> PHYLO_PL %[PHYLO_OPT]%  $_muscle_05_%[RSL]%.aln  $_muscle_05_%[RSL]% %[TMP_DIR]%
+> MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_muscle $_muscle_05_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_muscle_05_%[RSL]%.aln $_muscle_05_%[RSL]% %[TMP_DIR]%
+-
+
+> MUSCLE  -maxiters 1000 -maxtrees 100 -in $ -out $_muscle_
+> MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_muscle_ $_muscle_09_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_muscle_09_%[RSL]%.aln $_muscle_09_%[RSL]% %[TMP_DIR]%
 -
 
+
 > CLUSTALO --full --full-iter --iter=5 -i $ -o $_clustalo
-> MSA_PRO -o=p -n=10 -rr=0.5 -c -rsl=%[RSL]% $_clustalo $_clustalo_05_%[RSL]%.aln
+> MSA_PRO -o=p -n=10 -d -rr=0.5 -c -rsl=%[RSL]% $_clustalo $_clustalo_05_%[RSL]%.aln
 > PHYLO_PL %[PHYLO_OPT]% $_clustalo_05_%[RSL]%.aln $_clustalo_05_%[RSL]% %[TMP_DIR]%
 -
 
+> CLUSTALO --full --full-iter --iter=5 -i $ -o $_clustalo_
+> MSA_PRO -o=p -n=10 -d -rr=0.9 -c -rsl=%[RSL]% $_clustalo_ $_clustalo_09_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_clustalo_09_%[RSL]%.aln $_clustalo_09_%[RSL]% %[TMP_DIR]%
+-
+
+
 > PROBCONS $ > $_probcons
-> MSA_PRO -o=p -n=10 -rem_red -rr=0.5 -c -rsl=%[RSL]% $_probcons $_probcons_05_%[RSL]%.aln
+> MSA_PRO -o=p -n=10 -d -rem_red -rr=0.5 -c -rsl=%[RSL]% $_probcons $_probcons_05_%[RSL]%.aln
 > PHYLO_PL %[PHYLO_OPT]% $_probcons_05_%[RSL]%.aln $_probcons_05_%[RSL]% %[TMP_DIR]%
--
+-  
+
+> PROBCONS $ > $_probcons_
+> MSA_PRO -o=p -n=10 -d -rem_red -rr=0.9 -c -rsl=%[RSL]% $_probcons_ $_probcons_09_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_probcons_09_%[RSL]%.aln $_probcons_09_%[RSL]% %[TMP_DIR]%
+-  
+
 
 > DIALIGN_TX DIALIGN_CONF $ $_dialigntx
-> MSA_PRO -o=p -n=10 -rem_red -rr=0.5 -c -rsl=%[RSL]% $_dialigntx $_dialigntx_05_%[RSL]%.aln
+> MSA_PRO -o=p -n=10 -d -rem_red -rr=0.5 -c -rsl=%[RSL]% $_dialigntx $_dialigntx_05_%[RSL]%.aln
 > PHYLO_PL %[PHYLO_OPT]% $_dialigntx_05_%[RSL]%.aln $_dialigntx_05_%[RSL]% %[TMP_DIR]%
 -
 
+> DIALIGN_TX DIALIGN_CONF $ $_dialigntx_
+> MSA_PRO -o=p -n=10 -d -rem_red -rr=0.9 -c -rsl=%[RSL]% $_dialigntx_ $_dialigntx_09_%[RSL]%.aln
+> PHYLO_PL %[PHYLO_OPT]% $_dialigntx_09_%[RSL]%.aln $_dialigntx_09_%[RSL]% %[TMP_DIR]%
+-
+
 =end
 
index 9a77601..bf194c7 100644 (file)
@@ -22,9 +22,9 @@ module Evoruby
   class TaxonomyProcessor
 
     PRG_NAME       = "tap"
-    PRG_DATE       = "2013.03.22"
+    PRG_DATE       = "130411"
     PRG_DESC       = "replacement of species names in multiple sequence files"
-    PRG_VERSION    = "2.001"
+    PRG_VERSION    = "2.002"
     COPYRIGHT      = "2013 Christian M Zmasek"
     CONTACT        = "phylosoft@gmail.com"
     WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
@@ -168,9 +168,9 @@ module Evoruby
     def modify_name( desc, counter, file, extract_taxonomy )
       new_desc = nil
       desc.gsub!( /\s+/, ' ' )
-      if desc =~ /^>?\s*\S{1,10}_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP)/
-        new_desc = counter.to_s( 16 ) + "_" + $1
-      elsif extract_taxonomy
+      #if desc =~ /^>?\s*\S{1,10}_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP)/
+      #  new_desc = counter.to_s( 16 ) + "_" + $1
+      if extract_taxonomy
         if desc =~/\s\[(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP)\]/
           new_desc = counter.to_s( 16 ) + "_" + $1
         else
index 79f92e0..e4173d3 100644 (file)
@@ -14,8 +14,12 @@ module Evoruby
 
   class Util
 
-    def Util.normalize_seq_name( name, length )
+    def Util.normalize_seq_name( name, length, exception_if_too_long = false )
       if name.length > length
+        if exception_if_too_long
+          error_msg = "sequence name \"#{name}\" is too long (>#{length})"
+          raise StandardError, error_msg
+        end
         name = name[ 0, length ]
       elsif name.length < length
         for i in 0 ... length - name.length