in progress

author cmzmasek <cmzmasek@yahoo.com>

Tue, 14 Feb 2017 18:43:57 +0000 (10:43 -0800)

committer cmzmasek <cmzmasek@yahoo.com>

Tue, 14 Feb 2017 18:43:57 +0000 (10:43 -0800)
author cmzmasek <cmzmasek@yahoo.com>
Tue, 14 Feb 2017 18:43:57 +0000 (10:43 -0800)
committer cmzmasek <cmzmasek@yahoo.com>
Tue, 14 Feb 2017 18:43:57 +0000 (10:43 -0800)
diff --git a/forester/java/src/org/forester/archaeopteryx/Configuration.java b/forester/java/src/org/forester/archaeopteryx/Configuration.java

index 3418951..f764e7f 100644 (file)
--- a/forester/java/src/org/forester/archaeopteryx/Configuration.java
+++ b/forester/java/src/org/forester/archaeopteryx/Configuration.java
@@ -1665,6 +1665,10 @@ public final class Configuration {
          return _gui_menu_text_color;
      }
  
+    static int getGuiFontSize() {
+        return 11;
+    }
+    
      int getMaxBaseFontSize() {
          return _max_base_font_size;
      }
diff --git a/forester/java/src/org/forester/archaeopteryx/ControlPanel.java b/forester/java/src/org/forester/archaeopteryx/ControlPanel.java

index d89204e..05d3ca0 100644 (file)
--- a/forester/java/src/org/forester/archaeopteryx/ControlPanel.java
+++ b/forester/java/src/org/forester/archaeopteryx/ControlPanel.java
@@ -102,11 +102,11 @@ final class ControlPanel extends JPanel implements ActionListener {
                            ORDER_SUBTREE;
      }
      final static Font                         jcb_bold_font             = new Font( Configuration
-            .getDefaultFontFamilyName(), Font.BOLD, 9 );
+            .getDefaultFontFamilyName(), Font.BOLD, Configuration.getGuiFontSize() );
      final static Font                         jcb_font                  = new Font( Configuration
-            .getDefaultFontFamilyName(), Font.PLAIN, 9 );
+            .getDefaultFontFamilyName(), Font.PLAIN, Configuration.getGuiFontSize());
      final static Font                         js_font                   = new Font( Configuration
-            .getDefaultFontFamilyName(), Font.PLAIN, 9 );
+            .getDefaultFontFamilyName(), Font.PLAIN, Configuration.getGuiFontSize() );
      private static final String               RETURN_TO_SUPER_TREE_TEXT = "R";
      private static final String               SEARCH_TIP_TEXT           = "Enter text to search for. Use ',' for logical OR and '+' for logical AND (not used in this manner for regular expression searches).";
      private static final long                 serialVersionUID          = -8463483932821545633L;
diff --git a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java

index 780dcc3..e44430c 100644 (file)
--- a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java
+++ b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java
@@ -103,8 +103,8 @@ import org.forester.util.ForesterUtil;
  
  public final class MainFrameApplication extends MainFrame {
  
-    private final static int             FRAME_X_SIZE                    = 800;
-    private final static int             FRAME_Y_SIZE                    = 800;
+    private final static int             FRAME_X_SIZE                    = 900;
+    private final static int             FRAME_Y_SIZE                    = 900;
      // Filters for the file-open dialog (classes defined in this file)
      private static final long            serialVersionUID                = -799735726778865234L;
      private static final boolean         PREPROCESS_TREES                = false;
diff --git a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java

index f31c657..ee17de9 100644 (file)
--- a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java
+++ b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java
@@ -1696,6 +1696,9 @@ public class PhylogenyMethods {
                          n.getNodeData().getTaxonomy().setIdentifier( new Identifier( name ) );
                          break;
                      }
+                    case CLADE_NAME:
+                        n.setName( name );
+                        break;
                      default: {
                          throw new IllegalArgumentException( "don't know what to do with " + field );
                      }
diff --git a/forester/perl/forester.pm b/forester/perl/forester.pm

index f8e58cb..ef0654f 100755 (executable)
--- a/forester/perl/forester.pm
+++ b/forester/perl/forester.pm
@@ -166,12 +166,12 @@ our @EXPORT = qw( executeConsense
  # Software directory:
  # ---------------------
  
-our $SOFTWARE_DIR              = "/home/czmasek/SOFTWARE/";
+our $SOFTWARE_DIR              = "/home/zma/SOFTWARE/";
  
  
  # Java virtual machine:
  # ---------------------
-our $JAVA                      = $SOFTWARE_DIR."JAVA/jdk1.6.0_03/bin/java";
+our $JAVA                      = "java";
  
  
  # Where all the temporary files can be created:
@@ -181,49 +181,49 @@ our $TEMP_DIR_DEFAULT          = "/tmp/";
  
  # Programs from Joe Felsenstein's PHYLIP package:
  # -----------------------------------------------
-our $SEQBOOT                   = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/seqboot";
-our $NEIGHBOR                  = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/neighbor";
-our $PROTPARS                  = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/protpars";
-our $PROML                     = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/proml";
-our $FITCH                     = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/fitch";
-our $CONSENSE                  = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/consense";
-our $PHYLIP_VERSION            = "3.68";
+our $SEQBOOT                   = $SOFTWARE_DIR."PHYLO/Phylip/Phylip3.695/phylip-3.696/exe/seqboot";
+our $NEIGHBOR                  = $SOFTWARE_DIR."PHYLO/Phylip/Phylip3.695/phylip-3.696/exe/neighbor";
+our $PROTPARS                  = $SOFTWARE_DIR."PHYLO/Phylip/Phylip3.695/phylip-3.696/exe/protpars";
+our $PROML                     = $SOFTWARE_DIR."PHYLO/Phylip/Phylip3.695/phylip-3.696/exe/proml";
+our $FITCH                     = $SOFTWARE_DIR."PHYLO/Phylip/Phylip3.695/phylip-3.696/exe/fitch";
+our $CONSENSE                  = $SOFTWARE_DIR."PHYLO/Phylip/Phylip3.695/phylip-3.696/exe/consense";
+our $PHYLIP_VERSION            = "3.695";
  
  # TREE-PUZZLE:
  # ------------
-our $PUZZLE                    = $SOFTWARE_DIR."TREE_PUZZLE/tree-puzzle-5.2/src/puzzle";
+our $PUZZLE                    = $SOFTWARE_DIR."PHYLO/TREE-PUZZLE/tree-puzzle-5.2/src/puzzle";
  our $PUZZLE_VERSION            = "5.2";
  
  # FASTME:
  # -----------------------------------------------------
-our $FASTME                    = $SOFTWARE_DIR."FASTME/fastme2.0/fastme";
+our $FASTME                    = $SOFTWARE_DIR."PHYLO/FastME/fastme2.0/fastme";
  our $FASTME_VERSION            = "2.0";
  
  # BIONJ:
  # -----------------------------------------------------
-our $BIONJ                    = $SOFTWARE_DIR."BIONJ/bionj";
-our $BIONJ_VERSION            = "[1997]";
+our $BIONJ                    = "";
+our $BIONJ_VERSION            = "";
  
  # WEIGHBOR:
  # -----------------------------------------------------
-our $WEIGHBOR                 = $SOFTWARE_DIR."WEIGHBOR/Weighbor/weighbor";
-our $WEIGHBOR_VERSION         = "1.2.1";
+our $WEIGHBOR                 = "";
+our $WEIGHBOR_VERSION         = "";
  
  # PHYML:
  # -----------------------------------------------------
-our $PHYML                    = $SOFTWARE_DIR."PHYML/phyml_v2.4.4/exe/phyml_linux";
-our $PHYML_VERSION            = "2.4.4";
+our $PHYML                    = $SOFTWARE_DIR."PHYLO/PhyML/PhyML-3.1/PhyML-3.1/PhyML-3.1_linux64";
+our $PHYML_VERSION            = "3.1";
  
  # RAXML:
  # -----------------------------------------------------
-our $RAXML                    = $SOFTWARE_DIR."RAXML/RAxML-7.0.4/raxmlHPC";
-our $RAXML_VERSION            = "7.0.4";
+our $RAXML                    = $SOFTWARE_DIR."PHYLO/RAxML/20161215/standard-RAxML-master/raxmlHPC-AVX";
+our $RAXML_VERSION            = "8.2.9";
  
  
-# forester.jar. This jar file is currently available at: http://www.phylosoft.org 
-# -------------------------------------------------------------------------------
+# forester.jar. This jar file is currently available at: https://sites.google.com/site/cmzmasek/home/software/forester 
+# --------------------------------------------------------------------------------------------------------------------
  
-our $FORESTER_JAR             = $SOFTWARE_DIR."FORESTER/DEV/forester/forester/java/forester.jar";
+our $FORESTER_JAR             = "/home/zma/git/forester/forester/java/forester.jar";
  
  
  
@@ -734,7 +734,7 @@ sub executeFastme {
        
      &testForTextFilePresence( $inpwd );
      my $command = "";
-    if ( $bs > 0 ) {
+    if ( $bs > 1 ) {
          $command = "$FASTME -b $init_opt -i $inpwd -n $bs -s b";
      }
      else {
diff --git a/forester/perl/phylo_pl.pl b/forester/perl/phylo_pl.pl

index abdfc2f..220fdd6 100755 (executable)
--- a/forester/perl/phylo_pl.pl
+++ b/forester/perl/phylo_pl.pl
@@ -1,12 +1,9 @@
  #!/usr/bin/perl -W
  #
-# $Id: phylo_pl.pl,v 1.32 2010/12/13 19:00:22 cmzmasek Exp $
-#
  # FORESTER -- software libraries and applications
  # for evolutionary biology research and applications.
  #
-# Copyright (C) 2008-2014 Christian M. Zmasek
-# Copyright (C) 2008-2009 Burnham Institute for Medical Research
+# Copyright (C) 2017 Christian M. Zmasek
  # All rights reserved
  # 
  # This library is free software; you can redistribute it and/or
@@ -23,8 +20,8 @@
  # License along with this library; if not, write to the Free Software
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
  #
-# Contact: phylosoft @ gmail . com
-#     WWW: www.phylosoft.org/forester
+# Contact: cmzmasek at yahoo dot com
+#     WWW: https://sites.google.com/site/cmzmasek/home/software/forester
  #
  #
  #
@@ -61,7 +58,7 @@ use lib $FindBin::Bin;
  use forester;
  
  my $VERSION                = "1.0.1";
-my $LAST_MODIFIED          = "2009.10.02";
+my $LAST_MODIFIED          = "2017/02/07";
  
  my $RAXML_MODEL_BASE       = "PROTGAMMA";
  my $RAXML_ALGORITHM        = "a";
@@ -1669,13 +1666,12 @@ Y
  sub printUsage {
  
      print <<END;
-
-Copyright (C) 2002-2007 Christian M. Zmasek
+Copyright (C) 2017 Christian M Zmasek
  All rights reserved
  
-Author: Christian M. Zmasek
-phylosoft\@gmail.com
-http://www.phylosoft.org
+Author: Christian M Zmasek
+cmzmasek at yahoo dot com
+https://sites.google.com/site/cmzmasek/home/software/forester
  
    Requirements  phylo_pl is part of the FORESTER collection of programs.
    ------------  Many of its global variables are set via forester.pm.
diff --git a/forester/ruby/evoruby/exe/tap.rb b/forester/ruby/evoruby/exe/tap.rb

index 6a69f79..b0b7d8f 100755 (executable)
--- a/forester/ruby/evoruby/exe/tap.rb
+++ b/forester/ruby/evoruby/exe/tap.rb
@@ -2,10 +2,8 @@
  #
  # = exe/tap
  #
-# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
-#
-# last modified: 05/18/2007
+# Copyright::    Copyright (C) 2017 Christian M Zmasek
+# License::      GNU Lesser General Public License (LGPL)
  
  require 'lib/evo/tool/taxonomy_processor'
  
diff --git a/forester/ruby/evoruby/lib/evo/io/parser/fasta_parser.rb b/forester/ruby/evoruby/lib/evo/io/parser/fasta_parser.rb

index 94c2193..f1da2a9 100644 (file)
--- a/forester/ruby/evoruby/lib/evo/io/parser/fasta_parser.rb
+++ b/forester/ruby/evoruby/lib/evo/io/parser/fasta_parser.rb
@@ -1,16 +1,14 @@
  #
  # = lib/evo/io/parser/fasta_parser - FastaParser class
  #
-# Copyright::  Copyright (C) 20017 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
  #
-# last modified: 05/17/2007
+# Last modified: 2017/02/07
  
  require 'lib/evo/io/parser/msa_parser'
  require 'lib/evo/msa/msa'
  
-#require 'iconv'
-
  module Evoruby
  
    class FastaParser < MsaParser
@@ -24,10 +22,10 @@ module Evoruby
        current_seq = String.new()
        name        = String.new()
        saw_first_seq = false
-      ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' )
        File.open( path ) do | file |
          while line = file.gets
-          line = ic.iconv( line )
+          
+          line.encode!("UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")          
            if can_ignore?( line, saw_first_seq )
  
            elsif line =~ /^\s*>\s*(.+)/
diff --git a/forester/ruby/evoruby/lib/evo/io/parser/general_msa_parser.rb b/forester/ruby/evoruby/lib/evo/io/parser/general_msa_parser.rb

index dbab203..6ef36bf 100644 (file)
--- a/forester/ruby/evoruby/lib/evo/io/parser/general_msa_parser.rb
+++ b/forester/ruby/evoruby/lib/evo/io/parser/general_msa_parser.rb
@@ -1,10 +1,10 @@
  #
  # = lib/evo/io/parser/general_msa_parser - GeneralMsaParser class
  #
-# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
  #
-# last modified: 2009/10/08
+# Last modified: 2017/02/07
  
  require 'lib/evo/io/parser/msa_parser'
  require 'lib/evo/msa/msa'
diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_parser.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_parser.rb

index c148cdf..a3c7b6a 100644 (file)
--- a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_parser.rb
+++ b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_parser.rb
@@ -1,10 +1,12 @@
  #
-# To change this template, choose Tools | Templates
-# and open the template in the editor.
-
+# = lib/evo/io/parser/hmmscan_parser.rb - HmmscanParser class
+#
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
+#
+# Last modified: 2017/02/12
  
  class HmmscanParser
-
    def initialize file
      @file = file
    end
@@ -51,6 +53,8 @@ class HmmscanParser
        r.env_from   = $20.to_i
        r.env_to     = $21.to_i
  
+      r.desc       = $23
+
        if r.number > r.out_of || r.hmm_from > r.hmm_to || r.ali_from > r.ali_to || r.env_from > r.env_to
          raise IOError, "illogical format: " + line
        end
@@ -81,5 +85,6 @@ class HmmscanResult
    attr_accessor :ali_to
    attr_accessor :env_from
    attr_accessor :env_to
+  attr_accessor :desc
  
  end
diff --git a/forester/ruby/evoruby/lib/evo/msa/msa.rb b/forester/ruby/evoruby/lib/evo/msa/msa.rb

index 579941d..549ae2c 100644 (file)
--- a/forester/ruby/evoruby/lib/evo/msa/msa.rb
+++ b/forester/ruby/evoruby/lib/evo/msa/msa.rb
@@ -1,11 +1,10 @@
  #
  # = lib/evo/msa/msa.rb - Msa class
  #
-# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
-#
-# $Id: msa.rb,v 1.11 2009/01/03 00:42:08 cmzmasek Exp $
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
  #
+# Last modified: 2017/02/07
  
  
  require 'lib/evo/util/constants'
@@ -517,8 +516,7 @@ module Evoruby
        x = get_number_of_seqs / n
        for i in 0 ... n
          msa = Msa.new()
-        s = 0
-
+        #s = 0
          if ( ( r > 0 ) && ( i == ( n - 1 ) ) )
            y = x + r
            if ( verbose )
diff --git a/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb

index 5c8619d..a8144ec 100644 (file)
--- a/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb
+++ b/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb
@@ -1,11 +1,8 @@
  #
-# = lib/evo/apps/domain_sequence_extractor.rb - DomainSequenceExtractor class
+# = lib/evo/tool/domain_sequence_extractor.rb - DomainSequenceExtractor class
  #
-# Copyright::  Copyright (C) 2012 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
-#
-# $Id:Exp $
-
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
  
  require 'lib/evo/util/constants'
  require 'lib/evo/util/util'
@@ -13,16 +10,13 @@ require 'lib/evo/util/command_line_arguments'
  require 'lib/evo/io/parser/hmmscan_domain_extractor'
  
  module Evoruby
-
    class DomainSequenceExtractor
  
      PRG_NAME       = "dsx"
-    PRG_VERSION    = "2.000"
+    PRG_VERSION    = "2.001"
      PRG_DESC       = "extraction of domain sequences from hmmscan output"
-    PRG_DATE       = "20121001"
-    COPYRIGHT      = "2012 Christian M Zmasek"
-    CONTACT        = "phylosoft@gmail.com"
-    WWW            = "www.phylosoft.org"
+    PRG_DATE       = "20170213"
+    WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
  
      E_VALUE_THRESHOLD_OPTION           = 'e'
      LENGTH_THRESHOLD_OPTION            = 'l'
@@ -35,17 +29,14 @@ module Evoruby
      FAILED_SEQS_SUFFIX                 = '_with_no_passing_domains.fasta'
      HELP_OPTION_1                      = 'help'
      HELP_OPTION_2                      = 'h'
-
      def run()
  
        Util.print_program_information( PRG_NAME,
-        PRG_VERSION,
-        PRG_DESC ,
-        PRG_DATE,
-        COPYRIGHT,
-        CONTACT,
-        WWW,
-        STDOUT )
+      PRG_VERSION,
+      PRG_DESC ,
+      PRG_DATE,
+      WWW,
+      STDOUT )
  
        ld = Constants::LINE_DELIMITER
  
@@ -56,7 +47,7 @@ module Evoruby
        end
  
        if ( cla.is_option_set?( HELP_OPTION_1 ) ||
-           cla.is_option_set?( HELP_OPTION_2 ) )
+      cla.is_option_set?( HELP_OPTION_2 ) )
          print_help
          exit( 0 )
        end
@@ -77,8 +68,8 @@ module Evoruby
        disallowed = cla.validate_allowed_options_as_str( allowed_opts )
        if ( disallowed.length > 0 )
          Util.fatal_error( PRG_NAME,
-          "unknown option(s): " + disallowed,
-          STDOUT )
+        "unknown option(s): " + disallowed,
+        STDOUT )
        end
  
        domain_id           = cla.get_file_name( 0 )
@@ -92,7 +83,6 @@ module Evoruby
          outfile = outfile[ 0 .. outfile.length - 5 ]
        end
  
-
        add_position = false
        if ( cla.is_option_set?( ADD_POSITION_OPTION ) )
          add_position = true
@@ -132,7 +122,6 @@ module Evoruby
          end
        end
  
-
        min_linker = nil
        if ( cla.is_option_set?( MIN_LINKER_OPT ) )
          begin
@@ -145,7 +134,6 @@ module Evoruby
          end
        end
  
-
        log = String.new
  
        puts()
@@ -183,7 +171,6 @@ module Evoruby
  
        end
  
-
        if ( add_position )
          puts( "Add positions (rel to complete seq) to extracted domains: true" )
          log << "Add positions (rel to complete seq) to extracted domains: true" + ld
@@ -206,18 +193,18 @@ module Evoruby
        begin
          parser = HmmscanDomainExtractor.new()
          domain_count = parser.parse( domain_id,
-          hmmsearch_output,
-          fasta_sequence_file,
-          outfile,
-          outfile + PASSED_SEQS_SUFFIX,
-          outfile + FAILED_SEQS_SUFFIX,
-          e_value_threshold,
-          length_threshold,
-          add_position,
-          add_domain_number,
-          add_species,
-          min_linker,
-          log )
+        hmmsearch_output,
+        fasta_sequence_file,
+        outfile,
+        outfile + PASSED_SEQS_SUFFIX,
+        outfile + FAILED_SEQS_SUFFIX,
+        e_value_threshold,
+        length_threshold,
+        add_position,
+        add_domain_number,
+        add_species,
+        min_linker,
+        log )
        rescue ArgumentError, IOError => e
          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
  
diff --git a/forester/ruby/evoruby/lib/evo/tool/domains_to_forester.rb b/forester/ruby/evoruby/lib/evo/tool/domains_to_forester.rb

index f08236d..b928468 100644 (file)
--- a/forester/ruby/evoruby/lib/evo/tool/domains_to_forester.rb
+++ b/forester/ruby/evoruby/lib/evo/tool/domains_to_forester.rb
@@ -1,12 +1,8 @@
  #
  # = lib/evo/apps/domains_to_forester - DomainsToForester class
  #
-# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
-#
-# $Id: Exp $
-#
-# last modified: 06/11/2007
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
  
  require 'lib/evo/util/constants'
  require 'lib/evo/util/util'
@@ -17,28 +13,24 @@ require 'lib/evo/sequence/protein_domain'
  require 'lib/evo/sequence/domain_structure'
  
  module Evoruby
-
    class DomainsToForester
  
      PRG_NAME       = "d2f"
-    PRG_DESC       = "parsed hmmpfam output to forester format"
-    PRG_VERSION    = "1.001"
-    PRG_DATE       = "20120807"
-    COPYRIGHT      = "2012 Christian M Zmasek"
-    CONTACT        = "phylosoft@gmail.com"
-    WWW            = "www.phylosoft.org"
+    PRG_DESC       = "converting of parsed hmmpfam output to forester format"
+    PRG_VERSION    = "1.002"
+    PRG_DATE       = "20170213"
+    WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
  
      E_VALUE_THRESHOLD_OPTION         = "e"
      OVERWRITE_IF_SAME_FROM_TO_OPTION = "o"
      HELP_OPTION_1                    = "help"
      HELP_OPTION_2                    = "h"
-
      def parse( domains_list_file,
-        original_seqs_file,
-        outfile,
-        column_delimiter,
-        e_value_threshold,
-        overwrite_if_same_from_to )
+      original_seqs_file,
+      outfile,
+      column_delimiter,
+      e_value_threshold,
+      overwrite_if_same_from_to )
        Util.check_file_for_readability( domains_list_file )
        Util.check_file_for_readability( original_seqs_file )
        Util.check_file_for_writability( outfile )
@@ -56,7 +48,7 @@ module Evoruby
        File.open( domains_list_file ) do | file |
          while line = file.gets
            if !is_ignorable?( line )
-            
+
              a = line.split( column_delimiter )
              l = a.length
              if ( ( l < 4 ) || ( e_value_threshold >= 0.0 && l < 5 ) )
@@ -67,14 +59,7 @@ module Evoruby
              domain_name  = a[ 1 ]
              seq_from     = -1
              seq_to       = -1
-            ##########################################
-            if domain_name =~ /RRM_\d/
-              puts "ignoring " + line 
-              next
-            end
-            ##########################################
-            
-            
+
              begin
                seq_from = a[ 2 ].to_i
              rescue Exception
@@ -134,19 +119,19 @@ module Evoruby
  
      end # parse
  
-
-
-
      def run()
  
        Util.print_program_information( PRG_NAME,
-        PRG_VERSION,
-        PRG_DESC,
-        PRG_DATE,
-        COPYRIGHT,
-        CONTACT,
-        WWW,
-        STDOUT )
+      PRG_VERSION,
+      PRG_DESC,
+      PRG_DATE,
+      WWW,
+      STDOUT )
+
+      if ( ARGV == nil || ( ARGV.length < 1 )  )
+        print_help
+        exit( -1 )
+      end
  
        begin
          cla = CommandLineArguments.new( ARGV )
@@ -155,12 +140,12 @@ module Evoruby
        end
  
        if ( cla.is_option_set?( HELP_OPTION_1 ) ||
-           cla.is_option_set?( HELP_OPTION_2 ) )
+      cla.is_option_set?( HELP_OPTION_2 ) )
          print_help
          exit( 0 )
        end
  
-      if cla.get_number_of_files != 3
+      unless ( cla.get_number_of_files == 1 || cla.get_number_of_files == 2 || cla.get_number_of_files == 3 )
          print_help
          exit( -1 )
        end
@@ -172,15 +157,10 @@ module Evoruby
        disallowed = cla.validate_allowed_options_as_str( allowed_opts )
        if ( disallowed.length > 0 )
          Util.fatal_error( PRG_NAME,
-          "unknown option(s): " + disallowed,
-          STDOUT )
+        "unknown option(s): " + disallowed,
+        STDOUT )
        end
  
-      domains_list_file       = cla.get_file_name( 0 )
-      original_sequences_file = cla.get_file_name( 1 )
-      outfile                 = cla.get_file_name( 2 )
-
-
        e_value_threshold = -1.0
        if cla.is_option_set?( E_VALUE_THRESHOLD_OPTION )
          begin
@@ -192,35 +172,73 @@ module Evoruby
            Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
          end
        end
+
+      domains_list_file = cla.get_file_name( 0 )
+      original_sequences_file = ""
+      outfile = ""
+      if (cla.get_number_of_files == 3)
+        original_sequences_file = cla.get_file_name( 1 )
+        outfile = cla.get_file_name( 2 )
+      elsif (cla.get_number_of_files == 1 || cla.get_number_of_files == 2 )
+        if ( cla.get_number_of_files == 2 )
+          original_sequences_file = cla.get_file_name( 1 )
+        else
+          hmmscan_index = domains_list_file.index("hmmscan")
+          if ( hmmscan_index != nil )
+            prefix = domains_list_file[0 .. hmmscan_index-1 ]
+            suffix = Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX
+            files = Dir.entries( "." )
+            matching_files = Util.get_matching_files( files, prefix, suffix)
+            if matching_files.length < 1
+              Util.fatal_error( PRG_NAME, 'no file matching [' + prefix +
+              '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
+            end
+            if matching_files.length > 1
+              Util.fatal_error( PRG_NAME, 'more than one file matching [' +
+              prefix  + '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
+            end
+            original_sequences_file = matching_files[ 0 ]
+          end
+        end
+        outfile = domains_list_file
+        if (outfile.end_with?(Constants::DOMAIN_TABLE_SUFFIX) )
+          outfile = outfile.chomp(Constants::DOMAIN_TABLE_SUFFIX)
+        end
+        if ( e_value_threshold >= 0.0 )
+          outfile = outfile + Constants::DOMAINS_TO_FORESTER_EVALUE_CUTOFF_SUFFIX + e_value_threshold.to_s
+        end
+        outfile = outfile + Constants::DOMAINS_TO_FORESTER_OUTFILE_SUFFIX
+      end
+
        overwrite_if_same_from_to = false
        if ( cla.is_option_set?( OVERWRITE_IF_SAME_FROM_TO_OPTION ) )
          overwrite_if_same_from_to = true
        end
  
        puts
-      puts( "Domains list file                      : " + domains_list_file )
-      puts( "Fasta sequencefile (complete sequences): " + original_sequences_file )
-      puts( "Outputfile                             : " + outfile )
+      puts( "Domain table                            : " + domains_list_file )
+      puts( "Fasta sequence file (complete sequences): " + original_sequences_file )
+      puts( "Outputfile                              : " + outfile )
        if ( e_value_threshold >= 0.0 )
-        puts( "E-value threshold                      : " + e_value_threshold.to_s )
+        puts( "E-value threshold                       : " + e_value_threshold.to_s )
        else
-        puts( "E-value threshold                      : no threshold" )
+        puts( "E-value threshold                       : no threshold" )
        end
        if ( overwrite_if_same_from_to )
-        puts( "Overwrite if same from and to          : true" )
+        puts( "Overwrite if same from and to           : true" )
        else
-        puts( "Overwrite if same from and to          : false" )
+        puts( "Overwrite if same from and to           : false" )
        end
  
        puts
  
        begin
          parse( domains_list_file,
-          original_sequences_file,
-          outfile,
-          " ",
-          e_value_threshold,
-          overwrite_if_same_from_to )
+        original_sequences_file,
+        outfile,
+        " ",
+        e_value_threshold,
+        overwrite_if_same_from_to )
  
        rescue ArgumentError, IOError, StandardError => e
          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
@@ -228,8 +246,9 @@ module Evoruby
          Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT )
        end
  
-
        puts
+      Util.print_message( PRG_NAME, "wrote: " + outfile )
+      Util.print_message( PRG_NAME, "next steps in standard analysis pipeline: hmmsearch followed by dsx.rb")
        Util.print_message( PRG_NAME, 'OK' )
        puts
  
@@ -241,21 +260,25 @@ module Evoruby
        puts
        puts( "Usage:" )
        puts
-      puts( "  " + PRG_NAME + ".rb [options] <domains list file (parsed hmmpfam output)> <file containing complete sequences in fasta format> <outputfile>" )
+      puts( "  " + PRG_NAME + ".rb [options] <domain table (parsed hmmpfam output)> [file containing complete sequences in fasta format] [outputfile]" )
        puts()
        puts( "  options: -" + E_VALUE_THRESHOLD_OPTION  + "=<f> : E-value threshold, default is no threshold" )
        puts( "               -" + OVERWRITE_IF_SAME_FROM_TO_OPTION  + " : overwrite domain with same start and end with domain with better E-value" )
        puts
+      puts( "Examples:" )
+      puts
+      puts( "  " + PRG_NAME + ".rb P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table P53_ni.fasta P53_hmmscan_300_10.dff" )
+      puts
+      puts( "  " + PRG_NAME + ".rb P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table P53_ni.fasta" )
+      puts
+      puts( "  " + PRG_NAME + ".rb P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table" )
+      puts()
      end
  
-
-
      def is_ignorable?( line )
        return ( line !~ /[A-Za-z0-9-]/ || line =~ /^\s*#/)
      end
  
-
    end # class DomainsToForester
  
-
  end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/tool/fasta_taxonomy_processor.rb b/forester/ruby/evoruby/lib/evo/tool/fasta_taxonomy_processor.rb

index 6ae3cf1..96c6a2f 100644 (file)
--- a/forester/ruby/evoruby/lib/evo/tool/fasta_taxonomy_processor.rb
+++ b/forester/ruby/evoruby/lib/evo/tool/fasta_taxonomy_processor.rb
@@ -3,9 +3,6 @@
  #
  # Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
  # License::    GNU Lesser General Public License (LGPL)
-#
-# $Id: fasta_taxonomy_processor.rb,v 1.4 2010/12/13 19:00:11 cmzmasek Exp $
-
  
  require 'lib/evo/util/util'
  require 'lib/evo/msa/msa_factory'
@@ -19,187 +16,183 @@ require 'lib/evo/util/command_line_arguments'
  require 'lib/evo/apps/tseq_taxonomy_processor'
  
  module Evoruby
+  class FastaTaxonomyProcessor
+
+    PRG_NAME       = "fasta_tap"
+    PRG_DATE       = "2009.01.20"
+    PRG_DESC       = "preprocessing of multiple sequence files in ncbi fasta format"
+    PRG_VERSION    = "1.00"
+    WWW            = "www.phylosoft.org"
+    def initialize()
+      @tax_ids_to_sp_taxonomies = Hash.new()
+    end
  
-    class FastaTaxonomyProcessor
+    def run()
+
+      Util.print_program_information( PRG_NAME,
+      PRG_VERSION,
+      PRG_DESC,
+      PRG_DATE,
+      COPYRIGHT,
+      CONTACT,
+      WWW,
+      STDOUT )
+
+      if  ARGV == nil || ARGV.length != 4
+        puts( "Usage: #{PRG_NAME}.rb <sp taxonomy file> <sequences in ncbi fasta format> <name for fasta outfile> <name for map outfile>" )
+        puts()
+        exit( -1 )
+      end
+
+      begin
+        cla = CommandLineArguments.new( ARGV )
+      rescue ArgumentError => e
+        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+      end
+      allowed_opts = Array.new
+      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
+      if ( disallowed.length > 0 )
+        Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
+      end
+
+      sp_taxonomy_infile = cla.get_file_name( 0 )
+      sequences_infile = cla.get_file_name( 1 )
+      sequences_outfile = cla.get_file_name( 2 )
+      mapping_outfile = cla.get_file_name( 3 )
+
+      Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile )
+      Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile )
+      Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile )
+      Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile )
+
+      sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile )
+
+      Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile )
+
+      fasta_parser = FastaParser.new
+      msa_fac = MsaFactory.new
+
+      seqs = msa_fac.create_msa_from_file( sequences_infile, fasta_parser )
+
+      Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile )
+
+      removed = seqs.remove_redundant_sequences!( true, true )
+
+      if removed.size > 0
+        Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
+        removed.each { | seq_name |
+          puts seq_name
+        }
+        Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" )
+      end
+
+      mapping_out = File.open( mapping_outfile, "a" )
+
+      for i in 0 ... seqs.get_number_of_seqs
+        seq = seqs.get_sequence( i )
+        seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) )
+      end
+
+      io = MsaIO.new()
+
+      w = FastaWriter.new()
+
+      w.set_max_name_length( 10 )
+      w.clean( true )
+      begin
+        io.write_to_file( seqs, sequences_outfile, w )
+      rescue Exception => e
+        Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
+      end
+      mapping_out.close()
+
+      Util.print_message( PRG_NAME, "wrote: " + mapping_outfile )
+      Util.print_message( PRG_NAME, "wrote: " + sequences_outfile )
+      Util.print_message( PRG_NAME, "OK" )
  
-        PRG_NAME       = "fasta_tap"
-        PRG_DATE       = "2009.01.20"
-        PRG_DESC       = "preprocessing of multiple sequence files in ncbi fasta format"
-        PRG_VERSION    = "1.00"
-        COPYRIGHT      = "2009 Christian M Zmasek"
-        CONTACT        = "phylosoft@gmail.com"
-        WWW            = "www.phylosoft.org"
+    end
  
-        def initialize()
-            @tax_ids_to_sp_taxonomies = Hash.new()
-        end
+    private
  
-        def run()
-
-            Util.print_program_information( PRG_NAME,
-                PRG_VERSION,
-                PRG_DESC,
-                PRG_DATE,
-                COPYRIGHT,
-                CONTACT,
-                WWW,
-                STDOUT )
-
-            if  ARGV == nil || ARGV.length != 4
-                puts( "Usage: #{PRG_NAME}.rb <sp taxonomy file> <sequences in ncbi fasta format> <name for fasta outfile> <name for map outfile>" )
-                puts()
-                exit( -1 )
-            end
-
-            begin
-                cla = CommandLineArguments.new( ARGV )
-            rescue ArgumentError => e
-                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
-            end
-            allowed_opts = Array.new
-            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
-            if ( disallowed.length > 0 )
-                Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
-            end
-
-            sp_taxonomy_infile = cla.get_file_name( 0 )
-            sequences_infile = cla.get_file_name( 1 )
-            sequences_outfile = cla.get_file_name( 2 )
-            mapping_outfile = cla.get_file_name( 3 )
-
-            Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile )
-            Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile )
-            Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile )
-            Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile )
-
-            sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile )
-
-            Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile )
-
-            fasta_parser = FastaParser.new
-            msa_fac = MsaFactory.new
-
-            seqs = msa_fac.create_msa_from_file( sequences_infile, fasta_parser )
-
-            Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile )
-
-            removed = seqs.remove_redundant_sequences!( true, true )
-
-            if removed.size > 0
-                Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
-                removed.each { | seq_name |
-                    puts seq_name
-                }
-                Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" )
-            end
-
-            mapping_out = File.open( mapping_outfile, "a" )
-
-            for i in 0 ... seqs.get_number_of_seqs
-                seq = seqs.get_sequence( i )
-                seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) )
-            end
-
-            io = MsaIO.new()
-
-            w = FastaWriter.new()
-
-            w.set_max_name_length( 10 )
-            w.clean( true )
-            begin
-                io.write_to_file( seqs, sequences_outfile, w )
-            rescue Exception => e
-                Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
-            end
-            mapping_out.close()
-
-            Util.print_message( PRG_NAME, "wrote: " + mapping_outfile )
-            Util.print_message( PRG_NAME, "wrote: " + sequences_outfile )
-            Util.print_message( PRG_NAME, "OK" )
+    def modify_name( seq, i, sp_taxonomies, mapping_outfile )
  
-        end
+      #i = i + 1792
  
-        private
-
-        def modify_name( seq, i, sp_taxonomies, mapping_outfile )
-
-            #i = i + 1792
-            
-            seq_desc = seq.get_name
-
-            taxonomy_sn = nil
-
-            if seq_desc =~ /\[(.+)\]/
-                taxonomy_sn = $1
-            else
-                Util.fatal_error( PRG_NAME, "no taxonomy in [" + seq_desc + "]"  )
-            end
-
-            matching_sp_taxonomy = nil
-
-            sp_taxonomies.each { |sp_taxonomy|
-                if ( sp_taxonomy.scientific_name == taxonomy_sn )
-                    matching_sp_taxonomy = sp_taxonomy
-                end
-            }
-
-            if  matching_sp_taxonomy == nil
-                Util.fatal_error( PRG_NAME, "taxonomy [" + taxonomy_sn + "] for [" + seq_desc + "] not found" )
-            end
-
-            new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code
-
-            gi = nil
-            if seq_desc =~ /gi\|(.+?)\|/
-                gi = $1
-            else
-              Util.fatal_error( PRG_NAME, "no gi in [" + seq_desc + "]"  )
-            end
-
-            seq_name = ""
-
-            if seq_desc =~ /\|\s*([^|]+?)\s*\[/
-                seq_name = $1
-            end
-
-            if  seq_name =~ /\[.+\]$/
-                # Redundant taxonomy information hides here.
-                seq_name = seq_name.sub(/\[.+\]$/, '')
-            end
-            if  seq_name =~ /^\s*hypothetical\s+protein\s*/i
-                # Pointless information.
-                seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' )
-            end
-            if  seq_name =~ /^\s*conserved\s+hypothetical\s+protein\s*/i
-                # Pointless information.
-                seq_name = seq_name.sub( /^\s*conserved\s+hypothetical\s+protein\s*/i, '' )
-            end
-
-            if gi != nil
-            mapping_outfile.print( new_name + "\t" +
-                 TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
-                 TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" +
-                 TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" +
-                 TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
-                 TseqTaxonomyProcessor::SEQ_ACCESSION + gi.to_s + "\t" +
-                 TseqTaxonomyProcessor::SEQ_ACCESSION_SOURCE + "gi" + "\t" +
-                 TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" +
-                 TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string +
-                 Constants::LINE_DELIMITER )
-            else
-                 mapping_outfile.print( new_name + "\t" +
-                 TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
-                 TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" +
-                 TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" +
-                 TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
-                 TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" +
-                 TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string +
-                 Constants::LINE_DELIMITER )
-                
-            end    
-            new_name
-        end
+      seq_desc = seq.get_name
+
+      taxonomy_sn = nil
+
+      if seq_desc =~ /\[(.+)\]/
+        taxonomy_sn = $1
+      else
+        Util.fatal_error( PRG_NAME, "no taxonomy in [" + seq_desc + "]"  )
+      end
  
+      matching_sp_taxonomy = nil
+
+      sp_taxonomies.each { |sp_taxonomy|
+        if ( sp_taxonomy.scientific_name == taxonomy_sn )
+          matching_sp_taxonomy = sp_taxonomy
+        end
+      }
+
+      if  matching_sp_taxonomy == nil
+        Util.fatal_error( PRG_NAME, "taxonomy [" + taxonomy_sn + "] for [" + seq_desc + "] not found" )
+      end
+
+      new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code
+
+      gi = nil
+      if seq_desc =~ /gi\|(.+?)\|/
+        gi = $1
+      else
+        Util.fatal_error( PRG_NAME, "no gi in [" + seq_desc + "]"  )
+      end
+
+      seq_name = ""
+
+      if seq_desc =~ /\|\s*([^|]+?)\s*\[/
+        seq_name = $1
+      end
+
+      if  seq_name =~ /\[.+\]$/
+        # Redundant taxonomy information hides here.
+        seq_name = seq_name.sub(/\[.+\]$/, '')
+      end
+      if  seq_name =~ /^\s*hypothetical\s+protein\s*/i
+        # Pointless information.
+        seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' )
+      end
+      if  seq_name =~ /^\s*conserved\s+hypothetical\s+protein\s*/i
+        # Pointless information.
+        seq_name = seq_name.sub( /^\s*conserved\s+hypothetical\s+protein\s*/i, '' )
+      end
+
+      if gi != nil
+        mapping_outfile.print( new_name + "\t" +
+        TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
+        TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" +
+        TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" +
+        TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
+        TseqTaxonomyProcessor::SEQ_ACCESSION + gi.to_s + "\t" +
+        TseqTaxonomyProcessor::SEQ_ACCESSION_SOURCE + "gi" + "\t" +
+        TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" +
+        TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string +
+        Constants::LINE_DELIMITER )
+      else
+        mapping_outfile.print( new_name + "\t" +
+        TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
+        TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" +
+        TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" +
+        TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
+        TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" +
+        TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string +
+        Constants::LINE_DELIMITER )
+
+      end
+      new_name
      end
  
+  end
+
  end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb

index 08de9c1..caff316 100644 (file)
--- a/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb
+++ b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb
@@ -1,29 +1,22 @@
  #
  # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class
  #
-# Copyright::  Copyright (C) 2012 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
-#
-# $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
-#
+# Copyright::    Copyright (C) 2017 Christian M Zmasek
+# License::      GNU Lesser General Public License (LGPL)
  
  require 'set'
-
  require 'lib/evo/util/constants'
  require 'lib/evo/util/util'
  require 'lib/evo/util/command_line_arguments'
  require 'lib/evo/io/parser/hmmscan_parser'
  
  module Evoruby
-
    class HmmscanSummary
  
      PRG_NAME       = "hsp"
-    PRG_VERSION    = "2.002"
-    PRG_DESC       = "hmmscan summary"
-    PRG_DATE       = "130319"
-    COPYRIGHT      = "2013 Christian M Zmasek"
-    CONTACT        = "phyloxml@gmail.com"
+    PRG_VERSION    = "2.003"
+    PRG_DESC       = "Summarize hmmscan output tables into simpler tables"
+    PRG_DATE       = "170213"
      WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
  
      DELIMITER_OPTION              = "d"
@@ -36,9 +29,9 @@ module Evoruby
      HELP_OPTION_1                 = "help"
      HELP_OPTION_2                 = "h"
  
-    USE_AVOID_HMMS = true
-    AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
-    LIMIT_FOR_CLOSE_DOMAINS = 20
+    USE_AVOID_HMMS = false
+    AVOID_HHMS = [ "x", "y", "z" ]
+    LIMIT_FOR_CLOSE_DOMAINS = 20 # Used for protein architecture summary
  
      def initialize
        @domain_counts = Hash.new
@@ -47,13 +40,16 @@ module Evoruby
      def run
  
        Util.print_program_information( PRG_NAME,
-        PRG_VERSION,
-        PRG_DESC,
-        PRG_DATE,
-        COPYRIGHT,
-        CONTACT,
-        WWW,
-        STDOUT )
+      PRG_VERSION,
+      PRG_DESC,
+      PRG_DATE,
+      WWW,
+      STDOUT )
+
+      if ( ARGV == nil || ( ARGV.length < 1 )  )
+        print_help
+        exit( -1 )
+      end
  
        begin
          cla = CommandLineArguments.new( ARGV )
@@ -62,16 +58,11 @@ module Evoruby
        end
  
        if ( cla.is_option_set?( HELP_OPTION_1 ) ||
-           cla.is_option_set?( HELP_OPTION_2 ) )
+      cla.is_option_set?( HELP_OPTION_2 ) )
          print_help
          exit( 0 )
        end
  
-      if ( cla.get_number_of_files != 2 )
-        print_help
-        exit( -1 )
-      end
-
        allowed_opts = Array.new
        allowed_opts.push( DELIMITER_OPTION )
        allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
@@ -84,12 +75,21 @@ module Evoruby
        disallowed = cla.validate_allowed_options_as_str( allowed_opts )
        if ( disallowed.length > 0 )
          Util.fatal_error( PRG_NAME,
-          "unknown option(s): " + disallowed,
-          STDOUT )
+        "unknown option(s): " + disallowed,
+        STDOUT )
        end
  
        inpath = cla.get_file_name( 0 )
-      outpath = cla.get_file_name( 1 )
+
+      outpath = ""
+      if ( cla.get_number_of_files == 1 )
+        outpath = inpath + Constants::DOMAIN_TABLE_SUFFIX
+      elsif ( cla.get_number_of_files == 2 )
+        outpath = cla.get_file_name( 1 )
+      else
+        print_help
+        exit( -1 )
+      end
  
        column_delimiter = "\t"
        if ( cla.is_option_set?( DELIMITER_OPTION ) )
@@ -155,7 +155,7 @@ module Evoruby
        puts()
        puts( "hmmpfam outputfile  : " + inpath )
        puts( "outputfile          : " + outpath )
-      puts( "species             : " + species )
+
        if ( i_e_value_threshold >= 0.0 )
          puts( "i-E-value threshold : " + i_e_value_threshold.to_s )
        else
@@ -174,28 +174,29 @@ module Evoruby
        if ( column_delimiter == "\t" )
          puts( "column delimiter    : TAB" )
        else
-        puts( "column delimiter     : " + column_delimiter )
-      end
-      if fs_e_value_threshold >= 0.0
-        puts( "E-value threshold   : " + fs_e_value_threshold.to_s )
-      else
-        puts( "E-value threshold   : no threshold" )
+        puts( "column delimiter    : " + column_delimiter )
        end
        if !hmm_for_protein_output.empty?
          puts( "HMM for proteins    : " + hmm_for_protein_output )
+        puts( "species             : " + species )
+        if fs_e_value_threshold >= 0.0
+          puts( "E-value threshold   : " + fs_e_value_threshold.to_s )
+        else
+          puts( "E-value threshold   : no threshold" )
+        end
        end
        puts()
  
        begin
          parse( inpath,
-          outpath,
-          column_delimiter,
-          i_e_value_threshold,
-          ignore_dufs,
-          parse_descriptions,
-          fs_e_value_threshold,
-          hmm_for_protein_output,
-          species )
+        outpath,
+        column_delimiter,
+        i_e_value_threshold,
+        ignore_dufs,
+        parse_descriptions,
+        fs_e_value_threshold,
+        hmm_for_protein_output,
+        species )
        rescue IOError => e
          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
        end
@@ -207,6 +208,8 @@ module Evoruby
        puts
        puts( Util.draw_histogram( domain_counts, "#" ) )
        puts
+      Util.print_message( PRG_NAME, "wrote: " + outpath )
+      Util.print_message( PRG_NAME, "next step in standard analysis pipeline: d2f.rb")
        Util.print_message( PRG_NAME, 'OK' )
        puts
  
@@ -216,14 +219,14 @@ module Evoruby
  
      # raises ArgumentError, IOError
      def parse( inpath,
-        outpath,
-        column_delimiter,
-        i_e_value_threshold,
-        ignore_dufs,
-        get_descriptions,
-        fs_e_value_threshold,
-        hmm_for_protein_output,
-        species )
+      outpath,
+      column_delimiter,
+      i_e_value_threshold,
+      ignore_dufs,
+      get_descriptions,
+      fs_e_value_threshold,
+      hmm_for_protein_output,
+      species )
  
        Util.check_file_for_readability( inpath )
        Util.check_file_for_writability( outpath )
@@ -246,27 +249,28 @@ module Evoruby
  
        results.each do | r |
          model     = r.model
+        desc      = r.desc
          query     = r.query
          i_e_value = r.i_e_value
          env_from  = r.env_from
          env_to    = r.env_to
  
          if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
-           ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
+        ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
            count_model( model )
            outfile.print( query +
-             column_delimiter )
+          column_delimiter )
            if ( get_descriptions )
              outfile.print( desc +
-               column_delimiter )
+            column_delimiter )
            end
            outfile.print( model +
-             column_delimiter +
-             env_from.to_s +
-             column_delimiter +
-             env_to.to_s +
-             column_delimiter +
-             i_e_value.to_s )
+          column_delimiter +
+          env_from.to_s +
+          column_delimiter +
+          env_to.to_s +
+          column_delimiter +
+          i_e_value.to_s )
            outfile.print( Constants::LINE_DELIMITER )
          end
  
@@ -274,10 +278,10 @@ module Evoruby
            if  !prev_query.empty? && prev_query != query
              if !hmmscan_results_per_protein.empty?
                process_hmmscan_results_per_protein( hmmscan_results_per_protein,
-                fs_e_value_threshold,
-                hmm_for_protein_output,
-                i_e_value_threshold,
-                species )
+              fs_e_value_threshold,
+              hmm_for_protein_output,
+              i_e_value_threshold,
+              species )
              end
              hmmscan_results_per_protein.clear
            end
@@ -295,10 +299,10 @@ module Evoruby
  
        if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty?
          process_hmmscan_results_per_protein( hmmscan_results_per_protein,
-          fs_e_value_threshold,
-          hmm_for_protein_output,
-          i_e_value_threshold,
-          species )
+        fs_e_value_threshold,
+        hmm_for_protein_output,
+        i_e_value_threshold,
+        species )
        end
  
        outfile.flush()
@@ -323,10 +327,10 @@ module Evoruby
      end
  
      def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
-        fs_e_value_threshold,
-        hmm_for_protein_output,
-        i_e_value_threshold,
-        species )
+      fs_e_value_threshold,
+      hmm_for_protein_output,
+      i_e_value_threshold,
+      species )
  
        dc = 0
        # filter according to i-Evalue threshold
@@ -335,7 +339,6 @@ module Evoruby
  
        hmmscan_results_per_protein.each do | r |
  
-
          if r.model == hmm_for_protein_output
            if fs_e_value_threshold > 0.0 && r.fs_e_value > fs_e_value_threshold
              return
@@ -453,19 +456,24 @@ module Evoruby
        s
      end
  
-
      def print_help()
        puts( "Usage:" )
        puts()
-      puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
+      puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> [outputfile]" )
+      puts()
+      puts( "  options: -" + DELIMITER_OPTION + "=<s> : column delimiter for outputfile, default is TAB" )
+      puts( "           -" + I_E_VALUE_THRESHOLD_OPTION  + "=<f>: i-E-value threshold, default is no threshold" )
+      puts( "           -" + PARSE_OUT_DESCRIPITION_OPTION  + "     : parse query description (in addition to query name)" )
+      puts( "           -" + IGNORE_DUF_OPTION  + "     : ignore DUFs" )
+      puts( "           -" + HMM_FOR_PROTEIN_OUTPUT + "=<s> : HMM for protein architectures summary" )
+      puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + "=<f>: E-value threshold for full protein sequences, only for protein architectures summary" )
+      puts( "           -" + SPECIES_OPTION + "=<s> : species for protein architectures summary" )
+      puts()
+      puts( "Example:" )
+      puts()
+      puts( "  " + "hmmscan --nobias --domtblout P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 -E 10 Pfam-A.hmm P53_ni.fasta" )
        puts()
-      puts( "  options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
-      puts( "           -" + I_E_VALUE_THRESHOLD_OPTION  + ": i-E-value threshold, default is no threshold" )
-      puts( "           -" + PARSE_OUT_DESCRIPITION_OPTION  + ": parse query description (in addition to query name)" )
-      puts( "           -" + IGNORE_DUF_OPTION  + ": ignore DUFs" )
-      puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + ": E-value threshold for full protein sequences, only for protein summary" )
-      puts( "           -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
-      puts( "           -" + SPECIES_OPTION + ": species for protein summary" )
+      puts( "  " + PRG_NAME + ".rb P53_hmmscan_300_10" )
        puts()
      end
  
diff --git a/forester/ruby/evoruby/lib/evo/tool/phylogenies_decorator.rb b/forester/ruby/evoruby/lib/evo/tool/phylogenies_decorator.rb

index 0459eee..cbbff63 100644 (file)
--- a/forester/ruby/evoruby/lib/evo/tool/phylogenies_decorator.rb
+++ b/forester/ruby/evoruby/lib/evo/tool/phylogenies_decorator.rb
@@ -2,12 +2,12 @@
  #
  # = lib/evo/apps/phylogenies_decorator
  #
-# Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
  #
-# decoration of phylogenies with sequence/species names and domain architectures
+# Last modified: 2017/02/09
  #
-# $Id: phylogenies_decorator.rb,v 1.34 2010/12/13 19:00:11 cmzmasek Exp $
+# decoration of phylogenies with sequence/species names and domain architectures
  #
  # Environment variable FORESTER_HOME needs to point to the appropriate
  # directory (e.g. setenv FORESTER_HOME $HOME/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester/)
@@ -22,7 +22,8 @@ module Evoruby
  
      #DECORATOR_OPTIONS_SEQ_NAMES = '-r=1 -mdn'
      #DECORATOR_OPTIONS_SEQ_NAMES = '-p -t -sn'
-    DECORATOR_OPTIONS_SEQ_NAMES = '-p -t -tc -mp -or'
+    #DECORATOR_OPTIONS_SEQ_NAMES = '-p -t -tc -mp -or'
+    DECORATOR_OPTIONS_SEQ_NAMES = '-p -t -mp -or'
      # -mdn is a hidden expert option to rename e.g. "6_ORYLA3" to "6_[3]_ORYLA"
      #DECORATOR_OPTIONS_SEQ_NAMES = '-sn -r=1'
      #DECORATOR_OPTIONS_DOMAINS = '-r=1'
@@ -31,7 +32,8 @@ module Evoruby
      DOMAINS_MAPFILE_SUFFIX    = '_hmmscan_10.dff'
      SLEEP_TIME                = 0.05
      REMOVE_NI                 = true
-    IDS_ONLY                  = true
+    IDS_ONLY                  = true #TODO this should be a command line option
+    FIXED_NIM_FILE            = 'all.nim' #TODO this should be a command line option
      TMP_FILE_1                  = '___PD1___'
      TMP_FILE_2                  = '___PD2___'
      LOG_FILE                  = '00_phylogenies_decorator.log'
@@ -39,11 +41,11 @@ module Evoruby
      JAVA_HOME                 = ENV[Constants::JAVA_HOME_ENV_VARIABLE]
  
      PRG_NAME       = "phylogenies_decorator"
-    PRG_DATE       = "2013.11.15"
+    PRG_DATE       = "170209"
      PRG_DESC       = "decoration of phylogenies with sequence/species names and domain architectures"
      PRG_VERSION    = "1.02"
-    COPYRIGHT      = "2013 Christian M Zmasek"
-    CONTACT        = "phylosoft@gmail.com"
+    COPYRIGHT      = "2017 Christian M Zmasek"
+    CONTACT        = "phyloxml at gmail dot com"
      WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
  
      HELP_OPTION_1       = "help"
@@ -151,7 +153,7 @@ module Evoruby
              outfile = outfile.sub( /_ni_/, '_' )
            end
  
-          if File.exists?( outfile )
+          if File.exist?( outfile )
              msg = counter.to_s + ': ' + phylogeny_file + ' -> ' +  outfile +
              ' : already exists, skipping'
              Util.print_message( PRG_NAME, msg  )
@@ -174,8 +176,12 @@ module Evoruby
            domains_mapfile_name = nil
            seqs_file_name = nil
  
-          ids_mapfile_name = get_file( files, phylogeny_id, IDS_MAPFILE_SUFFIX )
-
+          if ( FIXED_NIM_FILE == nil )
+            ids_mapfile_name = get_file( files, phylogeny_id, IDS_MAPFILE_SUFFIX )
+          else
+            ids_mapfile_name = FIXED_NIM_FILE
+          end
+          
            unless IDS_ONLY
              domains_mapfile_name = get_file( files, phylogeny_id, DOMAINS_MAPFILE_SUFFIX )
              seqs_file_name = get_seq_file( files, phylogeny_id )
@@ -213,7 +219,7 @@ module Evoruby
  
              cmd = decorator + ' ' + DECORATOR_OPTIONS_DOMAINS + ' ' +
              '-f=d ' + TMP_FILE_1 + ' ' +
-            domains_mapfile_name + ' ' +TMP_FILE_2
+            domains_mapfile_name + ' ' + TMP_FILE_2
              puts cmd
              begin
                execute_cmd( cmd, log )
@@ -276,17 +282,7 @@ module Evoruby
      end
  
      def get_file( files_in_dir, phylogeny_id, suffix_pattern )
-      matching_files = Array.new
-
-      files_in_dir.each { | file |
-
-        if ( !File.directory?( file ) &&
-        file !~ /^\./ &&
-        file !~ /^00/ &&
-        file =~ /^#{phylogeny_id}.*#{suffix_pattern}$/ )
-          matching_files << file
-        end
-      }
+      matching_files = Util.get_matching_files( files_in_dir, phylogeny_id, suffix_pattern )
        if matching_files.length < 1
          Util.fatal_error( PRG_NAME, 'no file matching [' + phylogeny_id +
          '...' + suffix_pattern + '] present in current directory' )
diff --git a/forester/ruby/evoruby/lib/evo/tool/phylogeny_factory.rb b/forester/ruby/evoruby/lib/evo/tool/phylogeny_factory.rb

index 18d396e..9180831 100644 (file)
--- a/forester/ruby/evoruby/lib/evo/tool/phylogeny_factory.rb
+++ b/forester/ruby/evoruby/lib/evo/tool/phylogeny_factory.rb
@@ -1,10 +1,10 @@
  #
  # = lib/evo/apps/phylogeny_factory - PhylogenyFactory class
  #
-# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
  #
-# $Id: phylogeny_factory.rb,v 1.32 2010/12/13 19:00:11 cmzmasek Exp $
+# Last modified: 2017/02/07
  
  require 'lib/evo/util/constants'
  require 'lib/evo/util/util'
@@ -21,9 +21,9 @@ module Evoruby
      PRG_DATE       = "1301111"
      PRG_DESC       = "automated phylogeny reconstruction using queing system"
      PRG_VERSION    = "1.100"
-    COPYRIGHT      = "2013 Christian M Zmasek"
-    CONTACT        = "phylosoft@gmail.com"
-    WWW            = "www.phylosoft.org"
+    COPYRIGHT      = "2017 Christian M Zmasek"
+    CONTACT        = "cmzmasek at yahoo dot com"
+    WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
  
      USE_JOB_SUBMISSION_SYSTEM_OPTION  = 's'
      BS_OPTION                         = 'b'
diff --git a/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb b/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb

index 862a91a..6a4bb5f 100644 (file)
--- a/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb
+++ b/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb
@@ -1,12 +1,10 @@
  #
  # = lib/evo/apps/taxonomy_processor - TaxonomyProcessor class
  #
-# Copyright::  Copyright (C) 20017 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
-#
-
-
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
  
+require 'lib/evo/util/constants'
  require 'lib/evo/util/util'
  require 'lib/evo/msa/msa_factory'
  require 'lib/evo/msa/msa'
@@ -18,35 +16,29 @@ require 'lib/evo/io/writer/phylip_sequential_writer'
  require 'lib/evo/util/command_line_arguments'
  
  module Evoruby
-
    class TaxonomyProcessor
  
      PRG_NAME       = "tap"
-    PRG_DATE       = "170206"
-    PRG_DESC       = "replacement of species names in multiple sequence files"
-    PRG_VERSION    = "2.002"
-    COPYRIGHT      = "2017 Christian M Zmasek"
-    CONTACT        = "phylosoft@gmail.com"
-    WWW            = ""
+    PRG_DATE       = "170213"
+    PRG_DESC       = "Replacement of labels in multiple sequence files"
+    PRG_VERSION    = "2.004"
+    WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
  
      EXTRACT_TAXONOMY_OPTION = "t"
-
+    ANNOTATION_OPTION       = "a"
+    HELP_OPTION_1           = "help"
+    HELP_OPTION_2           = "h"
      def run()
  
        Util.print_program_information( PRG_NAME,
-        PRG_VERSION,
-        PRG_DESC,
-        PRG_DATE,
-        COPYRIGHT,
-        CONTACT,
-        WWW,
-        STDOUT )
-
-      if ( ARGV == nil || ( ARGV.length != 1 && ARGV.length != 2 && ARGV.length != 3 && ARGV.length != 4 && ARGV.length != 5 && ARGV.length != 6 ) )
-        puts( "Usage: #{PRG_NAME}.rb [options] <input sequences> [output sequences] [output id list]" )
-        puts()
-        puts( "  options: -" + EXTRACT_TAXONOMY_OPTION + ": to extract taxonomy information from bracketed expression" )
-        puts()
+      PRG_VERSION,
+      PRG_DESC,
+      PRG_DATE,
+      WWW,
+      STDOUT )
+
+      if ( ARGV == nil || ( ARGV.length < 1 ) )
+        print_help()
          exit( -1 )
        end
  
@@ -56,9 +48,15 @@ module Evoruby
          Util.fatal_error( PRG_NAME, "error: " + e.to_s )
        end
  
-      input     = nil
-      output    = nil
-      list_file = nil
+      if ( cla.is_option_set?( HELP_OPTION_1 ) ||
+      cla.is_option_set?( HELP_OPTION_2 ) )
+        print_help
+        exit( 0 )
+      end
+
+      input      = nil
+      output     = nil
+      list_file  = nil
  
        if cla.get_number_of_files == 3
          input     = cla.get_file_name( 0 )
@@ -74,13 +72,16 @@ module Evoruby
          else
            i = input
          end
-        output    = i + "_ni.fasta"
-        list_file = i + ".nim"
+        output    = i + Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX
+        list_file = i + Constants::ID_MAP_FILE_SUFFIX
+      else
+        print_help()
+        exit(-1)
        end
  
-
        allowed_opts = Array.new
        allowed_opts.push( EXTRACT_TAXONOMY_OPTION )
+      allowed_opts.push( ANNOTATION_OPTION )
  
        disallowed = cla.validate_allowed_options_as_str( allowed_opts )
        if ( disallowed.length > 0 )
@@ -92,13 +93,18 @@ module Evoruby
          extract_taxonomy = true
        end
  
-      if ( File.exists?( output ) )
+      annotation = nil
+      if ( cla.is_option_set?( ANNOTATION_OPTION ) )
+        annotation = cla.get_option_value( ANNOTATION_OPTION )
+      end
+
+      if ( File.exist?( output ) )
          Util.fatal_error( PRG_NAME, "outfile [" + output + "] already exists" )
        end
-      if ( File.exists?( list_file ) )
+      if ( File.exist?( list_file ) )
          Util.fatal_error( PRG_NAME, "list file [" + list_file + "] already exists" )
        end
-      if ( !File.exists?( input) )
+      if ( !File.exist?( input) )
          Util.fatal_error( PRG_NAME, "infile [" + input + "] does not exist" )
        end
  
@@ -116,6 +122,9 @@ module Evoruby
        if ( extract_taxonomy )
          puts( "Extract taxonomy: true"  )
        end
+      if ( annotation != nil )
+        puts( "Annotation      : " + annotation )
+      end
        puts()
  
        f = MsaFactory.new()
@@ -141,7 +150,7 @@ module Evoruby
        lf = File.open( list_file, "a" )
        for i in 0 ... msa.get_number_of_seqs
          seq = msa.get_sequence( i )
-        seq.set_name( modify_name( seq.get_name(), i, lf, extract_taxonomy ) )
+        seq.set_name( modify_name( seq.get_name(), i, lf, extract_taxonomy, annotation ) )
        end
        io = MsaIO.new()
        w = nil
@@ -150,7 +159,7 @@ module Evoruby
        else
          w = PhylipSequentialWriter.new()
        end
-      w.set_max_name_length( 10 )
+      w.set_max_name_length( 9 )
        w.clean( true )
        begin
          io.write_to_file( msa, output, w )
@@ -160,16 +169,15 @@ module Evoruby
        lf.close()
        Util.print_message( PRG_NAME, "wrote: " + list_file )
        Util.print_message( PRG_NAME, "wrote: " + output )
+      Util.print_message( PRG_NAME, "next steps in standard analysis pipeline: hmmscan followed by hsp.rb")
        Util.print_message( PRG_NAME, "OK" )
      end
  
      private
  
-    def modify_name( desc, counter, file, extract_taxonomy )
+    def modify_name( desc, counter, file, extract_taxonomy, annotation )
        new_desc = nil
        desc.gsub!( /\s+/, ' ' )
-      #if desc =~ /^>?\s*\S{1,10}_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP)/
-      #  new_desc = counter.to_s( 16 ) + "_" + $1
        if extract_taxonomy
          if desc =~/\s\[(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP)\]/
            new_desc = counter.to_s( 16 ) + "_" + $1
@@ -179,10 +187,33 @@ module Evoruby
        else
          new_desc = counter.to_s( 16 )
        end
-      file.print( new_desc + "\t" + desc + "\n" )
+      if (annotation != nil)
+        new_desc = new_desc + annotation
+        file.print( new_desc + "\t" + desc + " " + annotation + "\n" )
+      else
+        file.print( new_desc + "\t" + desc + "\n" )
+      end
+      if ( new_desc.length > 9)
+        Util.fatal_error( PRG_NAME, "shortened identifier [" +
+        new_desc + "] is too long (" + new_desc.length.to_s + " characters)" )
+      end
        new_desc
      end
  
+    def print_help()
+      puts( "Usage:" )
+      puts()
+      puts( "  " + PRG_NAME + ".rb [options] <input sequences> [output sequences] [output id list]" )
+      puts()
+      puts( "  options: -" + EXTRACT_TAXONOMY_OPTION + "    : to extract taxonomy information from bracketed expressions" )
+      puts( "           -" + ANNOTATION_OPTION + "=<s>: to add an annotation to all entries" )
+      puts()
+      puts( "Example:" )
+      puts()
+      puts( "  " + PRG_NAME + ".rb P53.fasta" )
+      puts()
+    end
+
    end # class TaxonomyProcessor
  
  end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/util/command_line_arguments.rb b/forester/ruby/evoruby/lib/evo/util/command_line_arguments.rb

index 1ad9924..0045730 100644 (file)
--- a/forester/ruby/evoruby/lib/evo/util/command_line_arguments.rb
+++ b/forester/ruby/evoruby/lib/evo/util/command_line_arguments.rb
@@ -1,177 +1,175 @@
  #
  # = lib/evo/util/command_line_arguments.rb - CommandLineArguments class
  #
-# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
  #
-# $Id: command_line_arguments.rb,v 1.2 2007/06/12 04:51:34 cmzmasek Exp $
-#
-# last modified: 05/16/2007
+# Last modified: 2017/02/12
  
  module Evoruby
-
-    class CommandLineArguments
-
-        OPTIONS_PREFIX          = "-"
-        EXTENDED_OPTIONS_PREFIX = "--"
-        OPTIONS_SEPARATOR       = "="
-
-        # raises ArgumentError
-        def initialize( args )
-            @options  = Hash.new
-            @extended_options = Hash.new
-            @file_names = Array.new
-            parse_arguments( args )
-        end
-
-        def get_file_names
-            return @file_names
-        end
-
-        def get_file_name( i )
-            return @file_names[ i ]
-        end
-
-        def get_number_of_files()
-            return @file_names.length
-        end
-
-        def is_option_set?( option_name )
-            o = get_all_options
-            return ( o.has_key?( option_name ) )
-        end
-
-        # raises ArgumentError
-        def get_option_value( option_name )
-            o = get_all_options
-            if ( o.has_key?( option_name ) )
-                value = o[ option_name ]
-                if ( !Util.is_string_empty?( value ) )
-                    return value
-                else
-                    raise( ArgumentError, "value for option \"" +
-                         option_name + "\" is not set", caller )
-                end
-            else
-                raise( ArgumentError, "option \"" + option_name +
-                     "\" is not set", caller )
-            end
-        end
-
-        def get_option_value_as_int( option_name )
-            return get_option_value( option_name ).to_i
-        end
-
-        def get_option_value_as_float( option_name )
-            return get_option_value( option_name ).to_f
-        end
-
-        # mandatory_options (Array)
-        #
-        def validate_mandatory_options( mandatory_options )
-            o = get_all_options
-            missing = Array.new
-            for ma in mandatory_options
-                if ( !o.has_key?( ma ) )
-                    missing.push( ma )
-                end
-            end
-            return missing
-        end
-
-        # mandatory_options (Array)
-        #
-        def validate_mandatory_options_as_str( mandatory_options )
-            missing = validate_mandatory_options( mandatory_options )
-            return missing.join( ", " )
-        end
-
-        # allowed_options (Array)
-        #
-        def validate_allowed_options( allowed_options )
-            o = get_all_options
-            disallowed = Array.new
-            o.each_key { |op|
-                if ( !allowed_options.include?( op ) )
-                    disallowed.push( op )
-                end
-            }
-            return disallowed
-        end
-
-        # allowed_options (Array)
-        #
-        def validate_allowed_options_as_str( allowed_options )
-            disallowed = validate_allowed_options( allowed_options )
-            return disallowed.join( ", " )
-        end
-
-        private
-
-        def get_all_options
-            o = Hash.new
-            o.merge!( get_options_list )
-            o.merge!( get_extended_options_list )
-            return o
-        end
-
-        def parse_arguments( args )
-            for arg in args
-                if ( arg.index( EXTENDED_OPTIONS_PREFIX ) == 0 )
-                    parse_option( arg.slice( EXTENDED_OPTIONS_PREFIX.length, arg.length() - 1 ),
-                                  get_extended_options_list )
-
-                elsif ( arg.index( OPTIONS_PREFIX ) == 0 )
-                    parse_option( arg.slice( OPTIONS_PREFIX.length, arg.length() - 1 ),
-                                  get_options_list )
-
-                else
-                    get_file_names.push( arg )
-                end
-            end
-        end
-
-        # raises ArgumentError
-        def parse_option( option, options_map )
-            sep_index = option.index( OPTIONS_SEPARATOR )
-            if ( sep_index == nil )
-                if ( Util.is_string_empty?( option ) )
-                    raise( ArgumentError, "attempt to set option with an empty name" )
-                end
-                if ( get_all_options.has_key?( option ) )
-                     raise( ArgumentError, "attempt to set option \"" +
-                            option + "\" mutiple times" )
-                end
-                options_map[ option ] = ""
-            else
-                key = option.slice( 0, sep_index )
-                value = option.slice( sep_index + 1, option.length() - 1 )
-                if ( Util.is_string_empty?( key ) )
-                    raise( ArgumentError, "attempt to set option with an empty name" )
-                end
-                if ( Util.is_string_empty?( value ) )
-                    raise( ArgumentError, "attempt to set option with an empty value" )
-                end
-                if ( get_all_options.has_key?( key ) )
-                    raise( ArgumentError, "attempt to set option \"" +
-                            key + "\" mutiple times [" + option + "]" )
-                end
-                options_map[ key ] = value
-            end
-        end
-
-        def get_file_names_list
-            return @file_names
-        end
-
-        def get_options_list
-            return @options
-        end
-
-        def get_extended_options_list
-            return @extended_options
-        end
-
-    end # class CommandLineArguments
+  class CommandLineArguments
+
+    OPTIONS_PREFIX          = "-"
+    EXTENDED_OPTIONS_PREFIX = "--"
+    OPTIONS_SEPARATOR       = "="
+    # raises ArgumentError
+    def initialize( args )
+      @options  = Hash.new
+      @extended_options = Hash.new
+      @file_names = Array.new
+      parse_arguments( args )
+    end
+
+    def get_file_names
+      return @file_names
+    end
+
+    def get_file_name( i )
+      return @file_names[ i ]
+    end
+
+    def get_number_of_files()
+      return @file_names.length
+    end
+
+    def is_option_set?( option_name )
+      o = get_all_options
+      return ( o.has_key?( option_name ) )
+    end
+
+    # raises ArgumentError
+    def get_option_value( option_name )
+      o = get_all_options
+      if ( o.has_key?( option_name ) )
+        value = o[ option_name ]
+        if ( !Util.is_string_empty?( value ) )
+          return value
+        else
+          puts()
+          puts(  "value for option \"" + option_name + "\" is not set")
+          puts()
+          exit( -1 )
+        end
+      else
+        raise( ArgumentError, "option \"" + option_name +
+        "\" is not set", caller )
+      end
+    end
+
+    def get_option_value_as_int( option_name )
+      return get_option_value( option_name ).to_i
+    end
+
+    def get_option_value_as_float( option_name )
+      return get_option_value( option_name ).to_f
+    end
+
+    # mandatory_options (Array)
+    #
+    def validate_mandatory_options( mandatory_options )
+      o = get_all_options
+      missing = Array.new
+      for ma in mandatory_options
+        if ( !o.has_key?( ma ) )
+          missing.push( ma )
+        end
+      end
+      return missing
+    end
+
+    # mandatory_options (Array)
+    #
+    def validate_mandatory_options_as_str( mandatory_options )
+      missing = validate_mandatory_options( mandatory_options )
+      return missing.join( ", " )
+    end
+
+    # allowed_options (Array)
+    #
+    def validate_allowed_options( allowed_options )
+      o = get_all_options
+      disallowed = Array.new
+      o.each_key { |op|
+        if ( !allowed_options.include?( op ) )
+          disallowed.push( op )
+        end
+      }
+      return disallowed
+    end
+
+    # allowed_options (Array)
+    #
+    def validate_allowed_options_as_str( allowed_options )
+      disallowed = validate_allowed_options( allowed_options )
+      return disallowed.join( ", " )
+    end
+
+    private
+
+    def get_all_options
+      o = Hash.new
+      o.merge!( get_options_list )
+      o.merge!( get_extended_options_list )
+      return o
+    end
+
+    def parse_arguments( args )
+      for arg in args
+        if ( arg.index( EXTENDED_OPTIONS_PREFIX ) == 0 )
+          parse_option( arg.slice( EXTENDED_OPTIONS_PREFIX.length, arg.length() - 1 ),
+          get_extended_options_list )
+
+        elsif ( arg.index( OPTIONS_PREFIX ) == 0 )
+          parse_option( arg.slice( OPTIONS_PREFIX.length, arg.length() - 1 ),
+          get_options_list )
+
+        else
+          get_file_names.push( arg )
+        end
+      end
+    end
+
+    # raises ArgumentError
+    def parse_option( option, options_map )
+      sep_index = option.index( OPTIONS_SEPARATOR )
+      if ( sep_index == nil )
+        if ( Util.is_string_empty?( option ) )
+          raise( ArgumentError, "attempt to set option with an empty name" )
+        end
+        if ( get_all_options.has_key?( option ) )
+          raise( ArgumentError, "attempt to set option \"" +
+          option + "\" mutiple times" )
+        end
+        options_map[ option ] = ""
+      else
+        key = option.slice( 0, sep_index )
+        value = option.slice( sep_index + 1, option.length() - 1 )
+        if ( Util.is_string_empty?( key ) )
+          raise( ArgumentError, "attempt to set option with an empty name" )
+        end
+        if ( Util.is_string_empty?( value ) )
+          raise( ArgumentError, "attempt to set option with an empty value" )
+        end
+        if ( get_all_options.has_key?( key ) )
+          raise( ArgumentError, "attempt to set option \"" +
+          key + "\" mutiple times [" + option + "]" )
+        end
+        options_map[ key ] = value
+      end
+    end
+
+    def get_file_names_list
+      return @file_names
+    end
+
+    def get_options_list
+      return @options
+    end
+
+    def get_extended_options_list
+      return @extended_options
+    end
+
+  end # class CommandLineArguments
  
  end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/util/constants.rb b/forester/ruby/evoruby/lib/evo/util/constants.rb

index 6546478..b54801b 100644 (file)
--- a/forester/ruby/evoruby/lib/evo/util/constants.rb
+++ b/forester/ruby/evoruby/lib/evo/util/constants.rb
@@ -1,33 +1,35 @@
  #
  # = lib/evo/util/constants.rb - Constants class
  #
-# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
-#
-# $Id: constants.rb,v 1.3 2007/12/21 04:13:33 cmzmasek Exp $
-#
-# last modified: 05/11/2007
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
  
  module Evoruby
+  class Constants
  
-    class Constants
-
-        VERBOSE = true
+    VERBOSE = true
  
-        EVORUBY_VERSION = '1.0'
+    EVORUBY_VERSION = '1.1'
  
-        FORESTER_HOME_ENV_VARIABLE = 'FORESTER_HOME'
-        JAVA_HOME_ENV_VARIABLE     = 'JAVA_HOME'
+    ID_NORMALIZED_FASTA_FILE_SUFFIX          = "_ni.fasta"
+    ID_MAP_FILE_SUFFIX                       = ".nim"
+    DOMAIN_TABLE_SUFFIX                      = "_domain_table"
+    DOMAINS_TO_FORESTER_OUTFILE_SUFFIX       = ".dff"
+    DOMAINS_TO_FORESTER_EVALUE_CUTOFF_SUFFIX = "_dtfE"
+    
+    PFAM_V_FOR_EX                             = "300" # Pfam version for examples
  
-        EVORUBY         = 'evoruby'
+    FORESTER_HOME_ENV_VARIABLE = 'FORESTER_HOME'
+    JAVA_HOME_ENV_VARIABLE     = 'JAVA_HOME'
  
-        LINE_DELIMITER  = "\n"
+    EVORUBY         = 'evoruby'
  
-        FILE_SEPARATOR  = File::SEPARATOR
+    LINE_DELIMITER  = "\n"
  
-        DOMAIN_STRUCTURE_NHX_SEPARATOR = '>'
+    FILE_SEPARATOR  = File::SEPARATOR
  
+    DOMAIN_STRUCTURE_NHX_SEPARATOR = '>'
  
-    end # class Constants
+  end # class Constants
  
  end # module Evoruby
diff --git a/forester/ruby/evoruby/lib/evo/util/util.rb b/forester/ruby/evoruby/lib/evo/util/util.rb

index 27bdb5e..80916ca 100644 (file)
--- a/forester/ruby/evoruby/lib/evo/util/util.rb
+++ b/forester/ruby/evoruby/lib/evo/util/util.rb
@@ -1,18 +1,26 @@
  #
  # = lib/evo/util/util.rb - Util class
  #
-# Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
-# License::    GNU Lesser General Public License (LGPL)
+# Copyright::    Copyright (C) 2017 Christian M. Zmasek
+# License::      GNU Lesser General Public License (LGPL)
  #
-# $Id: util.rb,v 1.17 2009/10/06 22:22:46 cmzmasek Exp $
-#
-# last modified: 05/15/2007
+# Last modified: 2017/02/07
  
  require 'lib/evo/util/constants'
  
  module Evoruby
-
    class Util
+    def Util.get_matching_files( files, prefix_pattern, suffix_pattern )
+      matching_files = Array.new
+      files.each { | file |
+        if ( !File.directory?( file ) &&
+        file !~ /^\./ &&
+        file =~ /^#{prefix_pattern}.*#{suffix_pattern}$/ )
+          matching_files << file
+        end
+      }
+      matching_files
+    end
  
      def Util.normalize_seq_name( name, length, exception_if_too_long = false )
        if name.length > length
@@ -22,7 +30,8 @@ module Evoruby
          end
          name = name[ 0, length ]
        elsif name.length < length
-        for i in 0 ... length - name.length
+        t = length - name.length
+        t.times do
            name = name + " "
          end
        end
@@ -104,7 +113,6 @@ module Evoruby
        value
      end
  
-
      # raises ArgumentError
      def Util.file2array( path, split_by_semicolon )
        Util.check_file_for_readability( path )
@@ -130,18 +138,11 @@ module Evoruby
      end
  
      def Util.print_program_information( prg_name,
-        prg_version,
-        prg_desc,
-        date,
-        copyright,
-        contact,
-        www,
-        io = STDOUT )
-
-    #  if RUBY_VERSION !~ /1.9/
-    #    puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " )
-    #    exit( -1 )
-    #  end
+      prg_version,
+      prg_desc,
+      date,
+      www,
+      io = STDOUT )
  
        ruby_version = RUBY_VERSION
        l = prg_name.length + prg_version.length + date.length + ruby_version.length + 12
@@ -156,11 +157,7 @@ module Evoruby
        io.print( prg_desc )
        io.print( Constants::LINE_DELIMITER )
        io.print( Constants::LINE_DELIMITER )
-      io.print( "Copyright (C) " + copyright )
-      io.print( Constants::LINE_DELIMITER )
-      io.print( "Contact: " + contact )
-      io.print( Constants::LINE_DELIMITER )
-      io.print( "         " + www )
+      io.print( "Website: " + www )
        io.print( Constants::LINE_DELIMITER )
        io.print( Constants::LINE_DELIMITER )
      end
author	cmzmasek <cmzmasek@yahoo.com>
	Tue, 14 Feb 2017 18:43:57 +0000 (10:43 -0800)
committer	cmzmasek <cmzmasek@yahoo.com>
	Tue, 14 Feb 2017 18:43:57 +0000 (10:43 -0800)
forester/java/src/org/forester/archaeopteryx/Configuration.java		patch \| blob \| history
forester/java/src/org/forester/archaeopteryx/ControlPanel.java		patch \| blob \| history
forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java		patch \| blob \| history
forester/java/src/org/forester/phylogeny/PhylogenyMethods.java		patch \| blob \| history
forester/perl/forester.pm		patch \| blob \| history
forester/perl/phylo_pl.pl		patch \| blob \| history
forester/ruby/evoruby/exe/tap.rb		patch \| blob \| history
forester/ruby/evoruby/lib/evo/io/parser/fasta_parser.rb		patch \| blob \| history
forester/ruby/evoruby/lib/evo/io/parser/general_msa_parser.rb		patch \| blob \| history
forester/ruby/evoruby/lib/evo/io/parser/hmmscan_parser.rb		patch \| blob \| history
forester/ruby/evoruby/lib/evo/msa/msa.rb		patch \| blob \| history
forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb		patch \| blob \| history
forester/ruby/evoruby/lib/evo/tool/domains_to_forester.rb		patch \| blob \| history
forester/ruby/evoruby/lib/evo/tool/fasta_taxonomy_processor.rb		patch \| blob \| history
forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb		patch \| blob \| history
forester/ruby/evoruby/lib/evo/tool/phylogenies_decorator.rb		patch \| blob \| history
forester/ruby/evoruby/lib/evo/tool/phylogeny_factory.rb		patch \| blob \| history
forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb		patch \| blob \| history
forester/ruby/evoruby/lib/evo/util/command_line_arguments.rb		patch \| blob \| history
forester/ruby/evoruby/lib/evo/util/constants.rb		patch \| blob \| history
forester/ruby/evoruby/lib/evo/util/util.rb		patch \| blob \| history