From: cmzmasek Date: Tue, 14 Feb 2017 18:43:57 +0000 (-0800) Subject: in progress X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=a2be52673716ddf6f158c0831075d47635779e64;p=jalview.git in progress --- diff --git a/forester/java/src/org/forester/archaeopteryx/Configuration.java b/forester/java/src/org/forester/archaeopteryx/Configuration.java index 3418951..f764e7f 100644 --- a/forester/java/src/org/forester/archaeopteryx/Configuration.java +++ b/forester/java/src/org/forester/archaeopteryx/Configuration.java @@ -1665,6 +1665,10 @@ public final class Configuration { return _gui_menu_text_color; } + static int getGuiFontSize() { + return 11; + } + int getMaxBaseFontSize() { return _max_base_font_size; } diff --git a/forester/java/src/org/forester/archaeopteryx/ControlPanel.java b/forester/java/src/org/forester/archaeopteryx/ControlPanel.java index d89204e..05d3ca0 100644 --- a/forester/java/src/org/forester/archaeopteryx/ControlPanel.java +++ b/forester/java/src/org/forester/archaeopteryx/ControlPanel.java @@ -102,11 +102,11 @@ final class ControlPanel extends JPanel implements ActionListener { ORDER_SUBTREE; } final static Font jcb_bold_font = new Font( Configuration - .getDefaultFontFamilyName(), Font.BOLD, 9 ); + .getDefaultFontFamilyName(), Font.BOLD, Configuration.getGuiFontSize() ); final static Font jcb_font = new Font( Configuration - .getDefaultFontFamilyName(), Font.PLAIN, 9 ); + .getDefaultFontFamilyName(), Font.PLAIN, Configuration.getGuiFontSize()); final static Font js_font = new Font( Configuration - .getDefaultFontFamilyName(), Font.PLAIN, 9 ); + .getDefaultFontFamilyName(), Font.PLAIN, Configuration.getGuiFontSize() ); private static final String RETURN_TO_SUPER_TREE_TEXT = "R"; private static final String SEARCH_TIP_TEXT = "Enter text to search for. Use ',' for logical OR and '+' for logical AND (not used in this manner for regular expression searches)."; private static final long serialVersionUID = -8463483932821545633L; diff --git a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java index 780dcc3..e44430c 100644 --- a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java +++ b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java @@ -103,8 +103,8 @@ import org.forester.util.ForesterUtil; public final class MainFrameApplication extends MainFrame { - private final static int FRAME_X_SIZE = 800; - private final static int FRAME_Y_SIZE = 800; + private final static int FRAME_X_SIZE = 900; + private final static int FRAME_Y_SIZE = 900; // Filters for the file-open dialog (classes defined in this file) private static final long serialVersionUID = -799735726778865234L; private static final boolean PREPROCESS_TREES = false; diff --git a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java index f31c657..ee17de9 100644 --- a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java +++ b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java @@ -1696,6 +1696,9 @@ public class PhylogenyMethods { n.getNodeData().getTaxonomy().setIdentifier( new Identifier( name ) ); break; } + case CLADE_NAME: + n.setName( name ); + break; default: { throw new IllegalArgumentException( "don't know what to do with " + field ); } diff --git a/forester/perl/forester.pm b/forester/perl/forester.pm index f8e58cb..ef0654f 100755 --- a/forester/perl/forester.pm +++ b/forester/perl/forester.pm @@ -166,12 +166,12 @@ our @EXPORT = qw( executeConsense # Software directory: # --------------------- -our $SOFTWARE_DIR = "/home/czmasek/SOFTWARE/"; +our $SOFTWARE_DIR = "/home/zma/SOFTWARE/"; # Java virtual machine: # --------------------- -our $JAVA = $SOFTWARE_DIR."JAVA/jdk1.6.0_03/bin/java"; +our $JAVA = "java"; # Where all the temporary files can be created: @@ -181,49 +181,49 @@ our $TEMP_DIR_DEFAULT = "/tmp/"; # Programs from Joe Felsenstein's PHYLIP package: # ----------------------------------------------- -our $SEQBOOT = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/seqboot"; -our $NEIGHBOR = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/neighbor"; -our $PROTPARS = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/protpars"; -our $PROML = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/proml"; -our $FITCH = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/fitch"; -our $CONSENSE = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/consense"; -our $PHYLIP_VERSION = "3.68"; +our $SEQBOOT = $SOFTWARE_DIR."PHYLO/Phylip/Phylip3.695/phylip-3.696/exe/seqboot"; +our $NEIGHBOR = $SOFTWARE_DIR."PHYLO/Phylip/Phylip3.695/phylip-3.696/exe/neighbor"; +our $PROTPARS = $SOFTWARE_DIR."PHYLO/Phylip/Phylip3.695/phylip-3.696/exe/protpars"; +our $PROML = $SOFTWARE_DIR."PHYLO/Phylip/Phylip3.695/phylip-3.696/exe/proml"; +our $FITCH = $SOFTWARE_DIR."PHYLO/Phylip/Phylip3.695/phylip-3.696/exe/fitch"; +our $CONSENSE = $SOFTWARE_DIR."PHYLO/Phylip/Phylip3.695/phylip-3.696/exe/consense"; +our $PHYLIP_VERSION = "3.695"; # TREE-PUZZLE: # ------------ -our $PUZZLE = $SOFTWARE_DIR."TREE_PUZZLE/tree-puzzle-5.2/src/puzzle"; +our $PUZZLE = $SOFTWARE_DIR."PHYLO/TREE-PUZZLE/tree-puzzle-5.2/src/puzzle"; our $PUZZLE_VERSION = "5.2"; # FASTME: # ----------------------------------------------------- -our $FASTME = $SOFTWARE_DIR."FASTME/fastme2.0/fastme"; +our $FASTME = $SOFTWARE_DIR."PHYLO/FastME/fastme2.0/fastme"; our $FASTME_VERSION = "2.0"; # BIONJ: # ----------------------------------------------------- -our $BIONJ = $SOFTWARE_DIR."BIONJ/bionj"; -our $BIONJ_VERSION = "[1997]"; +our $BIONJ = ""; +our $BIONJ_VERSION = ""; # WEIGHBOR: # ----------------------------------------------------- -our $WEIGHBOR = $SOFTWARE_DIR."WEIGHBOR/Weighbor/weighbor"; -our $WEIGHBOR_VERSION = "1.2.1"; +our $WEIGHBOR = ""; +our $WEIGHBOR_VERSION = ""; # PHYML: # ----------------------------------------------------- -our $PHYML = $SOFTWARE_DIR."PHYML/phyml_v2.4.4/exe/phyml_linux"; -our $PHYML_VERSION = "2.4.4"; +our $PHYML = $SOFTWARE_DIR."PHYLO/PhyML/PhyML-3.1/PhyML-3.1/PhyML-3.1_linux64"; +our $PHYML_VERSION = "3.1"; # RAXML: # ----------------------------------------------------- -our $RAXML = $SOFTWARE_DIR."RAXML/RAxML-7.0.4/raxmlHPC"; -our $RAXML_VERSION = "7.0.4"; +our $RAXML = $SOFTWARE_DIR."PHYLO/RAxML/20161215/standard-RAxML-master/raxmlHPC-AVX"; +our $RAXML_VERSION = "8.2.9"; -# forester.jar. This jar file is currently available at: http://www.phylosoft.org -# ------------------------------------------------------------------------------- +# forester.jar. This jar file is currently available at: https://sites.google.com/site/cmzmasek/home/software/forester +# -------------------------------------------------------------------------------------------------------------------- -our $FORESTER_JAR = $SOFTWARE_DIR."FORESTER/DEV/forester/forester/java/forester.jar"; +our $FORESTER_JAR = "/home/zma/git/forester/forester/java/forester.jar"; @@ -734,7 +734,7 @@ sub executeFastme { &testForTextFilePresence( $inpwd ); my $command = ""; - if ( $bs > 0 ) { + if ( $bs > 1 ) { $command = "$FASTME -b $init_opt -i $inpwd -n $bs -s b"; } else { diff --git a/forester/perl/phylo_pl.pl b/forester/perl/phylo_pl.pl index abdfc2f..220fdd6 100755 --- a/forester/perl/phylo_pl.pl +++ b/forester/perl/phylo_pl.pl @@ -1,12 +1,9 @@ #!/usr/bin/perl -W # -# $Id: phylo_pl.pl,v 1.32 2010/12/13 19:00:22 cmzmasek Exp $ -# # FORESTER -- software libraries and applications # for evolutionary biology research and applications. # -# Copyright (C) 2008-2014 Christian M. Zmasek -# Copyright (C) 2008-2009 Burnham Institute for Medical Research +# Copyright (C) 2017 Christian M. Zmasek # All rights reserved # # This library is free software; you can redistribute it and/or @@ -23,8 +20,8 @@ # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA # -# Contact: phylosoft @ gmail . com -# WWW: www.phylosoft.org/forester +# Contact: cmzmasek at yahoo dot com +# WWW: https://sites.google.com/site/cmzmasek/home/software/forester # # # @@ -61,7 +58,7 @@ use lib $FindBin::Bin; use forester; my $VERSION = "1.0.1"; -my $LAST_MODIFIED = "2009.10.02"; +my $LAST_MODIFIED = "2017/02/07"; my $RAXML_MODEL_BASE = "PROTGAMMA"; my $RAXML_ALGORITHM = "a"; @@ -1669,13 +1666,12 @@ Y sub printUsage { print < :replace, :undef => :replace, :replace => "?") if can_ignore?( line, saw_first_seq ) elsif line =~ /^\s*>\s*(.+)/ diff --git a/forester/ruby/evoruby/lib/evo/io/parser/general_msa_parser.rb b/forester/ruby/evoruby/lib/evo/io/parser/general_msa_parser.rb index dbab203..6ef36bf 100644 --- a/forester/ruby/evoruby/lib/evo/io/parser/general_msa_parser.rb +++ b/forester/ruby/evoruby/lib/evo/io/parser/general_msa_parser.rb @@ -1,10 +1,10 @@ # # = lib/evo/io/parser/general_msa_parser - GeneralMsaParser class # -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) # -# last modified: 2009/10/08 +# Last modified: 2017/02/07 require 'lib/evo/io/parser/msa_parser' require 'lib/evo/msa/msa' diff --git a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_parser.rb b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_parser.rb index c148cdf..a3c7b6a 100644 --- a/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_parser.rb +++ b/forester/ruby/evoruby/lib/evo/io/parser/hmmscan_parser.rb @@ -1,10 +1,12 @@ # -# To change this template, choose Tools | Templates -# and open the template in the editor. - +# = lib/evo/io/parser/hmmscan_parser.rb - HmmscanParser class +# +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# Last modified: 2017/02/12 class HmmscanParser - def initialize file @file = file end @@ -51,6 +53,8 @@ class HmmscanParser r.env_from = $20.to_i r.env_to = $21.to_i + r.desc = $23 + if r.number > r.out_of || r.hmm_from > r.hmm_to || r.ali_from > r.ali_to || r.env_from > r.env_to raise IOError, "illogical format: " + line end @@ -81,5 +85,6 @@ class HmmscanResult attr_accessor :ali_to attr_accessor :env_from attr_accessor :env_to + attr_accessor :desc end diff --git a/forester/ruby/evoruby/lib/evo/msa/msa.rb b/forester/ruby/evoruby/lib/evo/msa/msa.rb index 579941d..549ae2c 100644 --- a/forester/ruby/evoruby/lib/evo/msa/msa.rb +++ b/forester/ruby/evoruby/lib/evo/msa/msa.rb @@ -1,11 +1,10 @@ # # = lib/evo/msa/msa.rb - Msa class # -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: msa.rb,v 1.11 2009/01/03 00:42:08 cmzmasek Exp $ +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) # +# Last modified: 2017/02/07 require 'lib/evo/util/constants' @@ -517,8 +516,7 @@ module Evoruby x = get_number_of_seqs / n for i in 0 ... n msa = Msa.new() - s = 0 - + #s = 0 if ( ( r > 0 ) && ( i == ( n - 1 ) ) ) y = x + r if ( verbose ) diff --git a/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb index 5c8619d..a8144ec 100644 --- a/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/tool/domain_sequence_extractor.rb @@ -1,11 +1,8 @@ # -# = lib/evo/apps/domain_sequence_extractor.rb - DomainSequenceExtractor class +# = lib/evo/apps/taxonomy_processor - TaxonomyProcessor class # -# Copyright:: Copyright (C) 2012 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id:Exp $ - +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) require 'lib/evo/util/constants' require 'lib/evo/util/util' @@ -13,16 +10,13 @@ require 'lib/evo/util/command_line_arguments' require 'lib/evo/io/parser/hmmscan_domain_extractor' module Evoruby - class DomainSequenceExtractor PRG_NAME = "dsx" - PRG_VERSION = "2.000" + PRG_VERSION = "2.001" PRG_DESC = "extraction of domain sequences from hmmscan output" - PRG_DATE = "20121001" - COPYRIGHT = "2012 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" + PRG_DATE = "20170213" + WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" E_VALUE_THRESHOLD_OPTION = 'e' LENGTH_THRESHOLD_OPTION = 'l' @@ -35,17 +29,14 @@ module Evoruby FAILED_SEQS_SUFFIX = '_with_no_passing_domains.fasta' HELP_OPTION_1 = 'help' HELP_OPTION_2 = 'h' - def run() Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC , - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) + PRG_VERSION, + PRG_DESC , + PRG_DATE, + WWW, + STDOUT ) ld = Constants::LINE_DELIMITER @@ -56,7 +47,7 @@ module Evoruby end if ( cla.is_option_set?( HELP_OPTION_1 ) || - cla.is_option_set?( HELP_OPTION_2 ) ) + cla.is_option_set?( HELP_OPTION_2 ) ) print_help exit( 0 ) end @@ -77,8 +68,8 @@ module Evoruby disallowed = cla.validate_allowed_options_as_str( allowed_opts ) if ( disallowed.length > 0 ) Util.fatal_error( PRG_NAME, - "unknown option(s): " + disallowed, - STDOUT ) + "unknown option(s): " + disallowed, + STDOUT ) end domain_id = cla.get_file_name( 0 ) @@ -92,7 +83,6 @@ module Evoruby outfile = outfile[ 0 .. outfile.length - 5 ] end - add_position = false if ( cla.is_option_set?( ADD_POSITION_OPTION ) ) add_position = true @@ -132,7 +122,6 @@ module Evoruby end end - min_linker = nil if ( cla.is_option_set?( MIN_LINKER_OPT ) ) begin @@ -145,7 +134,6 @@ module Evoruby end end - log = String.new puts() @@ -183,7 +171,6 @@ module Evoruby end - if ( add_position ) puts( "Add positions (rel to complete seq) to extracted domains: true" ) log << "Add positions (rel to complete seq) to extracted domains: true" + ld @@ -206,18 +193,18 @@ module Evoruby begin parser = HmmscanDomainExtractor.new() domain_count = parser.parse( domain_id, - hmmsearch_output, - fasta_sequence_file, - outfile, - outfile + PASSED_SEQS_SUFFIX, - outfile + FAILED_SEQS_SUFFIX, - e_value_threshold, - length_threshold, - add_position, - add_domain_number, - add_species, - min_linker, - log ) + hmmsearch_output, + fasta_sequence_file, + outfile, + outfile + PASSED_SEQS_SUFFIX, + outfile + FAILED_SEQS_SUFFIX, + e_value_threshold, + length_threshold, + add_position, + add_domain_number, + add_species, + min_linker, + log ) rescue ArgumentError, IOError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) diff --git a/forester/ruby/evoruby/lib/evo/tool/domains_to_forester.rb b/forester/ruby/evoruby/lib/evo/tool/domains_to_forester.rb index f08236d..b928468 100644 --- a/forester/ruby/evoruby/lib/evo/tool/domains_to_forester.rb +++ b/forester/ruby/evoruby/lib/evo/tool/domains_to_forester.rb @@ -1,12 +1,8 @@ # # = lib/evo/apps/domains_to_forester - DomainsToForester class # -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: Exp $ -# -# last modified: 06/11/2007 +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) require 'lib/evo/util/constants' require 'lib/evo/util/util' @@ -17,28 +13,24 @@ require 'lib/evo/sequence/protein_domain' require 'lib/evo/sequence/domain_structure' module Evoruby - class DomainsToForester PRG_NAME = "d2f" - PRG_DESC = "parsed hmmpfam output to forester format" - PRG_VERSION = "1.001" - PRG_DATE = "20120807" - COPYRIGHT = "2012 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" + PRG_DESC = "converting of parsed hmmpfam output to forester format" + PRG_VERSION = "1.002" + PRG_DATE = "20170213" + WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" E_VALUE_THRESHOLD_OPTION = "e" OVERWRITE_IF_SAME_FROM_TO_OPTION = "o" HELP_OPTION_1 = "help" HELP_OPTION_2 = "h" - def parse( domains_list_file, - original_seqs_file, - outfile, - column_delimiter, - e_value_threshold, - overwrite_if_same_from_to ) + original_seqs_file, + outfile, + column_delimiter, + e_value_threshold, + overwrite_if_same_from_to ) Util.check_file_for_readability( domains_list_file ) Util.check_file_for_readability( original_seqs_file ) Util.check_file_for_writability( outfile ) @@ -56,7 +48,7 @@ module Evoruby File.open( domains_list_file ) do | file | while line = file.gets if !is_ignorable?( line ) - + a = line.split( column_delimiter ) l = a.length if ( ( l < 4 ) || ( e_value_threshold >= 0.0 && l < 5 ) ) @@ -67,14 +59,7 @@ module Evoruby domain_name = a[ 1 ] seq_from = -1 seq_to = -1 - ########################################## - if domain_name =~ /RRM_\d/ - puts "ignoring " + line - next - end - ########################################## - - + begin seq_from = a[ 2 ].to_i rescue Exception @@ -134,19 +119,19 @@ module Evoruby end # parse - - - def run() Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC, - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) + PRG_VERSION, + PRG_DESC, + PRG_DATE, + WWW, + STDOUT ) + + if ( ARGV == nil || ( ARGV.length < 1 ) ) + print_help + exit( -1 ) + end begin cla = CommandLineArguments.new( ARGV ) @@ -155,12 +140,12 @@ module Evoruby end if ( cla.is_option_set?( HELP_OPTION_1 ) || - cla.is_option_set?( HELP_OPTION_2 ) ) + cla.is_option_set?( HELP_OPTION_2 ) ) print_help exit( 0 ) end - if cla.get_number_of_files != 3 + unless ( cla.get_number_of_files == 1 || cla.get_number_of_files == 2 || cla.get_number_of_files == 3 ) print_help exit( -1 ) end @@ -172,15 +157,10 @@ module Evoruby disallowed = cla.validate_allowed_options_as_str( allowed_opts ) if ( disallowed.length > 0 ) Util.fatal_error( PRG_NAME, - "unknown option(s): " + disallowed, - STDOUT ) + "unknown option(s): " + disallowed, + STDOUT ) end - domains_list_file = cla.get_file_name( 0 ) - original_sequences_file = cla.get_file_name( 1 ) - outfile = cla.get_file_name( 2 ) - - e_value_threshold = -1.0 if cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) begin @@ -192,35 +172,73 @@ module Evoruby Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT ) end end + + domains_list_file = cla.get_file_name( 0 ) + original_sequences_file = "" + outfile = "" + if (cla.get_number_of_files == 3) + original_sequences_file = cla.get_file_name( 1 ) + outfile = cla.get_file_name( 2 ) + elsif (cla.get_number_of_files == 1 || cla.get_number_of_files == 2 ) + if ( cla.get_number_of_files == 2 ) + original_sequences_file = cla.get_file_name( 1 ) + else + hmmscan_index = domains_list_file.index("hmmscan") + if ( hmmscan_index != nil ) + prefix = domains_list_file[0 .. hmmscan_index-1 ] + suffix = Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX + files = Dir.entries( "." ) + matching_files = Util.get_matching_files( files, prefix, suffix) + if matching_files.length < 1 + Util.fatal_error( PRG_NAME, 'no file matching [' + prefix + + '...' + suffix + '] present in current directory: need to indicate as second argument' ) + end + if matching_files.length > 1 + Util.fatal_error( PRG_NAME, 'more than one file matching [' + + prefix + '...' + suffix + '] present in current directory: need to indicate as second argument' ) + end + original_sequences_file = matching_files[ 0 ] + end + end + outfile = domains_list_file + if (outfile.end_with?(Constants::DOMAIN_TABLE_SUFFIX) ) + outfile = outfile.chomp(Constants::DOMAIN_TABLE_SUFFIX) + end + if ( e_value_threshold >= 0.0 ) + outfile = outfile + Constants::DOMAINS_TO_FORESTER_EVALUE_CUTOFF_SUFFIX + e_value_threshold.to_s + end + outfile = outfile + Constants::DOMAINS_TO_FORESTER_OUTFILE_SUFFIX + end + overwrite_if_same_from_to = false if ( cla.is_option_set?( OVERWRITE_IF_SAME_FROM_TO_OPTION ) ) overwrite_if_same_from_to = true end puts - puts( "Domains list file : " + domains_list_file ) - puts( "Fasta sequencefile (complete sequences): " + original_sequences_file ) - puts( "Outputfile : " + outfile ) + puts( "Domain table : " + domains_list_file ) + puts( "Fasta sequence file (complete sequences): " + original_sequences_file ) + puts( "Outputfile : " + outfile ) if ( e_value_threshold >= 0.0 ) - puts( "E-value threshold : " + e_value_threshold.to_s ) + puts( "E-value threshold : " + e_value_threshold.to_s ) else - puts( "E-value threshold : no threshold" ) + puts( "E-value threshold : no threshold" ) end if ( overwrite_if_same_from_to ) - puts( "Overwrite if same from and to : true" ) + puts( "Overwrite if same from and to : true" ) else - puts( "Overwrite if same from and to : false" ) + puts( "Overwrite if same from and to : false" ) end puts begin parse( domains_list_file, - original_sequences_file, - outfile, - " ", - e_value_threshold, - overwrite_if_same_from_to ) + original_sequences_file, + outfile, + " ", + e_value_threshold, + overwrite_if_same_from_to ) rescue ArgumentError, IOError, StandardError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) @@ -228,8 +246,9 @@ module Evoruby Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT ) end - puts + Util.print_message( PRG_NAME, "wrote: " + outfile ) + Util.print_message( PRG_NAME, "next steps in standard analysis pipeline: hmmsearch followed by dsx.rb") Util.print_message( PRG_NAME, 'OK' ) puts @@ -241,21 +260,25 @@ module Evoruby puts puts( "Usage:" ) puts - puts( " " + PRG_NAME + ".rb [options] " ) + puts( " " + PRG_NAME + ".rb [options] [file containing complete sequences in fasta format] [outputfile]" ) puts() puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "= : E-value threshold, default is no threshold" ) puts( " -" + OVERWRITE_IF_SAME_FROM_TO_OPTION + " : overwrite domain with same start and end with domain with better E-value" ) puts + puts( "Examples:" ) + puts + puts( " " + PRG_NAME + ".rb P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table P53_ni.fasta P53_hmmscan_300_10.dff" ) + puts + puts( " " + PRG_NAME + ".rb P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table P53_ni.fasta" ) + puts + puts( " " + PRG_NAME + ".rb P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table" ) + puts() end - - def is_ignorable?( line ) return ( line !~ /[A-Za-z0-9-]/ || line =~ /^\s*#/) end - end # class DomainsToForester - end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/tool/fasta_taxonomy_processor.rb b/forester/ruby/evoruby/lib/evo/tool/fasta_taxonomy_processor.rb index 6ae3cf1..96c6a2f 100644 --- a/forester/ruby/evoruby/lib/evo/tool/fasta_taxonomy_processor.rb +++ b/forester/ruby/evoruby/lib/evo/tool/fasta_taxonomy_processor.rb @@ -3,9 +3,6 @@ # # Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek # License:: GNU Lesser General Public License (LGPL) -# -# $Id: fasta_taxonomy_processor.rb,v 1.4 2010/12/13 19:00:11 cmzmasek Exp $ - require 'lib/evo/util/util' require 'lib/evo/msa/msa_factory' @@ -19,187 +16,183 @@ require 'lib/evo/util/command_line_arguments' require 'lib/evo/apps/tseq_taxonomy_processor' module Evoruby + class FastaTaxonomyProcessor + + PRG_NAME = "fasta_tap" + PRG_DATE = "2009.01.20" + PRG_DESC = "preprocessing of multiple sequence files in ncbi fasta format" + PRG_VERSION = "1.00" + WWW = "www.phylosoft.org" + def initialize() + @tax_ids_to_sp_taxonomies = Hash.new() + end - class FastaTaxonomyProcessor + def run() + + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC, + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) + + if ARGV == nil || ARGV.length != 4 + puts( "Usage: #{PRG_NAME}.rb " ) + puts() + exit( -1 ) + end + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + allowed_opts = Array.new + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed ) + end + + sp_taxonomy_infile = cla.get_file_name( 0 ) + sequences_infile = cla.get_file_name( 1 ) + sequences_outfile = cla.get_file_name( 2 ) + mapping_outfile = cla.get_file_name( 3 ) + + Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile ) + Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile ) + Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile ) + Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile ) + + sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile ) + + Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile ) + + fasta_parser = FastaParser.new + msa_fac = MsaFactory.new + + seqs = msa_fac.create_msa_from_file( sequences_infile, fasta_parser ) + + Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile ) + + removed = seqs.remove_redundant_sequences!( true, true ) + + if removed.size > 0 + Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" ) + removed.each { | seq_name | + puts seq_name + } + Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" ) + end + + mapping_out = File.open( mapping_outfile, "a" ) + + for i in 0 ... seqs.get_number_of_seqs + seq = seqs.get_sequence( i ) + seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) ) + end + + io = MsaIO.new() + + w = FastaWriter.new() + + w.set_max_name_length( 10 ) + w.clean( true ) + begin + io.write_to_file( seqs, sequences_outfile, w ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s ) + end + mapping_out.close() + + Util.print_message( PRG_NAME, "wrote: " + mapping_outfile ) + Util.print_message( PRG_NAME, "wrote: " + sequences_outfile ) + Util.print_message( PRG_NAME, "OK" ) - PRG_NAME = "fasta_tap" - PRG_DATE = "2009.01.20" - PRG_DESC = "preprocessing of multiple sequence files in ncbi fasta format" - PRG_VERSION = "1.00" - COPYRIGHT = "2009 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" + end - def initialize() - @tax_ids_to_sp_taxonomies = Hash.new() - end + private - def run() - - Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC, - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) - - if ARGV == nil || ARGV.length != 4 - puts( "Usage: #{PRG_NAME}.rb " ) - puts() - exit( -1 ) - end - - begin - cla = CommandLineArguments.new( ARGV ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - allowed_opts = Array.new - disallowed = cla.validate_allowed_options_as_str( allowed_opts ) - if ( disallowed.length > 0 ) - Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed ) - end - - sp_taxonomy_infile = cla.get_file_name( 0 ) - sequences_infile = cla.get_file_name( 1 ) - sequences_outfile = cla.get_file_name( 2 ) - mapping_outfile = cla.get_file_name( 3 ) - - Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile ) - Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile ) - Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile ) - Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile ) - - sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile ) - - Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile ) - - fasta_parser = FastaParser.new - msa_fac = MsaFactory.new - - seqs = msa_fac.create_msa_from_file( sequences_infile, fasta_parser ) - - Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile ) - - removed = seqs.remove_redundant_sequences!( true, true ) - - if removed.size > 0 - Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" ) - removed.each { | seq_name | - puts seq_name - } - Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" ) - end - - mapping_out = File.open( mapping_outfile, "a" ) - - for i in 0 ... seqs.get_number_of_seqs - seq = seqs.get_sequence( i ) - seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) ) - end - - io = MsaIO.new() - - w = FastaWriter.new() - - w.set_max_name_length( 10 ) - w.clean( true ) - begin - io.write_to_file( seqs, sequences_outfile, w ) - rescue Exception => e - Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s ) - end - mapping_out.close() - - Util.print_message( PRG_NAME, "wrote: " + mapping_outfile ) - Util.print_message( PRG_NAME, "wrote: " + sequences_outfile ) - Util.print_message( PRG_NAME, "OK" ) + def modify_name( seq, i, sp_taxonomies, mapping_outfile ) - end + #i = i + 1792 - private - - def modify_name( seq, i, sp_taxonomies, mapping_outfile ) - - #i = i + 1792 - - seq_desc = seq.get_name - - taxonomy_sn = nil - - if seq_desc =~ /\[(.+)\]/ - taxonomy_sn = $1 - else - Util.fatal_error( PRG_NAME, "no taxonomy in [" + seq_desc + "]" ) - end - - matching_sp_taxonomy = nil - - sp_taxonomies.each { |sp_taxonomy| - if ( sp_taxonomy.scientific_name == taxonomy_sn ) - matching_sp_taxonomy = sp_taxonomy - end - } - - if matching_sp_taxonomy == nil - Util.fatal_error( PRG_NAME, "taxonomy [" + taxonomy_sn + "] for [" + seq_desc + "] not found" ) - end - - new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code - - gi = nil - if seq_desc =~ /gi\|(.+?)\|/ - gi = $1 - else - Util.fatal_error( PRG_NAME, "no gi in [" + seq_desc + "]" ) - end - - seq_name = "" - - if seq_desc =~ /\|\s*([^|]+?)\s*\[/ - seq_name = $1 - end - - if seq_name =~ /\[.+\]$/ - # Redundant taxonomy information hides here. - seq_name = seq_name.sub(/\[.+\]$/, '') - end - if seq_name =~ /^\s*hypothetical\s+protein\s*/i - # Pointless information. - seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' ) - end - if seq_name =~ /^\s*conserved\s+hypothetical\s+protein\s*/i - # Pointless information. - seq_name = seq_name.sub( /^\s*conserved\s+hypothetical\s+protein\s*/i, '' ) - end - - if gi != nil - mapping_outfile.print( new_name + "\t" + - TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" + - TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" + - TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" + - TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" + - TseqTaxonomyProcessor::SEQ_ACCESSION + gi.to_s + "\t" + - TseqTaxonomyProcessor::SEQ_ACCESSION_SOURCE + "gi" + "\t" + - TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" + - TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string + - Constants::LINE_DELIMITER ) - else - mapping_outfile.print( new_name + "\t" + - TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" + - TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" + - TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" + - TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" + - TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" + - TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string + - Constants::LINE_DELIMITER ) - - end - new_name - end + seq_desc = seq.get_name + + taxonomy_sn = nil + + if seq_desc =~ /\[(.+)\]/ + taxonomy_sn = $1 + else + Util.fatal_error( PRG_NAME, "no taxonomy in [" + seq_desc + "]" ) + end + matching_sp_taxonomy = nil + + sp_taxonomies.each { |sp_taxonomy| + if ( sp_taxonomy.scientific_name == taxonomy_sn ) + matching_sp_taxonomy = sp_taxonomy + end + } + + if matching_sp_taxonomy == nil + Util.fatal_error( PRG_NAME, "taxonomy [" + taxonomy_sn + "] for [" + seq_desc + "] not found" ) + end + + new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code + + gi = nil + if seq_desc =~ /gi\|(.+?)\|/ + gi = $1 + else + Util.fatal_error( PRG_NAME, "no gi in [" + seq_desc + "]" ) + end + + seq_name = "" + + if seq_desc =~ /\|\s*([^|]+?)\s*\[/ + seq_name = $1 + end + + if seq_name =~ /\[.+\]$/ + # Redundant taxonomy information hides here. + seq_name = seq_name.sub(/\[.+\]$/, '') + end + if seq_name =~ /^\s*hypothetical\s+protein\s*/i + # Pointless information. + seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' ) + end + if seq_name =~ /^\s*conserved\s+hypothetical\s+protein\s*/i + # Pointless information. + seq_name = seq_name.sub( /^\s*conserved\s+hypothetical\s+protein\s*/i, '' ) + end + + if gi != nil + mapping_outfile.print( new_name + "\t" + + TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" + + TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" + + TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" + + TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" + + TseqTaxonomyProcessor::SEQ_ACCESSION + gi.to_s + "\t" + + TseqTaxonomyProcessor::SEQ_ACCESSION_SOURCE + "gi" + "\t" + + TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" + + TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string + + Constants::LINE_DELIMITER ) + else + mapping_outfile.print( new_name + "\t" + + TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" + + TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" + + TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" + + TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" + + TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" + + TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string + + Constants::LINE_DELIMITER ) + + end + new_name end + end + end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb index 08de9c1..caff316 100644 --- a/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb +++ b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb @@ -1,29 +1,22 @@ # # = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class # -# Copyright:: Copyright (C) 2012 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $ -# +# Copyright:: Copyright (C) 2017 Christian M Zmasek +# License:: GNU Lesser General Public License (LGPL) require 'set' - require 'lib/evo/util/constants' require 'lib/evo/util/util' require 'lib/evo/util/command_line_arguments' require 'lib/evo/io/parser/hmmscan_parser' module Evoruby - class HmmscanSummary PRG_NAME = "hsp" - PRG_VERSION = "2.002" - PRG_DESC = "hmmscan summary" - PRG_DATE = "130319" - COPYRIGHT = "2013 Christian M Zmasek" - CONTACT = "phyloxml@gmail.com" + PRG_VERSION = "2.003" + PRG_DESC = "Summarize hmmscan output tables into simpler tables" + PRG_DATE = "170213" WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" DELIMITER_OPTION = "d" @@ -36,9 +29,9 @@ module Evoruby HELP_OPTION_1 = "help" HELP_OPTION_2 = "h" - USE_AVOID_HMMS = true - AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ] - LIMIT_FOR_CLOSE_DOMAINS = 20 + USE_AVOID_HMMS = false + AVOID_HHMS = [ "x", "y", "z" ] + LIMIT_FOR_CLOSE_DOMAINS = 20 # Used for protein architecture summary def initialize @domain_counts = Hash.new @@ -47,13 +40,16 @@ module Evoruby def run Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC, - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) + PRG_VERSION, + PRG_DESC, + PRG_DATE, + WWW, + STDOUT ) + + if ( ARGV == nil || ( ARGV.length < 1 ) ) + print_help + exit( -1 ) + end begin cla = CommandLineArguments.new( ARGV ) @@ -62,16 +58,11 @@ module Evoruby end if ( cla.is_option_set?( HELP_OPTION_1 ) || - cla.is_option_set?( HELP_OPTION_2 ) ) + cla.is_option_set?( HELP_OPTION_2 ) ) print_help exit( 0 ) end - if ( cla.get_number_of_files != 2 ) - print_help - exit( -1 ) - end - allowed_opts = Array.new allowed_opts.push( DELIMITER_OPTION ) allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION ) @@ -84,12 +75,21 @@ module Evoruby disallowed = cla.validate_allowed_options_as_str( allowed_opts ) if ( disallowed.length > 0 ) Util.fatal_error( PRG_NAME, - "unknown option(s): " + disallowed, - STDOUT ) + "unknown option(s): " + disallowed, + STDOUT ) end inpath = cla.get_file_name( 0 ) - outpath = cla.get_file_name( 1 ) + + outpath = "" + if ( cla.get_number_of_files == 1 ) + outpath = inpath + Constants::DOMAIN_TABLE_SUFFIX + elsif ( cla.get_number_of_files == 2 ) + outpath = cla.get_file_name( 1 ) + else + print_help + exit( -1 ) + end column_delimiter = "\t" if ( cla.is_option_set?( DELIMITER_OPTION ) ) @@ -155,7 +155,7 @@ module Evoruby puts() puts( "hmmpfam outputfile : " + inpath ) puts( "outputfile : " + outpath ) - puts( "species : " + species ) + if ( i_e_value_threshold >= 0.0 ) puts( "i-E-value threshold : " + i_e_value_threshold.to_s ) else @@ -174,28 +174,29 @@ module Evoruby if ( column_delimiter == "\t" ) puts( "column delimiter : TAB" ) else - puts( "column delimiter : " + column_delimiter ) - end - if fs_e_value_threshold >= 0.0 - puts( "E-value threshold : " + fs_e_value_threshold.to_s ) - else - puts( "E-value threshold : no threshold" ) + puts( "column delimiter : " + column_delimiter ) end if !hmm_for_protein_output.empty? puts( "HMM for proteins : " + hmm_for_protein_output ) + puts( "species : " + species ) + if fs_e_value_threshold >= 0.0 + puts( "E-value threshold : " + fs_e_value_threshold.to_s ) + else + puts( "E-value threshold : no threshold" ) + end end puts() begin parse( inpath, - outpath, - column_delimiter, - i_e_value_threshold, - ignore_dufs, - parse_descriptions, - fs_e_value_threshold, - hmm_for_protein_output, - species ) + outpath, + column_delimiter, + i_e_value_threshold, + ignore_dufs, + parse_descriptions, + fs_e_value_threshold, + hmm_for_protein_output, + species ) rescue IOError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end @@ -207,6 +208,8 @@ module Evoruby puts puts( Util.draw_histogram( domain_counts, "#" ) ) puts + Util.print_message( PRG_NAME, "wrote: " + outpath ) + Util.print_message( PRG_NAME, "next step in standard analysis pipeline: d2f.rb") Util.print_message( PRG_NAME, 'OK' ) puts @@ -216,14 +219,14 @@ module Evoruby # raises ArgumentError, IOError def parse( inpath, - outpath, - column_delimiter, - i_e_value_threshold, - ignore_dufs, - get_descriptions, - fs_e_value_threshold, - hmm_for_protein_output, - species ) + outpath, + column_delimiter, + i_e_value_threshold, + ignore_dufs, + get_descriptions, + fs_e_value_threshold, + hmm_for_protein_output, + species ) Util.check_file_for_readability( inpath ) Util.check_file_for_writability( outpath ) @@ -246,27 +249,28 @@ module Evoruby results.each do | r | model = r.model + desc = r.desc query = r.query i_e_value = r.i_e_value env_from = r.env_from env_to = r.env_to if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) && - ( !ignore_dufs || ( model !~ /^DUF\d+/ ) ) + ( !ignore_dufs || ( model !~ /^DUF\d+/ ) ) count_model( model ) outfile.print( query + - column_delimiter ) + column_delimiter ) if ( get_descriptions ) outfile.print( desc + - column_delimiter ) + column_delimiter ) end outfile.print( model + - column_delimiter + - env_from.to_s + - column_delimiter + - env_to.to_s + - column_delimiter + - i_e_value.to_s ) + column_delimiter + + env_from.to_s + + column_delimiter + + env_to.to_s + + column_delimiter + + i_e_value.to_s ) outfile.print( Constants::LINE_DELIMITER ) end @@ -274,10 +278,10 @@ module Evoruby if !prev_query.empty? && prev_query != query if !hmmscan_results_per_protein.empty? process_hmmscan_results_per_protein( hmmscan_results_per_protein, - fs_e_value_threshold, - hmm_for_protein_output, - i_e_value_threshold, - species ) + fs_e_value_threshold, + hmm_for_protein_output, + i_e_value_threshold, + species ) end hmmscan_results_per_protein.clear end @@ -295,10 +299,10 @@ module Evoruby if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty? process_hmmscan_results_per_protein( hmmscan_results_per_protein, - fs_e_value_threshold, - hmm_for_protein_output, - i_e_value_threshold, - species ) + fs_e_value_threshold, + hmm_for_protein_output, + i_e_value_threshold, + species ) end outfile.flush() @@ -323,10 +327,10 @@ module Evoruby end def process_hmmscan_results_per_protein( hmmscan_results_per_protein, - fs_e_value_threshold, - hmm_for_protein_output, - i_e_value_threshold, - species ) + fs_e_value_threshold, + hmm_for_protein_output, + i_e_value_threshold, + species ) dc = 0 # filter according to i-Evalue threshold @@ -335,7 +339,6 @@ module Evoruby hmmscan_results_per_protein.each do | r | - if r.model == hmm_for_protein_output if fs_e_value_threshold > 0.0 && r.fs_e_value > fs_e_value_threshold return @@ -453,19 +456,24 @@ module Evoruby s end - def print_help() puts( "Usage:" ) puts() - puts( " " + PRG_NAME + ".rb [options] " ) + puts( " " + PRG_NAME + ".rb [options] [outputfile]" ) + puts() + puts( " options: -" + DELIMITER_OPTION + "= : column delimiter for outputfile, default is TAB" ) + puts( " -" + I_E_VALUE_THRESHOLD_OPTION + "=: i-E-value threshold, default is no threshold" ) + puts( " -" + PARSE_OUT_DESCRIPITION_OPTION + " : parse query description (in addition to query name)" ) + puts( " -" + IGNORE_DUF_OPTION + " : ignore DUFs" ) + puts( " -" + HMM_FOR_PROTEIN_OUTPUT + "= : HMM for protein architectures summary" ) + puts( " -" + FS_E_VALUE_THRESHOLD_OPTION + "=: E-value threshold for full protein sequences, only for protein architectures summary" ) + puts( " -" + SPECIES_OPTION + "= : species for protein architectures summary" ) + puts() + puts( "Example:" ) + puts() + puts( " " + "hmmscan --nobias --domtblout P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10 -E 10 Pfam-A.hmm P53_ni.fasta" ) puts() - puts( " options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" ) - puts( " -" + I_E_VALUE_THRESHOLD_OPTION + ": i-E-value threshold, default is no threshold" ) - puts( " -" + PARSE_OUT_DESCRIPITION_OPTION + ": parse query description (in addition to query name)" ) - puts( " -" + IGNORE_DUF_OPTION + ": ignore DUFs" ) - puts( " -" + FS_E_VALUE_THRESHOLD_OPTION + ": E-value threshold for full protein sequences, only for protein summary" ) - puts( " -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" ) - puts( " -" + SPECIES_OPTION + ": species for protein summary" ) + puts( " " + PRG_NAME + ".rb P53_hmmscan_300_10" ) puts() end diff --git a/forester/ruby/evoruby/lib/evo/tool/phylogenies_decorator.rb b/forester/ruby/evoruby/lib/evo/tool/phylogenies_decorator.rb index 0459eee..cbbff63 100644 --- a/forester/ruby/evoruby/lib/evo/tool/phylogenies_decorator.rb +++ b/forester/ruby/evoruby/lib/evo/tool/phylogenies_decorator.rb @@ -2,12 +2,12 @@ # # = lib/evo/apps/phylogenies_decorator # -# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) # -# decoration of phylogenies with sequence/species names and domain architectures +# Last modified: 2017/02/09 # -# $Id: phylogenies_decorator.rb,v 1.34 2010/12/13 19:00:11 cmzmasek Exp $ +# decoration of phylogenies with sequence/species names and domain architectures # # Environment variable FORESTER_HOME needs to point to the appropriate # directory (e.g. setenv FORESTER_HOME $HOME/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester/) @@ -22,7 +22,8 @@ module Evoruby #DECORATOR_OPTIONS_SEQ_NAMES = '-r=1 -mdn' #DECORATOR_OPTIONS_SEQ_NAMES = '-p -t -sn' - DECORATOR_OPTIONS_SEQ_NAMES = '-p -t -tc -mp -or' + #DECORATOR_OPTIONS_SEQ_NAMES = '-p -t -tc -mp -or' + DECORATOR_OPTIONS_SEQ_NAMES = '-p -t -mp -or' # -mdn is a hidden expert option to rename e.g. "6_ORYLA3" to "6_[3]_ORYLA" #DECORATOR_OPTIONS_SEQ_NAMES = '-sn -r=1' #DECORATOR_OPTIONS_DOMAINS = '-r=1' @@ -31,7 +32,8 @@ module Evoruby DOMAINS_MAPFILE_SUFFIX = '_hmmscan_10.dff' SLEEP_TIME = 0.05 REMOVE_NI = true - IDS_ONLY = true + IDS_ONLY = true #TODO this should be a command line option + FIXED_NIM_FILE = 'all.nim' #TODO this should be a command line option TMP_FILE_1 = '___PD1___' TMP_FILE_2 = '___PD2___' LOG_FILE = '00_phylogenies_decorator.log' @@ -39,11 +41,11 @@ module Evoruby JAVA_HOME = ENV[Constants::JAVA_HOME_ENV_VARIABLE] PRG_NAME = "phylogenies_decorator" - PRG_DATE = "2013.11.15" + PRG_DATE = "170209" PRG_DESC = "decoration of phylogenies with sequence/species names and domain architectures" PRG_VERSION = "1.02" - COPYRIGHT = "2013 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" + COPYRIGHT = "2017 Christian M Zmasek" + CONTACT = "phyloxml at gmail dot com" WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" HELP_OPTION_1 = "help" @@ -151,7 +153,7 @@ module Evoruby outfile = outfile.sub( /_ni_/, '_' ) end - if File.exists?( outfile ) + if File.exist?( outfile ) msg = counter.to_s + ': ' + phylogeny_file + ' -> ' + outfile + ' : already exists, skipping' Util.print_message( PRG_NAME, msg ) @@ -174,8 +176,12 @@ module Evoruby domains_mapfile_name = nil seqs_file_name = nil - ids_mapfile_name = get_file( files, phylogeny_id, IDS_MAPFILE_SUFFIX ) - + if ( FIXED_NIM_FILE == nil ) + ids_mapfile_name = get_file( files, phylogeny_id, IDS_MAPFILE_SUFFIX ) + else + ids_mapfile_name = FIXED_NIM_FILE + end + unless IDS_ONLY domains_mapfile_name = get_file( files, phylogeny_id, DOMAINS_MAPFILE_SUFFIX ) seqs_file_name = get_seq_file( files, phylogeny_id ) @@ -213,7 +219,7 @@ module Evoruby cmd = decorator + ' ' + DECORATOR_OPTIONS_DOMAINS + ' ' + '-f=d ' + TMP_FILE_1 + ' ' + - domains_mapfile_name + ' ' +TMP_FILE_2 + domains_mapfile_name + ' ' + TMP_FILE_2 puts cmd begin execute_cmd( cmd, log ) @@ -276,17 +282,7 @@ module Evoruby end def get_file( files_in_dir, phylogeny_id, suffix_pattern ) - matching_files = Array.new - - files_in_dir.each { | file | - - if ( !File.directory?( file ) && - file !~ /^\./ && - file !~ /^00/ && - file =~ /^#{phylogeny_id}.*#{suffix_pattern}$/ ) - matching_files << file - end - } + matching_files = Util.get_matching_files( files_in_dir, phylogeny_id, suffix_pattern ) if matching_files.length < 1 Util.fatal_error( PRG_NAME, 'no file matching [' + phylogeny_id + '...' + suffix_pattern + '] present in current directory' ) diff --git a/forester/ruby/evoruby/lib/evo/tool/phylogeny_factory.rb b/forester/ruby/evoruby/lib/evo/tool/phylogeny_factory.rb index 18d396e..9180831 100644 --- a/forester/ruby/evoruby/lib/evo/tool/phylogeny_factory.rb +++ b/forester/ruby/evoruby/lib/evo/tool/phylogeny_factory.rb @@ -1,10 +1,10 @@ # # = lib/evo/apps/phylogeny_factory - PhylogenyFactory class # -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) # -# $Id: phylogeny_factory.rb,v 1.32 2010/12/13 19:00:11 cmzmasek Exp $ +# Last modified: 2017/02/07 require 'lib/evo/util/constants' require 'lib/evo/util/util' @@ -21,9 +21,9 @@ module Evoruby PRG_DATE = "1301111" PRG_DESC = "automated phylogeny reconstruction using queing system" PRG_VERSION = "1.100" - COPYRIGHT = "2013 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" + COPYRIGHT = "2017 Christian M Zmasek" + CONTACT = "cmzmasek at yahoo dot com" + WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" USE_JOB_SUBMISSION_SYSTEM_OPTION = 's' BS_OPTION = 'b' diff --git a/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb b/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb index 862a91a..6a4bb5f 100644 --- a/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb +++ b/forester/ruby/evoruby/lib/evo/tool/taxonomy_processor.rb @@ -1,12 +1,10 @@ # # = lib/evo/apps/taxonomy_processor - TaxonomyProcessor class # -# Copyright:: Copyright (C) 20017 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# - - +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +require 'lib/evo/util/constants' require 'lib/evo/util/util' require 'lib/evo/msa/msa_factory' require 'lib/evo/msa/msa' @@ -18,35 +16,29 @@ require 'lib/evo/io/writer/phylip_sequential_writer' require 'lib/evo/util/command_line_arguments' module Evoruby - class TaxonomyProcessor PRG_NAME = "tap" - PRG_DATE = "170206" - PRG_DESC = "replacement of species names in multiple sequence files" - PRG_VERSION = "2.002" - COPYRIGHT = "2017 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "" + PRG_DATE = "170213" + PRG_DESC = "Replacement of labels in multiple sequence files" + PRG_VERSION = "2.004" + WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" EXTRACT_TAXONOMY_OPTION = "t" - + ANNOTATION_OPTION = "a" + HELP_OPTION_1 = "help" + HELP_OPTION_2 = "h" def run() Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC, - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) - - if ( ARGV == nil || ( ARGV.length != 1 && ARGV.length != 2 && ARGV.length != 3 && ARGV.length != 4 && ARGV.length != 5 && ARGV.length != 6 ) ) - puts( "Usage: #{PRG_NAME}.rb [options] [output sequences] [output id list]" ) - puts() - puts( " options: -" + EXTRACT_TAXONOMY_OPTION + ": to extract taxonomy information from bracketed expression" ) - puts() + PRG_VERSION, + PRG_DESC, + PRG_DATE, + WWW, + STDOUT ) + + if ( ARGV == nil || ( ARGV.length < 1 ) ) + print_help() exit( -1 ) end @@ -56,9 +48,15 @@ module Evoruby Util.fatal_error( PRG_NAME, "error: " + e.to_s ) end - input = nil - output = nil - list_file = nil + if ( cla.is_option_set?( HELP_OPTION_1 ) || + cla.is_option_set?( HELP_OPTION_2 ) ) + print_help + exit( 0 ) + end + + input = nil + output = nil + list_file = nil if cla.get_number_of_files == 3 input = cla.get_file_name( 0 ) @@ -74,13 +72,16 @@ module Evoruby else i = input end - output = i + "_ni.fasta" - list_file = i + ".nim" + output = i + Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX + list_file = i + Constants::ID_MAP_FILE_SUFFIX + else + print_help() + exit(-1) end - allowed_opts = Array.new allowed_opts.push( EXTRACT_TAXONOMY_OPTION ) + allowed_opts.push( ANNOTATION_OPTION ) disallowed = cla.validate_allowed_options_as_str( allowed_opts ) if ( disallowed.length > 0 ) @@ -92,13 +93,18 @@ module Evoruby extract_taxonomy = true end - if ( File.exists?( output ) ) + annotation = nil + if ( cla.is_option_set?( ANNOTATION_OPTION ) ) + annotation = cla.get_option_value( ANNOTATION_OPTION ) + end + + if ( File.exist?( output ) ) Util.fatal_error( PRG_NAME, "outfile [" + output + "] already exists" ) end - if ( File.exists?( list_file ) ) + if ( File.exist?( list_file ) ) Util.fatal_error( PRG_NAME, "list file [" + list_file + "] already exists" ) end - if ( !File.exists?( input) ) + if ( !File.exist?( input) ) Util.fatal_error( PRG_NAME, "infile [" + input + "] does not exist" ) end @@ -116,6 +122,9 @@ module Evoruby if ( extract_taxonomy ) puts( "Extract taxonomy: true" ) end + if ( annotation != nil ) + puts( "Annotation : " + annotation ) + end puts() f = MsaFactory.new() @@ -141,7 +150,7 @@ module Evoruby lf = File.open( list_file, "a" ) for i in 0 ... msa.get_number_of_seqs seq = msa.get_sequence( i ) - seq.set_name( modify_name( seq.get_name(), i, lf, extract_taxonomy ) ) + seq.set_name( modify_name( seq.get_name(), i, lf, extract_taxonomy, annotation ) ) end io = MsaIO.new() w = nil @@ -150,7 +159,7 @@ module Evoruby else w = PhylipSequentialWriter.new() end - w.set_max_name_length( 10 ) + w.set_max_name_length( 9 ) w.clean( true ) begin io.write_to_file( msa, output, w ) @@ -160,16 +169,15 @@ module Evoruby lf.close() Util.print_message( PRG_NAME, "wrote: " + list_file ) Util.print_message( PRG_NAME, "wrote: " + output ) + Util.print_message( PRG_NAME, "next steps in standard analysis pipeline: hmmscan followed by hsp.rb") Util.print_message( PRG_NAME, "OK" ) end private - def modify_name( desc, counter, file, extract_taxonomy ) + def modify_name( desc, counter, file, extract_taxonomy, annotation ) new_desc = nil desc.gsub!( /\s+/, ' ' ) - #if desc =~ /^>?\s*\S{1,10}_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP)/ - # new_desc = counter.to_s( 16 ) + "_" + $1 if extract_taxonomy if desc =~/\s\[(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP)\]/ new_desc = counter.to_s( 16 ) + "_" + $1 @@ -179,10 +187,33 @@ module Evoruby else new_desc = counter.to_s( 16 ) end - file.print( new_desc + "\t" + desc + "\n" ) + if (annotation != nil) + new_desc = new_desc + annotation + file.print( new_desc + "\t" + desc + " " + annotation + "\n" ) + else + file.print( new_desc + "\t" + desc + "\n" ) + end + if ( new_desc.length > 9) + Util.fatal_error( PRG_NAME, "shortened identifier [" + + new_desc + "] is too long (" + new_desc.length.to_s + " characters)" ) + end new_desc end + def print_help() + puts( "Usage:" ) + puts() + puts( " " + PRG_NAME + ".rb [options] [output sequences] [output id list]" ) + puts() + puts( " options: -" + EXTRACT_TAXONOMY_OPTION + " : to extract taxonomy information from bracketed expressions" ) + puts( " -" + ANNOTATION_OPTION + "=: to add an annotation to all entries" ) + puts() + puts( "Example:" ) + puts() + puts( " " + PRG_NAME + ".rb P53.fasta" ) + puts() + end + end # class TaxonomyProcessor end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/util/command_line_arguments.rb b/forester/ruby/evoruby/lib/evo/util/command_line_arguments.rb index 1ad9924..0045730 100644 --- a/forester/ruby/evoruby/lib/evo/util/command_line_arguments.rb +++ b/forester/ruby/evoruby/lib/evo/util/command_line_arguments.rb @@ -1,177 +1,175 @@ # # = lib/evo/util/command_line_arguments.rb - CommandLineArguments class # -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) # -# $Id: command_line_arguments.rb,v 1.2 2007/06/12 04:51:34 cmzmasek Exp $ -# -# last modified: 05/16/2007 +# Last modified: 2017/02/12 module Evoruby - - class CommandLineArguments - - OPTIONS_PREFIX = "-" - EXTENDED_OPTIONS_PREFIX = "--" - OPTIONS_SEPARATOR = "=" - - # raises ArgumentError - def initialize( args ) - @options = Hash.new - @extended_options = Hash.new - @file_names = Array.new - parse_arguments( args ) - end - - def get_file_names - return @file_names - end - - def get_file_name( i ) - return @file_names[ i ] - end - - def get_number_of_files() - return @file_names.length - end - - def is_option_set?( option_name ) - o = get_all_options - return ( o.has_key?( option_name ) ) - end - - # raises ArgumentError - def get_option_value( option_name ) - o = get_all_options - if ( o.has_key?( option_name ) ) - value = o[ option_name ] - if ( !Util.is_string_empty?( value ) ) - return value - else - raise( ArgumentError, "value for option \"" + - option_name + "\" is not set", caller ) - end - else - raise( ArgumentError, "option \"" + option_name + - "\" is not set", caller ) - end - end - - def get_option_value_as_int( option_name ) - return get_option_value( option_name ).to_i - end - - def get_option_value_as_float( option_name ) - return get_option_value( option_name ).to_f - end - - # mandatory_options (Array) - # - def validate_mandatory_options( mandatory_options ) - o = get_all_options - missing = Array.new - for ma in mandatory_options - if ( !o.has_key?( ma ) ) - missing.push( ma ) - end - end - return missing - end - - # mandatory_options (Array) - # - def validate_mandatory_options_as_str( mandatory_options ) - missing = validate_mandatory_options( mandatory_options ) - return missing.join( ", " ) - end - - # allowed_options (Array) - # - def validate_allowed_options( allowed_options ) - o = get_all_options - disallowed = Array.new - o.each_key { |op| - if ( !allowed_options.include?( op ) ) - disallowed.push( op ) - end - } - return disallowed - end - - # allowed_options (Array) - # - def validate_allowed_options_as_str( allowed_options ) - disallowed = validate_allowed_options( allowed_options ) - return disallowed.join( ", " ) - end - - private - - def get_all_options - o = Hash.new - o.merge!( get_options_list ) - o.merge!( get_extended_options_list ) - return o - end - - def parse_arguments( args ) - for arg in args - if ( arg.index( EXTENDED_OPTIONS_PREFIX ) == 0 ) - parse_option( arg.slice( EXTENDED_OPTIONS_PREFIX.length, arg.length() - 1 ), - get_extended_options_list ) - - elsif ( arg.index( OPTIONS_PREFIX ) == 0 ) - parse_option( arg.slice( OPTIONS_PREFIX.length, arg.length() - 1 ), - get_options_list ) - - else - get_file_names.push( arg ) - end - end - end - - # raises ArgumentError - def parse_option( option, options_map ) - sep_index = option.index( OPTIONS_SEPARATOR ) - if ( sep_index == nil ) - if ( Util.is_string_empty?( option ) ) - raise( ArgumentError, "attempt to set option with an empty name" ) - end - if ( get_all_options.has_key?( option ) ) - raise( ArgumentError, "attempt to set option \"" + - option + "\" mutiple times" ) - end - options_map[ option ] = "" - else - key = option.slice( 0, sep_index ) - value = option.slice( sep_index + 1, option.length() - 1 ) - if ( Util.is_string_empty?( key ) ) - raise( ArgumentError, "attempt to set option with an empty name" ) - end - if ( Util.is_string_empty?( value ) ) - raise( ArgumentError, "attempt to set option with an empty value" ) - end - if ( get_all_options.has_key?( key ) ) - raise( ArgumentError, "attempt to set option \"" + - key + "\" mutiple times [" + option + "]" ) - end - options_map[ key ] = value - end - end - - def get_file_names_list - return @file_names - end - - def get_options_list - return @options - end - - def get_extended_options_list - return @extended_options - end - - end # class CommandLineArguments + class CommandLineArguments + + OPTIONS_PREFIX = "-" + EXTENDED_OPTIONS_PREFIX = "--" + OPTIONS_SEPARATOR = "=" + # raises ArgumentError + def initialize( args ) + @options = Hash.new + @extended_options = Hash.new + @file_names = Array.new + parse_arguments( args ) + end + + def get_file_names + return @file_names + end + + def get_file_name( i ) + return @file_names[ i ] + end + + def get_number_of_files() + return @file_names.length + end + + def is_option_set?( option_name ) + o = get_all_options + return ( o.has_key?( option_name ) ) + end + + # raises ArgumentError + def get_option_value( option_name ) + o = get_all_options + if ( o.has_key?( option_name ) ) + value = o[ option_name ] + if ( !Util.is_string_empty?( value ) ) + return value + else + puts() + puts( "value for option \"" + option_name + "\" is not set") + puts() + exit( -1 ) + end + else + raise( ArgumentError, "option \"" + option_name + + "\" is not set", caller ) + end + end + + def get_option_value_as_int( option_name ) + return get_option_value( option_name ).to_i + end + + def get_option_value_as_float( option_name ) + return get_option_value( option_name ).to_f + end + + # mandatory_options (Array) + # + def validate_mandatory_options( mandatory_options ) + o = get_all_options + missing = Array.new + for ma in mandatory_options + if ( !o.has_key?( ma ) ) + missing.push( ma ) + end + end + return missing + end + + # mandatory_options (Array) + # + def validate_mandatory_options_as_str( mandatory_options ) + missing = validate_mandatory_options( mandatory_options ) + return missing.join( ", " ) + end + + # allowed_options (Array) + # + def validate_allowed_options( allowed_options ) + o = get_all_options + disallowed = Array.new + o.each_key { |op| + if ( !allowed_options.include?( op ) ) + disallowed.push( op ) + end + } + return disallowed + end + + # allowed_options (Array) + # + def validate_allowed_options_as_str( allowed_options ) + disallowed = validate_allowed_options( allowed_options ) + return disallowed.join( ", " ) + end + + private + + def get_all_options + o = Hash.new + o.merge!( get_options_list ) + o.merge!( get_extended_options_list ) + return o + end + + def parse_arguments( args ) + for arg in args + if ( arg.index( EXTENDED_OPTIONS_PREFIX ) == 0 ) + parse_option( arg.slice( EXTENDED_OPTIONS_PREFIX.length, arg.length() - 1 ), + get_extended_options_list ) + + elsif ( arg.index( OPTIONS_PREFIX ) == 0 ) + parse_option( arg.slice( OPTIONS_PREFIX.length, arg.length() - 1 ), + get_options_list ) + + else + get_file_names.push( arg ) + end + end + end + + # raises ArgumentError + def parse_option( option, options_map ) + sep_index = option.index( OPTIONS_SEPARATOR ) + if ( sep_index == nil ) + if ( Util.is_string_empty?( option ) ) + raise( ArgumentError, "attempt to set option with an empty name" ) + end + if ( get_all_options.has_key?( option ) ) + raise( ArgumentError, "attempt to set option \"" + + option + "\" mutiple times" ) + end + options_map[ option ] = "" + else + key = option.slice( 0, sep_index ) + value = option.slice( sep_index + 1, option.length() - 1 ) + if ( Util.is_string_empty?( key ) ) + raise( ArgumentError, "attempt to set option with an empty name" ) + end + if ( Util.is_string_empty?( value ) ) + raise( ArgumentError, "attempt to set option with an empty value" ) + end + if ( get_all_options.has_key?( key ) ) + raise( ArgumentError, "attempt to set option \"" + + key + "\" mutiple times [" + option + "]" ) + end + options_map[ key ] = value + end + end + + def get_file_names_list + return @file_names + end + + def get_options_list + return @options + end + + def get_extended_options_list + return @extended_options + end + + end # class CommandLineArguments end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/util/constants.rb b/forester/ruby/evoruby/lib/evo/util/constants.rb index 6546478..b54801b 100644 --- a/forester/ruby/evoruby/lib/evo/util/constants.rb +++ b/forester/ruby/evoruby/lib/evo/util/constants.rb @@ -1,33 +1,35 @@ # # = lib/evo/util/constants.rb - Constants class # -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) -# -# $Id: constants.rb,v 1.3 2007/12/21 04:13:33 cmzmasek Exp $ -# -# last modified: 05/11/2007 +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) module Evoruby + class Constants - class Constants - - VERBOSE = true + VERBOSE = true - EVORUBY_VERSION = '1.0' + EVORUBY_VERSION = '1.1' - FORESTER_HOME_ENV_VARIABLE = 'FORESTER_HOME' - JAVA_HOME_ENV_VARIABLE = 'JAVA_HOME' + ID_NORMALIZED_FASTA_FILE_SUFFIX = "_ni.fasta" + ID_MAP_FILE_SUFFIX = ".nim" + DOMAIN_TABLE_SUFFIX = "_domain_table" + DOMAINS_TO_FORESTER_OUTFILE_SUFFIX = ".dff" + DOMAINS_TO_FORESTER_EVALUE_CUTOFF_SUFFIX = "_dtfE" + + PFAM_V_FOR_EX = "300" # Pfam version for examples - EVORUBY = 'evoruby' + FORESTER_HOME_ENV_VARIABLE = 'FORESTER_HOME' + JAVA_HOME_ENV_VARIABLE = 'JAVA_HOME' - LINE_DELIMITER = "\n" + EVORUBY = 'evoruby' - FILE_SEPARATOR = File::SEPARATOR + LINE_DELIMITER = "\n" - DOMAIN_STRUCTURE_NHX_SEPARATOR = '>' + FILE_SEPARATOR = File::SEPARATOR + DOMAIN_STRUCTURE_NHX_SEPARATOR = '>' - end # class Constants + end # class Constants end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/util/util.rb b/forester/ruby/evoruby/lib/evo/util/util.rb index 27bdb5e..80916ca 100644 --- a/forester/ruby/evoruby/lib/evo/util/util.rb +++ b/forester/ruby/evoruby/lib/evo/util/util.rb @@ -1,18 +1,26 @@ # # = lib/evo/util/util.rb - Util class # -# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek -# License:: GNU Lesser General Public License (LGPL) +# Copyright:: Copyright (C) 2017 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) # -# $Id: util.rb,v 1.17 2009/10/06 22:22:46 cmzmasek Exp $ -# -# last modified: 05/15/2007 +# Last modified: 2017/02/07 require 'lib/evo/util/constants' module Evoruby - class Util + def Util.get_matching_files( files, prefix_pattern, suffix_pattern ) + matching_files = Array.new + files.each { | file | + if ( !File.directory?( file ) && + file !~ /^\./ && + file =~ /^#{prefix_pattern}.*#{suffix_pattern}$/ ) + matching_files << file + end + } + matching_files + end def Util.normalize_seq_name( name, length, exception_if_too_long = false ) if name.length > length @@ -22,7 +30,8 @@ module Evoruby end name = name[ 0, length ] elsif name.length < length - for i in 0 ... length - name.length + t = length - name.length + t.times do name = name + " " end end @@ -104,7 +113,6 @@ module Evoruby value end - # raises ArgumentError def Util.file2array( path, split_by_semicolon ) Util.check_file_for_readability( path ) @@ -130,18 +138,11 @@ module Evoruby end def Util.print_program_information( prg_name, - prg_version, - prg_desc, - date, - copyright, - contact, - www, - io = STDOUT ) - - # if RUBY_VERSION !~ /1.9/ - # puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " ) - # exit( -1 ) - # end + prg_version, + prg_desc, + date, + www, + io = STDOUT ) ruby_version = RUBY_VERSION l = prg_name.length + prg_version.length + date.length + ruby_version.length + 12 @@ -156,11 +157,7 @@ module Evoruby io.print( prg_desc ) io.print( Constants::LINE_DELIMITER ) io.print( Constants::LINE_DELIMITER ) - io.print( "Copyright (C) " + copyright ) - io.print( Constants::LINE_DELIMITER ) - io.print( "Contact: " + contact ) - io.print( Constants::LINE_DELIMITER ) - io.print( " " + www ) + io.print( "Website: " + www ) io.print( Constants::LINE_DELIMITER ) io.print( Constants::LINE_DELIMITER ) end