From: cmzmasek@gmail.com Date: Mon, 24 Jun 2013 21:58:21 +0000 (+0000) Subject: moved X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=3fd397a6ed15f66c18e058eb127eb157e880de98;p=jalview.git moved --- diff --git a/forester/archive/perl/forester.pm b/forester/archive/perl/forester.pm deleted file mode 100755 index f8e58cb..0000000 --- a/forester/archive/perl/forester.pm +++ /dev/null @@ -1,1428 +0,0 @@ -# $Id: forester.pm,v 1.26 2010/12/13 19:00:22 cmzmasek Exp $ -# -# FORESTER -- software libraries and applications -# for evolutionary biology research and applications. -# -# Copyright (C) 2007-2009 Christian M. Zmasek -# Copyright (C) 2007-2009 Burnham Institute for Medical Research -# All rights reserved -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA -# -# Contact: phylosoft @ gmail . com -# WWW: www.phylosoft.org/forester -# -# -# - - -package forester; -use strict; -require Exporter; - -our $VERSION = 1.000; - -our @ISA = qw( Exporter ); - -our @EXPORT = qw( executeConsense - executePhyloPl - executePuzzleDQO - executePuzzleDQObootstrapped - pfam2phylipMatchOnly - startsWithSWISS_PROTname - isPfamSequenceLine - isPfamCommentLine - containsPfamNamedSequence - isRFline - executeProtpars - setModelForPuzzle - setRateHeterogeneityOptionForPuzzle - setParameterEstimatesOptionForPuzzle - executePuzzleBootstrapped - executePuzzle - executeFastme - executeNeighbor - executeFitch - executeBionj - executeWeighbor - executePhyml - executeHmmfetch - addDistsToQueryToPWDfile - testForTextFilePresence - exitWithWarning - dieWithUnexpectedError - addSlashAtEndIfNotPresent - $LENGTH_OF_NAME - $MIN_NUMBER_OF_AA - $TREMBL_ACDEOS_FILE - $SWISSPROT_ACDEOS_FILE - $SPECIES_NAMES_FILE - $SPECIES_TREE_FILE_DEFAULT - $MULTIPLE_TREES_FILE_SUFFIX - $LOG_FILE_SUFFIX - $ALIGN_FILE_SUFFIX - $TREE_FILE_SUFFIX - $ADDITION_FOR_RIO_ANNOT_TREE - $SUFFIX_PWD - $SUFFIX_BOOT_STRP_POS - $MULTIPLE_PWD_FILE_SUFFIX - $SUFFIX_PWD_NOT_BOOTS - $SUFFIX_HMM - $MATRIX_FOR_PWD - $RIO_PWD_DIRECTORY - $RIO_BSP_DIRECTORY - $RIO_NBD_DIRECTORY - $RIO_ALN_DIRECTORY - $RIO_HMM_DIRECTORY - $PFAM_FULL_DIRECTORY - $PFAM_SEED_DIRECTORY - $PRIOR_FILE_DIR - $PFAM_HMM_DB - $FORESTER_JAR - $SEQBOOT - $NEIGHBOR - $PROTPARS - $CONSENSE - $PROML - $PHYLIP_VERSION - $PUZZLE - $PUZZLE_VERSION - $FASTME - $FASTME_VERSION - $BIONJ - $BIONJ_VERSION - $WEIGHBOR - $WEIGHBOR_VERSION - $RAXML - $RAXML_VERSION - $PHYML - $PHYML_VERSION - $HMMALIGN - $HMMSEARCH - $HMMBUILD - $HMMFETCH - $SFE - $HMMCALIBRATE - $P7EXTRACT - $MULTIFETCH - $BOOTSTRAP_CZ - $BOOTSTRAP_CZ_PL - $SUPPORT_TRANSFER - $SUPPORT_STATISTICS - $NEWICK_TO_PHYLOXML - $PHYLO_PL - $RIO_PL - $DORIO - $PUZZLE_DQO - $BOOTSTRAPS - $PATH_TO_FORESTER - $JAVA - $NODE_LIST - $RIO_SLAVE_DRIVER - $RIO_SLAVE - $TEMP_DIR_DEFAULT - $EXPASY_SPROT_SEARCH_DE - $EXPASY_SPROT_SEARCH_AC - ); - - - - -# ============================================================================= -# ============================================================================= -# -# THESE VARIABLES ARE ENVIRONMENT DEPENDENT, AND NEED TO BE SET ACCORDINGLY -# BY THE USER -# ------------------------------------------------------------------------- -# - -# For using just "phylo_pl.pl", only the following variables need to be set -# $JAVA -# $FORESTER_JAR -# $TEMP_DIR_DEFAULT -# $SEQBOOT -# $CONSENSE -# $PUZZLE -# $FASTME -# $NEIGHBOR -# $FITCH -# $BIONJ -# $WEIGHBOR -# $PHYML -# $PROTPARS - -# Software directory: -# --------------------- - -our $SOFTWARE_DIR = "/home/czmasek/SOFTWARE/"; - - -# Java virtual machine: -# --------------------- -our $JAVA = $SOFTWARE_DIR."JAVA/jdk1.6.0_03/bin/java"; - - -# Where all the temporary files can be created: -# --------------------------------------------- -our $TEMP_DIR_DEFAULT = "/tmp/"; - - -# Programs from Joe Felsenstein's PHYLIP package: -# ----------------------------------------------- -our $SEQBOOT = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/seqboot"; -our $NEIGHBOR = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/neighbor"; -our $PROTPARS = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/protpars"; -our $PROML = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/proml"; -our $FITCH = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/fitch"; -our $CONSENSE = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/consense"; -our $PHYLIP_VERSION = "3.68"; - -# TREE-PUZZLE: -# ------------ -our $PUZZLE = $SOFTWARE_DIR."TREE_PUZZLE/tree-puzzle-5.2/src/puzzle"; -our $PUZZLE_VERSION = "5.2"; - -# FASTME: -# ----------------------------------------------------- -our $FASTME = $SOFTWARE_DIR."FASTME/fastme2.0/fastme"; -our $FASTME_VERSION = "2.0"; - -# BIONJ: -# ----------------------------------------------------- -our $BIONJ = $SOFTWARE_DIR."BIONJ/bionj"; -our $BIONJ_VERSION = "[1997]"; - -# WEIGHBOR: -# ----------------------------------------------------- -our $WEIGHBOR = $SOFTWARE_DIR."WEIGHBOR/Weighbor/weighbor"; -our $WEIGHBOR_VERSION = "1.2.1"; - -# PHYML: -# ----------------------------------------------------- -our $PHYML = $SOFTWARE_DIR."PHYML/phyml_v2.4.4/exe/phyml_linux"; -our $PHYML_VERSION = "2.4.4"; - -# RAXML: -# ----------------------------------------------------- -our $RAXML = $SOFTWARE_DIR."RAXML/RAxML-7.0.4/raxmlHPC"; -our $RAXML_VERSION = "7.0.4"; - - -# forester.jar. This jar file is currently available at: http://www.phylosoft.org -# ------------------------------------------------------------------------------- - -our $FORESTER_JAR = $SOFTWARE_DIR."FORESTER/DEV/forester/forester/java/forester.jar"; - - - -# End of variables which need to be set by the user for using "phylo_pl.pl". - - - - - - - - - - - - - - -# Tool from forester.jar to transfer support values: -# ------------------------------------------------- -our $SUPPORT_TRANSFER = $JAVA." -cp $FORESTER_JAR org.forester.application.support_transfer"; - - - -# Tool from forester.jar for simple statistics for support values: -# ---------------------------------------------------------------- -our $SUPPORT_STATISTICS = $JAVA." -cp $FORESTER_JAR org.forester.application.support_statistics"; - - -# Tool from forester.jar to transfer nh to phyloXML: -# ------------------------------------------------- -our $NEWICK_TO_PHYLOXML = $JAVA." -cp $FORESTER_JAR org.forester.application.phyloxml_converter"; - - - -# FORESTER itself (currently not needed for "phylo_pl.pl"): -# --------------------------------------------------------- -our $PATH_TO_FORESTER = ""; - - -# Pfam data (not needed for phylo_pl.pl): -# -------------------------------------- -our $PFAM_FULL_DIRECTORY = "/path/to/Pfam/Full/"; -our $PFAM_SEED_DIRECTORY = "/path/to/Pfam/Seed/"; -our $PFAM_HMM_DB = "/path/to/Pfam/Pfam_ls"; # Need to run "hmmindex" on this - # to produce .ssi file. - # Then, for example - # "setenv HMMERDB /home/rio/pfam-6.6/" - - -$PATH_TO_FORESTER = &addSlashAtEndIfNotPresent( $PATH_TO_FORESTER ); - - -# Description lines and species from SWISS-PROT and TrEMBL (not needed for phylo_pl.pl): -# ------------------------------------------------------------------------------------- -our $TREMBL_ACDEOS_FILE = $PATH_TO_FORESTER."data/trembl22_ACDEOS_1-6"; - -our $SWISSPROT_ACDEOS_FILE = $PATH_TO_FORESTER."data/sp40_ACDEOS_1-6"; - - - -# Names of species which can be analyzed and analyzed -# against (must also be in tree $SPECIES_TREE_FILE_DEFAULT). -# By using a list with less species, RIO analyses become faster -# but lose phylogenetic resolution. -# For many purposes, list "tree_of_life_bin_1-6_species_list" -# in "data/species/" might be sufficient: -# (not needed for phylo_pl.pl) -# -------------------------------------------------------------- -our $SPECIES_NAMES_FILE = $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-6_species_list"; - - - -# A default species tree in NHX format. -# For many purposes, tree "tree_of_life_bin_1-6.nhx" -# in "data/species/" might be fine: -# (not needed for phylo_pl.pl) -# -------------------------------------------------- -our $SPECIES_TREE_FILE_DEFAULT = $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-6.nhx"; - - - -# Data for using precalculated distances: -# (not needed for phylo_pl.pl) -# --------------------------------------- -our $MATRIX_FOR_PWD = 2; # The matrix which has been used for the pwd in $RIO_PWD_DIRECTORY. - # 0=JTT, 1=PAM, 2=BLOSUM 62, 3=mtREV24, 5=VT, 6=WAG. - -our $RIO_PWD_DIRECTORY = $PATH_TO_FORESTER."example_data/"; # all must end with "/" -our $RIO_BSP_DIRECTORY = $PATH_TO_FORESTER."example_data/"; -our $RIO_NBD_DIRECTORY = $PATH_TO_FORESTER."example_data/"; -our $RIO_ALN_DIRECTORY = $PATH_TO_FORESTER."example_data/"; -our $RIO_HMM_DIRECTORY = $PATH_TO_FORESTER."example_data/"; - - - -# -# End of variables which need to be set by the user. -# -# ============================================================================= -# ============================================================================= - - - - - -$TEMP_DIR_DEFAULT = &addSlashAtEndIfNotPresent( $TEMP_DIR_DEFAULT ); -$PFAM_FULL_DIRECTORY = &addSlashAtEndIfNotPresent( $PFAM_FULL_DIRECTORY ); -$PFAM_SEED_DIRECTORY = &addSlashAtEndIfNotPresent( $PFAM_SEED_DIRECTORY ); - - - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# These variables should normally not be changed: -# - -our $PRIOR_FILE_DIR = $PATH_TO_FORESTER."data/priors_for_hmmbuild/"; - # Directory containing dirichlet prior - # files needed for certain aligments - # by hmmbuild (e.g. Collagen). - - - - - -# TREE-PUZZLE: -our $PUZZLE_DQO = $PATH_TO_FORESTER."puzzle_dqo/src/puzzle"; - -# HMMER: -our $HMMALIGN = $PATH_TO_FORESTER."hmmer/binaries/hmmalign"; -our $HMMSEARCH = $PATH_TO_FORESTER."hmmer/binaries/hmmsearch"; -our $HMMBUILD = $PATH_TO_FORESTER."hmmer/binaries/hmmbuild"; -our $HMMFETCH = $PATH_TO_FORESTER."hmmer/binaries/hmmfetch"; -our $SFE = $PATH_TO_FORESTER."hmmer/binaries/sfetch"; -our $HMMCALIBRATE = $PATH_TO_FORESTER."hmmer/binaries/hmmcalibrate"; - -our $P7EXTRACT = $PATH_TO_FORESTER."perl/p7extract.pl"; -our $MULTIFETCH = $PATH_TO_FORESTER."perl/multifetch.pl"; - - -# RIO/FORESTER: -our $BOOTSTRAP_CZ = $PATH_TO_FORESTER."C/bootstrap_cz"; -our $BOOTSTRAP_CZ_PL = $PATH_TO_FORESTER."perl/bootstrap_cz.pl"; -#our $SUPPORT_TRANSFER = $JAVA." -cp $PATH_TO_FORESTER"."java forester.tools.transfersBranchLenghts"; -#our $SUPPORT_TRANSFER = $JAVA." -cp /home/czmasek/SOFTWARE/FORESTER/forester3/forester.jar org.forester.tools.SupportTransfer"; - -our $PHYLO_PL = $PATH_TO_FORESTER."perl/phylo_pl.pl"; -our $RIO_PL = $PATH_TO_FORESTER."perl/rio.pl"; -our $DORIO = $JAVA." -cp $PATH_TO_FORESTER"."java forester.tools.DoRIO"; -# parallel RIO: -our $RIO_SLAVE_DRIVER = $PATH_TO_FORESTER."perl/rio_slave_driver.pl"; -our $RIO_SLAVE = $PATH_TO_FORESTER."perl/rio_slave.pl"; -our $NODE_LIST = $PATH_TO_FORESTER."data/node_list.dat"; - -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - -our $BOOTSTRAPS = 100; -our $MIN_NUMBER_OF_AA = 20; # After removal of gaps, if less, gaps are not removed. -our $LENGTH_OF_NAME = 10; - - - - -our $MULTIPLE_TREES_FILE_SUFFIX = ".mlt"; -our $LOG_FILE_SUFFIX = ".log"; -our $ALIGN_FILE_SUFFIX = ".aln"; -our $TREE_FILE_SUFFIX = ".nhx"; -our $ADDITION_FOR_RIO_ANNOT_TREE = ".rio"; -our $SUFFIX_PWD = ".pwd"; -our $MULTIPLE_PWD_FILE_SUFFIX = ".mpwd"; -our $SUFFIX_BOOT_STRP_POS = ".bsp"; -our $SUFFIX_PWD_NOT_BOOTS = ".nbd"; -our $SUFFIX_HMM = ".hmm"; - -our $EXPASY_SPROT_SEARCH_DE = "http://www.expasy.org/cgi-bin/sprot-search-de?"; -our $EXPASY_SPROT_SEARCH_AC = "http://www.expasy.org/cgi-bin/sprot-search-ac?"; - - - -# One argument: input multiple trees file -# Last modified: 07/05/01 -sub executeConsense { - my $in = $_[ 0 ]; - - &testForTextFilePresence( $in ); - - system( "$CONSENSE >/dev/null 2>&1 << ! -$in -Y -!" ) - && &dieWithUnexpectedError( "Could not execute \"$CONSENSE $in\"" ); - - return; -} - - - -# Four arguments: -# 1. options ("-" is not necessary) -# 2. alignment or pwd file -# 3. outfile -# 4. temp dir -# Last modified: 07/05/01 -sub executePhyloPl { - - my $opts = $_[ 0 ]; - my $B = $_[ 1 ]; - my $C = $_[ 2 ]; - my $D = $_[ 3 ]; - - &testForTextFilePresence( $B ); - - $opts = "-".$opts; - - system( "$PHYLO_PL $opts $B $C $D" ) - && &dieWithUnexpectedError( "Could not execute \"$PHYLO_PL $opts $B $C $D\"" ); - -} ## executePhyloPl - - - - -# Two arguments: -# 1. Name of inputfile -# 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; -# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise -sub executePuzzleDQO { - my $in = $_[ 0 ]; - my $matrix_option = $_[ 1 ]; - my $mat = ""; - - &testForTextFilePresence( $in ); - - $mat = setModelForPuzzle( $matrix_option ); - - system( "$PUZZLE_DQO $in >/dev/null 2>&1 << !$mat -y -!" ) - && &dieWithUnexpectedError( "Could not execute \"$PUZZLE_DQO\"" ); - - return; - -} ## executePuzzleDQO - - - - -# Two arguments: -# 1. Name of inputfile -# 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; -# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise -# Last modified: 01/28/02 -sub executePuzzleDQObootstrapped { - my $in = $_[ 0 ]; - my $matrix_option = $_[ 1 ]; - - - my $l = 0; - my $slen = 0; - my $counter = 0; - my $mat = ""; - my $a = ""; - my @a = (); - - &testForTextFilePresence( $in ); - - open( GRP, "<$in" ) || &dieWithUnexpectedError( "Cannot open file \"$in\"" ); - while( ) { - if ( $_ =~ /^\s*\d+\s+\d+\s*$/ ) { - $counter++; - } - } - close( GRP ); - - $l = `cat $in | wc -l`; - $slen = $l / $counter; - - system( "split -$slen $in $in.splt." ) - && &dieWithUnexpectedError( "Could not execute \"split -$slen $in $in.splt.\"" ); - - @a = <$in.splt.*>; - - $mat = setModelForPuzzle( $matrix_option ); - - foreach $a ( @a ) { - - system( "$PUZZLE_DQO $a >/dev/null 2>&1 << !$mat -y -!" ) - && &dieWithUnexpectedError( "Could not execute \"$PUZZLE_DQO $a\"" ); - - system( "cat $a.dist >> $in.dist" ) - && &dieWithUnexpectedError( "Could not execute \"cat outdist >> $in.dist\"" ); - - unlink( $a, $a.".dist" ); - } - - return; - -} ## executePuzzleDQObootstrapped - - - -# Transfers a Pfam (SELEX) alignment to a -# PHYLIP sequential style alignment. -# It only writes "match columns" as indicated by the -# "# RF" line ('x' means match). -# -# Three arguments: -# 1. infile name -# 2. outfile name -# 3. 1 to NOT ensure that match states contain only 'A'-'Z' or '-' -# -# Returns the number of match states (=length of output alignment), -# the length of the input alignment, -# the number of seqs in the input alignment -# -# Last modified: 07/07/01 -# -sub pfam2phylipMatchOnly { - - my $infile = $_[ 0 ]; - my $outfile = $_[ 1 ]; - my $ne = $_[ 2 ]; - my @seq_name = (); - my @seq_array = (); - my $return_line = ""; - my $seq = ""; - my $x = 0; - my $y = 0; - my $i = 0; - my $x_offset = 0; - my $max_x = 0; - my $rf_y = 0; - my $number_colum = 0; - my $not_ensure = 0; - my $saw_rf_line = 0; - - if ( $ne && $ne == 1 ) { - $not_ensure = 1; - } - - &testForTextFilePresence( $infile ); - - open( INPP, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" ); - - # This reads in the first block. It reads in the seq names. - while ( 1 ) { - if ( &isPfamSequenceLine( $return_line ) ) { - $return_line =~ /^(\S+)\s+(\S+)/; - $seq_name[ $y ] = substr( $1, 0, $LENGTH_OF_NAME ); - $seq = $2; - for ( $x = 0; $x < length( $seq ); $x++ ) { - $seq_array[ $x ][ $y ] = substr( $seq, $x, 1 ); - } - $y++; - } - elsif ( &isRFline( $return_line ) ) { - $saw_rf_line = 1; - $return_line =~ /\s+(\S+)\s*$/; - $seq = $1; - $x_offset = length( $seq ); - $rf_y = $y; - for ( $x = 0; $x < $x_offset; $x++ ) { - $seq_array[ $x ][ $rf_y ] = substr( $seq, $x, 1 ); - } - last; - } - - $return_line = ; - - if ( !$return_line ) { - &dieWithUnexpectedError( "Alignment not in expected format (no RF line)" ); - } - } - - if ( $saw_rf_line != 1 ) { - &dieWithUnexpectedError( "Alignment not in expected format (no RF line)" ); - } - - $y = 0; - $max_x = 0; - - # This reads all blocks after the 1st one. - while ( $return_line = ) { - if ( &isPfamSequenceLine( $return_line ) ) { - $return_line =~ /^\S+\s+(\S+)/; - $seq = $1; - for ( $x = 0; $x < length( $seq ); $x++ ) { - $seq_array[ $x + $x_offset ][ $y % $rf_y ] = substr( $seq, $x, 1 ); - } - $y++; - } - elsif ( &isRFline( $return_line ) ) { - if ( $y != $rf_y ) { - &dieWithUnexpectedError( "Alignment not in expected format" ); - } - - $return_line =~ /\s+(\S+)\s*$/; - $seq = $1; - $max_x = length( $seq ); - - for ( $x = 0; $x < length( $seq ); $x++ ) { - $seq_array[ $x + $x_offset ][ $rf_y ] = substr( $seq, $x, 1 ); - } - - $y = 0; - $x_offset = $x_offset + $max_x; - $max_x = 0; - } - } - - close( INPP ); - - # Counts the match states, and hence the number of aa in the alignment: - for ( $x = 0; $x < $x_offset; $x++ ) { - if ( !$seq_array[ $x ][ $rf_y ] ) { - &dieWithUnexpectedError( "Alignment not in expected format" ); - } - if ( $seq_array[ $x ][ $rf_y ] eq 'x' ) { - $number_colum++; - } - } - - # Writes the file: - - open( OUTPP, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" ); - print OUTPP "$rf_y $number_colum\n"; - for ( $y = 0; $y < $rf_y; $y++ ) { - print OUTPP "$seq_name[ $y ]"; - for ( $i = 0; $i < ( $LENGTH_OF_NAME - length( $seq_name[ $y ] ) ); $i++ ) { - print OUTPP " "; - } - for ( $x = 0; $x < $x_offset; $x++ ) { - if ( $seq_array[ $x ][ $rf_y ] eq 'x' ) { - if ( !$seq_array[ $x ][ $y ] ) { - &dieWithUnexpectedError( "Alignment not in expected format" ); - } - if ( $not_ensure != 1 && $seq_array[ $x ][ $y ] !~ /[A-Z]|-/ ) { - &dieWithUnexpectedError( "Alignment not in expected format (match states must only contain 'A'-'Z' or '-')" ); - } - print OUTPP "$seq_array[ $x ][ $y ]"; - } - } - print OUTPP "\n"; - } - close( OUTPP ); - - return $number_colum, $x_offset, $rf_y; - -} ## pfam2phylipMatchOnly - - - -# Returns whether the argument (a String) -# starts with a SWISS-PROT name (SEQN_SPECI). -# Last modified: 06/21/01 -sub startsWithSWISS_PROTname { - return ( $_[ 0 ] =~ /^[A-Z0-9]{1,4}_[A-Z0-9]{1,5}/ ); -} - - - -# Returns whether the argument starts with XXX.. XXXXX.. and the first -# character is not a "#". -# Last modified: 06/21/01 -sub isPfamSequenceLine { - return( !&isPfamCommentLine( $_[ 0 ] ) - && &containsPfamNamedSequence( $_[ 0 ] ) ); -} - - - -# Returns whether the argument does start with a "#". -# Last modified: 06/21/01 -sub isPfamCommentLine { - return ( $_[ 0 ] =~ /^#/ ); -} - - - -# Returns whether the argument starts with XXX XXXXX. -# Last modified: 06/21/01 -sub containsPfamNamedSequence { - return ( $_[ 0 ] =~ /^\S+\s+\S+/ ); -} - - -# Returns whether the argument starts with XXX XXXXX. -# Last modified: 06/21/01 -sub isRFline { - return ( $_[ 0 ] =~ /^#.*RF/ ); -} - - - -# Three arguments: -# 1. pairwise distance file -# 2. number of bootstraps -# 3. initial tree: BME, GME or NJ -# Last modified: 2008/12/31 -sub executeFastme { - my $inpwd = $_[ 0 ]; - my $bs = $_[ 1 ]; - my $init_opt = $_[ 2 ]; - - &testForTextFilePresence( $inpwd ); - my $command = ""; - if ( $bs > 0 ) { - $command = "$FASTME -b $init_opt -i $inpwd -n $bs -s b"; - } - else { - $command = "$FASTME -b $init_opt -i $inpwd -s b"; - } - print $command; - - system( $command ); - - -} ## executeFastme - - -# Four arguments: -# 1. pairwise distance file -# 2. number of bootstraps -# 3. seed for random number generator -# 4. lower-triangular data matrix? 1: yes; no, otherwise -sub executeNeighbor { - my $inpwd = $_[ 0 ]; - my $bs = $_[ 1 ]; - my $s = $_[ 2 ]; - my $l = $_[ 3 ]; - my $multi = ""; - my $lower = ""; - - &testForTextFilePresence( $inpwd ); - - if ( $bs >= 2 ) { - $multi = " -M -$bs -$s"; - } - if ( $l == 1 ) { - $lower = " -L"; - } - - system( "$NEIGHBOR >/dev/null 2>&1 << ! -$inpwd$multi$lower -2 -3 -Y -!" ) - && &dieWithUnexpectedError( "Could not execute \"$NEIGHBOR $inpwd$multi$lower\"" ); - -} ## executeNeighbor - - -# Seven arguments: -# 1. pairwise distance file -# 2. number of bootstraps -# 3. seed for random number generator -# 4. number of jumbles for input order -# 5. lower-triangular data matrix? 1: yes; no, otherwise -# 6. FM for Fitch-Margoliash, ME for ME# 6. -# 7. 1 to use globale rearragements -sub executeFitch { - my $inpwd = $_[ 0 ]; - my $bs = $_[ 1 ]; - my $s = $_[ 2 ]; - my $j = $_[ 3 ]; - my $l = $_[ 4 ]; - my $m = $_[ 5 ]; - my $use_global_rearr = $_[ 6 ]; - my $jumble = ""; - my $multi = ""; - my $lower = ""; - my $method = ""; - - my $global = ""; - if ( $use_global_rearr == 1 ) { - $global = " -G"; - } - - &testForTextFilePresence( $inpwd ); - - if ( $m eq "FM" ) { - $method = ""; - } - elsif ( $m eq "ME" ) { - $method = " -D"; - } - else { - &dieWithUnexpectedError( "method for FITCH must be either FM or ME" ); - } - - if ( $j >= 1 ) { - $jumble = " -J -$s -$j"; - } - - if ( $bs >= 2 ) { - $multi = " -M -$bs -$s"; - } - if ( $l == 1 ) { - $lower = " -L"; - } - - # jumble must be set BEFORE multi! - system( "$FITCH 2>&1 << ! -$inpwd$method$global$jumble$multi$lower -3 -Y -!" ) - && &dieWithUnexpectedError( "Could not execute \"$FITCH $inpwd$method$global$jumble$multi$lower\"" ); - # 3: Do NOT print out tree - -} ## executeFitch - - - -# Two arguments: -# 1. pairwise distance file -# 2. outfile -sub executeBionj { - my $inpwd = $_[ 0 ]; - my $out = $_[ 1 ]; - - &testForTextFilePresence( $inpwd ); - my $command = "$BIONJ $inpwd $out"; - - system( $command ) - && &dieWithUnexpectedError( $command ); - -} - -# Four arguments: -# 1. (effective) sequence length -# 2. (effective) number of bases -# 3. pairwise distance file -# 4. outfile -sub executeWeighbor { - my $L = $_[ 0 ]; - my $b = $_[ 1 ]; - my $i = $_[ 2 ]; - my $o = $_[ 3 ]; - - &testForTextFilePresence( $i ); - my $command = "$WEIGHBOR -L $L -b $b -i $i -o $o"; - - system( $command ) - && &dieWithUnexpectedError( $command ); - -} - -# Six arguments: -# 1. DNA or Amino-Acids sequence filename (PHYLIP format) -# 2. number of data sets to analyse (ex:3) -# 3. Model: JTT | MtREV | Dayhoff | WAG | VT | DCMut | Blosum62 (Amino-Acids) -# 4. number of relative substitution rate categories (ex:4), positive integer -# 5. starting tree filename (Newick format), your tree filename | BIONJ for a distance-based tree -# 6. 1 to estimate proportion of invariable sites, otherwise, fixed proportion "0.0" is used -# PHYML produces several results files : -# _phyml_lk.txt : likelihood value(s) -# _phyml_tree.txt : inferred tree(s) -# _phyml_stat.txt : detailed execution stats -sub executePhyml { - my $sequences = $_[ 0 ]; - my $data_sets = $_[ 1 ]; - my $model = $_[ 2 ]; - my $nb_categ = $_[ 3 ]; - my $tree = $_[ 4 ]; - my $estimate_invar_sites = $_[ 5 ]; - - if ( $data_sets < 1 ) { - $data_sets = 1 - } - - my $invar = "0.0"; # proportion of invariable sites, - # a fixed value (ex:0.0) | e to get the maximum likelihood estimate - if ( $estimate_invar_sites == 1 ) { - $invar = "e"; - } - - my $data_type = "1"; # 0 = DNA | 1 = Amino-Acids - my $format = "i"; # i = interleaved sequence format | s = sequential - my $bootstrap_sets = "0"; # number of bootstrap data sets to generate (ex:2) - # only works with one data set to analyse - - my $alpha = "e"; # gamma distribution parameter, - # a fixed value (ex:1.0) | e to get the maximum likelihood estimate - - my $opt_topology = "y"; # optimise tree topology ? y | n - my $opt_lengths = "y"; # optimise branch lengths and rate parameters ? y | n - - if ( $data_sets > 1 ) { - # No need to calc branch lengths for bootstrapped analysis - $opt_lengths = "n"; - } - - &testForTextFilePresence( $sequences ); - my $command = "$PHYML $sequences $data_type $format $data_sets $bootstrap_sets $model $invar $nb_categ $alpha $tree $opt_topology $opt_lengths"; - - print( "\n$command\n"); - - system( $command ) - && &dieWithUnexpectedError( $command ); - -} - - - - -# Four arguments: -# 1. name of alignment file (in correct format!) -# 2. number of bootstraps -# 3. jumbles: 0: do not jumble; >=1 number of jumbles -# 4. seed for random number generator -sub executeProtpars { - my $align = $_[ 0 ]; - my $bs = $_[ 1 ]; - my $rand = $_[ 2 ]; - my $s = $_[ 3 ]; - my $jumble = ""; - my $multi = ""; - - - &testForTextFilePresence( $align ); - - if ( $bs > 1 && $rand < 1 ) { - $rand = 1; - } - - if ( $rand >= 1 ) { - $jumble = " -J -$s -$rand"; - } - - if ( $bs > 1 ) { - $multi = " -M -D -$bs"; - } - - system( "$PROTPARS 2>&1 << ! -$align$jumble$multi -Y -!" ) - && &dieWithUnexpectedError( "Could not execute \"$PROTPARS $align$jumble$multi\"" ); - # 3: Do NOT print out tree - - - return; - -} ## executeProtpars - - - -# "Model of substitution" order for DQO TREE-PUZZLE 5.0: -# Auto -# m -> Dayhoff (Dayhoff et al. 1978) -# m -> JTT (Jones et al. 1992) -# m -> mtREV24 (Adachi-Hasegawa 1996) -# m -> BLOSUM62 (Henikoff-Henikoff 92) -# m -> VT (Mueller-Vingron 2000) -# m -> WAG (Whelan-Goldman 2000) -# m -> Auto -# One argument: -# matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; -# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise -# Last modified: 07/07/01 -sub setModelForPuzzle { - my $matrix_option = $_[ 0 ]; - my $matr = ""; - - if ( $matrix_option == 0 ) { # JTT - $matr = " -m -m"; - } - elsif ( $matrix_option == 2 ) { # BLOSUM 62 - $matr = " -m -m -m -m"; - } - elsif ( $matrix_option == 3 ) { # mtREV24 - $matr = " -m -m -m"; - } - elsif ( $matrix_option == 5 ) { # VT - $matr = " -m -m -m -m -m"; - } - elsif ( $matrix_option == 6 ) { # WAG - $matr = " -m -m -m -m -m -m"; - } - elsif ( $matrix_option == 7 ) { # auto - $matr = ""; - } - else { # PAM - $matr = " -m" - } - - return $matr; - -} ## setModelForPuzzle - -# One argument: -# Model of rate heterogeneity: -# 1 for "8 Gamma distributed rates" -# 2 for "Two rates (1 invariable + 1 variable)" -# 3 for "Mixed (1 invariable + 8 Gamma rates)" -# otherwise: Uniform rate -# Last modified: 09/08/03 -sub setRateHeterogeneityOptionForPuzzle { - my $rate_heterogeneity_option = $_[ 0 ]; - my $opt = ""; - - if ( $rate_heterogeneity_option == 1 ) { - $opt = " -w"; - } - elsif ( $rate_heterogeneity_option == 2 ) { - $opt = " -w -w"; - } - elsif ( $rate_heterogeneity_option == 3 ) { - $opt = " -w -w -w"; - } - else { - $opt = ""; - } - - return $opt; -} ## setRateHeterogeneityOptionForPuzzle - - -# One argument: -# Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise -# Last modified: 09/08/03 -sub setParameterEstimatesOptionForPuzzle { - my $parameter_estimates_option = $_[ 0 ]; - my $opt = ""; - - if ( $parameter_estimates_option == 1 ) { - $opt = " -e"; - } - else { - $opt = ""; - } - - return $opt; -} ## setParameterEstimatesOptionForPuzzle - - - -# three/four/five arguments: -# 1. Name of inputfile -# 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; -# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise -# 3. Number of sequences in alignment -# 4. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise -# 5. Model of rate heterogeneity: -# 1 for "8 Gamma distributed rates" -# 2 for "Two rates (1 invariable + 1 variable)" -# 3 for "Mixed (1 invariable + 8 Gamma rates)" -# otherwise: Uniform rate -sub executePuzzleBootstrapped { - my $in = $_[ 0 ]; - my $matrix_option = $_[ 1 ]; - my $number_of_seqs = $_[ 2 ]; - my $parameter_estimates_option = $_[ 3 ]; - my $rate_heterogeneity_option = $_[ 4 ]; - - my $l = 0; - my $slen = 0; - my $counter = 0; - my $mat = ""; - my $est = ""; - my $rate = ""; - my $a = ""; - my @a = (); - - &testForTextFilePresence( $in ); - - open( GRP, "<$in" ) || die "\n\n$0: Unexpected error: Cannot open file <<$in>>: $!"; - while( ) { - if ( $_ =~ /^\s*\d+\s+\d+\s*$/ ) { - $counter++; - } - } - close( GRP ); - - $l = `cat $in | wc -l`; - $slen = $l / $counter; - - system( "split --suffix-length=4 -$slen $in $in.splt." ) - && die "\n\n$0: executePuzzleDQObootstrapped: Could not execute \"split --suffix-length=4 -$slen $in $in.splt.\": $!"; - - @a = <$in.splt.*>; - - $mat = setModelForPuzzle( $matrix_option ); - if ( $parameter_estimates_option ) { - $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option ); - } - if ( $rate_heterogeneity_option ) { - $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option ); - } - - my $k=""; - if ( $number_of_seqs <= 257 ) { - $k = "k"; - } - - foreach $a ( @a ) { - print "-".$a."\n"; - system( "$PUZZLE $a << ! -$k -k -k$mat$est$rate -y -!" ) - && die "$0: Could not execute \"$PUZZLE $a\""; - - system( "cat $a.dist >> $in.dist" ) - && die "$0: Could not execute \"cat outdist >> $in.dist\""; - - unlink( $a, $a.".dist", $a.".puzzle" ); - } - - return; - -} ## executePuzzleBootstrapped - - - - - -# three/four/five arguments: -# 1. Name of inputfile -# 2. Matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; -# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise -# 3. Number of sequences in alignment -# 4. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise -# 5. Model of rate heterogeneity: -# 1 for "8 Gamma distributed rates" -# 2 for "Two rates (1 invariable + 1 variable)" -# 3 for "Mixed (1 invariable + 8 Gamma rates)" -# otherwise: Uniform rate -sub executePuzzle { - my $in = $_[ 0 ]; - my $matrix_option = $_[ 1 ]; - my $number_of_seqs = $_[ 2 ]; - my $parameter_estimates_option = $_[ 3 ]; - my $rate_heterogeneity_option = $_[ 4 ]; - my $mat = ""; - my $est = ""; - my $rate = ""; - - &testForTextFilePresence( $in ); - - $mat = &setModelForPuzzle( $matrix_option ); - if ( $parameter_estimates_option ) { - $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option ); - } - if ( $rate_heterogeneity_option ) { - $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option ); - } - - my $k=""; - if ( $number_of_seqs <= 257 ) { - $k = "k"; - } - - - system( "$PUZZLE $in << ! -$k -k -k$mat$est$rate -y -!" ) - && die "$0: Could not execute \"$PUZZLE\""; - - return; - -} ## executePuzzle - - - - -# Preparation of the pwd file -sub addDistsToQueryToPWDfile { - my $pwd_file = $_[ 0 ]; - my $disttoquery_file = $_[ 1 ]; - my $outfile = $_[ 2 ]; - my $name_of_query = $_[ 3 ]; - my $name_of_query_ = ""; - my $return_line_pwd = ""; - my $return_line_dq = ""; - my $num_of_sqs = 0; - my $block = 0; - my $name_from_pwd = "X"; - my $name_from_dq = "Y"; - my @dists_to_query = (); - my $i = 0; - - &testForTextFilePresence( $pwd_file ); - &testForTextFilePresence( $disttoquery_file ); - - $name_of_query_ = $name_of_query; - for ( my $j = 0; $j <= ( $LENGTH_OF_NAME - length( $name_of_query ) - 1 ); ++$j ) { - $name_of_query_ .= " "; - } - - open( OUT_AD, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" ); - open( IN_PWD, "$pwd_file" ) || &dieWithUnexpectedError( "Cannot open file \"$pwd_file\"" ); - open( IN_DQ, "$disttoquery_file" ) || &dieWithUnexpectedError( "Cannot open file \"$disttoquery_file\"" ); - - W: while ( $return_line_pwd = ) { - - - if ( $return_line_pwd =~ /^\s*(\d+)\s*$/ ) { - $num_of_sqs = $1; - $num_of_sqs++; - if ( $block > 0 ) { - print OUT_AD "$name_of_query_ "; - for ( my $j = 0; $j < $i; ++$j ) { - print OUT_AD "$dists_to_query[ $j ] "; - } - print OUT_AD "0.0\n"; - } - print OUT_AD " $num_of_sqs\n"; - $block++; - @dists_to_query = (); - $i = 0; - } - - if ( $block == 1 - && $return_line_pwd =~ /^\s*(\S+)\s+\S+/ ) { - $name_from_pwd = $1; - - if ( !defined( $return_line_dq = ) ) { - &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" ); - } - - if ( $return_line_dq !~ /\S/ ) { - if ( !defined( $return_line_dq = ) ) { - &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" ); - } - } - $return_line_dq =~ /^\s*(\S+)\s+(\S+)/; - $name_from_dq = $1; - $dists_to_query[ $i++ ] = $2; - - - if ( $name_from_pwd ne $name_from_dq ) { - &dieWithUnexpectedError( "Order of sequence names in \"$pwd_file\" and \"$disttoquery_file\" is not the same" ); - } - print OUT_AD $return_line_pwd; - - } - elsif ( $block > 1 - && $return_line_pwd =~ /^\s*(\S+)\s+\S+/ ) { - $name_from_pwd = $1; - if ( !defined( $return_line_dq = ) ) { - &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" ); - } - if ( $return_line_dq !~ /\S/ ) { - if ( !defined( $return_line_dq = ) ) { - &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" ); - } - } - $return_line_dq =~ /^\s*\S+\s+(\S+)/; - $dists_to_query[ $i++ ] = $1; - print OUT_AD $return_line_pwd; - } - } - print OUT_AD "$name_of_query_ "; - for ( my $j = 0; $j < $i; ++$j ) { - print OUT_AD "$dists_to_query[ $j ] "; - } - print OUT_AD "0.0\n"; - - close( OUT_AD ); - close( IN_PWD ); - close( IN_DQ ); - return $block; - -} ## addDistsToQueryToPWDfile - - - - -# Three arguments: -# 1. HMMER model db -# 2. name of HMM -# 3. outputfile name -# Last modified: 02/27/01 -sub executeHmmfetch { - - my $db = $_[ 0 ]; - my $name = $_[ 1 ]; - my $outfile = $_[ 2 ]; - - system( "$HMMFETCH $db $name > $outfile" ) - && &dieWithUnexpectedError( "Could not execute \"$HMMFETCH $db $name > $outfile\"" ); - return; - -} ## executeHmmfetch - - - -# Checks wether a file is present, not empty and a plain textfile. -# One argument: name of file. -# Last modified: 07/07/01 -sub testForTextFilePresence { - my $file = $_[ 0 ]; - unless ( ( -s $file ) && ( -f $file ) && ( -T $file ) ) { - dieWithUnexpectedError( "File \"$file\" does not exist, is empty, or is not a plain textfile" ); - } -} ## testForTextFilePresence - - -# Last modified: 02/21/03 -sub addSlashAtEndIfNotPresent { - my $filename = $_[ 0 ]; - $filename =~ s/\s+//g; - unless ( $filename =~ /\/$/ ) { - $filename = $filename."/"; - } - return $filename; -} ## addSlashAtEndIfNotPresent - - - -# Last modified: 02/15/02 -sub exitWithWarning { - - my $text = $_[ 0 ]; - if ( defined( $_[ 1 ] ) && $_[ 1 ] == 1 ) { - print( "

user error

\n" ); - print( "

\n" ); - print( "$text\n" ); - print( "

\n" ); - print( "

 

\n" ); - } - else { - print( "\n\n$text\n\n" ); - } - - exit( 0 ); - -} ## exit_with_warning - - - -# Last modified: 02/15/02 -sub dieWithUnexpectedError { - - my $text = $_[ 0 ]; - - die( "\n\n$0:\nUnexpected error (should not have happened):\n$text\n$!\n\n" ); - -} ## dieWithUnexpectedError - - - -1; diff --git a/forester/archive/perl/pf_cutoff_extract.pl b/forester/archive/perl/pf_cutoff_extract.pl deleted file mode 100755 index 1a56d9c..0000000 --- a/forester/archive/perl/pf_cutoff_extract.pl +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/perl -W - -# $Id: pf_cutoff_extract.pl,v 1.4 2009/11/11 02:28:19 cmzmasek Exp $ - -# This extracts GA, TC, or NC score cutoff values from -# Pfam HMM files (GA1, TC1, NC1) -# Copyright (C) 2008-2009 Christian M. Zmasek -# All rights reserved -# Created 2007-08-01 in Winterthur, Switzerland by CMZ - -# Usage: pf_cutoff_extract.pl - -use strict; - -if ( scalar( @ARGV ) != 3 ) { - print "\npf_cutoff_extract.pl \n\n"; - exit( -1 ); -} - -my $infile = $ARGV[ 0 ]; -my $cutoff_type = uc( $ARGV[ 1 ] ); -my $outfile = $ARGV[ 2 ]; - -my $GA = "GA"; -my $TC = "TC"; -my $NC = "NC"; - -if ( -e $outfile ) { - die "\n$0: \"$outfile\" already exists.\n\n"; -} -unless( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { - die "\n$0: cannot read from \"$infile\".\n\n"; -} -unless( $cutoff_type eq $GA || $cutoff_type eq $TC || $cutoff_type eq $NC ) { - die "\n$0: illegal value \"$cutoff_type\" for cutoff type.\n\n"; -} - -open( IN, "$infile" ) || die "\n$0: Cannot open file \"$infile\": $!\n"; -open( OUT, ">$outfile" ) || die "\n$0: Cannot create file \"$outfile\": $!\n"; - -my $line = ""; -my $name = ""; -my $line_number = 0; -my $n = 0; - -while ( $line = ) { - $line_number++; - if ( $line =~ /^NAME\s+(.+)/ ) { - if ( length( $name ) > 0 ) { - die "\n$0: Unexpected line $line at line $line_number: $!\n"; - } - $name = $1; - } - elsif ( $line =~ /^$cutoff_type\s+(\S+)\s+[^;]+/ ) { - if ( length( $name ) < 1 ) { - die "\n$0: Unexpected line $line at line $line_number: $!\n"; - } - $n++; - print OUT "$name $1\n"; - $name = ""; - } - elsif ( $line =~ /\/\// ) { - $name = ""; - } -} - -close( OUT ) || die "\n$0: Cannot close file \"$outfile\": $!\n";; - -print( "\nExtracted $n $cutoff_type" . "1 values to \"$outfile\"\n" ); -print( "\nOK\n" ); - -exit( 0 ); - diff --git a/forester/archive/perl/phylo_pl.pl b/forester/archive/perl/phylo_pl.pl deleted file mode 100755 index b6b40bd..0000000 --- a/forester/archive/perl/phylo_pl.pl +++ /dev/null @@ -1,1735 +0,0 @@ -#!/usr/bin/perl -W -# -# $Id: phylo_pl.pl,v 1.32 2010/12/13 19:00:22 cmzmasek Exp $ -# -# FORESTER -- software libraries and applications -# for evolutionary biology research and applications. -# -# Copyright (C) 2008-2009 Christian M. Zmasek -# Copyright (C) 2008-2009 Burnham Institute for Medical Research -# All rights reserved -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA -# -# Contact: phylosoft @ gmail . com -# WWW: www.phylosoft.org/forester -# -# -# -# Requirements phylo_pl is part of the FORESTER libraries. -# ------------ Many of its global variables are set via forester.pm. -# -# -# Note. Use xt.pl (for Pfam alignments) or mt.pl (for other alignments) -# to run phylo_pl.pl on whole directories of alignments files. -# -# -# -# -# ========================= -# -# METHOD ORDER (IMPORTANT!) -# 1. FastME -# 2. phylip NJ -# 3. phylip fitch FM -# 4. phylip fitch ME -# 5. BIONJ -# 6. Weighbor -# 7. Raxml -# 8. phyml -# 9. phylip proml -# 10. phylip protpars -# 11. all -#========================== - -use strict; - -use FindBin; -use lib $FindBin::Bin; -use forester; - -my $VERSION = "1.0.1"; -my $LAST_MODIFIED = "2009.10.02"; - -my $RAXML_MODEL_BASE = "PROTGAMMA"; -my $RAXML_ALGORITHM = "a"; - -my $TEMP_DIR_DEFAULT = "/tmp/phylo_pl_"; # Where all the infiles, outfiles, etc will be created. - -my $bootstraps = 100; # 0,1: do not bootstrap. Default: 100 -my $matrix = 5; # 0 = JTT - # 1 = PAM - # 2 = BLOSUM 62 - # 3 = mtREV24 - # 5 = VT - default - # 6 = WAG - # 7 = auto in puzzle - # 8 = DCMut in PHYML, VT in TREE-PUZZLE -my $rate_heterogeneity = 0; # 0 = Uniform rate (default) - # 1 = 8 Gamma distributed rates - # 2 = Two rates (1 invariable + 1 variable) - # 3 = Mixed (1 invariable + 8 Gamma rates) -my $seed = 9; # Seed for random number generators. Default: 9 -my $keep_multiple_trees = 0; # 0: delete multiple tree file - # 1: do not delete multiple tree file -my $exact_parameter_est = 0; # 0: no; 1: yes - -my $phyml_rel_substitution_rate_cat = 4; - -my $jumbles = 2; -my $use_fastme = 0; # 0: no; 1: yes -my $use_phylip_nj = 0; # 0: no; 1: yes -my $use_phylip_fitch_fm = 0; # 0: no; 1: yes -my $use_phylip_fitch_me = 0; # 0: no; 1: yes -my $use_bionj = 0; # 0: no; 1: yes -my $use_weighbor = 0; # 0: no; 1: yes -my $use_raxml = 0; # 0: no; 1: yes -my $use_phyml = 0; # 0: no; 1: yes -my $use_proml = 0; # 0: no; 1: yes -my $use_protpars = 0; # 0: no; 1: yes -my $use_global_rearr = 0; # 0: no; 1: yes -my $estimate_invar_sites = 0; # 0: no; 1: yes - -my $fastme_init_tree_opt = "NJ"; - -my %seqnames = (); # number => seqname -my %numbers = (); # seqname => number -my $options = ""; -my $infile = ""; -my $pwdfile = ""; -my $outfile = ""; -my $logfile = ""; -my $multipwdfile = ""; -my $distancefile = ""; -my $log = ""; -my $ii = 0; -my $temp_dir = ""; -my $current_dir = ""; -my @out = (); -my $number_of_seqs = 0; -my $number_of_aa = 0; - -my $use_pwd_based_methods = 0; - -print( "\n"); -print( "phylo_pl $VERSION ($LAST_MODIFIED)\n" ); -print( "__________________________________\n"); -print( "\n\n"); - - - -unless ( @ARGV == 2 || @ARGV == 3 || @ARGV == 4 || @ARGV == 5 ) { - &printUsage(); - exit ( -1 ); -} - - - -# Analyzes the options: -# --------------------- - -if ( $ARGV[ 0 ] =~ /^-.+/ ) { - - unless ( @ARGV > 2 ) { - &printUsage(); - exit ( -1 ); - } - $options = $ARGV[ 0 ]; - - if ( @ARGV != 3 && @ARGV != 4 ) { - &printUsage(); - exit ( -1 ); - } - $infile = $ARGV[ 1 ]; - $outfile = $ARGV[ 2 ]; - if ( @ARGV == 4 ) { - $temp_dir = $ARGV[ 3 ]; - } - if ( $options =~ /B(\d+)/ ) { - $bootstraps = $1; - if ( $bootstraps <= 1 ) { - $bootstraps = 0; - } - elsif ( $bootstraps <= 9 ) { - $bootstraps = 0; - print "\n\nphylo_pl: WARNING: Bootstrap number must be devisable by 10,\nno bootstrapping.\n\n"; - } - elsif ( $bootstraps % 10 != 0 ) { - $bootstraps = $bootstraps - $bootstraps % 10; # to ensure $bootstraps % 10 == 0 - print "\n\nphylo_pl: WARNING: Bootstrap number must be devisable by 10,\nset to $bootstraps.\n\n"; - } - } - if ( $options =~ /n/ ) { - $use_phylip_nj = 1; - } - if ( $options =~ /q@(\d)/ ) { - $use_fastme = 1; - my $opt = $1; - if ( $opt == 1 ) { - $fastme_init_tree_opt = "GME"; - } - elsif ( $opt == 2 ) { - $fastme_init_tree_opt = "BME"; - } - elsif ( $opt == 3 ) { - $fastme_init_tree_opt = "NJ"; - } - else { - &printUsage(); - exit ( -1 ); - } - } - if ( $options =~ /f/ ) { - $use_phylip_fitch_fm = 1; - } - if ( $options =~ /e/ ) { - $use_phylip_fitch_me = 1; - } - if ( $options =~ /b/ ) { - $use_bionj = 1; - } - if ( $options =~ /w/ ) { - $use_weighbor = 1; - } - if ( $options =~ /x/ ) { - $use_raxml = 1; - } - if ( $options =~ /y/ ) { - $use_phyml = 1; - } - if ( $options =~ /o/ ) { - $use_proml = 1; - } - if ( $options =~ /p/ ) { - $use_protpars = 1; - } - if ( $options =~ /G/ ) { - $use_global_rearr = 1; - } - if ( $options =~ /I/ ) { - $estimate_invar_sites = 1; - } - if ( $options =~ /j(\d+)/ ) { - $jumbles = $1; - if ( $jumbles < 1 ) { - $jumbles = 0; - } - } - if ( $options =~ /r(\d+)/ ) { - $phyml_rel_substitution_rate_cat = $1; - if ( $phyml_rel_substitution_rate_cat < 1 ) { - $phyml_rel_substitution_rate_cat = 1; - } - } - if ( $options =~ /J/ ) { - $matrix = 0; # JTT - } - if ( $options =~ /P/ ) { - $matrix = 1; # PAM - } - if ( $options =~ /L/ ) { - $matrix = 2; # Blosum 62 - } - if ( $options =~ /M/ ) { - $matrix = 3; # mtREV24 - } - if ( $options =~ /W/ ) { - $matrix = 6; # WAG - } - if ( $options =~ /A/ ) { - $matrix = 7; # auto - } - if ( $options =~ /D/ ) { - $matrix = 8; # DCMut in PHYML and RAXML, VT in PUZZLE - } - if ( $options =~ /S(\d+)/ ) { - $seed = $1; - } - if ( $options =~ /X/ ) { - $keep_multiple_trees = 1; - } - if ( $options =~ /E/ ) { - $exact_parameter_est = 1; - } - if ( $options =~ /g/ ) { - $rate_heterogeneity = 1; - } - if ( $options =~ /t/ ) { - $rate_heterogeneity = 2; - } - if ( $options =~ /m/ ) { - $rate_heterogeneity = 3; - } -} -else { - unless ( @ARGV == 2 || @ARGV == 3 ) { - &printUsage(); - exit ( -1 ); - } - $infile = $ARGV[ 0 ]; - $outfile = $ARGV[ 1 ]; - if ( @ARGV == 3 ) { - $temp_dir = $ARGV[ 2 ]; - } -} - -if ( $use_fastme != 1 && - $use_phylip_nj != 1 && - $use_phylip_fitch_fm != 1 && - $use_phylip_fitch_me != 1 && - $use_bionj != 1 && - $use_weighbor != 1 && - $use_raxml != 1 && - $use_phyml != 1 && - $use_proml != 1 && - $use_protpars != 1 ) { - - $use_fastme = 1; - $use_phylip_nj = 1; - $use_phylip_fitch_fm = 1; - $use_phylip_fitch_me = 1; - $use_bionj = 1; - $use_raxml = 1; - $use_weighbor = 1; - $use_phyml = 1; - $use_proml = 1; - $use_protpars = 1; -} - - -if ( $use_fastme == 1 || - $use_phylip_nj == 1 || - $use_phylip_fitch_fm == 1 || - $use_phylip_fitch_me == 1 || - $use_bionj == 1 || - $use_weighbor == 1 ) { - $use_pwd_based_methods = 1; -} -else { - $use_pwd_based_methods = 0; -} - -$current_dir = `pwd`; -$current_dir =~ s/\s//; - -if ( $outfile !~ /^\// ) { - # outfile is not absolute path. - $outfile = $current_dir."/".$outfile; -} - - - -# TREE-PUZZLE sets the option in this way: -# If two rates or mixed, exact parameter estimates are used. -if ( $rate_heterogeneity == 2 -|| $rate_heterogeneity == 3 ) { - $exact_parameter_est = 1 -} - - -if ( $outfile =~ /\.xml$/i ) { - $outfile =~ s/\.xml//i; -} -elsif ( $outfile =~ /\.aln$/i ) { - $outfile =~ s/\.aln//i; -} -elsif ( $outfile =~ /\.fasta$/i ) { - $outfile =~ s/\.fasta//i; -} -elsif ( $outfile =~ /\.fas$/i ) { - $outfile =~ s/\.fas//i; -} -elsif ( $outfile =~ /\.seqs$/i ) { - $outfile =~ s/\.seqs//i; -} - - -$logfile = $outfile.$LOG_FILE_SUFFIX; -$multipwdfile = $outfile.$MULTIPLE_PWD_FILE_SUFFIX; -$distancefile = $outfile.$SUFFIX_PWD_NOT_BOOTS; - -&dieIfFileExists( $logfile ); -&dieIfFileExists( $multipwdfile ); -&dieIfFileExists( $distancefile ); - - -my $fastme_outtree = $outfile."_fme.xml"; -my $phylip_nj_outtree = $outfile."_pnj.xml"; -my $phylip_fm_outtree = $outfile."_pfm.xml"; -my $phylip_me_outtree = $outfile."_pme.xml"; -my $bionj_outtree = $outfile."_bionj.xml"; -my $weighbor_outtree = $outfile."_weigh.xml"; -my $raxml_outtree = $outfile."_raxml.xml"; -my $phyml_outtree = $outfile."_phyml.xml"; -my $proml_outtree = $outfile."_proml.xml"; -my $protpars_outtree = $outfile."_ppp.xml"; -my $all_outtree = $outfile."_comb.xml"; - -my $multitreefile_fastme = $outfile."_fme".$MULTIPLE_TREES_FILE_SUFFIX; -my $multitreefile_phylip_nj = $outfile."_pnj".$MULTIPLE_TREES_FILE_SUFFIX; -my $multitreefile_phylip_fm = $outfile."_pfm".$MULTIPLE_TREES_FILE_SUFFIX; -my $multitreefile_phylip_me = $outfile."_pme".$MULTIPLE_TREES_FILE_SUFFIX; -my $multitreefile_bionj = $outfile."_bionj".$MULTIPLE_TREES_FILE_SUFFIX; -my $multitreefile_weighbor = $outfile."_weigh".$MULTIPLE_TREES_FILE_SUFFIX; -my $multitreefile_raxml = $outfile."_raxml".$MULTIPLE_TREES_FILE_SUFFIX; -my $multitreefile_phyml = $outfile."_phyml".$MULTIPLE_TREES_FILE_SUFFIX; -my $multitreefile_proml = $outfile."_proml".$MULTIPLE_TREES_FILE_SUFFIX; -my $multitreefile_protpars = $outfile."_ppp".$MULTIPLE_TREES_FILE_SUFFIX; - -if ( $use_fastme == 1 ) { - &dieIfFileExists( $fastme_outtree ); - if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { - &dieIfFileExists( $multitreefile_fastme ); - } -} -if( $use_phylip_nj == 1 ) { - &dieIfFileExists( $phylip_nj_outtree ); - if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { - &dieIfFileExists( $multitreefile_phylip_nj ); - } -} -if( $use_phylip_fitch_fm == 1 ) { - &dieIfFileExists( $phylip_fm_outtree ); - if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { - &dieIfFileExists( $multitreefile_phylip_fm ); - } -} -if( $use_phylip_fitch_me == 1 ) { - &dieIfFileExists( $phylip_me_outtree ); - if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { - &dieIfFileExists( $multitreefile_phylip_me ); - } -} -if( $use_bionj == 1 ) { - &dieIfFileExists( $bionj_outtree ); - if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { - &dieIfFileExists( $multitreefile_bionj ); - } -} -if( $use_weighbor == 1 ) { - &dieIfFileExists( $weighbor_outtree ); - if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { - &dieIfFileExists( $multitreefile_weighbor ); - } -} -if( $use_raxml == 1 ) { - &dieIfFileExists( $raxml_outtree ); - if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { - &dieIfFileExists( $multitreefile_raxml ); - } -} -if( $use_phyml == 1 ) { - &dieIfFileExists( $phyml_outtree ); - if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { - &dieIfFileExists( $multitreefile_phyml ); - } -} -if( $use_proml == 1 ) { - &dieIfFileExists( $proml_outtree ); - if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { - &dieIfFileExists( $multitreefile_proml ); - } -} -if( $use_protpars == 1 ) { - &dieIfFileExists( $protpars_outtree ); - if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { - &dieIfFileExists( $multitreefile_protpars ); - } -} -if ( $bootstraps > 1 ) { - &dieIfFileExists( $all_outtree ); -} -if ( $infile ne "" ) { - unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { - die "\n\nphylo_pl: Input alignment file \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n"; - } -} - - - - -# Prints out the options: -# ----------------------- - - -$log = "\n$0 logfile:\n"; -$log = $log."Version: $VERSION\n\n"; - - - -if ( $infile ne "" ) { - $log = $log."Input : $infile\n"; -} - -if ( $keep_multiple_trees == 1 && $bootstraps >= 2 ) { - $log = $log."Multiple distance matrices : $multipwdfile\n"; -} - - -$log = $log."Bootstraps : $bootstraps\n"; - -if ( $use_pwd_based_methods == 1 ) { - $log = $log."Prgrm to calculate pairwise dist. : TREE-PUZZLE (version: $PUZZLE_VERSION)\n"; -} - - -if ( $use_fastme == 1 ) { - $log = $log."Program to calculate tree : FastME (version: $FASTME_VERSION)\n"; - $log = $log."Method for intial tree in FastME : $fastme_init_tree_opt\n"; - $log = $log."Tree swapping (NNI) in FastME : balanced (default)\n"; -} -if ( $use_phylip_nj == 1 ) { - $log = $log."Program to calculate tree : PHYLIP NEIGHBOR NJ (version: $PHYLIP_VERSION)\n"; -} -if ( $use_phylip_fitch_fm == 1 ) { - $log = $log."Program to calculate tree : PHYLIP FITCH Fitch-Margoliash (version: $PHYLIP_VERSION)\n"; -} -if ( $use_phylip_fitch_me == 1 ) { - $log = $log."Program to calculate tree : PHYLIP FITCH Minimal Evolution (version: $PHYLIP_VERSION)\n"; -} -if ( $use_bionj == 1 ) { - $log = $log."Program to calculate tree : BIONJ (version: $BIONJ_VERSION)\n"; -} -if ( $use_weighbor == 1 ) { - $log = $log."Program to calculate tree : Weighbor [no invariable sites, b=14] (version: $WEIGHBOR_VERSION)\n"; -} -if ( $use_raxml == 1 ) { - $log = $log."Program to calculate tree : RAxML [$RAXML_MODEL_BASE] (uses its own bootstraps, if bootstrapped: -f $RAXML_ALGORITHM) (version: $RAXML_VERSION)\n"; -} -if ( $use_phyml == 1 ) { - $log = $log."Program to calculate tree : PHYML (MLE for gamma distr param and proportion of inv sites) (version: $PHYML_VERSION)\n"; - $log = $log."# of rel subst rate categories : $phyml_rel_substitution_rate_cat\n"; -} -if ( $use_proml == 1 ) { - $log = $log."Program to calculate tree : PHYLIP PROML (uses PAM unless JTT selected) (version: $PHYLIP_VERSION)\n"; -} -if ( $use_protpars == 1 ) { - $log = $log."Program to calculate tree : PHYLIP PROTPARS (with global rearrangements) (version: $PHYLIP_VERSION)\n"; -} -if ( $use_phylip_fitch_fm == 1 || $use_phylip_fitch_me == 1 || $use_protpars == 1 || $use_proml ) { - $log = $log."Number of jumbles (input order rand): $jumbles\n"; - -} -if ( $use_phylip_fitch_fm == 1 || $use_phylip_fitch_me == 1 || $use_proml ) { - if ( $use_global_rearr == 1 ) { - $log = $log."Global rearrangements : true\n"; - } - else { - $log = $log."Global rearrangements : false\n"; - - } -} - -if ( $bootstraps > 0 ) { - $log = $log."Prgrm to calculate ML branch lenghts: TREE-PUZZLE (version: $PUZZLE_VERSION)\n"; -} - -$log = $log."Model : "; -if ( $matrix == 0 ) { - $log = $log."JTT (Jones et al. 1992)\n"; -} -elsif ( $matrix == 1 ) { - $log = $log."PAM (Dayhoff et al. 1978)\n"; -} -elsif ( $matrix == 2 ) { - $log = $log."BLOSUM 62 (Henikoff-Henikoff 92)\n"; -} -elsif ( $matrix == 3 ) { - $log = $log."mtREV24 (Adachi-Hasegawa 1996)\n"; -} -elsif ( $matrix == 5 ) { - $log = $log."VT (Mueller-Vingron 2000)\n"; -} -elsif ( $matrix == 6 ) { - $log = $log."WAG (Whelan-Goldman 2000)\n"; -} -elsif ( $matrix == 7 ) { - $log = $log."auto in TREE-PUZZLE\n"; -} -elsif ( $matrix == 8 ) { - $log = $log."DCMut (Kosial and Goldman, 2005) in PHYML and RAxML, VT in TREE-PUZZLE\n"; -} -else { - &dieWithUnexpectedError( "Unknown model: matrix=$matrix" ); -} -if ( $use_raxml == 1 || $use_phyml == 1 ) { - if ( $estimate_invar_sites == 1 ) { - $log = $log."Estimate proportion of invariable sites in RAXML and/or PHYML: true\n"; - } - else { - $log = $log."Estimate proportion of invariable sites in RAXML and/or PHYML: false (proportion \"0.0\" is used in PHYML)\n"; - } -} - -$log = $log."Model of rate heterogeneity (PUZZLE): "; -if ( $rate_heterogeneity == 1 ) { - $log = $log."8 Gamma distributed rates\n"; -} -elsif ( $rate_heterogeneity == 2 ) { - $log = $log."Two rates (1 invariable + 1 variable)\n"; -} -elsif ( $rate_heterogeneity == 3 ) { - $log = $log."Mixed (1 invariable + 8 Gamma rates)\n"; -} -else { - $log = $log."Uniform rate\n"; -} -$log = $log."Seed for random number generators : $seed\n"; -if ( $exact_parameter_est == 1 ) { - $log = $log."Exact parameter estimates in TREE-PUZZLE\n"; -} - -$log = $log."Start time/date : ".`date`; - - - - -# That's where the mischief starts.... -# ------------------------------------ - -$ii = 0; - -srand(); -my $range = 1000000; -my $random_number = int( rand( $range ) ); - -if ( $temp_dir eq "" ) { - $temp_dir = $TEMP_DIR_DEFAULT; -} - -$temp_dir = $temp_dir.$random_number.$ii; - -while ( -e $temp_dir ) { - $ii++; - $temp_dir = $temp_dir.$random_number.$ii; -} - -mkdir( $temp_dir, 0700 ) -|| die "\n\n$0: Could not create <<$temp_dir>>: $!\n\n"; - -unless ( ( -e $temp_dir ) && ( -d $temp_dir ) ) { - die "\n\n$0: <<$temp_dir>> does not exist, or is not a directory.\n\n"; -} - - -&cp( $infile, $temp_dir."/INFILE" ); -unless ( chmod ( 0600, $temp_dir."/INFILE" ) ) { - warn "\n\n$0: Could not chmod. $!\n\n"; -} -$infile = "INFILE"; - -chdir ( $temp_dir ) -|| die "\n\n$0: Could not chdir to <<$temp_dir>>: $!\n\n"; - -&cp( $infile, "infile" ); -@out = &getNumberOfSeqsAndAas( $infile ); -$number_of_seqs = $out[ 0 ]; -$number_of_aa = $out[ 1 ]; - -my $SEQBOOT_OUTFILE = "seqboot_outfile"; - -if ( $bootstraps > 1 && ( $use_pwd_based_methods == 1 - || $use_phyml == 1 - || $use_proml == 1 - || $use_protpars == 1 ) ) { - &executeSeqboot( $seed, $bootstraps ); - &mv( "outfile", $SEQBOOT_OUTFILE ); - &rm( "infile" ); -} - -&cp( $infile, "align" ); - -if ( $use_pwd_based_methods == 1 ) { - # Calculating the pairwise distances (saved in file "infile"): "puzzle" - if ( $bootstraps > 1 ) { - &executePuzzleBootstrapped( $SEQBOOT_OUTFILE, - $matrix, - $number_of_seqs, - $exact_parameter_est, - $rate_heterogeneity ); - - $pwdfile = $SEQBOOT_OUTFILE.".dist"; - } - else { - &executePuzzle( "infile", - $matrix, - $number_of_seqs, - $exact_parameter_est, - $rate_heterogeneity ); - $pwdfile = "infile.dist"; - } -} - -&rm( "infile" ); - -# Methods based on alignment -# -------------------------- -my $OUTTREE_RAXML = "outtree_rax"; -my $OUTTREE_PHYML = "outtree_phyml"; -my $OUTTREE_PROML = "outtree_proml"; -my $OUTTREE_PROTPARS = "outtree_protpars"; - -my $CONSENSUS_RAXML = "consensus_raxml"; -my $CONSENSUS_PHYML = "consensus_phyml"; -my $CONSENSUS_PROML = "consensus_proml"; -my $CONSENSUS_PROTPARS = "consensus_protpars"; - -my $OUTTREES_ALL = "outtrees_all"; -my $all_count = 0; - -if ( $use_raxml == 1 ) { - - my $model = "---"; - if ( $matrix == 0 ) { - $model = "JTT"; - } - elsif ( $matrix == 1 ) { - $model = "DAYHOFF"; - } - elsif ( $matrix == 2 ) { - $model = "BLOSUM62"; - } - elsif ( $matrix == 3 ) { - $model = "MTREV"; - } - elsif ( $matrix == 5 ) { - $model = "VT"; - } - elsif ( $matrix == 6 ) { - $model = "WAG"; - } - elsif ( $matrix == 7 ) { - $model = "VT"; - } - elsif ( $matrix == 8 ) { - $model = "DCMUT"; - } - else { - &dieWithUnexpectedError( "Unknown model: matrix=$matrix" ); - } - - print( "\n========== RAxML begin =========\n\n" ); - # Six arguments: - # 1. DNA or Amino-Acids sequence filename (PHYLIP format) - # 2. Model, eg. PROTGAMMAIVT - # 3. Replicates (bootstrap) - # 4. Seed for bootstrap - # 5. Output suffix - # 6. Algorithm (only for bootstrap, default otherwise) - my $invar = ""; - if ( $estimate_invar_sites == 1 ) { - $invar = "I"; - } - - # NOTE. RaxML does its own bootstrapping. - &executeRaxml( "align", $RAXML_MODEL_BASE.$invar.$model."F", $bootstraps, $seed, "xxx", $RAXML_ALGORITHM ); - print( "\n========== RAxML end =========\n\n" ); - - &rm( "RAxML_log.xxx" ); - &rm( "RAxML_parsimonyTree.xxx" ); - &mv( "RAxML_info.xxx", $outfile."_raxml_info" ); - if ( $bootstraps > 1 ) { - &rm( "RAxML_bestTree.xxx" ); - &mv( "RAxML_bipartitions.xxx", $CONSENSUS_RAXML ); - &append( "RAxML_bootstrap.xxx", $OUTTREES_ALL ); - if ( $keep_multiple_trees == 1 ) { - &mv( "RAxML_bootstrap.xxx", $multitreefile_raxml ); - } - else { - &rm( "RAxML_bootstrap.xxx" ); - } - $all_count++; - } - else { - &mv( "RAxML_result.xxx", $OUTTREE_RAXML ); - } -} - - -if ( $use_phyml == 1 ) { - - my $model = "---"; - if ( $matrix == 0 ) { - $model = "JTT"; - } - elsif ( $matrix == 1 ) { - $model = "Dayhoff"; - } - elsif ( $matrix == 2 ) { - $model = "Blosum62"; - } - elsif ( $matrix == 3 ) { - $model = "MtREV"; - } - elsif ( $matrix == 5 ) { - $model = "VT"; - } - elsif ( $matrix == 6 ) { - $model = "WAG"; - } - elsif ( $matrix == 7 ) { - $model = "VT"; - } - elsif ( $matrix == 8 ) { - $model = "DCMut"; - } - else { - &dieWithUnexpectedError( "Unknown model: matrix=$matrix" ); - } - - my $input = ""; - if ( $bootstraps > 1 ) { - $input = $SEQBOOT_OUTFILE; - } - else { - $input = "align"; - } - print( "\n========== PHYML begin =========\n\n" ); - # Six arguments: - # 1. DNA or Amino-Acids sequence filename (PHYLIP format) - # 2. number of data sets to analyse (ex:3) - # 3. Model: JTT | MtREV | Dayhoff | WAG | VT | DCMut | Blosum62 (Amino-Acids) - # 4. number of relative substitution rate categories (ex:4), positive integer - # 5. starting tree filename (Newick format), your tree filename | BIONJ for a distance-based tree - # 6. 1 to estimate proportion of invariable sites, otherwise, fixed proportion "0.0" is used - # PHYML produces several results files : - # _phyml_lk.txt : likelihood value(s) - # _phyml_tree.txt : inferred tree(s) - # _phyml_stat.txt : detailed execution stats - &executePhyml( $input, $bootstraps, $model, $phyml_rel_substitution_rate_cat, "BIONJ", $estimate_invar_sites ); - print( "\n========== PHYML end =========\n\n" ); - - &rm( $input."_phyml_lk.txt" ); - &mv( $input."_phyml_tree.txt", $OUTTREE_PHYML ); - if ( -e $outfile."_phyml_stat" ) { - &rm( $outfile."_phyml_stat" ); - } - &mv( $input."_phyml_stat.txt", $outfile."_phyml_stat" ); - if ( $bootstraps > 1 ) { - &append( $OUTTREE_PHYML, $OUTTREES_ALL ); - $all_count++; - } - -} - -if ( $use_proml == 1 ) { - my $input = ""; - if ( $bootstraps > 1 ) { - $input = $SEQBOOT_OUTFILE; - } - else { - $input = "align"; - } - print( "\n========== PHYLIP PROML begin =========\n\n" ); - # Five arguments: - # 1. name of alignment file (in correct format!) - # 2. number of bootstraps - # 3. jumbles: 0: do not jumble; >=1 number of jumbles - # 4. seed for random number generator - # 5. 1 for PAM instead of JTT - my $use_pam = 1; - if ( $matrix == 0 ) { - $use_pam = 0; - } - &executeProml( $input, $bootstraps, $jumbles, $seed, $use_pam, $use_global_rearr ); - print( "\n========== PHYLIP PROML end =========\n\n" ); - &mv( "outtree", $OUTTREE_PROML ); - &rm( "outfile" ); - if ( $bootstraps > 1 ) { - &append( $OUTTREE_PROML, $OUTTREES_ALL ); - $all_count++; - } -} - - -if ( $use_protpars == 1 ) { - my $input = ""; - if ( $bootstraps > 1 ) { - $input = $SEQBOOT_OUTFILE; - } - else { - $input = "align"; - } - print( "\n========== PHYLIP PROTPARS begin =========\n\n" ); - &executeProtpars( $input, $bootstraps, $jumbles, $seed ); - print( "\n========== PHYLIP PROTPARS end =========\n\n" ); - &mv( "outtree", $OUTTREE_PROTPARS ); - &rm( $outfile."_protpars_outfile" ); - &mv( "outfile", $outfile."_protpars_outfile" ); - if ( $bootstraps > 1 ) { - &append( $OUTTREE_PROTPARS, $OUTTREES_ALL ); - $all_count++; - } -} - - - -# Methods based on PWD -# -------------------- -my $OUTTREE_FASTME = "outtree_fastme"; -my $OUTTREE_PHYLIP_NJ = "outtree_phylip_nj"; -my $OUTTREE_PHYLIP_FM = "outtree_phylip_fm"; -my $OUTTREE_PHYLIP_ME = "outtree_phylip_me"; -my $OUTTREE_BIONJ = "outtree_bionj"; -my $OUTTREE_WEIGHBOR = "outtree_weighbor"; - -my $CONSENSUS_FASTME = "consensus_fastme"; -my $CONSENSUS_PHYLIP_NJ = "consensus_phylip_nj"; -my $CONSENSUS_PHYLIP_FM = "consensus_phylip_fm"; -my $CONSENSUS_PHYLIP_ME = "consensus_phylip_me"; -my $CONSENSUS_BIONJ = "consensus_bionj"; -my $CONSENSUS_WEIGHBOR = "consensus_weighbor"; -my $CONSENSUS_ALL = "consensus_all"; - - -if ( $use_fastme == 1 ) { - print( "\n========== FASTME begin =========\n\n" ); - &executeFastme( $pwdfile, $bootstraps, $fastme_init_tree_opt ); - print( "\n========== FASTME end ===========\n\n" ); - &rm( "output.d" ); - &mv( "output.tre", $OUTTREE_FASTME ); - if ( $bootstraps > 1 ) { - &append( $OUTTREE_FASTME, $OUTTREES_ALL ); - $all_count++; - } -} -if ( $use_phylip_nj ) { - print( "\n========== PHYLIP NEIGHBOR begin =========\n\n" ); - &executeNeighbor( $pwdfile, $bootstraps, $seed, 0 ); - print( "\n========== PHYLIP NEIGHBOR end =========\n\n" ); - &mv( "outtree", $OUTTREE_PHYLIP_NJ ); - &rm( "outfile" ); - if ( $bootstraps > 1 ) { - &append( $OUTTREE_PHYLIP_NJ, $OUTTREES_ALL ); - $all_count++; - } -} -if ( $use_phylip_fitch_fm ) { - print( "\n========== PHYLIP FITCH FM begin =========\n\n" ); - &executeFitch( $pwdfile, $bootstraps, $seed, $jumbles, 0, "FM", $use_global_rearr ); - print( "\n========== PHYLIP FITCH FM end =========\n\n" ); - &mv( "outtree", $OUTTREE_PHYLIP_FM ); - &rm( "outfile" ); - if ( $bootstraps > 1 ) { - &append( $OUTTREE_PHYLIP_FM, $OUTTREES_ALL ); - $all_count++; - } -} -if ( $use_phylip_fitch_me ) { - print( "\n========== PHYLIP FITCH ME begin =========\n\n" ); - &executeFitch( $pwdfile, $bootstraps, $seed, $jumbles, 0, "ME", $use_global_rearr ); - print( "\n========== PHYLIP FITCH ME end =========\n\n" ); - &mv( "outtree", $OUTTREE_PHYLIP_ME ); - &rm( "outfile" ); - if ( $bootstraps > 1 ) { - &append( $OUTTREE_PHYLIP_ME, $OUTTREES_ALL ); - $all_count++; - } -} -if ( $use_bionj ) { - print( "\n========== BIONJ begin =========\n\n" ); - &executeBionj( $pwdfile, $OUTTREE_BIONJ ); - print( "\n========== BIONJ end =========\n\n" ); - if ( $bootstraps > 1 ) { - &append( $OUTTREE_BIONJ, $OUTTREES_ALL ); - $all_count++; - } -} -if ( $use_weighbor ) { - print( "\n========== WEIGHBOR begin =========\n\n" ); - &executeWeighbor( $number_of_aa, 14, $pwdfile, $OUTTREE_WEIGHBOR ); - print( "\n========== WEIGHBOR end =========\n\n" ); - if ( $bootstraps > 1 ) { - &append( $OUTTREE_WEIGHBOR, $OUTTREES_ALL ); - $all_count++; - } -} - - - -if ( $bootstraps > 1 ) { - # Consense: - if ( $use_fastme == 1 ) { - &consense( $OUTTREE_FASTME, $CONSENSUS_FASTME ); - } - if ( $use_phylip_nj == 1 ) { - &consense( $OUTTREE_PHYLIP_NJ, $CONSENSUS_PHYLIP_NJ ); - } - if ( $use_phylip_fitch_fm == 1 ) { - &consense( $OUTTREE_PHYLIP_FM, $CONSENSUS_PHYLIP_FM ); - } - if ( $use_phylip_fitch_me == 1 ) { - &consense( $OUTTREE_PHYLIP_ME, $CONSENSUS_PHYLIP_ME ); - } - if ( $use_bionj == 1 ) { - &consense( $OUTTREE_BIONJ, $CONSENSUS_BIONJ ); - } - if ( $use_weighbor == 1 ) { - &consense( $OUTTREE_WEIGHBOR, $CONSENSUS_WEIGHBOR ); - } - if ( $use_phyml == 1 ) { - &consense( $OUTTREE_PHYML, $CONSENSUS_PHYML ); - } - if ( $use_proml == 1 ) { - &consense( $OUTTREE_PROML, $CONSENSUS_PROML ); - } - if ( $use_protpars == 1 ) { - &consense( $OUTTREE_PROTPARS, $CONSENSUS_PROTPARS ); - } - if ( $all_count > 1 ) { - &consense( $OUTTREES_ALL, $CONSENSUS_ALL ); - } - else { - &rm( $OUTTREES_ALL ); - } - - my $INTREE_FOR_PUZZLE = "intree"; #why so serious? - &rm( $INTREE_FOR_PUZZLE ); - system( "touch", $INTREE_FOR_PUZZLE ) - && die("\n\n$0: could not \"touch $INTREE_FOR_PUZZLE\": $!\n\n"); - - if ( $use_fastme == 1 ) { - &append( $CONSENSUS_FASTME, $INTREE_FOR_PUZZLE ); - } - if ( $use_phylip_nj == 1 ) { - &append( $CONSENSUS_PHYLIP_NJ, $INTREE_FOR_PUZZLE ); - } - if ( $use_phylip_fitch_fm == 1 ) { - &append( $CONSENSUS_PHYLIP_FM, $INTREE_FOR_PUZZLE ); - } - if ( $use_phylip_fitch_me == 1 ) { - &append( $CONSENSUS_PHYLIP_ME, $INTREE_FOR_PUZZLE ); - } - if ( $use_bionj == 1 ) { - &append( $CONSENSUS_BIONJ, $INTREE_FOR_PUZZLE ); - } - if ( $use_weighbor == 1 ) { - &append( $CONSENSUS_WEIGHBOR, $INTREE_FOR_PUZZLE ); - } - if ( $use_raxml == 1 ) { - # Needed, because TREE-PUZZLE adds internal labels for all subsequent trees - # when evaluating given trees (this seems a strange behaviour). - removeSupportValues( $CONSENSUS_RAXML, $CONSENSUS_RAXML."_support_removed" ); - &append( $CONSENSUS_RAXML."_support_removed", $INTREE_FOR_PUZZLE ); - &rm( $CONSENSUS_RAXML."_support_removed" ); - } - if ( $use_phyml == 1 ) { - &append( $CONSENSUS_PHYML, $INTREE_FOR_PUZZLE ); - } - if ( $use_proml == 1 ) { - &append( $CONSENSUS_PROML, $INTREE_FOR_PUZZLE ); - } - if ( $use_protpars == 1 ) { - &append( $CONSENSUS_PROTPARS, $INTREE_FOR_PUZZLE ); - } - if ( $all_count > 1 ) { - &append( $CONSENSUS_ALL, $INTREE_FOR_PUZZLE ); - } - - - # Puzzle for ML branch lenghts: - # The alignment is read from infile by default. - # The tree is read from intree by default. - &rm( "infile" ); - &mv( "align", "infile" ); # align = original alignment in phylip interleaved. - - &executePuzzleToCalculateBranchLenghts( $matrix, - $exact_parameter_est, - $rate_heterogeneity ); - - my $OUTTREE_PUZZLE = "outtree_puzzle"; - - &rm( $outfile."_puzzle_outfile" ); - - &mv( "outfile", $outfile."_puzzle_outfile" ); - &mv( "outtree", $OUTTREE_PUZZLE ); - &rm( "outdist" ); - &rm( "intree" ); - - - # Transfer - # -------- - my $counter = 0; - if ( $use_fastme == 1 ) { - &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_FASTME, $fastme_outtree, $counter++ ); - &rm( $CONSENSUS_FASTME ); - } - if ( $use_phylip_nj == 1 ) { - &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_PHYLIP_NJ, $phylip_nj_outtree, $counter++ ); - &rm( $CONSENSUS_PHYLIP_NJ ); - } - if ( $use_phylip_fitch_fm == 1 ) { - &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_PHYLIP_FM, $phylip_fm_outtree, $counter++ ); - &rm( $CONSENSUS_PHYLIP_FM ); - } - if ( $use_phylip_fitch_me == 1 ) { - &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_PHYLIP_ME, $phylip_me_outtree, $counter++ ); - &rm( $CONSENSUS_PHYLIP_ME ); - } - if ( $use_bionj == 1 ) { - &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_BIONJ, $bionj_outtree, $counter++ ); - &rm( $CONSENSUS_BIONJ ); - } - if ( $use_weighbor == 1 ) { - &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_WEIGHBOR, $weighbor_outtree, $counter++ ); - &rm( $CONSENSUS_WEIGHBOR ); - } - if ( $use_raxml == 1 ) { - &to_phyloxml( $CONSENSUS_RAXML, $raxml_outtree, 1, 1 ); - $counter++; - } - if ( $use_phyml == 1 ) { - &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_PHYML, $phyml_outtree, $counter++ ); - &rm( $CONSENSUS_PHYML ); - } - if ( $use_proml == 1 ) { - &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_PROML, $proml_outtree, $counter++ ); - &rm( $CONSENSUS_PROML ); - } - if ( $use_protpars == 1 ) { - &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_PROTPARS, $protpars_outtree, $counter++ ); - &rm( $CONSENSUS_PROTPARS ); - } - if ( $all_count > 1 ) { - &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_ALL, $all_outtree, $counter++ ); - &rm( $CONSENSUS_ALL ); - } - - # Clean up - # -------- - &rm( $OUTTREE_PUZZLE ); - &rm( $SEQBOOT_OUTFILE ); - if ( $keep_multiple_trees == 1 ) { - if ( $use_fastme == 1 ) { - &mv( $OUTTREE_FASTME, $multitreefile_fastme ); - } - if ( $use_phylip_nj == 1 ) { - &mv( $OUTTREE_PHYLIP_NJ, $multitreefile_phylip_nj ); - } - if ( $use_phylip_fitch_fm == 1 ) { - &mv( $OUTTREE_PHYLIP_FM, $multitreefile_phylip_fm ); - } - if ( $use_phylip_fitch_me == 1 ) { - &mv( $OUTTREE_PHYLIP_ME, $multitreefile_phylip_me ); - } - if ( $use_bionj == 1 ) { - &mv( $OUTTREE_BIONJ, $multitreefile_bionj ); - } - if ( $use_weighbor == 1 ) { - &mv( $OUTTREE_WEIGHBOR, $multitreefile_weighbor ); - } - if ( $use_phyml == 1 ) { - &mv( $OUTTREE_PHYML, $multitreefile_phyml ); - } - if ( $use_proml == 1 ) { - &mv( $OUTTREE_PROML, $multitreefile_proml ); - } - if ( $use_protpars == 1 ) { - &mv( $OUTTREE_PROTPARS, $multitreefile_protpars ); - } - &mv( $pwdfile, $multipwdfile ); - } - else { - if ( $use_fastme == 1 ) { - &rm( $OUTTREE_FASTME ); - } - if ( $use_phylip_nj == 1 ) { - &rm( $OUTTREE_PHYLIP_NJ ); - } - if ( $use_phylip_fitch_fm == 1 ) { - &rm( $OUTTREE_PHYLIP_FM ); - } - if ( $use_phylip_fitch_me == 1 ) { - &rm( $OUTTREE_PHYLIP_ME ); - } - if ( $use_bionj == 1 ) { - &rm( $OUTTREE_BIONJ ); - } - if ( $use_weighbor == 1 ) { - &rm( $OUTTREE_WEIGHBOR ); - } - if ( $use_phyml == 1 ) { - &rm( $OUTTREE_PHYML ); - } - if ( $use_proml == 1 ) { - &rm( $OUTTREE_PROML ); - } - if ( $use_protpars == 1 ) { - &rm( $OUTTREE_PROTPARS ); - } - &rm( $pwdfile ); - } - if ( $all_count > 1 ) { - &rm( $OUTTREES_ALL ); - } -} # if ( $bootstraps > 1 ) -else { - &rm( "infile.dist" ); - - &rm( "infile.puzzle" ); - if ( $use_fastme == 1 ) { - &to_phyloxml( $OUTTREE_FASTME, $fastme_outtree, 0, 1 ); - } - if ( $use_phylip_nj == 1 ) { - &to_phyloxml( $OUTTREE_PHYLIP_NJ, $phylip_nj_outtree, 0, 1); - } - if ( $use_phylip_fitch_fm == 1 ) { - &to_phyloxml( $OUTTREE_PHYLIP_FM, $phylip_fm_outtree, 0, 1 ); - } - if ( $use_phylip_fitch_me == 1 ) { - &to_phyloxml( $OUTTREE_PHYLIP_ME, $phylip_me_outtree, 0, 1 ); - } - if ( $use_bionj == 1 ) { - &to_phyloxml( $OUTTREE_BIONJ, $bionj_outtree, 0, 1 ); - } - if ( $use_weighbor == 1 ) { - &to_phyloxml( $OUTTREE_WEIGHBOR, $weighbor_outtree, 0, 1 ); - } - if ( $use_raxml == 1 ) { - &to_phyloxml( $OUTTREE_RAXML, $raxml_outtree, 0, 1 ); - } - if ( $use_phyml == 1 ) { - &to_phyloxml( $OUTTREE_PHYML, $phyml_outtree, 0, 1 ); - } - if ( $use_proml == 1 ) { - &to_phyloxml( $OUTTREE_PROML, $proml_outtree, 0, 1 ); - } - if ( $use_protpars == 1 ) { - &to_phyloxml( $OUTTREE_PROTPARS, $protpars_outtree, 0, 1 ); - } -} # if ( $bootstraps > 1 ) - -&rm( $infile ); -&rm( "infile" ); -&rm( "align" ); -&rm( "align.reduced" ); - - -$log = $log."Finish time/date : ".`date`; - -if ( $bootstraps > 1 ) { - $log = $log."Puzzle output file : ".$outfile."_puzzle_outfile\n"; -} -$log = $log."Columns in alignment : $number_of_aa\n"; -$log = $log."Number of sequences in alignment : $number_of_seqs\n"; -if ( $all_count > 1 ) { - $log = $log."Combined consensus : $all_outtree\n"; -} - - -if ( $bootstraps > 1 ) { - $log = $log."\n\n"; - $log = $log."Simple support value statistics (trees are numbered the same as for TREE PUZZLE output)\n"; - $log = $log."------------------------------- \n"; - $log = $log."\n"; -} - -open( OUT, ">$logfile" ) || die "\n$0: Cannot create file <<$logfile>>: $!\n"; -print OUT $log; -close( OUT ); - -if ( $bootstraps > 1 ) { - # Simple support statistics - # ------------------------- - my $SS_OUT = $temp_dir."/ss_out"; - my @phylos = (); - my $ounter = 0; - if ( $use_fastme == 1 ) { - $phylos[ $ounter++ ] = $fastme_outtree; - } - if ( $use_phylip_nj == 1 ) { - $phylos[ $ounter++ ] = $phylip_nj_outtree; - } - if ( $use_phylip_fitch_fm == 1 ) { - $phylos[ $ounter++ ] = $phylip_fm_outtree; - } - if ( $use_phylip_fitch_me == 1 ) { - $phylos[ $ounter++ ] = $phylip_me_outtree; - } - if ( $use_bionj == 1 ) { - $phylos[ $ounter++ ] = $bionj_outtree; - } - if ( $use_weighbor == 1 ) { - $phylos[ $ounter++ ] = $weighbor_outtree; - } - if ( $use_raxml == 1 ) { - $phylos[ $ounter++ ] = $raxml_outtree; - } - if ( $use_phyml == 1 ) { - $phylos[ $ounter++ ] = $phyml_outtree; - } - if ( $use_proml == 1 ) { - $phylos[ $ounter++ ] = $proml_outtree; - } - if ( $use_protpars == 1 ) { - $phylos[ $ounter++ ] = $protpars_outtree; - } - if ( $all_count > 1) { - $phylos[ $ounter++ ] = $all_outtree; - } - &executeSupportStatistics( $SS_OUT, @phylos ); - &append( $SS_OUT, $logfile ); - &rm( $SS_OUT ); - - # Append parts of puzzle output file - # ---------------------------------- - if ( $all_count > 1 ) { - &parsePuzzleOutfile( $outfile."_puzzle_outfile", $logfile ); - } -} - -chdir( $current_dir ) -|| die "\n\n$0: Could not chdir to <<$current_dir>>: $!\n\n"; - -rmdir( $temp_dir ) -|| print "\n\n$0: Warning: Could not remove <<$temp_dir>>: $!\n\n"; - -print "\n\n\n$0 successfully comleted.\n\n"; - -exit( 0 ); - - - -# Methods: -# -------- - - -# Six arguments: -# 1. DNA or Amino-Acids sequence filename (PHYLIP format) -# 2. Model, eg. PROTGAMMAIVT -# 3. Replicates (bootstrap) -# 4. Seed for bootstrap -# 5. Output suffix -# 6. Algorithm (only for bootstrap, default otherwise) -# NOTE. RaxML does its own bootstrapping. -sub executeRaxml { - my $msa = $_[ 0 ]; - my $model = $_[ 1 ]; - my $replicates = $_[ 2 ]; - my $seed = $_[ 3 ]; - my $outfile_suffix = $_[ 4 ]; - my $algo = $_[ 5 ]; - - &testForTextFilePresence( $msa ); - my $command = "$RAXML -m $model -s $msa -n $outfile_suffix"; - - if ( $replicates > 1 ) { - $command = $command . " -x $seed -N $replicates"; - if ( $algo ) { - $command = $command . " -f $algo"; - } - } - - print( "\n$command\n"); - - system( $command ) - && &dieWithUnexpectedError( $command ); - -} - - -sub to_phyloxml { - my $from = $_[ 0 ]; - my $to = $_[ 1 ]; - my $internal_names_are_boots = $_[ 2 ]; - my $extract_taxonomy = $_[ 3 ]; - &dieIfFileExists( $to ); - &dieIfFileNotExists( $from ); - my $command = "$NEWICK_TO_PHYLOXML -f=nn $from $to"; - if ( $internal_names_are_boots == 1 ) { - $command = $command . " -i"; - } - if ( $extract_taxonomy == 1 ) { - $command = $command . " -xt"; - } - system( $command ) - && die "$0: Could not execute \"$command \""; - &rm( $from ); -} - - -sub mv { - my $from = $_[ 0 ]; - my $to = $_[ 1 ]; - &dieIfFileExists( $to ); - &dieIfFileNotExists( $from ); - system( "mv", $from, $to ) - && die "\n\n$0: could not move \"$from\" to \"$to\": $!\n\n"; -} - -sub cp { - my $from = $_[ 0 ]; - my $to = $_[ 1 ]; - &dieIfFileExists( $to ); - &dieIfFileNotExists( $from ); - - system( "cp", $from, $to ) - && die "\n\n$0: could not copy \"$from\" to \"$to\": $!\n\n"; -} - -sub rm { - my $f = $_[ 0 ]; - unlink( $f ); -} - -sub consense { - my $multi_in = $_[ 0 ]; - my $consense_out = $_[ 1 ]; - &executeConsense( $multi_in ); - &mv( "outtree", $consense_out ); - &rm( "outfile" ); - -} - - - -# 1. file to be appended -# 2. file to append to -sub append { - my $to_be_appended = $_[ 0 ]; - my $append_to = $_[ 1 ]; - &dieIfFileNotExists( $to_be_appended ); - system( "cat $to_be_appended >> $append_to" ) - && die "\n\n$0: could not execute \"cat $to_be_appended >> $append_to\": $!\n\n"; - -} - -sub dieIfFileExists { - my $file = $_[ 0 ]; - if ( -e $file ) { - die "\n\n$0: \"$file\" already exists\n\n"; - } -} - -sub dieIfFileNotExists { - my $file = $_[ 0 ]; - unless ( ( -s $file ) && ( -f $file ) ) { - die( "\n\n$0: \"$file\" does not exist or is empty" ); - } -} - - - - -# Two arguments: -# 1. seed for random number generator -# 2. number of bootstraps -# Reads in "infile" by default. -sub executeSeqboot { - - my $s = $_[ 0 ]; - my $bs = $_[ 1 ]; - my $verb = ""; - - &testForTextFilePresence( $infile ); - - - $verb = " -2"; - - system( "$SEQBOOT << ! -r -$bs$verb -Y -$s -!" ) - && die "$0: Could not execute \"$SEQBOOT\""; - -} - - - -# One/two/three argument(s): -# Reads in tree from "intree" by default. (Presence of "intree" automatically -# switches into "User defined trees" mode.) -# 1. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; -# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise -# 2. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise -# 3. Model of rate heterogeneity: -# 1 for "8 Gamma distributed rates" -# 2 for "Two rates (1 invariable + 1 variable)" -# 3 for "Mixed (1 invariable + 8 Gamma rates)" -# otherwise: Uniform rate -# Last modified: 09/08/03 (added 2nd and 3rd parameter) -sub executePuzzleToCalculateBranchLenghts { - my $matrix_option = $_[ 0 ]; - my $parameter_estimates_option = $_[ 1 ]; - my $rate_heterogeneity_option = $_[ 2 ]; - my $i = 0; - my $mat = ""; - my $est = ""; - my $rate = ""; - - unless ( ( -s "infile" ) && ( -f "infile" ) && ( -T "infile" ) ) { - die "\n$0: executePuzzleToCalculateBranchLenghts: <> does not exist, is empty, or is not a plain textfile.\n"; - } - unless ( ( -s "intree" ) && ( -f "intree" ) && ( -T "intree" ) ) { - die "\n$0: executePuzzleToCalculateBranchLenghts: <> does not exist, is empty, or is not a plain textfile.\n"; - } - - $mat = setModelForPuzzle( $matrix_option ); - if ( $parameter_estimates_option ) { - $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option ); - } - if ( $rate_heterogeneity_option ) { - $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option ); - } - - - system( "$PUZZLE << ! -$mat$est$rate -x -y -!" ) - && die "$0: Could not execute \"$PUZZLE\" (mat=$mat est=$est rate=$rate)"; - -} - -# Two arguments: -# 1. puzzle outfile -# 2. file to append to -sub parsePuzzleOutfile { - my $puzzle_outfile = $_[ 0 ]; - my $file_to_append_to = $_[ 1 ]; - &testForTextFilePresence( $puzzle_outfile ); - open( OUT, ">>$file_to_append_to" ) || &dieWithUnexpectedError( "Cannot open \"$file_to_append_to\"" ); - open( IN, "$puzzle_outfile" ) || &dieWithUnexpectedError( "Cannot open file \"$puzzle_outfile\"" ); - my $return_line; - my $read = 0; - print OUT "\nTREE PUZZLE output\n"; - print OUT "------------------\n"; - while ( $return_line = ) { - if ( $return_line =~/COMPARISON OF USER TREES/ ) { - $read = 1; - } - elsif( $return_line =~/TIME STAMP/ ) { - $read = 0; - } - elsif( $read ) { - print OUT $return_line; - } - } - close( IN ); - close( OUT ); -} - -# Three/four arguments: -# 1. Name of file containing tree with correct branch lengths -# 2. Name of file containing tree with correct bootstraps -# 3. Outputfilename -# 4. Index of tree with correct branch lengths, in case more than one in file -# Last modified: 2007.11.27 -sub executeSupportTransfer { - my $tree_with_bl = $_[ 0 ]; - my $tree_with_bs = $_[ 1 ]; - my $out = $_[ 2 ]; - my $index = $_[ 3 ]; - - &testForTextFilePresence( $tree_with_bl ); - &testForTextFilePresence( $tree_with_bs ); - my $command = "$SUPPORT_TRANSFER $tree_with_bl $tree_with_bs $out $index"; - system( $command ) - && die "$0: Could not execute \"$command\""; -} - - -# Two or more arguments: -# 1. outfile -# 2. phylogeny 1 with support values -# 3. phylogeny 2 with support values -# 4. ... -sub executeSupportStatistics { - my $outfile = $_[ 0 ]; - &dieIfFileExists( $outfile ); - my $phylos = ""; - for( my $i = 1; $i < scalar(@_); ++$i ) { - &testForTextFilePresence( $_[ $i ] ); - $phylos .= $_[ $i ]." "; - } - my $command = "$SUPPORT_STATISTICS -o=$outfile $phylos"; - system( "$command" ) - && die "$0: Could not execute \"$command\""; -} - - -sub getNumberOfSeqsAndAas { - my $infile = $_[ 0 ]; - my $seqs = 0; - my $aa = 0; - open( IN, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n"; - while( ) { - if ( $_ =~ /^\s*(\d+)\s+(\d+)\s*$/ ) { - $seqs = $1; - $aa = $2; - } - } - close( IN ); - - if ( $seqs == 0 || $aa == 0 ) { - die( "\n$0: Could not get number of seqs and aa from: $infile" ); - } - return $seqs, $aa; -} - - - -sub removeSupportValues { - my $infile = $_[ 0 ]; - my $outfile = $_[ 1 ]; - &testForTextFilePresence( $infile ); - open( OUT, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" ); - open( IN, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" ); - while ( my $line = ) { - $line =~ s/\)\d+\.?\d*:/\):/g; - print OUT "$line"; - } - close( OUT ); - close( IN ); -} - - - - -# Six arguments: -# 1. name of alignment file (in correct format!) -# 2. number of bootstraps -# 3. jumbles: 0: do not jumble; >=1 number of jumbles -# 4. seed for random number generator -# 5. 1 for PAM instead of JTT -# 6. 1 to use globale rearragements -sub executeProml { - my $align = $_[ 0 ]; - my $bs = $_[ 1 ]; - my $rand = $_[ 2 ]; - my $s = $_[ 3 ]; - my $use_pam = $_[ 4 ]; - my $use_global_rearr = $_[ 5 ]; - my $jumble = ""; - my $multi = ""; - my $pam = ""; - - &testForTextFilePresence( $align ); - - if ( $bs > 1 && $rand < 1 ) { - $rand = 1; - } - - if ( $rand >= 1 ) { - $jumble = " -J -$s -$rand"; - } - - if ( $bs > 1 ) { - $multi = " -M -D -$bs"; - } - - - if ( $use_pam == 1 ) { - $pam = " -P -P"; - } - - my $global = ""; - if ( $use_global_rearr == 1 ) { - $global = " -G"; - } - - system( "$PROML 2>&1 << ! -$align$jumble$multi$pam$global -3 -Y -!" ) - && &dieWithUnexpectedError( "Could not execute \"$PROML $align$jumble$multi$pam$global\"" ); - # 3: Do NOT print out tree - - return; - -} ## executeProml - - -sub printUsage { - - print < - [path/name for temporary directory to be created] - - Example: - "% phylo_pl.pl -B100q\@1nbS9X IL5.aln IL5_tree" - - Options - ------- - - Bx : Number of bootstraps. B0: do not bootstrap. Default is 100 bootstrapps. - The number of bootstrapps should be divisible by 10. - J : Use JTT matrix (Jones et al. 1992) in TREE-PUZZLE and/or PHYML, RAXML, default: VT (Mueller-Vingron 2000). - L : Use BLOSUM 62 matrix (Henikoff-Henikoff 92) in TREE-PUZZLE and/or PHYML, RAXML, default: VT. - M : Use mtREV24 matrix (Adachi-Hasegawa 1996) in TREE-PUZZLE and/or PHYML, default: VT. - W : Use WAG matrix (Whelan-Goldman 2000) in TREE-PUZZLE and/or PHYML, RAXML, default: VT. - P : Use PAM matrix (Dayhoff et al. 1978) in TREE-PUZZLE and/or PHYML, RAXML, default: VT. - D : Use DCMut matrix (Kosial and Goldman, 2005) in PHYML, RAXML, VT in TREE-PUZZLE. - A : Let TREE-PUZZLE choose which matrix to use, default: VT - E : Exact parameter estimates in TREE-PUZZLE, default: Approximate. - Model of rate heterogeneity in TREE-PUZZLE (default: Uniform rate): - g : 8 Gamma distributed rates - t : Two rates (1 invariable + 1 variable) - m : Mixed (1 invariable + 8 Gamma rates) - q\@x: Use FastME, x: 1: GME - 2: BME - 3: NJ - n : Use PHYLIP Neighbor (NJ). - f : Use PHYLIP Fitch. - e : Use PHYLIP Minimal Evolution. - b : Use BIONJ. - w : Use Weighbor. - x : Use RAxML. - y : Use PHYML. - o : Use PHYLIP proml. - p : Use PHYLIP protpars. - rx : Number of relative substitution rate categories in PHYML (default is 4). - jx : Number of jumbles (input order randomization) for PHYLIP FM, ME, PROTPARS, and PROML (default is 2) (random seed set with Sx). - I : Estimate proportion of invariable sites in RAXML and/or PHYML (otherwise, proportion "0.0" is used in PHYML) - G : to turn on global rearrangements in PHYLIP FM, ME, and PROML - Sx : Seed for random number generator(s). Must be 4n+1. Default is 9. - X : To keep multiple tree file (=trees from bootstrap resampled alignments) and - pairwise distance matrix file (in case of bootstrap analysis). - -END - -} ## printUsage