--- /dev/null
+# $Id: forester.pm,v 1.26 2010/12/13 19:00:22 cmzmasek Exp $
+#
+# FORESTER -- software libraries and applications
+# for evolutionary biology research and applications.
+#
+# Copyright (C) 2007-2009 Christian M. Zmasek
+# Copyright (C) 2007-2009 Burnham Institute for Medical Research
+# All rights reserved
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+#
+# Contact: phylosoft @ gmail . com
+# WWW: www.phylosoft.org/forester
+#
+#
+#
+
+
+package forester;
+use strict;
+require Exporter;
+
+our $VERSION = 1.000;
+
+our @ISA = qw( Exporter );
+
+our @EXPORT = qw( executeConsense
+ executePhyloPl
+ executePuzzleDQO
+ executePuzzleDQObootstrapped
+ pfam2phylipMatchOnly
+ startsWithSWISS_PROTname
+ isPfamSequenceLine
+ isPfamCommentLine
+ containsPfamNamedSequence
+ isRFline
+ executeProtpars
+ setModelForPuzzle
+ setRateHeterogeneityOptionForPuzzle
+ setParameterEstimatesOptionForPuzzle
+ executePuzzleBootstrapped
+ executePuzzle
+ executeFastme
+ executeNeighbor
+ executeFitch
+ executeBionj
+ executeWeighbor
+ executePhyml
+ executeHmmfetch
+ addDistsToQueryToPWDfile
+ testForTextFilePresence
+ exitWithWarning
+ dieWithUnexpectedError
+ addSlashAtEndIfNotPresent
+ $LENGTH_OF_NAME
+ $MIN_NUMBER_OF_AA
+ $TREMBL_ACDEOS_FILE
+ $SWISSPROT_ACDEOS_FILE
+ $SPECIES_NAMES_FILE
+ $SPECIES_TREE_FILE_DEFAULT
+ $MULTIPLE_TREES_FILE_SUFFIX
+ $LOG_FILE_SUFFIX
+ $ALIGN_FILE_SUFFIX
+ $TREE_FILE_SUFFIX
+ $ADDITION_FOR_RIO_ANNOT_TREE
+ $SUFFIX_PWD
+ $SUFFIX_BOOT_STRP_POS
+ $MULTIPLE_PWD_FILE_SUFFIX
+ $SUFFIX_PWD_NOT_BOOTS
+ $SUFFIX_HMM
+ $MATRIX_FOR_PWD
+ $RIO_PWD_DIRECTORY
+ $RIO_BSP_DIRECTORY
+ $RIO_NBD_DIRECTORY
+ $RIO_ALN_DIRECTORY
+ $RIO_HMM_DIRECTORY
+ $PFAM_FULL_DIRECTORY
+ $PFAM_SEED_DIRECTORY
+ $PRIOR_FILE_DIR
+ $PFAM_HMM_DB
+ $FORESTER_JAR
+ $SEQBOOT
+ $NEIGHBOR
+ $PROTPARS
+ $CONSENSE
+ $PROML
+ $PHYLIP_VERSION
+ $PUZZLE
+ $PUZZLE_VERSION
+ $FASTME
+ $FASTME_VERSION
+ $BIONJ
+ $BIONJ_VERSION
+ $WEIGHBOR
+ $WEIGHBOR_VERSION
+ $RAXML
+ $RAXML_VERSION
+ $PHYML
+ $PHYML_VERSION
+ $HMMALIGN
+ $HMMSEARCH
+ $HMMBUILD
+ $HMMFETCH
+ $SFE
+ $HMMCALIBRATE
+ $P7EXTRACT
+ $MULTIFETCH
+ $BOOTSTRAP_CZ
+ $BOOTSTRAP_CZ_PL
+ $SUPPORT_TRANSFER
+ $SUPPORT_STATISTICS
+ $NEWICK_TO_PHYLOXML
+ $PHYLO_PL
+ $RIO_PL
+ $DORIO
+ $PUZZLE_DQO
+ $BOOTSTRAPS
+ $PATH_TO_FORESTER
+ $JAVA
+ $NODE_LIST
+ $RIO_SLAVE_DRIVER
+ $RIO_SLAVE
+ $TEMP_DIR_DEFAULT
+ $EXPASY_SPROT_SEARCH_DE
+ $EXPASY_SPROT_SEARCH_AC
+ );
+
+
+
+
+# =============================================================================
+# =============================================================================
+#
+# THESE VARIABLES ARE ENVIRONMENT DEPENDENT, AND NEED TO BE SET ACCORDINGLY
+# BY THE USER
+# -------------------------------------------------------------------------
+#
+
+# For using just "phylo_pl.pl", only the following variables need to be set
+# $JAVA
+# $FORESTER_JAR
+# $TEMP_DIR_DEFAULT
+# $SEQBOOT
+# $CONSENSE
+# $PUZZLE
+# $FASTME
+# $NEIGHBOR
+# $FITCH
+# $BIONJ
+# $WEIGHBOR
+# $PHYML
+# $PROTPARS
+
+# Software directory:
+# ---------------------
+
+our $SOFTWARE_DIR = "/home/czmasek/SOFTWARE/";
+
+
+# Java virtual machine:
+# ---------------------
+our $JAVA = $SOFTWARE_DIR."JAVA/jdk1.6.0_03/bin/java";
+
+
+# Where all the temporary files can be created:
+# ---------------------------------------------
+our $TEMP_DIR_DEFAULT = "/tmp/";
+
+
+# Programs from Joe Felsenstein's PHYLIP package:
+# -----------------------------------------------
+our $SEQBOOT = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/seqboot";
+our $NEIGHBOR = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/neighbor";
+our $PROTPARS = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/protpars";
+our $PROML = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/proml";
+our $FITCH = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/fitch";
+our $CONSENSE = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/consense";
+our $PHYLIP_VERSION = "3.68";
+
+# TREE-PUZZLE:
+# ------------
+our $PUZZLE = $SOFTWARE_DIR."TREE_PUZZLE/tree-puzzle-5.2/src/puzzle";
+our $PUZZLE_VERSION = "5.2";
+
+# FASTME:
+# -----------------------------------------------------
+our $FASTME = $SOFTWARE_DIR."FASTME/fastme2.0/fastme";
+our $FASTME_VERSION = "2.0";
+
+# BIONJ:
+# -----------------------------------------------------
+our $BIONJ = $SOFTWARE_DIR."BIONJ/bionj";
+our $BIONJ_VERSION = "[1997]";
+
+# WEIGHBOR:
+# -----------------------------------------------------
+our $WEIGHBOR = $SOFTWARE_DIR."WEIGHBOR/Weighbor/weighbor";
+our $WEIGHBOR_VERSION = "1.2.1";
+
+# PHYML:
+# -----------------------------------------------------
+our $PHYML = $SOFTWARE_DIR."PHYML/phyml_v2.4.4/exe/phyml_linux";
+our $PHYML_VERSION = "2.4.4";
+
+# RAXML:
+# -----------------------------------------------------
+our $RAXML = $SOFTWARE_DIR."RAXML/RAxML-7.0.4/raxmlHPC";
+our $RAXML_VERSION = "7.0.4";
+
+
+# forester.jar. This jar file is currently available at: http://www.phylosoft.org
+# -------------------------------------------------------------------------------
+
+our $FORESTER_JAR = $SOFTWARE_DIR."FORESTER/DEV/forester/forester/java/forester.jar";
+
+
+
+# End of variables which need to be set by the user for using "phylo_pl.pl".
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# Tool from forester.jar to transfer support values:
+# -------------------------------------------------
+our $SUPPORT_TRANSFER = $JAVA." -cp $FORESTER_JAR org.forester.application.support_transfer";
+
+
+
+# Tool from forester.jar for simple statistics for support values:
+# ----------------------------------------------------------------
+our $SUPPORT_STATISTICS = $JAVA." -cp $FORESTER_JAR org.forester.application.support_statistics";
+
+
+# Tool from forester.jar to transfer nh to phyloXML:
+# -------------------------------------------------
+our $NEWICK_TO_PHYLOXML = $JAVA." -cp $FORESTER_JAR org.forester.application.phyloxml_converter";
+
+
+
+# FORESTER itself (currently not needed for "phylo_pl.pl"):
+# ---------------------------------------------------------
+our $PATH_TO_FORESTER = "";
+
+
+# Pfam data (not needed for phylo_pl.pl):
+# --------------------------------------
+our $PFAM_FULL_DIRECTORY = "/path/to/Pfam/Full/";
+our $PFAM_SEED_DIRECTORY = "/path/to/Pfam/Seed/";
+our $PFAM_HMM_DB = "/path/to/Pfam/Pfam_ls"; # Need to run "hmmindex" on this
+ # to produce .ssi file.
+ # Then, for example
+ # "setenv HMMERDB /home/rio/pfam-6.6/"
+
+
+$PATH_TO_FORESTER = &addSlashAtEndIfNotPresent( $PATH_TO_FORESTER );
+
+
+# Description lines and species from SWISS-PROT and TrEMBL (not needed for phylo_pl.pl):
+# -------------------------------------------------------------------------------------
+our $TREMBL_ACDEOS_FILE = $PATH_TO_FORESTER."data/trembl22_ACDEOS_1-6";
+
+our $SWISSPROT_ACDEOS_FILE = $PATH_TO_FORESTER."data/sp40_ACDEOS_1-6";
+
+
+
+# Names of species which can be analyzed and analyzed
+# against (must also be in tree $SPECIES_TREE_FILE_DEFAULT).
+# By using a list with less species, RIO analyses become faster
+# but lose phylogenetic resolution.
+# For many purposes, list "tree_of_life_bin_1-6_species_list"
+# in "data/species/" might be sufficient:
+# (not needed for phylo_pl.pl)
+# --------------------------------------------------------------
+our $SPECIES_NAMES_FILE = $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-6_species_list";
+
+
+
+# A default species tree in NHX format.
+# For many purposes, tree "tree_of_life_bin_1-6.nhx"
+# in "data/species/" might be fine:
+# (not needed for phylo_pl.pl)
+# --------------------------------------------------
+our $SPECIES_TREE_FILE_DEFAULT = $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-6.nhx";
+
+
+
+# Data for using precalculated distances:
+# (not needed for phylo_pl.pl)
+# ---------------------------------------
+our $MATRIX_FOR_PWD = 2; # The matrix which has been used for the pwd in $RIO_PWD_DIRECTORY.
+ # 0=JTT, 1=PAM, 2=BLOSUM 62, 3=mtREV24, 5=VT, 6=WAG.
+
+our $RIO_PWD_DIRECTORY = $PATH_TO_FORESTER."example_data/"; # all must end with "/"
+our $RIO_BSP_DIRECTORY = $PATH_TO_FORESTER."example_data/";
+our $RIO_NBD_DIRECTORY = $PATH_TO_FORESTER."example_data/";
+our $RIO_ALN_DIRECTORY = $PATH_TO_FORESTER."example_data/";
+our $RIO_HMM_DIRECTORY = $PATH_TO_FORESTER."example_data/";
+
+
+
+#
+# End of variables which need to be set by the user.
+#
+# =============================================================================
+# =============================================================================
+
+
+
+
+
+$TEMP_DIR_DEFAULT = &addSlashAtEndIfNotPresent( $TEMP_DIR_DEFAULT );
+$PFAM_FULL_DIRECTORY = &addSlashAtEndIfNotPresent( $PFAM_FULL_DIRECTORY );
+$PFAM_SEED_DIRECTORY = &addSlashAtEndIfNotPresent( $PFAM_SEED_DIRECTORY );
+
+
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# These variables should normally not be changed:
+#
+
+our $PRIOR_FILE_DIR = $PATH_TO_FORESTER."data/priors_for_hmmbuild/";
+ # Directory containing dirichlet prior
+ # files needed for certain aligments
+ # by hmmbuild (e.g. Collagen).
+
+
+
+
+
+# TREE-PUZZLE:
+our $PUZZLE_DQO = $PATH_TO_FORESTER."puzzle_dqo/src/puzzle";
+
+# HMMER:
+our $HMMALIGN = $PATH_TO_FORESTER."hmmer/binaries/hmmalign";
+our $HMMSEARCH = $PATH_TO_FORESTER."hmmer/binaries/hmmsearch";
+our $HMMBUILD = $PATH_TO_FORESTER."hmmer/binaries/hmmbuild";
+our $HMMFETCH = $PATH_TO_FORESTER."hmmer/binaries/hmmfetch";
+our $SFE = $PATH_TO_FORESTER."hmmer/binaries/sfetch";
+our $HMMCALIBRATE = $PATH_TO_FORESTER."hmmer/binaries/hmmcalibrate";
+
+our $P7EXTRACT = $PATH_TO_FORESTER."perl/p7extract.pl";
+our $MULTIFETCH = $PATH_TO_FORESTER."perl/multifetch.pl";
+
+
+# RIO/FORESTER:
+our $BOOTSTRAP_CZ = $PATH_TO_FORESTER."C/bootstrap_cz";
+our $BOOTSTRAP_CZ_PL = $PATH_TO_FORESTER."perl/bootstrap_cz.pl";
+#our $SUPPORT_TRANSFER = $JAVA." -cp $PATH_TO_FORESTER"."java forester.tools.transfersBranchLenghts";
+#our $SUPPORT_TRANSFER = $JAVA." -cp /home/czmasek/SOFTWARE/FORESTER/forester3/forester.jar org.forester.tools.SupportTransfer";
+
+our $PHYLO_PL = $PATH_TO_FORESTER."perl/phylo_pl.pl";
+our $RIO_PL = $PATH_TO_FORESTER."perl/rio.pl";
+our $DORIO = $JAVA." -cp $PATH_TO_FORESTER"."java forester.tools.DoRIO";
+# parallel RIO:
+our $RIO_SLAVE_DRIVER = $PATH_TO_FORESTER."perl/rio_slave_driver.pl";
+our $RIO_SLAVE = $PATH_TO_FORESTER."perl/rio_slave.pl";
+our $NODE_LIST = $PATH_TO_FORESTER."data/node_list.dat";
+
+#
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+our $BOOTSTRAPS = 100;
+our $MIN_NUMBER_OF_AA = 20; # After removal of gaps, if less, gaps are not removed.
+our $LENGTH_OF_NAME = 10;
+
+
+
+
+our $MULTIPLE_TREES_FILE_SUFFIX = ".mlt";
+our $LOG_FILE_SUFFIX = ".log";
+our $ALIGN_FILE_SUFFIX = ".aln";
+our $TREE_FILE_SUFFIX = ".nhx";
+our $ADDITION_FOR_RIO_ANNOT_TREE = ".rio";
+our $SUFFIX_PWD = ".pwd";
+our $MULTIPLE_PWD_FILE_SUFFIX = ".mpwd";
+our $SUFFIX_BOOT_STRP_POS = ".bsp";
+our $SUFFIX_PWD_NOT_BOOTS = ".nbd";
+our $SUFFIX_HMM = ".hmm";
+
+our $EXPASY_SPROT_SEARCH_DE = "http://www.expasy.org/cgi-bin/sprot-search-de?";
+our $EXPASY_SPROT_SEARCH_AC = "http://www.expasy.org/cgi-bin/sprot-search-ac?";
+
+
+
+# One argument: input multiple trees file
+# Last modified: 07/05/01
+sub executeConsense {
+ my $in = $_[ 0 ];
+
+ &testForTextFilePresence( $in );
+
+ system( "$CONSENSE >/dev/null 2>&1 << !
+$in
+Y
+!" )
+ && &dieWithUnexpectedError( "Could not execute \"$CONSENSE $in\"" );
+
+ return;
+}
+
+
+
+# Four arguments:
+# 1. options ("-" is not necessary)
+# 2. alignment or pwd file
+# 3. outfile
+# 4. temp dir
+# Last modified: 07/05/01
+sub executePhyloPl {
+
+ my $opts = $_[ 0 ];
+ my $B = $_[ 1 ];
+ my $C = $_[ 2 ];
+ my $D = $_[ 3 ];
+
+ &testForTextFilePresence( $B );
+
+ $opts = "-".$opts;
+
+ system( "$PHYLO_PL $opts $B $C $D" )
+ && &dieWithUnexpectedError( "Could not execute \"$PHYLO_PL $opts $B $C $D\"" );
+
+} ## executePhyloPl
+
+
+
+
+# Two arguments:
+# 1. Name of inputfile
+# 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
+# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise
+sub executePuzzleDQO {
+ my $in = $_[ 0 ];
+ my $matrix_option = $_[ 1 ];
+ my $mat = "";
+
+ &testForTextFilePresence( $in );
+
+ $mat = setModelForPuzzle( $matrix_option );
+
+ system( "$PUZZLE_DQO $in >/dev/null 2>&1 << !$mat
+y
+!" )
+ && &dieWithUnexpectedError( "Could not execute \"$PUZZLE_DQO\"" );
+
+ return;
+
+} ## executePuzzleDQO
+
+
+
+
+# Two arguments:
+# 1. Name of inputfile
+# 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
+# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise
+# Last modified: 01/28/02
+sub executePuzzleDQObootstrapped {
+ my $in = $_[ 0 ];
+ my $matrix_option = $_[ 1 ];
+
+
+ my $l = 0;
+ my $slen = 0;
+ my $counter = 0;
+ my $mat = "";
+ my $a = "";
+ my @a = ();
+
+ &testForTextFilePresence( $in );
+
+ open( GRP, "<$in" ) || &dieWithUnexpectedError( "Cannot open file \"$in\"" );
+ while( <GRP> ) {
+ if ( $_ =~ /^\s*\d+\s+\d+\s*$/ ) {
+ $counter++;
+ }
+ }
+ close( GRP );
+
+ $l = `cat $in | wc -l`;
+ $slen = $l / $counter;
+
+ system( "split -$slen $in $in.splt." )
+ && &dieWithUnexpectedError( "Could not execute \"split -$slen $in $in.splt.\"" );
+
+ @a = <$in.splt.*>;
+
+ $mat = setModelForPuzzle( $matrix_option );
+
+ foreach $a ( @a ) {
+
+ system( "$PUZZLE_DQO $a >/dev/null 2>&1 << !$mat
+y
+!" )
+ && &dieWithUnexpectedError( "Could not execute \"$PUZZLE_DQO $a\"" );
+
+ system( "cat $a.dist >> $in.dist" )
+ && &dieWithUnexpectedError( "Could not execute \"cat outdist >> $in.dist\"" );
+
+ unlink( $a, $a.".dist" );
+ }
+
+ return;
+
+} ## executePuzzleDQObootstrapped
+
+
+
+# Transfers a Pfam (SELEX) alignment to a
+# PHYLIP sequential style alignment.
+# It only writes "match columns" as indicated by the
+# "# RF" line ('x' means match).
+#
+# Three arguments:
+# 1. infile name
+# 2. outfile name
+# 3. 1 to NOT ensure that match states contain only 'A'-'Z' or '-'
+#
+# Returns the number of match states (=length of output alignment),
+# the length of the input alignment,
+# the number of seqs in the input alignment
+#
+# Last modified: 07/07/01
+#
+sub pfam2phylipMatchOnly {
+
+ my $infile = $_[ 0 ];
+ my $outfile = $_[ 1 ];
+ my $ne = $_[ 2 ];
+ my @seq_name = ();
+ my @seq_array = ();
+ my $return_line = "";
+ my $seq = "";
+ my $x = 0;
+ my $y = 0;
+ my $i = 0;
+ my $x_offset = 0;
+ my $max_x = 0;
+ my $rf_y = 0;
+ my $number_colum = 0;
+ my $not_ensure = 0;
+ my $saw_rf_line = 0;
+
+ if ( $ne && $ne == 1 ) {
+ $not_ensure = 1;
+ }
+
+ &testForTextFilePresence( $infile );
+
+ open( INPP, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
+
+ # This reads in the first block. It reads in the seq names.
+ while ( 1 ) {
+ if ( &isPfamSequenceLine( $return_line ) ) {
+ $return_line =~ /^(\S+)\s+(\S+)/;
+ $seq_name[ $y ] = substr( $1, 0, $LENGTH_OF_NAME );
+ $seq = $2;
+ for ( $x = 0; $x < length( $seq ); $x++ ) {
+ $seq_array[ $x ][ $y ] = substr( $seq, $x, 1 );
+ }
+ $y++;
+ }
+ elsif ( &isRFline( $return_line ) ) {
+ $saw_rf_line = 1;
+ $return_line =~ /\s+(\S+)\s*$/;
+ $seq = $1;
+ $x_offset = length( $seq );
+ $rf_y = $y;
+ for ( $x = 0; $x < $x_offset; $x++ ) {
+ $seq_array[ $x ][ $rf_y ] = substr( $seq, $x, 1 );
+ }
+ last;
+ }
+
+ $return_line = <INPP>;
+
+ if ( !$return_line ) {
+ &dieWithUnexpectedError( "Alignment not in expected format (no RF line)" );
+ }
+ }
+
+ if ( $saw_rf_line != 1 ) {
+ &dieWithUnexpectedError( "Alignment not in expected format (no RF line)" );
+ }
+
+ $y = 0;
+ $max_x = 0;
+
+ # This reads all blocks after the 1st one.
+ while ( $return_line = <INPP> ) {
+ if ( &isPfamSequenceLine( $return_line ) ) {
+ $return_line =~ /^\S+\s+(\S+)/;
+ $seq = $1;
+ for ( $x = 0; $x < length( $seq ); $x++ ) {
+ $seq_array[ $x + $x_offset ][ $y % $rf_y ] = substr( $seq, $x, 1 );
+ }
+ $y++;
+ }
+ elsif ( &isRFline( $return_line ) ) {
+ if ( $y != $rf_y ) {
+ &dieWithUnexpectedError( "Alignment not in expected format" );
+ }
+
+ $return_line =~ /\s+(\S+)\s*$/;
+ $seq = $1;
+ $max_x = length( $seq );
+
+ for ( $x = 0; $x < length( $seq ); $x++ ) {
+ $seq_array[ $x + $x_offset ][ $rf_y ] = substr( $seq, $x, 1 );
+ }
+
+ $y = 0;
+ $x_offset = $x_offset + $max_x;
+ $max_x = 0;
+ }
+ }
+
+ close( INPP );
+
+ # Counts the match states, and hence the number of aa in the alignment:
+ for ( $x = 0; $x < $x_offset; $x++ ) {
+ if ( !$seq_array[ $x ][ $rf_y ] ) {
+ &dieWithUnexpectedError( "Alignment not in expected format" );
+ }
+ if ( $seq_array[ $x ][ $rf_y ] eq 'x' ) {
+ $number_colum++;
+ }
+ }
+
+ # Writes the file:
+
+ open( OUTPP, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" );
+ print OUTPP "$rf_y $number_colum\n";
+ for ( $y = 0; $y < $rf_y; $y++ ) {
+ print OUTPP "$seq_name[ $y ]";
+ for ( $i = 0; $i < ( $LENGTH_OF_NAME - length( $seq_name[ $y ] ) ); $i++ ) {
+ print OUTPP " ";
+ }
+ for ( $x = 0; $x < $x_offset; $x++ ) {
+ if ( $seq_array[ $x ][ $rf_y ] eq 'x' ) {
+ if ( !$seq_array[ $x ][ $y ] ) {
+ &dieWithUnexpectedError( "Alignment not in expected format" );
+ }
+ if ( $not_ensure != 1 && $seq_array[ $x ][ $y ] !~ /[A-Z]|-/ ) {
+ &dieWithUnexpectedError( "Alignment not in expected format (match states must only contain 'A'-'Z' or '-')" );
+ }
+ print OUTPP "$seq_array[ $x ][ $y ]";
+ }
+ }
+ print OUTPP "\n";
+ }
+ close( OUTPP );
+
+ return $number_colum, $x_offset, $rf_y;
+
+} ## pfam2phylipMatchOnly
+
+
+
+# Returns whether the argument (a String)
+# starts with a SWISS-PROT name (SEQN_SPECI).
+# Last modified: 06/21/01
+sub startsWithSWISS_PROTname {
+ return ( $_[ 0 ] =~ /^[A-Z0-9]{1,4}_[A-Z0-9]{1,5}/ );
+}
+
+
+
+# Returns whether the argument starts with XXX.. XXXXX.. and the first
+# character is not a "#".
+# Last modified: 06/21/01
+sub isPfamSequenceLine {
+ return( !&isPfamCommentLine( $_[ 0 ] )
+ && &containsPfamNamedSequence( $_[ 0 ] ) );
+}
+
+
+
+# Returns whether the argument does start with a "#".
+# Last modified: 06/21/01
+sub isPfamCommentLine {
+ return ( $_[ 0 ] =~ /^#/ );
+}
+
+
+
+# Returns whether the argument starts with XXX XXXXX.
+# Last modified: 06/21/01
+sub containsPfamNamedSequence {
+ return ( $_[ 0 ] =~ /^\S+\s+\S+/ );
+}
+
+
+# Returns whether the argument starts with XXX XXXXX.
+# Last modified: 06/21/01
+sub isRFline {
+ return ( $_[ 0 ] =~ /^#.*RF/ );
+}
+
+
+
+# Three arguments:
+# 1. pairwise distance file
+# 2. number of bootstraps
+# 3. initial tree: BME, GME or NJ
+# Last modified: 2008/12/31
+sub executeFastme {
+ my $inpwd = $_[ 0 ];
+ my $bs = $_[ 1 ];
+ my $init_opt = $_[ 2 ];
+
+ &testForTextFilePresence( $inpwd );
+ my $command = "";
+ if ( $bs > 0 ) {
+ $command = "$FASTME -b $init_opt -i $inpwd -n $bs -s b";
+ }
+ else {
+ $command = "$FASTME -b $init_opt -i $inpwd -s b";
+ }
+ print $command;
+
+ system( $command );
+
+
+} ## executeFastme
+
+
+# Four arguments:
+# 1. pairwise distance file
+# 2. number of bootstraps
+# 3. seed for random number generator
+# 4. lower-triangular data matrix? 1: yes; no, otherwise
+sub executeNeighbor {
+ my $inpwd = $_[ 0 ];
+ my $bs = $_[ 1 ];
+ my $s = $_[ 2 ];
+ my $l = $_[ 3 ];
+ my $multi = "";
+ my $lower = "";
+
+ &testForTextFilePresence( $inpwd );
+
+ if ( $bs >= 2 ) {
+ $multi = "
+M
+$bs
+$s";
+ }
+ if ( $l == 1 ) {
+ $lower = "
+L";
+ }
+
+ system( "$NEIGHBOR >/dev/null 2>&1 << !
+$inpwd$multi$lower
+2
+3
+Y
+!" )
+ && &dieWithUnexpectedError( "Could not execute \"$NEIGHBOR $inpwd$multi$lower\"" );
+
+} ## executeNeighbor
+
+
+# Seven arguments:
+# 1. pairwise distance file
+# 2. number of bootstraps
+# 3. seed for random number generator
+# 4. number of jumbles for input order
+# 5. lower-triangular data matrix? 1: yes; no, otherwise
+# 6. FM for Fitch-Margoliash, ME for ME# 6.
+# 7. 1 to use globale rearragements
+sub executeFitch {
+ my $inpwd = $_[ 0 ];
+ my $bs = $_[ 1 ];
+ my $s = $_[ 2 ];
+ my $j = $_[ 3 ];
+ my $l = $_[ 4 ];
+ my $m = $_[ 5 ];
+ my $use_global_rearr = $_[ 6 ];
+ my $jumble = "";
+ my $multi = "";
+ my $lower = "";
+ my $method = "";
+
+ my $global = "";
+ if ( $use_global_rearr == 1 ) {
+ $global = "
+G";
+ }
+
+ &testForTextFilePresence( $inpwd );
+
+ if ( $m eq "FM" ) {
+ $method = "";
+ }
+ elsif ( $m eq "ME" ) {
+ $method = "
+D";
+ }
+ else {
+ &dieWithUnexpectedError( "method for FITCH must be either FM or ME" );
+ }
+
+ if ( $j >= 1 ) {
+ $jumble = "
+J
+$s
+$j";
+ }
+
+ if ( $bs >= 2 ) {
+ $multi = "
+M
+$bs
+$s";
+ }
+ if ( $l == 1 ) {
+ $lower = "
+L";
+ }
+
+ # jumble must be set BEFORE multi!
+ system( "$FITCH 2>&1 << !
+$inpwd$method$global$jumble$multi$lower
+3
+Y
+!" )
+ && &dieWithUnexpectedError( "Could not execute \"$FITCH $inpwd$method$global$jumble$multi$lower\"" );
+ # 3: Do NOT print out tree
+
+} ## executeFitch
+
+
+
+# Two arguments:
+# 1. pairwise distance file
+# 2. outfile
+sub executeBionj {
+ my $inpwd = $_[ 0 ];
+ my $out = $_[ 1 ];
+
+ &testForTextFilePresence( $inpwd );
+ my $command = "$BIONJ $inpwd $out";
+
+ system( $command )
+ && &dieWithUnexpectedError( $command );
+
+}
+
+# Four arguments:
+# 1. (effective) sequence length
+# 2. (effective) number of bases
+# 3. pairwise distance file
+# 4. outfile
+sub executeWeighbor {
+ my $L = $_[ 0 ];
+ my $b = $_[ 1 ];
+ my $i = $_[ 2 ];
+ my $o = $_[ 3 ];
+
+ &testForTextFilePresence( $i );
+ my $command = "$WEIGHBOR -L $L -b $b -i $i -o $o";
+
+ system( $command )
+ && &dieWithUnexpectedError( $command );
+
+}
+
+# Six arguments:
+# 1. DNA or Amino-Acids sequence filename (PHYLIP format)
+# 2. number of data sets to analyse (ex:3)
+# 3. Model: JTT | MtREV | Dayhoff | WAG | VT | DCMut | Blosum62 (Amino-Acids)
+# 4. number of relative substitution rate categories (ex:4), positive integer
+# 5. starting tree filename (Newick format), your tree filename | BIONJ for a distance-based tree
+# 6. 1 to estimate proportion of invariable sites, otherwise, fixed proportion "0.0" is used
+# PHYML produces several results files :
+# <sequence file name>_phyml_lk.txt : likelihood value(s)
+# <sequence file name>_phyml_tree.txt : inferred tree(s)
+# <sequence file name>_phyml_stat.txt : detailed execution stats
+sub executePhyml {
+ my $sequences = $_[ 0 ];
+ my $data_sets = $_[ 1 ];
+ my $model = $_[ 2 ];
+ my $nb_categ = $_[ 3 ];
+ my $tree = $_[ 4 ];
+ my $estimate_invar_sites = $_[ 5 ];
+
+ if ( $data_sets < 1 ) {
+ $data_sets = 1
+ }
+
+ my $invar = "0.0"; # proportion of invariable sites,
+ # a fixed value (ex:0.0) | e to get the maximum likelihood estimate
+ if ( $estimate_invar_sites == 1 ) {
+ $invar = "e";
+ }
+
+ my $data_type = "1"; # 0 = DNA | 1 = Amino-Acids
+ my $format = "i"; # i = interleaved sequence format | s = sequential
+ my $bootstrap_sets = "0"; # number of bootstrap data sets to generate (ex:2)
+ # only works with one data set to analyse
+
+ my $alpha = "e"; # gamma distribution parameter,
+ # a fixed value (ex:1.0) | e to get the maximum likelihood estimate
+
+ my $opt_topology = "y"; # optimise tree topology ? y | n
+ my $opt_lengths = "y"; # optimise branch lengths and rate parameters ? y | n
+
+ if ( $data_sets > 1 ) {
+ # No need to calc branch lengths for bootstrapped analysis
+ $opt_lengths = "n";
+ }
+
+ &testForTextFilePresence( $sequences );
+ my $command = "$PHYML $sequences $data_type $format $data_sets $bootstrap_sets $model $invar $nb_categ $alpha $tree $opt_topology $opt_lengths";
+
+ print( "\n$command\n");
+
+ system( $command )
+ && &dieWithUnexpectedError( $command );
+
+}
+
+
+
+
+# Four arguments:
+# 1. name of alignment file (in correct format!)
+# 2. number of bootstraps
+# 3. jumbles: 0: do not jumble; >=1 number of jumbles
+# 4. seed for random number generator
+sub executeProtpars {
+ my $align = $_[ 0 ];
+ my $bs = $_[ 1 ];
+ my $rand = $_[ 2 ];
+ my $s = $_[ 3 ];
+ my $jumble = "";
+ my $multi = "";
+
+
+ &testForTextFilePresence( $align );
+
+ if ( $bs > 1 && $rand < 1 ) {
+ $rand = 1;
+ }
+
+ if ( $rand >= 1 ) {
+ $jumble = "
+J
+$s
+$rand";
+ }
+
+ if ( $bs > 1 ) {
+ $multi = "
+M
+D
+$bs";
+ }
+
+ system( "$PROTPARS 2>&1 << !
+$align$jumble$multi
+Y
+!" )
+ && &dieWithUnexpectedError( "Could not execute \"$PROTPARS $align$jumble$multi\"" );
+ # 3: Do NOT print out tree
+
+
+ return;
+
+} ## executeProtpars
+
+
+
+# "Model of substitution" order for DQO TREE-PUZZLE 5.0:
+# Auto
+# m -> Dayhoff (Dayhoff et al. 1978)
+# m -> JTT (Jones et al. 1992)
+# m -> mtREV24 (Adachi-Hasegawa 1996)
+# m -> BLOSUM62 (Henikoff-Henikoff 92)
+# m -> VT (Mueller-Vingron 2000)
+# m -> WAG (Whelan-Goldman 2000)
+# m -> Auto
+# One argument:
+# matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
+# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise
+# Last modified: 07/07/01
+sub setModelForPuzzle {
+ my $matrix_option = $_[ 0 ];
+ my $matr = "";
+
+ if ( $matrix_option == 0 ) { # JTT
+ $matr = "
+m
+m";
+ }
+ elsif ( $matrix_option == 2 ) { # BLOSUM 62
+ $matr = "
+m
+m
+m
+m";
+ }
+ elsif ( $matrix_option == 3 ) { # mtREV24
+ $matr = "
+m
+m
+m";
+ }
+ elsif ( $matrix_option == 5 ) { # VT
+ $matr = "
+m
+m
+m
+m
+m";
+ }
+ elsif ( $matrix_option == 6 ) { # WAG
+ $matr = "
+m
+m
+m
+m
+m
+m";
+ }
+ elsif ( $matrix_option == 7 ) { # auto
+ $matr = "";
+ }
+ else { # PAM
+ $matr = "
+m"
+ }
+
+ return $matr;
+
+} ## setModelForPuzzle
+
+# One argument:
+# Model of rate heterogeneity:
+# 1 for "8 Gamma distributed rates"
+# 2 for "Two rates (1 invariable + 1 variable)"
+# 3 for "Mixed (1 invariable + 8 Gamma rates)"
+# otherwise: Uniform rate
+# Last modified: 09/08/03
+sub setRateHeterogeneityOptionForPuzzle {
+ my $rate_heterogeneity_option = $_[ 0 ];
+ my $opt = "";
+
+ if ( $rate_heterogeneity_option == 1 ) {
+ $opt = "
+w";
+ }
+ elsif ( $rate_heterogeneity_option == 2 ) {
+ $opt = "
+w
+w";
+ }
+ elsif ( $rate_heterogeneity_option == 3 ) {
+ $opt = "
+w
+w
+w";
+ }
+ else {
+ $opt = "";
+ }
+
+ return $opt;
+} ## setRateHeterogeneityOptionForPuzzle
+
+
+# One argument:
+# Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise
+# Last modified: 09/08/03
+sub setParameterEstimatesOptionForPuzzle {
+ my $parameter_estimates_option = $_[ 0 ];
+ my $opt = "";
+
+ if ( $parameter_estimates_option == 1 ) {
+ $opt = "
+e";
+ }
+ else {
+ $opt = "";
+ }
+
+ return $opt;
+} ## setParameterEstimatesOptionForPuzzle
+
+
+
+# three/four/five arguments:
+# 1. Name of inputfile
+# 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
+# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise
+# 3. Number of sequences in alignment
+# 4. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise
+# 5. Model of rate heterogeneity:
+# 1 for "8 Gamma distributed rates"
+# 2 for "Two rates (1 invariable + 1 variable)"
+# 3 for "Mixed (1 invariable + 8 Gamma rates)"
+# otherwise: Uniform rate
+sub executePuzzleBootstrapped {
+ my $in = $_[ 0 ];
+ my $matrix_option = $_[ 1 ];
+ my $number_of_seqs = $_[ 2 ];
+ my $parameter_estimates_option = $_[ 3 ];
+ my $rate_heterogeneity_option = $_[ 4 ];
+
+ my $l = 0;
+ my $slen = 0;
+ my $counter = 0;
+ my $mat = "";
+ my $est = "";
+ my $rate = "";
+ my $a = "";
+ my @a = ();
+
+ &testForTextFilePresence( $in );
+
+ open( GRP, "<$in" ) || die "\n\n$0: Unexpected error: Cannot open file <<$in>>: $!";
+ while( <GRP> ) {
+ if ( $_ =~ /^\s*\d+\s+\d+\s*$/ ) {
+ $counter++;
+ }
+ }
+ close( GRP );
+
+ $l = `cat $in | wc -l`;
+ $slen = $l / $counter;
+
+ system( "split --suffix-length=4 -$slen $in $in.splt." )
+ && die "\n\n$0: executePuzzleDQObootstrapped: Could not execute \"split --suffix-length=4 -$slen $in $in.splt.\": $!";
+
+ @a = <$in.splt.*>;
+
+ $mat = setModelForPuzzle( $matrix_option );
+ if ( $parameter_estimates_option ) {
+ $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option );
+ }
+ if ( $rate_heterogeneity_option ) {
+ $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option );
+ }
+
+ my $k="";
+ if ( $number_of_seqs <= 257 ) {
+ $k = "k";
+ }
+
+ foreach $a ( @a ) {
+ print "-".$a."\n";
+ system( "$PUZZLE $a << !
+$k
+k
+k$mat$est$rate
+y
+!" )
+ && die "$0: Could not execute \"$PUZZLE $a\"";
+
+ system( "cat $a.dist >> $in.dist" )
+ && die "$0: Could not execute \"cat outdist >> $in.dist\"";
+
+ unlink( $a, $a.".dist", $a.".puzzle" );
+ }
+
+ return;
+
+} ## executePuzzleBootstrapped
+
+
+
+
+
+# three/four/five arguments:
+# 1. Name of inputfile
+# 2. Matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
+# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise
+# 3. Number of sequences in alignment
+# 4. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise
+# 5. Model of rate heterogeneity:
+# 1 for "8 Gamma distributed rates"
+# 2 for "Two rates (1 invariable + 1 variable)"
+# 3 for "Mixed (1 invariable + 8 Gamma rates)"
+# otherwise: Uniform rate
+sub executePuzzle {
+ my $in = $_[ 0 ];
+ my $matrix_option = $_[ 1 ];
+ my $number_of_seqs = $_[ 2 ];
+ my $parameter_estimates_option = $_[ 3 ];
+ my $rate_heterogeneity_option = $_[ 4 ];
+ my $mat = "";
+ my $est = "";
+ my $rate = "";
+
+ &testForTextFilePresence( $in );
+
+ $mat = &setModelForPuzzle( $matrix_option );
+ if ( $parameter_estimates_option ) {
+ $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option );
+ }
+ if ( $rate_heterogeneity_option ) {
+ $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option );
+ }
+
+ my $k="";
+ if ( $number_of_seqs <= 257 ) {
+ $k = "k";
+ }
+
+
+ system( "$PUZZLE $in << !
+$k
+k
+k$mat$est$rate
+y
+!" )
+ && die "$0: Could not execute \"$PUZZLE\"";
+
+ return;
+
+} ## executePuzzle
+
+
+
+
+# Preparation of the pwd file
+sub addDistsToQueryToPWDfile {
+ my $pwd_file = $_[ 0 ];
+ my $disttoquery_file = $_[ 1 ];
+ my $outfile = $_[ 2 ];
+ my $name_of_query = $_[ 3 ];
+ my $name_of_query_ = "";
+ my $return_line_pwd = "";
+ my $return_line_dq = "";
+ my $num_of_sqs = 0;
+ my $block = 0;
+ my $name_from_pwd = "X";
+ my $name_from_dq = "Y";
+ my @dists_to_query = ();
+ my $i = 0;
+
+ &testForTextFilePresence( $pwd_file );
+ &testForTextFilePresence( $disttoquery_file );
+
+ $name_of_query_ = $name_of_query;
+ for ( my $j = 0; $j <= ( $LENGTH_OF_NAME - length( $name_of_query ) - 1 ); ++$j ) {
+ $name_of_query_ .= " ";
+ }
+
+ open( OUT_AD, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" );
+ open( IN_PWD, "$pwd_file" ) || &dieWithUnexpectedError( "Cannot open file \"$pwd_file\"" );
+ open( IN_DQ, "$disttoquery_file" ) || &dieWithUnexpectedError( "Cannot open file \"$disttoquery_file\"" );
+
+ W: while ( $return_line_pwd = <IN_PWD> ) {
+
+
+ if ( $return_line_pwd =~ /^\s*(\d+)\s*$/ ) {
+ $num_of_sqs = $1;
+ $num_of_sqs++;
+ if ( $block > 0 ) {
+ print OUT_AD "$name_of_query_ ";
+ for ( my $j = 0; $j < $i; ++$j ) {
+ print OUT_AD "$dists_to_query[ $j ] ";
+ }
+ print OUT_AD "0.0\n";
+ }
+ print OUT_AD " $num_of_sqs\n";
+ $block++;
+ @dists_to_query = ();
+ $i = 0;
+ }
+
+ if ( $block == 1
+ && $return_line_pwd =~ /^\s*(\S+)\s+\S+/ ) {
+ $name_from_pwd = $1;
+
+ if ( !defined( $return_line_dq = <IN_DQ> ) ) {
+ &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" );
+ }
+
+ if ( $return_line_dq !~ /\S/ ) {
+ if ( !defined( $return_line_dq = <IN_DQ> ) ) {
+ &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" );
+ }
+ }
+ $return_line_dq =~ /^\s*(\S+)\s+(\S+)/;
+ $name_from_dq = $1;
+ $dists_to_query[ $i++ ] = $2;
+
+
+ if ( $name_from_pwd ne $name_from_dq ) {
+ &dieWithUnexpectedError( "Order of sequence names in \"$pwd_file\" and \"$disttoquery_file\" is not the same" );
+ }
+ print OUT_AD $return_line_pwd;
+
+ }
+ elsif ( $block > 1
+ && $return_line_pwd =~ /^\s*(\S+)\s+\S+/ ) {
+ $name_from_pwd = $1;
+ if ( !defined( $return_line_dq = <IN_DQ> ) ) {
+ &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" );
+ }
+ if ( $return_line_dq !~ /\S/ ) {
+ if ( !defined( $return_line_dq = <IN_DQ>) ) {
+ &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" );
+ }
+ }
+ $return_line_dq =~ /^\s*\S+\s+(\S+)/;
+ $dists_to_query[ $i++ ] = $1;
+ print OUT_AD $return_line_pwd;
+ }
+ }
+ print OUT_AD "$name_of_query_ ";
+ for ( my $j = 0; $j < $i; ++$j ) {
+ print OUT_AD "$dists_to_query[ $j ] ";
+ }
+ print OUT_AD "0.0\n";
+
+ close( OUT_AD );
+ close( IN_PWD );
+ close( IN_DQ );
+ return $block;
+
+} ## addDistsToQueryToPWDfile
+
+
+
+
+# Three arguments:
+# 1. HMMER model db
+# 2. name of HMM
+# 3. outputfile name
+# Last modified: 02/27/01
+sub executeHmmfetch {
+
+ my $db = $_[ 0 ];
+ my $name = $_[ 1 ];
+ my $outfile = $_[ 2 ];
+
+ system( "$HMMFETCH $db $name > $outfile" )
+ && &dieWithUnexpectedError( "Could not execute \"$HMMFETCH $db $name > $outfile\"" );
+ return;
+
+} ## executeHmmfetch
+
+
+
+# Checks wether a file is present, not empty and a plain textfile.
+# One argument: name of file.
+# Last modified: 07/07/01
+sub testForTextFilePresence {
+ my $file = $_[ 0 ];
+ unless ( ( -s $file ) && ( -f $file ) && ( -T $file ) ) {
+ dieWithUnexpectedError( "File \"$file\" does not exist, is empty, or is not a plain textfile" );
+ }
+} ## testForTextFilePresence
+
+
+# Last modified: 02/21/03
+sub addSlashAtEndIfNotPresent {
+ my $filename = $_[ 0 ];
+ $filename =~ s/\s+//g;
+ unless ( $filename =~ /\/$/ ) {
+ $filename = $filename."/";
+ }
+ return $filename;
+} ## addSlashAtEndIfNotPresent
+
+
+
+# Last modified: 02/15/02
+sub exitWithWarning {
+
+ my $text = $_[ 0 ];
+ if ( defined( $_[ 1 ] ) && $_[ 1 ] == 1 ) {
+ print( "<H4 class=\"error\">user error</H4>\n" );
+ print( "<P>\n" );
+ print( "<B>$text</B>\n" );
+ print( "</P>\n" );
+ print( "<P>  </P>\n" );
+ }
+ else {
+ print( "\n\n$text\n\n" );
+ }
+
+ exit( 0 );
+
+} ## exit_with_warning
+
+
+
+# Last modified: 02/15/02
+sub dieWithUnexpectedError {
+
+ my $text = $_[ 0 ];
+
+ die( "\n\n$0:\nUnexpected error (should not have happened):\n$text\n$!\n\n" );
+
+} ## dieWithUnexpectedError
+
+
+
+1;
--- /dev/null
+#!/usr/bin/perl -W
+#
+# $Id: phylo_pl.pl,v 1.32 2010/12/13 19:00:22 cmzmasek Exp $
+#
+# FORESTER -- software libraries and applications
+# for evolutionary biology research and applications.
+#
+# Copyright (C) 2008-2009 Christian M. Zmasek
+# Copyright (C) 2008-2009 Burnham Institute for Medical Research
+# All rights reserved
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+#
+# Contact: phylosoft @ gmail . com
+# WWW: www.phylosoft.org/forester
+#
+#
+#
+# Requirements phylo_pl is part of the FORESTER libraries.
+# ------------ Many of its global variables are set via forester.pm.
+#
+#
+# Note. Use xt.pl (for Pfam alignments) or mt.pl (for other alignments)
+# to run phylo_pl.pl on whole directories of alignments files.
+#
+#
+#
+#
+# =========================
+#
+# METHOD ORDER (IMPORTANT!)
+# 1. FastME
+# 2. phylip NJ
+# 3. phylip fitch FM
+# 4. phylip fitch ME
+# 5. BIONJ
+# 6. Weighbor
+# 7. Raxml
+# 8. phyml
+# 9. phylip proml
+# 10. phylip protpars
+# 11. all
+#==========================
+
+use strict;
+
+use FindBin;
+use lib $FindBin::Bin;
+use forester;
+
+my $VERSION = "1.0.1";
+my $LAST_MODIFIED = "2009.10.02";
+
+my $RAXML_MODEL_BASE = "PROTGAMMA";
+my $RAXML_ALGORITHM = "a";
+
+my $TEMP_DIR_DEFAULT = "/tmp/phylo_pl_"; # Where all the infiles, outfiles, etc will be created.
+
+my $bootstraps = 100; # 0,1: do not bootstrap. Default: 100
+my $matrix = 5; # 0 = JTT
+ # 1 = PAM
+ # 2 = BLOSUM 62
+ # 3 = mtREV24
+ # 5 = VT - default
+ # 6 = WAG
+ # 7 = auto in puzzle
+ # 8 = DCMut in PHYML, VT in TREE-PUZZLE
+my $rate_heterogeneity = 0; # 0 = Uniform rate (default)
+ # 1 = 8 Gamma distributed rates
+ # 2 = Two rates (1 invariable + 1 variable)
+ # 3 = Mixed (1 invariable + 8 Gamma rates)
+my $seed = 9; # Seed for random number generators. Default: 9
+my $keep_multiple_trees = 0; # 0: delete multiple tree file
+ # 1: do not delete multiple tree file
+my $exact_parameter_est = 0; # 0: no; 1: yes
+
+my $phyml_rel_substitution_rate_cat = 4;
+
+my $jumbles = 2;
+my $use_fastme = 0; # 0: no; 1: yes
+my $use_phylip_nj = 0; # 0: no; 1: yes
+my $use_phylip_fitch_fm = 0; # 0: no; 1: yes
+my $use_phylip_fitch_me = 0; # 0: no; 1: yes
+my $use_bionj = 0; # 0: no; 1: yes
+my $use_weighbor = 0; # 0: no; 1: yes
+my $use_raxml = 0; # 0: no; 1: yes
+my $use_phyml = 0; # 0: no; 1: yes
+my $use_proml = 0; # 0: no; 1: yes
+my $use_protpars = 0; # 0: no; 1: yes
+my $use_global_rearr = 0; # 0: no; 1: yes
+my $estimate_invar_sites = 0; # 0: no; 1: yes
+
+my $fastme_init_tree_opt = "NJ";
+
+my %seqnames = (); # number => seqname
+my %numbers = (); # seqname => number
+my $options = "";
+my $infile = "";
+my $pwdfile = "";
+my $outfile = "";
+my $logfile = "";
+my $multipwdfile = "";
+my $distancefile = "";
+my $log = "";
+my $ii = 0;
+my $temp_dir = "";
+my $current_dir = "";
+my @out = ();
+my $number_of_seqs = 0;
+my $number_of_aa = 0;
+
+my $use_pwd_based_methods = 0;
+
+print( "\n");
+print( "phylo_pl $VERSION ($LAST_MODIFIED)\n" );
+print( "__________________________________\n");
+print( "\n\n");
+
+
+
+unless ( @ARGV == 2 || @ARGV == 3 || @ARGV == 4 || @ARGV == 5 ) {
+ &printUsage();
+ exit ( -1 );
+}
+
+
+
+# Analyzes the options:
+# ---------------------
+
+if ( $ARGV[ 0 ] =~ /^-.+/ ) {
+
+ unless ( @ARGV > 2 ) {
+ &printUsage();
+ exit ( -1 );
+ }
+ $options = $ARGV[ 0 ];
+
+ if ( @ARGV != 3 && @ARGV != 4 ) {
+ &printUsage();
+ exit ( -1 );
+ }
+ $infile = $ARGV[ 1 ];
+ $outfile = $ARGV[ 2 ];
+ if ( @ARGV == 4 ) {
+ $temp_dir = $ARGV[ 3 ];
+ }
+ if ( $options =~ /B(\d+)/ ) {
+ $bootstraps = $1;
+ if ( $bootstraps <= 1 ) {
+ $bootstraps = 0;
+ }
+ elsif ( $bootstraps <= 9 ) {
+ $bootstraps = 0;
+ print "\n\nphylo_pl: WARNING: Bootstrap number must be devisable by 10,\nno bootstrapping.\n\n";
+ }
+ elsif ( $bootstraps % 10 != 0 ) {
+ $bootstraps = $bootstraps - $bootstraps % 10; # to ensure $bootstraps % 10 == 0
+ print "\n\nphylo_pl: WARNING: Bootstrap number must be devisable by 10,\nset to $bootstraps.\n\n";
+ }
+ }
+ if ( $options =~ /n/ ) {
+ $use_phylip_nj = 1;
+ }
+ if ( $options =~ /q@(\d)/ ) {
+ $use_fastme = 1;
+ my $opt = $1;
+ if ( $opt == 1 ) {
+ $fastme_init_tree_opt = "GME";
+ }
+ elsif ( $opt == 2 ) {
+ $fastme_init_tree_opt = "BME";
+ }
+ elsif ( $opt == 3 ) {
+ $fastme_init_tree_opt = "NJ";
+ }
+ else {
+ &printUsage();
+ exit ( -1 );
+ }
+ }
+ if ( $options =~ /f/ ) {
+ $use_phylip_fitch_fm = 1;
+ }
+ if ( $options =~ /e/ ) {
+ $use_phylip_fitch_me = 1;
+ }
+ if ( $options =~ /b/ ) {
+ $use_bionj = 1;
+ }
+ if ( $options =~ /w/ ) {
+ $use_weighbor = 1;
+ }
+ if ( $options =~ /x/ ) {
+ $use_raxml = 1;
+ }
+ if ( $options =~ /y/ ) {
+ $use_phyml = 1;
+ }
+ if ( $options =~ /o/ ) {
+ $use_proml = 1;
+ }
+ if ( $options =~ /p/ ) {
+ $use_protpars = 1;
+ }
+ if ( $options =~ /G/ ) {
+ $use_global_rearr = 1;
+ }
+ if ( $options =~ /I/ ) {
+ $estimate_invar_sites = 1;
+ }
+ if ( $options =~ /j(\d+)/ ) {
+ $jumbles = $1;
+ if ( $jumbles < 1 ) {
+ $jumbles = 0;
+ }
+ }
+ if ( $options =~ /r(\d+)/ ) {
+ $phyml_rel_substitution_rate_cat = $1;
+ if ( $phyml_rel_substitution_rate_cat < 1 ) {
+ $phyml_rel_substitution_rate_cat = 1;
+ }
+ }
+ if ( $options =~ /J/ ) {
+ $matrix = 0; # JTT
+ }
+ if ( $options =~ /P/ ) {
+ $matrix = 1; # PAM
+ }
+ if ( $options =~ /L/ ) {
+ $matrix = 2; # Blosum 62
+ }
+ if ( $options =~ /M/ ) {
+ $matrix = 3; # mtREV24
+ }
+ if ( $options =~ /W/ ) {
+ $matrix = 6; # WAG
+ }
+ if ( $options =~ /A/ ) {
+ $matrix = 7; # auto
+ }
+ if ( $options =~ /D/ ) {
+ $matrix = 8; # DCMut in PHYML and RAXML, VT in PUZZLE
+ }
+ if ( $options =~ /S(\d+)/ ) {
+ $seed = $1;
+ }
+ if ( $options =~ /X/ ) {
+ $keep_multiple_trees = 1;
+ }
+ if ( $options =~ /E/ ) {
+ $exact_parameter_est = 1;
+ }
+ if ( $options =~ /g/ ) {
+ $rate_heterogeneity = 1;
+ }
+ if ( $options =~ /t/ ) {
+ $rate_heterogeneity = 2;
+ }
+ if ( $options =~ /m/ ) {
+ $rate_heterogeneity = 3;
+ }
+}
+else {
+ unless ( @ARGV == 2 || @ARGV == 3 ) {
+ &printUsage();
+ exit ( -1 );
+ }
+ $infile = $ARGV[ 0 ];
+ $outfile = $ARGV[ 1 ];
+ if ( @ARGV == 3 ) {
+ $temp_dir = $ARGV[ 2 ];
+ }
+}
+
+if ( $use_fastme != 1 &&
+ $use_phylip_nj != 1 &&
+ $use_phylip_fitch_fm != 1 &&
+ $use_phylip_fitch_me != 1 &&
+ $use_bionj != 1 &&
+ $use_weighbor != 1 &&
+ $use_raxml != 1 &&
+ $use_phyml != 1 &&
+ $use_proml != 1 &&
+ $use_protpars != 1 ) {
+
+ $use_fastme = 1;
+ $use_phylip_nj = 1;
+ $use_phylip_fitch_fm = 1;
+ $use_phylip_fitch_me = 1;
+ $use_bionj = 1;
+ $use_raxml = 1;
+ $use_weighbor = 1;
+ $use_phyml = 1;
+ $use_proml = 1;
+ $use_protpars = 1;
+}
+
+
+if ( $use_fastme == 1 ||
+ $use_phylip_nj == 1 ||
+ $use_phylip_fitch_fm == 1 ||
+ $use_phylip_fitch_me == 1 ||
+ $use_bionj == 1 ||
+ $use_weighbor == 1 ) {
+ $use_pwd_based_methods = 1;
+}
+else {
+ $use_pwd_based_methods = 0;
+}
+
+$current_dir = `pwd`;
+$current_dir =~ s/\s//;
+
+if ( $outfile !~ /^\// ) {
+ # outfile is not absolute path.
+ $outfile = $current_dir."/".$outfile;
+}
+
+
+
+# TREE-PUZZLE sets the option in this way:
+# If two rates or mixed, exact parameter estimates are used.
+if ( $rate_heterogeneity == 2
+|| $rate_heterogeneity == 3 ) {
+ $exact_parameter_est = 1
+}
+
+
+if ( $outfile =~ /\.xml$/i ) {
+ $outfile =~ s/\.xml//i;
+}
+elsif ( $outfile =~ /\.aln$/i ) {
+ $outfile =~ s/\.aln//i;
+}
+elsif ( $outfile =~ /\.fasta$/i ) {
+ $outfile =~ s/\.fasta//i;
+}
+elsif ( $outfile =~ /\.fas$/i ) {
+ $outfile =~ s/\.fas//i;
+}
+elsif ( $outfile =~ /\.seqs$/i ) {
+ $outfile =~ s/\.seqs//i;
+}
+
+
+$logfile = $outfile.$LOG_FILE_SUFFIX;
+$multipwdfile = $outfile.$MULTIPLE_PWD_FILE_SUFFIX;
+$distancefile = $outfile.$SUFFIX_PWD_NOT_BOOTS;
+
+&dieIfFileExists( $logfile );
+&dieIfFileExists( $multipwdfile );
+&dieIfFileExists( $distancefile );
+
+
+my $fastme_outtree = $outfile."_fme.xml";
+my $phylip_nj_outtree = $outfile."_pnj.xml";
+my $phylip_fm_outtree = $outfile."_pfm.xml";
+my $phylip_me_outtree = $outfile."_pme.xml";
+my $bionj_outtree = $outfile."_bionj.xml";
+my $weighbor_outtree = $outfile."_weigh.xml";
+my $raxml_outtree = $outfile."_raxml.xml";
+my $phyml_outtree = $outfile."_phyml.xml";
+my $proml_outtree = $outfile."_proml.xml";
+my $protpars_outtree = $outfile."_ppp.xml";
+my $all_outtree = $outfile."_comb.xml";
+
+my $multitreefile_fastme = $outfile."_fme".$MULTIPLE_TREES_FILE_SUFFIX;
+my $multitreefile_phylip_nj = $outfile."_pnj".$MULTIPLE_TREES_FILE_SUFFIX;
+my $multitreefile_phylip_fm = $outfile."_pfm".$MULTIPLE_TREES_FILE_SUFFIX;
+my $multitreefile_phylip_me = $outfile."_pme".$MULTIPLE_TREES_FILE_SUFFIX;
+my $multitreefile_bionj = $outfile."_bionj".$MULTIPLE_TREES_FILE_SUFFIX;
+my $multitreefile_weighbor = $outfile."_weigh".$MULTIPLE_TREES_FILE_SUFFIX;
+my $multitreefile_raxml = $outfile."_raxml".$MULTIPLE_TREES_FILE_SUFFIX;
+my $multitreefile_phyml = $outfile."_phyml".$MULTIPLE_TREES_FILE_SUFFIX;
+my $multitreefile_proml = $outfile."_proml".$MULTIPLE_TREES_FILE_SUFFIX;
+my $multitreefile_protpars = $outfile."_ppp".$MULTIPLE_TREES_FILE_SUFFIX;
+
+if ( $use_fastme == 1 ) {
+ &dieIfFileExists( $fastme_outtree );
+ if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) {
+ &dieIfFileExists( $multitreefile_fastme );
+ }
+}
+if( $use_phylip_nj == 1 ) {
+ &dieIfFileExists( $phylip_nj_outtree );
+ if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) {
+ &dieIfFileExists( $multitreefile_phylip_nj );
+ }
+}
+if( $use_phylip_fitch_fm == 1 ) {
+ &dieIfFileExists( $phylip_fm_outtree );
+ if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) {
+ &dieIfFileExists( $multitreefile_phylip_fm );
+ }
+}
+if( $use_phylip_fitch_me == 1 ) {
+ &dieIfFileExists( $phylip_me_outtree );
+ if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) {
+ &dieIfFileExists( $multitreefile_phylip_me );
+ }
+}
+if( $use_bionj == 1 ) {
+ &dieIfFileExists( $bionj_outtree );
+ if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) {
+ &dieIfFileExists( $multitreefile_bionj );
+ }
+}
+if( $use_weighbor == 1 ) {
+ &dieIfFileExists( $weighbor_outtree );
+ if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) {
+ &dieIfFileExists( $multitreefile_weighbor );
+ }
+}
+if( $use_raxml == 1 ) {
+ &dieIfFileExists( $raxml_outtree );
+ if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) {
+ &dieIfFileExists( $multitreefile_raxml );
+ }
+}
+if( $use_phyml == 1 ) {
+ &dieIfFileExists( $phyml_outtree );
+ if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) {
+ &dieIfFileExists( $multitreefile_phyml );
+ }
+}
+if( $use_proml == 1 ) {
+ &dieIfFileExists( $proml_outtree );
+ if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) {
+ &dieIfFileExists( $multitreefile_proml );
+ }
+}
+if( $use_protpars == 1 ) {
+ &dieIfFileExists( $protpars_outtree );
+ if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) {
+ &dieIfFileExists( $multitreefile_protpars );
+ }
+}
+if ( $bootstraps > 1 ) {
+ &dieIfFileExists( $all_outtree );
+}
+if ( $infile ne "" ) {
+ unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
+ die "\n\nphylo_pl: Input alignment file \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n";
+ }
+}
+
+
+
+
+# Prints out the options:
+# -----------------------
+
+
+$log = "\n$0 logfile:\n";
+$log = $log."Version: $VERSION\n\n";
+
+
+
+if ( $infile ne "" ) {
+ $log = $log."Input : $infile\n";
+}
+
+if ( $keep_multiple_trees == 1 && $bootstraps >= 2 ) {
+ $log = $log."Multiple distance matrices : $multipwdfile\n";
+}
+
+
+$log = $log."Bootstraps : $bootstraps\n";
+
+if ( $use_pwd_based_methods == 1 ) {
+ $log = $log."Prgrm to calculate pairwise dist. : TREE-PUZZLE (version: $PUZZLE_VERSION)\n";
+}
+
+
+if ( $use_fastme == 1 ) {
+ $log = $log."Program to calculate tree : FastME (version: $FASTME_VERSION)\n";
+ $log = $log."Method for intial tree in FastME : $fastme_init_tree_opt\n";
+ $log = $log."Tree swapping (NNI) in FastME : balanced (default)\n";
+}
+if ( $use_phylip_nj == 1 ) {
+ $log = $log."Program to calculate tree : PHYLIP NEIGHBOR NJ (version: $PHYLIP_VERSION)\n";
+}
+if ( $use_phylip_fitch_fm == 1 ) {
+ $log = $log."Program to calculate tree : PHYLIP FITCH Fitch-Margoliash (version: $PHYLIP_VERSION)\n";
+}
+if ( $use_phylip_fitch_me == 1 ) {
+ $log = $log."Program to calculate tree : PHYLIP FITCH Minimal Evolution (version: $PHYLIP_VERSION)\n";
+}
+if ( $use_bionj == 1 ) {
+ $log = $log."Program to calculate tree : BIONJ (version: $BIONJ_VERSION)\n";
+}
+if ( $use_weighbor == 1 ) {
+ $log = $log."Program to calculate tree : Weighbor [no invariable sites, b=14] (version: $WEIGHBOR_VERSION)\n";
+}
+if ( $use_raxml == 1 ) {
+ $log = $log."Program to calculate tree : RAxML [$RAXML_MODEL_BASE] (uses its own bootstraps, if bootstrapped: -f $RAXML_ALGORITHM) (version: $RAXML_VERSION)\n";
+}
+if ( $use_phyml == 1 ) {
+ $log = $log."Program to calculate tree : PHYML (MLE for gamma distr param and proportion of inv sites) (version: $PHYML_VERSION)\n";
+ $log = $log."# of rel subst rate categories : $phyml_rel_substitution_rate_cat\n";
+}
+if ( $use_proml == 1 ) {
+ $log = $log."Program to calculate tree : PHYLIP PROML (uses PAM unless JTT selected) (version: $PHYLIP_VERSION)\n";
+}
+if ( $use_protpars == 1 ) {
+ $log = $log."Program to calculate tree : PHYLIP PROTPARS (with global rearrangements) (version: $PHYLIP_VERSION)\n";
+}
+if ( $use_phylip_fitch_fm == 1 || $use_phylip_fitch_me == 1 || $use_protpars == 1 || $use_proml ) {
+ $log = $log."Number of jumbles (input order rand): $jumbles\n";
+
+}
+if ( $use_phylip_fitch_fm == 1 || $use_phylip_fitch_me == 1 || $use_proml ) {
+ if ( $use_global_rearr == 1 ) {
+ $log = $log."Global rearrangements : true\n";
+ }
+ else {
+ $log = $log."Global rearrangements : false\n";
+
+ }
+}
+
+if ( $bootstraps > 0 ) {
+ $log = $log."Prgrm to calculate ML branch lenghts: TREE-PUZZLE (version: $PUZZLE_VERSION)\n";
+}
+
+$log = $log."Model : ";
+if ( $matrix == 0 ) {
+ $log = $log."JTT (Jones et al. 1992)\n";
+}
+elsif ( $matrix == 1 ) {
+ $log = $log."PAM (Dayhoff et al. 1978)\n";
+}
+elsif ( $matrix == 2 ) {
+ $log = $log."BLOSUM 62 (Henikoff-Henikoff 92)\n";
+}
+elsif ( $matrix == 3 ) {
+ $log = $log."mtREV24 (Adachi-Hasegawa 1996)\n";
+}
+elsif ( $matrix == 5 ) {
+ $log = $log."VT (Mueller-Vingron 2000)\n";
+}
+elsif ( $matrix == 6 ) {
+ $log = $log."WAG (Whelan-Goldman 2000)\n";
+}
+elsif ( $matrix == 7 ) {
+ $log = $log."auto in TREE-PUZZLE\n";
+}
+elsif ( $matrix == 8 ) {
+ $log = $log."DCMut (Kosial and Goldman, 2005) in PHYML and RAxML, VT in TREE-PUZZLE\n";
+}
+else {
+ &dieWithUnexpectedError( "Unknown model: matrix=$matrix" );
+}
+if ( $use_raxml == 1 || $use_phyml == 1 ) {
+ if ( $estimate_invar_sites == 1 ) {
+ $log = $log."Estimate proportion of invariable sites in RAXML and/or PHYML: true\n";
+ }
+ else {
+ $log = $log."Estimate proportion of invariable sites in RAXML and/or PHYML: false (proportion \"0.0\" is used in PHYML)\n";
+ }
+}
+
+$log = $log."Model of rate heterogeneity (PUZZLE): ";
+if ( $rate_heterogeneity == 1 ) {
+ $log = $log."8 Gamma distributed rates\n";
+}
+elsif ( $rate_heterogeneity == 2 ) {
+ $log = $log."Two rates (1 invariable + 1 variable)\n";
+}
+elsif ( $rate_heterogeneity == 3 ) {
+ $log = $log."Mixed (1 invariable + 8 Gamma rates)\n";
+}
+else {
+ $log = $log."Uniform rate\n";
+}
+$log = $log."Seed for random number generators : $seed\n";
+if ( $exact_parameter_est == 1 ) {
+ $log = $log."Exact parameter estimates in TREE-PUZZLE\n";
+}
+
+$log = $log."Start time/date : ".`date`;
+
+
+
+
+# That's where the mischief starts....
+# ------------------------------------
+
+$ii = 0;
+
+srand();
+my $range = 1000000;
+my $random_number = int( rand( $range ) );
+
+if ( $temp_dir eq "" ) {
+ $temp_dir = $TEMP_DIR_DEFAULT;
+}
+
+$temp_dir = $temp_dir.$random_number.$ii;
+
+while ( -e $temp_dir ) {
+ $ii++;
+ $temp_dir = $temp_dir.$random_number.$ii;
+}
+
+mkdir( $temp_dir, 0700 )
+|| die "\n\n$0: Could not create <<$temp_dir>>: $!\n\n";
+
+unless ( ( -e $temp_dir ) && ( -d $temp_dir ) ) {
+ die "\n\n$0: <<$temp_dir>> does not exist, or is not a directory.\n\n";
+}
+
+
+&cp( $infile, $temp_dir."/INFILE" );
+unless ( chmod ( 0600, $temp_dir."/INFILE" ) ) {
+ warn "\n\n$0: Could not chmod. $!\n\n";
+}
+$infile = "INFILE";
+
+chdir ( $temp_dir )
+|| die "\n\n$0: Could not chdir to <<$temp_dir>>: $!\n\n";
+
+&cp( $infile, "infile" );
+@out = &getNumberOfSeqsAndAas( $infile );
+$number_of_seqs = $out[ 0 ];
+$number_of_aa = $out[ 1 ];
+
+my $SEQBOOT_OUTFILE = "seqboot_outfile";
+
+if ( $bootstraps > 1 && ( $use_pwd_based_methods == 1
+ || $use_phyml == 1
+ || $use_proml == 1
+ || $use_protpars == 1 ) ) {
+ &executeSeqboot( $seed, $bootstraps );
+ &mv( "outfile", $SEQBOOT_OUTFILE );
+ &rm( "infile" );
+}
+
+&cp( $infile, "align" );
+
+if ( $use_pwd_based_methods == 1 ) {
+ # Calculating the pairwise distances (saved in file "infile"): "puzzle"
+ if ( $bootstraps > 1 ) {
+ &executePuzzleBootstrapped( $SEQBOOT_OUTFILE,
+ $matrix,
+ $number_of_seqs,
+ $exact_parameter_est,
+ $rate_heterogeneity );
+
+ $pwdfile = $SEQBOOT_OUTFILE.".dist";
+ }
+ else {
+ &executePuzzle( "infile",
+ $matrix,
+ $number_of_seqs,
+ $exact_parameter_est,
+ $rate_heterogeneity );
+ $pwdfile = "infile.dist";
+ }
+}
+
+&rm( "infile" );
+
+# Methods based on alignment
+# --------------------------
+my $OUTTREE_RAXML = "outtree_rax";
+my $OUTTREE_PHYML = "outtree_phyml";
+my $OUTTREE_PROML = "outtree_proml";
+my $OUTTREE_PROTPARS = "outtree_protpars";
+
+my $CONSENSUS_RAXML = "consensus_raxml";
+my $CONSENSUS_PHYML = "consensus_phyml";
+my $CONSENSUS_PROML = "consensus_proml";
+my $CONSENSUS_PROTPARS = "consensus_protpars";
+
+my $OUTTREES_ALL = "outtrees_all";
+my $all_count = 0;
+
+if ( $use_raxml == 1 ) {
+
+ my $model = "---";
+ if ( $matrix == 0 ) {
+ $model = "JTT";
+ }
+ elsif ( $matrix == 1 ) {
+ $model = "DAYHOFF";
+ }
+ elsif ( $matrix == 2 ) {
+ $model = "BLOSUM62";
+ }
+ elsif ( $matrix == 3 ) {
+ $model = "MTREV";
+ }
+ elsif ( $matrix == 5 ) {
+ $model = "VT";
+ }
+ elsif ( $matrix == 6 ) {
+ $model = "WAG";
+ }
+ elsif ( $matrix == 7 ) {
+ $model = "VT";
+ }
+ elsif ( $matrix == 8 ) {
+ $model = "DCMUT";
+ }
+ else {
+ &dieWithUnexpectedError( "Unknown model: matrix=$matrix" );
+ }
+
+ print( "\n========== RAxML begin =========\n\n" );
+ # Six arguments:
+ # 1. DNA or Amino-Acids sequence filename (PHYLIP format)
+ # 2. Model, eg. PROTGAMMAIVT
+ # 3. Replicates (bootstrap)
+ # 4. Seed for bootstrap
+ # 5. Output suffix
+ # 6. Algorithm (only for bootstrap, default otherwise)
+ my $invar = "";
+ if ( $estimate_invar_sites == 1 ) {
+ $invar = "I";
+ }
+
+ # NOTE. RaxML does its own bootstrapping.
+ &executeRaxml( "align", $RAXML_MODEL_BASE.$invar.$model."F", $bootstraps, $seed, "xxx", $RAXML_ALGORITHM );
+ print( "\n========== RAxML end =========\n\n" );
+
+ &rm( "RAxML_log.xxx" );
+ &rm( "RAxML_parsimonyTree.xxx" );
+ &mv( "RAxML_info.xxx", $outfile."_raxml_info" );
+ if ( $bootstraps > 1 ) {
+ &rm( "RAxML_bestTree.xxx" );
+ &mv( "RAxML_bipartitions.xxx", $CONSENSUS_RAXML );
+ &append( "RAxML_bootstrap.xxx", $OUTTREES_ALL );
+ if ( $keep_multiple_trees == 1 ) {
+ &mv( "RAxML_bootstrap.xxx", $multitreefile_raxml );
+ }
+ else {
+ &rm( "RAxML_bootstrap.xxx" );
+ }
+ $all_count++;
+ }
+ else {
+ &mv( "RAxML_result.xxx", $OUTTREE_RAXML );
+ }
+}
+
+
+if ( $use_phyml == 1 ) {
+
+ my $model = "---";
+ if ( $matrix == 0 ) {
+ $model = "JTT";
+ }
+ elsif ( $matrix == 1 ) {
+ $model = "Dayhoff";
+ }
+ elsif ( $matrix == 2 ) {
+ $model = "Blosum62";
+ }
+ elsif ( $matrix == 3 ) {
+ $model = "MtREV";
+ }
+ elsif ( $matrix == 5 ) {
+ $model = "VT";
+ }
+ elsif ( $matrix == 6 ) {
+ $model = "WAG";
+ }
+ elsif ( $matrix == 7 ) {
+ $model = "VT";
+ }
+ elsif ( $matrix == 8 ) {
+ $model = "DCMut";
+ }
+ else {
+ &dieWithUnexpectedError( "Unknown model: matrix=$matrix" );
+ }
+
+ my $input = "";
+ if ( $bootstraps > 1 ) {
+ $input = $SEQBOOT_OUTFILE;
+ }
+ else {
+ $input = "align";
+ }
+ print( "\n========== PHYML begin =========\n\n" );
+ # Six arguments:
+ # 1. DNA or Amino-Acids sequence filename (PHYLIP format)
+ # 2. number of data sets to analyse (ex:3)
+ # 3. Model: JTT | MtREV | Dayhoff | WAG | VT | DCMut | Blosum62 (Amino-Acids)
+ # 4. number of relative substitution rate categories (ex:4), positive integer
+ # 5. starting tree filename (Newick format), your tree filename | BIONJ for a distance-based tree
+ # 6. 1 to estimate proportion of invariable sites, otherwise, fixed proportion "0.0" is used
+ # PHYML produces several results files :
+ # <sequence file name>_phyml_lk.txt : likelihood value(s)
+ # <sequence file name>_phyml_tree.txt : inferred tree(s)
+ # <sequence file name>_phyml_stat.txt : detailed execution stats
+ &executePhyml( $input, $bootstraps, $model, $phyml_rel_substitution_rate_cat, "BIONJ", $estimate_invar_sites );
+ print( "\n========== PHYML end =========\n\n" );
+
+ &rm( $input."_phyml_lk.txt" );
+ &mv( $input."_phyml_tree.txt", $OUTTREE_PHYML );
+ if ( -e $outfile."_phyml_stat" ) {
+ &rm( $outfile."_phyml_stat" );
+ }
+ &mv( $input."_phyml_stat.txt", $outfile."_phyml_stat" );
+ if ( $bootstraps > 1 ) {
+ &append( $OUTTREE_PHYML, $OUTTREES_ALL );
+ $all_count++;
+ }
+
+}
+
+if ( $use_proml == 1 ) {
+ my $input = "";
+ if ( $bootstraps > 1 ) {
+ $input = $SEQBOOT_OUTFILE;
+ }
+ else {
+ $input = "align";
+ }
+ print( "\n========== PHYLIP PROML begin =========\n\n" );
+ # Five arguments:
+ # 1. name of alignment file (in correct format!)
+ # 2. number of bootstraps
+ # 3. jumbles: 0: do not jumble; >=1 number of jumbles
+ # 4. seed for random number generator
+ # 5. 1 for PAM instead of JTT
+ my $use_pam = 1;
+ if ( $matrix == 0 ) {
+ $use_pam = 0;
+ }
+ &executeProml( $input, $bootstraps, $jumbles, $seed, $use_pam, $use_global_rearr );
+ print( "\n========== PHYLIP PROML end =========\n\n" );
+ &mv( "outtree", $OUTTREE_PROML );
+ &rm( "outfile" );
+ if ( $bootstraps > 1 ) {
+ &append( $OUTTREE_PROML, $OUTTREES_ALL );
+ $all_count++;
+ }
+}
+
+
+if ( $use_protpars == 1 ) {
+ my $input = "";
+ if ( $bootstraps > 1 ) {
+ $input = $SEQBOOT_OUTFILE;
+ }
+ else {
+ $input = "align";
+ }
+ print( "\n========== PHYLIP PROTPARS begin =========\n\n" );
+ &executeProtpars( $input, $bootstraps, $jumbles, $seed );
+ print( "\n========== PHYLIP PROTPARS end =========\n\n" );
+ &mv( "outtree", $OUTTREE_PROTPARS );
+ &rm( $outfile."_protpars_outfile" );
+ &mv( "outfile", $outfile."_protpars_outfile" );
+ if ( $bootstraps > 1 ) {
+ &append( $OUTTREE_PROTPARS, $OUTTREES_ALL );
+ $all_count++;
+ }
+}
+
+
+
+# Methods based on PWD
+# --------------------
+my $OUTTREE_FASTME = "outtree_fastme";
+my $OUTTREE_PHYLIP_NJ = "outtree_phylip_nj";
+my $OUTTREE_PHYLIP_FM = "outtree_phylip_fm";
+my $OUTTREE_PHYLIP_ME = "outtree_phylip_me";
+my $OUTTREE_BIONJ = "outtree_bionj";
+my $OUTTREE_WEIGHBOR = "outtree_weighbor";
+
+my $CONSENSUS_FASTME = "consensus_fastme";
+my $CONSENSUS_PHYLIP_NJ = "consensus_phylip_nj";
+my $CONSENSUS_PHYLIP_FM = "consensus_phylip_fm";
+my $CONSENSUS_PHYLIP_ME = "consensus_phylip_me";
+my $CONSENSUS_BIONJ = "consensus_bionj";
+my $CONSENSUS_WEIGHBOR = "consensus_weighbor";
+my $CONSENSUS_ALL = "consensus_all";
+
+
+if ( $use_fastme == 1 ) {
+ print( "\n========== FASTME begin =========\n\n" );
+ &executeFastme( $pwdfile, $bootstraps, $fastme_init_tree_opt );
+ print( "\n========== FASTME end ===========\n\n" );
+ &rm( "output.d" );
+ &mv( "output.tre", $OUTTREE_FASTME );
+ if ( $bootstraps > 1 ) {
+ &append( $OUTTREE_FASTME, $OUTTREES_ALL );
+ $all_count++;
+ }
+}
+if ( $use_phylip_nj ) {
+ print( "\n========== PHYLIP NEIGHBOR begin =========\n\n" );
+ &executeNeighbor( $pwdfile, $bootstraps, $seed, 0 );
+ print( "\n========== PHYLIP NEIGHBOR end =========\n\n" );
+ &mv( "outtree", $OUTTREE_PHYLIP_NJ );
+ &rm( "outfile" );
+ if ( $bootstraps > 1 ) {
+ &append( $OUTTREE_PHYLIP_NJ, $OUTTREES_ALL );
+ $all_count++;
+ }
+}
+if ( $use_phylip_fitch_fm ) {
+ print( "\n========== PHYLIP FITCH FM begin =========\n\n" );
+ &executeFitch( $pwdfile, $bootstraps, $seed, $jumbles, 0, "FM", $use_global_rearr );
+ print( "\n========== PHYLIP FITCH FM end =========\n\n" );
+ &mv( "outtree", $OUTTREE_PHYLIP_FM );
+ &rm( "outfile" );
+ if ( $bootstraps > 1 ) {
+ &append( $OUTTREE_PHYLIP_FM, $OUTTREES_ALL );
+ $all_count++;
+ }
+}
+if ( $use_phylip_fitch_me ) {
+ print( "\n========== PHYLIP FITCH ME begin =========\n\n" );
+ &executeFitch( $pwdfile, $bootstraps, $seed, $jumbles, 0, "ME", $use_global_rearr );
+ print( "\n========== PHYLIP FITCH ME end =========\n\n" );
+ &mv( "outtree", $OUTTREE_PHYLIP_ME );
+ &rm( "outfile" );
+ if ( $bootstraps > 1 ) {
+ &append( $OUTTREE_PHYLIP_ME, $OUTTREES_ALL );
+ $all_count++;
+ }
+}
+if ( $use_bionj ) {
+ print( "\n========== BIONJ begin =========\n\n" );
+ &executeBionj( $pwdfile, $OUTTREE_BIONJ );
+ print( "\n========== BIONJ end =========\n\n" );
+ if ( $bootstraps > 1 ) {
+ &append( $OUTTREE_BIONJ, $OUTTREES_ALL );
+ $all_count++;
+ }
+}
+if ( $use_weighbor ) {
+ print( "\n========== WEIGHBOR begin =========\n\n" );
+ &executeWeighbor( $number_of_aa, 14, $pwdfile, $OUTTREE_WEIGHBOR );
+ print( "\n========== WEIGHBOR end =========\n\n" );
+ if ( $bootstraps > 1 ) {
+ &append( $OUTTREE_WEIGHBOR, $OUTTREES_ALL );
+ $all_count++;
+ }
+}
+
+
+
+if ( $bootstraps > 1 ) {
+ # Consense:
+ if ( $use_fastme == 1 ) {
+ &consense( $OUTTREE_FASTME, $CONSENSUS_FASTME );
+ }
+ if ( $use_phylip_nj == 1 ) {
+ &consense( $OUTTREE_PHYLIP_NJ, $CONSENSUS_PHYLIP_NJ );
+ }
+ if ( $use_phylip_fitch_fm == 1 ) {
+ &consense( $OUTTREE_PHYLIP_FM, $CONSENSUS_PHYLIP_FM );
+ }
+ if ( $use_phylip_fitch_me == 1 ) {
+ &consense( $OUTTREE_PHYLIP_ME, $CONSENSUS_PHYLIP_ME );
+ }
+ if ( $use_bionj == 1 ) {
+ &consense( $OUTTREE_BIONJ, $CONSENSUS_BIONJ );
+ }
+ if ( $use_weighbor == 1 ) {
+ &consense( $OUTTREE_WEIGHBOR, $CONSENSUS_WEIGHBOR );
+ }
+ if ( $use_phyml == 1 ) {
+ &consense( $OUTTREE_PHYML, $CONSENSUS_PHYML );
+ }
+ if ( $use_proml == 1 ) {
+ &consense( $OUTTREE_PROML, $CONSENSUS_PROML );
+ }
+ if ( $use_protpars == 1 ) {
+ &consense( $OUTTREE_PROTPARS, $CONSENSUS_PROTPARS );
+ }
+ if ( $all_count > 1 ) {
+ &consense( $OUTTREES_ALL, $CONSENSUS_ALL );
+ }
+ else {
+ &rm( $OUTTREES_ALL );
+ }
+
+ my $INTREE_FOR_PUZZLE = "intree"; #why so serious?
+ &rm( $INTREE_FOR_PUZZLE );
+ system( "touch", $INTREE_FOR_PUZZLE )
+ && die("\n\n$0: could not \"touch $INTREE_FOR_PUZZLE\": $!\n\n");
+
+ if ( $use_fastme == 1 ) {
+ &append( $CONSENSUS_FASTME, $INTREE_FOR_PUZZLE );
+ }
+ if ( $use_phylip_nj == 1 ) {
+ &append( $CONSENSUS_PHYLIP_NJ, $INTREE_FOR_PUZZLE );
+ }
+ if ( $use_phylip_fitch_fm == 1 ) {
+ &append( $CONSENSUS_PHYLIP_FM, $INTREE_FOR_PUZZLE );
+ }
+ if ( $use_phylip_fitch_me == 1 ) {
+ &append( $CONSENSUS_PHYLIP_ME, $INTREE_FOR_PUZZLE );
+ }
+ if ( $use_bionj == 1 ) {
+ &append( $CONSENSUS_BIONJ, $INTREE_FOR_PUZZLE );
+ }
+ if ( $use_weighbor == 1 ) {
+ &append( $CONSENSUS_WEIGHBOR, $INTREE_FOR_PUZZLE );
+ }
+ if ( $use_raxml == 1 ) {
+ # Needed, because TREE-PUZZLE adds internal labels for all subsequent trees
+ # when evaluating given trees (this seems a strange behaviour).
+ removeSupportValues( $CONSENSUS_RAXML, $CONSENSUS_RAXML."_support_removed" );
+ &append( $CONSENSUS_RAXML."_support_removed", $INTREE_FOR_PUZZLE );
+ &rm( $CONSENSUS_RAXML."_support_removed" );
+ }
+ if ( $use_phyml == 1 ) {
+ &append( $CONSENSUS_PHYML, $INTREE_FOR_PUZZLE );
+ }
+ if ( $use_proml == 1 ) {
+ &append( $CONSENSUS_PROML, $INTREE_FOR_PUZZLE );
+ }
+ if ( $use_protpars == 1 ) {
+ &append( $CONSENSUS_PROTPARS, $INTREE_FOR_PUZZLE );
+ }
+ if ( $all_count > 1 ) {
+ &append( $CONSENSUS_ALL, $INTREE_FOR_PUZZLE );
+ }
+
+
+ # Puzzle for ML branch lenghts:
+ # The alignment is read from infile by default.
+ # The tree is read from intree by default.
+ &rm( "infile" );
+ &mv( "align", "infile" ); # align = original alignment in phylip interleaved.
+
+ &executePuzzleToCalculateBranchLenghts( $matrix,
+ $exact_parameter_est,
+ $rate_heterogeneity );
+
+ my $OUTTREE_PUZZLE = "outtree_puzzle";
+
+ &rm( $outfile."_puzzle_outfile" );
+
+ &mv( "outfile", $outfile."_puzzle_outfile" );
+ &mv( "outtree", $OUTTREE_PUZZLE );
+ &rm( "outdist" );
+ &rm( "intree" );
+
+
+ # Transfer
+ # --------
+ my $counter = 0;
+ if ( $use_fastme == 1 ) {
+ &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_FASTME, $fastme_outtree, $counter++ );
+ &rm( $CONSENSUS_FASTME );
+ }
+ if ( $use_phylip_nj == 1 ) {
+ &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_PHYLIP_NJ, $phylip_nj_outtree, $counter++ );
+ &rm( $CONSENSUS_PHYLIP_NJ );
+ }
+ if ( $use_phylip_fitch_fm == 1 ) {
+ &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_PHYLIP_FM, $phylip_fm_outtree, $counter++ );
+ &rm( $CONSENSUS_PHYLIP_FM );
+ }
+ if ( $use_phylip_fitch_me == 1 ) {
+ &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_PHYLIP_ME, $phylip_me_outtree, $counter++ );
+ &rm( $CONSENSUS_PHYLIP_ME );
+ }
+ if ( $use_bionj == 1 ) {
+ &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_BIONJ, $bionj_outtree, $counter++ );
+ &rm( $CONSENSUS_BIONJ );
+ }
+ if ( $use_weighbor == 1 ) {
+ &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_WEIGHBOR, $weighbor_outtree, $counter++ );
+ &rm( $CONSENSUS_WEIGHBOR );
+ }
+ if ( $use_raxml == 1 ) {
+ &to_phyloxml( $CONSENSUS_RAXML, $raxml_outtree, 1, 1 );
+ $counter++;
+ }
+ if ( $use_phyml == 1 ) {
+ &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_PHYML, $phyml_outtree, $counter++ );
+ &rm( $CONSENSUS_PHYML );
+ }
+ if ( $use_proml == 1 ) {
+ &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_PROML, $proml_outtree, $counter++ );
+ &rm( $CONSENSUS_PROML );
+ }
+ if ( $use_protpars == 1 ) {
+ &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_PROTPARS, $protpars_outtree, $counter++ );
+ &rm( $CONSENSUS_PROTPARS );
+ }
+ if ( $all_count > 1 ) {
+ &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_ALL, $all_outtree, $counter++ );
+ &rm( $CONSENSUS_ALL );
+ }
+
+ # Clean up
+ # --------
+ &rm( $OUTTREE_PUZZLE );
+ &rm( $SEQBOOT_OUTFILE );
+ if ( $keep_multiple_trees == 1 ) {
+ if ( $use_fastme == 1 ) {
+ &mv( $OUTTREE_FASTME, $multitreefile_fastme );
+ }
+ if ( $use_phylip_nj == 1 ) {
+ &mv( $OUTTREE_PHYLIP_NJ, $multitreefile_phylip_nj );
+ }
+ if ( $use_phylip_fitch_fm == 1 ) {
+ &mv( $OUTTREE_PHYLIP_FM, $multitreefile_phylip_fm );
+ }
+ if ( $use_phylip_fitch_me == 1 ) {
+ &mv( $OUTTREE_PHYLIP_ME, $multitreefile_phylip_me );
+ }
+ if ( $use_bionj == 1 ) {
+ &mv( $OUTTREE_BIONJ, $multitreefile_bionj );
+ }
+ if ( $use_weighbor == 1 ) {
+ &mv( $OUTTREE_WEIGHBOR, $multitreefile_weighbor );
+ }
+ if ( $use_phyml == 1 ) {
+ &mv( $OUTTREE_PHYML, $multitreefile_phyml );
+ }
+ if ( $use_proml == 1 ) {
+ &mv( $OUTTREE_PROML, $multitreefile_proml );
+ }
+ if ( $use_protpars == 1 ) {
+ &mv( $OUTTREE_PROTPARS, $multitreefile_protpars );
+ }
+ &mv( $pwdfile, $multipwdfile );
+ }
+ else {
+ if ( $use_fastme == 1 ) {
+ &rm( $OUTTREE_FASTME );
+ }
+ if ( $use_phylip_nj == 1 ) {
+ &rm( $OUTTREE_PHYLIP_NJ );
+ }
+ if ( $use_phylip_fitch_fm == 1 ) {
+ &rm( $OUTTREE_PHYLIP_FM );
+ }
+ if ( $use_phylip_fitch_me == 1 ) {
+ &rm( $OUTTREE_PHYLIP_ME );
+ }
+ if ( $use_bionj == 1 ) {
+ &rm( $OUTTREE_BIONJ );
+ }
+ if ( $use_weighbor == 1 ) {
+ &rm( $OUTTREE_WEIGHBOR );
+ }
+ if ( $use_phyml == 1 ) {
+ &rm( $OUTTREE_PHYML );
+ }
+ if ( $use_proml == 1 ) {
+ &rm( $OUTTREE_PROML );
+ }
+ if ( $use_protpars == 1 ) {
+ &rm( $OUTTREE_PROTPARS );
+ }
+ &rm( $pwdfile );
+ }
+ if ( $all_count > 1 ) {
+ &rm( $OUTTREES_ALL );
+ }
+} # if ( $bootstraps > 1 )
+else {
+ &rm( "infile.dist" );
+
+ &rm( "infile.puzzle" );
+ if ( $use_fastme == 1 ) {
+ &to_phyloxml( $OUTTREE_FASTME, $fastme_outtree, 0, 1 );
+ }
+ if ( $use_phylip_nj == 1 ) {
+ &to_phyloxml( $OUTTREE_PHYLIP_NJ, $phylip_nj_outtree, 0, 1);
+ }
+ if ( $use_phylip_fitch_fm == 1 ) {
+ &to_phyloxml( $OUTTREE_PHYLIP_FM, $phylip_fm_outtree, 0, 1 );
+ }
+ if ( $use_phylip_fitch_me == 1 ) {
+ &to_phyloxml( $OUTTREE_PHYLIP_ME, $phylip_me_outtree, 0, 1 );
+ }
+ if ( $use_bionj == 1 ) {
+ &to_phyloxml( $OUTTREE_BIONJ, $bionj_outtree, 0, 1 );
+ }
+ if ( $use_weighbor == 1 ) {
+ &to_phyloxml( $OUTTREE_WEIGHBOR, $weighbor_outtree, 0, 1 );
+ }
+ if ( $use_raxml == 1 ) {
+ &to_phyloxml( $OUTTREE_RAXML, $raxml_outtree, 0, 1 );
+ }
+ if ( $use_phyml == 1 ) {
+ &to_phyloxml( $OUTTREE_PHYML, $phyml_outtree, 0, 1 );
+ }
+ if ( $use_proml == 1 ) {
+ &to_phyloxml( $OUTTREE_PROML, $proml_outtree, 0, 1 );
+ }
+ if ( $use_protpars == 1 ) {
+ &to_phyloxml( $OUTTREE_PROTPARS, $protpars_outtree, 0, 1 );
+ }
+} # if ( $bootstraps > 1 )
+
+&rm( $infile );
+&rm( "infile" );
+&rm( "align" );
+&rm( "align.reduced" );
+
+
+$log = $log."Finish time/date : ".`date`;
+
+if ( $bootstraps > 1 ) {
+ $log = $log."Puzzle output file : ".$outfile."_puzzle_outfile\n";
+}
+$log = $log."Columns in alignment : $number_of_aa\n";
+$log = $log."Number of sequences in alignment : $number_of_seqs\n";
+if ( $all_count > 1 ) {
+ $log = $log."Combined consensus : $all_outtree\n";
+}
+
+
+if ( $bootstraps > 1 ) {
+ $log = $log."\n\n";
+ $log = $log."Simple support value statistics (trees are numbered the same as for TREE PUZZLE output)\n";
+ $log = $log."------------------------------- \n";
+ $log = $log."\n";
+}
+
+open( OUT, ">$logfile" ) || die "\n$0: Cannot create file <<$logfile>>: $!\n";
+print OUT $log;
+close( OUT );
+
+if ( $bootstraps > 1 ) {
+ # Simple support statistics
+ # -------------------------
+ my $SS_OUT = $temp_dir."/ss_out";
+ my @phylos = ();
+ my $ounter = 0;
+ if ( $use_fastme == 1 ) {
+ $phylos[ $ounter++ ] = $fastme_outtree;
+ }
+ if ( $use_phylip_nj == 1 ) {
+ $phylos[ $ounter++ ] = $phylip_nj_outtree;
+ }
+ if ( $use_phylip_fitch_fm == 1 ) {
+ $phylos[ $ounter++ ] = $phylip_fm_outtree;
+ }
+ if ( $use_phylip_fitch_me == 1 ) {
+ $phylos[ $ounter++ ] = $phylip_me_outtree;
+ }
+ if ( $use_bionj == 1 ) {
+ $phylos[ $ounter++ ] = $bionj_outtree;
+ }
+ if ( $use_weighbor == 1 ) {
+ $phylos[ $ounter++ ] = $weighbor_outtree;
+ }
+ if ( $use_raxml == 1 ) {
+ $phylos[ $ounter++ ] = $raxml_outtree;
+ }
+ if ( $use_phyml == 1 ) {
+ $phylos[ $ounter++ ] = $phyml_outtree;
+ }
+ if ( $use_proml == 1 ) {
+ $phylos[ $ounter++ ] = $proml_outtree;
+ }
+ if ( $use_protpars == 1 ) {
+ $phylos[ $ounter++ ] = $protpars_outtree;
+ }
+ if ( $all_count > 1) {
+ $phylos[ $ounter++ ] = $all_outtree;
+ }
+ &executeSupportStatistics( $SS_OUT, @phylos );
+ &append( $SS_OUT, $logfile );
+ &rm( $SS_OUT );
+
+ # Append parts of puzzle output file
+ # ----------------------------------
+ if ( $all_count > 1 ) {
+ &parsePuzzleOutfile( $outfile."_puzzle_outfile", $logfile );
+ }
+}
+
+chdir( $current_dir )
+|| die "\n\n$0: Could not chdir to <<$current_dir>>: $!\n\n";
+
+rmdir( $temp_dir )
+|| print "\n\n$0: Warning: Could not remove <<$temp_dir>>: $!\n\n";
+
+print "\n\n\n$0 successfully comleted.\n\n";
+
+exit( 0 );
+
+
+
+# Methods:
+# --------
+
+
+# Six arguments:
+# 1. DNA or Amino-Acids sequence filename (PHYLIP format)
+# 2. Model, eg. PROTGAMMAIVT
+# 3. Replicates (bootstrap)
+# 4. Seed for bootstrap
+# 5. Output suffix
+# 6. Algorithm (only for bootstrap, default otherwise)
+# NOTE. RaxML does its own bootstrapping.
+sub executeRaxml {
+ my $msa = $_[ 0 ];
+ my $model = $_[ 1 ];
+ my $replicates = $_[ 2 ];
+ my $seed = $_[ 3 ];
+ my $outfile_suffix = $_[ 4 ];
+ my $algo = $_[ 5 ];
+
+ &testForTextFilePresence( $msa );
+ my $command = "$RAXML -m $model -s $msa -n $outfile_suffix";
+
+ if ( $replicates > 1 ) {
+ $command = $command . " -x $seed -N $replicates";
+ if ( $algo ) {
+ $command = $command . " -f $algo";
+ }
+ }
+
+ print( "\n$command\n");
+
+ system( $command )
+ && &dieWithUnexpectedError( $command );
+
+}
+
+
+sub to_phyloxml {
+ my $from = $_[ 0 ];
+ my $to = $_[ 1 ];
+ my $internal_names_are_boots = $_[ 2 ];
+ my $extract_taxonomy = $_[ 3 ];
+ &dieIfFileExists( $to );
+ &dieIfFileNotExists( $from );
+ my $command = "$NEWICK_TO_PHYLOXML -f=nn $from $to";
+ if ( $internal_names_are_boots == 1 ) {
+ $command = $command . " -i";
+ }
+ if ( $extract_taxonomy == 1 ) {
+ $command = $command . " -xt";
+ }
+ system( $command )
+ && die "$0: Could not execute \"$command \"";
+ &rm( $from );
+}
+
+
+sub mv {
+ my $from = $_[ 0 ];
+ my $to = $_[ 1 ];
+ &dieIfFileExists( $to );
+ &dieIfFileNotExists( $from );
+ system( "mv", $from, $to )
+ && die "\n\n$0: could not move \"$from\" to \"$to\": $!\n\n";
+}
+
+sub cp {
+ my $from = $_[ 0 ];
+ my $to = $_[ 1 ];
+ &dieIfFileExists( $to );
+ &dieIfFileNotExists( $from );
+
+ system( "cp", $from, $to )
+ && die "\n\n$0: could not copy \"$from\" to \"$to\": $!\n\n";
+}
+
+sub rm {
+ my $f = $_[ 0 ];
+ unlink( $f );
+}
+
+sub consense {
+ my $multi_in = $_[ 0 ];
+ my $consense_out = $_[ 1 ];
+ &executeConsense( $multi_in );
+ &mv( "outtree", $consense_out );
+ &rm( "outfile" );
+
+}
+
+
+
+# 1. file to be appended
+# 2. file to append to
+sub append {
+ my $to_be_appended = $_[ 0 ];
+ my $append_to = $_[ 1 ];
+ &dieIfFileNotExists( $to_be_appended );
+ system( "cat $to_be_appended >> $append_to" )
+ && die "\n\n$0: could not execute \"cat $to_be_appended >> $append_to\": $!\n\n";
+
+}
+
+sub dieIfFileExists {
+ my $file = $_[ 0 ];
+ if ( -e $file ) {
+ die "\n\n$0: \"$file\" already exists\n\n";
+ }
+}
+
+sub dieIfFileNotExists {
+ my $file = $_[ 0 ];
+ unless ( ( -s $file ) && ( -f $file ) ) {
+ die( "\n\n$0: \"$file\" does not exist or is empty" );
+ }
+}
+
+
+
+
+# Two arguments:
+# 1. seed for random number generator
+# 2. number of bootstraps
+# Reads in "infile" by default.
+sub executeSeqboot {
+
+ my $s = $_[ 0 ];
+ my $bs = $_[ 1 ];
+ my $verb = "";
+
+ &testForTextFilePresence( $infile );
+
+
+ $verb = "
+2";
+
+ system( "$SEQBOOT << !
+r
+$bs$verb
+Y
+$s
+!" )
+ && die "$0: Could not execute \"$SEQBOOT\"";
+
+}
+
+
+
+# One/two/three argument(s):
+# Reads in tree from "intree" by default. (Presence of "intree" automatically
+# switches into "User defined trees" mode.)
+# 1. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
+# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise
+# 2. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise
+# 3. Model of rate heterogeneity:
+# 1 for "8 Gamma distributed rates"
+# 2 for "Two rates (1 invariable + 1 variable)"
+# 3 for "Mixed (1 invariable + 8 Gamma rates)"
+# otherwise: Uniform rate
+# Last modified: 09/08/03 (added 2nd and 3rd parameter)
+sub executePuzzleToCalculateBranchLenghts {
+ my $matrix_option = $_[ 0 ];
+ my $parameter_estimates_option = $_[ 1 ];
+ my $rate_heterogeneity_option = $_[ 2 ];
+ my $i = 0;
+ my $mat = "";
+ my $est = "";
+ my $rate = "";
+
+ unless ( ( -s "infile" ) && ( -f "infile" ) && ( -T "infile" ) ) {
+ die "\n$0: executePuzzleToCalculateBranchLenghts: <<infile>> does not exist, is empty, or is not a plain textfile.\n";
+ }
+ unless ( ( -s "intree" ) && ( -f "intree" ) && ( -T "intree" ) ) {
+ die "\n$0: executePuzzleToCalculateBranchLenghts: <<intree>> does not exist, is empty, or is not a plain textfile.\n";
+ }
+
+ $mat = setModelForPuzzle( $matrix_option );
+ if ( $parameter_estimates_option ) {
+ $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option );
+ }
+ if ( $rate_heterogeneity_option ) {
+ $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option );
+ }
+
+
+ system( "$PUZZLE << !
+$mat$est$rate
+x
+y
+!" )
+ && die "$0: Could not execute \"$PUZZLE\" (mat=$mat est=$est rate=$rate)";
+
+}
+
+# Two arguments:
+# 1. puzzle outfile
+# 2. file to append to
+sub parsePuzzleOutfile {
+ my $puzzle_outfile = $_[ 0 ];
+ my $file_to_append_to = $_[ 1 ];
+ &testForTextFilePresence( $puzzle_outfile );
+ open( OUT, ">>$file_to_append_to" ) || &dieWithUnexpectedError( "Cannot open \"$file_to_append_to\"" );
+ open( IN, "$puzzle_outfile" ) || &dieWithUnexpectedError( "Cannot open file \"$puzzle_outfile\"" );
+ my $return_line;
+ my $read = 0;
+ print OUT "\nTREE PUZZLE output\n";
+ print OUT "------------------\n";
+ while ( $return_line = <IN> ) {
+ if ( $return_line =~/COMPARISON OF USER TREES/ ) {
+ $read = 1;
+ }
+ elsif( $return_line =~/TIME STAMP/ ) {
+ $read = 0;
+ }
+ elsif( $read ) {
+ print OUT $return_line;
+ }
+ }
+ close( IN );
+ close( OUT );
+}
+
+# Three/four arguments:
+# 1. Name of file containing tree with correct branch lengths
+# 2. Name of file containing tree with correct bootstraps
+# 3. Outputfilename
+# 4. Index of tree with correct branch lengths, in case more than one in file
+# Last modified: 2007.11.27
+sub executeSupportTransfer {
+ my $tree_with_bl = $_[ 0 ];
+ my $tree_with_bs = $_[ 1 ];
+ my $out = $_[ 2 ];
+ my $index = $_[ 3 ];
+
+ &testForTextFilePresence( $tree_with_bl );
+ &testForTextFilePresence( $tree_with_bs );
+ my $command = "$SUPPORT_TRANSFER $tree_with_bl $tree_with_bs $out $index";
+ system( $command )
+ && die "$0: Could not execute \"$command\"";
+}
+
+
+# Two or more arguments:
+# 1. outfile
+# 2. phylogeny 1 with support values
+# 3. phylogeny 2 with support values
+# 4. ...
+sub executeSupportStatistics {
+ my $outfile = $_[ 0 ];
+ &dieIfFileExists( $outfile );
+ my $phylos = "";
+ for( my $i = 1; $i < scalar(@_); ++$i ) {
+ &testForTextFilePresence( $_[ $i ] );
+ $phylos .= $_[ $i ]." ";
+ }
+ my $command = "$SUPPORT_STATISTICS -o=$outfile $phylos";
+ system( "$command" )
+ && die "$0: Could not execute \"$command\"";
+}
+
+
+sub getNumberOfSeqsAndAas {
+ my $infile = $_[ 0 ];
+ my $seqs = 0;
+ my $aa = 0;
+ open( IN, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n";
+ while( <IN> ) {
+ if ( $_ =~ /^\s*(\d+)\s+(\d+)\s*$/ ) {
+ $seqs = $1;
+ $aa = $2;
+ }
+ }
+ close( IN );
+
+ if ( $seqs == 0 || $aa == 0 ) {
+ die( "\n$0: Could not get number of seqs and aa from: $infile" );
+ }
+ return $seqs, $aa;
+}
+
+
+
+sub removeSupportValues {
+ my $infile = $_[ 0 ];
+ my $outfile = $_[ 1 ];
+ &testForTextFilePresence( $infile );
+ open( OUT, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" );
+ open( IN, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
+ while ( my $line = <IN> ) {
+ $line =~ s/\)\d+\.?\d*:/\):/g;
+ print OUT "$line";
+ }
+ close( OUT );
+ close( IN );
+}
+
+
+
+
+# Six arguments:
+# 1. name of alignment file (in correct format!)
+# 2. number of bootstraps
+# 3. jumbles: 0: do not jumble; >=1 number of jumbles
+# 4. seed for random number generator
+# 5. 1 for PAM instead of JTT
+# 6. 1 to use globale rearragements
+sub executeProml {
+ my $align = $_[ 0 ];
+ my $bs = $_[ 1 ];
+ my $rand = $_[ 2 ];
+ my $s = $_[ 3 ];
+ my $use_pam = $_[ 4 ];
+ my $use_global_rearr = $_[ 5 ];
+ my $jumble = "";
+ my $multi = "";
+ my $pam = "";
+
+ &testForTextFilePresence( $align );
+
+ if ( $bs > 1 && $rand < 1 ) {
+ $rand = 1;
+ }
+
+ if ( $rand >= 1 ) {
+ $jumble = "
+J
+$s
+$rand";
+ }
+
+ if ( $bs > 1 ) {
+ $multi = "
+M
+D
+$bs";
+ }
+
+
+ if ( $use_pam == 1 ) {
+ $pam = "
+P
+P";
+ }
+
+ my $global = "";
+ if ( $use_global_rearr == 1 ) {
+ $global = "
+G";
+ }
+
+ system( "$PROML 2>&1 << !
+$align$jumble$multi$pam$global
+3
+Y
+!" )
+ && &dieWithUnexpectedError( "Could not execute \"$PROML $align$jumble$multi$pam$global\"" );
+ # 3: Do NOT print out tree
+
+ return;
+
+} ## executeProml
+
+
+sub printUsage {
+
+ print <<END;
+
+Copyright (C) 2002-2007 Christian M. Zmasek
+All rights reserved
+
+Author: Christian M. Zmasek
+phylosoft\@gmail.com
+http://www.phylosoft.org
+
+ Requirements phylo_pl is part of the FORESTER collection of programs.
+ ------------ Many of its global variables are set via forester.pm.
+
+ Note. Use xt.pl (for Pfam alignments) or mt.pl (for other alignments)
+ to run phylo_pl.pl on whole directories of alignments files.
+
+ Usage
+ -----
+
+ phylo_pl.pl [-options] <input alignment in SELEX (Pfam), PHYLIP
+ sequential format, or Clustal W output> <outputfile>
+ [path/name for temporary directory to be created]
+
+ Example:
+ "% phylo_pl.pl -B100q\@1nbS9X IL5.aln IL5_tree"
+
+ Options
+ -------
+
+ Bx : Number of bootstraps. B0: do not bootstrap. Default is 100 bootstrapps.
+ The number of bootstrapps should be divisible by 10.
+ J : Use JTT matrix (Jones et al. 1992) in TREE-PUZZLE and/or PHYML, RAXML, default: VT (Mueller-Vingron 2000).
+ L : Use BLOSUM 62 matrix (Henikoff-Henikoff 92) in TREE-PUZZLE and/or PHYML, RAXML, default: VT.
+ M : Use mtREV24 matrix (Adachi-Hasegawa 1996) in TREE-PUZZLE and/or PHYML, default: VT.
+ W : Use WAG matrix (Whelan-Goldman 2000) in TREE-PUZZLE and/or PHYML, RAXML, default: VT.
+ P : Use PAM matrix (Dayhoff et al. 1978) in TREE-PUZZLE and/or PHYML, RAXML, default: VT.
+ D : Use DCMut matrix (Kosial and Goldman, 2005) in PHYML, RAXML, VT in TREE-PUZZLE.
+ A : Let TREE-PUZZLE choose which matrix to use, default: VT
+ E : Exact parameter estimates in TREE-PUZZLE, default: Approximate.
+ Model of rate heterogeneity in TREE-PUZZLE (default: Uniform rate):
+ g : 8 Gamma distributed rates
+ t : Two rates (1 invariable + 1 variable)
+ m : Mixed (1 invariable + 8 Gamma rates)
+ q\@x: Use FastME, x: 1: GME
+ 2: BME
+ 3: NJ
+ n : Use PHYLIP Neighbor (NJ).
+ f : Use PHYLIP Fitch.
+ e : Use PHYLIP Minimal Evolution.
+ b : Use BIONJ.
+ w : Use Weighbor.
+ x : Use RAxML.
+ y : Use PHYML.
+ o : Use PHYLIP proml.
+ p : Use PHYLIP protpars.
+ rx : Number of relative substitution rate categories in PHYML (default is 4).
+ jx : Number of jumbles (input order randomization) for PHYLIP FM, ME, PROTPARS, and PROML (default is 2) (random seed set with Sx).
+ I : Estimate proportion of invariable sites in RAXML and/or PHYML (otherwise, proportion "0.0" is used in PHYML)
+ G : to turn on global rearrangements in PHYLIP FM, ME, and PROML
+ Sx : Seed for random number generator(s). Must be 4n+1. Default is 9.
+ X : To keep multiple tree file (=trees from bootstrap resampled alignments) and
+ pairwise distance matrix file (in case of bootstrap analysis).
+
+END
+
+} ## printUsage