# $Id: forester.pm,v 1.26 2010/12/13 19:00:22 cmzmasek Exp $ # # FORESTER -- software libraries and applications # for evolutionary biology research and applications. # # Copyright (C) 2007-2009 Christian M. Zmasek # Copyright (C) 2007-2009 Burnham Institute for Medical Research # All rights reserved # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA # # Contact: phylosoft @ gmail . com # WWW: www.phylosoft.org/forester # # # package forester; use strict; require Exporter; our $VERSION = 1.000; our @ISA = qw( Exporter ); our @EXPORT = qw( executeConsense executePhyloPl executePuzzleDQO executePuzzleDQObootstrapped pfam2phylipMatchOnly startsWithSWISS_PROTname isPfamSequenceLine isPfamCommentLine containsPfamNamedSequence isRFline executeProtpars setModelForPuzzle setRateHeterogeneityOptionForPuzzle setParameterEstimatesOptionForPuzzle executePuzzleBootstrapped executePuzzle executeFastme executeNeighbor executeFitch executeBionj executeWeighbor executePhyml executeHmmfetch addDistsToQueryToPWDfile testForTextFilePresence exitWithWarning dieWithUnexpectedError addSlashAtEndIfNotPresent $LENGTH_OF_NAME $MIN_NUMBER_OF_AA $TREMBL_ACDEOS_FILE $SWISSPROT_ACDEOS_FILE $SPECIES_NAMES_FILE $SPECIES_TREE_FILE_DEFAULT $MULTIPLE_TREES_FILE_SUFFIX $LOG_FILE_SUFFIX $ALIGN_FILE_SUFFIX $TREE_FILE_SUFFIX $ADDITION_FOR_RIO_ANNOT_TREE $SUFFIX_PWD $SUFFIX_BOOT_STRP_POS $MULTIPLE_PWD_FILE_SUFFIX $SUFFIX_PWD_NOT_BOOTS $SUFFIX_HMM $MATRIX_FOR_PWD $RIO_PWD_DIRECTORY $RIO_BSP_DIRECTORY $RIO_NBD_DIRECTORY $RIO_ALN_DIRECTORY $RIO_HMM_DIRECTORY $PFAM_FULL_DIRECTORY $PFAM_SEED_DIRECTORY $PRIOR_FILE_DIR $PFAM_HMM_DB $FORESTER_JAR $SEQBOOT $NEIGHBOR $PROTPARS $CONSENSE $PROML $PHYLIP_VERSION $PUZZLE $PUZZLE_VERSION $FASTME $FASTME_VERSION $BIONJ $BIONJ_VERSION $WEIGHBOR $WEIGHBOR_VERSION $RAXML $RAXML_VERSION $PHYML $PHYML_VERSION $HMMALIGN $HMMSEARCH $HMMBUILD $HMMFETCH $SFE $HMMCALIBRATE $P7EXTRACT $MULTIFETCH $BOOTSTRAP_CZ $BOOTSTRAP_CZ_PL $SUPPORT_TRANSFER $SUPPORT_STATISTICS $NEWICK_TO_PHYLOXML $PHYLO_PL $RIO_PL $DORIO $PUZZLE_DQO $BOOTSTRAPS $PATH_TO_FORESTER $JAVA $NODE_LIST $RIO_SLAVE_DRIVER $RIO_SLAVE $TEMP_DIR_DEFAULT $EXPASY_SPROT_SEARCH_DE $EXPASY_SPROT_SEARCH_AC ); # ============================================================================= # ============================================================================= # # THESE VARIABLES ARE ENVIRONMENT DEPENDENT, AND NEED TO BE SET ACCORDINGLY # BY THE USER # ------------------------------------------------------------------------- # # For using just "phylo_pl.pl", only the following variables need to be set # $JAVA # $FORESTER_JAR # $TEMP_DIR_DEFAULT # $SEQBOOT # $CONSENSE # $PUZZLE # $FASTME # $NEIGHBOR # $FITCH # $BIONJ # $WEIGHBOR # $PHYML # $PROTPARS # Software directory: # --------------------- our $SOFTWARE_DIR = "/home/czmasek/SOFTWARE/"; # Java virtual machine: # --------------------- our $JAVA = $SOFTWARE_DIR."JAVA/jdk1.6.0_03/bin/java"; # Where all the temporary files can be created: # --------------------------------------------- our $TEMP_DIR_DEFAULT = "/tmp/"; # Programs from Joe Felsenstein's PHYLIP package: # ----------------------------------------------- our $SEQBOOT = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/seqboot"; our $NEIGHBOR = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/neighbor"; our $PROTPARS = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/protpars"; our $PROML = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/proml"; our $FITCH = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/fitch"; our $CONSENSE = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/consense"; our $PHYLIP_VERSION = "3.68"; # TREE-PUZZLE: # ------------ our $PUZZLE = $SOFTWARE_DIR."TREE_PUZZLE/tree-puzzle-5.2/src/puzzle"; our $PUZZLE_VERSION = "5.2"; # FASTME: # ----------------------------------------------------- our $FASTME = $SOFTWARE_DIR."FASTME/fastme2.0/fastme"; our $FASTME_VERSION = "2.0"; # BIONJ: # ----------------------------------------------------- our $BIONJ = $SOFTWARE_DIR."BIONJ/bionj"; our $BIONJ_VERSION = "[1997]"; # WEIGHBOR: # ----------------------------------------------------- our $WEIGHBOR = $SOFTWARE_DIR."WEIGHBOR/Weighbor/weighbor"; our $WEIGHBOR_VERSION = "1.2.1"; # PHYML: # ----------------------------------------------------- our $PHYML = $SOFTWARE_DIR."PHYML/phyml_v2.4.4/exe/phyml_linux"; our $PHYML_VERSION = "2.4.4"; # RAXML: # ----------------------------------------------------- our $RAXML = $SOFTWARE_DIR."RAXML/RAxML-7.0.4/raxmlHPC"; our $RAXML_VERSION = "7.0.4"; # forester.jar. This jar file is currently available at: http://www.phylosoft.org # ------------------------------------------------------------------------------- our $FORESTER_JAR = $SOFTWARE_DIR."FORESTER/DEV/forester/forester/java/forester.jar"; # End of variables which need to be set by the user for using "phylo_pl.pl". # Tool from forester.jar to transfer support values: # ------------------------------------------------- our $SUPPORT_TRANSFER = $JAVA." -cp $FORESTER_JAR org.forester.application.support_transfer"; # Tool from forester.jar for simple statistics for support values: # ---------------------------------------------------------------- our $SUPPORT_STATISTICS = $JAVA." -cp $FORESTER_JAR org.forester.application.support_statistics"; # Tool from forester.jar to transfer nh to phyloXML: # ------------------------------------------------- our $NEWICK_TO_PHYLOXML = $JAVA." -cp $FORESTER_JAR org.forester.application.phyloxml_converter"; # FORESTER itself (currently not needed for "phylo_pl.pl"): # --------------------------------------------------------- our $PATH_TO_FORESTER = ""; # Pfam data (not needed for phylo_pl.pl): # -------------------------------------- our $PFAM_FULL_DIRECTORY = "/path/to/Pfam/Full/"; our $PFAM_SEED_DIRECTORY = "/path/to/Pfam/Seed/"; our $PFAM_HMM_DB = "/path/to/Pfam/Pfam_ls"; # Need to run "hmmindex" on this # to produce .ssi file. # Then, for example # "setenv HMMERDB /home/rio/pfam-6.6/" $PATH_TO_FORESTER = &addSlashAtEndIfNotPresent( $PATH_TO_FORESTER ); # Description lines and species from SWISS-PROT and TrEMBL (not needed for phylo_pl.pl): # ------------------------------------------------------------------------------------- our $TREMBL_ACDEOS_FILE = $PATH_TO_FORESTER."data/trembl22_ACDEOS_1-6"; our $SWISSPROT_ACDEOS_FILE = $PATH_TO_FORESTER."data/sp40_ACDEOS_1-6"; # Names of species which can be analyzed and analyzed # against (must also be in tree $SPECIES_TREE_FILE_DEFAULT). # By using a list with less species, RIO analyses become faster # but lose phylogenetic resolution. # For many purposes, list "tree_of_life_bin_1-6_species_list" # in "data/species/" might be sufficient: # (not needed for phylo_pl.pl) # -------------------------------------------------------------- our $SPECIES_NAMES_FILE = $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-6_species_list"; # A default species tree in NHX format. # For many purposes, tree "tree_of_life_bin_1-6.nhx" # in "data/species/" might be fine: # (not needed for phylo_pl.pl) # -------------------------------------------------- our $SPECIES_TREE_FILE_DEFAULT = $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-6.nhx"; # Data for using precalculated distances: # (not needed for phylo_pl.pl) # --------------------------------------- our $MATRIX_FOR_PWD = 2; # The matrix which has been used for the pwd in $RIO_PWD_DIRECTORY. # 0=JTT, 1=PAM, 2=BLOSUM 62, 3=mtREV24, 5=VT, 6=WAG. our $RIO_PWD_DIRECTORY = $PATH_TO_FORESTER."example_data/"; # all must end with "/" our $RIO_BSP_DIRECTORY = $PATH_TO_FORESTER."example_data/"; our $RIO_NBD_DIRECTORY = $PATH_TO_FORESTER."example_data/"; our $RIO_ALN_DIRECTORY = $PATH_TO_FORESTER."example_data/"; our $RIO_HMM_DIRECTORY = $PATH_TO_FORESTER."example_data/"; # # End of variables which need to be set by the user. # # ============================================================================= # ============================================================================= $TEMP_DIR_DEFAULT = &addSlashAtEndIfNotPresent( $TEMP_DIR_DEFAULT ); $PFAM_FULL_DIRECTORY = &addSlashAtEndIfNotPresent( $PFAM_FULL_DIRECTORY ); $PFAM_SEED_DIRECTORY = &addSlashAtEndIfNotPresent( $PFAM_SEED_DIRECTORY ); # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # These variables should normally not be changed: # our $PRIOR_FILE_DIR = $PATH_TO_FORESTER."data/priors_for_hmmbuild/"; # Directory containing dirichlet prior # files needed for certain aligments # by hmmbuild (e.g. Collagen). # TREE-PUZZLE: our $PUZZLE_DQO = $PATH_TO_FORESTER."puzzle_dqo/src/puzzle"; # HMMER: our $HMMALIGN = $PATH_TO_FORESTER."hmmer/binaries/hmmalign"; our $HMMSEARCH = $PATH_TO_FORESTER."hmmer/binaries/hmmsearch"; our $HMMBUILD = $PATH_TO_FORESTER."hmmer/binaries/hmmbuild"; our $HMMFETCH = $PATH_TO_FORESTER."hmmer/binaries/hmmfetch"; our $SFE = $PATH_TO_FORESTER."hmmer/binaries/sfetch"; our $HMMCALIBRATE = $PATH_TO_FORESTER."hmmer/binaries/hmmcalibrate"; our $P7EXTRACT = $PATH_TO_FORESTER."perl/p7extract.pl"; our $MULTIFETCH = $PATH_TO_FORESTER."perl/multifetch.pl"; # RIO/FORESTER: our $BOOTSTRAP_CZ = $PATH_TO_FORESTER."C/bootstrap_cz"; our $BOOTSTRAP_CZ_PL = $PATH_TO_FORESTER."perl/bootstrap_cz.pl"; #our $SUPPORT_TRANSFER = $JAVA." -cp $PATH_TO_FORESTER"."java forester.tools.transfersBranchLenghts"; #our $SUPPORT_TRANSFER = $JAVA." -cp /home/czmasek/SOFTWARE/FORESTER/forester3/forester.jar org.forester.tools.SupportTransfer"; our $PHYLO_PL = $PATH_TO_FORESTER."perl/phylo_pl.pl"; our $RIO_PL = $PATH_TO_FORESTER."perl/rio.pl"; our $DORIO = $JAVA." -cp $PATH_TO_FORESTER"."java forester.tools.DoRIO"; # parallel RIO: our $RIO_SLAVE_DRIVER = $PATH_TO_FORESTER."perl/rio_slave_driver.pl"; our $RIO_SLAVE = $PATH_TO_FORESTER."perl/rio_slave.pl"; our $NODE_LIST = $PATH_TO_FORESTER."data/node_list.dat"; # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ our $BOOTSTRAPS = 100; our $MIN_NUMBER_OF_AA = 20; # After removal of gaps, if less, gaps are not removed. our $LENGTH_OF_NAME = 10; our $MULTIPLE_TREES_FILE_SUFFIX = ".mlt"; our $LOG_FILE_SUFFIX = ".log"; our $ALIGN_FILE_SUFFIX = ".aln"; our $TREE_FILE_SUFFIX = ".nhx"; our $ADDITION_FOR_RIO_ANNOT_TREE = ".rio"; our $SUFFIX_PWD = ".pwd"; our $MULTIPLE_PWD_FILE_SUFFIX = ".mpwd"; our $SUFFIX_BOOT_STRP_POS = ".bsp"; our $SUFFIX_PWD_NOT_BOOTS = ".nbd"; our $SUFFIX_HMM = ".hmm"; our $EXPASY_SPROT_SEARCH_DE = "http://www.expasy.org/cgi-bin/sprot-search-de?"; our $EXPASY_SPROT_SEARCH_AC = "http://www.expasy.org/cgi-bin/sprot-search-ac?"; # One argument: input multiple trees file # Last modified: 07/05/01 sub executeConsense { my $in = $_[ 0 ]; &testForTextFilePresence( $in ); system( "$CONSENSE >/dev/null 2>&1 << ! $in Y !" ) && &dieWithUnexpectedError( "Could not execute \"$CONSENSE $in\"" ); return; } # Four arguments: # 1. options ("-" is not necessary) # 2. alignment or pwd file # 3. outfile # 4. temp dir # Last modified: 07/05/01 sub executePhyloPl { my $opts = $_[ 0 ]; my $B = $_[ 1 ]; my $C = $_[ 2 ]; my $D = $_[ 3 ]; &testForTextFilePresence( $B ); $opts = "-".$opts; system( "$PHYLO_PL $opts $B $C $D" ) && &dieWithUnexpectedError( "Could not execute \"$PHYLO_PL $opts $B $C $D\"" ); } ## executePhyloPl # Two arguments: # 1. Name of inputfile # 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; # 5 = VT; 6 = WAG; 7 = auto; PAM otherwise sub executePuzzleDQO { my $in = $_[ 0 ]; my $matrix_option = $_[ 1 ]; my $mat = ""; &testForTextFilePresence( $in ); $mat = setModelForPuzzle( $matrix_option ); system( "$PUZZLE_DQO $in >/dev/null 2>&1 << !$mat y !" ) && &dieWithUnexpectedError( "Could not execute \"$PUZZLE_DQO\"" ); return; } ## executePuzzleDQO # Two arguments: # 1. Name of inputfile # 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; # 5 = VT; 6 = WAG; 7 = auto; PAM otherwise # Last modified: 01/28/02 sub executePuzzleDQObootstrapped { my $in = $_[ 0 ]; my $matrix_option = $_[ 1 ]; my $l = 0; my $slen = 0; my $counter = 0; my $mat = ""; my $a = ""; my @a = (); &testForTextFilePresence( $in ); open( GRP, "<$in" ) || &dieWithUnexpectedError( "Cannot open file \"$in\"" ); while( ) { if ( $_ =~ /^\s*\d+\s+\d+\s*$/ ) { $counter++; } } close( GRP ); $l = `cat $in | wc -l`; $slen = $l / $counter; system( "split -$slen $in $in.splt." ) && &dieWithUnexpectedError( "Could not execute \"split -$slen $in $in.splt.\"" ); @a = <$in.splt.*>; $mat = setModelForPuzzle( $matrix_option ); foreach $a ( @a ) { system( "$PUZZLE_DQO $a >/dev/null 2>&1 << !$mat y !" ) && &dieWithUnexpectedError( "Could not execute \"$PUZZLE_DQO $a\"" ); system( "cat $a.dist >> $in.dist" ) && &dieWithUnexpectedError( "Could not execute \"cat outdist >> $in.dist\"" ); unlink( $a, $a.".dist" ); } return; } ## executePuzzleDQObootstrapped # Transfers a Pfam (SELEX) alignment to a # PHYLIP sequential style alignment. # It only writes "match columns" as indicated by the # "# RF" line ('x' means match). # # Three arguments: # 1. infile name # 2. outfile name # 3. 1 to NOT ensure that match states contain only 'A'-'Z' or '-' # # Returns the number of match states (=length of output alignment), # the length of the input alignment, # the number of seqs in the input alignment # # Last modified: 07/07/01 # sub pfam2phylipMatchOnly { my $infile = $_[ 0 ]; my $outfile = $_[ 1 ]; my $ne = $_[ 2 ]; my @seq_name = (); my @seq_array = (); my $return_line = ""; my $seq = ""; my $x = 0; my $y = 0; my $i = 0; my $x_offset = 0; my $max_x = 0; my $rf_y = 0; my $number_colum = 0; my $not_ensure = 0; my $saw_rf_line = 0; if ( $ne && $ne == 1 ) { $not_ensure = 1; } &testForTextFilePresence( $infile ); open( INPP, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" ); # This reads in the first block. It reads in the seq names. while ( 1 ) { if ( &isPfamSequenceLine( $return_line ) ) { $return_line =~ /^(\S+)\s+(\S+)/; $seq_name[ $y ] = substr( $1, 0, $LENGTH_OF_NAME ); $seq = $2; for ( $x = 0; $x < length( $seq ); $x++ ) { $seq_array[ $x ][ $y ] = substr( $seq, $x, 1 ); } $y++; } elsif ( &isRFline( $return_line ) ) { $saw_rf_line = 1; $return_line =~ /\s+(\S+)\s*$/; $seq = $1; $x_offset = length( $seq ); $rf_y = $y; for ( $x = 0; $x < $x_offset; $x++ ) { $seq_array[ $x ][ $rf_y ] = substr( $seq, $x, 1 ); } last; } $return_line = ; if ( !$return_line ) { &dieWithUnexpectedError( "Alignment not in expected format (no RF line)" ); } } if ( $saw_rf_line != 1 ) { &dieWithUnexpectedError( "Alignment not in expected format (no RF line)" ); } $y = 0; $max_x = 0; # This reads all blocks after the 1st one. while ( $return_line = ) { if ( &isPfamSequenceLine( $return_line ) ) { $return_line =~ /^\S+\s+(\S+)/; $seq = $1; for ( $x = 0; $x < length( $seq ); $x++ ) { $seq_array[ $x + $x_offset ][ $y % $rf_y ] = substr( $seq, $x, 1 ); } $y++; } elsif ( &isRFline( $return_line ) ) { if ( $y != $rf_y ) { &dieWithUnexpectedError( "Alignment not in expected format" ); } $return_line =~ /\s+(\S+)\s*$/; $seq = $1; $max_x = length( $seq ); for ( $x = 0; $x < length( $seq ); $x++ ) { $seq_array[ $x + $x_offset ][ $rf_y ] = substr( $seq, $x, 1 ); } $y = 0; $x_offset = $x_offset + $max_x; $max_x = 0; } } close( INPP ); # Counts the match states, and hence the number of aa in the alignment: for ( $x = 0; $x < $x_offset; $x++ ) { if ( !$seq_array[ $x ][ $rf_y ] ) { &dieWithUnexpectedError( "Alignment not in expected format" ); } if ( $seq_array[ $x ][ $rf_y ] eq 'x' ) { $number_colum++; } } # Writes the file: open( OUTPP, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" ); print OUTPP "$rf_y $number_colum\n"; for ( $y = 0; $y < $rf_y; $y++ ) { print OUTPP "$seq_name[ $y ]"; for ( $i = 0; $i < ( $LENGTH_OF_NAME - length( $seq_name[ $y ] ) ); $i++ ) { print OUTPP " "; } for ( $x = 0; $x < $x_offset; $x++ ) { if ( $seq_array[ $x ][ $rf_y ] eq 'x' ) { if ( !$seq_array[ $x ][ $y ] ) { &dieWithUnexpectedError( "Alignment not in expected format" ); } if ( $not_ensure != 1 && $seq_array[ $x ][ $y ] !~ /[A-Z]|-/ ) { &dieWithUnexpectedError( "Alignment not in expected format (match states must only contain 'A'-'Z' or '-')" ); } print OUTPP "$seq_array[ $x ][ $y ]"; } } print OUTPP "\n"; } close( OUTPP ); return $number_colum, $x_offset, $rf_y; } ## pfam2phylipMatchOnly # Returns whether the argument (a String) # starts with a SWISS-PROT name (SEQN_SPECI). # Last modified: 06/21/01 sub startsWithSWISS_PROTname { return ( $_[ 0 ] =~ /^[A-Z0-9]{1,4}_[A-Z0-9]{1,5}/ ); } # Returns whether the argument starts with XXX.. XXXXX.. and the first # character is not a "#". # Last modified: 06/21/01 sub isPfamSequenceLine { return( !&isPfamCommentLine( $_[ 0 ] ) && &containsPfamNamedSequence( $_[ 0 ] ) ); } # Returns whether the argument does start with a "#". # Last modified: 06/21/01 sub isPfamCommentLine { return ( $_[ 0 ] =~ /^#/ ); } # Returns whether the argument starts with XXX XXXXX. # Last modified: 06/21/01 sub containsPfamNamedSequence { return ( $_[ 0 ] =~ /^\S+\s+\S+/ ); } # Returns whether the argument starts with XXX XXXXX. # Last modified: 06/21/01 sub isRFline { return ( $_[ 0 ] =~ /^#.*RF/ ); } # Three arguments: # 1. pairwise distance file # 2. number of bootstraps # 3. initial tree: BME, GME or NJ # Last modified: 2008/12/31 sub executeFastme { my $inpwd = $_[ 0 ]; my $bs = $_[ 1 ]; my $init_opt = $_[ 2 ]; &testForTextFilePresence( $inpwd ); my $command = ""; if ( $bs > 0 ) { $command = "$FASTME -b $init_opt -i $inpwd -n $bs -s b"; } else { $command = "$FASTME -b $init_opt -i $inpwd -s b"; } print $command; system( $command ); } ## executeFastme # Four arguments: # 1. pairwise distance file # 2. number of bootstraps # 3. seed for random number generator # 4. lower-triangular data matrix? 1: yes; no, otherwise sub executeNeighbor { my $inpwd = $_[ 0 ]; my $bs = $_[ 1 ]; my $s = $_[ 2 ]; my $l = $_[ 3 ]; my $multi = ""; my $lower = ""; &testForTextFilePresence( $inpwd ); if ( $bs >= 2 ) { $multi = " M $bs $s"; } if ( $l == 1 ) { $lower = " L"; } system( "$NEIGHBOR >/dev/null 2>&1 << ! $inpwd$multi$lower 2 3 Y !" ) && &dieWithUnexpectedError( "Could not execute \"$NEIGHBOR $inpwd$multi$lower\"" ); } ## executeNeighbor # Seven arguments: # 1. pairwise distance file # 2. number of bootstraps # 3. seed for random number generator # 4. number of jumbles for input order # 5. lower-triangular data matrix? 1: yes; no, otherwise # 6. FM for Fitch-Margoliash, ME for ME# 6. # 7. 1 to use globale rearragements sub executeFitch { my $inpwd = $_[ 0 ]; my $bs = $_[ 1 ]; my $s = $_[ 2 ]; my $j = $_[ 3 ]; my $l = $_[ 4 ]; my $m = $_[ 5 ]; my $use_global_rearr = $_[ 6 ]; my $jumble = ""; my $multi = ""; my $lower = ""; my $method = ""; my $global = ""; if ( $use_global_rearr == 1 ) { $global = " G"; } &testForTextFilePresence( $inpwd ); if ( $m eq "FM" ) { $method = ""; } elsif ( $m eq "ME" ) { $method = " D"; } else { &dieWithUnexpectedError( "method for FITCH must be either FM or ME" ); } if ( $j >= 1 ) { $jumble = " J $s $j"; } if ( $bs >= 2 ) { $multi = " M $bs $s"; } if ( $l == 1 ) { $lower = " L"; } # jumble must be set BEFORE multi! system( "$FITCH 2>&1 << ! $inpwd$method$global$jumble$multi$lower 3 Y !" ) && &dieWithUnexpectedError( "Could not execute \"$FITCH $inpwd$method$global$jumble$multi$lower\"" ); # 3: Do NOT print out tree } ## executeFitch # Two arguments: # 1. pairwise distance file # 2. outfile sub executeBionj { my $inpwd = $_[ 0 ]; my $out = $_[ 1 ]; &testForTextFilePresence( $inpwd ); my $command = "$BIONJ $inpwd $out"; system( $command ) && &dieWithUnexpectedError( $command ); } # Four arguments: # 1. (effective) sequence length # 2. (effective) number of bases # 3. pairwise distance file # 4. outfile sub executeWeighbor { my $L = $_[ 0 ]; my $b = $_[ 1 ]; my $i = $_[ 2 ]; my $o = $_[ 3 ]; &testForTextFilePresence( $i ); my $command = "$WEIGHBOR -L $L -b $b -i $i -o $o"; system( $command ) && &dieWithUnexpectedError( $command ); } # Six arguments: # 1. DNA or Amino-Acids sequence filename (PHYLIP format) # 2. number of data sets to analyse (ex:3) # 3. Model: JTT | MtREV | Dayhoff | WAG | VT | DCMut | Blosum62 (Amino-Acids) # 4. number of relative substitution rate categories (ex:4), positive integer # 5. starting tree filename (Newick format), your tree filename | BIONJ for a distance-based tree # 6. 1 to estimate proportion of invariable sites, otherwise, fixed proportion "0.0" is used # PHYML produces several results files : # _phyml_lk.txt : likelihood value(s) # _phyml_tree.txt : inferred tree(s) # _phyml_stat.txt : detailed execution stats sub executePhyml { my $sequences = $_[ 0 ]; my $data_sets = $_[ 1 ]; my $model = $_[ 2 ]; my $nb_categ = $_[ 3 ]; my $tree = $_[ 4 ]; my $estimate_invar_sites = $_[ 5 ]; if ( $data_sets < 1 ) { $data_sets = 1 } my $invar = "0.0"; # proportion of invariable sites, # a fixed value (ex:0.0) | e to get the maximum likelihood estimate if ( $estimate_invar_sites == 1 ) { $invar = "e"; } my $data_type = "1"; # 0 = DNA | 1 = Amino-Acids my $format = "i"; # i = interleaved sequence format | s = sequential my $bootstrap_sets = "0"; # number of bootstrap data sets to generate (ex:2) # only works with one data set to analyse my $alpha = "e"; # gamma distribution parameter, # a fixed value (ex:1.0) | e to get the maximum likelihood estimate my $opt_topology = "y"; # optimise tree topology ? y | n my $opt_lengths = "y"; # optimise branch lengths and rate parameters ? y | n if ( $data_sets > 1 ) { # No need to calc branch lengths for bootstrapped analysis $opt_lengths = "n"; } &testForTextFilePresence( $sequences ); my $command = "$PHYML $sequences $data_type $format $data_sets $bootstrap_sets $model $invar $nb_categ $alpha $tree $opt_topology $opt_lengths"; print( "\n$command\n"); system( $command ) && &dieWithUnexpectedError( $command ); } # Four arguments: # 1. name of alignment file (in correct format!) # 2. number of bootstraps # 3. jumbles: 0: do not jumble; >=1 number of jumbles # 4. seed for random number generator sub executeProtpars { my $align = $_[ 0 ]; my $bs = $_[ 1 ]; my $rand = $_[ 2 ]; my $s = $_[ 3 ]; my $jumble = ""; my $multi = ""; &testForTextFilePresence( $align ); if ( $bs > 1 && $rand < 1 ) { $rand = 1; } if ( $rand >= 1 ) { $jumble = " J $s $rand"; } if ( $bs > 1 ) { $multi = " M D $bs"; } system( "$PROTPARS 2>&1 << ! $align$jumble$multi Y !" ) && &dieWithUnexpectedError( "Could not execute \"$PROTPARS $align$jumble$multi\"" ); # 3: Do NOT print out tree return; } ## executeProtpars # "Model of substitution" order for DQO TREE-PUZZLE 5.0: # Auto # m -> Dayhoff (Dayhoff et al. 1978) # m -> JTT (Jones et al. 1992) # m -> mtREV24 (Adachi-Hasegawa 1996) # m -> BLOSUM62 (Henikoff-Henikoff 92) # m -> VT (Mueller-Vingron 2000) # m -> WAG (Whelan-Goldman 2000) # m -> Auto # One argument: # matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; # 5 = VT; 6 = WAG; 7 = auto; PAM otherwise # Last modified: 07/07/01 sub setModelForPuzzle { my $matrix_option = $_[ 0 ]; my $matr = ""; if ( $matrix_option == 0 ) { # JTT $matr = " m m"; } elsif ( $matrix_option == 2 ) { # BLOSUM 62 $matr = " m m m m"; } elsif ( $matrix_option == 3 ) { # mtREV24 $matr = " m m m"; } elsif ( $matrix_option == 5 ) { # VT $matr = " m m m m m"; } elsif ( $matrix_option == 6 ) { # WAG $matr = " m m m m m m"; } elsif ( $matrix_option == 7 ) { # auto $matr = ""; } else { # PAM $matr = " m" } return $matr; } ## setModelForPuzzle # One argument: # Model of rate heterogeneity: # 1 for "8 Gamma distributed rates" # 2 for "Two rates (1 invariable + 1 variable)" # 3 for "Mixed (1 invariable + 8 Gamma rates)" # otherwise: Uniform rate # Last modified: 09/08/03 sub setRateHeterogeneityOptionForPuzzle { my $rate_heterogeneity_option = $_[ 0 ]; my $opt = ""; if ( $rate_heterogeneity_option == 1 ) { $opt = " w"; } elsif ( $rate_heterogeneity_option == 2 ) { $opt = " w w"; } elsif ( $rate_heterogeneity_option == 3 ) { $opt = " w w w"; } else { $opt = ""; } return $opt; } ## setRateHeterogeneityOptionForPuzzle # One argument: # Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise # Last modified: 09/08/03 sub setParameterEstimatesOptionForPuzzle { my $parameter_estimates_option = $_[ 0 ]; my $opt = ""; if ( $parameter_estimates_option == 1 ) { $opt = " e"; } else { $opt = ""; } return $opt; } ## setParameterEstimatesOptionForPuzzle # three/four/five arguments: # 1. Name of inputfile # 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; # 5 = VT; 6 = WAG; 7 = auto; PAM otherwise # 3. Number of sequences in alignment # 4. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise # 5. Model of rate heterogeneity: # 1 for "8 Gamma distributed rates" # 2 for "Two rates (1 invariable + 1 variable)" # 3 for "Mixed (1 invariable + 8 Gamma rates)" # otherwise: Uniform rate sub executePuzzleBootstrapped { my $in = $_[ 0 ]; my $matrix_option = $_[ 1 ]; my $number_of_seqs = $_[ 2 ]; my $parameter_estimates_option = $_[ 3 ]; my $rate_heterogeneity_option = $_[ 4 ]; my $l = 0; my $slen = 0; my $counter = 0; my $mat = ""; my $est = ""; my $rate = ""; my $a = ""; my @a = (); &testForTextFilePresence( $in ); open( GRP, "<$in" ) || die "\n\n$0: Unexpected error: Cannot open file <<$in>>: $!"; while( ) { if ( $_ =~ /^\s*\d+\s+\d+\s*$/ ) { $counter++; } } close( GRP ); $l = `cat $in | wc -l`; $slen = $l / $counter; system( "split --suffix-length=4 -$slen $in $in.splt." ) && die "\n\n$0: executePuzzleDQObootstrapped: Could not execute \"split --suffix-length=4 -$slen $in $in.splt.\": $!"; @a = <$in.splt.*>; $mat = setModelForPuzzle( $matrix_option ); if ( $parameter_estimates_option ) { $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option ); } if ( $rate_heterogeneity_option ) { $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option ); } my $k=""; if ( $number_of_seqs <= 257 ) { $k = "k"; } foreach $a ( @a ) { print "-".$a."\n"; system( "$PUZZLE $a << ! $k k k$mat$est$rate y !" ) && die "$0: Could not execute \"$PUZZLE $a\""; system( "cat $a.dist >> $in.dist" ) && die "$0: Could not execute \"cat outdist >> $in.dist\""; unlink( $a, $a.".dist", $a.".puzzle" ); } return; } ## executePuzzleBootstrapped # three/four/five arguments: # 1. Name of inputfile # 2. Matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; # 5 = VT; 6 = WAG; 7 = auto; PAM otherwise # 3. Number of sequences in alignment # 4. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise # 5. Model of rate heterogeneity: # 1 for "8 Gamma distributed rates" # 2 for "Two rates (1 invariable + 1 variable)" # 3 for "Mixed (1 invariable + 8 Gamma rates)" # otherwise: Uniform rate sub executePuzzle { my $in = $_[ 0 ]; my $matrix_option = $_[ 1 ]; my $number_of_seqs = $_[ 2 ]; my $parameter_estimates_option = $_[ 3 ]; my $rate_heterogeneity_option = $_[ 4 ]; my $mat = ""; my $est = ""; my $rate = ""; &testForTextFilePresence( $in ); $mat = &setModelForPuzzle( $matrix_option ); if ( $parameter_estimates_option ) { $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option ); } if ( $rate_heterogeneity_option ) { $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option ); } my $k=""; if ( $number_of_seqs <= 257 ) { $k = "k"; } system( "$PUZZLE $in << ! $k k k$mat$est$rate y !" ) && die "$0: Could not execute \"$PUZZLE\""; return; } ## executePuzzle # Preparation of the pwd file sub addDistsToQueryToPWDfile { my $pwd_file = $_[ 0 ]; my $disttoquery_file = $_[ 1 ]; my $outfile = $_[ 2 ]; my $name_of_query = $_[ 3 ]; my $name_of_query_ = ""; my $return_line_pwd = ""; my $return_line_dq = ""; my $num_of_sqs = 0; my $block = 0; my $name_from_pwd = "X"; my $name_from_dq = "Y"; my @dists_to_query = (); my $i = 0; &testForTextFilePresence( $pwd_file ); &testForTextFilePresence( $disttoquery_file ); $name_of_query_ = $name_of_query; for ( my $j = 0; $j <= ( $LENGTH_OF_NAME - length( $name_of_query ) - 1 ); ++$j ) { $name_of_query_ .= " "; } open( OUT_AD, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" ); open( IN_PWD, "$pwd_file" ) || &dieWithUnexpectedError( "Cannot open file \"$pwd_file\"" ); open( IN_DQ, "$disttoquery_file" ) || &dieWithUnexpectedError( "Cannot open file \"$disttoquery_file\"" ); W: while ( $return_line_pwd = ) { if ( $return_line_pwd =~ /^\s*(\d+)\s*$/ ) { $num_of_sqs = $1; $num_of_sqs++; if ( $block > 0 ) { print OUT_AD "$name_of_query_ "; for ( my $j = 0; $j < $i; ++$j ) { print OUT_AD "$dists_to_query[ $j ] "; } print OUT_AD "0.0\n"; } print OUT_AD " $num_of_sqs\n"; $block++; @dists_to_query = (); $i = 0; } if ( $block == 1 && $return_line_pwd =~ /^\s*(\S+)\s+\S+/ ) { $name_from_pwd = $1; if ( !defined( $return_line_dq = ) ) { &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" ); } if ( $return_line_dq !~ /\S/ ) { if ( !defined( $return_line_dq = ) ) { &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" ); } } $return_line_dq =~ /^\s*(\S+)\s+(\S+)/; $name_from_dq = $1; $dists_to_query[ $i++ ] = $2; if ( $name_from_pwd ne $name_from_dq ) { &dieWithUnexpectedError( "Order of sequence names in \"$pwd_file\" and \"$disttoquery_file\" is not the same" ); } print OUT_AD $return_line_pwd; } elsif ( $block > 1 && $return_line_pwd =~ /^\s*(\S+)\s+\S+/ ) { $name_from_pwd = $1; if ( !defined( $return_line_dq = ) ) { &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" ); } if ( $return_line_dq !~ /\S/ ) { if ( !defined( $return_line_dq = ) ) { &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" ); } } $return_line_dq =~ /^\s*\S+\s+(\S+)/; $dists_to_query[ $i++ ] = $1; print OUT_AD $return_line_pwd; } } print OUT_AD "$name_of_query_ "; for ( my $j = 0; $j < $i; ++$j ) { print OUT_AD "$dists_to_query[ $j ] "; } print OUT_AD "0.0\n"; close( OUT_AD ); close( IN_PWD ); close( IN_DQ ); return $block; } ## addDistsToQueryToPWDfile # Three arguments: # 1. HMMER model db # 2. name of HMM # 3. outputfile name # Last modified: 02/27/01 sub executeHmmfetch { my $db = $_[ 0 ]; my $name = $_[ 1 ]; my $outfile = $_[ 2 ]; system( "$HMMFETCH $db $name > $outfile" ) && &dieWithUnexpectedError( "Could not execute \"$HMMFETCH $db $name > $outfile\"" ); return; } ## executeHmmfetch # Checks wether a file is present, not empty and a plain textfile. # One argument: name of file. # Last modified: 07/07/01 sub testForTextFilePresence { my $file = $_[ 0 ]; unless ( ( -s $file ) && ( -f $file ) && ( -T $file ) ) { dieWithUnexpectedError( "File \"$file\" does not exist, is empty, or is not a plain textfile" ); } } ## testForTextFilePresence # Last modified: 02/21/03 sub addSlashAtEndIfNotPresent { my $filename = $_[ 0 ]; $filename =~ s/\s+//g; unless ( $filename =~ /\/$/ ) { $filename = $filename."/"; } return $filename; } ## addSlashAtEndIfNotPresent # Last modified: 02/15/02 sub exitWithWarning { my $text = $_[ 0 ]; if ( defined( $_[ 1 ] ) && $_[ 1 ] == 1 ) { print( "

user error

\n" ); print( "

\n" ); print( "$text\n" ); print( "

\n" ); print( "

 

\n" ); } else { print( "\n\n$text\n\n" ); } exit( 0 ); } ## exit_with_warning # Last modified: 02/15/02 sub dieWithUnexpectedError { my $text = $_[ 0 ]; die( "\n\n$0:\nUnexpected error (should not have happened):\n$text\n$!\n\n" ); } ## dieWithUnexpectedError 1;