1 # $Id: forester.pm,v 1.26 2010/12/13 19:00:22 cmzmasek Exp $
3 # FORESTER -- software libraries and applications
4 # for evolutionary biology research and applications.
6 # Copyright (C) 2007-2009 Christian M. Zmasek
7 # Copyright (C) 2007-2009 Burnham Institute for Medical Research
10 # This library is free software; you can redistribute it and/or
11 # modify it under the terms of the GNU Lesser General Public
12 # License as published by the Free Software Foundation; either
13 # version 2.1 of the License, or (at your option) any later version.
15 # This library is distributed in the hope that it will be useful,
16 # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 # Lesser General Public License for more details.
20 # You should have received a copy of the GNU Lesser General Public
21 # License along with this library; if not, write to the Free Software
22 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 # Contact: phylosoft @ gmail . com
25 # WWW: www.phylosoft.org/forester
37 our @ISA = qw( Exporter );
39 our @EXPORT = qw( executeConsense
42 executePuzzleDQObootstrapped
44 startsWithSWISS_PROTname
47 containsPfamNamedSequence
51 setRateHeterogeneityOptionForPuzzle
52 setParameterEstimatesOptionForPuzzle
53 executePuzzleBootstrapped
62 addDistsToQueryToPWDfile
63 testForTextFilePresence
65 dieWithUnexpectedError
66 addSlashAtEndIfNotPresent
70 $SWISSPROT_ACDEOS_FILE
72 $SPECIES_TREE_FILE_DEFAULT
73 $MULTIPLE_TREES_FILE_SUFFIX
77 $ADDITION_FOR_RIO_ANNOT_TREE
80 $MULTIPLE_PWD_FILE_SUFFIX
136 $EXPASY_SPROT_SEARCH_DE
137 $EXPASY_SPROT_SEARCH_AC
143 # =============================================================================
144 # =============================================================================
146 # THESE VARIABLES ARE ENVIRONMENT DEPENDENT, AND NEED TO BE SET ACCORDINGLY
148 # -------------------------------------------------------------------------
151 # For using just "phylo_pl.pl", only the following variables need to be set
166 # Software directory:
167 # ---------------------
169 our $SOFTWARE_DIR = "/home/czmasek/SOFTWARE/";
172 # Java virtual machine:
173 # ---------------------
174 our $JAVA = $SOFTWARE_DIR."JAVA/jdk1.6.0_03/bin/java";
177 # Where all the temporary files can be created:
178 # ---------------------------------------------
179 our $TEMP_DIR_DEFAULT = "/tmp/";
182 # Programs from Joe Felsenstein's PHYLIP package:
183 # -----------------------------------------------
184 our $SEQBOOT = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/seqboot";
185 our $NEIGHBOR = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/neighbor";
186 our $PROTPARS = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/protpars";
187 our $PROML = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/proml";
188 our $FITCH = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/fitch";
189 our $CONSENSE = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/consense";
190 our $PHYLIP_VERSION = "3.68";
194 our $PUZZLE = $SOFTWARE_DIR."TREE_PUZZLE/tree-puzzle-5.2/src/puzzle";
195 our $PUZZLE_VERSION = "5.2";
198 # -----------------------------------------------------
199 our $FASTME = $SOFTWARE_DIR."FASTME/fastme2.0/fastme";
200 our $FASTME_VERSION = "2.0";
203 # -----------------------------------------------------
204 our $BIONJ = $SOFTWARE_DIR."BIONJ/bionj";
205 our $BIONJ_VERSION = "[1997]";
208 # -----------------------------------------------------
209 our $WEIGHBOR = $SOFTWARE_DIR."WEIGHBOR/Weighbor/weighbor";
210 our $WEIGHBOR_VERSION = "1.2.1";
213 # -----------------------------------------------------
214 our $PHYML = $SOFTWARE_DIR."PHYML/phyml_v2.4.4/exe/phyml_linux";
215 our $PHYML_VERSION = "2.4.4";
218 # -----------------------------------------------------
219 our $RAXML = $SOFTWARE_DIR."RAXML/RAxML-7.0.4/raxmlHPC";
220 our $RAXML_VERSION = "7.0.4";
223 # forester.jar. This jar file is currently available at: http://www.phylosoft.org
224 # -------------------------------------------------------------------------------
226 our $FORESTER_JAR = $SOFTWARE_DIR."FORESTER/DEV/forester-atv/java/forester.jar";
230 # End of variables which need to be set by the user for using "phylo_pl.pl".
245 # Tool from forester.jar to transfer support values:
246 # -------------------------------------------------
247 our $SUPPORT_TRANSFER = $JAVA." -cp $FORESTER_JAR org.forester.application.support_transfer";
251 # Tool from forester.jar for simple statistics for support values:
252 # ----------------------------------------------------------------
253 our $SUPPORT_STATISTICS = $JAVA." -cp $FORESTER_JAR org.forester.application.support_statistics";
256 # Tool from forester.jar to transfer nh to phyloXML:
257 # -------------------------------------------------
258 our $NEWICK_TO_PHYLOXML = $JAVA." -cp $FORESTER_JAR org.forester.application.phyloxml_converter";
262 # FORESTER itself (currently not needed for "phylo_pl.pl"):
263 # ---------------------------------------------------------
264 our $PATH_TO_FORESTER = "";
267 # Pfam data (not needed for phylo_pl.pl):
268 # --------------------------------------
269 our $PFAM_FULL_DIRECTORY = "/path/to/Pfam/Full/";
270 our $PFAM_SEED_DIRECTORY = "/path/to/Pfam/Seed/";
271 our $PFAM_HMM_DB = "/path/to/Pfam/Pfam_ls"; # Need to run "hmmindex" on this
272 # to produce .ssi file.
274 # "setenv HMMERDB /home/rio/pfam-6.6/"
277 $PATH_TO_FORESTER = &addSlashAtEndIfNotPresent( $PATH_TO_FORESTER );
280 # Description lines and species from SWISS-PROT and TrEMBL (not needed for phylo_pl.pl):
281 # -------------------------------------------------------------------------------------
282 our $TREMBL_ACDEOS_FILE = $PATH_TO_FORESTER."data/trembl22_ACDEOS_1-6";
284 our $SWISSPROT_ACDEOS_FILE = $PATH_TO_FORESTER."data/sp40_ACDEOS_1-6";
288 # Names of species which can be analyzed and analyzed
289 # against (must also be in tree $SPECIES_TREE_FILE_DEFAULT).
290 # By using a list with less species, RIO analyses become faster
291 # but lose phylogenetic resolution.
292 # For many purposes, list "tree_of_life_bin_1-6_species_list"
293 # in "data/species/" might be sufficient:
294 # (not needed for phylo_pl.pl)
295 # --------------------------------------------------------------
296 our $SPECIES_NAMES_FILE = $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-6_species_list";
300 # A default species tree in NHX format.
301 # For many purposes, tree "tree_of_life_bin_1-6.nhx"
302 # in "data/species/" might be fine:
303 # (not needed for phylo_pl.pl)
304 # --------------------------------------------------
305 our $SPECIES_TREE_FILE_DEFAULT = $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-6.nhx";
309 # Data for using precalculated distances:
310 # (not needed for phylo_pl.pl)
311 # ---------------------------------------
312 our $MATRIX_FOR_PWD = 2; # The matrix which has been used for the pwd in $RIO_PWD_DIRECTORY.
313 # 0=JTT, 1=PAM, 2=BLOSUM 62, 3=mtREV24, 5=VT, 6=WAG.
315 our $RIO_PWD_DIRECTORY = $PATH_TO_FORESTER."example_data/"; # all must end with "/"
316 our $RIO_BSP_DIRECTORY = $PATH_TO_FORESTER."example_data/";
317 our $RIO_NBD_DIRECTORY = $PATH_TO_FORESTER."example_data/";
318 our $RIO_ALN_DIRECTORY = $PATH_TO_FORESTER."example_data/";
319 our $RIO_HMM_DIRECTORY = $PATH_TO_FORESTER."example_data/";
324 # End of variables which need to be set by the user.
326 # =============================================================================
327 # =============================================================================
333 $TEMP_DIR_DEFAULT = &addSlashAtEndIfNotPresent( $TEMP_DIR_DEFAULT );
334 $PFAM_FULL_DIRECTORY = &addSlashAtEndIfNotPresent( $PFAM_FULL_DIRECTORY );
335 $PFAM_SEED_DIRECTORY = &addSlashAtEndIfNotPresent( $PFAM_SEED_DIRECTORY );
339 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
340 # These variables should normally not be changed:
343 our $PRIOR_FILE_DIR = $PATH_TO_FORESTER."data/priors_for_hmmbuild/";
344 # Directory containing dirichlet prior
345 # files needed for certain aligments
346 # by hmmbuild (e.g. Collagen).
353 our $PUZZLE_DQO = $PATH_TO_FORESTER."puzzle_dqo/src/puzzle";
356 our $HMMALIGN = $PATH_TO_FORESTER."hmmer/binaries/hmmalign";
357 our $HMMSEARCH = $PATH_TO_FORESTER."hmmer/binaries/hmmsearch";
358 our $HMMBUILD = $PATH_TO_FORESTER."hmmer/binaries/hmmbuild";
359 our $HMMFETCH = $PATH_TO_FORESTER."hmmer/binaries/hmmfetch";
360 our $SFE = $PATH_TO_FORESTER."hmmer/binaries/sfetch";
361 our $HMMCALIBRATE = $PATH_TO_FORESTER."hmmer/binaries/hmmcalibrate";
363 our $P7EXTRACT = $PATH_TO_FORESTER."perl/p7extract.pl";
364 our $MULTIFETCH = $PATH_TO_FORESTER."perl/multifetch.pl";
368 our $BOOTSTRAP_CZ = $PATH_TO_FORESTER."C/bootstrap_cz";
369 our $BOOTSTRAP_CZ_PL = $PATH_TO_FORESTER."perl/bootstrap_cz.pl";
370 #our $SUPPORT_TRANSFER = $JAVA." -cp $PATH_TO_FORESTER"."java forester.tools.transfersBranchLenghts";
371 #our $SUPPORT_TRANSFER = $JAVA." -cp /home/czmasek/SOFTWARE/FORESTER/forester3/forester.jar org.forester.tools.SupportTransfer";
373 our $PHYLO_PL = $PATH_TO_FORESTER."perl/phylo_pl.pl";
374 our $RIO_PL = $PATH_TO_FORESTER."perl/rio.pl";
375 our $DORIO = $JAVA." -cp $PATH_TO_FORESTER"."java forester.tools.DoRIO";
377 our $RIO_SLAVE_DRIVER = $PATH_TO_FORESTER."perl/rio_slave_driver.pl";
378 our $RIO_SLAVE = $PATH_TO_FORESTER."perl/rio_slave.pl";
379 our $NODE_LIST = $PATH_TO_FORESTER."data/node_list.dat";
382 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
385 our $BOOTSTRAPS = 100;
386 our $MIN_NUMBER_OF_AA = 20; # After removal of gaps, if less, gaps are not removed.
387 our $LENGTH_OF_NAME = 10;
392 our $MULTIPLE_TREES_FILE_SUFFIX = ".mlt";
393 our $LOG_FILE_SUFFIX = ".log";
394 our $ALIGN_FILE_SUFFIX = ".aln";
395 our $TREE_FILE_SUFFIX = ".nhx";
396 our $ADDITION_FOR_RIO_ANNOT_TREE = ".rio";
397 our $SUFFIX_PWD = ".pwd";
398 our $MULTIPLE_PWD_FILE_SUFFIX = ".mpwd";
399 our $SUFFIX_BOOT_STRP_POS = ".bsp";
400 our $SUFFIX_PWD_NOT_BOOTS = ".nbd";
401 our $SUFFIX_HMM = ".hmm";
403 our $EXPASY_SPROT_SEARCH_DE = "http://www.expasy.org/cgi-bin/sprot-search-de?";
404 our $EXPASY_SPROT_SEARCH_AC = "http://www.expasy.org/cgi-bin/sprot-search-ac?";
408 # One argument: input multiple trees file
409 # Last modified: 07/05/01
410 sub executeConsense {
413 &testForTextFilePresence( $in );
415 system( "$CONSENSE >/dev/null 2>&1 << !
419 && &dieWithUnexpectedError( "Could not execute \"$CONSENSE $in\"" );
427 # 1. options ("-" is not necessary)
428 # 2. alignment or pwd file
431 # Last modified: 07/05/01
439 &testForTextFilePresence( $B );
443 system( "$PHYLO_PL $opts $B $C $D" )
444 && &dieWithUnexpectedError( "Could not execute \"$PHYLO_PL $opts $B $C $D\"" );
452 # 1. Name of inputfile
453 # 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
454 # 5 = VT; 6 = WAG; 7 = auto; PAM otherwise
455 sub executePuzzleDQO {
457 my $matrix_option = $_[ 1 ];
460 &testForTextFilePresence( $in );
462 $mat = setModelForPuzzle( $matrix_option );
464 system( "$PUZZLE_DQO $in >/dev/null 2>&1 << !$mat
467 && &dieWithUnexpectedError( "Could not execute \"$PUZZLE_DQO\"" );
471 } ## executePuzzleDQO
477 # 1. Name of inputfile
478 # 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
479 # 5 = VT; 6 = WAG; 7 = auto; PAM otherwise
480 # Last modified: 01/28/02
481 sub executePuzzleDQObootstrapped {
483 my $matrix_option = $_[ 1 ];
493 &testForTextFilePresence( $in );
495 open( GRP, "<$in" ) || &dieWithUnexpectedError( "Cannot open file \"$in\"" );
497 if ( $_ =~ /^\s*\d+\s+\d+\s*$/ ) {
503 $l = `cat $in | wc -l`;
504 $slen = $l / $counter;
506 system( "split -$slen $in $in.splt." )
507 && &dieWithUnexpectedError( "Could not execute \"split -$slen $in $in.splt.\"" );
511 $mat = setModelForPuzzle( $matrix_option );
515 system( "$PUZZLE_DQO $a >/dev/null 2>&1 << !$mat
518 && &dieWithUnexpectedError( "Could not execute \"$PUZZLE_DQO $a\"" );
520 system( "cat $a.dist >> $in.dist" )
521 && &dieWithUnexpectedError( "Could not execute \"cat outdist >> $in.dist\"" );
523 unlink( $a, $a.".dist" );
528 } ## executePuzzleDQObootstrapped
532 # Transfers a Pfam (SELEX) alignment to a
533 # PHYLIP sequential style alignment.
534 # It only writes "match columns" as indicated by the
535 # "# RF" line ('x' means match).
540 # 3. 1 to NOT ensure that match states contain only 'A'-'Z' or '-'
542 # Returns the number of match states (=length of output alignment),
543 # the length of the input alignment,
544 # the number of seqs in the input alignment
546 # Last modified: 07/07/01
548 sub pfam2phylipMatchOnly {
550 my $infile = $_[ 0 ];
551 my $outfile = $_[ 1 ];
555 my $return_line = "";
563 my $number_colum = 0;
567 if ( $ne && $ne == 1 ) {
571 &testForTextFilePresence( $infile );
573 open( INPP, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
575 # This reads in the first block. It reads in the seq names.
577 if ( &isPfamSequenceLine( $return_line ) ) {
578 $return_line =~ /^(\S+)\s+(\S+)/;
579 $seq_name[ $y ] = substr( $1, 0, $LENGTH_OF_NAME );
581 for ( $x = 0; $x < length( $seq ); $x++ ) {
582 $seq_array[ $x ][ $y ] = substr( $seq, $x, 1 );
586 elsif ( &isRFline( $return_line ) ) {
588 $return_line =~ /\s+(\S+)\s*$/;
590 $x_offset = length( $seq );
592 for ( $x = 0; $x < $x_offset; $x++ ) {
593 $seq_array[ $x ][ $rf_y ] = substr( $seq, $x, 1 );
598 $return_line = <INPP>;
600 if ( !$return_line ) {
601 &dieWithUnexpectedError( "Alignment not in expected format (no RF line)" );
605 if ( $saw_rf_line != 1 ) {
606 &dieWithUnexpectedError( "Alignment not in expected format (no RF line)" );
612 # This reads all blocks after the 1st one.
613 while ( $return_line = <INPP> ) {
614 if ( &isPfamSequenceLine( $return_line ) ) {
615 $return_line =~ /^\S+\s+(\S+)/;
617 for ( $x = 0; $x < length( $seq ); $x++ ) {
618 $seq_array[ $x + $x_offset ][ $y % $rf_y ] = substr( $seq, $x, 1 );
622 elsif ( &isRFline( $return_line ) ) {
624 &dieWithUnexpectedError( "Alignment not in expected format" );
627 $return_line =~ /\s+(\S+)\s*$/;
629 $max_x = length( $seq );
631 for ( $x = 0; $x < length( $seq ); $x++ ) {
632 $seq_array[ $x + $x_offset ][ $rf_y ] = substr( $seq, $x, 1 );
636 $x_offset = $x_offset + $max_x;
643 # Counts the match states, and hence the number of aa in the alignment:
644 for ( $x = 0; $x < $x_offset; $x++ ) {
645 if ( !$seq_array[ $x ][ $rf_y ] ) {
646 &dieWithUnexpectedError( "Alignment not in expected format" );
648 if ( $seq_array[ $x ][ $rf_y ] eq 'x' ) {
655 open( OUTPP, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" );
656 print OUTPP "$rf_y $number_colum\n";
657 for ( $y = 0; $y < $rf_y; $y++ ) {
658 print OUTPP "$seq_name[ $y ]";
659 for ( $i = 0; $i < ( $LENGTH_OF_NAME - length( $seq_name[ $y ] ) ); $i++ ) {
662 for ( $x = 0; $x < $x_offset; $x++ ) {
663 if ( $seq_array[ $x ][ $rf_y ] eq 'x' ) {
664 if ( !$seq_array[ $x ][ $y ] ) {
665 &dieWithUnexpectedError( "Alignment not in expected format" );
667 if ( $not_ensure != 1 && $seq_array[ $x ][ $y ] !~ /[A-Z]|-/ ) {
668 &dieWithUnexpectedError( "Alignment not in expected format (match states must only contain 'A'-'Z' or '-')" );
670 print OUTPP "$seq_array[ $x ][ $y ]";
677 return $number_colum, $x_offset, $rf_y;
679 } ## pfam2phylipMatchOnly
683 # Returns whether the argument (a String)
684 # starts with a SWISS-PROT name (SEQN_SPECI).
685 # Last modified: 06/21/01
686 sub startsWithSWISS_PROTname {
687 return ( $_[ 0 ] =~ /^[A-Z0-9]{1,4}_[A-Z0-9]{1,5}/ );
692 # Returns whether the argument starts with XXX.. XXXXX.. and the first
693 # character is not a "#".
694 # Last modified: 06/21/01
695 sub isPfamSequenceLine {
696 return( !&isPfamCommentLine( $_[ 0 ] )
697 && &containsPfamNamedSequence( $_[ 0 ] ) );
702 # Returns whether the argument does start with a "#".
703 # Last modified: 06/21/01
704 sub isPfamCommentLine {
705 return ( $_[ 0 ] =~ /^#/ );
710 # Returns whether the argument starts with XXX XXXXX.
711 # Last modified: 06/21/01
712 sub containsPfamNamedSequence {
713 return ( $_[ 0 ] =~ /^\S+\s+\S+/ );
717 # Returns whether the argument starts with XXX XXXXX.
718 # Last modified: 06/21/01
720 return ( $_[ 0 ] =~ /^#.*RF/ );
726 # 1. pairwise distance file
727 # 2. number of bootstraps
728 # 3. initial tree: BME, GME or NJ
729 # Last modified: 2008/12/31
733 my $init_opt = $_[ 2 ];
735 &testForTextFilePresence( $inpwd );
738 $command = "$FASTME -b $init_opt -i $inpwd -n $bs -s b";
741 $command = "$FASTME -b $init_opt -i $inpwd -s b";
752 # 1. pairwise distance file
753 # 2. number of bootstraps
754 # 3. seed for random number generator
755 # 4. lower-triangular data matrix? 1: yes; no, otherwise
756 sub executeNeighbor {
764 &testForTextFilePresence( $inpwd );
777 system( "$NEIGHBOR >/dev/null 2>&1 << !
783 && &dieWithUnexpectedError( "Could not execute \"$NEIGHBOR $inpwd$multi$lower\"" );
789 # 1. pairwise distance file
790 # 2. number of bootstraps
791 # 3. seed for random number generator
792 # 4. number of jumbles for input order
793 # 5. lower-triangular data matrix? 1: yes; no, otherwise
794 # 6. FM for Fitch-Margoliash, ME for ME# 6.
795 # 7. 1 to use globale rearragements
803 my $use_global_rearr = $_[ 6 ];
810 if ( $use_global_rearr == 1 ) {
815 &testForTextFilePresence( $inpwd );
820 elsif ( $m eq "ME" ) {
825 &dieWithUnexpectedError( "method for FITCH must be either FM or ME" );
846 # jumble must be set BEFORE multi!
847 system( "$FITCH 2>&1 << !
848 $inpwd$method$global$jumble$multi$lower
852 && &dieWithUnexpectedError( "Could not execute \"$FITCH $inpwd$method$global$jumble$multi$lower\"" );
853 # 3: Do NOT print out tree
860 # 1. pairwise distance file
866 &testForTextFilePresence( $inpwd );
867 my $command = "$BIONJ $inpwd $out";
870 && &dieWithUnexpectedError( $command );
875 # 1. (effective) sequence length
876 # 2. (effective) number of bases
877 # 3. pairwise distance file
879 sub executeWeighbor {
885 &testForTextFilePresence( $i );
886 my $command = "$WEIGHBOR -L $L -b $b -i $i -o $o";
889 && &dieWithUnexpectedError( $command );
894 # 1. DNA or Amino-Acids sequence filename (PHYLIP format)
895 # 2. number of data sets to analyse (ex:3)
896 # 3. Model: JTT | MtREV | Dayhoff | WAG | VT | DCMut | Blosum62 (Amino-Acids)
897 # 4. number of relative substitution rate categories (ex:4), positive integer
898 # 5. starting tree filename (Newick format), your tree filename | BIONJ for a distance-based tree
899 # 6. 1 to estimate proportion of invariable sites, otherwise, fixed proportion "0.0" is used
900 # PHYML produces several results files :
901 # <sequence file name>_phyml_lk.txt : likelihood value(s)
902 # <sequence file name>_phyml_tree.txt : inferred tree(s)
903 # <sequence file name>_phyml_stat.txt : detailed execution stats
905 my $sequences = $_[ 0 ];
906 my $data_sets = $_[ 1 ];
908 my $nb_categ = $_[ 3 ];
910 my $estimate_invar_sites = $_[ 5 ];
912 if ( $data_sets < 1 ) {
916 my $invar = "0.0"; # proportion of invariable sites,
917 # a fixed value (ex:0.0) | e to get the maximum likelihood estimate
918 if ( $estimate_invar_sites == 1 ) {
922 my $data_type = "1"; # 0 = DNA | 1 = Amino-Acids
923 my $format = "i"; # i = interleaved sequence format | s = sequential
924 my $bootstrap_sets = "0"; # number of bootstrap data sets to generate (ex:2)
925 # only works with one data set to analyse
927 my $alpha = "e"; # gamma distribution parameter,
928 # a fixed value (ex:1.0) | e to get the maximum likelihood estimate
930 my $opt_topology = "y"; # optimise tree topology ? y | n
931 my $opt_lengths = "y"; # optimise branch lengths and rate parameters ? y | n
933 if ( $data_sets > 1 ) {
934 # No need to calc branch lengths for bootstrapped analysis
938 &testForTextFilePresence( $sequences );
939 my $command = "$PHYML $sequences $data_type $format $data_sets $bootstrap_sets $model $invar $nb_categ $alpha $tree $opt_topology $opt_lengths";
941 print( "\n$command\n");
944 && &dieWithUnexpectedError( $command );
952 # 1. name of alignment file (in correct format!)
953 # 2. number of bootstraps
954 # 3. jumbles: 0: do not jumble; >=1 number of jumbles
955 # 4. seed for random number generator
956 sub executeProtpars {
965 &testForTextFilePresence( $align );
967 if ( $bs > 1 && $rand < 1 ) {
985 system( "$PROTPARS 2>&1 << !
989 && &dieWithUnexpectedError( "Could not execute \"$PROTPARS $align$jumble$multi\"" );
990 # 3: Do NOT print out tree
999 # "Model of substitution" order for DQO TREE-PUZZLE 5.0:
1001 # m -> Dayhoff (Dayhoff et al. 1978)
1002 # m -> JTT (Jones et al. 1992)
1003 # m -> mtREV24 (Adachi-Hasegawa 1996)
1004 # m -> BLOSUM62 (Henikoff-Henikoff 92)
1005 # m -> VT (Mueller-Vingron 2000)
1006 # m -> WAG (Whelan-Goldman 2000)
1009 # matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
1010 # 5 = VT; 6 = WAG; 7 = auto; PAM otherwise
1011 # Last modified: 07/07/01
1012 sub setModelForPuzzle {
1013 my $matrix_option = $_[ 0 ];
1016 if ( $matrix_option == 0 ) { # JTT
1021 elsif ( $matrix_option == 2 ) { # BLOSUM 62
1028 elsif ( $matrix_option == 3 ) { # mtREV24
1034 elsif ( $matrix_option == 5 ) { # VT
1042 elsif ( $matrix_option == 6 ) { # WAG
1051 elsif ( $matrix_option == 7 ) { # auto
1061 } ## setModelForPuzzle
1064 # Model of rate heterogeneity:
1065 # 1 for "8 Gamma distributed rates"
1066 # 2 for "Two rates (1 invariable + 1 variable)"
1067 # 3 for "Mixed (1 invariable + 8 Gamma rates)"
1068 # otherwise: Uniform rate
1069 # Last modified: 09/08/03
1070 sub setRateHeterogeneityOptionForPuzzle {
1071 my $rate_heterogeneity_option = $_[ 0 ];
1074 if ( $rate_heterogeneity_option == 1 ) {
1078 elsif ( $rate_heterogeneity_option == 2 ) {
1083 elsif ( $rate_heterogeneity_option == 3 ) {
1094 } ## setRateHeterogeneityOptionForPuzzle
1098 # Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise
1099 # Last modified: 09/08/03
1100 sub setParameterEstimatesOptionForPuzzle {
1101 my $parameter_estimates_option = $_[ 0 ];
1104 if ( $parameter_estimates_option == 1 ) {
1113 } ## setParameterEstimatesOptionForPuzzle
1117 # three/four/five arguments:
1118 # 1. Name of inputfile
1119 # 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
1120 # 5 = VT; 6 = WAG; 7 = auto; PAM otherwise
1121 # 3. Number of sequences in alignment
1122 # 4. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise
1123 # 5. Model of rate heterogeneity:
1124 # 1 for "8 Gamma distributed rates"
1125 # 2 for "Two rates (1 invariable + 1 variable)"
1126 # 3 for "Mixed (1 invariable + 8 Gamma rates)"
1127 # otherwise: Uniform rate
1128 sub executePuzzleBootstrapped {
1130 my $matrix_option = $_[ 1 ];
1131 my $number_of_seqs = $_[ 2 ];
1132 my $parameter_estimates_option = $_[ 3 ];
1133 my $rate_heterogeneity_option = $_[ 4 ];
1144 &testForTextFilePresence( $in );
1146 open( GRP, "<$in" ) || die "\n\n$0: Unexpected error: Cannot open file <<$in>>: $!";
1148 if ( $_ =~ /^\s*\d+\s+\d+\s*$/ ) {
1154 $l = `cat $in | wc -l`;
1155 $slen = $l / $counter;
1157 system( "split --suffix-length=4 -$slen $in $in.splt." )
1158 && die "\n\n$0: executePuzzleDQObootstrapped: Could not execute \"split --suffix-length=4 -$slen $in $in.splt.\": $!";
1162 $mat = setModelForPuzzle( $matrix_option );
1163 if ( $parameter_estimates_option ) {
1164 $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option );
1166 if ( $rate_heterogeneity_option ) {
1167 $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option );
1171 if ( $number_of_seqs <= 257 ) {
1177 system( "$PUZZLE $a << !
1183 && die "$0: Could not execute \"$PUZZLE $a\"";
1185 system( "cat $a.dist >> $in.dist" )
1186 && die "$0: Could not execute \"cat outdist >> $in.dist\"";
1188 unlink( $a, $a.".dist", $a.".puzzle" );
1193 } ## executePuzzleBootstrapped
1199 # three/four/five arguments:
1200 # 1. Name of inputfile
1201 # 2. Matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
1202 # 5 = VT; 6 = WAG; 7 = auto; PAM otherwise
1203 # 3. Number of sequences in alignment
1204 # 4. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise
1205 # 5. Model of rate heterogeneity:
1206 # 1 for "8 Gamma distributed rates"
1207 # 2 for "Two rates (1 invariable + 1 variable)"
1208 # 3 for "Mixed (1 invariable + 8 Gamma rates)"
1209 # otherwise: Uniform rate
1212 my $matrix_option = $_[ 1 ];
1213 my $number_of_seqs = $_[ 2 ];
1214 my $parameter_estimates_option = $_[ 3 ];
1215 my $rate_heterogeneity_option = $_[ 4 ];
1220 &testForTextFilePresence( $in );
1222 $mat = &setModelForPuzzle( $matrix_option );
1223 if ( $parameter_estimates_option ) {
1224 $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option );
1226 if ( $rate_heterogeneity_option ) {
1227 $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option );
1231 if ( $number_of_seqs <= 257 ) {
1236 system( "$PUZZLE $in << !
1242 && die "$0: Could not execute \"$PUZZLE\"";
1251 # Preparation of the pwd file
1252 sub addDistsToQueryToPWDfile {
1253 my $pwd_file = $_[ 0 ];
1254 my $disttoquery_file = $_[ 1 ];
1255 my $outfile = $_[ 2 ];
1256 my $name_of_query = $_[ 3 ];
1257 my $name_of_query_ = "";
1258 my $return_line_pwd = "";
1259 my $return_line_dq = "";
1262 my $name_from_pwd = "X";
1263 my $name_from_dq = "Y";
1264 my @dists_to_query = ();
1267 &testForTextFilePresence( $pwd_file );
1268 &testForTextFilePresence( $disttoquery_file );
1270 $name_of_query_ = $name_of_query;
1271 for ( my $j = 0; $j <= ( $LENGTH_OF_NAME - length( $name_of_query ) - 1 ); ++$j ) {
1272 $name_of_query_ .= " ";
1275 open( OUT_AD, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" );
1276 open( IN_PWD, "$pwd_file" ) || &dieWithUnexpectedError( "Cannot open file \"$pwd_file\"" );
1277 open( IN_DQ, "$disttoquery_file" ) || &dieWithUnexpectedError( "Cannot open file \"$disttoquery_file\"" );
1279 W: while ( $return_line_pwd = <IN_PWD> ) {
1282 if ( $return_line_pwd =~ /^\s*(\d+)\s*$/ ) {
1286 print OUT_AD "$name_of_query_ ";
1287 for ( my $j = 0; $j < $i; ++$j ) {
1288 print OUT_AD "$dists_to_query[ $j ] ";
1290 print OUT_AD "0.0\n";
1292 print OUT_AD " $num_of_sqs\n";
1294 @dists_to_query = ();
1299 && $return_line_pwd =~ /^\s*(\S+)\s+\S+/ ) {
1300 $name_from_pwd = $1;
1302 if ( !defined( $return_line_dq = <IN_DQ> ) ) {
1303 &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" );
1306 if ( $return_line_dq !~ /\S/ ) {
1307 if ( !defined( $return_line_dq = <IN_DQ> ) ) {
1308 &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" );
1311 $return_line_dq =~ /^\s*(\S+)\s+(\S+)/;
1313 $dists_to_query[ $i++ ] = $2;
1316 if ( $name_from_pwd ne $name_from_dq ) {
1317 &dieWithUnexpectedError( "Order of sequence names in \"$pwd_file\" and \"$disttoquery_file\" is not the same" );
1319 print OUT_AD $return_line_pwd;
1323 && $return_line_pwd =~ /^\s*(\S+)\s+\S+/ ) {
1324 $name_from_pwd = $1;
1325 if ( !defined( $return_line_dq = <IN_DQ> ) ) {
1326 &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" );
1328 if ( $return_line_dq !~ /\S/ ) {
1329 if ( !defined( $return_line_dq = <IN_DQ>) ) {
1330 &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" );
1333 $return_line_dq =~ /^\s*\S+\s+(\S+)/;
1334 $dists_to_query[ $i++ ] = $1;
1335 print OUT_AD $return_line_pwd;
1338 print OUT_AD "$name_of_query_ ";
1339 for ( my $j = 0; $j < $i; ++$j ) {
1340 print OUT_AD "$dists_to_query[ $j ] ";
1342 print OUT_AD "0.0\n";
1349 } ## addDistsToQueryToPWDfile
1357 # 3. outputfile name
1358 # Last modified: 02/27/01
1359 sub executeHmmfetch {
1363 my $outfile = $_[ 2 ];
1365 system( "$HMMFETCH $db $name > $outfile" )
1366 && &dieWithUnexpectedError( "Could not execute \"$HMMFETCH $db $name > $outfile\"" );
1369 } ## executeHmmfetch
1373 # Checks wether a file is present, not empty and a plain textfile.
1374 # One argument: name of file.
1375 # Last modified: 07/07/01
1376 sub testForTextFilePresence {
1378 unless ( ( -s $file ) && ( -f $file ) && ( -T $file ) ) {
1379 dieWithUnexpectedError( "File \"$file\" does not exist, is empty, or is not a plain textfile" );
1381 } ## testForTextFilePresence
1384 # Last modified: 02/21/03
1385 sub addSlashAtEndIfNotPresent {
1386 my $filename = $_[ 0 ];
1387 $filename =~ s/\s+//g;
1388 unless ( $filename =~ /\/$/ ) {
1389 $filename = $filename."/";
1392 } ## addSlashAtEndIfNotPresent
1396 # Last modified: 02/15/02
1397 sub exitWithWarning {
1400 if ( defined( $_[ 1 ] ) && $_[ 1 ] == 1 ) {
1401 print( "<H4 class=\"error\">user error</H4>\n" );
1403 print( "<B>$text</B>\n" );
1405 print( "<P>  </P>\n" );
1408 print( "\n\n$text\n\n" );
1413 } ## exit_with_warning
1417 # Last modified: 02/15/02
1418 sub dieWithUnexpectedError {
1422 die( "\n\n$0:\nUnexpected error (should not have happened):\n$text\n$!\n\n" );
1424 } ## dieWithUnexpectedError