5 # Copyright (C) 1999-2003 Washington University School of Medicine
6 # and Howard Hughes Medical Institute
9 # Author: Christian M. Zmasek
10 # zmasek@genetics.wustl.edu
11 # http://www.genetics.wustl.edu/eddy/people/zmasek/
13 # Last modified 04/06/04
16 # Requirements makeTree is part of the RIO/FORESTER suite of programs.
17 # ------------ Many of its global variables are set via rio_module.pm.
20 # Note. Use xt.pl (for Pfam alignments) or mt.pl (for other alignments)
21 # to run makeTree.pl on whole directories of alignments files.
28 # Tree calculation based on a Pfam/Clustal W alignment
29 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
31 # makeTree.pl [-options] <input alignment in SELEX (Pfam), PHYLIP
32 # sequential format, or Clustal W output> <outputfile>
33 # [path/name for temporary directory to be created]
36 # "% makeTree.pl -UTB1000S41NDXV /DB/PFAM/Full/IL5 IL5_tree"
39 # Tree calculation based on precalculated pairwise distances
40 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
41 # Consensus tree will have no branch length values.
42 # Precalculated pairwise distances are the output of "pfam2pwd.pl",
43 # number of bootstraps needs to match the one used for the pwds.
45 # makeTree.pl <-options, includes "F"> <pwdfile: boostrapped pairwise
46 # distances> <outputfile> [path/name for temporary directory
50 # "% makeTree.pl -FB100S21XV /pfam2pwd_out/IL5.pwd IL5_tree"
53 # Tree calculation based on precalculated pairwise distances
54 # and matching alignment
55 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
56 # Consensus tree will have branch length values.
57 # Precalculated pairwise distances and the matching (processed)
58 # alignment are the output of "pfam2pwd.pl", number of bootstraps
59 # needs to match the one used for the pwds, matrix needs to match
60 # the one used for the pwds.
62 # makeTree.pl <-options, includes "UF"> <pwdfile: boostrapped pairwise
63 # distances> <alnfile: corresponding alignment>
64 # <outputfile> [path/name for temporary directory to be created]
67 # "% makeTree.pl -UFLB100S21XV /pfam2pwd_out/IL5.pwd /pfam2pwd_out/IL5.aln IL5_tree"
73 # N : Suggestion to remove columns in the alignment which contain gaps.
74 # Gaps are not removed, if, after removal of gaps, the resulting alignment would
75 # be shorter than $MIN_NUMBER_OF_AA. Default is not to remove gaps.
76 # Bx : Number of bootstrapps. B0: do not bootstrap. Default is 100 bootstrapps.
77 # The number of bootstrapps should be divisible by 10.
78 # U : Use TREE-PUZZLE to calculate ML branchlengths for consesus tree, in case of
79 # bootstrapped analysis.
80 # J : Use JTT matrix (Jones et al. 1992) in TREE-PUZZLE, default: PAM.
81 # L : Use BLOSUM 62 matrix (Henikoff-Henikoff 92) in TREE-PUZZLE, default: PAM.
82 # M : Use mtREV24 matrix (Adachi-Hasegawa 1996) inTREE-PUZZLE, default: PAM.
83 # W : Use WAG matrix (Whelan-Goldman 2000) in TREE-PUZZLE, default: PAM.
84 # T : Use VT matrix (Mueller-Vingron 2000) in TREE-PUZZLE, default: PAM.
85 # P : Let TREE-PUZZLE choose which matrix to use, default: PAM.
86 # E : Exact parameter estimates in TREE-PUZZLE, default: Approximate.
87 # Model of rate heterogeneity in TREE-PUZZLE (default: Uniform rate)
88 # g : 8 Gamma distributed rates
89 # t : Two rates (1 invariable + 1 variable)
90 # m : Mixed (1 invariable + 8 Gamma rates)
91 # R : Randomize input order in PHYLIP NEIGHBOR.
92 # A : Use PHYLIP PROTPARS instead of NEIGHBOR (and no pairwise distance calculation).
93 # jx : Number of jumbles when using PHYLIP PROTPARS (random seed set with Sx).
94 # Sx : Seed for random number generator(s). Must be 4n+1. Default is 9.
95 # X : To keep multiple tree file (=trees from bootstrap resampled alignments).
96 # D : To keep (and create in case of bootstrap analysis) pairwise distance matrix file.
97 # This is created form the not resampled (original) alignment.
98 # C : Calculate pairwise distances only (no tree). Bootstrap is always 1.
99 # No other files are generated.
100 # F : Pairwise distance (pwd) file as input (instead of alignment).
101 # No -D, -C, and -N options available in this case.
103 # # : Only for rio.pl: Do not calculate consensus tree ("I" option in rio.pl).
110 # 09/06/03: Added "#" option (to be used only for rio.pl).
111 # 03/24/04: Do not replace "?" with "-" in method pfam2phylip.
117 use lib $FindBin::Bin;
120 my $VERSION = "4.210";
122 my $TEMP_DIR_DEFAULT = "/tmp/maketree"; # Where all the infiles, outfiles, etc will be created.
124 my $remove_gaps = 0; # 0: do not remove gaps; 1: remove gaps
125 my $bootstraps = 100; # 0,1: do not bootstrap. Default: 100
126 my $puzzle_consensus_tree = 0; # 0: no; 1: yes. No is default.
127 my $matrix = 1; # 0 = JTT
134 my $rate_heterogeneity = 0; # 0 = Uniform rate (default)
135 # 1 = 8 Gamma distributed rates
136 # 2 = Two rates (1 invariable + 1 variable)
137 # 3 = Mixed (1 invariable + 8 Gamma rates)
138 my $randomize_input_order = 0; # 0: do not randomize input order; 1 jumble
139 my $seed = 9; # Seed for random number generators. Default: 9
140 my $keep_multiple_trees = 0; # 0: delete multiple tree file
141 # 1: do not delete multiple tree file
142 my $keep_distance_matrix = 0; # 1: (create and) keep; 0: do not (create and) keep
143 my $verbose = 0; # 0: no; 1: yes
144 my $pairwise_dist_only = 0; # 0: no; 1: yes
145 my $start_with_pwd = 0; # 0: no; 1: yes
146 my $start_with_pwd_and_aln = 0; # 0: no; 1: yes
147 my $no_consenus_tree = 0; # 0: no; 1: yes
148 my $exact_parameter_est = 0; # 0: no; 1: yes
149 my $use_protpars = 0; # 0: no; 1: yes
150 my $protpars_jumbles = 0;
152 my %seqnames = (); # number => seqname
153 my %numbers = (); # seqname => number
161 my $multitreefile = "";
162 my $distancefile = "";
164 my $number_of_aa = 0;
168 my $current_dir = "";
170 my $number_of_seqs = 0;
174 unless ( @ARGV == 2 || @ARGV == 3 || @ARGV == 4 || @ARGV == 5 ) {
181 # Analyzes the options:
182 # ---------------------
184 if ( $ARGV[ 0 ] =~ /^-.+/ ) {
186 unless ( @ARGV > 2 ) {
190 $options = $ARGV[ 0 ];
192 if ( $options =~ /F/ && $options !~ /U/ ) {
193 if ( @ARGV != 3 && @ARGV != 4 ) {
200 $pwdfile = $ARGV[ 1 ];
202 $outfile = $ARGV[ 2 ];
204 $temp_dir = $ARGV[ 3 ];
208 elsif ( $options =~ /F/ && $options =~ /U/ ) {
209 if ( @ARGV != 4 && @ARGV != 5 ) {
214 $start_with_pwd_and_aln = 1;
215 $pwdfile = $ARGV[ 1 ];
216 $infile = $ARGV[ 2 ];
217 $outfile = $ARGV[ 3 ];
219 $temp_dir = $ARGV[ 4 ];
224 if ( @ARGV != 3 && @ARGV != 4 ) {
228 $infile = $ARGV[ 1 ];
229 $outfile = $ARGV[ 2 ];
231 $temp_dir = $ARGV[ 3 ];
235 if ( $options =~ /N/ && $start_with_pwd != 1 ) {
236 $remove_gaps = 1; # do remove gaps
238 if ( $options =~ /B(\d+)/ ) {
240 if ( $bootstraps <= 1 ) {
243 elsif ( $bootstraps <= 9 ) {
245 print "\n\nMAKETREE: WARNING: Bootstrap number must be devisable by 10,\nno bootstrapping.\n\n";
247 elsif ( $bootstraps % 10 != 0 ) {
248 $bootstraps = $bootstraps - $bootstraps % 10; # to ensure $bootstraps % 10 == 0
249 print "\n\nMAKETREE: WARNING: Bootstrap number must be devisable by 10,\nhas been set to $bootstraps.\n\n";
252 if ( $options =~ /A/ ) {
253 $use_protpars = 1 # PROTPARS
255 if ( $options =~ /j(\d+)/ ) {
256 $protpars_jumbles = $1;
257 if ( $protpars_jumbles < 0 ) {
258 $protpars_jumbles = 0;
261 if ( $options =~ /J/ ) {
264 if ( $options =~ /L/ ) {
265 $matrix = 2; # Blossum
267 if ( $options =~ /M/ ) {
268 $matrix = 3; # mtREV24
270 if ( $options =~ /T/ ) {
273 if ( $options =~ /W/ ) {
276 if ( $options =~ /P/ ) {
279 if ( $options =~ /R/ ) {
280 $randomize_input_order = 1;
282 if ( $options =~ /S(\d+)/ ) {
285 if ( $options =~ /U/ ) {
286 $puzzle_consensus_tree = 1;
288 if ( $options =~ /X/ ) {
289 $keep_multiple_trees = 1;
291 if ( $options =~ /D/ && $start_with_pwd != 1 ) {
292 $keep_distance_matrix = 1;
294 if ( $options =~ /V/ ) {
297 if ( $options =~ /C/ && $start_with_pwd != 1 ) {
298 $pairwise_dist_only = 1;
300 if ( $options =~ /E/ ) {
301 $exact_parameter_est = 1;
303 if ( $options =~ /g/ ) {
304 $rate_heterogeneity = 1;
306 if ( $options =~ /t/ ) {
307 $rate_heterogeneity = 2;
309 if ( $options =~ /m/ ) {
310 $rate_heterogeneity = 3;
312 if ( $options =~ /#/ ) {
313 $no_consenus_tree = 1;
315 if ( $protpars_jumbles > 0 && $use_protpars != 1 ) {
319 if ( $use_protpars == 1 ) {
320 if ( $randomize_input_order >= 1
321 || $start_with_pwd == 1
322 || $keep_distance_matrix == 1
323 || $pairwise_dist_only == 1 ) {
327 if ( $bootstraps > 1 && $protpars_jumbles < 1 ) {
328 $protpars_jumbles = 1;
335 unless ( @ARGV == 2 || @ARGV == 3 ) {
339 $infile = $ARGV[ 0 ];
340 $outfile = $ARGV[ 1 ];
342 $temp_dir = $ARGV[ 2 ];
349 $current_dir = `pwd`;
350 $current_dir =~ s/\s//;
352 if ( $outfile !~ /^\// ) {
353 # outfile is not absolute path.
354 $outfile = $current_dir."/".$outfile;
359 if ( $pairwise_dist_only == 1 ) {
361 $keep_multiple_trees = 0;
362 $puzzle_consensus_tree = 0;
363 $randomize_input_order = 0;
365 $keep_distance_matrix = 1;
368 if ( $bootstraps < 2 ) {
369 $no_consenus_tree = 0;
372 # TREE-PUZZLE sets the option in this way:
373 # If two rates or mixed, exact parameter estimates are used.
374 if ( $rate_heterogeneity == 2
375 || $rate_heterogeneity == 3 ) {
376 $exact_parameter_est = 1
379 $logfile = $outfile.$LOG_FILE_SUFFIX;
380 $alignfile = $outfile.$ALIGN_FILE_SUFFIX;
381 $multitreefile = $outfile.$MULTIPLE_TREES_FILE_SUFFIX;
382 $distancefile = $outfile.$SUFFIX_PWD_NOT_BOOTS;
384 if ( $outfile =~ /\.nhx$/i ) {
385 $outfilenhx = $outfile;
386 $logfile =~ s/\.nhx//i;
387 $alignfile =~ s/\.nhx//i;
388 $outfile =~ s/\.nhx//i;
389 $multitreefile =~ s/\.nhx//i;
390 $distancefile =~ s/\.nhx//i;
393 $outfilenhx = $outfile.".nhx";
396 if ( -e $outfilenhx ) {
397 die "\n\nmakeTree: \"$outfilenhx\" already exists.\n\n";
399 if ( $infile ne "" ) {
400 unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
401 die "\n\nmakeTree: Input alignment file \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n";
404 if ( $start_with_pwd == 1 ) {
405 unless ( ( -s $pwdfile ) && ( -f $pwdfile ) && ( -T $pwdfile ) ) {
406 die "\n\nmakeTree: Pairwise distance file \"$pwdfile\" does not exist, is empty, or is not a plain textfile.\n\n";
412 # Prints out the options:
413 # -----------------------
416 $log = "\n$0 logfile:\n";
417 $log = $log."Version: $VERSION\n\n";
420 if ( $start_with_pwd == 1 ) {
421 $log = $log."Input pairwise distance file (bootstrapped): $pwdfile\n";
423 if ( $infile ne "" ) {
424 $log = $log."Input alignment : $infile\n";
427 if ( $no_consenus_tree != 1 ) {
428 $log = $log."Output tree file : $outfilenhx\n";
431 if ( $keep_multiple_trees == 1 && $bootstraps >= 2 ) {
432 $log = $log."Output multiple trees file : $multitreefile\n";
434 if ( $keep_distance_matrix ) {
435 $log = $log."Output pairwise distance file : $distancefile\n";
438 $log = $log."Bootstraps : $bootstraps\n";
440 if ( $start_with_pwd != 1 && $use_protpars != 1 ) {
441 $log = $log."Prgrm to calculate pairwise dist. : TREE-PUZZLE\n";
444 if ( $use_protpars == 1 ) {
445 $log = $log."Program to calculate tree : PHYLIP PROTPARS\n";
446 $log = $log."Number of jumbles in PROTPARS : $protpars_jumbles\n";
449 $log = $log."Program to calculate tree : PHYLIP NEIGHBOR (NJ)\n";
452 if ( $puzzle_consensus_tree == 1 ) {
453 $log = $log."Prgrm to calculate ML branch lenghts: TREE-PUZZLE\n";
455 if ( $puzzle_consensus_tree == 1 || $start_with_pwd != 1 ) {
456 $log = $log."Model : ";
457 if ( $matrix == 0 ) {
458 $log = $log."JTT (Jones et al. 1992)\n";
460 elsif ( $matrix == 2 ) {
461 $log = $log."BLOSUM 62 (Henikoff-Henikoff 92)\n";
463 elsif ( $matrix == 3 ) {
464 $log = $log."mtREV24 (Adachi-Hasegawa 1996)\n";
466 elsif ( $matrix == 5 ) {
467 $log = $log."VT (Mueller-Vingron 2000)\n";
469 elsif ( $matrix == 6 ) {
470 $log = $log."WAG (Whelan-Goldman 2000)\n";
472 elsif ( $matrix == 7 ) {
473 $log = $log."auto\n";
476 $log = $log."PAM (Dayhoff et al. 1978)\n";
479 $log = $log."Model of rate heterogeneity : ";
480 if ( $rate_heterogeneity == 1 ) {
481 $log = $log."8 Gamma distributed rates\n";
483 elsif ( $rate_heterogeneity == 2 ) {
484 $log = $log."Two rates (1 invariable + 1 variable)\n";
486 elsif ( $rate_heterogeneity == 3 ) {
487 $log = $log."Mixed (1 invariable + 8 Gamma rates)\n";
490 $log = $log."Uniform rate\n";
492 if ( $randomize_input_order >= 1 ) {
493 $log = $log."Randomize input order in NEIGHBOR : yes\n";
495 $log = $log."Seed for random number generators : $seed\n";
496 if ( $exact_parameter_est == 1 ) {
497 $log = $log."Exact parameter estimates in TREE-PUZZLE\n";
500 $log = $log."Start time/date : ".`date`;
505 # That's where the mischief starts....
506 # ------------------------------------
512 if ( $temp_dir eq "" ) {
513 $temp_dir = $TEMP_DIR_DEFAULT;
516 $temp_dir = $temp_dir.$time_st.$ii;
518 while ( -e $temp_dir ) {
520 $temp_dir = $temp_dir.$time_st.$ii;
523 mkdir( $temp_dir, 0700 )
524 || die "\n\n$0: Unexpected error: Could not create <<$temp_dir>>: $!\n\n";
526 unless ( ( -e $temp_dir ) && ( -d $temp_dir ) ) {
527 die "\n\n$0: Unexpected error: <<$temp_dir>> does not exist, or is not a directory.\n\n";
531 if ( $start_with_pwd != 1 ) {
532 system( "cp", $infile, $temp_dir."/INFILE" );
533 unless ( chmod ( 0600, $temp_dir."/INFILE" ) ) {
534 warn "\n\n$0: Could not chmod. $!\n\n";
541 || die "\n\n$0: Unexpected error: Could not chdir to <<$temp_dir>>: $!\n\n";
544 if ( $start_with_pwd != 1 ) {
546 @out = &DoPfam2phylip( $infile, $alignfile, $remove_gaps );
547 $number_of_aa = $out[ 0 ];
548 $orig_length = $out[ 1 ];
549 $number_of_seqs = $out[ 2 ];
551 system( "cp", $alignfile, "infile" );
553 if ( $use_protpars != 1 ) {
554 # Calculating the pairwise distances (saved in file "infile"): "puzzle"
556 system( "cp", $alignfile, "align" );
558 if ( $bootstraps > 1 ) {
560 &executeSeqboot( $seed, $bootstraps );
562 if ( $keep_distance_matrix ) {
563 system( "mv", "outfile", "outfile___" );
564 system( "cp", "align", "infile" );
565 &executePuzzle( "infile",
567 $exact_parameter_est,
568 $rate_heterogeneity );
569 system( "mv", "infile.dist", $distancefile );
570 system( "mv", "outfile___", "outfile" );
572 unlink( "infile" ); # Necessary, since "infile" is puzzle's default input.
573 system( "mv", "outfile", "IN" );
575 &executePuzzleBootstrapped( "IN",
577 $exact_parameter_est,
578 $rate_heterogeneity );
580 $pwdfile = "IN.dist";
585 &executePuzzle( "infile",
587 $exact_parameter_est,
588 $rate_heterogeneity );
590 if ( $keep_distance_matrix ) {
591 system( "cp outdist $distancefile" );
593 $pwdfile = "infile.dist";
596 unlink( "infile.tree" );
598 if ( $pairwise_dist_only == 1 ) {
599 unlink( "infile", "align", "INFILE", "outdist", $alignfile );
600 chdir( $current_dir )
601 || die "\n\n$0: Unexpected error: Could not chdir to <<$current_dir>>: $!\n\n";
604 || die "\n\n$0: Unexpected error: Could not remove <<$temp_dir>>: $!\n\n";
606 print "\n\n$0 finished.\n\n";
607 print "Output pairwise distance file written as: $distancefile\n\n";
608 print "\n\nmakeTree successfully terminated.\n\n";
612 } ## if ( $use_protpars != 1 )
614 } ## if ( $start_with_pwd != 1 )
617 # Calculating the tree (saved in file "infile"):
619 if ( $use_protpars != 1 ) {
621 &executeNeighbor( $pwdfile, $bootstraps, $randomize_input_order, $seed, 1 );
624 if ( $bootstraps > 1 ) {
625 &executeSeqboot( $seed, $bootstraps );
627 system( "mv", "outfile", "infile" );
629 &executeProtpars( "infile", $bootstraps, $protpars_jumbles, $seed );
634 if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) {
636 system( "cp", "outtree", $multitreefile );
640 system( "mv", "outtree", "intree" );
642 if ( $bootstraps > 1 ) {
643 if ( $no_consenus_tree != 1 ) {
646 &executeConsense( "intree" );
648 if ( $puzzle_consensus_tree == 1 ) {
650 system( "cp", "outtree", "treefile_consense" );
651 system( "mv", "outtree", "intree" );
653 # Puzzle for ML branch lenghts:
654 # The alignment is read from infile by default.
655 # The tree is read from intree by default.
657 if ( $start_with_pwd_and_aln == 1 ) {
658 &pfam2phylipMatchOnly( $infile,
662 elsif ( $use_protpars != 1 ) {
663 system( "mv", "align", "infile" ); # align = original alignment in phylip interleaved.
666 &executePuzzleToCalculateBranchLenghts( $matrix,
667 $exact_parameter_est,
668 $rate_heterogeneity );
670 unlink( "outfile", "outdist" );
671 system( "mv", "outtree", "outree_puzzle" );
674 &executeTransfersBranchLenghts( "outree_puzzle", "treefile_consense", $outfilenhx );
678 unlink( "outfile", "align" );
679 system( "mv", "outtree", $outfilenhx );
683 unlink( "outfile", "align" );
688 unlink( "align", "infile.dist" );
689 if ( $start_with_pwd != 1 ) {
690 system( "mv intree $outfilenhx" );
696 unlink( "treefile_consense", "outtree", "outree_puzzle",
697 "infile", "intree", "align", "INFILE", "IN", "IN.dist", "outdist" );
700 $log = $log."Finish time/date : ".`date`;
702 if ( $start_with_pwd != 1 ) {
703 $log = $log."Removed gaps : ";
704 if ( $remove_gaps == 1 ) {
710 $log = $log."Columns in alignment used : $number_of_aa\n";
711 $log = $log."Columns in original alignment : $orig_length\n";
712 $log = $log."Number of sequences in alignment : $number_of_seqs\n";
716 open( OUT, ">$logfile" ) || die "\n$0: Cannot create file <<$logfile>>: $!\n";
721 chdir( $current_dir )
722 || die "\n\n$0:Unexpected error: Could not chdir to <<$current_dir>>: $!\n\n";
726 || die "\n\n$0:Unexpected error: Could not remove <<$temp_dir>>: $!\n\n";
728 if ( $verbose == 1 ) {
729 print "\n\n$0 finished.\n";
730 if ( $no_consenus_tree != 1 ) {
731 print "Output tree written as : $outfilenhx\n";
733 print "Log written as : $logfile\n";
734 if ( $start_with_pwd != 1 ) {
735 print "Alignment written as : $alignfile\n";
737 if ( $keep_multiple_trees == 1 && $bootstraps >= 2 ) {
738 print "Multiple trees written as : $multitreefile\n";
740 if ( $keep_distance_matrix ) {
741 print "Distance matrix written as: $distancefile\n";
758 # Executes pfam2phylip.
759 # If resulting alignment is too short due to the removal
760 # of gaps, is does not remove gaps.
764 # 3. remove gaps: 1 to remove gaps; 0: do not remove gaps
765 # Last modified: 06/04/01
769 my $option = $_[ 2 ];
773 if ( $option == 1 ) {
774 @output = &pfam2phylip( $in, $out, 1 );
777 die "\n\n$0: DoPfam2phylip: Unexpected error.\n\n";
779 if ( $aa < $MIN_NUMBER_OF_AA ) {
785 if ( $option == 0 ) { # Must be another "if" (no elsif of else)!
786 @output = &pfam2phylip( $in, $out, 2 );
787 # 2 is to substitute non-letters with "-" in the sequence.
790 die "\n\n$0: DoPfam2phylip: Unexpected error.\n\n";
799 # 1. seed for random number generator
800 # 2. number of bootstraps
801 # Reads in "infile" by default.
808 &testForTextFilePresence( $infile );
810 if ( $verbose != 1 ) {
816 system( "$SEQBOOT << !
822 && die "$0: Could not execute \"$SEQBOOT\"";
831 # One/two/three argument(s):
832 # Reads in tree from "intree" by default. (Presence of "intree" automatically
833 # switches into "User defined trees" mode.)
834 # 1. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
835 # 5 = VT; 6 = WAG; 7 = auto; PAM otherwise
836 # 2. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise
837 # 3. Model of rate heterogeneity:
838 # 1 for "8 Gamma distributed rates"
839 # 2 for "Two rates (1 invariable + 1 variable)"
840 # 3 for "Mixed (1 invariable + 8 Gamma rates)"
841 # otherwise: Uniform rate
842 # Last modified: 09/08/03 (added 2nd and 3rd parameter)
843 sub executePuzzleToCalculateBranchLenghts {
844 my $matrix_option = $_[ 0 ];
845 my $parameter_estimates_option = $_[ 1 ];
846 my $rate_heterogeneity_option = $_[ 2 ];
852 unless ( ( -s "infile" ) && ( -f "infile" ) && ( -T "infile" ) ) {
853 die "\n$0: executePuzzleToCalculateBranchLenghts: <<infile>> does not exist, is empty, or is not a plain textfile.\n";
855 unless ( ( -s "intree" ) && ( -f "intree" ) && ( -T "intree" ) ) {
856 die "\n$0: executePuzzleToCalculateBranchLenghts: <<intree>> does not exist, is empty, or is not a plain textfile.\n";
859 $mat = setModelForPuzzle( $matrix_option );
860 if ( $parameter_estimates_option ) {
861 $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option );
863 if ( $rate_heterogeneity_option ) {
864 $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option );
867 system( "$PUZZLE << !
872 && die "$0: Could not execute \"$PUZZLE\"";
884 # Three/four arguments:
885 # 1. Name of file containing tree with correct branch lengths
886 # 2. Name of file containing tree with correct bootstraps
888 # 4. R to reroot both trees in the same manner (use for FITCH,
889 # since this changes to rooting.
890 sub executeTransfersBranchLenghts {
891 my $tree_with_bl = $_[ 0 ];
892 my $tree_with_bs = $_[ 1 ];
894 my $reroot = $_[ 3 ];
897 if ( $reroot && $reroot eq "R" ) {
901 &testForTextFilePresence( $tree_with_bl );
902 &testForTextFilePresence( $tree_with_bs );
904 system( "$TRANSFERSBRANCHLENGHTS $tree_with_bl $tree_with_bs $out $R" )
905 && die "$0: Could not execute \"$TRANSFERSBRANCHLENGHTS $tree_with_bl $tree_with_bs $out $R\"";
913 # Called by method DoPfam2phylip.
914 # This reads a multiple sequence alignment file in Pfam format,
915 # Phylip's sequential format, or ClustalW (".aln")output and saves them
916 # in Phylip's sequential or interleaved format.
917 # (Those two are the same in this case, since all the seqs will be
918 # one line in length (no returns)).
919 # It returns (1st) the number of aa (columns) in the resulting
920 # alignment and the (2nd) number of aa (columns) in the original
923 # Reads a file containing a sequence alignment in the following format
925 # #comments <- empty lines and lines begining with # (not mandatory)
928 # <- at least one empty line between blocks
932 # Saves it in the "sequential" format of phylip:
933 # number of OTUs length of aa seqs
940 # 3. 1 : Removes colums with a gap (non-letter character)
941 # 2 : Substitutes non-letter characters (except "?") in the sequence with "-".
943 # Last modified: 03/24/04
945 # 03/24/04: Do not replace "?" with "-"
949 my $infile = $_[ 0 ];
950 my $outfile = $_[ 1 ];
951 my $options = $_[ 2 ]; # 1: remove gaps; 2: non-letters (except "?") -> "-"
952 my $return_line = "";
956 my $original_length = 0;
966 my $saw_a_sequence_line = 0;
969 die "\n$0: pfam2phylip: <<$outfile>> already exists.\n";
972 &testForTextFilePresence( $infile );
974 open( INPP, "$infile" ) || die "\n$0: pfam2phylip: Cannot open file <<$infile>>: $!\n";
976 until ( $return_line !~ /^\s*\S+\s+\S+/ && $saw_a_sequence_line == 1 ) {
977 if ( $return_line =~ /^\s*\S+\s+\S+/
978 && $return_line !~ /^\s*#/
979 && $return_line !~ /^\s*\d+\s+\d+/
980 && $return_line !~ /^\s*CLUSTAL/ ) {
981 $saw_a_sequence_line = 1;
982 $return_line =~ /^\s*(\S+)\s+(\S+)/;
983 $seq_name[ $y ] = $1;
985 $seq_name[ $y ] = substr( $seq_name[ $y ], 0, $LENGTH_OF_NAME );
987 for ( $x = 0; $x <= length( $seq ) - 1; $x++ ) {
988 $seq_array[ $x ][ $y ] = substr( $seq, $x, 1 );
990 if ( $x_offset < length( $seq ) ) {
991 $x_offset = length( $seq );
995 $return_line = <INPP>;
996 if ( !$return_line ) {
1005 while ( $return_line = <INPP> ) {
1006 if ( $return_line =~ /^\s*(\S+)\s+(\S+)/
1007 && $return_line !~ /^\s*#/
1008 && $return_line !~ /^\s*\d+\s+\d+/ ) {
1009 $return_line =~ /^\s*\S+\s+(\S+)/;
1011 for ( $x = 0; $x <= length( $seq ) - 1; $x++ ) {
1012 $seq_array[ $x + $x_offset ][ $y % $max_y ] = substr( $seq, $x, 1 );
1014 if ( $max_x < length( $seq ) ) {
1015 $max_x = length( $seq );
1018 if ( ( $y % $max_y ) == 0 ) {
1020 $x_offset = $x_offset + $max_x;
1025 $original_length = $x_offset;
1029 # Removes "gap-columns" (gaps = everything except a-z characters):
1030 if ( $options == 1 ) {
1033 COLUMN: for ( $x = 0; $x <= $x_offset - 1; $x++ ) { # goes through all aa positions (columns)
1035 for ( $y = 0; $y <= $max_y - 1; $y++ ) { # goes through all aas in a particular position
1037 unless ( $seq_array[ $x ][ $y ] && $seq_array[ $x ][ $y ] =~ /[a-z]/i ) {
1043 # If this point is reached, column must be OK = no gaps.
1045 for ( $m = 0; $m <= $max_y; $m++ ) {
1046 for ( $n = $x; $n <= $x_offset; $n++ ) {
1047 $seq_array[ $n - $move ][ $m ] = $seq_array[ $n ][ $m ];
1050 $x_offset = $x_offset - $move;
1056 for ( $m = 0; $m <= $max_y; $m++ ) {
1057 for ( $n = $x; $n <= $x_offset; $n++ ) {
1058 $seq_array[ $n - $move ][ $m ] = $seq_array[ $n ][ $m ];
1061 $x_offset = $x_offset - $move;
1070 open( OUTPP, ">$outfile" ) || die "\n$0: pfam2phylip: Cannot create file <<$outfile>>: $!\n";
1071 print OUTPP "$max_y $x_offset\n";
1072 for ( $y = 0; $y < $max_y; $y++ ) {
1073 print OUTPP "$seq_name[ $y ]";
1074 for ( $i = 0; $i <= ( $LENGTH_OF_NAME - length( $seq_name[ $y ] ) - 1 ); $i++ ) {
1077 for ( $x = 0; $x <= $x_offset - 1; $x++ ) {
1078 if ( $options == 2 ) {
1079 if ( $seq_array[ $x ][ $y ] ) {
1080 $seq_array[ $x ][ $y ] =~s /[^a-zA-Z\?]/-/;
1083 $seq_array[ $x ][ $y ] = "-";
1086 print OUTPP "$seq_array[ $x ][ $y ]";
1092 return $x_offset, $original_length, $max_y;
1102 print " makeTree.pl version $VERSION\n";
1103 print " -----------\n";
1107 Copyright (C) 1999-2003 Washington University School of Medicine
1108 and Howard Hughes Medical Institute
1111 Author: Christian M. Zmasek
1112 zmasek\@genetics.wustl.edu
1113 http://www.genetics.wustl.edu/eddy/people/zmasek/
1115 Last modified 09/06/03
1118 Requirements makeTree is part of the RIO/FORESTER suite of programs.
1119 ------------ Many of its global variables are set via rio_module.pm.
1123 Note. Use xt.pl (for Pfam alignments) or mt.pl (for other alignments)
1124 to run makeTree.pl on whole directories of alignments files.
1131 Tree calculation based on a Pfam/Clustal W alignment
1132 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1134 makeTree.pl [-options] <input alignment in SELEX (Pfam), PHYLIP
1135 sequential format, or Clustal W output> <outputfile>
1136 [path/name for temporary directory to be created]
1139 "% makeTree.pl -UTB1000S41NDXV /DB/PFAM/Full/IL5 IL5_tree"
1142 Tree calculation based on precalculated pairwise distances
1143 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1144 Consensus tree will have no branch length values.
1145 Precalculated pairwise distances are the output of "pfam2pwd.pl",
1146 number of bootstraps needs to match the one used for the pwds.
1148 makeTree.pl <-options, includes "F"> <pwdfile: boostrapped pairwise
1149 distances> <outputfile> [path/name for temporary directory
1153 "% makeTree.pl -FB100S21XV /pfam2pwd_out/IL5.pwd IL5_tree"
1156 Tree calculation based on precalculated pairwise distances
1157 and matching alignment
1158 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1159 Consensus tree will have branch length values.
1160 Precalculated pairwise distances and the matching (processed)
1161 alignment are the output of "pfam2pwd.pl", number of bootstraps
1162 needs to match the one used for the pwds, matrix needs to match
1163 the one used for the pwds.
1165 makeTree.pl <-options, includes "UF"> <pwdfile: boostrapped pairwise
1166 distances> <alnfile: corresponding alignment>
1167 <outputfile> [path/name for temporary directory to be created]
1170 "% makeTree.pl -UFLB100S21XV /pfam2pwd_out/IL5.pwd /pfam2pwd_out/IL5.aln IL5_tree"
1176 N : Suggestion to remove columns in the alignment which contain gaps.
1177 Gaps are not removed, if, after removal of gaps, the resulting alignment would
1178 be shorter than $MIN_NUMBER_OF_AA. Default is not to remove gaps.
1179 Bx : Number of bootstrapps. B0: do not bootstrap. Default is 100 bootstrapps.
1180 The number of bootstrapps should be divisible by 10.
1181 U : Use TREE-PUZZLE to calculate ML branchlengths for consesus tree, in case of
1182 bootstrapped analysis.
1183 J : Use JTT matrix (Jones et al. 1992) in TREE-PUZZLE, default: PAM.
1184 L : Use BLOSUM 62 matrix (Henikoff-Henikoff 92) in TREE-PUZZLE, default: PAM.
1185 M : Use mtREV24 matrix (Adachi-Hasegawa 1996) inTREE-PUZZLE, default: PAM.
1186 W : Use WAG matrix (Whelan-Goldman 2000) in TREE-PUZZLE, default: PAM.
1187 T : Use VT matrix (Mueller-Vingron 2000) in TREE-PUZZLE, default: PAM.
1188 P : Let TREE-PUZZLE choose which matrix to use, default: PAM.
1189 E : Exact parameter estimates in TREE-PUZZLE, default: Approximate.
1190 Model of rate heterogeneity in TREE-PUZZLE (default: Uniform rate)
1191 g : 8 Gamma distributed rates
1192 t : Two rates (1 invariable + 1 variable)
1193 m : Mixed (1 invariable + 8 Gamma rates)
1194 R : Randomize input order in PHYLIP NEIGHBOR.
1195 A : Use PHYLIP PROTPARS instead of NEIGHBOR (and no pairwise distance calculation).
1196 jx : Number of jumbles when using PHYLIP PROTPARS (random seed set with Sx).
1197 Sx : Seed for random number generator(s). Must be 4n+1. Default is 9.
1198 X : To keep multiple tree file (=trees from bootstrap resampled alignments).
1199 D : To keep (and create in case of bootstrap analysis) pairwise distance matrix file.
1200 This is created form the not resampled (original) alignment.
1201 C : Calculate pairwise distances only (no tree). Bootstrap is always 1.
1202 No other files are generated.
1203 F : Pairwise distance (pwd) file as input (instead of alignment).
1204 No -D, -C, and -N options available in this case.
1206 # : Only for rio.pl: Do not calculate consensus tree ("I" option in rio.pl).