5 # Copyright (C) 1999-2003 Washington University School of Medicine
6 # and Howard Hughes Medical Institute
9 # Author: Christian M. Zmasek
10 # zmasek@genetics.wustl.edu
11 # http://www.genetics.wustl.edu/eddy/people/zmasek/
15 # Last modified 08/26/03
18 # Bootstrap resamples an alignment in PHYLIP sequential format <bootstraps>
20 # Amino acid sequences must only be represented by uppercase letters (A-Z)
22 # In mode 0 it saves the positions which it used to create the
23 # bootstrapped alignment into <positions outfile>.
24 # Mode 1 allows recreating exactly the same bootstrapped alignment
25 # by reading in a <positions infile>.
26 # Sequence names are normalized to $LENGTH_OF_NAME characters.
27 # The output alignment is in PHYLIP's sequential or interleaved format.
28 # (These two are the same in this case, since all the seqs will be one
29 # line in length (no returns in seq).)
32 # bootstrap_cz.pl <mode (0 or 1)> <bootstraps> <alignment infile>
33 # <alignment outfile> <positions out- (mode 0) or in-file (mode 1)>
34 # [random number seed (mode 0 only)]
39 use lib $FindBin::Bin;
43 my $VERSION = "2.001";
45 my $modus = -1; # 0 to create pos. file, 1 to use premade pos. file
48 my $outalign_file = "";
49 my $positions_file = "";
54 $bootstraps = $ARGV[ 1 ];
56 $outalign_file = $ARGV[ 3 ];
57 $positions_file = $ARGV[ 4 ];
60 if ( @ARGV != 5 && @ARGV != 6 ) {
65 if ( $modus != 0 && $modus != 1 ) {
70 if ( $modus == 0 && @ARGV != 6 ) {
75 if ( $modus == 1 && @ARGV != 5 ) {
80 if ( $bootstraps < 1 ) {
85 if ( $seed && $seed < 0 ) {
91 unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
92 die "\n\nbootstrap_cz.pl: \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n";
94 if ( -e $outalign_file ) {
95 die "\n\nbootstrap_cz.pl: \"$outalign_file\" already exists.\n\n";
99 if ( -e $positions_file ) {
100 die "\n\nbootstrap_cz.pl: \"$positions_file\" already exists.\n\n";
104 unless ( ( -s $positions_file ) && ( -f $positions_file ) && ( -T $positions_file ) ) {
105 die "\n\nbootstrap_cz.pl: \"$positions_file\" does not exist, is empty, or is not a plain textfile.\n\n";
110 &bootstrap( $modus, $bootstraps, $infile, $outalign_file, $positions_file, $seed );
113 &bootstrap( $modus, $bootstraps, $infile, $outalign_file, $positions_file );
125 # Five/six arguments:
126 # 1. Mode: 0 to create pos. file, 1 to use premade pos. file
128 # 3. Alignment infile name
130 # 5. file name for positions file (created if mode is 0, read if mode is 1)
131 # [6. If modus is 0: seed for random number generator]
133 # This method is very similar to method "pfam2phylip" in "makeTree.pl".
135 # Last modified: 05/17/01
140 my $bootstraps = $_[ 1 ];
141 my $infile = $_[ 2 ];
142 my $outalign_file = $_[ 3 ];
143 my $positions_file = $_[ 4 ];
147 my @random_numbers = ();
148 my $return_line = "";
153 my $original_length = 0;
159 my $number_of_seqs = 0;
160 my $number_of_colm = 0;
163 # Checks the arguments
164 # --------------------
168 die "\n\n$0: bootstrap: Failed to give a seed for random number generator.\n\n";
172 elsif( $modus == 1 ) {
174 die "\n\n$0: bootstrap: Must not give a seed for random number generator.\n\n";
176 unless ( ( -s $positions_file ) && ( -f $positions_file ) && ( -T $positions_file ) ) {
177 die "\n\n$0: bootstrap: <<$positions_file>> does not exist, is empty, or is not a plain textfile.\n\n";
181 die "\n\n$0: bootstrap: modus must be either 0 or 1.\n\n";
184 unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
185 die "\n\n$0: bootstrap: <<$infile>> does not exist, is empty, or is not a plain textfile.\n\n";
190 # Reads in the alignment
191 # ----------------------
193 open( IN, "$infile" ) || die "\n$0: bootstrap: Cannot open file <<$infile>>: $!";
194 while ( $return_line = <IN> ) {
196 if ( $return_line =~ /^\s*(\d+)\s+(\d+)/ ) {
197 $number_of_seqs = $1;
198 $number_of_colm = $2;
200 elsif ( $return_line =~ /^(\S+)\s+(\S+)/ ) {
201 $seq_name[ $seq_no ] = substr( $1, 0, $LENGTH_OF_NAME );
203 if ( $original_length == 0 ) {
204 $original_length = length( $seq );
206 elsif ( $original_length != length( $seq ) ) {
207 die "\n\n$0: Sequences do not have the same length.\n\n";
209 for ( $x = 0; $x < $original_length; $x++ ) {
210 $seq_array[ $x ][ $seq_no ] = substr( $seq, $x, 1 );
217 if ( ( $number_of_seqs != $seq_no )
218 || ( $number_of_colm != $original_length ) ) {
219 die "\n\n$0: Number of sequences or number of columns are inconsisten with the values given in the alignment.\n\n";
222 # Adjusts the length of the names to $LENGTH_OF_NAME
223 # --------------------------------------------------
225 for ( $y = 0; $y < $seq_no; $y++ ) {
226 $length = length( $seq_name[ $y ] );
227 for ( $i = 0; $i <= ( $LENGTH_OF_NAME - $length - 1 ); $i++ ) {
228 $seq_name[ $y ] .= " ";
234 # Bootstraps $bootstraps times and writes the outputfiles
235 # -------------------------------------------------------
237 open( OUT, ">$outalign_file" ) || die "\n\n$0: bootstrap: Cannot create file <<$outalign_file>>: $!";
239 open( OUT_P, ">$positions_file" ) || die "\n\n$0: bootstrap: Cannot create file <<$positions_file>>: $!";
242 open( IN_P, "$positions_file" ) || die "\n\n$0: bootstrap: Cannot open file <<$positions_file>>: $!";
245 for ( $n = 0; $n < $bootstraps; $n++ ) {
248 for ( $x = 0; $x < $original_length; $x++ ) {
249 $random = int( rand( $original_length ) );
250 print OUT_P "$random ";
251 $random_numbers[ $x ] = $random;
256 $return_line = <IN_P>;
257 if ( !$return_line || $return_line !~ /\d/ ) {
258 die "\n\n$0: bootstrap: <<$positions_file>> seems too short or otherwise unsuitable.\n\n";
260 $return_line =~ s/^\s+//;
261 $return_line =~ s/\s+$//;
262 @random_numbers = split( /\s+/, $return_line );
263 if ( scalar( @random_numbers ) != $original_length ) {
264 die "\n\n$0: bootstrap: <<$positions_file>> seems not to correspond to <<$infile>>.\n\n";
268 print OUT " $seq_no $original_length\n";
270 for ( $y = 0; $y < $seq_no; $y++ ) {
271 print OUT "$seq_name[ $y ]";
273 for ( $x = 0; $x < $original_length; $x++ ) {
274 $random = $random_numbers[ $x ];
275 if ( !$seq_array[ $random ][ $y ] || $seq_array[ $random ][ $y ] !~ /[A-Z]|-/ ) {
276 die "\n\n$0: Sequence must be represented by uppercase letters A-Z and \"-\" only.\n\n";
278 print OUT $seq_array[ $random ][ $y ];
302 print " bootstrap_cz.pl $VERSION\n";
303 print " ---------------\n";
305 print " Christian Zmasek (zmasek\@genetics.wustl.edu)\n";
308 print " Bootstrap resamples an alignment in PHYLIP sequential format\n";
309 print " <bootstraps> times.\n";
310 print " In mode 0 it saves the positions which it used to create the\n";
311 print " bootstrapped alignment into <positions outfile>.\n";
312 print " Mode 1 allows to recreate exactly the same boostrapped alignment\n";
313 print " by reading in a <positions infile>.\n";
314 print " Sequence names are normalized to $LENGTH_OF_NAME characters.\n";
315 print " The output alignment is in PHYLIP's sequential or interleaved format.\n";
316 print " (These two are the same in this case, since all the seqs will be one\n";
317 print " line in length (no returns in seq).)\n";
320 print " bootstrap_cz.pl <mode (0 or 1)> <bootstraps> <alignment infile>\n";
321 print " <alignment outfile> <positions out (mode 0) or infile (mode 1)>\n";
322 print " [random number seed (mode 0 only)]\n";