4 # Copyright (C) 1999-2002 Washington University School of Medicine
5 # and Howard Hughes Medical Institute
8 # Author: Christian M. Zmasek
9 # zmasek@genetics.wustl.edu
10 # http://www.genetics.wustl.edu/eddy/people/zmasek/
14 # Last modified: 01/27/02
17 # Bootstrap resamples an alignment in PHYLIP sequential format <bootstraps> times.
18 # Bootstrapping is not done randomly but according to a BSP (bootstrap
20 # The BSP file can be created with the Perl program "bootstrap_cz.pl"
22 # This prgram has the same functionality as "bootstrap_cz.pl" in mode 1.
23 # Sequence names are normalized to LENGTH_OF_NAME characters.
24 # The output alignment is in PHYLIP's sequential or interleaved format.
25 # (These two are the same in this case, since all the seqs will be one
26 # line in length (no returns in seq).)
28 # Usage: bootstrap_cz <bootstraps> <alignment infile> <BSP file> <outfile>
29 # [number of processors]
40 #define LENGTH_OF_NAME 26
43 static char **names, /* This stores the sequence names */
44 **sequences; /* This stores the sequences */
45 static int number_of_seqs,
49 void readInAlignmnet( const char * );
50 void bootstrapAccordingToBSPfile( int, const char *, const char * );
51 void checkForMemAllocFailure( void * );
52 int fileExists( const char *);
53 void errorInCommandLine();
59 /* Reads the seqs and seq-names from inalignment */
60 /* into **sequences and **sequences. */
61 /* Inalignment must be in PHYLIP sequential format. */
62 /* Last modified: 06/25/01 */
63 void readInAlignment( const char *inalignment ) {
65 FILE *inalignment_fp = NULL;
68 register char c = ' ';
77 inalignment_fp = fopen( inalignment, "r" );
78 if ( inalignment_fp == NULL ) {
79 printf( "\nbootstrap_cz: Error: Could not open alignment file for reading.\n" );
83 if ( fscanf( inalignment_fp, "%d", &number_of_seqs ) != 1 ) {
84 printf( "\nbootstrap_cz: Error: Could not read in number of seqs.\n" );
87 if ( fscanf( inalignment_fp, "%d", &number_of_colm ) != 1 ) {
88 printf( "\nbootstrap_cz: Error: Could not read in number of columns.\n" );
92 names = malloc( number_of_seqs * sizeof( char *) );
93 checkForMemAllocFailure( names );
94 for ( i = 0; i < number_of_seqs; ++i ) {
95 names[ i ] = malloc( LENGTH_OF_NAME * sizeof( char ) );
96 checkForMemAllocFailure( names[ i ] );
99 sequences = malloc( number_of_seqs * sizeof( char * ) );
100 checkForMemAllocFailure( sequences );
101 for ( i = 0; i < number_of_seqs; ++i ) {
102 sequences[ i ] = malloc( number_of_colm * sizeof( char ) );
103 checkForMemAllocFailure( sequences[ i ] );
106 max_length = ( 30 * LENGTH_OF_NAME ) + number_of_colm;
108 str = malloc( max_length * sizeof( char * ) );
109 checkForMemAllocFailure( str );
111 while ( fgets( str, max_length, inalignment_fp ) != NULL ) {
113 if ( !isspace( str[ 0 ] ) != 0 ) {
116 while ( str[ i ] != ' ' ) {
117 names[ seq ][ i ] = str[ i ];
122 while ( ii < LENGTH_OF_NAME ) {
123 names[ seq ][ ii ] = ' ';
129 while ( str[ i ] != '\n' && str[ i ] != '\r' && str[ i ] != '\0' ) {
132 if ( isupper( c ) != 0 || c == '-' ) {
133 sequences[ seq ][ z++ ] = c;
136 printf( "\nbootstrap_cz: Error: Sequence must be represented by uppercase letters A-Z and \"-\" only.\n" );
141 if ( z > number_of_colm ) {
142 printf( "\nbootstrap_cz: Error: line in \"%s\" contains more than %d columns.\n",
143 inalignment, number_of_colm );
147 if ( z != number_of_colm ) {
148 printf( "\nbootstrap_cz: Error: line in \"%s\" contains a incorrect number of columns.\n",
155 if ( seq > number_of_seqs ) {
156 printf( "\nbootstrap_cz: Error: \"%s\" contains more than %d seqs.\n",
157 inalignment, number_of_seqs );
163 } /* while ( fgets ) */
165 if ( seq != number_of_seqs ) {
166 printf( "\nbootstrap_cz: Error: \"%s\" contains a incorrect number of seqs.\n",
171 fclose( inalignment_fp );
175 } /* readInAlignment */
179 /* Rearrenges the aa in sequences according to */
180 /* the bsp (bootstrap positions) file bsp_file. */
181 /* Writes the results to outfile */
182 /* Last modified: 06/07/01 */
183 void bootstrapAccordingToBSPfile( int bootstraps,
184 const char *bsp_file,
185 const char *outfile ) {
187 FILE *bsp_file_fp = NULL,
189 int *positions = NULL,
191 register int boot = 0,
195 positions = malloc( number_of_colm * sizeof( int ) );
196 checkForMemAllocFailure( positions );
199 bsp_file_fp = fopen( bsp_file, "r" );
200 if ( bsp_file_fp == NULL ) {
201 printf( "\nbootstrap_cz: Error: could not open file \"%s\" for reading.\n",
206 outfile_fp = fopen( outfile, "w" );
207 if ( outfile_fp == NULL ) {
208 printf( "\nbootstrap_cz: Error: could not open file \"%s\" for writing.\n",
213 for ( boot = 0; boot < bootstraps; ++boot ) {
215 for ( i = 0; i < number_of_colm; ++i ) {
216 if ( fscanf( bsp_file_fp, "%d", &p ) != 1 ) {
217 printf( "\nbootstrap_cz: Error: file \"%s\" does not correspond to alignment.\n",
224 fprintf( outfile_fp, " %d %d\n", number_of_seqs, number_of_colm );
225 for ( seq = 0; seq < number_of_seqs; ++seq ) {
226 for ( i = 0; i < LENGTH_OF_NAME; ++i ) {
227 fprintf( outfile_fp, "%c", names[ seq ][ i ] );
229 for ( i = 0; i < number_of_colm; ++i ) {
230 fprintf( outfile_fp, "%c", sequences[ seq ][ positions[ i ] ] );
232 fprintf( outfile_fp, "\n" );
236 /* Now, the bsp file must not contain any more numbers */
237 if ( fscanf( bsp_file_fp, "%d", &p ) == 1 ) {
238 printf( "\nbootstrap_cz: Error: file \"%s\" does not correspond to alignment (too long).\n",
240 printf( ">%d<\n", p );
241 printf( "number of seqs=%d\n", number_of_seqs );
245 fclose( bsp_file_fp );
246 fclose( outfile_fp );
250 } /* bootstrapAccordingToBSPfile */
254 /* Rearrenges the aa in sequences according to */
255 /* the bsp (bootstrap positions) file bsp_file. */
256 /* Writes the results to outfile */
257 /* Last modified: 01/25/02 */
258 void bootstrapAccordingToBSPfileP( int bootstraps,
260 const char *bsp_file,
261 const char *outfile ) {
263 FILE *bsp_file_fp = NULL,
265 int *positions = NULL,
267 char *outfile_ = NULL;
268 register int boot = 0,
277 block_size = ( int ) bootstraps / processors;
278 larger_blocks = bootstraps - ( block_size * processors ); /* number of blocks which have a size of
281 positions = malloc( number_of_colm * sizeof( int ) );
282 checkForMemAllocFailure( positions );
284 outfile_ = malloc( ( strlen( outfile ) + 20 ) * sizeof( char ) );
285 checkForMemAllocFailure( outfile_ );
287 bsp_file_fp = fopen( bsp_file, "r" );
288 if ( bsp_file_fp == NULL ) {
289 printf( "\nbootstrap_cz: Error: could not open file \"%s\" for reading.\n",
298 for ( boot = 0; boot < bootstraps; ++boot ) {
300 for ( i = 0; i < number_of_colm; ++i ) {
301 if ( fscanf( bsp_file_fp, "%d", &p ) != 1 ) {
302 printf( "\nbootstrap_cz: Error: file \"%s\" does not correspond to alignment.\n",
311 if ( larger_blocks > 0 ) {
312 if ( j >= block_size + 1 ) {
318 else if ( j >= block_size ) {
325 fclose( outfile_fp );
327 sprintf( outfile_, "%s%d", outfile, z++ );
328 if ( fileExists( outfile_ ) == 1 ) {
329 printf( "\nbootstrap_cz: Error: outfile \"%s\" already exists.\n",
333 outfile_fp = fopen( outfile_, "w" );
334 if ( outfile_fp == NULL ) {
335 printf( "\nbootstrap_cz: Error: could not open file \"%s\" for writing.\n",
342 fprintf( outfile_fp, " %d %d\n", number_of_seqs, number_of_colm );
343 for ( seq = 0; seq < number_of_seqs; ++seq ) {
344 for ( i = 0; i < LENGTH_OF_NAME; ++i ) {
345 fprintf( outfile_fp, "%c", names[ seq ][ i ] );
347 for ( i = 0; i < number_of_colm; ++i ) {
348 fprintf( outfile_fp, "%c", sequences[ seq ][ positions[ i ] ] );
350 fprintf( outfile_fp, "\n" );
354 /* Now, the bsp file must not contain any more numbers */
355 if ( fscanf( bsp_file_fp, "%d", &p ) == 1 ) {
356 printf( "\nbootstrap_cz: Error: file \"%s\" does not correspond to alignment (too long).\n",
358 printf( ">%d<\n", p );
359 printf( "number of seqs=%d\n", number_of_seqs );
363 fclose( bsp_file_fp );
364 fclose( outfile_fp );
371 } /* bootstrapAccordingToBSPfileP */
376 /* Exits if *p is NULL. */
377 /* Last modified: 06/06/01 */
378 void checkForMemAllocFailure( void *p ) {
380 printf( "\nbootstrap_cz: Memory allocation failed.\n" );
386 } /* checkForMemAllocFailure */
390 /* Returns 1 if filename can be opened. */
391 /* Returns 0 otherwise. */
392 /* Last modified: 06/07/01 */
393 int fileExists( const char *filename ) {
395 if ( ( fp = fopen( filename, "r" ) ) != NULL ) {
406 void errorInCommandLine() {
408 printf( " bootstrap_cz version 3.000\n" );
409 printf( " ---------------------------\n\n" );
410 printf( " Purpose:\n" );
411 printf( " Bootstrap resamples an alignment in PHYLIP sequential format <bootstraps> times.\n" );
412 printf( " Bootstrapping is not done randomly but according to a BSP (bootstrap\n" );
413 printf( " positions) file.\n" );
414 printf( " The BSP file can be created with the Perl program \"bootstrap_cz.pl\"\n" );
415 printf( " in mode 0.\n" );
416 printf( " This prgram has the same functionality as \"bootstrap_cz.pl\" in mode 1.\n" );
417 printf( " Sequence names are normalized to LENGTH_OF_NAME characters.\n" );
418 printf( " The output alignment is in PHYLIP's sequential or interleaved format.\n" );
419 printf( " (These two are the same in this case, since all the seqs will be one\n" );
420 printf( " line in length (no returns in seq).)\n\n" );
421 printf( " Usage: bootstrap_cz <bootstraps> <alignment infile> <BSP file> <outfile>\n" );
422 printf( " [number of processors]\n\n" );
423 } /* errorInCommandLine */
427 int main( int argc, char *argv[] ) {
429 char *inalign = NULL,
436 if ( argc != 5 && argc != 6 ) {
437 errorInCommandLine();
441 bootstraps = atoi( argv[ 1 ] );
443 bsp_file = argv[ 3 ];
446 if ( bootstraps < 1 ) {
447 errorInCommandLine();
452 processors = atoi( argv[ 5 ] );
453 if ( processors < 1 ) {
454 errorInCommandLine();
457 if ( processors > bootstraps ) {
458 processors = bootstraps;
462 if ( argc == 5 && fileExists( outfile ) == 1 ) {
463 printf( "\nbootstrap_cz: Error: outfile \"%s\" already exists.\n",
468 readInAlignment( inalign );
471 bootstrapAccordingToBSPfile( bootstraps,
476 bootstrapAccordingToBSPfileP( bootstraps,