6 # Copyright (C) 2003 Christian M. Zmasek
9 # Author: Christian M. Zmasek
10 # zmasek@genetics.wustl.edu
11 # http://www.genetics.wustl.edu/eddy/people/zmasek/
14 # Last modified 03/25/03
18 # Calculates trees based on Pfam alignments or precalculated distances using
24 use lib $FindBin::Bin;
28 # To use _your_ species list make $MY_SPECIES_NAMES_FILE point to it.
29 # To use _your_ TrEMBL ACDEOS make $MY_TREMBL_ACDEOS_FILE point to it.
31 my $MY_SPECIES_NAMES_FILE = $SPECIES_NAMES_FILE; # $SPECIES_NAMES_FILE is inherited
34 my $MY_TREMBL_ACDEOS_FILE = $TREMBL_ACDEOS_FILE; # $TREMBL_ACDEOS_FILE is inherited
37 my $MY_TEMP_DIR = $TEMP_DIR_DEFAULT; # $TEMP_DIR_DEFAULT is inherited
40 my $LOGFILE = "00_xt_logfile";
41 my $PWD_SUFFIX = ".pwd";
42 my $ALN_SUFFIX = ".aln";
44 my $use_precalc_pwd = 0; # 0: input is Pfam aligments ($input_dir must point to "/Pfam/Full/").
45 # 1: input is precalculated pairwise distancs ($input_dir must point to "").
46 my $use_precalc_pwd_and_aln = 0;# 0: otherwise
47 # 1: input is precalculated pairwise distancs
48 # _and_ alns,$use_precalc_pwd = 1 ($input_dir must point to alns).
49 my $add_species = 0; # "I": 0: do nothing with species information.
50 # "S": 1: add species code to TrEMBL sequences and ignore sequences from
51 # species not in $MY_SPECIES_NAMES_FILE (only if input is Pfam aligments).
52 my $options = ""; # Options for makeTree.pl, see makeTree.pl.
53 # Do not use F [Pairwise distance (pwd) file as input (instead of alignment)]
54 # since this is determined with $USE_PRECALC_PWD
55 my $min_seqs = 0; # Minimal number of sequences (TREE-PUZZLE needs at least four seqs).
56 # Ignored if $USE_PRECALC_PWD = 1
57 my $max_seqs = 0; # Maximal number of sequences.
58 # Ignored if $USE_PRECALC_PWD = 1
60 my $input_dir_aln = ""; # for .aln files
67 my %AC_OS = (); # AC -> species name
68 my %Species_names_hash = ();
71 my $already_present = 0;
72 my @too_small_names = ();
73 my @too_large_names = ();
74 my @already_present_names = ();
77 # Analyzes the options:
78 # ---------------------
80 unless ( @ARGV == 3 || @ARGV == 4 || @ARGV == 6 ) {
86 $use_precalc_pwd_and_aln = 0;
87 $options = $ARGV[ 0 ];
88 $input_dir = $ARGV[ 1 ];
89 $output_dir = $ARGV[ 2 ];
92 elsif ( @ARGV == 4 ) {
94 $use_precalc_pwd_and_aln = 1;
95 $options = $ARGV[ 0 ];
96 $input_dir = $ARGV[ 1 ];
97 $input_dir_aln = $ARGV[ 2 ];
98 $output_dir = $ARGV[ 3 ];
100 $input_dir_aln = &addSlashAtEndIfNotPresent( $input_dir_aln );
103 $use_precalc_pwd = 0;
104 $use_precalc_pwd_and_aln = 0;
105 $add_species = $ARGV[ 0 ];
106 $options = $ARGV[ 1 ];
107 $min_seqs = $ARGV[ 2 ];
108 $max_seqs = $ARGV[ 3 ];
109 $input_dir = $ARGV[ 4 ];
110 $output_dir = $ARGV[ 5 ];
111 if ( $min_seqs < 4 ) {
114 if ( $add_species eq "I" ) {
117 elsif ( $add_species eq "S" ) {
121 print( "\nFirst must be either \"I\" [Ignore species] or\n\"S\" [add Species code to TrEMBL sequences and ignore sequences from species not in $MY_SPECIES_NAMES_FILE].\n\n" );
128 $input_dir = &addSlashAtEndIfNotPresent( $input_dir );
129 $output_dir = &addSlashAtEndIfNotPresent( $output_dir );
130 $MY_TEMP_DIR = &addSlashAtEndIfNotPresent( $MY_TEMP_DIR );
135 # This adds a "-" before the options for makeTree:
136 # ------------------------------------------------
137 unless ( $options =~ /^-/ ) {
138 $options = "-".$options;
143 # If based on pwd, species are "fixed" and certain options for makeTree
144 # are not applicable and option "F" is mandatory:
145 # ---------------------------------------------------------------------
146 if ( $use_precalc_pwd == 1 ) {
150 unless ( $options =~ /F/ ) {
151 $options = $options."F";
158 if ( $use_precalc_pwd_and_aln == 1 ) {
159 unless ( $options =~ /U/ ) {
160 $options = $options."U";
163 if ( $use_precalc_pwd_and_aln == 0 && $use_precalc_pwd == 1 ) {
170 # If species are to be considered, speices names file and TrEMBL ACDEOS
171 # files need to be read in:
172 # ---------------------------------------------------------------------
173 if ( $add_species == 1 ) {
174 print "\nXT.PL: Reading species names file...\n";
175 &readSpeciesNamesFile( $MY_SPECIES_NAMES_FILE );
176 print "\nXT.PL: Reading TrEMBL ACDEOS file...\n";
177 &readTrEMBL_ACDEOS_FILE( $MY_TREMBL_ACDEOS_FILE );
182 # This creates the temp file:
183 # --------------------------
188 my $temp_file = $MY_TEMP_DIR."xt".$time.$ii;
190 while ( -e $temp_file ) {
192 $temp_file = $MY_TEMP_DIR."xt".$time.$ii;
199 opendir( DIR, $input_dir ) || error( "Cannot open directory \"$input_dir\": $!" );
203 while( defined( $filename = readdir( DIR ) ) ) {
204 if ( $filename =~ /^\.\.?$/ ) {
207 if ( $use_precalc_pwd == 1 && $filename !~ /$PWD_SUFFIX$/ ) {
210 $filenames[ $i ] = $filename;
218 FOREACH: foreach $filename ( @filenames ) {
220 # If the corresponding tree seems to already exists, do next one.
222 if ( $use_precalc_pwd == 1 ) {
223 $fn =~ s/$PWD_SUFFIX$//;
225 if ( -e "$output_dir$fn.nhx" ) {
226 $already_present_names[ $already_present++ ] = $fn;
230 if ( $use_precalc_pwd != 1 ) {
232 if ( $add_species == 1 ) {
234 # 1. Pfam flat file name
236 # Returns the number of sequences in the resulting alignment.
237 $seqs = &removeSeqsFromPfamAlign( $input_dir.$filename, $temp_file );
241 # Gets the number of seqs in the alignment.
242 open( F, "$input_dir"."$filename" );
244 if ( $_ =~/^#.+SQ\s+(\d+)\s*$/ ) {
252 if ( $seqs < $min_seqs ) {
253 $too_small_names[ $too_small++ ] = $filename;
256 if ( $seqs > $max_seqs ) {
257 $too_large_names [ $too_large++ ] = $filename;
264 if ( $use_precalc_pwd == 1 ) {
265 print "working on: $filename\n";
268 print "working on: $filename [$seqs seqs]\n";
270 print "[tree calculation $i]\n";
271 print "=====================================================================\n\n\n";
274 unlink( "$output_dir$filename.aln", "$output_dir$filename.log" );
276 print( "XT.PL: executing:\n" );
280 if ( $add_species == 1 ) {
281 $inputfile = $temp_file;
284 $inputfile = $input_dir.$filename;
287 if ( $use_precalc_pwd == 1 ) {
288 $filename =~ s/$PWD_SUFFIX$//;
291 if ( $use_precalc_pwd_and_aln == 1 ) {
292 $inputfile = $inputfile." ".$input_dir_aln.$filename.$ALN_SUFFIX;
295 my $command = "$MAKETREE $options $inputfile $output_dir$filename.nhx";
297 print( "$command\n" );
298 system( $command ) && &error( "Could not execute \"$command\"" );
300 if ( $add_species == 1 ) {
301 if ( unlink( $temp_file ) != 1 ) {
302 &error( "Unexpected: Could not delete \"$temp_file\"" );
312 print( "\n\n\nXT.PL: Done!\n" );
313 print( "Wrote \"$LOGFILE\".\n\n" );
326 print( "\nxt.pl: ERROR:\n" );
327 print( "$text\n\n" );
331 } ## dieWithUnexpectedError
333 # Similar to the method with the same name in "rio.pl".
334 # Removes sequences from a Pfam flat file.
335 # Adds species to TrEMBL seqs.
336 # It can remove all sequences not from species listed in a species names file.
338 # 1. Pfam flat file name
340 # Returns the number of sequences in the resulting alignment.
341 # Last modified: 02/22/03
342 sub removeSeqsFromPfamAlign {
343 my $infile = $_[ 0 ];
344 my $outfile = $_[ 1 ];
345 my $return_line = "";
346 my $saw_sequence_line = 0;
347 my $number_of_seqs = 0;
356 open( OUT_RNSP, ">$outfile" ) || die "\n\n$0: Unexpected error: Cannot create file \"$outfile\": $!";
357 open( IN_RNSP, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!";
358 while ( $return_line = <IN_RNSP> ) {
360 if ( $saw_sequence_line == 1
361 && !&containsPfamNamedSequence( $return_line )
362 && !&isPfamCommentLine( $return_line ) ) {
363 # This is just for counting purposes.
364 $saw_sequence_line = 2;
366 if ( &isPfamSequenceLine( $return_line ) ) {
367 if ( $saw_sequence_line == 0 ) {
368 $saw_sequence_line = 1;
370 $return_line =~ /^\s*(\S+)\s+(\S+)/;
373 if ( !&startsWithSWISS_PROTname( $return_line ) ) {
374 $seq_name =~ /^(\S+)\//;
376 unless( exists( $AC_OS{ $AC } ) ) {
377 #ACs not present in "ACDEOS" file.
381 if ( !$OS || $OS eq "" ) {
382 die "\n\n$0: Unexpected error: species for \"$AC\" not found.\n\n";
384 unless( exists( $Species_names_hash{ $OS } ) ) {
387 $seq_name =~ s/\//_$OS\//;
390 if ( $return_line =~ /_([A-Z0-9]{1,5})\// ) {
391 unless( exists( $Species_names_hash{ $1 } ) ) {
395 # remove everything whose species cannot be determined.
400 $length = length( $seq_name );
401 for ( $i = 0; $i <= ( $LENGTH_OF_NAME - $length - 1 ); $i++ ) {
404 $return_line = $seq_name.$seq."\n";
407 if ( !&isPfamCommentLine( $return_line ) ) {
408 print OUT_RNSP $return_line;
411 if ( $saw_sequence_line == 1 ) {
414 } ## while ( $return_line = <IN_RNSP> )
418 return $number_of_seqs;
420 } ## removeSeqsFromPfamAlign
428 # Reads in (SWISS-PROT) species names from a file.
429 # Names must be separated by newlines.
430 # Lines beginning with "#" are ignored.
431 # A possible "=" and everything after is ignored.
432 # One argument: species-names-file name
433 # Last modified: 04/24/01
434 sub readSpeciesNamesFile {
435 my $infile = $_[ 0 ];
436 my $return_line = "";
439 unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
440 die "\n\n$0: Error: \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n";
443 open( IN_RSNF, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!\n\n";
444 while ( $return_line = <IN_RSNF> ) {
445 if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)/ ) {
448 $Species_names_hash{ $species } = "";
454 } ## readSpeciesNamesFile
458 # Last modified: 05/18/01
459 sub readTrEMBL_ACDEOS_FILE {
460 my $infile = $_[ 0 ];
461 my $return_line = "";
463 unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
464 &error( "\"$infile\" does not exist, is empty, or is not a plain textfile" );
466 # Fill up (huge) hashs.
467 open( HH, "$infile" ) || &error( "Unexpected error: Cannot open file \"$infile\"" );
468 while ( $return_line = <HH> ) {
470 if ( $return_line =~ /(\S+);[^;]*;(\S+)/ ) {
475 } ## readTrEMBL_ACDEOS_FILE
479 # Last modified: 05/17/01
481 if ( -e "$LOGFILE" ) {
482 &error( "logfile \"$LOGFILE\" already exists, rename it or place it in another directory" );
485 open( L, ">$LOGFILE" ) || &error( "Cannot create logfile: $!" );
486 if ( $use_precalc_pwd != 1 ) {
487 print L "Trees are based directly on Pfam alignments\n";
488 if ( $add_species == 1 ) {
489 print L "Add species code to TrEMBL sequences and ignore sequences\nfrom species not in $MY_SPECIES_NAMES_FILE\n";
492 print L "Do nothing with species information\n";
496 print L "Trees are based on precalculated pairwise distances\n";
498 if ( $use_precalc_pwd_and_aln == 1 ) {
499 print L "and the matching alignments\n";
501 print L "Options for makeTree: $options\n";
502 if ( $use_precalc_pwd != 1 ) {
503 print L "Min seqs : $min_seqs\n";
504 print L "Max seqs : $max_seqs\n";
506 if ( $add_species == 1 ) {
507 print L "TrEMBL ACDEOS file : $MY_TREMBL_ACDEOS_FILE\n";
508 print L "Species names file : $MY_SPECIES_NAMES_FILE\n";
510 print L "Input directory : $input_dir\n";
511 if ( $use_precalc_pwd_and_aln == 1 ) {
512 print L "Input directory aln : $input_dir_aln\n";
514 print L "Output directory : $output_dir\n";
515 print L "Start date : ".`date`;
521 # Last modified: 05/17/01
525 print L "Successfully calculated $i trees.\n";
526 if ( $use_precalc_pwd != 1 ) {
527 print L "Too large alignments (>$max_seqs): $too_large\n";
528 print L "Too small alignments (<$min_seqs): $too_small\n";
530 print L "Alignments for which a tree appears to already exist: $already_present\n";
531 print L "Finish date : ".`date`."\n\n";
532 if ( $use_precalc_pwd != 1 ) {
533 print L "List of the $too_large alignments which were ignored because they\n";
534 print L "contained too many sequences (>$max_seqs) [after pruning]:\n";
535 for ( $j = 0; $j < $too_large; ++$j ) {
536 print L "$too_large_names[ $j ]\n";
539 print L "List of the $too_small alignments which were ignored because they\n";
540 print L "contained not enough sequences (<$min_seqs) [after pruning]:\n";
541 for ( $j = 0; $j < $too_small; ++$j ) {
542 print L "$too_small_names[ $j ]\n";
546 print L "List of the $already_present alignments which were ignored because\n";
547 print L "a tree appears to already exist:\n";
548 for ( $j = 0; $j < $already_present; ++$j ) {
549 print L "$already_present_names[ $j ]\n";
561 print " Copyright (C) 2003 Christian M. Zmasek\n";
562 print " All rights reserved\n";
564 print " Author: Christian M. Zmasek\n";
565 print " zmasek\@genetics.wustl.edu\n";
566 print " http://www.genetics.wustl.edu/eddy/forester/\n";
572 print " Tree construction using makeTree.pl based on directories\n";
573 print " of Pfam alignments or precalculated pairwise distances.\n";
579 print " Input is Pfam aligments:\n";
580 print " xt.pl <I or S> <options for makeTree.pl> <minimal number of seqs>\n";
581 print " <maximal number of seqs> <input directory: Pfam aligments> <output directory>\n";
583 print " Input is precalculated pairwise distancs:\n";
584 print " xt.pl <options for makeTree.pl> <input directory: pairwise distances \"$PWD_SUFFIX\"> <output directory>\n";
586 print " Input is precalculated pairwise distancs and corresponding alignment files:\n";
587 print " xt.pl <options for makeTree.pl> <input directory: pairwise distances \"$PWD_SUFFIX\">\n";
588 print " <input directory: corresponding alignment files \"$ALN_SUFFIX\"> <output directory>\n";
594 print " \"xt.pl S NS21UTRB100DX 4 200 DB/PFAM/Full/ trees/\"\n";
596 print " \"xt.pl FLB100R /pfam2pwd_out/ trees/\"\n";
598 print " \"xt.pl FULB100R /pfam2pwd_out/ /pfam2pwd_out/ trees/\"\n";
604 print " I: ignore species information (use all sequences)\n";
605 print " S: add species codes to TrEMBL sequences and ignore sequences\n";
606 print " from species not in $MY_SPECIES_NAMES_FILE,\n";
607 print " species codes are extracted from $MY_TREMBL_ACDEOS_FILE\n";
610 print " Options for makeTree\n";
611 print " --------------------\n";
613 print " N : Suggestion to remove columns in the alignment which contain gaps.\n";
614 print " Gaps are not removed, if, after removal of gaps, the resulting\n";
615 print " alignment would be shorter than $MIN_NUMBER_OF_AA aa (\$MIN_NUMBER_OF_AA).\n";
616 print " Default is not to remove gaps.\n";
617 print " Bx : Number of bootstrapps. B0: do not bootstrap. Default: 100 bootstrapps.\n";
618 print " The number of bootstrapps should be divisible by 10.\n";
619 print " U : Use TREE-PUZZLE to calculate ML branchlengths for consesus tree, in case of\n";
620 print " bootstrapped analysis.\n";
621 print " J : Use JTT matrix (Jones et al. 1992) in TREE-PUZZLE, default: PAM.\n";
622 print " L : Use BLOSUM 62 matrix (Henikoff-Henikoff 92) in TREE-PUZZLE, default: PAM.\n";
623 print " M : Use mtREV24 matrix (Adachi-Hasegawa 1996) in TREE-PUZZLE, default: PAM.\n";
624 print " W : Use WAG matrix (Whelan-Goldman 2000) in TREE-PUZZLE, default: PAM.\n";
625 print " T : Use VT matrix (Mueller-Vingron 2000) in TREE-PUZZLE, default: PAM.\n";
626 print " P : Let TREE-PUZZLE choose which matrix to use, default: PAM.\n";
627 print " R : Randomize input order in PHYLIP NEIGHBOR.\n";
628 print " Sx : Seed for random number generator(s). Must be 4n+1. Default is 9.\n";
629 print " X : To keep multiple tree file (=trees from bootstrap resampled alignments).\n";
630 print " D : To keep (and create, in case of bootstrap analysis) pairwise distance\n";
631 print " matrix file. This is created form the not resampled aligment.\n";
632 print " C : Calculate pairwise distances only (no tree). Bootstrap is always 1.\n";
633 print " No other files are generated.\n";
634 print " F : Pairwise distance (pwd) file as input (instead of alignment).\n";
635 print " No -D, -C, and -N options available in this case.\n";
636 print " V : Verbose\n";