5 # Copyright (C) 1999-2002 Washington University School of Medicine
6 # and Howard Hughes Medical Institute
9 # Author: Christian M. Zmasek
10 # zmasek@genetics.wustl.edu
11 # http://www.genetics.wustl.edu/eddy/people/zmasek/
15 # Last modified 02/20/03
18 # See RIO_INSTALL on how to use this program.
19 # ------------------------------------------
25 use lib $FindBin::Bin;
28 my $VERSION = "3.002";
32 # =============================================================================
33 # =============================================================================
35 # THESE VARIABLES NEED TO BE SET BY THE USER
36 # ------------------------------------------
40 # Pfam alignments to calculate pairwise distances from:
41 # -----------------------------------------------------
42 my $MY_PFAM_FULL_DIRECTORY = "/path/to/Pfam/Full/"; # must end with "/"
46 # This file lists all the alignments for which to calculate pairwise distances
47 # from. If left empty, ALL the alignments in $MY_PFAM_FULL_DIRECTORY
49 # ----------------------------------------------------------------------------
50 my $ALGNS_TO_USE_LIST_FILE = "";
54 # This is _VERY IMPORTANT_. It determines the species whose sequences
55 # are being used (sequences from species not listed in $MY_SPECIES_NAMES_FILE
56 # are ignored). Normally, one would use the same list as RIO uses
57 # ($SPECIES_NAMES_FILE in "rio_module.pm") -- currently "tree_of_life_bin_1-6.nhx".
59 # For certain large families (such as protein kinases), one might use a
60 # species file which contains less species in order to be able to finish
61 # the calculations in reasonable time.
62 # For example, to exclude most mammals, use:
63 # my $MY_SPECIES_NAMES_FILE = $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-6_species_list_NO_RAT_MONKEYS_APES_SHEEP_GOAT_HAMSTER"
64 # (to only use sequences from SWISS-PROT add this line:
65 # $TREMBL_ACDEOS_FILE = $PATH_TO_FORESTER."data/NO_TREMBL";)
66 # ----------------------------------------------------------------------------
67 my $MY_SPECIES_NAMES_FILE = $SPECIES_NAMES_FILE;
71 # This is were the output goes (must end with "/")
72 # ------------------------------------------------
73 my $MY_RIO_PWD_DIRECTORY = "/path/to/pfam2pwd_out/pwd/";
74 my $MY_RIO_BSP_DIRECTORY = "/path/to/pfam2pwd_out/bsp/";
75 my $MY_RIO_NBD_DIRECTORY = "/path/to/pfam2pwd_out/nbd/";
76 my $MY_RIO_ALN_DIRECTORY = "/path/to/pfam2pwd_out/aln/";
77 my $MY_RIO_HMM_DIRECTORY = "/path/to/pfam2pwd_out/hmm/";
81 # A directory to create temporary files in:
82 # -----------------------------------------
83 my $MY_TEMP_DIR = "/tmp/"; # must end with "/"
87 # Alignments in which the number of sequences after pruning (determined
88 # by "$MY_SPECIES_NAMES_FILE") is lower than this, are ignored
89 # (no calculation of pwds):
90 # ------------------------------------------------------------------
95 # Alignments in which the number of sequences after pruning (determined
96 # by "$MY_SPECIES_NAMES_FILE") is greater than this, are ignored
97 # (no calculation of pwds):
98 # ------------------------------------------------------------------
103 # Seed for the random number generator for bootstrapping (must be 4n+1):
104 # ---------------------------------------------------------------------
109 # This is used to choose the model to be used for the (ML)
110 # distance calculation:
111 # IMPORTANT: "$MY_MATRIX_FOR_PWD" in "rio_module.pm" needs to
112 # have the same value, when the pwds calculated are going to
120 # --------------------------------------------------------
126 # End of variables which need to be set by the user.
128 # =============================================================================
129 # =============================================================================
144 my $current_dir = "";
145 my $return_line = "";
147 my @too_small_names = ();
148 my @too_large_names = ();
149 my %Species_names_hash = ();
150 my %AC_OS = (); # AC -> species name
151 my %AC_DE = (); # AC -> description
152 my %ALGNS_TO_USE = (); # name of alignment -> ""
153 my $use_algns_to_use_list = 0;
154 my $LOGFILE = "00_pfam2pwd_LOGFILE";
155 $HMMBUILD = $HMMBUILD." --amino";
164 opendir( DIR, $MY_PFAM_FULL_DIRECTORY ) || die "\n\n$0: Cannot open directory $MY_PFAM_FULL_DIRECTORY: $!\n\n";
166 while( defined( $filename = readdir( DIR ) ) ) {
167 if ( $filename =~ /^\.\.?$/ ) {
170 $filenames[ $i ] = $filename;
176 &readSpeciesNamesFile( $MY_SPECIES_NAMES_FILE );
178 &readTrEMBL_ACDEOS_FILE();
180 if ( defined( $ALGNS_TO_USE_LIST_FILE ) && $ALGNS_TO_USE_LIST_FILE =~ /\w/ ) {
181 $use_algns_to_use_list = 1;
186 $current_dir = `pwd`;
187 $current_dir =~ s/\s//;
189 || die "\n\n$0: Unexpected error: Could not chdir to <<$tmp_dir>>: $!";
193 FOREACH_ALIGN: foreach $filename ( @filenames ) {
195 # If the corresponding pwd, positions, and aln files seem to already exists, do next one.
196 if ( ( -e $MY_RIO_PWD_DIRECTORY.$filename.$SUFFIX_PWD )
197 && ( -e $MY_RIO_BSP_DIRECTORY.$filename.$SUFFIX_BOOT_STRP_POS )
198 && ( -e $MY_RIO_NBD_DIRECTORY.$filename.$SUFFIX_PWD_NOT_BOOTS )
199 && ( -e $MY_RIO_ALN_DIRECTORY.$filename.$ALIGN_FILE_SUFFIX )
200 && ( -e $MY_RIO_HMM_DIRECTORY.$filename.$SUFFIX_HMM ) ) {
204 if ( $use_algns_to_use_list == 1 && !exists( $ALGNS_TO_USE{ $filename } ) ) {
209 $seqs = &removeSeqsFromPfamAlign( $MY_PFAM_FULL_DIRECTORY.$filename,
212 if ( $seqs < $MIN_SEQS ) {
213 unlink( "REM_SEQ_OUTFILE" );
214 $too_small_names[ $too_small++ ] = $filename;
217 elsif ( $seqs > $MAX_SEQS ) {
218 unlink( "REM_SEQ_OUTFILE" );
219 $too_large_names [ $too_large++ ] = $filename;
225 print " ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
226 print " $i: $filename ($seqs seqs)\n";
227 print " ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
230 # If one of the two file exists from a previous (interrupted) run.
231 unlink( $MY_RIO_PWD_DIRECTORY.$filename.$SUFFIX_PWD );
232 unlink( $MY_RIO_BSP_DIRECTORY.$filename.$SUFFIX_BOOT_STRP_POS );
233 unlink( $MY_RIO_NBD_DIRECTORY.$filename.$SUFFIX_PWD_NOT_BOOTS );
234 unlink( $MY_RIO_ALN_DIRECTORY.$filename.$ALIGN_FILE_SUFFIX );
235 unlink( $MY_RIO_HMM_DIRECTORY.$filename.$SUFFIX_HMM );
238 &executeHmmbuild( "REM_SEQ_OUTFILE",
239 $MY_RIO_ALN_DIRECTORY.$filename.$ALIGN_FILE_SUFFIX,
242 if ( unlink( "hmm" ) != 1 ) {
243 die "\n\n$0: Unexpected error: Could not delete <<hmm>>: $!";
246 if ( unlink( "REM_SEQ_OUTFILE" ) != 1 ) {
247 die "\n\n$0: Unexpected error: Could not delete <<REM_SEQ_OUTFILE>>: $!";
250 executeHmmbuildHand( $MY_RIO_ALN_DIRECTORY.$filename.$ALIGN_FILE_SUFFIX,
251 $MY_RIO_HMM_DIRECTORY.$filename.$SUFFIX_HMM );
253 system( $HMMCALIBRATE, $MY_RIO_HMM_DIRECTORY.$filename.$SUFFIX_HMM )
254 && die "\n\n$0: Could not execute \"$HMMCALIBRATE $MY_RIO_HMM_DIRECTORY.$filename.$SUFFIX_HMM\": $!";
256 &pfam2phylipMatchOnly( $MY_RIO_ALN_DIRECTORY.$filename.$ALIGN_FILE_SUFFIX, "infile" );
258 &executePuzzle( "infile", $MY_MATRIX );
260 system( "mv", "infile.dist", $MY_RIO_NBD_DIRECTORY.$filename.$SUFFIX_PWD_NOT_BOOTS )
261 && die "\n\n$0: Unexpected error: $!";
263 &executeBootstrap( "infile",
266 $MY_RIO_BSP_DIRECTORY.$filename.$SUFFIX_BOOT_STRP_POS,
269 if ( unlink( "infile" ) != 1 ) {
270 die "\n\n$0: Unexpected error: Could not delete <<infile>>: $!";
274 &executePuzzleBootstrapped( "BOOTSTRAPPED_ALGN", $MY_MATRIX );
276 ##if ( unlink( "outfile" ) != 1 ) {
277 ## die "\n\n$0: Unexpected error: Could not delete <<outfile>>: $!";
281 system( "mv", "BOOTSTRAPPED_ALGN".".dist", $MY_RIO_PWD_DIRECTORY.$filename.$SUFFIX_PWD )
282 && die "\n\n$0: Unexpected error: $!\n\n";
284 if ( unlink( "BOOTSTRAPPED_ALGN" ) != 1 ) {
285 die "\n\n$0: Unexpected error: Could not delete <<BOOTSTRAPPED_ALGN>>: $!";
290 } ## End of FOREACH_ALIGN loop.
293 chdir( $current_dir )
294 || die "\n\n$0: Unexpected error: Could not chdir to <<$current_dir>>: $!";
301 print( "pfam2pwd.pl: Done.\n" );
302 print( "Successfully calculated $i pairwise distance files.\n" );
303 print( "Too large alignments (>$MAX_SEQS): $too_large\n" );
304 print( "Too small alignments (<$MIN_SEQS): $too_small\n" );
305 print( "See the logfile \"$MY_RIO_PWD_DIRECTORY".$LOGFILE."\"\n" );
321 # 1. Stockholm alignment
324 # Returns the options used.
325 # Last modified: 06/26/01
326 sub executeHmmbuild {
329 my $outalignment = $_[ 1 ];
330 my $outhmm = $_[ 2 ];
333 unless ( ( -s $full ) && ( -f $full ) && ( -T $full ) ) {
334 die "\n\n$0: \"$full\" does not exist, is empty, or is not a plain textfile.\n\n";
337 $options = getHmmbuildOptionsFromPfam( $full );
344 $options =~ s/-o\s+\S+//;
345 $options =~ s/(\s|^)[^-]\S+/ /g;
347 if ( $options =~ /--prior/ ) {
348 my $basename = basename( $full );
349 $basename .= ".PRIOR";
350 $options =~ s/--prior/--prior $PRIOR_FILE_DIR$basename/;
353 # Remove for versions of HMMER lower than 2.2.
354 if ( $options =~ /--informat\s+\S+/ ) {
355 $options =~ s/--informat\s+\S+/-/;
358 system( "$HMMBUILD $options -o $outalignment $outhmm $full" )
359 && die "\n\n$0: Could not execute \"$HMMBUILD $options -o $outalignment $outhmm $full\".\n\n";
363 } ## executeHmmbuild.
367 # 1. Stockholm alignment
369 # Returns the options used.
370 # Last modified: 06/26/01
371 sub executeHmmbuildHand {
374 my $outhmm = $_[ 1 ];
377 unless ( ( -s $full ) && ( -f $full ) && ( -T $full ) ) {
378 die "\n\n$0: \"$full\" does not exist, is empty, or is not a plain textfile.\n\n";
381 $options = getHmmbuildOptionsFromPfam( $full );
388 $options =~ s/-o\s+\S+//;
389 $options =~ s/(\s|^)[^-]\S+/ /g;
391 if ( $options =~ /--prior/ ) {
392 my $basename = basename( $full );
393 $basename .= ".PRIOR";
394 $options =~ s/--prior/--prior $PRIOR_FILE_DIR$basename/;
397 # Remove for versions of HMMER lower than 2.2.
398 if ( $options =~ /--informat\s+\S+/ ) {
399 $options =~ s/--informat\s+\S+/-/;
402 system( "$HMMBUILD --hand $options $outhmm $full" )
403 && die "\n\n$0: Could not execute \"$HMMBUILD -- hand $options $outhmm $full\".\n\n";
407 } ## executeHmmbuildHand.
413 # Last modified: 02/26/01
414 sub getHmmbuildOptionsFromPfam {
416 my $infile = $_[ 0 ];
417 my $return_line = "";
420 unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
421 die "\n\n$0: \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n";
424 open( GHO, $infile ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!";
425 while ( $return_line = <GHO> ) {
426 if ( $return_line =~ /^\s*#.*hmmbuild\s+(.+)\s*$/ ) {
435 } ## getHmmbuildOptionsFromPfam
439 # Similar to the method with the same name in "rio.pl".
440 # Removes sequences from a Pfam flat file.
441 # Adds species to TrEMBL seqs.
442 # It can remove all sequences not from species listed in a species names file.
443 # It can remove all sequences which do not have a SWISS-PROT name (XXXX_XXXXX)
445 # 1. Pfam flat file name
447 # 3. 1 to remove TrEMBL seqs with "(FRAGMENT)" in their DE line.
448 # Returns the number of sequences in the resulting alignment.
449 # If a query name is given, it returns -1 if query is not found in alignment,
450 # -10 if the name is not unique.
451 # Last modified: 05/24/02
452 sub removeSeqsFromPfamAlign {
453 my $infile = $_[ 0 ];
454 my $outfile = $_[ 1 ];
455 my $remove_frags = $_[ 2 ];
456 my $return_line = "";
457 my $saw_sequence_line = 0;
458 my $number_of_seqs = 0;
468 open( OUT_RNSP, ">$outfile" ) || die "\n\n$0: Unexpected error: Cannot create file \"$outfile\": $!";
469 open( IN_RNSP, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!";
470 while ( $return_line = <IN_RNSP> ) {
472 if ( $saw_sequence_line == 1
473 && !&containsPfamNamedSequence( $return_line )
474 && !&isPfamCommentLine( $return_line ) ) {
475 # This is just for counting purposes.
476 $saw_sequence_line = 2;
478 if ( &isPfamSequenceLine( $return_line ) ) {
479 if ( $saw_sequence_line == 0 ) {
480 $saw_sequence_line = 1;
482 $return_line =~ /^\s*(\S+)\s+(\S+)/;
485 if ( !&startsWithSWISS_PROTname( $return_line ) ) {
486 $seq_name =~ /^(\S+)\//;
488 unless( exists( $AC_OS{ $AC } ) ) {
489 #ACs not present in "ACDEOS" file.
493 if ( !$OS || $OS eq "" ) {
494 die "\n\n$0: Unexpected error: species for \"$AC\" not found.\n\n";
496 unless( exists( $Species_names_hash{ $OS } ) ) {
499 if ( $remove_frags == 1 ) {
501 if ( $DE && $DE =~ /\(FRAGMENT\)/ ) {
505 $seq_name =~ s/\//_$OS\//;
508 if ( $return_line =~ /_([A-Z0-9]{1,5})\// ) {
509 unless( exists( $Species_names_hash{ $1 } ) ) {
513 # remove everything whose species cannot be determined.
518 $length = length( $seq_name );
519 for ( $i = 0; $i <= ( $LENGTH_OF_NAME - $length - 1 ); $i++ ) {
522 $return_line = $seq_name.$seq."\n";
525 if ( !&isPfamCommentLine( $return_line ) ) {
526 print OUT_RNSP $return_line;
529 if ( $saw_sequence_line == 1 ) {
532 } ## while ( $return_line = <IN_RNSP> )
536 return $number_of_seqs;
538 } ## removeSeqsFromPfamAlign
546 # Reads in (SWISS-PROT) species names from a file.
547 # Names must be separated by newlines.
548 # Lines beginning with "#" are ignored.
549 # A possible "=" and everything after is ignored.
550 # One argument: species-names-file name
551 # Last modified: 04/24/01
552 sub readSpeciesNamesFile {
553 my $infile = $_[ 0 ];
554 my $return_line = "";
557 unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
558 die "\n\n$0: Error: \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n";
561 open( IN_RSNF, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!\n\n";
562 while ( $return_line = <IN_RSNF> ) {
563 if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)/ ) {
566 $Species_names_hash{ $species } = "";
572 } ## readSpeciesNamesFile
576 # Last modified: 02/21/03
577 sub readTrEMBL_ACDEOS_FILE {
579 my $return_line = "";
581 unless ( ( -s $TREMBL_ACDEOS_FILE ) && ( -f $TREMBL_ACDEOS_FILE ) && ( -T $TREMBL_ACDEOS_FILE ) ) {
582 die "\n\n$0: Error: \"$TREMBL_ACDEOS_FILE\" does not exist, is empty, or is not a plain textfile.\n\n";
584 # Fill up (huge) hashs.
585 open( HH, "$TREMBL_ACDEOS_FILE" ) || die "\n\n$0: Unexpected error: Cannot open file <<$TREMBL_ACDEOS_FILE>>: $!\n\n";
586 while ( $return_line = <HH> ) {
588 if ( $return_line =~ /(\S+);([^;]*);(\S+)/ ) {
594 } ## readTrEMBL_ACDEOS_FILE
597 # Last modified: 02/21/03
600 my $return_line = "";
602 unless ( ( -s $ALGNS_TO_USE_LIST_FILE ) && ( -f $ALGNS_TO_USE_LIST_FILE ) && ( -T $ALGNS_TO_USE_LIST_FILE ) ) {
603 die "\n\n$0: Error: \"$ALGNS_TO_USE_LIST_FILE\" does not exist, is empty, or is not a plain textfile.\n\n";
606 open( LF, "$ALGNS_TO_USE_LIST_FILE" ) || die "\n\n$0: Unexpected error: Cannot open file <<$ALGNS_TO_USE_LIST_FILE>>: $!\n\n";
607 while ( $return_line = <LF> ) {
608 if ( $return_line =~ /^\s*(\S+)\s*$/ ) { # just a list
609 $ALGNS_TO_USE{ $1 } = "";
611 elsif ( $return_line =~ /^\s*\S+\s+\S+\s+(\S+)/ ) { # "changes" list from Pfam
612 $ALGNS_TO_USE{ $1 } = "";
623 # 1. Name of inputfile
625 # 2. Name of output alignment file
626 # 3. Name of output positions file
627 # 4. Seed for random number generator
629 # Last modified: 06/23/01
630 sub executeBootstrap {
631 my $infile = $_[ 0 ];
632 my $bootstraps = $_[ 1 ];
633 my $outalign = $_[ 2 ];
634 my $positions = $_[ 3 ];
637 system( "$BOOTSTRAP_CZ_PL 0 $bootstraps $infile $outalign $positions $seed" )
638 && die "\n\n$0: executeBootstrap:\nCould not execute \"$BOOTSTRAP_CZ_PL 0 $bootstraps $infile $outalign $positions $seed\".\n\n";
640 } ## executeBootstrap
645 # Last modified: 05/22/02
651 $tmp_dir = $MY_TEMP_DIR.$time.$ii;
653 while ( -e $tmp_dir ) {
655 $tmp_dir = $MY_TEMP_DIR.$time.$ii;
658 mkdir( $tmp_dir, 0777 )
659 || die "\n\n$0: Unexpected error: Could not create <<$tmp_dir>>: $!\n\n";
661 unless ( ( -e $tmp_dir ) && ( -d $tmp_dir ) ) {
662 die "\n\n$0: Unexpected error: failed to create <<$tmp_dir>>.\n\n";
669 # Last modified: 05/17/01
671 if ( -e $MY_RIO_PWD_DIRECTORY.$LOGFILE ) {
672 print "\npfam2pwd.pl:\n";
673 print "logfile $MY_RIO_PWD_DIRECTORY"."$LOGFILE already exists\n";
674 print "rename it or place it in another directory\n";
678 open( L, ">$MY_RIO_PWD_DIRECTORY".$LOGFILE )
679 || die "\n\n$0: startLogfile: Cannot create logfile: $!\n\n";
680 print L "Min seqs : $MIN_SEQS\n";
681 print L "Max seqs : $MAX_SEQS\n";
682 print L "Seed : $MY_SEED\n";
683 print L "TrEMBL ACDEOS file : $TREMBL_ACDEOS_FILE\n";
684 print L "Species names file : $MY_SPECIES_NAMES_FILE\n";
685 print L "Pfam directory : $MY_PFAM_FULL_DIRECTORY\n";
686 print L "PWD outputdirectory: $MY_RIO_PWD_DIRECTORY\n";
687 print L "BSP outputdirectory: $MY_RIO_BSP_DIRECTORY\n";
688 print L "NBD outputdirectory: $MY_RIO_NBD_DIRECTORY\n";
689 print L "ALN outputdirectory: $MY_RIO_ALN_DIRECTORY\n";
690 print L "HMM outputdirectory: $MY_RIO_HMM_DIRECTORY\n";
691 print L "Start date : ".`date`;
692 if ( $MY_MATRIX == 0 ) {
693 print L "Matrix : JTT\n";
695 elsif ( $MY_MATRIX == 2 ) {
696 print L "Matrix : BLOSUM 62\n";
698 elsif ( $MY_MATRIX == 3 ) {
699 print L "Matrix : mtREV24\n";
701 elsif ( $MY_MATRIX == 5 ) {
702 print L "Matrix : VT\n";
704 elsif ( $MY_MATRIX == 6 ) {
705 print L "Matrix : WAG\n";
707 elsif ( $MY_MATRIX == 7 ) {
708 print L "Matrix : auto\n";
711 print L "Matrix : PAM\n";
717 # Last modified: 05/17/01
721 print L "Successfully calculated $i pairwise distance files.\n";
722 print L "Too large alignments (>$MAX_SEQS): $too_large\n";
723 print L "Too small alignments (<$MIN_SEQS): $too_small\n";
724 print L "Finish date : ".`date`."\n\n";
726 print L "List of the $too_large alignments which were ignored because they\n";
727 print L "contained too many sequences (>$MAX_SEQS) after pruning:\n";
728 for ( $j = 0; $j < $too_large; ++$j ) {
729 print L "$too_large_names[ $j ]\n";
732 print L "List of the $too_small alignments which were ignored because they\n";
733 print L "contained not enough sequences (<$MIN_SEQS) after pruning:\n";
734 for ( $j = 0; $j < $too_small; ++$j ) {
735 print L "$too_small_names[ $j ]\n";