#!/usr/bin/perl -w # # Xrio.pl # ------- # Copyright (C) 1999-2001 Washington University School of Medicine # and Howard Hughes Medical Institute # All rights reserved # # Author: Christian M. Zmasek # zmasek@genetics.wustl.edu # http://www.genetics.wustl.edu/eddy/people/zmasek/ # # Created: 03/01/01 # # Last modified 06/22/01 # Objective. Runs "rio.pl" for each Pfam assignment in "infile". # # Usage. rio.pl # # species names file: list of species to use for analysis. # # This version uses the CE number as identifier for output files.\n"; # # Format for infile: # # >>3R5.2 CE19648 (CAMBRIDGE) TR:Q9XWB1 protein_id:CAA21778.1 # // # # >>4R79.1 CE19649 Zinc-binding metalloprotease domain (CAMBRIDGE) protein_id:CAB63429.1 # =Astacin Astacin (Peptidase family M12A) 296.3 3.8e-85 1 # // # # >>4R79.2 CE19650 Ras family (CAMBRIDGE) TR:Q9XXA4 protein_id:CAA20282.1 # =ras Ras family 208.8 8.1e-59 1 # =FA_desaturase Fatty acid desaturase 4.5 1.5 1 # =UPF0117 Domain of unknown function DUF36 3.1 3.5 1 # =arf ADP-ribosylation factor family -46.0 1.5e-05 1 # // # # # # Xrio.pl /nfs/wol2/people/zmasek/wormpep43_hmmpfam6.2/wormpep43_Hmmpfam_6.2 /nfs/wol2/people/zmasek/species_trees/tree_of_life_bin_1-4_species_list /nfs/wol2/people/zmasek/XrioTEST3 /nfs/wol2/people/zmasek/XrioTEST3/OUTFILE1 /nfs/wol2/people/zmasek/XrioTEST3/LOG1 use strict; use FindBin; use lib $FindBin::Bin; use rio_module; $RIO_PL = "rio.pl"; my $VERSION = "3.000"; my $FASTA_DB = "/nfs/wol2/people/zmasek/DB/wormpep/wormpep43"; my $QUERY_SPECIES = "CAEEL"; my $SPECIES_TREE = $SPECIES_TREE_FILE_DEFAULT; my $RIOPL_OPTIONS = "T=B P=6 L=0 R=0 U=80 V=0 X=2 Y=2 Z=2 C E I"; my $TEMP_DIR = "/tmp/Xriopl"; # Where all the temp files, etc will be created. my %Species_names_hash = (); my $infile = ""; my $outfile = ""; # Huge file of all rio outputs. my $logfile = ""; # Lists all sequences which have been analyzed successfully. my $output_directory = ""; my $species_names_file = ""; my $return_line = ""; my $ID = ""; my $pfam_name = ""; my $E_value = 0; my $score = 0; my $GA = 0; my $temp_dir = ""; my $outname = ""; my %outnames = (); my $seqs = 0; my $ii = 0; my $time = 0; my $successful = 0; my $query_not_aligned = 0; my $pwd_not_present = 0; my $already_done = 0; my $start_date = ""; my $old_fh = ""; my %AC_OS = (); # AC -> species name for TrEMBL seqs my %AC_DE = (); # AC -> description for TrEMBL seqs my $description_line = ""; my $message1 = ""; my $message2 = ""; $start_date = `date`; if ( @ARGV != 5 ) { &errorInCommandLine(); exit ( -1 ); } $infile = $ARGV[ 0 ]; $species_names_file = $ARGV[ 1 ]; $output_directory = $ARGV[ 2 ]; $outfile = $ARGV[ 3 ]; $logfile = $ARGV[ 4 ]; if ( -e $outfile ) { die "\n\n$0: <<$outfile>> already exists.\n\n"; } unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { die "\n\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n\n"; } unless ( ( -s $species_names_file ) && ( -f $species_names_file ) && ( -T $species_names_file ) ) { die "\n\n$0: <<$species_names_file>> does not exist, is empty, or is not a plain textfile.\n\n"; } unless ( ( -s $TREMBL_ACDEOS_FILE ) && ( -f $TREMBL_ACDEOS_FILE ) && ( -T $TREMBL_ACDEOS_FILE ) ) { die "\n\n$0: <<$TREMBL_ACDEOS_FILE>> does not exist, is empty, or is not a plain textfile.\n\n"; } unless ( ( -e $output_directory ) && ( -d $output_directory ) ) { die "\n\n$0: <<$output_directory>> does not exist, or is not a directory.\n\n"; } # Reads in the species file: # -------------------------- &readSpeciesNamesFile( $species_names_file ); # Reads in the file containing AC, DE and OS for TrEMBL seqs: # ----------------------------------------------------------- open( HH, "$TREMBL_ACDEOS_FILE" ) || die "\n\n$0: Unexpected error: Cannot open file <<$TREMBL_ACDEOS_FILE>>: $!\n\n"; while ( $return_line = ) { if ( $return_line =~ /(\S+);([^;]*);(\S+)/ ) { $AC_OS{ $1 } = $3; $AC_DE{ $1 } = $2; } } close( HH ); # Reads in outnames in logfile, if present: # ----------------------------------------- if ( ( -s $logfile ) ) { open( L, "$logfile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$logfile>>: $!\n\n"; while ( $return_line = ) { if ( $return_line =~ /\s*(\S+)/ ) { $outnames{ $1 } = 0; } } close( L ); } # Creates the temp directory: # --------------------------- $ii = 0; $time = time; $temp_dir = $TEMP_DIR.$time.$ii; while ( -e $temp_dir ) { $ii++; $temp_dir = $TEMP_DIR.$time.$ii; } mkdir( $temp_dir, 0777 ) || die "\n\n$0:Unexpected error: Could not create <<$temp_dir>>: $!\n\n"; unless ( ( -e $temp_dir ) && ( -d $temp_dir ) ) { die "\n\n$0:Unexpected error: <<$temp_dir>> does not exist, or is not a directory: $!\n\n"; } $message1 = "# $0\n". "# Version : $VERSION\n". "# Date started : $start_date". "# Infile : $infile\n". "# Species names file : $species_names_file\n". "# Output directory : $output_directory\n". "# Outfile : $outfile\n". "# RIO PWD directory : $RIO_PWD_DIRECTORY\n". "# RIO BSP directory : $RIO_BSP_DIRECTORY\n". "# RIO NBD directory : $RIO_NBD_DIRECTORY\n". "# RIO ALN directory : $RIO_ALN_DIRECTORY\n". "# RIO HMM directory : $RIO_HMM_DIRECTORY\n". "# Fasta db : $FASTA_DB\n". "# Species of query : $QUERY_SPECIES\n". "# Species tree : $SPECIES_TREE\n". "# rio.pl options : $RIOPL_OPTIONS\n\n\n"; open( IN, "$infile" ) || die "\n\n$0: Cannot open file <<$infile>>: $!\n\n"; open( LOG, ">> $logfile" ) || die "\n\n$0: Cannot open file <<$logfile>>: $!\n\n"; # Turns off buffering for LOG. $old_fh = select( LOG ); $| = 1; select( $old_fh ); $ID = ""; W: while ( $return_line = ) { if ( $return_line =~ /^\s*>>.*(CE\d+)/ ) { $ID = $1; $return_line =~ /^\s*>>(.+)/; $description_line = $1; } elsif ( $return_line =~ /^\s*\/\// ) { $ID = ""; } elsif ( $return_line =~ /^\s*=(\S+)\s+.+\s+(\S+)\s+(\S+)\s+\S+\s*$/ && $ID ne "" ) { $pfam_name = $1; $score = $2; $E_value = $3; $outname = $ID.".".$pfam_name; # Checks if already done. if ( %outnames && exists( $outnames{ $outname } ) ) { $already_done++; next W; } &executeHmmfetch( $PFAM_HMM_DB, $pfam_name, $temp_dir."/HMMFILE" ); $GA = &getGA1cutoff( $temp_dir."/HMMFILE" ); unlink( $temp_dir."/HMMFILE" ); if ( $GA == 2000 ) { die "\n\n$0: Unexpected error: No GA cutoff found for \"$pfam_name\".\n\n"; } elsif ( $score < $GA ) { next W; } if ( -s $output_directory."/".$outname ) { unlink( $output_directory."/".$outname ); } $message1 .= "\n\n". "# ############################################################################\n". "# Annotation: $description_line\n". "# HMM : $pfam_name\n". "# score : $score\n". "# E-value : $E_value\n"; unless ( ( -s $RIO_PWD_DIRECTORY.$pfam_name.$SUFFIX_PWD ) ) { $pwd_not_present++; $message1 .= "# No PWD file for this family.\n". "# ############################################################################\n"; next W; } unless ( ( -s $PFAM_SEED_DIRECTORY."/".$pfam_name ) && ( -f $PFAM_SEED_DIRECTORY."/".$pfam_name ) && ( -T $PFAM_SEED_DIRECTORY."/".$pfam_name ) ) { die "\n\n$0: Error: Pfam seed alignment <<$PFAM_SEED_DIRECTORY"."/"."$pfam_name>> not present.\n\n"; } &getSequenceFromFastaFile( $FASTA_DB, $temp_dir."/QUERY", $ID ); &performRIO( $pfam_name, # A= $temp_dir."/QUERY", # Q= $output_directory."/".$outname, # O= $ID."_".$QUERY_SPECIES, # N= $SPECIES_TREE, # S= $RIOPL_OPTIONS, # L=0 R=0 U=70 V=0 X=2 Y=2 Z=2 C E K I x $temp_dir."/riopltempdir" ); # j= if ( -s $output_directory."/".$outname ) { $successful++; } else { $message1 .= "# Query has not been aligned (E value too low).\n". "# ############################################################################\n"; $query_not_aligned++; } if ( unlink( $temp_dir."/QUERY" ) != 1 ) { die "\n$0: Unexpected error: File(s) could not be deleted.\n"; } if ( -s $output_directory."/".$outname ) { open( OUT_MESG_ONE, ">$temp_dir/_message1_" ) || die "\n\n$0: Cannot create file \"$temp_dir/_message1_\": $!\n\n"; print OUT_MESG_ONE ( $message1 ); close( OUT_MESG_ONE ); $message1 = ""; open( OUT_MESG_TWO, ">$temp_dir/_message2_" ) || die "\n\n$0: Cannot create file \"$temp_dir/_message2_\": $!\n\n"; print OUT_MESG_TWO ( "# Successful calculations : $successful\n" ); print OUT_MESG_TWO ( "# No calculation due to absence of PWD file: $pwd_not_present\n" ); print OUT_MESG_TWO ( "# Calculation already performed : $already_done\n" ); print OUT_MESG_TWO ( "# ############################################################################\n" ); close( OUT_MESG_TWO ); if ( -s $outfile ) { system( "cat $outfile $temp_dir/_message1_ $output_directory/$outname $temp_dir/_message2_ > $outfile"."___" ) && die "\n\n$0: Could not execute \"cat $outfile $temp_dir/_message1_ $output_directory/$outname $temp_dir/_message2_ > $outfile"."___\": $!\n\n"; system( "mv", $outfile."___", $outfile ) && die "\n\n$0: Could not execute \"mv $outfile"."___ $outfile\": $!\n\n"; } else { system( "cat $temp_dir/_message1_ $output_directory/$outname $temp_dir/_message2_ > $outfile" ) && die "\n\n$0: Could not execute \"cat $temp_dir/_message1_ $output_directory/$outname $temp_dir/_message2_ > $outfile\": $!\n\n"; } print LOG "$outname\n"; unlink( "$temp_dir/_message1_", "$temp_dir/_message2_" ); } } ## End of elsif ( $return_line =~ /^\s*=(\S+)\s+.+\s+(\S+)\s+(\S+)\s+\S+$/ && $ID ne "" ) } ## End of while ( $return_line = ) close( IN ); close( LOG ); open( OUT_MESG_TWO, ">$temp_dir/_message2_" ) || die "\n$0: Cannot create file \"$temp_dir/_message2_\": $!\n"; print OUT_MESG_TWO ( "\n\n# Xrio.pl successfully terminated.\n" ); print OUT_MESG_TWO ( "# Started : $start_date" ); print OUT_MESG_TWO ( "# Terminated: ".`date`."\n" ); print OUT_MESG_TWO ( "# Successful calculations : $successful\n" ); print OUT_MESG_TWO ( "# No calculation due to absence of PWD file: $pwd_not_present\n" ); print OUT_MESG_TWO ( "# Calculation already performed : $already_done\n\n" ); close( OUT_MESG_TWO ); if ( -s $outfile ) { if ( $message1 ne "" ) { open( OUT_MESG_ONE, ">$temp_dir/_message1_" ) || die "\n$0: Cannot create file \"$temp_dir/_message1_\": $!\n"; print OUT_MESG_ONE ( $message1 ); close( OUT_MESG_ONE ); system( "cat $outfile $temp_dir/_message1_ $temp_dir/_message2_ > $outfile"."___" ) && die "$0: Could not execute \"cat $outfile $temp_dir/_message1_ $temp_dir/_message2_ > $outfile"."___\": $!"; } else { system( "cat $outfile $temp_dir/_message2_ > $outfile"."___" ) && die "$0: Could not execute \"cat $outfile $temp_dir/_message2_ > $outfile"."___\": $!"; } system( "mv", $outfile."___", $outfile ) && die "$0: Could not execute \"mv $outfile"."___ $outfile\": $!"; } else { open( OUT_MESG_ONE, ">$temp_dir/_message1_" ) || die "\n$0: Cannot create file \"$temp_dir/_message1_\": $!\n"; print OUT_MESG_ONE ( $message1 ); close( OUT_MESG_ONE ); system( "cat $temp_dir/_message1_ $temp_dir/_message2_ > $outfile" ) && die "$0: Could not execute \"cat $temp_dir/_message1_ $temp_dir/_message2_ > $outfile\": $!"; } unlink( "$temp_dir/_message1_", "$temp_dir/_message2_" ); rmdir( $temp_dir ) || die "\n$0: Unexpected failure (could not remove: $temp_dir): $!\n"; print( "\n\nXrio.pl successfully terminated.\n" ); print( "Successful calculations : $successful\n" ); print( "No calculation due to absence of PWD file: $pwd_not_present\n" ); print( "Calculation already performed : $already_done\n" ); print( "Started : $start_date" ); print( "Terminated: ".`date`."\n" ); print( "\n" ); exit( 0 ); # Methods # ------- # Gets the gathering cutoff per sequence from a HMM file. # # One argument: the HMM file name # Returns the gathering cutoff per sequence, 2000 upon failure # Last modified: 07/11/01 sub getGA1cutoff { my $infile = $_[ 0 ]; my $return_line = ""; my $GA = 2000; &testForTextFilePresence( $infile ); open( H, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!"; while ( $return_line = ) { if ( $return_line =~ /^GA\s+(\S+)/ ) { $GA = $1; close( H ); return $GA; } } close( H ); return $GA; } ## getGA1cutoff # 1. A= Name of Pfam family # 2. Q= Query file # 3. O= Output # 4. N= query Name # 5. S= Species tree file # 6. more options, such I K m # 7. j= Name for temporary directory sub performRIO { my $pfam_name = $_[ 0 ]; my $query_file = $_[ 1 ]; my $output_file = $_[ 2 ]; my $name_for_query = $_[ 3 ]; my $species_tree_file = $_[ 4 ]; my $more_options = $_[ 5 ]; my $tmp_file_rio = $_[ 6 ]; my $options_for_rio = ""; $options_for_rio .= ( " A=".$pfam_name ); $options_for_rio .= ( " Q=".$query_file ); $options_for_rio .= ( " O=".$output_file ); $options_for_rio .= ( " N=".$name_for_query ); $options_for_rio .= ( " S=".$species_tree_file ); $options_for_rio .= ( " j=".$tmp_file_rio ); $options_for_rio .= ( " ".$more_options ); system( "$RIO_PL 1 $options_for_rio" ) && die "$0: performRIO: Could not execute \"$RIO_PL 1 $options_for_rio\": $!\n"; } ## performRIO # Reads in (SWISS-PROT) species names from a file. # Names must be separated by newlines. # Lines beginning with "#" are ignored. # A possible "=" and everything after is ignored. # One argument: species-names-file name # Last modified: 04/24/01 sub readSpeciesNamesFile { my $infile = $_[ 0 ]; my $return_line = ""; my $species = ""; unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { &Error( "\"$infile\" does not exist,\n is empty, or is not a plain textfile." ); } open( IN_RSNF, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!"; while ( $return_line = ) { if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)/ ) { $species = $1; $species =~ s/=.+//; $Species_names_hash{ $species } = ""; } } close( IN_RSNF ); return; } ## readSpeciesNamesFile # Searches the > line of a multiple seq file for a # query, saves the found entries. # Three arguments: # 1. multi Fasta file to search through # 2. outputfile name # 3. query # Last modified: 03/05/01 sub getSequenceFromFastaFile { my $inputfile = $_[ 0 ]; my $outputfile = $_[ 1 ]; my $query = $_[ 2 ]; my $hits = 0; open( IN_GSFF, "$inputfile" ) || die "\n$0: getSequenceFromFastaFile: Cannot open file <<$inputfile>>: $!\n"; open( OUT_GSFF, ">$outputfile" ) || die "\n$0: getSequenceFromFastaFile: Cannot create file <<$outputfile>>: $!\n"; while ( $return_line = ) { if ( $return_line =~ /^\s*>.*$query\s+/ ) { $hits++; print $return_line; print OUT_GSFF $return_line; $return_line = ; while ( $return_line && $return_line =~ /^\s*[^>]/ ) { print OUT_GSFF $return_line; $return_line = ; } last; # In Wormpep there _are_ ambigous CE numbers. } } close( IN_GSFF ); close( OUT_GSFF ); if ( $hits < 1 ) { die "\n$0: getSequenceFromFastaFile: Unexpected error: <<$query>> not found.\n"; } if ( $hits > 1 ) { die "\n$0: getSequenceFromFastaFile: Unexpected error: <<$query>> is ambigous.\n"; } } ## getSequenceFromFastaFile # Last modified: 03/08/01 sub errorInCommandLine { print "\n"; print " Xrio.pl $VERSION\n"; print " -------\n"; print "\n"; print " Christian Zmasek (zmasek\@genetics.wustl.edu)\n"; print "\n"; print " Purpose. Runs \"rio.pl\" for each Pfam assignment in \"infile\".\n"; print "\n"; print " Usage. rio.pl \n"; print "\n"; print " infile: has the following format (defined per example):\n"; print " >>4R79.1 CE19649 Zinc-binding metalloprotease domain (CAMBRIDGE) protein_id:CAB63429.1\n"; print " =Astacin Astacin (Peptidase family M12A) 296.3 3.8e-85 1\n"; print " //\n"; print "\n"; print " >>4R79.2 CE19650 Ras family (CAMBRIDGE) TR:Q9XXA4 protein_id:CAA20282.1\n"; print " =ras Ras family 208.8 8.1e-59 1\n"; print " =FA_desaturase Fatty acid desaturase 4.5 1.5 1\n"; print " =UPF0117 Domain of unknown function DUF36 3.1 3.5 1\n"; print " =arf ADP-ribosylation factor family -46.0 1.5e-05 1\n"; print " //\n"; print "\n"; print " species names file: list of species to use for analysis.\n"; print "\n"; print " This version uses the CE number as identifier for output files.\n"; print "\n"; exit( -1 ); } ## errorInCommandLine