5 # Copyright (C) 1999-2001 Washington University School of Medicine
6 # and Howard Hughes Medical Institute
9 # Author: Christian M. Zmasek
10 # zmasek@genetics.wustl.edu
11 # http://www.genetics.wustl.edu/eddy/people/zmasek/
15 # Last modified 06/22/01
18 # Objective. Runs "rio.pl" for each Pfam assignment in "infile".
20 # Usage. rio.pl <infile> <species names file> <output directory> <outfile> <logfile>
22 # species names file: list of species to use for analysis.
24 # This version uses the CE number as identifier for output files.\n";
28 # >>3R5.2 CE19648 (CAMBRIDGE) TR:Q9XWB1 protein_id:CAA21778.1
31 # >>4R79.1 CE19649 Zinc-binding metalloprotease domain (CAMBRIDGE) protein_id:CAB63429.1
32 # =Astacin Astacin (Peptidase family M12A) 296.3 3.8e-85 1
35 # >>4R79.2 CE19650 Ras family (CAMBRIDGE) TR:Q9XXA4 protein_id:CAA20282.1
36 # =ras Ras family 208.8 8.1e-59 1
37 # =FA_desaturase Fatty acid desaturase 4.5 1.5 1
38 # =UPF0117 Domain of unknown function DUF36 3.1 3.5 1
39 # =arf ADP-ribosylation factor family -46.0 1.5e-05 1
45 # Xrio.pl /nfs/wol2/people/zmasek/wormpep43_hmmpfam6.2/wormpep43_Hmmpfam_6.2 /nfs/wol2/people/zmasek/species_trees/tree_of_life_bin_1-4_species_list /nfs/wol2/people/zmasek/XrioTEST3 /nfs/wol2/people/zmasek/XrioTEST3/OUTFILE1 /nfs/wol2/people/zmasek/XrioTEST3/LOG1
54 use lib $FindBin::Bin;
58 my $VERSION = "3.000";
60 my $FASTA_DB = "/nfs/wol2/people/zmasek/DB/wormpep/wormpep43";
61 my $QUERY_SPECIES = "CAEEL";
62 my $SPECIES_TREE = $SPECIES_TREE_FILE_DEFAULT;
64 my $RIOPL_OPTIONS = "T=B P=6 L=0 R=0 U=80 V=0 X=2 Y=2 Z=2 C E I";
66 my $TEMP_DIR = "/tmp/Xriopl"; # Where all the temp files, etc will be created.
68 my %Species_names_hash = ();
71 my $outfile = ""; # Huge file of all rio outputs.
72 my $logfile = ""; # Lists all sequences which have been analyzed successfully.
73 my $output_directory = "";
74 my $species_names_file = "";
90 my $query_not_aligned = 0;
91 my $pwd_not_present = 0;
95 my %AC_OS = (); # AC -> species name for TrEMBL seqs
96 my %AC_DE = (); # AC -> description for TrEMBL seqs
98 my $description_line = "";
102 $start_date = `date`;
105 &errorInCommandLine();
109 $infile = $ARGV[ 0 ];
110 $species_names_file = $ARGV[ 1 ];
111 $output_directory = $ARGV[ 2 ];
112 $outfile = $ARGV[ 3 ];
113 $logfile = $ARGV[ 4 ];
117 die "\n\n$0: <<$outfile>> already exists.\n\n";
119 unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
120 die "\n\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n\n";
122 unless ( ( -s $species_names_file ) && ( -f $species_names_file ) && ( -T $species_names_file ) ) {
123 die "\n\n$0: <<$species_names_file>> does not exist, is empty, or is not a plain textfile.\n\n";
125 unless ( ( -s $TREMBL_ACDEOS_FILE ) && ( -f $TREMBL_ACDEOS_FILE ) && ( -T $TREMBL_ACDEOS_FILE ) ) {
126 die "\n\n$0: <<$TREMBL_ACDEOS_FILE>> does not exist, is empty, or is not a plain textfile.\n\n";
128 unless ( ( -e $output_directory ) && ( -d $output_directory ) ) {
129 die "\n\n$0: <<$output_directory>> does not exist, or is not a directory.\n\n";
134 # Reads in the species file:
135 # --------------------------
136 &readSpeciesNamesFile( $species_names_file );
140 # Reads in the file containing AC, DE and OS for TrEMBL seqs:
141 # -----------------------------------------------------------
142 open( HH, "$TREMBL_ACDEOS_FILE" ) || die "\n\n$0: Unexpected error: Cannot open file <<$TREMBL_ACDEOS_FILE>>: $!\n\n";
143 while ( $return_line = <HH> ) {
144 if ( $return_line =~ /(\S+);([^;]*);(\S+)/ ) {
153 # Reads in outnames in logfile, if present:
154 # -----------------------------------------
155 if ( ( -s $logfile ) ) {
156 open( L, "$logfile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$logfile>>: $!\n\n";
157 while ( $return_line = <L> ) {
158 if ( $return_line =~ /\s*(\S+)/ ) {
167 # Creates the temp directory:
168 # ---------------------------
174 $temp_dir = $TEMP_DIR.$time.$ii;
176 while ( -e $temp_dir ) {
178 $temp_dir = $TEMP_DIR.$time.$ii;
181 mkdir( $temp_dir, 0777 )
182 || die "\n\n$0:Unexpected error: Could not create <<$temp_dir>>: $!\n\n";
184 unless ( ( -e $temp_dir ) && ( -d $temp_dir ) ) {
185 die "\n\n$0:Unexpected error: <<$temp_dir>> does not exist, or is not a directory: $!\n\n";
190 $message1 = "# $0\n".
191 "# Version : $VERSION\n".
192 "# Date started : $start_date".
193 "# Infile : $infile\n".
194 "# Species names file : $species_names_file\n".
195 "# Output directory : $output_directory\n".
196 "# Outfile : $outfile\n".
197 "# RIO PWD directory : $RIO_PWD_DIRECTORY\n".
198 "# RIO BSP directory : $RIO_BSP_DIRECTORY\n".
199 "# RIO NBD directory : $RIO_NBD_DIRECTORY\n".
200 "# RIO ALN directory : $RIO_ALN_DIRECTORY\n".
201 "# RIO HMM directory : $RIO_HMM_DIRECTORY\n".
202 "# Fasta db : $FASTA_DB\n".
203 "# Species of query : $QUERY_SPECIES\n".
204 "# Species tree : $SPECIES_TREE\n".
205 "# rio.pl options : $RIOPL_OPTIONS\n\n\n";
207 open( IN, "$infile" ) || die "\n\n$0: Cannot open file <<$infile>>: $!\n\n";
208 open( LOG, ">> $logfile" ) || die "\n\n$0: Cannot open file <<$logfile>>: $!\n\n";
211 # Turns off buffering for LOG.
212 $old_fh = select( LOG );
219 W: while ( $return_line = <IN> ) {
221 if ( $return_line =~ /^\s*>>.*(CE\d+)/ ) {
223 $return_line =~ /^\s*>>(.+)/;
224 $description_line = $1;
226 elsif ( $return_line =~ /^\s*\/\// ) {
229 elsif ( $return_line =~ /^\s*=(\S+)\s+.+\s+(\S+)\s+(\S+)\s+\S+\s*$/
236 $outname = $ID.".".$pfam_name;
238 # Checks if already done.
239 if ( %outnames && exists( $outnames{ $outname } ) ) {
244 &executeHmmfetch( $PFAM_HMM_DB, $pfam_name, $temp_dir."/HMMFILE" );
246 $GA = &getGA1cutoff( $temp_dir."/HMMFILE" );
247 unlink( $temp_dir."/HMMFILE" );
250 die "\n\n$0: Unexpected error: No GA cutoff found for \"$pfam_name\".\n\n";
252 elsif ( $score < $GA ) {
256 if ( -s $output_directory."/".$outname ) {
257 unlink( $output_directory."/".$outname );
262 "# ############################################################################\n".
263 "# Annotation: $description_line\n".
264 "# HMM : $pfam_name\n".
265 "# score : $score\n".
266 "# E-value : $E_value\n";
270 unless ( ( -s $RIO_PWD_DIRECTORY.$pfam_name.$SUFFIX_PWD ) ) {
272 $message1 .= "# No PWD file for this family.\n".
273 "# ############################################################################\n";
278 unless ( ( -s $PFAM_SEED_DIRECTORY."/".$pfam_name ) && ( -f $PFAM_SEED_DIRECTORY."/".$pfam_name ) && ( -T $PFAM_SEED_DIRECTORY."/".$pfam_name ) ) {
279 die "\n\n$0: Error: Pfam seed alignment <<$PFAM_SEED_DIRECTORY"."/"."$pfam_name>> not present.\n\n";
283 &getSequenceFromFastaFile( $FASTA_DB,
287 &performRIO( $pfam_name, # A=
288 $temp_dir."/QUERY", # Q=
289 $output_directory."/".$outname, # O=
290 $ID."_".$QUERY_SPECIES, # N=
292 $RIOPL_OPTIONS, # L=0 R=0 U=70 V=0 X=2 Y=2 Z=2 C E K I x
293 $temp_dir."/riopltempdir" ); # j=
297 if ( -s $output_directory."/".$outname ) {
301 $message1 .= "# Query has not been aligned (E value too low).\n".
302 "# ############################################################################\n";
303 $query_not_aligned++;
306 if ( unlink( $temp_dir."/QUERY" ) != 1 ) {
307 die "\n$0: Unexpected error: File(s) could not be deleted.\n";
312 if ( -s $output_directory."/".$outname ) {
313 open( OUT_MESG_ONE, ">$temp_dir/_message1_" ) || die "\n\n$0: Cannot create file \"$temp_dir/_message1_\": $!\n\n";
314 print OUT_MESG_ONE ( $message1 );
315 close( OUT_MESG_ONE );
319 open( OUT_MESG_TWO, ">$temp_dir/_message2_" ) || die "\n\n$0: Cannot create file \"$temp_dir/_message2_\": $!\n\n";
320 print OUT_MESG_TWO ( "# Successful calculations : $successful\n" );
321 print OUT_MESG_TWO ( "# No calculation due to absence of PWD file: $pwd_not_present\n" );
322 print OUT_MESG_TWO ( "# Calculation already performed : $already_done\n" );
323 print OUT_MESG_TWO ( "# ############################################################################\n" );
324 close( OUT_MESG_TWO );
327 system( "cat $outfile $temp_dir/_message1_ $output_directory/$outname $temp_dir/_message2_ > $outfile"."___" )
328 && die "\n\n$0: Could not execute \"cat $outfile $temp_dir/_message1_ $output_directory/$outname $temp_dir/_message2_ > $outfile"."___\": $!\n\n";
329 system( "mv", $outfile."___", $outfile )
330 && die "\n\n$0: Could not execute \"mv $outfile"."___ $outfile\": $!\n\n";
333 system( "cat $temp_dir/_message1_ $output_directory/$outname $temp_dir/_message2_ > $outfile" )
334 && die "\n\n$0: Could not execute \"cat $temp_dir/_message1_ $output_directory/$outname $temp_dir/_message2_ > $outfile\": $!\n\n";
338 print LOG "$outname\n";
340 unlink( "$temp_dir/_message1_", "$temp_dir/_message2_" );
346 } ## End of elsif ( $return_line =~ /^\s*=(\S+)\s+.+\s+(\S+)\s+(\S+)\s+\S+$/ && $ID ne "" )
348 } ## End of while ( $return_line = <IN> )
354 open( OUT_MESG_TWO, ">$temp_dir/_message2_" ) || die "\n$0: Cannot create file \"$temp_dir/_message2_\": $!\n";
355 print OUT_MESG_TWO ( "\n\n# Xrio.pl successfully terminated.\n" );
356 print OUT_MESG_TWO ( "# Started : $start_date" );
357 print OUT_MESG_TWO ( "# Terminated: ".`date`."\n" );
358 print OUT_MESG_TWO ( "# Successful calculations : $successful\n" );
359 print OUT_MESG_TWO ( "# No calculation due to absence of PWD file: $pwd_not_present\n" );
360 print OUT_MESG_TWO ( "# Calculation already performed : $already_done\n\n" );
361 close( OUT_MESG_TWO );
364 if ( $message1 ne "" ) {
365 open( OUT_MESG_ONE, ">$temp_dir/_message1_" ) || die "\n$0: Cannot create file \"$temp_dir/_message1_\": $!\n";
366 print OUT_MESG_ONE ( $message1 );
367 close( OUT_MESG_ONE );
368 system( "cat $outfile $temp_dir/_message1_ $temp_dir/_message2_ > $outfile"."___" )
369 && die "$0: Could not execute \"cat $outfile $temp_dir/_message1_ $temp_dir/_message2_ > $outfile"."___\": $!";
372 system( "cat $outfile $temp_dir/_message2_ > $outfile"."___" )
373 && die "$0: Could not execute \"cat $outfile $temp_dir/_message2_ > $outfile"."___\": $!";
375 system( "mv", $outfile."___", $outfile )
376 && die "$0: Could not execute \"mv $outfile"."___ $outfile\": $!";
379 open( OUT_MESG_ONE, ">$temp_dir/_message1_" ) || die "\n$0: Cannot create file \"$temp_dir/_message1_\": $!\n";
380 print OUT_MESG_ONE ( $message1 );
381 close( OUT_MESG_ONE );
382 system( "cat $temp_dir/_message1_ $temp_dir/_message2_ > $outfile" )
383 && die "$0: Could not execute \"cat $temp_dir/_message1_ $temp_dir/_message2_ > $outfile\": $!";
386 unlink( "$temp_dir/_message1_", "$temp_dir/_message2_" );
388 rmdir( $temp_dir ) || die "\n$0: Unexpected failure (could not remove: $temp_dir): $!\n";
390 print( "\n\nXrio.pl successfully terminated.\n" );
391 print( "Successful calculations : $successful\n" );
392 print( "No calculation due to absence of PWD file: $pwd_not_present\n" );
393 print( "Calculation already performed : $already_done\n" );
394 print( "Started : $start_date" );
395 print( "Terminated: ".`date`."\n" );
407 # Gets the gathering cutoff per sequence from a HMM file.
409 # One argument: the HMM file name
410 # Returns the gathering cutoff per sequence, 2000 upon failure
411 # Last modified: 07/11/01
414 my $infile = $_[ 0 ];
415 my $return_line = "";
418 &testForTextFilePresence( $infile );
420 open( H, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!";
421 while ( $return_line = <H> ) {
423 if ( $return_line =~ /^GA\s+(\S+)/ ) {
439 # 1. A= Name of Pfam family
443 # 5. S= Species tree file
444 # 6. more options, such I K m
445 # 7. j= Name for temporary directory
447 my $pfam_name = $_[ 0 ];
448 my $query_file = $_[ 1 ];
449 my $output_file = $_[ 2 ];
450 my $name_for_query = $_[ 3 ];
451 my $species_tree_file = $_[ 4 ];
452 my $more_options = $_[ 5 ];
453 my $tmp_file_rio = $_[ 6 ];
455 my $options_for_rio = "";
457 $options_for_rio .= ( " A=".$pfam_name );
458 $options_for_rio .= ( " Q=".$query_file );
459 $options_for_rio .= ( " O=".$output_file );
460 $options_for_rio .= ( " N=".$name_for_query );
461 $options_for_rio .= ( " S=".$species_tree_file );
462 $options_for_rio .= ( " j=".$tmp_file_rio );
463 $options_for_rio .= ( " ".$more_options );
465 system( "$RIO_PL 1 $options_for_rio" )
466 && die "$0: performRIO: Could not execute \"$RIO_PL 1 $options_for_rio\": $!\n";
472 # Reads in (SWISS-PROT) species names from a file.
473 # Names must be separated by newlines.
474 # Lines beginning with "#" are ignored.
475 # A possible "=" and everything after is ignored.
476 # One argument: species-names-file name
477 # Last modified: 04/24/01
478 sub readSpeciesNamesFile {
479 my $infile = $_[ 0 ];
480 my $return_line = "";
483 unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
484 &Error( "\"$infile\" does not exist,\n is empty, or is not a plain textfile." );
487 open( IN_RSNF, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!";
488 while ( $return_line = <IN_RSNF> ) {
489 if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)/ ) {
492 $Species_names_hash{ $species } = "";
498 } ## readSpeciesNamesFile
502 # Searches the > line of a multiple seq file for a
503 # query, saves the found entries.
505 # 1. multi Fasta file to search through
508 # Last modified: 03/05/01
509 sub getSequenceFromFastaFile {
511 my $inputfile = $_[ 0 ];
512 my $outputfile = $_[ 1 ];
516 open( IN_GSFF, "$inputfile" )
517 || die "\n$0: getSequenceFromFastaFile: Cannot open file <<$inputfile>>: $!\n";
518 open( OUT_GSFF, ">$outputfile" )
519 || die "\n$0: getSequenceFromFastaFile: Cannot create file <<$outputfile>>: $!\n";
522 while ( $return_line = <IN_GSFF> ) {
523 if ( $return_line =~ /^\s*>.*$query\s+/ ) {
526 print OUT_GSFF $return_line;
527 $return_line = <IN_GSFF>;
528 while ( $return_line && $return_line =~ /^\s*[^>]/ ) {
529 print OUT_GSFF $return_line;
530 $return_line = <IN_GSFF>;
532 last; # In Wormpep there _are_ ambigous CE numbers.
540 die "\n$0: getSequenceFromFastaFile: Unexpected error: <<$query>> not found.\n";
543 die "\n$0: getSequenceFromFastaFile: Unexpected error: <<$query>> is ambigous.\n";
546 } ## getSequenceFromFastaFile
551 # Last modified: 03/08/01
552 sub errorInCommandLine {
555 print " Xrio.pl $VERSION\n";
558 print " Christian Zmasek (zmasek\@genetics.wustl.edu)\n";
560 print " Purpose. Runs \"rio.pl\" for each Pfam assignment in \"infile\".\n";
562 print " Usage. rio.pl <infile> <species names file> <output directory> <outfile> <logfile>\n";
564 print " infile: has the following format (defined per example):\n";
565 print " >>4R79.1 CE19649 Zinc-binding metalloprotease domain (CAMBRIDGE) protein_id:CAB63429.1\n";
566 print " =Astacin Astacin (Peptidase family M12A) 296.3 3.8e-85 1\n";
569 print " >>4R79.2 CE19650 Ras family (CAMBRIDGE) TR:Q9XXA4 protein_id:CAA20282.1\n";
570 print " =ras Ras family 208.8 8.1e-59 1\n";
571 print " =FA_desaturase Fatty acid desaturase 4.5 1.5 1\n";
572 print " =UPF0117 Domain of unknown function DUF36 3.1 3.5 1\n";
573 print " =arf ADP-ribosylation factor family -46.0 1.5e-05 1\n";
576 print " species names file: list of species to use for analysis.\n";
578 print " This version uses the CE number as identifier for output files.\n";
583 } ## errorInCommandLine