[ RIO SERVER | phylogenomic analysis of a protein sequence ]

From: cmzmasek Date: Fri, 3 Mar 2017 00:28:37 +0000 (-0800) Subject: removed old files X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=352354f08235f48bcbc48208a5b5bc50fa11ec13;p=jalview.git removed old files --- diff --git a/forester/archive/RIO/C/Makefile b/forester/archive/RIO/C/Makefile deleted file mode 100644 index 552adfe..0000000 --- a/forester/archive/RIO/C/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -# Makefile for bootstrap_cz -# Last modified 06/07/01 - -# For GNU gcc -CFLAGS = -O2 -Wall -pedantic - -# This one specifies the "cc" C compiler -#CC = cc $(CFLAGS) -# -# To use "gcc" instead -CC = gcc $(CFLAGS) - - -bootstrap_cz: bootstrap_cz.c - $(CC) -o bootstrap_cz bootstrap_cz.c - diff --git a/forester/archive/RIO/C/bootstrap_cz.c b/forester/archive/RIO/C/bootstrap_cz.c deleted file mode 100644 index 6f59a30..0000000 --- a/forester/archive/RIO/C/bootstrap_cz.c +++ /dev/null @@ -1,484 +0,0 @@ -/* -# bootstrap_cz -# ------------ -# Copyright (C) 1999-2002 Washington University School of Medicine -# and Howard Hughes Medical Institute -# All rights reserved -# -# Author: Christian M. Zmasek -# zmasek@genetics.wustl.edu -# http://www.genetics.wustl.edu/eddy/people/zmasek/ -# -# Created: 06/06/01 -# -# Last modified: 01/27/02 -# -# Purpose: -# Bootstrap resamples an alignment in PHYLIP sequential format times. -# Bootstrapping is not done randomly but according to a BSP (bootstrap -# positions) file. -# The BSP file can be created with the Perl program "bootstrap_cz.pl" -# in mode 0. -# This prgram has the same functionality as "bootstrap_cz.pl" in mode 1. -# Sequence names are normalized to LENGTH_OF_NAME characters. -# The output alignment is in PHYLIP's sequential or interleaved format. -# (These two are the same in this case, since all the seqs will be one -# line in length (no returns in seq).) -# -# Usage: bootstrap_cz -# [number of processors] -*/ - - - -#include -#include -#include -#include - - -#define LENGTH_OF_NAME 26 - - -static char **names, /* This stores the sequence names */ - **sequences; /* This stores the sequences */ -static int number_of_seqs, - number_of_colm; - - -void readInAlignmnet( const char * ); -void bootstrapAccordingToBSPfile( int, const char *, const char * ); -void checkForMemAllocFailure( void * ); -int fileExists( const char *); -void errorInCommandLine(); - - - - - -/* Reads the seqs and seq-names from inalignment */ -/* into **sequences and **sequences. */ -/* Inalignment must be in PHYLIP sequential format. */ -/* Last modified: 06/25/01 */ -void readInAlignment( const char *inalignment ) { - - FILE *inalignment_fp = NULL; - char *str = NULL; - int max_length = 0; - register char c = ' '; - register int i = 0, - ii = 0, - z = 0, - seq = 0; - - number_of_seqs = 0; - number_of_colm = 0; - - inalignment_fp = fopen( inalignment, "r" ); - if ( inalignment_fp == NULL ) { - printf( "\nbootstrap_cz: Error: Could not open alignment file for reading.\n" ); - exit( -1 ); - } - - if ( fscanf( inalignment_fp, "%d", &number_of_seqs ) != 1 ) { - printf( "\nbootstrap_cz: Error: Could not read in number of seqs.\n" ); - exit( -1 ); - } - if ( fscanf( inalignment_fp, "%d", &number_of_colm ) != 1 ) { - printf( "\nbootstrap_cz: Error: Could not read in number of columns.\n" ); - exit( -1 ); - } - - names = malloc( number_of_seqs * sizeof( char *) ); - checkForMemAllocFailure( names ); - for ( i = 0; i < number_of_seqs; ++i ) { - names[ i ] = malloc( LENGTH_OF_NAME * sizeof( char ) ); - checkForMemAllocFailure( names[ i ] ); - } - - sequences = malloc( number_of_seqs * sizeof( char * ) ); - checkForMemAllocFailure( sequences ); - for ( i = 0; i < number_of_seqs; ++i ) { - sequences[ i ] = malloc( number_of_colm * sizeof( char ) ); - checkForMemAllocFailure( sequences[ i ] ); - } - - max_length = ( 30 * LENGTH_OF_NAME ) + number_of_colm; - - str = malloc( max_length * sizeof( char * ) ); - checkForMemAllocFailure( str ); - - while ( fgets( str, max_length, inalignment_fp ) != NULL ) { - - if ( !isspace( str[ 0 ] ) != 0 ) { - - i = 0; - while ( str[ i ] != ' ' ) { - names[ seq ][ i ] = str[ i ]; - i++; - } - - ii = i; - while ( ii < LENGTH_OF_NAME ) { - names[ seq ][ ii ] = ' '; - ii++; - } - - z = 0; - - while ( str[ i ] != '\n' && str[ i ] != '\r' && str[ i ] != '\0' ) { - c = str[ i ]; - if ( c != ' ' ) { - if ( isupper( c ) != 0 || c == '-' ) { - sequences[ seq ][ z++ ] = c; - } - else { - printf( "\nbootstrap_cz: Error: Sequence must be represented by uppercase letters A-Z and \"-\" only.\n" ); - exit( -1 ); - } - } - i++; - if ( z > number_of_colm ) { - printf( "\nbootstrap_cz: Error: line in \"%s\" contains more than %d columns.\n", - inalignment, number_of_colm ); - exit( -1 ); - } - } - if ( z != number_of_colm ) { - printf( "\nbootstrap_cz: Error: line in \"%s\" contains a incorrect number of columns.\n", - inalignment ); - exit( -1 ); - } - - seq++; - - if ( seq > number_of_seqs ) { - printf( "\nbootstrap_cz: Error: \"%s\" contains more than %d seqs.\n", - inalignment, number_of_seqs ); - exit( -1 ); - } - } - - - } /* while ( fgets ) */ - - if ( seq != number_of_seqs ) { - printf( "\nbootstrap_cz: Error: \"%s\" contains a incorrect number of seqs.\n", - inalignment ); - exit( -1 ); - } - - fclose( inalignment_fp ); - - return; - -} /* readInAlignment */ - - - -/* Rearrenges the aa in sequences according to */ -/* the bsp (bootstrap positions) file bsp_file. */ -/* Writes the results to outfile */ -/* Last modified: 06/07/01 */ -void bootstrapAccordingToBSPfile( int bootstraps, - const char *bsp_file, - const char *outfile ) { - - FILE *bsp_file_fp = NULL, - *outfile_fp = NULL; - int *positions = NULL, - p = 0; - register int boot = 0, - seq = 0, - i = 0; - - positions = malloc( number_of_colm * sizeof( int ) ); - checkForMemAllocFailure( positions ); - - - bsp_file_fp = fopen( bsp_file, "r" ); - if ( bsp_file_fp == NULL ) { - printf( "\nbootstrap_cz: Error: could not open file \"%s\" for reading.\n", - bsp_file ); - exit( -1 ); - } - - outfile_fp = fopen( outfile, "w" ); - if ( outfile_fp == NULL ) { - printf( "\nbootstrap_cz: Error: could not open file \"%s\" for writing.\n", - outfile ); - exit( -1 ); - } - - for ( boot = 0; boot < bootstraps; ++boot ) { - - for ( i = 0; i < number_of_colm; ++i ) { - if ( fscanf( bsp_file_fp, "%d", &p ) != 1 ) { - printf( "\nbootstrap_cz: Error: file \"%s\" does not correspond to alignment.\n", - bsp_file ); - exit( -1 ); - } - positions[ i ] = p; - } - - fprintf( outfile_fp, " %d %d\n", number_of_seqs, number_of_colm ); - for ( seq = 0; seq < number_of_seqs; ++seq ) { - for ( i = 0; i < LENGTH_OF_NAME; ++i ) { - fprintf( outfile_fp, "%c", names[ seq ][ i ] ); - } - for ( i = 0; i < number_of_colm; ++i ) { - fprintf( outfile_fp, "%c", sequences[ seq ][ positions[ i ] ] ); - } - fprintf( outfile_fp, "\n" ); - } - } - - /* Now, the bsp file must not contain any more numbers */ - if ( fscanf( bsp_file_fp, "%d", &p ) == 1 ) { - printf( "\nbootstrap_cz: Error: file \"%s\" does not correspond to alignment (too long).\n", - bsp_file ); - printf( ">%d<\n", p ); - printf( "number of seqs=%d\n", number_of_seqs ); - exit( -1 ); - } - - fclose( bsp_file_fp ); - fclose( outfile_fp ); - free( positions ); - return; - -} /* bootstrapAccordingToBSPfile */ - - - -/* Rearrenges the aa in sequences according to */ -/* the bsp (bootstrap positions) file bsp_file. */ -/* Writes the results to outfile */ -/* Last modified: 01/25/02 */ -void bootstrapAccordingToBSPfileP( int bootstraps, - int processors, - const char *bsp_file, - const char *outfile ) { - - FILE *bsp_file_fp = NULL, - *outfile_fp = NULL; - int *positions = NULL, - p = 0; - char *outfile_ = NULL; - register int boot = 0, - seq = 0, - i = 0, - j = 0, - z = 0, - flag = 0; - int block_size = 0, - larger_blocks = 0; - - block_size = ( int ) bootstraps / processors; - larger_blocks = bootstraps - ( block_size * processors ); /* number of blocks which have a size of - block_size + 1 */ - - positions = malloc( number_of_colm * sizeof( int ) ); - checkForMemAllocFailure( positions ); - - outfile_ = malloc( ( strlen( outfile ) + 20 ) * sizeof( char ) ); - checkForMemAllocFailure( outfile_ ); - - bsp_file_fp = fopen( bsp_file, "r" ); - if ( bsp_file_fp == NULL ) { - printf( "\nbootstrap_cz: Error: could not open file \"%s\" for reading.\n", - bsp_file ); - exit( -1 ); - } - - j = -1; - flag = 1; - z = 0; - - for ( boot = 0; boot < bootstraps; ++boot ) { - - for ( i = 0; i < number_of_colm; ++i ) { - if ( fscanf( bsp_file_fp, "%d", &p ) != 1 ) { - printf( "\nbootstrap_cz: Error: file \"%s\" does not correspond to alignment.\n", - bsp_file ); - exit( -1 ); - } - positions[ i ] = p; - } - - j++; - - if ( larger_blocks > 0 ) { - if ( j >= block_size + 1 ) { - flag = 1; - j = 0; - larger_blocks--; - } - } - else if ( j >= block_size ) { - flag = 1; - j = 0; - } - - if ( flag == 1 ) { - if ( boot > 0 ) { - fclose( outfile_fp ); - } - sprintf( outfile_, "%s%d", outfile, z++ ); - if ( fileExists( outfile_ ) == 1 ) { - printf( "\nbootstrap_cz: Error: outfile \"%s\" already exists.\n", - outfile_ ); - exit( -1 ); - } - outfile_fp = fopen( outfile_, "w" ); - if ( outfile_fp == NULL ) { - printf( "\nbootstrap_cz: Error: could not open file \"%s\" for writing.\n", - outfile_ ); - exit( -1 ); - } - flag = 0; - } - - fprintf( outfile_fp, " %d %d\n", number_of_seqs, number_of_colm ); - for ( seq = 0; seq < number_of_seqs; ++seq ) { - for ( i = 0; i < LENGTH_OF_NAME; ++i ) { - fprintf( outfile_fp, "%c", names[ seq ][ i ] ); - } - for ( i = 0; i < number_of_colm; ++i ) { - fprintf( outfile_fp, "%c", sequences[ seq ][ positions[ i ] ] ); - } - fprintf( outfile_fp, "\n" ); - } - } - - /* Now, the bsp file must not contain any more numbers */ - if ( fscanf( bsp_file_fp, "%d", &p ) == 1 ) { - printf( "\nbootstrap_cz: Error: file \"%s\" does not correspond to alignment (too long).\n", - bsp_file ); - printf( ">%d<\n", p ); - printf( "number of seqs=%d\n", number_of_seqs ); - exit( -1 ); - } - - fclose( bsp_file_fp ); - fclose( outfile_fp ); - - free( positions ); - free( outfile_ ); - - return; - -} /* bootstrapAccordingToBSPfileP */ - - - - -/* Exits if *p is NULL. */ -/* Last modified: 06/06/01 */ -void checkForMemAllocFailure( void *p ) { - if ( p == NULL ) { - printf( "\nbootstrap_cz: Memory allocation failed.\n" ); - exit( -1 ); - } - else { - return; - } -} /* checkForMemAllocFailure */ - - - -/* Returns 1 if filename can be opened. */ -/* Returns 0 otherwise. */ -/* Last modified: 06/07/01 */ -int fileExists( const char *filename ) { - FILE *fp = NULL; - if ( ( fp = fopen( filename, "r" ) ) != NULL ) { - fclose( fp ); - return 1; - } - else { - return 0; - } -} /* fileExists */ - - - -void errorInCommandLine() { - printf( "\n" ); - printf( " bootstrap_cz version 3.000\n" ); - printf( " ---------------------------\n\n" ); - printf( " Purpose:\n" ); - printf( " Bootstrap resamples an alignment in PHYLIP sequential format times.\n" ); - printf( " Bootstrapping is not done randomly but according to a BSP (bootstrap\n" ); - printf( " positions) file.\n" ); - printf( " The BSP file can be created with the Perl program \"bootstrap_cz.pl\"\n" ); - printf( " in mode 0.\n" ); - printf( " This prgram has the same functionality as \"bootstrap_cz.pl\" in mode 1.\n" ); - printf( " Sequence names are normalized to LENGTH_OF_NAME characters.\n" ); - printf( " The output alignment is in PHYLIP's sequential or interleaved format.\n" ); - printf( " (These two are the same in this case, since all the seqs will be one\n" ); - printf( " line in length (no returns in seq).)\n\n" ); - printf( " Usage: bootstrap_cz \n" ); - printf( " [number of processors]\n\n" ); -} /* errorInCommandLine */ - - - -int main( int argc, char *argv[] ) { - - char *inalign = NULL, - *bsp_file = NULL, - *outfile = NULL; - int bootstraps = 0, - processors = 0; - - - if ( argc != 5 && argc != 6 ) { - errorInCommandLine(); - exit( -1 ); - } - - bootstraps = atoi( argv[ 1 ] ); - inalign = argv[ 2 ]; - bsp_file = argv[ 3 ]; - outfile = argv[ 4 ]; - - if ( bootstraps < 1 ) { - errorInCommandLine(); - exit( -1 ); - } - - if ( argc == 6 ) { - processors = atoi( argv[ 5 ] ); - if ( processors < 1 ) { - errorInCommandLine(); - exit( -1 ); - } - if ( processors > bootstraps ) { - processors = bootstraps; - } - } - - if ( argc == 5 && fileExists( outfile ) == 1 ) { - printf( "\nbootstrap_cz: Error: outfile \"%s\" already exists.\n", - outfile ); - exit( -1 ); - } - - readInAlignment( inalign ); - - if ( argc == 5 ) { - bootstrapAccordingToBSPfile( bootstraps, - bsp_file, - outfile ); - } - else { - bootstrapAccordingToBSPfileP( bootstraps, - processors, - bsp_file, - outfile ); - } - - return 0; - -} /* main */ diff --git a/forester/archive/forester-1.92.tar.Z b/forester/archive/forester-1.92.tar.Z deleted file mode 100644 index feccac7..0000000 Binary files a/forester/archive/forester-1.92.tar.Z and /dev/null differ diff --git a/forester/archive/perl/00README b/forester/archive/perl/00README deleted file mode 100755 index 2103cf4..0000000 --- a/forester/archive/perl/00README +++ /dev/null @@ -1,48 +0,0 @@ -Overview of the Perl scripts in this directory ----------------------------------------------- - -This directory contains a collection of (mostly horrible) -Perl scripts. Some of them are still maintained, such as -phylo_pl.pl. - -Some of the scripts in this directory relay heavily -on forester.pm. - - - -RIO pipeline: -- rio.pl -- makeTree.pl -- p7extract.pl -- multifetch.pl - - -Running a parallelized RIO web server: -- nph-riowebserver -- rio_slave.pl -- rio_slave_driver.pl -- queue.pm - - -To prepare data to be used by RIO: -- bootstrap_cz.pl -- pfam2slx.pl -- extractSWISS-PROT.pl -- extractTrembl.pl -- pfam2pwd.pl - - -To run multiple RIO analyses in an automated fashion: -- Xrio.pl - - -To analyze RIO results (of Xrio.pl runs): -- bootstrapCounter.pl -- bootstrapSelector.pl -- diffFinder3.pl - - -Counting of species in SWISS-PROT and TrEMBL: -- countSpeciesSPTrEMBL.pl - - diff --git a/forester/archive/perl/Xrio.pl b/forester/archive/perl/Xrio.pl deleted file mode 100755 index 08c3ceb..0000000 --- a/forester/archive/perl/Xrio.pl +++ /dev/null @@ -1,585 +0,0 @@ -#!/usr/bin/perl -w -# -# Xrio.pl -# ------- -# Copyright (C) 1999-2001 Washington University School of Medicine -# and Howard Hughes Medical Institute -# All rights reserved -# -# Author: Christian M. Zmasek -# zmasek@genetics.wustl.edu -# http://www.genetics.wustl.edu/eddy/people/zmasek/ -# -# Created: 03/01/01 -# -# Last modified 06/22/01 - - -# Objective. Runs "rio.pl" for each Pfam assignment in "infile". -# -# Usage. rio.pl -# -# species names file: list of species to use for analysis. -# -# This version uses the CE number as identifier for output files.\n"; -# -# Format for infile: -# -# >>3R5.2 CE19648 (CAMBRIDGE) TR:Q9XWB1 protein_id:CAA21778.1 -# // -# -# >>4R79.1 CE19649 Zinc-binding metalloprotease domain (CAMBRIDGE) protein_id:CAB63429.1 -# =Astacin Astacin (Peptidase family M12A) 296.3 3.8e-85 1 -# // -# -# >>4R79.2 CE19650 Ras family (CAMBRIDGE) TR:Q9XXA4 protein_id:CAA20282.1 -# =ras Ras family 208.8 8.1e-59 1 -# =FA_desaturase Fatty acid desaturase 4.5 1.5 1 -# =UPF0117 Domain of unknown function DUF36 3.1 3.5 1 -# =arf ADP-ribosylation factor family -46.0 1.5e-05 1 -# // -# -# -# - -# Xrio.pl /nfs/wol2/people/zmasek/wormpep43_hmmpfam6.2/wormpep43_Hmmpfam_6.2 /nfs/wol2/people/zmasek/species_trees/tree_of_life_bin_1-4_species_list /nfs/wol2/people/zmasek/XrioTEST3 /nfs/wol2/people/zmasek/XrioTEST3/OUTFILE1 /nfs/wol2/people/zmasek/XrioTEST3/LOG1 - - - - - -use strict; - -use FindBin; -use lib $FindBin::Bin; -use rio_module; - - $RIO_PL = "rio.pl"; -my $VERSION = "3.000"; - -my $FASTA_DB = "/nfs/wol2/people/zmasek/DB/wormpep/wormpep43"; -my $QUERY_SPECIES = "CAEEL"; -my $SPECIES_TREE = $SPECIES_TREE_FILE_DEFAULT; - -my $RIOPL_OPTIONS = "T=B P=6 L=0 R=0 U=80 V=0 X=2 Y=2 Z=2 C E I"; - -my $TEMP_DIR = "/tmp/Xriopl"; # Where all the temp files, etc will be created. - -my %Species_names_hash = (); - -my $infile = ""; -my $outfile = ""; # Huge file of all rio outputs. -my $logfile = ""; # Lists all sequences which have been analyzed successfully. -my $output_directory = ""; -my $species_names_file = ""; - - -my $return_line = ""; -my $ID = ""; -my $pfam_name = ""; -my $E_value = 0; -my $score = 0; -my $GA = 0; -my $temp_dir = ""; -my $outname = ""; -my %outnames = (); -my $seqs = 0; -my $ii = 0; -my $time = 0; -my $successful = 0; -my $query_not_aligned = 0; -my $pwd_not_present = 0; -my $already_done = 0; -my $start_date = ""; -my $old_fh = ""; -my %AC_OS = (); # AC -> species name for TrEMBL seqs -my %AC_DE = (); # AC -> description for TrEMBL seqs - -my $description_line = ""; -my $message1 = ""; -my $message2 = ""; - -$start_date = `date`; - -if ( @ARGV != 5 ) { - &errorInCommandLine(); - exit ( -1 ); -} - -$infile = $ARGV[ 0 ]; -$species_names_file = $ARGV[ 1 ]; -$output_directory = $ARGV[ 2 ]; -$outfile = $ARGV[ 3 ]; -$logfile = $ARGV[ 4 ]; - - -if ( -e $outfile ) { - die "\n\n$0: <<$outfile>> already exists.\n\n"; -} -unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { - die "\n\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n\n"; -} -unless ( ( -s $species_names_file ) && ( -f $species_names_file ) && ( -T $species_names_file ) ) { - die "\n\n$0: <<$species_names_file>> does not exist, is empty, or is not a plain textfile.\n\n"; -} -unless ( ( -s $TREMBL_ACDEOS_FILE ) && ( -f $TREMBL_ACDEOS_FILE ) && ( -T $TREMBL_ACDEOS_FILE ) ) { - die "\n\n$0: <<$TREMBL_ACDEOS_FILE>> does not exist, is empty, or is not a plain textfile.\n\n"; -} -unless ( ( -e $output_directory ) && ( -d $output_directory ) ) { - die "\n\n$0: <<$output_directory>> does not exist, or is not a directory.\n\n"; -} - - - -# Reads in the species file: -# -------------------------- -&readSpeciesNamesFile( $species_names_file ); - - - -# Reads in the file containing AC, DE and OS for TrEMBL seqs: -# ----------------------------------------------------------- -open( HH, "$TREMBL_ACDEOS_FILE" ) || die "\n\n$0: Unexpected error: Cannot open file <<$TREMBL_ACDEOS_FILE>>: $!\n\n"; -while ( $return_line = ) { - if ( $return_line =~ /(\S+);([^;]*);(\S+)/ ) { - $AC_OS{ $1 } = $3; - $AC_DE{ $1 } = $2; - } -} -close( HH ); - - - -# Reads in outnames in logfile, if present: -# ----------------------------------------- -if ( ( -s $logfile ) ) { - open( L, "$logfile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$logfile>>: $!\n\n"; - while ( $return_line = ) { - if ( $return_line =~ /\s*(\S+)/ ) { - $outnames{ $1 } = 0; - } - } - close( L ); -} - - - -# Creates the temp directory: -# --------------------------- - -$ii = 0; - -$time = time; - -$temp_dir = $TEMP_DIR.$time.$ii; - -while ( -e $temp_dir ) { - $ii++; - $temp_dir = $TEMP_DIR.$time.$ii; -} - -mkdir( $temp_dir, 0777 ) -|| die "\n\n$0:Unexpected error: Could not create <<$temp_dir>>: $!\n\n"; - -unless ( ( -e $temp_dir ) && ( -d $temp_dir ) ) { - die "\n\n$0:Unexpected error: <<$temp_dir>> does not exist, or is not a directory: $!\n\n"; -} - - - -$message1 = "# $0\n". - "# Version : $VERSION\n". - "# Date started : $start_date". - "# Infile : $infile\n". - "# Species names file : $species_names_file\n". - "# Output directory : $output_directory\n". - "# Outfile : $outfile\n". - "# RIO PWD directory : $RIO_PWD_DIRECTORY\n". - "# RIO BSP directory : $RIO_BSP_DIRECTORY\n". - "# RIO NBD directory : $RIO_NBD_DIRECTORY\n". - "# RIO ALN directory : $RIO_ALN_DIRECTORY\n". - "# RIO HMM directory : $RIO_HMM_DIRECTORY\n". - "# Fasta db : $FASTA_DB\n". - "# Species of query : $QUERY_SPECIES\n". - "# Species tree : $SPECIES_TREE\n". - "# rio.pl options : $RIOPL_OPTIONS\n\n\n"; - -open( IN, "$infile" ) || die "\n\n$0: Cannot open file <<$infile>>: $!\n\n"; -open( LOG, ">> $logfile" ) || die "\n\n$0: Cannot open file <<$logfile>>: $!\n\n"; - - -# Turns off buffering for LOG. -$old_fh = select( LOG ); -$| = 1; -select( $old_fh ); - - -$ID = ""; - -W: while ( $return_line = ) { - - if ( $return_line =~ /^\s*>>.*(CE\d+)/ ) { - $ID = $1; - $return_line =~ /^\s*>>(.+)/; - $description_line = $1; - } - elsif ( $return_line =~ /^\s*\/\// ) { - $ID = ""; - } - elsif ( $return_line =~ /^\s*=(\S+)\s+.+\s+(\S+)\s+(\S+)\s+\S+\s*$/ - && $ID ne "" ) { - - $pfam_name = $1; - $score = $2; - $E_value = $3; - - $outname = $ID.".".$pfam_name; - - # Checks if already done. - if ( %outnames && exists( $outnames{ $outname } ) ) { - $already_done++; - next W; - } - - &executeHmmfetch( $PFAM_HMM_DB, $pfam_name, $temp_dir."/HMMFILE" ); - - $GA = &getGA1cutoff( $temp_dir."/HMMFILE" ); - unlink( $temp_dir."/HMMFILE" ); - - if ( $GA == 2000 ) { - die "\n\n$0: Unexpected error: No GA cutoff found for \"$pfam_name\".\n\n"; - } - elsif ( $score < $GA ) { - next W; - } - - if ( -s $output_directory."/".$outname ) { - unlink( $output_directory."/".$outname ); - } - - - $message1 .= "\n\n". - "# ############################################################################\n". - "# Annotation: $description_line\n". - "# HMM : $pfam_name\n". - "# score : $score\n". - "# E-value : $E_value\n"; - - - - unless ( ( -s $RIO_PWD_DIRECTORY.$pfam_name.$SUFFIX_PWD ) ) { - $pwd_not_present++; - $message1 .= "# No PWD file for this family.\n". - "# ############################################################################\n"; - next W; - } - - - unless ( ( -s $PFAM_SEED_DIRECTORY."/".$pfam_name ) && ( -f $PFAM_SEED_DIRECTORY."/".$pfam_name ) && ( -T $PFAM_SEED_DIRECTORY."/".$pfam_name ) ) { - die "\n\n$0: Error: Pfam seed alignment <<$PFAM_SEED_DIRECTORY"."/"."$pfam_name>> not present.\n\n"; - } - - - &getSequenceFromFastaFile( $FASTA_DB, - $temp_dir."/QUERY", - $ID ); - - &performRIO( $pfam_name, # A= - $temp_dir."/QUERY", # Q= - $output_directory."/".$outname, # O= - $ID."_".$QUERY_SPECIES, # N= - $SPECIES_TREE, # S= - $RIOPL_OPTIONS, # L=0 R=0 U=70 V=0 X=2 Y=2 Z=2 C E K I x - $temp_dir."/riopltempdir" ); # j= - - - - if ( -s $output_directory."/".$outname ) { - $successful++; - } - else { - $message1 .= "# Query has not been aligned (E value too low).\n". - "# ############################################################################\n"; - $query_not_aligned++; - } - - if ( unlink( $temp_dir."/QUERY" ) != 1 ) { - die "\n$0: Unexpected error: File(s) could not be deleted.\n"; - } - - - - if ( -s $output_directory."/".$outname ) { - open( OUT_MESG_ONE, ">$temp_dir/_message1_" ) || die "\n\n$0: Cannot create file \"$temp_dir/_message1_\": $!\n\n"; - print OUT_MESG_ONE ( $message1 ); - close( OUT_MESG_ONE ); - - $message1 = ""; - - open( OUT_MESG_TWO, ">$temp_dir/_message2_" ) || die "\n\n$0: Cannot create file \"$temp_dir/_message2_\": $!\n\n"; - print OUT_MESG_TWO ( "# Successful calculations : $successful\n" ); - print OUT_MESG_TWO ( "# No calculation due to absence of PWD file: $pwd_not_present\n" ); - print OUT_MESG_TWO ( "# Calculation already performed : $already_done\n" ); - print OUT_MESG_TWO ( "# ############################################################################\n" ); - close( OUT_MESG_TWO ); - - if ( -s $outfile ) { - system( "cat $outfile $temp_dir/_message1_ $output_directory/$outname $temp_dir/_message2_ > $outfile"."___" ) - && die "\n\n$0: Could not execute \"cat $outfile $temp_dir/_message1_ $output_directory/$outname $temp_dir/_message2_ > $outfile"."___\": $!\n\n"; - system( "mv", $outfile."___", $outfile ) - && die "\n\n$0: Could not execute \"mv $outfile"."___ $outfile\": $!\n\n"; - } - else { - system( "cat $temp_dir/_message1_ $output_directory/$outname $temp_dir/_message2_ > $outfile" ) - && die "\n\n$0: Could not execute \"cat $temp_dir/_message1_ $output_directory/$outname $temp_dir/_message2_ > $outfile\": $!\n\n"; - - } - - print LOG "$outname\n"; - - unlink( "$temp_dir/_message1_", "$temp_dir/_message2_" ); - - } - - - - } ## End of elsif ( $return_line =~ /^\s*=(\S+)\s+.+\s+(\S+)\s+(\S+)\s+\S+$/ && $ID ne "" ) - -} ## End of while ( $return_line = ) - -close( IN ); -close( LOG ); - - -open( OUT_MESG_TWO, ">$temp_dir/_message2_" ) || die "\n$0: Cannot create file \"$temp_dir/_message2_\": $!\n"; -print OUT_MESG_TWO ( "\n\n# Xrio.pl successfully terminated.\n" ); -print OUT_MESG_TWO ( "# Started : $start_date" ); -print OUT_MESG_TWO ( "# Terminated: ".`date`."\n" ); -print OUT_MESG_TWO ( "# Successful calculations : $successful\n" ); -print OUT_MESG_TWO ( "# No calculation due to absence of PWD file: $pwd_not_present\n" ); -print OUT_MESG_TWO ( "# Calculation already performed : $already_done\n\n" ); -close( OUT_MESG_TWO ); - -if ( -s $outfile ) { - if ( $message1 ne "" ) { - open( OUT_MESG_ONE, ">$temp_dir/_message1_" ) || die "\n$0: Cannot create file \"$temp_dir/_message1_\": $!\n"; - print OUT_MESG_ONE ( $message1 ); - close( OUT_MESG_ONE ); - system( "cat $outfile $temp_dir/_message1_ $temp_dir/_message2_ > $outfile"."___" ) - && die "$0: Could not execute \"cat $outfile $temp_dir/_message1_ $temp_dir/_message2_ > $outfile"."___\": $!"; - } - else { - system( "cat $outfile $temp_dir/_message2_ > $outfile"."___" ) - && die "$0: Could not execute \"cat $outfile $temp_dir/_message2_ > $outfile"."___\": $!"; - } - system( "mv", $outfile."___", $outfile ) - && die "$0: Could not execute \"mv $outfile"."___ $outfile\": $!"; -} -else { - open( OUT_MESG_ONE, ">$temp_dir/_message1_" ) || die "\n$0: Cannot create file \"$temp_dir/_message1_\": $!\n"; - print OUT_MESG_ONE ( $message1 ); - close( OUT_MESG_ONE ); - system( "cat $temp_dir/_message1_ $temp_dir/_message2_ > $outfile" ) - && die "$0: Could not execute \"cat $temp_dir/_message1_ $temp_dir/_message2_ > $outfile\": $!"; -} - -unlink( "$temp_dir/_message1_", "$temp_dir/_message2_" ); - -rmdir( $temp_dir ) || die "\n$0: Unexpected failure (could not remove: $temp_dir): $!\n"; - -print( "\n\nXrio.pl successfully terminated.\n" ); -print( "Successful calculations : $successful\n" ); -print( "No calculation due to absence of PWD file: $pwd_not_present\n" ); -print( "Calculation already performed : $already_done\n" ); -print( "Started : $start_date" ); -print( "Terminated: ".`date`."\n" ); -print( "\n" ); - -exit( 0 ); - - - -# Methods -# ------- - - - -# Gets the gathering cutoff per sequence from a HMM file. -# -# One argument: the HMM file name -# Returns the gathering cutoff per sequence, 2000 upon failure -# Last modified: 07/11/01 -sub getGA1cutoff { - - my $infile = $_[ 0 ]; - my $return_line = ""; - my $GA = 2000; - - &testForTextFilePresence( $infile ); - - open( H, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!"; - while ( $return_line = ) { - - if ( $return_line =~ /^GA\s+(\S+)/ ) { - $GA = $1; - close( H ); - return $GA; - } - - } - close( H ); - return $GA; - -} ## getGA1cutoff - - - - - -# 1. A= Name of Pfam family -# 2. Q= Query file -# 3. O= Output -# 4. N= query Name -# 5. S= Species tree file -# 6. more options, such I K m -# 7. j= Name for temporary directory -sub performRIO { - my $pfam_name = $_[ 0 ]; - my $query_file = $_[ 1 ]; - my $output_file = $_[ 2 ]; - my $name_for_query = $_[ 3 ]; - my $species_tree_file = $_[ 4 ]; - my $more_options = $_[ 5 ]; - my $tmp_file_rio = $_[ 6 ]; - - my $options_for_rio = ""; - - $options_for_rio .= ( " A=".$pfam_name ); - $options_for_rio .= ( " Q=".$query_file ); - $options_for_rio .= ( " O=".$output_file ); - $options_for_rio .= ( " N=".$name_for_query ); - $options_for_rio .= ( " S=".$species_tree_file ); - $options_for_rio .= ( " j=".$tmp_file_rio ); - $options_for_rio .= ( " ".$more_options ); - - system( "$RIO_PL 1 $options_for_rio" ) - && die "$0: performRIO: Could not execute \"$RIO_PL 1 $options_for_rio\": $!\n"; - -} ## performRIO - - - -# Reads in (SWISS-PROT) species names from a file. -# Names must be separated by newlines. -# Lines beginning with "#" are ignored. -# A possible "=" and everything after is ignored. -# One argument: species-names-file name -# Last modified: 04/24/01 -sub readSpeciesNamesFile { - my $infile = $_[ 0 ]; - my $return_line = ""; - my $species = ""; - - unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { - &Error( "\"$infile\" does not exist,\n is empty, or is not a plain textfile." ); - } - - open( IN_RSNF, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!"; - while ( $return_line = ) { - if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)/ ) { - $species = $1; - $species =~ s/=.+//; - $Species_names_hash{ $species } = ""; - } - } - close( IN_RSNF ); - - return; -} ## readSpeciesNamesFile - - - -# Searches the > line of a multiple seq file for a -# query, saves the found entries. -# Three arguments: -# 1. multi Fasta file to search through -# 2. outputfile name -# 3. query -# Last modified: 03/05/01 -sub getSequenceFromFastaFile { - - my $inputfile = $_[ 0 ]; - my $outputfile = $_[ 1 ]; - my $query = $_[ 2 ]; - my $hits = 0; - - open( IN_GSFF, "$inputfile" ) - || die "\n$0: getSequenceFromFastaFile: Cannot open file <<$inputfile>>: $!\n"; - open( OUT_GSFF, ">$outputfile" ) - || die "\n$0: getSequenceFromFastaFile: Cannot create file <<$outputfile>>: $!\n"; - - - while ( $return_line = ) { - if ( $return_line =~ /^\s*>.*$query\s+/ ) { - $hits++; - print $return_line; - print OUT_GSFF $return_line; - $return_line = ; - while ( $return_line && $return_line =~ /^\s*[^>]/ ) { - print OUT_GSFF $return_line; - $return_line = ; - } - last; # In Wormpep there _are_ ambigous CE numbers. - } - - } - - close( IN_GSFF ); - close( OUT_GSFF ); - if ( $hits < 1 ) { - die "\n$0: getSequenceFromFastaFile: Unexpected error: <<$query>> not found.\n"; - } - if ( $hits > 1 ) { - die "\n$0: getSequenceFromFastaFile: Unexpected error: <<$query>> is ambigous.\n"; - } - -} ## getSequenceFromFastaFile - - - - -# Last modified: 03/08/01 -sub errorInCommandLine { - - print "\n"; - print " Xrio.pl $VERSION\n"; - print " -------\n"; - print "\n"; - print " Christian Zmasek (zmasek\@genetics.wustl.edu)\n"; - print "\n"; - print " Purpose. Runs \"rio.pl\" for each Pfam assignment in \"infile\".\n"; - print "\n"; - print " Usage. rio.pl \n"; - print "\n"; - print " infile: has the following format (defined per example):\n"; - print " >>4R79.1 CE19649 Zinc-binding metalloprotease domain (CAMBRIDGE) protein_id:CAB63429.1\n"; - print " =Astacin Astacin (Peptidase family M12A) 296.3 3.8e-85 1\n"; - print " //\n"; - print "\n"; - print " >>4R79.2 CE19650 Ras family (CAMBRIDGE) TR:Q9XXA4 protein_id:CAA20282.1\n"; - print " =ras Ras family 208.8 8.1e-59 1\n"; - print " =FA_desaturase Fatty acid desaturase 4.5 1.5 1\n"; - print " =UPF0117 Domain of unknown function DUF36 3.1 3.5 1\n"; - print " =arf ADP-ribosylation factor family -46.0 1.5e-05 1\n"; - print " //\n"; - print "\n"; - print " species names file: list of species to use for analysis.\n"; - print "\n"; - print " This version uses the CE number as identifier for output files.\n"; - print "\n"; - - exit( -1 ); - -} ## errorInCommandLine - - diff --git a/forester/archive/perl/bootstrapCounter.pl b/forester/archive/perl/bootstrapCounter.pl deleted file mode 100755 index 2aab587..0000000 --- a/forester/archive/perl/bootstrapCounter.pl +++ /dev/null @@ -1,184 +0,0 @@ -#!/usr/bin/perl -w -# -# bootstrapCounter.pl -# ------------------- -# -# Copyright (C) 2001 Washington University School of Medicine -# and Howard Hughes Medical Institute -# All rights reserved -# -# Author: Christian M. Zmasek -# zmasek@genetics.wustl.edu -# http://www.genetics.wustl.edu/eddy/people/zmasek/ -# -# Created: 04/04/01 -# -# Last modified 08/16/01 -# -# -# Objective. Determines the distribution of top orthology bootstrap values -# of a Xrio.pl output file. -# -# Usage. "bootstrapCounter.pl " -# -# Important. The result of this is meaningful ONLY if the thresholds -# for output of the RIO analysis are set to zero (L=0 R=0). -# -# Format for infile: -# ... -# -# # ############################################################################ -# # Annotation: B0511.6 CE17345 helicase (ST.LOUIS) TR:O61815 protein_id:AAC17654.1 -# # HMM : ABC_tran -# # score : -59.6 -# # E-value : 1.1 -# # Query has not been aligned (score lower than gathering cutoff). -# # ############################################################################ -# -# -# # ############################################################################ -# # Annotation: B0511.7 CE17346 (ST.LOUIS) TR:O61817 protein_id:AAC17655.1 -# # HMM : FHA -# # score : 71.6 -# # E-value : 1.7e-17 -# RIO - Resampled Inference of Orthologs -# Version: 1.000 -# ------------------------------------------------------------------------------ -# Alignment file: /tmp/Xriopl9846081980/Full-FHA -# Alignment : FHA domain -# HMM : FHA -# Query file : /tmp/Xriopl9846081980/__queryfile__ -# ============================================================================== -# -# Query : CE17346.FHA_CAEEL/45-114 -# -# Number (in %) of observed orthologies (o) and super orthologies (s) to query -# in bootstrapped trees, evolutionary distance to query: -# -# Sequence Description # o[%] s[%] distance -# -------- ----------- ---- ---- -------- -# YC67_MYCTU/308-372 - 20 14 1.577840 -# FRAH_ANASP/204-277 FRAH PROTEIN. 17 16 1.532670 -# ABA2_NICPL/557-633 ZEAXANTHIN EPOXIDASE PRECURSOR (EC 1.14.-.-). 14 11 1.885700 -# ABA2_LYCES/563-639 ZEAXANTHIN EPOXIDASE PRECURSOR (EC 1.14.-.-). 14 11 2.140000 -# -# -# -# Distance values (based on ML branch length values on consensus tree) -# -------------------------------------------------------------------- -# Given the thresholds for distance calculations: -# No sequence is considered orthologous to query. -# -# ... - - - -use strict; - -my $VERSION = 0.200; - -my $infile = ""; -my $outfile = ""; -my $return_line = ""; -my $results = 0; -my $o_bootstraps = 0; -my $s_bootstraps = 0; -my @o_bootstraps_array = (); -my @s_bootstraps_array = (); -my $total = 0; -my $i = 0; - - -if ( @ARGV != 2 ) { - &errorInCommandLine(); - exit ( -1 ); -} - -$infile = $ARGV[ 0 ]; -$outfile = $ARGV[ 1 ]; - -if ( -e $outfile ) { - die "\n$0: <<$outfile>> already exists.\n"; -} -unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { - die "\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n"; -} - - -open( IN, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n"; - -$results = 0; -for ( $i = 0; $i <= 100; ++$i ) { - $s_bootstraps_array[ $i ] = $o_bootstraps_array[ $i ] = 0; -} - -while ( $return_line = ) { - - if ( $return_line =~ /^\s*--------\s+/ ) { - $results = 1; - } - elsif ( $return_line =~ /^\s*Distance\s+values\s+/i ) { - $results = 0; - } - elsif ( $results == 1 && $return_line =~ /^\s*!NO\s+ORTHOLOGS/ ) { - $o_bootstraps_array[ 0 ]++; - $s_bootstraps_array[ 0 ]++; - $total++; - $results = 0; - } - elsif ( $results == 1 && $return_line =~ /(\S+)\s+(\S+)\s+\S+\s*$/ ) { - $o_bootstraps = $1; - $s_bootstraps = $2; - $results = 0; - if ( $o_bootstraps > 100 || $s_bootstraps > 100 - || $o_bootstraps < 0 ) { - print "o bootstraps: $o_bootstraps\n"; - print "s bootstraps: $s_bootstraps\n"; - die "\n\n$0: Error: Boostrap value(s) out of range.\n\n"; - } - - $total++; - $o_bootstraps_array[ $o_bootstraps ]++; - $s_bootstraps_array[ $s_bootstraps ]++; - - } -} - -close( IN ); - - -open( OUT, ">$outfile" ) || die "\n$0: Cannot create file \"$outfile\": $!\n"; - -print OUT "bootstrapCounter.pl version: $VERSION\n\n"; -print OUT "Distribution of top bootstrap values\n\n"; -print OUT "Input file : $infile\n"; -print OUT "Output file: $outfile\n"; -print OUT "Date : ".`date`."\n"; -print OUT "Total: $total\n\n"; -print OUT "top-orthology-bootstraps vs. count:\n"; -for ( $i = 0; $i < @o_bootstraps_array; ++$i ) { - print OUT "$i $o_bootstraps_array[ $i ]\n"; -} -print OUT "\n\ntop-super-orthology-bootstraps vs. count:\n"; -for ( $i = 0; $i < @s_bootstraps_array; ++$i ) { - print OUT "$i $s_bootstraps_array[ $i ]\n"; -} -close( OUT ); - -print( "\nDone.\n\n" ); - -exit( 0 ); - - - -sub errorInCommandLine { - print "\n"; - print " bootstrapCounter.pl version: $VERSION\n"; - print " Usage: \"bootstrapCounter.pl \"\n"; - print " Important: The result of this is meaningful ONLY if the thresholds\n"; - print " for output of the RIO analysis are set to zero (L=0 R=0).\n"; - print "\n"; - exit( -1 ); -} - - diff --git a/forester/archive/perl/bootstrapSelector.pl b/forester/archive/perl/bootstrapSelector.pl deleted file mode 100755 index ec6ce69..0000000 --- a/forester/archive/perl/bootstrapSelector.pl +++ /dev/null @@ -1,291 +0,0 @@ -#!/usr/bin/perl -w -# -# bootstrapSelector.pl -# -------------------- -# -# Copyright (C) 2001 Washington University School of Medicine -# and Howard Hughes Medical Institute -# All rights reserved -# -# Author: Christian M. Zmasek -# zmasek@genetics.wustl.edu -# http://www.genetics.wustl.edu/eddy/people/zmasek/ -# -# Created: 04/06/01 -# -# Last modified 09/24/01 -# -# -# Objective. Selection of RIO analysis results with top ortholgy -# bootstrap values greater or less than a threshold. -# -# Usage: "bootstrapSelector.pl " -# Options: "l" for "less or equal" ("grater or equal" is default) -# "c" for "all hits must meet threshold in case of -# multiple copies of the same domain in the query" -# (default: "at least one") -# Example: "bootstrapSelector.pl 95lc OUTFILE_At_1 At_1_out" -# -# Important. The result of this is meaningful ONLY if the thresholds -# for output of the RIO analysis are set to zero (L=0 R=0). -# -# -# Format for infile: -# -# ... -# -# # ############################################################################ -# # Annotation: B0511.6 CE17345 helicase (ST.LOUIS) TR:O61815 protein_id:AAC17654.1 -# # HMM : ABC_tran -# # score : -59.6 -# # E-value : 1.1 -# # Query has not been aligned (score lower than gathering cutoff). -# # ############################################################################ -# -# -# # ############################################################################ -# # Annotation: B0511.7 CE17346 (ST.LOUIS) TR:O61817 protein_id:AAC17655.1 -# # HMM : FHA -# # score : 71.6 -# # E-value : 1.7e-17 -# RIO - Resampled Inference of Orthologs -# Version: 1.000 -# ------------------------------------------------------------------------------ -# Alignment file: /tmp/Xriopl9846081980/Full-FHA -# Alignment : FHA domain -# HMM : FHA -# Query file : /tmp/Xriopl9846081980/__queryfile__ -# ============================================================================== -# -# Query : CE17346.FHA_CAEEL/45-114 -# -# Number (in %) of observed orthologies (o) and super orthologies (s) to query -# in bootstrapped trees, evolutionary distance to query: -# -# Sequence Description # o[%] s[%] distance -# -------- ----------- ---- ---- -------- -# YC67_MYCTU/308-372 - 20 14 1.577840 -# FRAH_ANASP/204-277 FRAH PROTEIN. 17 16 1.532670 -# ABA2_NICPL/557-633 ZEAXANTHIN EPOXIDASE PRECURSOR (EC 1.14.-.-). 14 11 1.885700 -# ABA2_LYCES/563-639 ZEAXANTHIN EPOXIDASE PRECURSOR (EC 1.14.-.-). 14 11 2.140000 -# -# -# -# Distance values (based on ML branch length values on consensus tree) -# -------------------------------------------------------------------- -# Given the thresholds for distance calculations: -# No sequence is considered orthologous to query. -# -# ... - - - -use strict; - -my $VERSION = 1.000; -my $threshold = 0; -my $infile = ""; -my $outfile = ""; -my $summary_outfile = ""; -my $return_line = ""; -my $identifier = ""; -my $top1 = ""; -my $analysis_performed = 0; -my $reading = 0; -my $i = 0; -my @lines = (); -my $larger = 1; -my $complete = 0; -my $total = 0; - -if ( @ARGV != 3 ) { - &errorInCommandLine(); - exit ( -1 ); -} - -$threshold = $ARGV[ 0 ]; -$infile = $ARGV[ 1 ]; -$outfile = $ARGV[ 2 ]; -$summary_outfile = $outfile.".short"; - -if ( -e $outfile ) { - die "\n$0: <<$outfile>> already exists.\n"; -} -unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { - die "\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n"; -} - - -if ( $threshold =~ /l/ ) { - $larger = 0; - $threshold =~ s/l//; -} -if ( $threshold =~ /c/ ) { - $complete = 1; - $threshold =~ s/c//; -} - -open( IN, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n"; - -open( OUT, ">$outfile" ) || die "\n$0: Cannot create file \"$outfile\": $!\n"; -open( OUT_SUMMARY, ">$summary_outfile" ) || die "\n$0: Cannot create file \"$summary_outfile\": $!\n"; - -print OUT "bootstrapSelector.pl version: $VERSION\n\n"; -print OUT "Selection of RIO analysis results with top ortholgy\n"; -print OUT "bootstrap values greater or less than a threshold.\n\n"; -if ( $larger == 1 ) { - print OUT "Threshold : Grater than or equal to $threshold\n"; -} -else { - print OUT "Threshold : Less than or equal to $threshold\n"; -} -print OUT "In case of multiple copies of the same domain in the query:\n"; -if ( $complete == 1 ) { - print OUT "All hits must meet threshold.\n"; -} -else { - print OUT "At least one hit must meet threshold.\n"; -} -print OUT "Input file : $infile\n"; -print OUT "Output file : $outfile\n"; -print OUT "Output file short: $summary_outfile\n"; -print OUT "Date : ".`date`."\n\n\n"; - -while ( $return_line = ) { - - if ( $return_line =~ /^\s*# Annotation:\s*(.+)/ ) { - $identifier = $1; - $identifier = substr( $identifier, 0, 60); - $analysis_performed = 0; - $reading = 1; - $i = 0; - @lines = (); - } - - if ( $reading == 1 && $return_line =~ /^\s*RIO/ ) { - $analysis_performed = 1; - } - - if ( $reading == 1 - && $return_line =~ /^\s*# ####################################/ ) { - if ( $analysis_performed == 1 ) { - &analyze(); - } - $reading = 0; - } - - if ( $reading == 1 ) { - $lines[ $i++ ] = $return_line; - } -} - -close( IN ); - -print OUT "\n\nTotal: $total\n"; - -close( OUT ); -close( OUT_SUMMARY ); - -print "\nTotal: $total\n"; -print "Done.\n\n"; - -exit( 0 ); - - -sub analyze { - my $j = 0; - my $results = 0; - my $o_bootstraps = 0; - $top1 = ""; - - for ( $j = 0; $j < $i; $j++ ) { - - if ( $lines[ $j ] =~ /^\s*--------\s+/ ) { - $results = 1; - } - elsif ( $lines[ $j ] =~ /^\s*Distance\s+values\s+/i ) { - $results = 0; - } - elsif ( $results == 1 - && ( $lines[ $j ] =~ /\S+\s+\S+\s+\S+\s*$/ - || $lines[ $j ] =~ /^\s*!NO\s+ORTHOLOGS/ ) ) { - - if ( $lines[ $j ] =~ /^\s*!NO\s+ORTHOLOGS/ ) { - $o_bootstraps = 0; - } - else { - $lines[ $j ] =~ /(\S+)\s+\S+\s+\S+\s*$/; - $o_bootstraps = $1; - if ( $top1 eq "" ) { - $top1 = $lines[ $j ]; - $top1 =~ s/\n//; - $top1 =~ s/\s{2,}/ /g; - } - } - - $results = 0; - - if ( $o_bootstraps > 100 || $o_bootstraps < 0 ) { - print "o bootstraps: $o_bootstraps\n"; - die "\n\n$0: Error: Boostrap value(s) out of range.\n\n"; - } - - if ( $larger == 1 ) { - if ( $complete != 1 && $o_bootstraps >= $threshold ) { - &writeout(); - $total++; - return; - } - elsif ( $complete == 1 && $o_bootstraps < $threshold ) { - return; - } - } - else { - if ( $complete != 1 && $o_bootstraps <= $threshold ) { - &writeout(); - $total++; - return; - } - elsif ( $complete == 1 && $o_bootstraps > $threshold ) { - return; - } - } - } - } - if ( $complete == 1 ) { - &writeout(); - $total++; - } - return; -} - - - -sub writeout { - my $j = 0; - print OUT "# ############################################################################\n"; - for ( $j = 0; $j < $i; ++$j ) { - print OUT "$lines[ $j ]"; - } - print OUT "# ############################################################################\n\n\n"; - print OUT_SUMMARY "$identifier [top 1: $top1]\n\n"; -} - - - -sub errorInCommandLine { - print "\n"; - print " bootstrapCounter.pl version: $VERSION\n"; - print " Usage: \"bootstrapSelector.pl \"\n"; - print " Options: \"l\" for \"less or equal\" (\"grater or equal\" is default)\n"; - print " \"c\" for \"all hits must meet threshold in case of\n"; - print " multiple copies of the same domain in the query\"\n"; - print " (default: \"at least one\")\n"; - print " Example:\n"; - print " \"bootstrapSelector.pl 95lc OUTFILE_At_1 At_1_out\"\n\n"; - print " Important: The result of this is meaningful ONLY if the thresholds\n"; - print " for output of the RIO analysis are set to zero (L=0 R=0).\n\n"; - exit( -1 ); -} - - diff --git a/forester/archive/perl/bootstrap_cz.pl b/forester/archive/perl/bootstrap_cz.pl deleted file mode 100755 index 0c020a5..0000000 --- a/forester/archive/perl/bootstrap_cz.pl +++ /dev/null @@ -1,325 +0,0 @@ -#!/usr/bin/perl -w - -# bootstrap_cz.pl -# --------------- -# Copyright (C) 1999-2003 Washington University School of Medicine -# and Howard Hughes Medical Institute -# All rights reserved -# -# Author: Christian M. Zmasek -# zmasek@genetics.wustl.edu -# http://www.genetics.wustl.edu/eddy/people/zmasek/ -# -# Created: 05/17/01 -# -# Last modified 08/26/03 -# -# Purpose: -# Bootstrap resamples an alignment in PHYLIP sequential format -# times. -# Amino acid sequences must only be represented by uppercase letters (A-Z) -# and '-'. -# In mode 0 it saves the positions which it used to create the -# bootstrapped alignment into . -# Mode 1 allows to recreate exactly the same boostrapped alignment -# by reading in a . -# Sequence names are normalized to $LENGTH_OF_NAME characters. -# The output alignment is in PHYLIP's sequential or interleaved format. -# (These two are the same in this case, since all the seqs will be one -# line in length (no returns in seq).) -# -# Usage: -# bootstrap_cz.pl -# -# [random number seed (mode 0 only)] -# - -use strict; -use FindBin; -use lib $FindBin::Bin; - -use rio_module; - -my $VERSION = "2.001"; - -my $modus = -1; # 0 to create pos. file, 1 to use premade pos. file -my $bootstraps = -1; -my $infile = ""; -my $outalign_file = ""; -my $positions_file = ""; -my $seed = -1; - - -$modus = $ARGV[ 0 ]; -$bootstraps = $ARGV[ 1 ]; -$infile = $ARGV[ 2 ]; -$outalign_file = $ARGV[ 3 ]; -$positions_file = $ARGV[ 4 ]; -$seed = $ARGV[ 5 ]; - -if ( @ARGV != 5 && @ARGV != 6 ) { - &printUsage(); - exit( -1 ); -} - -if ( $modus != 0 && $modus != 1 ) { - &printUsage(); - exit( -1 ); -} - -if ( $modus == 0 && @ARGV != 6 ) { - &printUsage(); - exit( -1 ); -} - -if ( $modus == 1 && @ARGV != 5 ) { - &printUsage(); - exit( -1 ); -} - -if ( $bootstraps < 1 ) { - &printUsage(); - exit( -1 ); -} - -if ( $seed && $seed < 0 ) { - &printUsage(); - exit( -1 ); -} - - -unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { - die "\n\nbootstrap_cz.pl: \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n"; -} -if ( -e $outalign_file ) { - die "\n\nbootstrap_cz.pl: \"$outalign_file\" already exists.\n\n"; -} - -if ( $modus == 0 ) { - if ( -e $positions_file ) { - die "\n\nbootstrap_cz.pl: \"$positions_file\" already exists.\n\n"; - } -} -else { - unless ( ( -s $positions_file ) && ( -f $positions_file ) && ( -T $positions_file ) ) { - die "\n\nbootstrap_cz.pl: \"$positions_file\" does not exist, is empty, or is not a plain textfile.\n\n"; - } -} - -if ( $modus == 0 ) { - &bootstrap( $modus, $bootstraps, $infile, $outalign_file, $positions_file, $seed ); -} -else { - &bootstrap( $modus, $bootstraps, $infile, $outalign_file, $positions_file ); -} - - -exit( 0 ); - - - -# Methods -# ------- - - -# Five/six arguemnts: -# 1. Mode: 0 to create pos. file, 1 to use premade pos. file -# 2. bootstraps -# 3. Alignment infile name -# 4. Outfile name -# 5. file name for positions file (created if mode is 0, read if mode is 1) -# [6. If modus is 0: seed for random number generator] -# -# This method is very similar to method "pfam2phylip" "in makeTree.pl". -# -# Last modified: 05/17/01 -# -sub bootstrap { - - my $modus = $_[ 0 ]; - my $bootstraps = $_[ 1 ]; - my $infile = $_[ 2 ]; - my $outalign_file = $_[ 3 ]; - my $positions_file = $_[ 4 ]; - - my @seq_name = (); - my @seq_array = (); - my @random_numbers = (); - my $return_line = ""; - my $seq = ""; - my $x = 0; - my $y = 0; - my $seq_no = 0; - my $original_length = 0; - my $max_x = 0; - my $n = 0; - my $i = 0; - my $random = -1; - my $length = 0; - my $number_of_seqs = 0; - my $number_of_colm = 0; - - - # Checks the arguments - # -------------------- - - if ( $modus == 0 ) { - if ( !$_[ 5 ] ) { - die "\n\n$0: bootstrap: Failed to give a seed for random number generator.\n\n"; - } - srand( $_[ 5 ] ); - } - elsif( $modus == 1 ) { - if ( $_[ 5 ] ) { - die "\n\n$0: bootstrap: Must not give a seed for random number generator.\n\n"; - } - unless ( ( -s $positions_file ) && ( -f $positions_file ) && ( -T $positions_file ) ) { - die "\n\n$0: bootstrap: <<$positions_file>> does not exist, is empty, or is not a plain textfile.\n\n"; - } - } - else { - die "\n\n$0: bootstrap: modus must be either 0 or 1.\n\n"; - } - - unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { - die "\n\n$0: bootstrap: <<$infile>> does not exist, is empty, or is not a plain textfile.\n\n"; - } - - - - # Reads in the alignment - # ---------------------- - - open( IN, "$infile" ) || die "\n$0: bootstrap: Cannot open file <<$infile>>: $!"; - while ( $return_line = ) { - - if ( $return_line =~ /^\s*(\d+)\s+(\d+)/ ) { - $number_of_seqs = $1; - $number_of_colm = $2; - } - elsif ( $return_line =~ /^(\S+)\s+(\S+)/ ) { - $seq_name[ $seq_no ] = substr( $1, 0, $LENGTH_OF_NAME ); - $seq = $2; - if ( $original_length == 0 ) { - $original_length = length( $seq ); - } - elsif ( $original_length != length( $seq ) ) { - die "\n\n$0: Sequences do not have the same length.\n\n"; - } - for ( $x = 0; $x < $original_length; $x++ ) { - $seq_array[ $x ][ $seq_no ] = substr( $seq, $x, 1 ); - } - $seq_no++; - } - } - close( IN ); - - if ( ( $number_of_seqs != $seq_no ) - || ( $number_of_colm != $original_length ) ) { - die "\n\n$0: Number of sequences or number of columns are inconsisten with the values given in the alignment.\n\n"; - } - - # Adusts the length of the names to $LENGTH_OF_NAME - # ------------------------------------------------- - - for ( $y = 0; $y < $seq_no; $y++ ) { - $length = length( $seq_name[ $y ] ); - for ( $i = 0; $i <= ( $LENGTH_OF_NAME - $length - 1 ); $i++ ) { - $seq_name[ $y ] .= " "; - } - } - - - - # Bootstraps $bootstraps times and writes the outputfiles - # ------------------------------------------------------- - - open( OUT, ">$outalign_file" ) || die "\n\n$0: bootstrap: Cannot create file <<$outalign_file>>: $!"; - if ( $modus == 0 ) { - open( OUT_P, ">$positions_file" ) || die "\n\n$0: bootstrap: Cannot create file <<$positions_file>>: $!"; - } - else { - open( IN_P, "$positions_file" ) || die "\n\n$0: bootstrap: Cannot open file <<$positions_file>>: $!"; - } - - for ( $n = 0; $n < $bootstraps; $n++ ) { - - if ( $modus == 0 ) { - for ( $x = 0; $x < $original_length; $x++ ) { - $random = int( rand( $original_length ) ); - print OUT_P "$random "; - $random_numbers[ $x ] = $random; - } - print OUT_P "\n"; - } - else { - $return_line = ; - if ( !$return_line || $return_line !~ /\d/ ) { - die "\n\n$0: bootstrap: <<$positions_file>> seems too short or otherwise unsuitable.\n\n"; - } - $return_line =~ s/^\s+//; - $return_line =~ s/\s+$//; - @random_numbers = split( /\s+/, $return_line ); - if ( scalar( @random_numbers ) != $original_length ) { - die "\n\n$0: bootstrap: <<$positions_file>> seems not to correspond to <<$infile>>.\n\n"; - } - } - - print OUT " $seq_no $original_length\n"; - - for ( $y = 0; $y < $seq_no; $y++ ) { - print OUT "$seq_name[ $y ]"; - - for ( $x = 0; $x < $original_length; $x++ ) { - $random = $random_numbers[ $x ]; - if ( !$seq_array[ $random ][ $y ] || $seq_array[ $random ][ $y ] !~ /[A-Z]|-/ ) { - die "\n\n$0: Sequence must be represented by uppercase letters A-Z and \"-\" only.\n\n"; - } - print OUT $seq_array[ $random ][ $y ]; - } - print OUT "\n"; - } - } - - close( OUT ); - - if ( $modus == 0 ) { - print OUT_P "\n"; - close( OUT_P ); - } - else { - close( IN_P ); - } - - return; - -} ## bootstrap - - - -sub printUsage { - print "\n"; - print " bootstrap_cz.pl $VERSION\n"; - print " ---------------\n"; - print "\n"; - print " Christian Zmasek (zmasek\@genetics.wustl.edu)\n"; - print "\n"; - print " Purpose:\n"; - print " Bootstrap resamples an alignment in PHYLIP sequential format\n"; - print " times.\n"; - print " In mode 0 it saves the positions which it used to create the\n"; - print " bootstrapped alignment into .\n"; - print " Mode 1 allows to recreate exactly the same boostrapped alignment\n"; - print " by reading in a .\n"; - print " Sequence names are normalized to $LENGTH_OF_NAME characters.\n"; - print " The output alignment is in PHYLIP's sequential or interleaved format.\n"; - print " (These two are the same in this case, since all the seqs will be one\n"; - print " line in length (no returns in seq).)\n"; - print "\n"; - print " Usage:\n"; - print " bootstrap_cz.pl \n"; - print " \n"; - print " [random number seed (mode 0 only)]\n"; - print "\n"; -} ## printUsage - diff --git a/forester/archive/perl/countSpeciesSPTrEMBL.pl b/forester/archive/perl/countSpeciesSPTrEMBL.pl deleted file mode 100755 index 1e9a626..0000000 --- a/forester/archive/perl/countSpeciesSPTrEMBL.pl +++ /dev/null @@ -1,150 +0,0 @@ -#!/usr/bin/perl -W - -# countSpeciesSPTrEMBL.pl -# ----------------------- -# -# Copyright (C) 2003 Christian M. Zmasek -# All rights reserved -# -# Created: 02/27/03 -# Last modified: 02/27/03 -# Author: Christian M. Zmasek -# zmasek@genetics.wustl.edu -# http://www.genetics.wustl.edu/eddy/people/zmasek/ -# -# Last modified 05/23/02 - -# Purpose. Counts species in SWISS-PROT and TrEMBL. -# -# Usage. countSpeciesSPTrEMBL.pl -# - - -use strict; - - -my $VERSION = "1.000"; -my $infile_sp = ""; -my $infile_tr = ""; -my $outfile = ""; - -my $return_line = ""; -my $read = 0; -my $os = ""; -my %species_count = (); # full name -> count. - - -if ( @ARGV != 3 ) { - &errorInCommandLine(); -} - -$infile_tr = $ARGV[ 0 ]; -$infile_sp = $ARGV[ 1 ]; -$outfile = $ARGV[ 2 ]; - - - -if ( -e $outfile ) { - die "\n$0: <<$outfile>> already exists.\n\n"; -} -unless ( ( -s $infile_tr ) && ( -f $infile_tr ) && ( -T $infile_tr ) ) { - die "\n$0: <$infile_tr>> does not exist, is empty, or is not a plain textfile.\n\n"; -} -unless ( ( -s $infile_sp ) && ( -f $infile_sp ) && ( -T $infile_sp ) ) { - die "\n$0: <<$infile_sp>> does not exist, is empty, or is not a plain textfile.\n\n"; -} - -open( IN_TR, "$infile_tr" ) || die "\n$0: Cannot open file <<$infile_tr>>: $!\n"; -open( IN_SP, "$infile_sp" ) || die "\n$0: Cannot open file <<$infile_sp>>: $!\n"; -open( OUT, ">$outfile" ) || die "\n$0: Cannot create file <<$outfile>>: $!\n"; - - -$read = 0; - -while ( $return_line = ) { - if ( $return_line =~ /^AC\s+(\S+);/ ) { - $read = 1; - } - elsif ( $return_line =~ /^OS\s+(.+)\.\s*$/ && $read == 1 ) { - $os = $1; - $os =~ s/$.+$//g; - $os =~ s/^\s+//; - $os =~ s/\s+$//; - $os =~ s/\.$//; - if ( exists( $species_count{ $os } ) ) { - $species_count{ $os } = $species_count{ $os } + 1; - } - else { - $species_count{ $os } = 1; - } - print "$os\n"; - } - elsif ( $return_line =~ /^\/\// && $read == 1 ) { - $read = 0; - $os = ""; - } -} - -close( IN_TR ); - -$read = 0; -$os = ""; -$return_line = ""; - -while ( $return_line = ) { - if ( $return_line =~ /^ID\s+(\S+)/ ) { - $read = 1; - } - elsif ( $return_line =~ /^OS\s+(.+)\s*$/ && $read == 1 ) { - $os = $1; - $os =~ s/\(.+//g; - $os =~ s/^\s+//; - $os =~ s/\s+$//; - $os =~ s/\.$//; - $read = 0; - if ( exists( $species_count{ $os } ) ) { - $species_count{ $os } = $species_count{ $os } + 1; - } - else { - $species_count{ $os } = 1; - } - print "$os\n"; - } - elsif ( $return_line =~ /^\/\// && $read == 1 ) { - $read = 0; - $os = ""; - } -} - -close( IN_SP ); - - -foreach my $species ( sort { $species_count{ $b } <=> $species_count{ $a } } keys %species_count ) { - print OUT "$species: $species_count{$species}\n"; -} - - -print "\n\nDone!\n\n"; - -close( OUT ); - -exit( 0 ); - - - - - - -sub errorInCommandLine { - print "\n"; - print " countSpeciesSPTrEMBL.pl $VERSION\n"; - print " -----------------------\n"; - print "\n"; - print " Christian Zmasek (zmasek\@genetics.wustl.edu)\n"; - print "\n"; - print " Purpose. Counts species in SWISS-PROT and TrEMBL.\n"; - print "\n"; - print " Usage. countSpeciesSPTrEMBL.pl \n"; - print "\n"; - exit( -1 ); -} diff --git a/forester/archive/perl/extractSWISS-PROT.pl b/forester/archive/perl/extractSWISS-PROT.pl deleted file mode 100755 index 7bcfec0..0000000 --- a/forester/archive/perl/extractSWISS-PROT.pl +++ /dev/null @@ -1,176 +0,0 @@ -#!/usr/bin/perl -W - -# extractSWISS-PROT.pl -# -------------------- -# -# Copyright (C) 2001 Washington University School of Medicine -# and Howard Hughes Medical Institute -# All rights reserved -# -# Created: 09/25/01 -# Author: Christian M. Zmasek -# zmasek@genetics.wustl.edu -# http://www.genetics.wustl.edu/eddy/people/zmasek/ -# -# Last modified 05/23/02 - -# Purpose. Extracts ID, DE, and species from a "sprot.dat" file. -# The output is used by "rio.pl". -# If a species list (format: SWISS-PROT-code=full name) is supplied: -# only sequences from species found in this list are written to -# outfile (recommended). -# -# Usage. extractSWISS-PROT.pl [species list] - -# Remark. Need to re-run this if species in species tree or species list -# are added/changed or if a new version of Pfam is used!! - - -use strict; - - -my $VERSION = "1.001"; -my $infile = ""; -my $outfile = ""; -my $speciesfile = ""; -my $return_line = ""; -my $read = 0; -my $ac = ""; -my $de = ""; -my $os = ""; -my %Species_names = (); # SWISS-PROT name -> "". -my $i = 0; - -if ( @ARGV != 2 && @ARGV != 3 ) { - &errorInCommandLine(); -} - -$infile = $ARGV[ 0 ]; -$outfile = $ARGV[ 1 ]; - -if ( @ARGV == 3 ) { - $speciesfile = $ARGV[ 2 ]; - unless ( ( -s $speciesfile ) && ( -f $speciesfile ) && ( -T $speciesfile ) ) { - die "\n$0: <<$speciesfile>> does not exist, is empty, or is not a plain textfile.\n\n"; - } - &readSpeciesNamesFile( $speciesfile ); -} - -if ( -e $outfile ) { - die "\n$0: <<$outfile>> already exists.\n\n"; -} -unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { - die "\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n\n"; -} - -open( IN, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n"; -open( OUT, ">$outfile" ) || die "\n$0: Cannot create file <<$outfile>>: $!\n"; - -print OUT "# extractTrembl.pl version: $VERSION\n"; -print OUT "# trembl.dat file: $infile\n"; -print OUT "# output file : $outfile\n"; -print OUT "# species file : $speciesfile\n"; -print OUT "# date : ".`date`."\n\n"; - -$read = 0; - -while ( $return_line = ) { - if ( $return_line =~ /^ID\s+(\S+)/ ) { - $ac = $1; - $read = 1; - if ( $ac =~ /[A-Z0-9]+_([A-Z0-9]+)/ ) { - $os = $1; - } - else { - die "\n$0: Unexpected format: $ac.\n\n"; - } - if ( $speciesfile ne "" ) { - unless ( exists( $Species_names{ $os } ) ) { - $read = 0; - $ac = ""; - $de = ""; - $os = ""; - next; - } - } - } - elsif ( $return_line =~ /^DE\s+(.+)/ && $read == 1 ) { - if ( $de ne "" ) { - $de .= " ".$1; - } - else { - $de = $1; - } - } - elsif ( $return_line =~ /^\/\// && $read == 1 ) { - $read = 0; - print OUT "$ac;$de;$os\n"; - $ac = ""; - $de = ""; - $os = ""; - $i++; - } -} - -close( IN ); - -print OUT "\n # $i entries.\n"; - -close( OUT ); - -exit( 0 ); - - - -# Reads in species file. -# Format: SWISS-PROT=full name (e.g. "BACSU=Bacillus subtilis") -# Lines beginning with "#" are ignored. -# One argument: species file-name -# Last modified: 04/24/01 -sub readSpeciesNamesFile { - my $infile = $_[ 0 ]; - my $return_line = ""; - my $sp = ""; - my $full = ""; - - unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { - die "\n$0: readSpeciesNamesFile: <<$infile>> does not exist, is empty, or is not a plain textfile.\n"; - } - - open( IN_RSNF, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n"; - while ( $return_line = ) { - if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)=(.+)/ ) { - $sp = $1; - $full = $2; - $full =~ s/^\s+//; - $full =~ s/\s+$//; - $Species_names{ $sp } = ""; - } - } - close( IN_RSNF ); - - return; -} - - - -sub errorInCommandLine { - print "\n"; - print " extractSWISS-PROT.pl $VERSION\n"; - print " --------------------\n"; - print "\n"; - print " Christian Zmasek (zmasek\@genetics.wustl.edu)\n"; - print "\n"; - print " Purpose. Extracts ID, DE, and species from a \"sprot.dat\" file.\n"; - print " The resulting output is used by \"rio.pl\".\n"; - print " If a species list (format: SWISS-PROT-code=full name) is supplied:\n"; - print " only sequences from species found in this list are written to\n"; - print " outfile (recommended).\n"; - print "\n"; - print " Usage. extractSWISS-PROT.pl [species list]\n"; - print "\n"; - print " Remark. Need to re-run this if species in species tree or species list\n"; - print " are added/changed or if a new version of Pfam is used!!\n"; - print "\n\n"; - exit( -1 ); -} diff --git a/forester/archive/perl/extractSpecies.pl b/forester/archive/perl/extractSpecies.pl deleted file mode 100755 index 32a40de..0000000 --- a/forester/archive/perl/extractSpecies.pl +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/perl -W - -# extractSpecies.pl -# ---------------- -# -# Copyright (C) 2003 Christian M. Zmasek -# All rights reserved -# -# Created: 09/03/03 -# Author: Christian M. Zmasek -# zmasek@genetics.wustl.edu -# http://www.genetics.wustl.edu/eddy/people/zmasek/ -# -# Last modified 03/12/04 (Added gg) - -# Purpose. Adds species information to a file describing a phylogenetic -# tree in the following format (by way of example): -# "((ceINX_CE33055:0.02883,cbINX_CB09748:0.02934):0.36899[&&NHX:B=100],..." -# ce stands for "CAEEL". The hash %SPECIES needs to be set accordingly. -# - - -use strict; - - -my %SPECIES = ( - "dm" => "DROME", - "ag" => "ANOGA", - "ce" => "CAEEL", - "cb" => "CAEBR", - "ci" => "CIOIN", - "fr" => "FUGRU", - "gg" => "CHICK", - "rn" => "RAT", - "mm" => "MOUSE", - "hs" => "HUMAN" - ); - - -my $infile = ""; -my $outfile = ""; -my $intree = ""; -my $return_line = ""; - -if ( @ARGV != 1 && @ARGV != 2 ) { - &errorInCommandLine(); -} - -$infile = $ARGV[ 0 ]; - -if ( @ARGV == 1 ) { - $outfile = $infile; - $outfile =~ s/\.nhx$//; - $outfile .= "_species.nhx"; -} - -if ( @ARGV == 2 ) { - $outfile = $ARGV[ 1 ]; -} - - - - -if ( -e $outfile ) { - die "\n$0: <<$outfile>> already exists.\n\n"; -} -unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { - die "\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n\n"; -} - -open( IN, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n"; -open( OUT, ">$outfile" ) || die "\n$0: Cannot create file <<$outfile>>: $!\n"; - -while ( $return_line = ) { - $return_line =~ s/\s+//g; - $return_line =~ s/\+/_/g; - - $intree .= $return_line; - -} - -close( IN ); - -while ( ( my $short, my $long ) = each ( %SPECIES ) ) { - - while ( $intree =~ /[(),]($short[^\[]+?)[(),]/ ) { - - my $name_and_length = $1; - - print "$name_and_length -> $name_and_length\[\&\&NHX:S=$long\]\n"; - - $intree =~ s/$name_and_length/$name_and_length\[&&NHX:S=$long\]/; - - } - -} - -print OUT $intree; - -close( OUT ); - -print "\n\nDone!\n\n"; - -exit( 0 ); - - - -sub errorInCommandLine { - print "\n"; - print "extractSpecies.pl infile [outfile]"; - print "\n\n"; - exit( -1 ); -} diff --git a/forester/archive/perl/extractTrembl.pl b/forester/archive/perl/extractTrembl.pl deleted file mode 100755 index fadf216..0000000 --- a/forester/archive/perl/extractTrembl.pl +++ /dev/null @@ -1,199 +0,0 @@ -#!/usr/bin/perl -W - -# extractTrembl.pl -# ---------------- -# -# Copyright (C) 2001 Washington University School of Medicine -# and Howard Hughes Medical Institute -# All rights reserved -# -# Created: 04/24/01 -# Author: Christian M. Zmasek -# zmasek@genetics.wustl.edu -# http://www.genetics.wustl.edu/eddy/people/zmasek/ -# -# Last modified 05/23/02 - -# Purpose. Extracts AC, DE, and OS from a "trembl.dat" file. -# The output is used by "rio.pl". -# If a species list (format: SWISS-PROT-code=full name) is supplied: -# only sequences from species found in this list are written to -# outfile and their full species names replaced with their SWISS-PROT -# code (recommended). -# -# Usage. extractTrembl.pl [species list] - -# Remark. Need to re-run this if species in species tree or species list -# are added/changed or if a new version of Pfam is used!! - -# Some "heuristic" is required for Synechococcus, Synechocystis, Anabaena: -# see below. - -use strict; - - -my $VERSION = "1.001"; -my $infile = ""; -my $outfile = ""; -my $speciesfile = ""; -my $return_line = ""; -my $read = 0; -my $ac = ""; -my $de = ""; -my $os = ""; -my %Species_names = (); # full name -> SWISS-PROT name. -my $i = 0; - -if ( @ARGV != 2 && @ARGV != 3 ) { - &errorInCommandLine(); -} - -$infile = $ARGV[ 0 ]; -$outfile = $ARGV[ 1 ]; - -if ( @ARGV == 3 ) { - $speciesfile = $ARGV[ 2 ]; - unless ( ( -s $speciesfile ) && ( -f $speciesfile ) && ( -T $speciesfile ) ) { - die "\n$0: <<$speciesfile>> does not exist, is empty, or is not a plain textfile.\n\n"; - } - &readSpeciesNamesFile( $speciesfile ); -} - -if ( -e $outfile ) { - die "\n$0: <<$outfile>> already exists.\n\n"; -} -unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { - die "\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n\n"; -} - -open( IN, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n"; -open( OUT, ">$outfile" ) || die "\n$0: Cannot create file <<$outfile>>: $!\n"; - -print OUT "# extractTrembl.pl version: $VERSION\n"; -print OUT "# trembl.dat file: $infile\n"; -print OUT "# output file : $outfile\n"; -print OUT "# species file : $speciesfile\n"; -print OUT "# date : ".`date`."\n\n"; - -$read = 0; - -while ( $return_line = ) { - if ( $return_line =~ /^AC\s+(\S+);/ ) { - $ac = $1; - $read = 1; - } - elsif ( $return_line =~ /^DE\s+(.+)/ && $read == 1 ) { - if ( $de ne "" ) { - $de .= " ".$1; - } - else { - $de = $1; - } - } - - elsif ( $return_line =~ /^OS\s+(.+)\.\s*$/ && $read == 1 ) { - $os = $1; - if ( $speciesfile ne "" - && ( $os =~ /Synechococcus/ - || $os =~ /Synechocystis/ - || $os =~ /Anabaena/ ) ) { - if ( $os =~ /PCC\s*(\d+)/ ) { - $os = "PCC ".$1; - } - else { - $read = 0; - $ac = ""; - $de = ""; - $os = ""; - next; - } - } - else { - $os =~ s/$.+$//g; - } - $os =~ s/^\s+//; - $os =~ s/\s+$//; - if ( $speciesfile ne "" ) { - unless ( exists( $Species_names{ $os } ) ) { - $read = 0; - $ac = ""; - $de = ""; - $os = ""; - next; - } - $os = $Species_names{ $os }; - } - } - elsif ( $return_line =~ /^\/\// && $read == 1 ) { - $read = 0; - print OUT "$ac;$de;$os\n"; - $ac = ""; - $de = ""; - $os = ""; - $i++; - } -} - -close( IN ); - -print OUT "\n # $i entries.\n"; - -close( OUT ); - -exit( 0 ); - - - -# Reads in species file. -# Format: SWISS-PROT=full name (e.g. "BACSU=Bacillus subtilis") -# Lines beginning with "#" are ignored. -# One argument: species file-name -# Last modified: 04/24/01 -sub readSpeciesNamesFile { - my $infile = $_[ 0 ]; - my $return_line = ""; - my $sp = ""; - my $full = ""; - - unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { - die "\n$0: readSpeciesNamesFile: <<$infile>> does not exist, is empty, or is not a plain textfile.\n"; - } - - open( IN_RSNF, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n"; - while ( $return_line = ) { - if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)=(.+)/ ) { - $sp = $1; - $full = $2; - $full =~ s/^\s+//; - $full =~ s/\s+$//; - $Species_names{ $full } = $sp; - } - } - close( IN_RSNF ); - - return; -} - - - -sub errorInCommandLine { - print "\n"; - print " extractTrembl.pl $VERSION\n"; - print " ----------------\n"; - print "\n"; - print " Christian Zmasek (zmasek\@genetics.wustl.edu)\n"; - print "\n"; - print " Purpose. Extracts AC, DE, and OS from a \"trembl.dat\" file.\n"; - print " The resulting output is used by \"rio.pl\".\n"; - print " If a species list (format: SWISS-PROT-code=full name) is supplied:\n"; - print " only sequences from species found in this list are written to\n"; - print " outfile and their full species names replaced with their SWISS-PROT\n"; - print " code (recommended).\n"; - print "\n"; - print " Usage. extractTrembl.pl [species list]\n"; - print "\n"; - print " Remark. Need to re-run this if species in species tree or species list\n"; - print " are added/changed or if a new version of Pfam is used!!\n"; - print "\n\n"; - exit( -1 ); -} diff --git a/forester/archive/perl/file_proc.pl b/forester/archive/perl/file_proc.pl deleted file mode 100755 index 0122778..0000000 --- a/forester/archive/perl/file_proc.pl +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/perl -w - -my $in = $ARGV[ 0 ]; -my $out = $ARGV[ 1 ]; - -if ( -e $out ) { - print "File $out already exists.\n"; - exit( -1 ); -} - -if ( !-e $in ) { - print "File $in does not exist.\n"; - exit( -1 ); -} - -open( IN, $in ) ; -open ( OUT, ">>$out" ) || die ( "Could not open file $out for writing!\n" ); - -while ( my $line = ) { - my $newline = &proc_line( $line ); - if ( length( $newline ) > 0 ) { - print OUT $newline; - } -} - - -close( OUT ) or die( "can't close $out: $!" ); -close( IN ) or die( "can't close $in: $!" ); - -sub proc_line { - my $line = shift; - - - if ( $line =~ /^#/ ) { - return ""; - } - if ( $line =~ /^Predicted coding sequence$s$:/ ) { - return ""; - } - elsif ( $line =~ /^>.*_aa\s*$/ ) { - return ""; - } - elsif ( $line =~ /^>/ ) { - return $line; - } - elsif ( $line !~ /[a-z]/ ) { - return ""; - } - else { - return $line;; - } -} diff --git a/forester/archive/perl/gs_aa_extract.pl b/forester/archive/perl/gs_aa_extract.pl deleted file mode 100755 index d485920..0000000 --- a/forester/archive/perl/gs_aa_extract.pl +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/perl -W - -# $Id: gs_aa_extract.pl,v 1.2 2008/03/09 00:11:50 cmzmasek Exp $ - -# This extracts the AA sequences from GENSCAN output files -# Copyright (C) 2008-2009 Christian M. Zmasek -# All rights reserved -# Created 2007-07-28 in Winterthur, Switzerland by CMZ - -# Usage: gs_aa_extract.pl - -use strict; - -if ( scalar( @ARGV ) != 2 ) { - print "\ngs_aa_extract.pl \n\n"; - exit( -1 ); -} - -my $infile = $ARGV[ 0 ]; -my $outfile = $ARGV[ 1 ]; - -if ( -e $outfile) { - die "\n$0: \"$outfile\" already exists.\n\n"; -} -unless( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { - die "\n$0: cannot read from \"$infile\".\n\n"; -} - -open( IN, "$infile" ) || die "\n$0: Cannot open file \"$infile\": $!\n"; -open( OUT, ">$outfile" ) || die "\n$0: Cannot create file \"$outfile\": $!\n"; - -my $line = ""; -my $desc = ""; - -while ( $line = ) { - if ( $line =~ /^>/ ) { - $desc = $line; - } - elsif ( $line =~ /^[A-Z]+$/ ) { - if ( length( $desc ) > 0 ) { - print OUT $desc; - $desc = ""; - } - print OUT $line; - } -} - -close( OUT ); - -print( "\nOK\n" ); - -exit( 0 ); - diff --git a/forester/archive/perl/makeTree.pl b/forester/archive/perl/makeTree.pl deleted file mode 100755 index 45c4658..0000000 --- a/forester/archive/perl/makeTree.pl +++ /dev/null @@ -1,1211 +0,0 @@ -#!/usr/bin/perl -W - -# makeTree.pl -# ----------- -# Copyright (C) 1999-2003 Washington University School of Medicine -# and Howard Hughes Medical Institute -# All rights reserved -# -# Author: Christian M. Zmasek -# zmasek@genetics.wustl.edu -# http://www.genetics.wustl.edu/eddy/people/zmasek/ -# -# Last modified 04/06/04 -# -# -# Requirements makeTree is part of the RIO/FORESTER suite of programs. -# ------------ Many of its global variables are set via rio_module.pm. -# -# -# Note. Use xt.pl (for Pfam alignments) or mt.pl (for other alignments) -# to run makeTree.pl on whole directories of alignments files. -# -# -# -# Usage -# ----- -# -# Tree calculation based on a Pfam/Clustal W alignment -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# makeTree.pl [-options] -# [path/name for temporary directory to be created] -# -# Example: -# "% makeTree.pl -UTB1000S41NDXV /DB/PFAM/Full/IL5 IL5_tree" -# -# -# Tree calculation based on precalculated pairwise distances -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Consensus tree will have no branch length values. -# Precalculated pairwise distances are the output of "pfam2pwd.pl", -# number of bootstraps needs to match the one used for the pwds. -# -# makeTree.pl <-options, includes "F"> [path/name for temporary directory -# to be created] -# -# Example: -# "% makeTree.pl -FB100S21XV /pfam2pwd_out/IL5.pwd IL5_tree" -# -# -# Tree calculation based on precalculated pairwise distances -# and matching alignment -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Consensus tree will have branch length values. -# Precalculated pairwise distances and the matching (processed) -# alignment are the output of "pfam2pwd.pl", number of bootstraps -# needs to match the one used for the pwds, matrix needs to match -# the one used for the pwds. -# -# makeTree.pl <-options, includes "UF"> -# [path/name for temporary directory to be created] -# -# Example: -# "% makeTree.pl -UFLB100S21XV /pfam2pwd_out/IL5.pwd /pfam2pwd_out/IL5.aln IL5_tree" -# -# -# Options -# ------- -# -# N : Suggestion to remove columns in the alignment which contain gaps. -# Gaps are not removed, if, after removal of gaps, the resulting alignment would -# be shorter than $MIN_NUMBER_OF_AA. Default is not to remove gaps. -# Bx : Number of bootstrapps. B0: do not bootstrap. Default is 100 bootstrapps. -# The number of bootstrapps should be divisible by 10. -# U : Use TREE-PUZZLE to calculate ML branchlengths for consesus tree, in case of -# bootstrapped analysis. -# J : Use JTT matrix (Jones et al. 1992) in TREE-PUZZLE, default: PAM. -# L : Use BLOSUM 62 matrix (Henikoff-Henikoff 92) in TREE-PUZZLE, default: PAM. -# M : Use mtREV24 matrix (Adachi-Hasegawa 1996) inTREE-PUZZLE, default: PAM. -# W : Use WAG matrix (Whelan-Goldman 2000) in TREE-PUZZLE, default: PAM. -# T : Use VT matrix (Mueller-Vingron 2000) in TREE-PUZZLE, default: PAM. -# P : Let TREE-PUZZLE choose which matrix to use, default: PAM. -# E : Exact parameter estimates in TREE-PUZZLE, default: Approximate. -# Model of rate heterogeneity in TREE-PUZZLE (default: Uniform rate) -# g : 8 Gamma distributed rates -# t : Two rates (1 invariable + 1 variable) -# m : Mixed (1 invariable + 8 Gamma rates) -# R : Randomize input order in PHYLIP NEIGHBOR. -# A : Use PHYLIP PROTPARS instead of NEIGHBOR (and no pairwise distance calculation). -# jx : Number of jumbles when using PHYLIP PROTPARS (random seed set with Sx). -# Sx : Seed for random number generator(s). Must be 4n+1. Default is 9. -# X : To keep multiple tree file (=trees from bootstrap resampled alignments). -# D : To keep (and create in case of bootstrap analysis) pairwise distance matrix file. -# This is created form the not resampled (original) alignment. -# C : Calculate pairwise distances only (no tree). Bootstrap is always 1. -# No other files are generated. -# F : Pairwise distance (pwd) file as input (instead of alignment). -# No -D, -C, and -N options available in this case. -# V : Verbose. -# # : Only for rio.pl: Do not calculate consensus tree ("I" option in rio.pl). -# -# -# -# History: -# ------- -# -# 09/06/03: Added "#" option (to be used only for rio.pl). -# 03/24/04: Do not replace "?" with "-" in method pfam2phylip. - - -use strict; - -use FindBin; -use lib $FindBin::Bin; -use rio_module2; - -my $VERSION = "4.210"; - -my $TEMP_DIR_DEFAULT = "/tmp/maketree"; # Where all the infiles, outfiles, etc will be created. - -my $remove_gaps = 0; # 0: do not remove gaps; 1: remove gaps -my $bootstraps = 100; # 0,1: do not bootstrap. Default: 100 -my $puzzle_consensus_tree = 0; # 0: no; 1: yes. No is default. -my $matrix = 1; # 0 = JTT - # 1 = PAM - default - # 2 = BLOSUM 62 - # 3 = mtREV24 - # 5 = VT - # 6 = WAG - # 7 = auto -my $rate_heterogeneity = 0; # 0 = Uniform rate (default) - # 1 = 8 Gamma distributed rates - # 2 = Two rates (1 invariable + 1 variable) - # 3 = Mixed (1 invariable + 8 Gamma rates) -my $randomize_input_order = 0; # 0: do not randomize input order; 1 jumble -my $seed = 9; # Seed for random number generators. Default: 9 -my $keep_multiple_trees = 0; # 0: delete multiple tree file - # 1: do not delete multiple tree file -my $keep_distance_matrix = 0; # 1: (create and) keep; 0: do not (create and) keep -my $verbose = 0; # 0: no; 1: yes -my $pairwise_dist_only = 0; # 0: no; 1: yes -my $start_with_pwd = 0; # 0: no; 1: yes -my $start_with_pwd_and_aln = 0; # 0: no; 1: yes -my $no_consenus_tree = 0; # 0: no; 1: yes -my $exact_parameter_est = 0; # 0: no; 1: yes -my $use_protpars = 0; # 0: no; 1: yes -my $protpars_jumbles = 0; - -my %seqnames = (); # number => seqname -my %numbers = (); # seqname => number -my $options = ""; -my $infile = ""; -my $pwdfile = ""; -my $outfile = ""; -my $outfilenhx = ""; -my $logfile = ""; -my $alignfile = ""; -my $multitreefile = ""; -my $distancefile = ""; -my $log = ""; -my $number_of_aa = 0; -my $orig_length = 0; -my $ii = 0; -my $temp_dir = ""; -my $current_dir = ""; -my @out = (); -my $number_of_seqs = 0; - - - -unless ( @ARGV == 2 || @ARGV == 3 || @ARGV == 4 || @ARGV == 5 ) { - &printUsage(); - exit ( -1 ); -} - - - -# Analyzes the options: -# --------------------- - -if ( $ARGV[ 0 ] =~ /^-.+/ ) { - - unless ( @ARGV > 2 ) { - &printUsage(); - exit ( -1 ); - } - $options = $ARGV[ 0 ]; - - if ( $options =~ /F/ && $options !~ /U/ ) { - if ( @ARGV != 3 && @ARGV != 4 ) { - &printUsage(); - exit ( -1 ); - - } - $start_with_pwd = 1; - $infile = ""; - $pwdfile = $ARGV[ 1 ]; - - $outfile = $ARGV[ 2 ]; - if ( @ARGV == 4 ) { - $temp_dir = $ARGV[ 3 ]; - } - - } - elsif ( $options =~ /F/ && $options =~ /U/ ) { - if ( @ARGV != 4 && @ARGV != 5 ) { - &printUsage(); - exit ( -1 ); - } - $start_with_pwd = 1; - $start_with_pwd_and_aln = 1; - $pwdfile = $ARGV[ 1 ]; - $infile = $ARGV[ 2 ]; - $outfile = $ARGV[ 3 ]; - if ( @ARGV == 5 ) { - $temp_dir = $ARGV[ 4 ]; - } - - } - else { - if ( @ARGV != 3 && @ARGV != 4 ) { - &printUsage(); - exit ( -1 ); - } - $infile = $ARGV[ 1 ]; - $outfile = $ARGV[ 2 ]; - if ( @ARGV == 4 ) { - $temp_dir = $ARGV[ 3 ]; - } - } - - if ( $options =~ /N/ && $start_with_pwd != 1 ) { - $remove_gaps = 1; # do remove gaps - } - if ( $options =~ /B(\d+)/ ) { - $bootstraps = $1; - if ( $bootstraps <= 1 ) { - $bootstraps = 0; - } - elsif ( $bootstraps <= 9 ) { - $bootstraps = 0; - print "\n\nMAKETREE: WARNING: Bootstrap number must be devisable by 10,\nno bootstrapping.\n\n"; - } - elsif ( $bootstraps % 10 != 0 ) { - $bootstraps = $bootstraps - $bootstraps % 10; # to ensure $bootstraps % 10 == 0 - print "\n\nMAKETREE: WARNING: Bootstrap number must be devisable by 10,\nhas been set to $bootstraps.\n\n"; - } - } - if ( $options =~ /A/ ) { - $use_protpars = 1 # PROTPARS - } - if ( $options =~ /j(\d+)/ ) { - $protpars_jumbles = $1; - if ( $protpars_jumbles < 0 ) { - $protpars_jumbles = 0; - } - } - if ( $options =~ /J/ ) { - $matrix = 0; # JTT - } - if ( $options =~ /L/ ) { - $matrix = 2; # Blossum - } - if ( $options =~ /M/ ) { - $matrix = 3; # mtREV24 - } - if ( $options =~ /T/ ) { - $matrix = 5; # VT - } - if ( $options =~ /W/ ) { - $matrix = 6; # WAG - } - if ( $options =~ /P/ ) { - $matrix = 7; # auto - } - if ( $options =~ /R/ ) { - $randomize_input_order = 1; - } - if ( $options =~ /S(\d+)/ ) { - $seed = $1; - } - if ( $options =~ /U/ ) { - $puzzle_consensus_tree = 1; - } - if ( $options =~ /X/ ) { - $keep_multiple_trees = 1; - } - if ( $options =~ /D/ && $start_with_pwd != 1 ) { - $keep_distance_matrix = 1; - } - if ( $options =~ /V/ ) { - $verbose = 1; - } - if ( $options =~ /C/ && $start_with_pwd != 1 ) { - $pairwise_dist_only = 1; - } - if ( $options =~ /E/ ) { - $exact_parameter_est = 1; - } - if ( $options =~ /g/ ) { - $rate_heterogeneity = 1; - } - if ( $options =~ /t/ ) { - $rate_heterogeneity = 2; - } - if ( $options =~ /m/ ) { - $rate_heterogeneity = 3; - } - if ( $options =~ /#/ ) { - $no_consenus_tree = 1; - } - if ( $protpars_jumbles > 0 && $use_protpars != 1 ) { - &printUsage(); - exit ( -1 ); - } - if ( $use_protpars == 1 ) { - if ( $randomize_input_order >= 1 - || $start_with_pwd == 1 - || $keep_distance_matrix == 1 - || $pairwise_dist_only == 1 ) { - &printUsage(); - exit ( -1 ); - } - if ( $bootstraps > 1 && $protpars_jumbles < 1 ) { - $protpars_jumbles = 1; - } - } - -} - -else { - unless ( @ARGV == 2 || @ARGV == 3 ) { - &printUsage(); - exit ( -1 ); - } - $infile = $ARGV[ 0 ]; - $outfile = $ARGV[ 1 ]; - if ( @ARGV == 3 ) { - $temp_dir = $ARGV[ 2 ]; - } -} - - - - -$current_dir = `pwd`; -$current_dir =~ s/\s//; - -if ( $outfile !~ /^\// ) { - # outfile is not absolute path. - $outfile = $current_dir."/".$outfile; -} - - - -if ( $pairwise_dist_only == 1 ) { - $bootstraps = 0; - $keep_multiple_trees = 0; - $puzzle_consensus_tree = 0; - $randomize_input_order = 0; - $start_with_pwd = 0; - $keep_distance_matrix = 1; -} - -if ( $bootstraps < 2 ) { - $no_consenus_tree = 0; -} - -# TREE-PUZZLE sets the option in this way: -# If two rates or mixed, exact parameter estimates are used. -if ( $rate_heterogeneity == 2 -|| $rate_heterogeneity == 3 ) { - $exact_parameter_est = 1 -} - -$logfile = $outfile.$LOG_FILE_SUFFIX; -$alignfile = $outfile.$ALIGN_FILE_SUFFIX; -$multitreefile = $outfile.$MULTIPLE_TREES_FILE_SUFFIX; -$distancefile = $outfile.$SUFFIX_PWD_NOT_BOOTS; - -if ( $outfile =~ /\.nhx$/i ) { - $outfilenhx = $outfile; - $logfile =~ s/\.nhx//i; - $alignfile =~ s/\.nhx//i; - $outfile =~ s/\.nhx//i; - $multitreefile =~ s/\.nhx//i; - $distancefile =~ s/\.nhx//i; -} -else { - $outfilenhx = $outfile.".nhx"; -} - -if ( -e $outfilenhx ) { - die "\n\nmakeTree: \"$outfilenhx\" already exists.\n\n"; -} -if ( $infile ne "" ) { - unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { - die "\n\nmakeTree: Input alignment file \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n"; - } -} -if ( $start_with_pwd == 1 ) { - unless ( ( -s $pwdfile ) && ( -f $pwdfile ) && ( -T $pwdfile ) ) { - die "\n\nmakeTree: Pairwise distance file \"$pwdfile\" does not exist, is empty, or is not a plain textfile.\n\n"; - } -} - - - -# Prints out the options: -# ----------------------- - - -$log = "\n$0 logfile:\n"; -$log = $log."Version: $VERSION\n\n"; - - -if ( $start_with_pwd == 1 ) { - $log = $log."Input pairwise distance file (bootstrapped): $pwdfile\n"; -} -if ( $infile ne "" ) { - $log = $log."Input alignment : $infile\n"; -} - -if ( $no_consenus_tree != 1 ) { - $log = $log."Output tree file : $outfilenhx\n"; -} - -if ( $keep_multiple_trees == 1 && $bootstraps >= 2 ) { - $log = $log."Output multiple trees file : $multitreefile\n"; -} -if ( $keep_distance_matrix ) { - $log = $log."Output pairwise distance file : $distancefile\n"; -} - -$log = $log."Bootstraps : $bootstraps\n"; - -if ( $start_with_pwd != 1 && $use_protpars != 1 ) { - $log = $log."Prgrm to calculate pairwise dist. : TREE-PUZZLE\n"; -} - -if ( $use_protpars == 1 ) { - $log = $log."Program to calculate tree : PHYLIP PROTPARS\n"; - $log = $log."Number of jumbles in PROTPARS : $protpars_jumbles\n"; -} -else { - $log = $log."Program to calculate tree : PHYLIP NEIGHBOR (NJ)\n"; -} - -if ( $puzzle_consensus_tree == 1 ) { - $log = $log."Prgrm to calculate ML branch lenghts: TREE-PUZZLE\n"; -} -if ( $puzzle_consensus_tree == 1 || $start_with_pwd != 1 ) { - $log = $log."Model : "; - if ( $matrix == 0 ) { - $log = $log."JTT (Jones et al. 1992)\n"; - } - elsif ( $matrix == 2 ) { - $log = $log."BLOSUM 62 (Henikoff-Henikoff 92)\n"; - } - elsif ( $matrix == 3 ) { - $log = $log."mtREV24 (Adachi-Hasegawa 1996)\n"; - } - elsif ( $matrix == 5 ) { - $log = $log."VT (Mueller-Vingron 2000)\n"; - } - elsif ( $matrix == 6 ) { - $log = $log."WAG (Whelan-Goldman 2000)\n"; - } - elsif ( $matrix == 7 ) { - $log = $log."auto\n"; - } - else { - $log = $log."PAM (Dayhoff et al. 1978)\n"; - } -} -$log = $log."Model of rate heterogeneity : "; -if ( $rate_heterogeneity == 1 ) { - $log = $log."8 Gamma distributed rates\n"; -} -elsif ( $rate_heterogeneity == 2 ) { - $log = $log."Two rates (1 invariable + 1 variable)\n"; -} -elsif ( $rate_heterogeneity == 3 ) { - $log = $log."Mixed (1 invariable + 8 Gamma rates)\n"; -} -else { - $log = $log."Uniform rate\n"; -} -if ( $randomize_input_order >= 1 ) { - $log = $log."Randomize input order in NEIGHBOR : yes\n"; -} -$log = $log."Seed for random number generators : $seed\n"; -if ( $exact_parameter_est == 1 ) { - $log = $log."Exact parameter estimates in TREE-PUZZLE\n"; -} - -$log = $log."Start time/date : ".`date`; - - - - -# That's where the mischief starts.... -# ------------------------------------ - -$ii = 0; - -my $time_st = time; - -if ( $temp_dir eq "" ) { - $temp_dir = $TEMP_DIR_DEFAULT; -} - -$temp_dir = $temp_dir.$time_st.$ii; - -while ( -e $temp_dir ) { - $ii++; - $temp_dir = $temp_dir.$time_st.$ii; -} - -mkdir( $temp_dir, 0700 ) -|| die "\n\n$0: Unexpected error: Could not create <<$temp_dir>>: $!\n\n"; - -unless ( ( -e $temp_dir ) && ( -d $temp_dir ) ) { - die "\n\n$0: Unexpected error: <<$temp_dir>> does not exist, or is not a directory.\n\n"; -} - - -if ( $start_with_pwd != 1 ) { - system( "cp", $infile, $temp_dir."/INFILE" ); - unless ( chmod ( 0600, $temp_dir."/INFILE" ) ) { - warn "\n\n$0: Could not chmod. $!\n\n"; - } - $infile = "INFILE"; -} - - -chdir ( $temp_dir ) -|| die "\n\n$0: Unexpected error: Could not chdir to <<$temp_dir>>: $!\n\n"; - - -if ( $start_with_pwd != 1 ) { - - @out = &DoPfam2phylip( $infile, $alignfile, $remove_gaps ); - $number_of_aa = $out[ 0 ]; - $orig_length = $out[ 1 ]; - $number_of_seqs = $out[ 2 ]; - - system( "cp", $alignfile, "infile" ); - - if ( $use_protpars != 1 ) { - # Calculating the pairwise distances (saved in file "infile"): "puzzle" - - system( "cp", $alignfile, "align" ); - - if ( $bootstraps > 1 ) { - - &executeSeqboot( $seed, $bootstraps ); - - if ( $keep_distance_matrix ) { - system( "mv", "outfile", "outfile___" ); - system( "cp", "align", "infile" ); - &executePuzzle( "infile", - $matrix, - $exact_parameter_est, - $rate_heterogeneity ); - system( "mv", "infile.dist", $distancefile ); - system( "mv", "outfile___", "outfile" ); - } - unlink( "infile" ); # Necessary, since "infile" is puzzle's default input. - system( "mv", "outfile", "IN" ); - - &executePuzzleBootstrapped( "IN", - $matrix, - $exact_parameter_est, - $rate_heterogeneity ); - - $pwdfile = "IN.dist"; - - } - else { - - &executePuzzle( "infile", - $matrix, - $exact_parameter_est, - $rate_heterogeneity ); - - if ( $keep_distance_matrix ) { - system( "cp outdist $distancefile" ); - } - $pwdfile = "infile.dist"; - } - - unlink( "infile.tree" ); - - if ( $pairwise_dist_only == 1 ) { - unlink( "infile", "align", "INFILE", "outdist", $alignfile ); - chdir( $current_dir ) - || die "\n\n$0: Unexpected error: Could not chdir to <<$current_dir>>: $!\n\n"; - - rmdir( $temp_dir ) - || die "\n\n$0: Unexpected error: Could not remove <<$temp_dir>>: $!\n\n"; - - print "\n\n$0 finished.\n\n"; - print "Output pairwise distance file written as: $distancefile\n\n"; - print "\n\nmakeTree successfully terminated.\n\n"; - exit( 0 ); - } - - } ## if ( $use_protpars != 1 ) - -} ## if ( $start_with_pwd != 1 ) - - -# Calculating the tree (saved in file "infile"): - -if ( $use_protpars != 1 ) { - unlink( "infile" ); - &executeNeighbor( $pwdfile, $bootstraps, $randomize_input_order, $seed, 1 ); -} -else { - if ( $bootstraps > 1 ) { - &executeSeqboot( $seed, $bootstraps ); - unlink( "infile" ); - system( "mv", "outfile", "infile" ); - } - &executeProtpars( "infile", $bootstraps, $protpars_jumbles, $seed ); -} - -unlink( "outfile" ); - -if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { - - system( "cp", "outtree", $multitreefile ); -} - - -system( "mv", "outtree", "intree" ); - -if ( $bootstraps > 1 ) { - if ( $no_consenus_tree != 1 ) { - - # Consense: - &executeConsense( "intree" ); - - if ( $puzzle_consensus_tree == 1 ) { - - system( "cp", "outtree", "treefile_consense" ); - system( "mv", "outtree", "intree" ); - - # Puzzle for ML branch lenghts: - # The alignment is read from infile by default. - # The tree is read from intree by default. - - if ( $start_with_pwd_and_aln == 1 ) { - &pfam2phylipMatchOnly( $infile, - "infile", - 1 ); - } - elsif ( $use_protpars != 1 ) { - system( "mv", "align", "infile" ); # align = original alignment in phylip interleaved. - } - - &executePuzzleToCalculateBranchLenghts( $matrix, - $exact_parameter_est, - $rate_heterogeneity ); - - unlink( "outfile", "outdist" ); - system( "mv", "outtree", "outree_puzzle" ); - - # Transfer - &executeTransfersBranchLenghts( "outree_puzzle", "treefile_consense", $outfilenhx ); - - } - else { - unlink( "outfile", "align" ); - system( "mv", "outtree", $outfilenhx ); - } - } - else { - unlink( "outfile", "align" ); - - } -} -else { - unlink( "align", "infile.dist" ); - if ( $start_with_pwd != 1 ) { - system( "mv intree $outfilenhx" ); - } - -} - - -unlink( "treefile_consense", "outtree", "outree_puzzle", - "infile", "intree", "align", "INFILE", "IN", "IN.dist", "outdist" ); - - -$log = $log."Finish time/date : ".`date`; - -if ( $start_with_pwd != 1 ) { - $log = $log."Removed gaps : "; - if ( $remove_gaps == 1 ) { - $log = $log."yes\n"; - } - else { - $log = $log."no\n"; - } - $log = $log."Columns in alignment used : $number_of_aa\n"; - $log = $log."Columns in original alignment : $orig_length\n"; - $log = $log."Number of sequences in alignment : $number_of_seqs\n"; -} - - -open( OUT, ">$logfile" ) || die "\n$0: Cannot create file <<$logfile>>: $!\n"; -print OUT $log; -close( OUT ); - - -chdir( $current_dir ) -|| die "\n\n$0:Unexpected error: Could not chdir to <<$current_dir>>: $!\n\n"; - - -rmdir( $temp_dir ) -|| die "\n\n$0:Unexpected error: Could not remove <<$temp_dir>>: $!\n\n"; - -if ( $verbose == 1 ) { - print "\n\n$0 finished.\n"; - if ( $no_consenus_tree != 1 ) { - print "Output tree written as : $outfilenhx\n"; - } - print "Log written as : $logfile\n"; - if ( $start_with_pwd != 1 ) { - print "Alignment written as : $alignfile\n"; - } - if ( $keep_multiple_trees == 1 && $bootstraps >= 2 ) { - print "Multiple trees written as : $multitreefile\n"; - } - if ( $keep_distance_matrix ) { - print "Distance matrix written as: $distancefile\n"; - } -} - - -exit( 0 ); - - - - - -# Methods: -# -------- - - - - -# Executes pfam2phylip. -# If resulting alignment is too short due to the removal -# of gaps, is does not remove gaps. -# Three arguments: -# 1. infile -# 2. outfile -# 3. remove gaps: 1 to remove gaps; 0: do not remove gaps -# Last modified: 06/04/01 -sub DoPfam2phylip { - my $in = $_[ 0 ]; - my $out = $_[ 1 ]; - my $option = $_[ 2 ]; - my $aa = 0; - my @output = (); - - if ( $option == 1 ) { - @output = &pfam2phylip( $in, $out, 1 ); - $aa = $output[ 0 ]; - if ( $aa < 0 ) { - die "\n\n$0: DoPfam2phylip: Unexpected error.\n\n"; - } - if ( $aa < $MIN_NUMBER_OF_AA ) { - unlink( $out ); - $option = 0; - $remove_gaps = 0; - } - } - if ( $option == 0 ) { # Must be another "if" (no elsif of else)! - @output = &pfam2phylip( $in, $out, 2 ); - # 2 is to substitute non-letters with "-" in the sequence. - $aa = $output[ 0 ]; - if ( $aa <= 0 ) { - die "\n\n$0: DoPfam2phylip: Unexpected error.\n\n"; - } - } - return @output; -} - - - -# Two arguments: -# 1. seed for random number generator -# 2. number of bootstraps -# Reads in "infile" by default. -sub executeSeqboot { - - my $s = $_[ 0 ]; - my $bs = $_[ 1 ]; - my $verb = ""; - - &testForTextFilePresence( $infile ); - - if ( $verbose != 1 ) { - $verb = " -2"; - } - - - system( "$SEQBOOT << ! -r -$bs$verb -Y -$s -!" ) - && die "$0: Could not execute \"$SEQBOOT\""; - - return; - -} - - - - -# One/two/three argument(s): -# Reads in tree from "intree" by default. (Presence of "intree" automatically -# switches into "User defined trees" mode.) -# 1. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; -# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise -# 2. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise -# 3. Model of rate heterogeneity: -# 1 for "8 Gamma distributed rates" -# 2 for "Two rates (1 invariable + 1 variable)" -# 3 for "Mixed (1 invariable + 8 Gamma rates)" -# otherwise: Uniform rate -# Last modified: 09/08/03 (added 2nd and 3rd parameter) -sub executePuzzleToCalculateBranchLenghts { - my $matrix_option = $_[ 0 ]; - my $parameter_estimates_option = $_[ 1 ]; - my $rate_heterogeneity_option = $_[ 2 ]; - my $i = 0; - my $mat = ""; - my $est = ""; - my $rate = ""; - - unless ( ( -s "infile" ) && ( -f "infile" ) && ( -T "infile" ) ) { - die "\n$0: executePuzzleToCalculateBranchLenghts: <> does not exist, is empty, or is not a plain textfile.\n"; - } - unless ( ( -s "intree" ) && ( -f "intree" ) && ( -T "intree" ) ) { - die "\n$0: executePuzzleToCalculateBranchLenghts: <> does not exist, is empty, or is not a plain textfile.\n"; - } - - $mat = setModelForPuzzle( $matrix_option ); - if ( $parameter_estimates_option ) { - $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option ); - } - if ( $rate_heterogeneity_option ) { - $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option ); - } - - system( "$PUZZLE << ! -$mat$est$rate -x -y -!" ) - && die "$0: Could not execute \"$PUZZLE\""; - - return; - -} - - - - - - - -# Three/four arguments: -# 1. Name of file containing tree with correct branch lengths -# 2. Name of file containing tree with correct bootstraps -# 3. Outputfilename -# 4. R to reroot both trees in the same manner (use for FITCH, -# since this changes to rooting. -sub executeTransfersBranchLenghts { - my $tree_with_bl = $_[ 0 ]; - my $tree_with_bs = $_[ 1 ]; - my $out = $_[ 2 ]; - my $reroot = $_[ 3 ]; - my $R = ""; - - if ( $reroot && $reroot eq "R" ) { - $R = "R"; - } - - &testForTextFilePresence( $tree_with_bl ); - &testForTextFilePresence( $tree_with_bs ); - - system( "$TRANSFERSBRANCHLENGHTS $tree_with_bl $tree_with_bs $out $R" ) - && die "$0: Could not execute \"$TRANSFERSBRANCHLENGHTS $tree_with_bl $tree_with_bs $out $R\""; - - - return; -} - - - -# Called by method DoPfam2phylip. -# This reads a multiple sequence alignment file in Pfam format, -# Phylip's sequential format, or ClustalW (".aln")output and saves them -# in Phylip's sequential or interleaved format. -# (Those two are the same in this case, since all the seqs will be -# one line in length (no returns)). -# It returns (1st) the number of aa (columns) in the resulting -# alignment and the (2nd) number of aa (columns) in the original -# alignment. -# -# Reads a file containing a sequence alignment in the following format -# (as used in Pfam): -# #comments <- empty lines and lines begining with # (not mandatory) -# name1 kal -# name2 kal -# <- at least one empty line between blocks -# name1 kale -# name2 k.le -# -# Saves it in the "sequential" format of phylip: -# number of OTUs length of aa seqs -# name1 kalkale -# name2 kalk-le -# -# Three arguments: -# 1. infile name -# 2. outfile name -# 3. 1 : Removes colums with a gap (non-letter character) -# 2 : Substitutes non-letter characters (except "?") in the sequence with "-". -# -# Last modified: 03/24/04 -# Changes: -# 03/24/04: Do not replace "?" with "-" -# -sub pfam2phylip { - - my $infile = $_[ 0 ]; - my $outfile = $_[ 1 ]; - my $options = $_[ 2 ]; # 1: remove gaps; 2: non-letters (except "?") -> "-" - my $return_line = ""; - my $x = 0; - my $y = 0; - my $x_offset = 0; - my $original_length = 0; - my @seq_name = (); - my @seq_array = (); - my $seq = ""; - my $max_x = 0; - my $max_y = 0; - my $m = 0; - my $n = 0; - my $i = 0; - my $move = 0; - my $saw_a_sequence_line = 0; - - if ( -e $outfile ) { - die "\n$0: pfam2phylip: <<$outfile>> already exists.\n"; - } - - &testForTextFilePresence( $infile ); - - open( INPP, "$infile" ) || die "\n$0: pfam2phylip: Cannot open file <<$infile>>: $!\n"; - - until ( $return_line !~ /^\s*\S+\s+\S+/ && $saw_a_sequence_line == 1 ) { - if ( $return_line =~ /^\s*\S+\s+\S+/ - && $return_line !~ /^\s*#/ - && $return_line !~ /^\s*\d+\s+\d+/ - && $return_line !~ /^\s*CLUSTAL/ ) { - $saw_a_sequence_line = 1; - $return_line =~ /^\s*(\S+)\s+(\S+)/; - $seq_name[ $y ] = $1; - $seq = $2; - $seq_name[ $y ] = substr( $seq_name[ $y ], 0, $LENGTH_OF_NAME ); - - for ( $x = 0; $x <= length( $seq ) - 1; $x++ ) { - $seq_array[ $x ][ $y ] = substr( $seq, $x, 1 ); - } - if ( $x_offset < length( $seq ) ) { - $x_offset = length( $seq ); - } - $y++; - } - $return_line = ; - if ( !$return_line ) { - last; - } - } - - $max_y = $y; - $y = 0; - $max_x = 0; - - while ( $return_line = ) { - if ( $return_line =~ /^\s*(\S+)\s+(\S+)/ - && $return_line !~ /^\s*#/ - && $return_line !~ /^\s*\d+\s+\d+/ ) { - $return_line =~ /^\s*\S+\s+(\S+)/; - $seq = $1; - for ( $x = 0; $x <= length( $seq ) - 1; $x++ ) { - $seq_array[ $x + $x_offset ][ $y % $max_y ] = substr( $seq, $x, 1 ); - } - if ( $max_x < length( $seq ) ) { - $max_x = length( $seq ); - } - $y++; - if ( ( $y % $max_y ) == 0 ) { - $y = 0; - $x_offset = $x_offset + $max_x; - $max_x = 0; - } - } - } - $original_length = $x_offset; - close( INPP ); - - - # Removes "gap-columns" (gaps = everything except a-z characters): - if ( $options == 1 ) { - $move = 0; - - COLUMN: for ( $x = 0; $x <= $x_offset - 1; $x++ ) { # goes through all aa positions (columns) - - for ( $y = 0; $y <= $max_y - 1; $y++ ) { # goes through all aas in a particular position - - unless ( $seq_array[ $x ][ $y ] && $seq_array[ $x ][ $y ] =~ /[a-z]/i ) { - $move++; - next COLUMN; - } - } - - # If this point is reached, column must be OK = no gaps. - if ( $move >= 1 ) { - for ( $m = 0; $m <= $max_y; $m++ ) { - for ( $n = $x; $n <= $x_offset; $n++ ) { - $seq_array[ $n - $move ][ $m ] = $seq_array[ $n ][ $m ]; - } - } - $x_offset = $x_offset - $move; - $x = $x - $move; - $move = 0; - } - } - if ( $move >= 1 ) { - for ( $m = 0; $m <= $max_y; $m++ ) { - for ( $n = $x; $n <= $x_offset; $n++ ) { - $seq_array[ $n - $move ][ $m ] = $seq_array[ $n ][ $m ]; - } - } - $x_offset = $x_offset - $move; - $x = $x - $move; - $move = 0; - } - } - - - # Writes the file: - - open( OUTPP, ">$outfile" ) || die "\n$0: pfam2phylip: Cannot create file <<$outfile>>: $!\n"; - print OUTPP "$max_y $x_offset\n"; - for ( $y = 0; $y < $max_y; $y++ ) { - print OUTPP "$seq_name[ $y ]"; - for ( $i = 0; $i <= ( $LENGTH_OF_NAME - length( $seq_name[ $y ] ) - 1 ); $i++ ) { - print OUTPP " "; - } - for ( $x = 0; $x <= $x_offset - 1; $x++ ) { - if ( $options == 2 ) { - if ( $seq_array[ $x ][ $y ] ) { - $seq_array[ $x ][ $y ] =~s /[^a-zA-Z\?]/-/; - } - else { - $seq_array[ $x ][ $y ] = "-"; - } - } - print OUTPP "$seq_array[ $x ][ $y ]"; - } - print OUTPP "\n"; - } - close( OUTPP ); - - return $x_offset, $original_length, $max_y; - -} ## pfam2phylip - - - - -sub printUsage { - - print "\n"; - print " makeTree.pl version $VERSION\n"; - print " -----------\n"; - - print < - [path/name for temporary directory to be created] - - Example: - "% makeTree.pl -UTB1000S41NDXV /DB/PFAM/Full/IL5 IL5_tree" - - - Tree calculation based on precalculated pairwise distances - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Consensus tree will have no branch length values. - Precalculated pairwise distances are the output of "pfam2pwd.pl", - number of bootstraps needs to match the one used for the pwds. - - makeTree.pl <-options, includes "F"> [path/name for temporary directory - to be created] - - Example: - "% makeTree.pl -FB100S21XV /pfam2pwd_out/IL5.pwd IL5_tree" - - - Tree calculation based on precalculated pairwise distances - and matching alignment - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Consensus tree will have branch length values. - Precalculated pairwise distances and the matching (processed) - alignment are the output of "pfam2pwd.pl", number of bootstraps - needs to match the one used for the pwds, matrix needs to match - the one used for the pwds. - - makeTree.pl <-options, includes "UF"> - [path/name for temporary directory to be created] - - Example: - "% makeTree.pl -UFLB100S21XV /pfam2pwd_out/IL5.pwd /pfam2pwd_out/IL5.aln IL5_tree" - - - Options - ------- - - N : Suggestion to remove columns in the alignment which contain gaps. - Gaps are not removed, if, after removal of gaps, the resulting alignment would - be shorter than $MIN_NUMBER_OF_AA. Default is not to remove gaps. - Bx : Number of bootstrapps. B0: do not bootstrap. Default is 100 bootstrapps. - The number of bootstrapps should be divisible by 10. - U : Use TREE-PUZZLE to calculate ML branchlengths for consesus tree, in case of - bootstrapped analysis. - J : Use JTT matrix (Jones et al. 1992) in TREE-PUZZLE, default: PAM. - L : Use BLOSUM 62 matrix (Henikoff-Henikoff 92) in TREE-PUZZLE, default: PAM. - M : Use mtREV24 matrix (Adachi-Hasegawa 1996) inTREE-PUZZLE, default: PAM. - W : Use WAG matrix (Whelan-Goldman 2000) in TREE-PUZZLE, default: PAM. - T : Use VT matrix (Mueller-Vingron 2000) in TREE-PUZZLE, default: PAM. - P : Let TREE-PUZZLE choose which matrix to use, default: PAM. - E : Exact parameter estimates in TREE-PUZZLE, default: Approximate. - Model of rate heterogeneity in TREE-PUZZLE (default: Uniform rate) - g : 8 Gamma distributed rates - t : Two rates (1 invariable + 1 variable) - m : Mixed (1 invariable + 8 Gamma rates) - R : Randomize input order in PHYLIP NEIGHBOR. - A : Use PHYLIP PROTPARS instead of NEIGHBOR (and no pairwise distance calculation). - jx : Number of jumbles when using PHYLIP PROTPARS (random seed set with Sx). - Sx : Seed for random number generator(s). Must be 4n+1. Default is 9. - X : To keep multiple tree file (=trees from bootstrap resampled alignments). - D : To keep (and create in case of bootstrap analysis) pairwise distance matrix file. - This is created form the not resampled (original) alignment. - C : Calculate pairwise distances only (no tree). Bootstrap is always 1. - No other files are generated. - F : Pairwise distance (pwd) file as input (instead of alignment). - No -D, -C, and -N options available in this case. - V : Verbose. - # : Only for rio.pl: Do not calculate consensus tree ("I" option in rio.pl). - - -END - -} ## printUsage diff --git a/forester/archive/perl/mt.pl b/forester/archive/perl/mt.pl deleted file mode 100755 index dd04ca1..0000000 --- a/forester/archive/perl/mt.pl +++ /dev/null @@ -1,261 +0,0 @@ -#!/usr/bin/perl -W - -# mt.pl -# ----- -# -# Copyright (C) 2003 Christian M. Zmasek -# All rights reserved -# -# Author: Christian M. Zmasek -# zmasek@genetics.wustl.edu -# http://www.genetics.wustl.edu/eddy/people/zmasek/ -# -# Version: 1.000 -# Created on: 09/05/03 -# Last modified: 09/05/03 -# -# -# -# Calculates trees based on all alignments/files in a given directory using -# makeTree.pl. -# -# - -use strict; -use FindBin; -use lib $FindBin::Bin; -use rio_module2; - - -my $PREPROCESSING_COMMAND = ""; -my $PERFORM_PREPROCESSING = 0; - -my $POSTPROCESSING_COMMAND = "/nfs/dm3/homedir1/czmasek/RIO1.24/perl/extractSpecies.pl"; -my $PERFORM_POSTPROCESSING = 1; - - -my $MY_TEMP_DIR = $TEMP_DIR_DEFAULT; # $TEMP_DIR_DEFAULT is inherited - # from rio_module.pm - - - - -my $options = ""; # Options for makeTree.pl, see makeTree.pl. - - -my $suffix = ""; -my $use_suffixes = 0; -my $input_dir = ""; -my $output_dir = ""; - -my $i = 0; -my $filename = ""; -my @filenames = (); - - - - - - -# Analyzes the options: -# --------------------- - -unless ( @ARGV == 3 || @ARGV == 4 ) { - &printUsage(); -} - -$options = $ARGV[ 0 ]; -$input_dir = $ARGV[ 1 ]; -$output_dir = $ARGV[ 2 ]; - -if ( @ARGV == 3 ) { - $use_suffixes = 0; -} -elsif ( @ARGV == 4 ) { - $use_suffixes = 1; - $suffix = $ARGV[ 3 ]; -} - - -$input_dir = &addSlashAtEndIfNotPresent( $input_dir ); -$output_dir = &addSlashAtEndIfNotPresent( $output_dir ); -$MY_TEMP_DIR = &addSlashAtEndIfNotPresent( $MY_TEMP_DIR ); - - - - -# This adds a "-" before the options for makeTree: -# ------------------------------------------------ -unless ( $options =~ /^-/ ) { - $options = "-".$options; -} - - - - - -# This creates the temp file: -# -------------------------- - -my $time = time; -my $ii = 0; - -my $temp_file = $MY_TEMP_DIR."mt".$time.$ii; - -while ( -e $temp_file ) { - $ii++; - $temp_file = $MY_TEMP_DIR."mt".$time.$ii; -} - - - -opendir( DIR, $input_dir ) || error( "Cannot open directory \"$input_dir\": $!" ); - -$i = 0; - -while( defined( $filename = readdir( DIR ) ) ) { - if ( $filename =~ /^\.\.?$/ ) { - next; - } - if ( $use_suffixes == 1 && $filename !~ /$suffix$/ ) { - next; - } - - $filenames[ $i ] = $filename; - $i++; -} - -close( DIR ); - -$i = 0; - -FOREACH: foreach $filename ( @filenames ) { - - # If the corresponding tree seems to already exists, do next one. - if ( -e "$output_dir$filename.nhx" ) { - next FOREACH; - } - - print "\n\n\n\n"; - print "MT.PL\n"; - print "working on: $filename\n"; - - print "[tree calculation $i]\n"; - print "=====================================================================\n\n\n"; - - - unlink( "$output_dir$filename.aln", - "$output_dir$filename.log", - "$output_dir$filename.nbd" ); - - print( "MT.PL: executing:\n" ); - - my $inputfile = $input_dir.$filename; - - my $outputfilename = ""; - - if ( $use_suffixes == 1 ) { - $outputfilename = $output_dir . $filename; - $outputfilename =~ s/$suffix$//; - $outputfilename =~ s/\.$//; - $outputfilename .= ".nhx"; - } - else { - $outputfilename = $output_dir . $filename . ".nhx"; - } - - - - if ( $PERFORM_PREPROCESSING == 1 ) { - my $pre_command = "$PREPROCESSING_COMMAND"; - - print( "$pre_command\n" ); - system( $pre_command ) && &error( "Could not execute \"$pre_command\"" ); - } - - $MAKETREE = "/nfs/dm3/homedir1/czmasek/RIO1.24/perl/makeTree2.pl"; # <<<<<<<<<<<<<<<<<<<<<<<-------------------~~~~~~~~~~~~~~~~~~~~~~~ - - my $command = "$MAKETREE $options $inputfile $outputfilename"; - - print( "$command\n" ); - system( $command ) && &error( "Could not execute \"$command\"" ); - - - - if ( $PERFORM_POSTPROCESSING == 1 ) { - my $post_command = "$POSTPROCESSING_COMMAND $outputfilename"; - - print( "$post_command\n" ); - system( $post_command ) && &error( "Could not execute \"$post_command\"" ); - } - - - - $i++; - -} - - - -print( "\n\n\nMT.PL: Done!\n" ); - -exit( 0 ); - - - - - - -sub error{ - - my $text = $_[ 0 ]; - - print( "\nxt.pl: ERROR:\n" ); - print( "$text\n\n" ); - - exit( -1 ); - -} - - - - -sub printUsage { - print "\n"; - print " mt.pl\n"; - print " _____\n"; - print " \n"; - print " Copyright (C) 2003 Christian M. Zmasek\n"; - print " All rights reserved\n"; - print "\n"; - print " Author: Christian M. Zmasek\n"; - print " zmasek\@genetics.wustl.edu\n"; - print " http://www.genetics.wustl.edu/eddy/forester/\n"; - print "\n"; - print "\n"; - print " Purpose\n"; - print " -------\n"; - print "\n"; - print " Tree construction using makeTree.pl on all alignments/files\n"; - print " in a given directory.\n"; - print "\n"; - print "\n"; - print " Usage\n"; - print " -----\n"; - print "\n"; - print " mt.pl [suffix for alignments to be used in input directory]\n"; - print "\n"; - print " If a suffix is given, it will be removed for the output files.\n"; - print "\n"; - print "\n"; - print " Example\n"; - print " -------\n"; - print "\n"; - print " \"mt.pl NS21UTRB100DX alignments/ trees/ .aln\"\n"; - print "\n"; - print "\n"; - print "\n"; - exit( -1 ); - -} diff --git a/forester/archive/perl/multifetch.pl b/forester/archive/perl/multifetch.pl deleted file mode 100755 index 82205df..0000000 --- a/forester/archive/perl/multifetch.pl +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/perl - -# multifetch.pl [options] -# -# Fetch all the seqs on the list. The list is a file with one line -# per sequence; the first field is the key. -# -# Options: -# -d : domain fetch - list is in GDF format -# -n : include this many extra residues upstream (-d only) -# -c : include this many extra residues downstream (-d only) -# -f : fetch in FASTA instead of native format -# -g : use getseq from , not fetch from main databases. -# This always gives FASTA output. -# -D : specify a source database, same usage as getseq: -# -Dsw SwissProt -# -Dpir PIR -# -Dem EMBL -# -Dgb GenBank -# -Dwp WormPep -# -Dowl OWL - - -use FindBin; -use lib $FindBin::Bin; -use rio_module; -require "getopts.pl"; - - -&Getopts('c:n:dfg:D:'); -if ($opt_c) { $extra_c = $opt_c; } -if ($opt_n) { $extra_n = $opt_n; } -if ($opt_d) { $domains = 1; } -if ($opt_f) { $fmtarg = "-Ffasta";} else {$fmtarg = ""; } -if ($opt_g) { $filearg = "-d$opt_g ";} else {$filearg = ""; } -if ($opt_D) { $dbarg = "-D$opt_D "; } else {$dbarg = ""; } - - -while (<>) { - if ($domains) { - if (($name, $from, $to, $source) = /^\s*(\S+)\s+(\d+)\s+(\d+)\s+(\S+)/){ - if ($from < $to) { - $from -= $opt_n; - $to += $opt_c; - } - else { - $from += $opt_n; - $to -= $opt_c; - } - - system("$SFE $filearg $dbarg $fmtarg -r \"$name\" -f $from -t $to \"$source\"") - && die "\n\n$0: Unexpected error: Could not execute \"$SFE $filearg $dbarg $fmtarg -r \"$name\" -f $from -t $to \"$source\"\": $!"; - } - } else { - if (/^\s*(\S+)/) { - $key = $1; - - system("$SFE $filearg $dbarg $fmtarg \"$key\"") - && die "\n\n$0: Unexpected error: Could not execute \"$SFE $filearg $dbarg $fmtarg \"$key\"\": $!"; - } - } -} - -# 01/30/02 -# CZ -# Added usage of rio_module.pm, $SFE for sfetch. - -# Thu Apr 10 18:27:40 1997 -# - added -D option -# - simplified from six different getseq calls to two - diff --git a/forester/archive/perl/nph-riowebserver b/forester/archive/perl/nph-riowebserver deleted file mode 100755 index dee1123..0000000 --- a/forester/archive/perl/nph-riowebserver +++ /dev/null @@ -1,939 +0,0 @@ -#! /usr/bin/perl -W - -# nph-riowebserver.pl -# ------------------- -# -# Copyright (C) 2002 Washington University School of Medicine -# and Howard Hughes Medical Institute -# All rights reserved -# -# Created: 02/18/02 -# Author: Christian M. Zmasek -# zmasek@genetics.wustl.edu -# http://www.genetics.wustl.edu/eddy/people/zmasek/ -# -# Last modified: 02/20/02 - - -use strict; -use CGI; -use queue; - - -my $RIOPL = "/home/rio/forester/perl/rio4P.pl"; -my $JAVA = "/home/rio/j2sdk1.4.0/bin/java"; -my $TEST_NHX = $JAVA." -cp /home/rio/forester/java forester.tools.testNHX"; -my $TEMPDIR = "/home/rio/rws_temp"; -my $SPECIESTREE = "/home/rio/forester/data/species/tree_of_life_bin_1-4.nhx"; -my $SPECIESLIST = "/home/rio/forester/data/species/tree_of_life_bin_1-4_species_list"; -my $hmm_search_url_A = "http://pfam.wustl.edu/cgi-bin/nph-hmmsearch?protseq="; -my $hmm_search_url_B = "&search_mode=merge&cutoff_strategy=ga"; - -my $RIO_ALN_DIRECTORY = "/data/rio/ALNs/"; -my $RIO_NBD_DIRECTORY = "/data/rio/NBDs/"; -my $ALIGN_FILE_SUFFIX = ".aln"; -my $ALIGN_NBD_FILE = ".nbd"; -my $DIR_FOR_TREES = "/var/www/html/trees/"; # Directory for NHX files to be read by ATV applet -my $URL_FOR_TREES = "http://forester.wustl.edu/trees/"; # URL base for NHX files to be read by ATV applet -my $CODE_BASE_FOR_ATV_APPLET = "http://forester.wustl.edu/applets/"; # URL for ATV applet (jar file) -my $TARGET_FILES_IN_DIR_FOR_TREES = 100; # If the number of nhx files in $DIR_FOR_TREES is lager then $MAX_FILES_IN_DIR_FOR_TREES -my $MAX_FILES_IN_DIR_FOR_TREES = 120; # the oldest files will be deleted until the number is down to $TARGET_FILES_IN_DIR_FOR_TREES. -my $O_THRESHOLD_DEFAULT = 0; -my $SN_THRESHOLD_DEFAULT = 0; -my $U_THRESHOLD_DEFAULT = 50; -my $SEED_FOR_RANDOM_DEFAULT = 41; -my $SORT_DEFAULT = 12; -my $MIN_SIZE = 5; # Minimal size (in chars) for input files -my $MAX_SIZE = 10000; # Maximal size (in chars) for input files -my $MAX_LINES = 1000; # Maximal lines for input files -my $RIO_OPTIONS = "U=60 Y=2 X=2 Z=2 I C E x +"; -my $CONTACT = "zmasek\@genetics.wustl.edu"; -my $VERSION = "0.3"; - - -my $o_threshold = 0; -my $sn_threshold = 0; -my $u_threshold = 0; -my $seed_for_random = 0; -my $sort = 0; -my $size_d = 0; -my $size_c = 0; -my $entry_time = 0; -my $njobs = 0; -my $njobs_thisuser = 0; -my $user_defined_tree = 0; - - - -my $query = ""; -my $query_seq = ""; -my $query_seq_file = ""; -my $tree_file = ""; -my $pfam_domain = ""; -my $species = ""; -my $output_tree = ""; -my $output_up = ""; -my $remote_addr = ""; -my $oneline = ""; -my $aln = ""; -my $speciestree = ""; -my $output = ""; -my $query_sequence = ""; # To be submitted to hmmsearch website, if necessary. -my $link_to_hmmsearch = ""; - -my @lines = (); -my %Species_names_hash = (); - - -$| = 1; - -$query = new CGI; - - -$query_seq = $query->param( 'query_seq' ); -$query_seq_file = $query->upload( 'query_seq_file' ); -$pfam_domain = $query->param( 'pfam_domain' ); -$species = $query->param( 'species' ); -$o_threshold = $query->param( 'o_threshold' ); -$sn_threshold = $query->param( 'sn_threshold' ); -$u_threshold = $query->param( 'u_threshold' ); -$seed_for_random = $query->param( 'seed_for_random' ); -$output_up = $query->param( 'output_up' ); -$sort = $query->param( 'sort_priority' ); -$tree_file = $query->upload( 'tree_file' ); - -$remote_addr = $ENV{ REMOTE_ADDR }; - - -# NPH header -# ---------- -print $query->header( -status=>"200 OK", - -server=>"$ENV{ SERVER_SOFTWARE }", - -nph=>1 ); - - - - -# Prints the first HTML -# --------------------- -print "\n"; -print "\n"; -print "\n"; -print "[ RIO SERVER | phylogenomic analysis of a protein sequence ]\n"; -print "\n"; -print "\n"; - -&print_ATV_JavaScript(); - -print "\n"; -print "\n"; - -&print_navbar(); - - - -# Reads in, cleans up and checks -# ------------------------------ - -if ( ( !defined( $query_seq_file ) && !defined( $query_seq ) ) -|| ( $query_seq_file !~ /\w+/ && $query_seq !~ /\w+/ ) ) { - &nph_user_error( "need to specify a sequence file or submit a sequence directly" ); -} - -if ( $query_seq_file =~ /\w+/ && $query_seq =~ /\w+/ ) { - &nph_user_error( "cannot specify a sequence file and submit a sequence directly" ); -} - - -if ( $query_seq_file =~ /\w+/ ) { - # Reading in from file - &readInFile( $query_seq_file ); -} -else { - # "cut and paste" - @lines = split( /^/, $query_seq ); -} - - -if ( $lines[ 0 ] =~ /^\s*>/ ) { # FASTA - shift( @lines ); -} - - -foreach $oneline ( @lines ) { - $size_d += length( $oneline ); - if ( $size_d > $MAX_SIZE ) { - &nph_user_error( "query sequence is too long (>$MAX_SIZE)" ); - } - $oneline =~ s/[^A-Za-z]//g; - $size_c += length( $oneline ); -} -if ( $size_c < $MIN_SIZE ) { - &nph_user_error( "query sequence is too short (<$MIN_SIZE)" ); -} - - -# Writes a temp file for the query sequence -open( PROT, ">$TEMPDIR/$$.query" ) || &nph_fatal_error( "failed to open temp query file" ); -foreach $oneline ( @lines ) { - print PROT $oneline; - $query_sequence .= $oneline; -} -close( PROT ); - -if ( !defined( $species ) || $species !~ /\w+/ || length( $species ) < 2 ) { - &nph_user_error( "need to specify the species of the query sequence" ); -} - -$link_to_hmmsearch = " >> click here to perform hmmsearch on query sequence << "; - -if ( !defined( $pfam_domain ) || $pfam_domain !~ /\w+/ || length( $pfam_domain ) < 1 ) { - &nph_user_error( "need to specify a name for a pfam domain of the query sequence
$link_to_hmmsearch" ); -} - -if ( length( $species ) > 5 ) { - &nph_user_error( "invalid species name" ); -} -$species =~ s/[^A-Za-z0-9]//g; -if ( length( $species ) < 2 ) { - &nph_user_error( "invalid species name" ); -} - -if ( length( $pfam_domain ) > 40 ) { - &nph_user_error( "invalid pfam domain name
$link_to_hmmsearch" ); -} -$pfam_domain =~ s/[\s,;\.><\|\\\/!@\#\$%&\*\^=]//g; -if ( length( $pfam_domain ) < 1 ) { - &nph_user_error( "invalid pfam domain name
$link_to_hmmsearch" ); -} - -if ( defined( $tree_file ) && $tree_file =~ /\w+/ ) { - $user_defined_tree = 1; -} - -$species =~ tr/a-z/A-Z/; - -if ( $user_defined_tree != 1 ) { - &checkForPresenceOfSpecies( $species ); -} - -$aln = $RIO_ALN_DIRECTORY.$pfam_domain.$ALIGN_FILE_SUFFIX; - -if ( &checkForTextFilePresence( $aln ) != 1 ) { - &nph_user_error( "no pairwise distances precalculated for pfam domain \"$pfam_domain\"
$link_to_hmmsearch" ); -} - - -if ( checkForNumberBetween0and100( $o_threshold ) != 1 ) { - $o_threshold = $O_THRESHOLD_DEFAULT; -} -if ( checkForNumberBetween0and100( $sn_threshold ) != 1 ) { - $sn_threshold = $SN_THRESHOLD_DEFAULT; -} -if ( checkForNumberBetween0and100( $u_threshold ) != 1 ) { - $u_threshold = $U_THRESHOLD_DEFAULT; -} -if ( !defined( $seed_for_random ) || $seed_for_random !~ /\d/ -|| $seed_for_random =~ /\D/ || $seed_for_random > 10000 || $seed_for_random < 0 ) { - $seed_for_random = $SEED_FOR_RANDOM_DEFAULT; -} -if ( !defined( $sort ) -|| $sort > 16 || $sort < 12 ) { - $sort = $SORT_DEFAULT; -} - -if ( defined( $output_up ) && $output_up eq "yes" ) { - $RIO_OPTIONS .= " p"; -} -else { - $u_threshold = -1; -} - - - - - - - - -# User defined species tree is dealt with here -# -------------------------------------------- - -if ( $user_defined_tree == 1 ) { - &readInFile( $tree_file ); - $size_d = 0; - $size_c = 0; - foreach $oneline ( @lines ) { - $size_d += length( $oneline ); - if ( $size_d > $MAX_SIZE ) { - &nph_user_error( "user defined species tree file is too long (>$MAX_SIZE)" ); - } - $oneline =~ s/;\|,<>\s//g; - $oneline =~ tr/a-z/A-Z/; - - $size_c += length( $oneline ); - } - if ( $size_c < $MIN_SIZE ) { - &nph_user_error( "user defined species tree file is too short (<$MIN_SIZE)" ); - } - - open( TREE, ">$TEMPDIR/$$.tree" ) || &nph_fatal_error( "failed to open temp species tree file" ); - foreach $oneline ( @lines ) { - print TREE $oneline; - } - close( TREE ); - - $speciestree = "$TEMPDIR/$$.tree"; - system( "$TEST_NHX $speciestree" ) - && nph_user_error( "user defined species tree is not in proper NHX format (or is unrooted, or contains multifurcations) $!" ); - -} -else { - $speciestree = $SPECIESTREE; -} - - - -# Join the queue, using queue.pm API -# ---------------------------------- - -$entry_time = time; - -( $njobs, $njobs_thisuser ) = &queue::CheckQueue( "rioqueue", $remote_addr, $TEMPDIR ); -if ( $njobs > 5 ) { - &nph_user_error("The server is currently swamped, with $njobs searches in the queue.
Please come back later! Sorry."); -} -if ( $njobs_thisuser > 0 ) { - &nph_user_error( "We already have $njobs_thisuser searches in the queue from - your IP address ($remote_addr). Please wait until some or all of them - finish.
If you think you got this message in error, wait a minute or so and - resubmit your job. You probably hit your browser's stop button after you - started a search - but that doesn't stop our compute cluster, it just breaks - your connection to us. You won't be able to start a new search until the - cluster's done." ); -} -if ( $njobs > 0 ) { - print_waiting_message( $njobs ); -} -&queue::WaitInQueue( "rioqueue", $remote_addr, $TEMPDIR, $$, 10 ); # wait with ten-second granularity. - - - - -# Prints "waiting" header -# ----------------------- - -my $number_of_seqs = &getNumberOfSeqsFromNBDfile( $RIO_NBD_DIRECTORY.$pfam_domain.$ALIGN_NBD_FILE ); -my $estimated_time = &estimateTime( $number_of_seqs ); - -print( "

RIO: Starting search. Estimated time: $estimated_time seconds per domain (assuming all rio nodes are running). Please wait...

\n" ); - - - - -# Runs RIO -# -------- - -&run_rio( $pfam_domain, # domain - "$TEMPDIR/$$.query", # query file name - "$TEMPDIR/$$.outfile", # output file name - "QUERY_$species", # name for query - $speciestree, # species tree - $RIO_OPTIONS, # more options - "$TEMPDIR/$$", # temp file - $o_threshold, - $sn_threshold, - $u_threshold, - $seed_for_random, - $sort ); - - - -# Done -# ---- - -&showATVlinks(); - - - -# Cleanup -unlink( "$TEMPDIR/$$.query", "$TEMPDIR/$$.tree" ); - -$output .= "

\n"; - -# Leaves the queue in an orderly fashion. -&queue::RemoveFromQueue( "rioqueue", $remote_addr, $TEMPDIR, $$ ); - -print( $output ); - -&print_footer(); - -&removeFiles( $DIR_FOR_TREES, $TARGET_FILES_IN_DIR_FOR_TREES, $MAX_FILES_IN_DIR_FOR_TREES ); - -exit( 0 ); - - - - - - - - -# Methods -# ------- - - - -# Last modified: 02/19/02 -sub run_rio { - - my $pfam_name = $_[ 0 ]; - my $query_file = $_[ 1 ]; - my $output_file = $_[ 2 ]; - my $name_for_query = $_[ 3 ]; - my $species_tree_file = $_[ 4 ]; - my $more_options = $_[ 5 ]; - my $tmp_file_rio = $_[ 6 ]; - my $t_o = $_[ 7 ]; - my $t_sn = $_[ 8 ]; - my $t_u = $_[ 9 ]; - my $seed = $_[ 10 ]; - my $sort = $_[ 11 ]; - - my $start_time = time; - - my $options_for_rio = ""; - - $options_for_rio .= ( " A=".$pfam_name ); - $options_for_rio .= ( " Q=".$query_file ); - $options_for_rio .= ( " O=".$output_file ); - $options_for_rio .= ( " N=".$name_for_query ); - $options_for_rio .= ( " S=".$species_tree_file ); - $options_for_rio .= ( " j=".$tmp_file_rio ); - $options_for_rio .= ( " L=".$t_o ); - $options_for_rio .= ( " B=".$t_sn ); - if ( $t_u != -1 ) { - $options_for_rio .= ( " v=".$t_u ); - } - $options_for_rio .= ( " y=".$seed ); - $options_for_rio .= ( " P=".$sort ); - $options_for_rio .= ( " ".$more_options ); - - $output = `$RIOPL 1 $options_for_rio`; - - if ( $? != 0 ) { - nph_rio_error(); - } - my $finish_time = time; - my $wait_time = $finish_time - $entry_time; - my $cpu_time = $finish_time - $start_time; - - - - # Logs the results. - my $date = `date`; - chop( $date ); - open ( LOGFILE, ">>$TEMPDIR/log") || &nph_fatal_error( "could not open log file" ); - flock( LOGFILE, 2 ); - print LOGFILE "$date queue: $njobs wait: $wait_time true_cpu: $cpu_time pid: $$ addr: $ENV{'REMOTE_ADDR'} host: $ENV{'REMOTE_HOST'} pfam: $pfam_name\n"; - flock( LOGFILE, 8 ); - close ( LOGFILE ); - - return; - -} ## run_rio - - - - -# Reads a file into "@lines" -# Last modified: 02/19/02 -sub readInFile { - my $file = $_[ 0 ]; - my $l = 0; - my $s = 0; - @lines = (); - - $file =~ s/;\|,<>&\s//g; - - while( <$file> ) { - $s += length( $_ ); - if ( $s > $MAX_SIZE ) { - &nph_user_error( "query sequence is too long (>$MAX_SIZE)" ); - } - $l++; - if ( $l > $MAX_LINES ) { - &nph_user_error( "file has too many lines (>$MAX_LINES)" ); - } - - push( @lines, $_ ); - - } - -} ## readInFile - - - - -# Reads in (SWISS-PROT) species names from a file. -# Names must be separated by newlines. -# Lines beginning with "#" are ignored. -# A possible "=" and everything after is ignored. -# One argument: species-names-file name -# Last modified: 02/19/02 -sub readSpeciesNamesFile { - my $infile = $_[ 0 ]; - my $return_line = ""; - my $species = ""; - - open( IN_RSNF, "$infile" ) || &nph_fatal_error( "could not open species list" ); - while ( $return_line = ) { - if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)/ ) { - $species = $1; - $species =~ s/=.+//; - $Species_names_hash{ $species } = ""; - } - } - close( IN_RSNF ); - - return; -} ## readSpeciesNamesFile - - - -# Last modified: 02/19/02 -sub checkForNumberBetween0and100 { - - my $x = $_[ 0 ]; - - if ( !defined( $x ) || $x !~ /\d/ || $x =~ /\D/ || $x > 100 || $x < 0 ) { - return 0; - } - else { - return 1; - } - -} ## checkForNumberBetween0and100 - - - -# Last modified: 02/19/02 -sub getNumberOfSeqsFromNBDfile { - my $infile = $_[ 0 ]; - my $return_line = ""; - my $number_of_seqs = 0; - - open( C, "$infile" ) || &nph_fatal_error( "could not open NBD file" ); - while ( $return_line = ) { - if ( $return_line =~ /^\s*(\d+)\s*$/ ) { - $number_of_seqs = $1; - last; - } - } - close( C ); - return $number_of_seqs; - -} ## getNumberOfSeqsFromNBDfile - - - -# Last modified: 02/19/02 -sub print_waiting_message { - - my $njobs = $_[ 0 ]; - - print( "

\n" ); - print( "RIO: There are $njobs searches queued ahead of you on the RIO server. Please wait...\n" ); - print( "

\n" ); - - return; - -} ## print_waiting_message - - - -# Last modified: 02/19/02 -sub checkForPresenceOfSpecies { - - my $species = $_[ 0 ]; - - &readSpeciesNamesFile( $SPECIESLIST ); - unless( exists( $Species_names_hash{ $species } ) ) { - &nph_user_error( "species \"$species\" not present in currently used species tree" ); - } - - return; -} ## checkForPresenceOfSepecies - - - -# Last modified: 02/19/02 -sub checkForTextFilePresence { - - my $file = $_[ 0 ]; - - if ( ( -s $file ) && ( -f $file ) && ( -T $file ) ) { - return 1; - } - else { - return 0; - } - -} ## checkForTextFilePresence - - - - - -# Last modified: 02/19/02 -sub print_footer { - - &print_navbar(); - &print_contact(); - print( "\n" ); - print( "\n" ); - - return; - -} ## print_footer - - - -# Last modified: 02/19/02 -sub print_navbar { - - print( "

\n" ); - print( "

\n" ); - print( "RIO $VERSION \n" ); - print( "phylogenomic analysis of a protein sequence | " ); - print( "help | " ); - print( "forester/rio home | " ); - print( "pfam\n" ); - print( "

\n" ); - print( "

\n" ); - - return; - -} ## print_navbar - - - -# Last modified: 02/19/02 -sub print_contact { - - print( "

comments, questions, flames? email $CONTACT

\n" ); - - return; - -} ## print_contact - - - -# Last modified: 02/19/02 -sub showATVlinks { - - my $domain_no = 0; - - if ( -s "$TEMPDIR/$$.outfile.rio.nhx" ) { - $domain_no = 1; - system( "mv", "$TEMPDIR/$$.outfile.rio.nhx", $DIR_FOR_TREES ) - && &nph_fatal_error( "could not mv $TEMPDIR/$$.outfile.rio.nhx" ); - } - elsif ( -s "$TEMPDIR/$$.outfile.rio-1.nhx" ) { - $domain_no = 1; - while ( -s "$TEMPDIR/$$.outfile.rio-$domain_no.nhx" ) { - system( "mv", "$TEMPDIR/$$.outfile.rio-$domain_no.nhx", $DIR_FOR_TREES ) - && &nph_fatal_error( "could not mv $TEMPDIR/$$.outfile.rio-$domain_no.nhx.nhx" ); - $domain_no++; - } - - } - - - if ( $domain_no == 1 ) { - $output .= "

\n"; - $output .= "\n"; - $output .= "\n"; - $output .= "

\n"; - $output .= "download NHX file describing this tree

\n"; - } - elsif ( $domain_no > 1 ) { - $output .= "

\n"; - $output .= "\n"; - $output .= "\n"; - } - $output .= "

\n"; - $output .= "download NHX file for domain #$x

\n"; - } - - return; - -} ## showATVlinks - - -# Removes output tree (NHX) files if more than $_[ 2 ] in $_[ 0 ] -# Removes until $_[ 1 ] are left -# Last modified: 02/19/02 -sub removeFiles { - - my $dir = $_[ 0 ]; - my $target = $_[ 1 ]; - my $max = $_[ 2 ]; - - my $files = &countFilesInDir( $dir ); - - if ( $files > $max ) { - - my $diff = $files - $target; - - for ( my $i = 0; $i < $diff; $i++ ) { - &removeOldestFile( $dir ); - } - } - - return; -} ## removeFiles - - - -# Last modified: 02/19/02 -sub countFilesInDir { - - my $dir = $_[ 0 ]; - my $file = ""; - my $c = 0; - - opendir( DIR, $dir ) || &nph_fatal_error( "could not open dir $dir" ); - while( defined ( $file = readdir( DIR ) ) ) { - unless ( $file =~ /\d/ ) { - next; - } - $c++; - } - closedir( DIR ); - - return( $c ); - -} ## countFilesInDir - - - -# Last modified: 02/19/02 -sub removeOldestFile { - my $dir = $_[ 0 ]; - my $file = ""; - my $oldest = ""; - my $smallest_time = 0; - my $time = 0; - my $first = 1; - - opendir( DIR, $dir ) || &nph_fatal_error( "could not open dir $dir" ); - while( defined ( $file = readdir( DIR ) ) ) { - unless ( $file =~ /\d/ ) { - next; - } - $file =~ /(\d+)/; - $time = $1; - if ( $first == 1 ) { - $first = 0; - $smallest_time = $time; - $oldest = $file - } - elsif ( $time < $smallest_time ) { - $smallest_time = $time; - $oldest = $file; - } - } - closedir( DIR ); - - unlink( $dir.$oldest ) || &nph_fatal_error( "could not delete $oldest" ); - - return; - -} ## removeOldestFile - - - -# Last modified: 02/19/02 -sub print_ATV_JavaScript { - -print < - - - -END - - return; - -} ## print_ATV_JavaScript - - - -# Last modified: 02/19/02 -sub estimateTime { - my $number_of_seqs = $_[ 0 ]; - my $estimated_time = 0; - if ( $number_of_seqs <= 50 ) { - $estimated_time = 15; - } - elsif ( $number_of_seqs <= 100 ) { - $estimated_time = 20; - } - elsif ( $number_of_seqs <= 150 ) { - $estimated_time = 30; - } - elsif ( $number_of_seqs <= 200 ) { - $estimated_time = 35; - } - elsif ( $number_of_seqs <= 250 ) { - $estimated_time = 40; - } - elsif ( $number_of_seqs <= 300 ) { - $estimated_time = 60; - } - elsif ( $number_of_seqs <= 400 ) { - $estimated_time = 100; - } - elsif ( $number_of_seqs <= 500 ) { - $estimated_time = 160; - } - elsif ( $number_of_seqs <= 600 ) { - $estimated_time = 390; - } - elsif ( $number_of_seqs <= 700 ) { - $estimated_time = 530; - } - elsif ( $number_of_seqs <= 800 ) { - $estimated_time = 750; - } - elsif ( $number_of_seqs <= 900 ) { - $estimated_time = 850; - } - else { - $estimated_time = $number_of_seqs; - } - return $estimated_time; -} ## estimateTime - - - -# Last modified: 02/19/02 -sub nph_rio_error { - - my $mesg = $_[ 0 ]; - - &queue::RemoveFromQueue( "rioqueue", $remote_addr, $TEMPDIR, $$ ); - - unlink( "$TEMPDIR/$$.query", "$TEMPDIR/$$.tree" ); - - - - if ( $user_defined_tree == 1 ) { - print( "

RIO error

\n" ); - print( "

[the RIO analysis appearently died]

\n" ); - print( "

the most likely source of this error is an invalid user defined species tree

\n" ); - } - else { - print( "

RIO server fatal error

\n" ); - print( "

[the RIO analysis appearently died for unknown reasons]

\n" ); - print( "

This type of error should not happen

\n" ); - print( "

\n" ); - print( "We may have logged it automatically, but we would appreciate it if you would also notify us at\n" ); - print( "$CONTACT\n" ); - print( "

\n" ); - } - print( "

\n" ); - - &print_footer(); - system( "rm -r $TEMPDIR/$$"."0" ); - die; - -} ## nph_fatal_error - - - -# Last modified: 02/19/02 -sub nph_fatal_error { - - my $mesg = $_[ 0 ]; - - &queue::RemoveFromQueue( "rioqueue", $remote_addr, $TEMPDIR, $$ ); - - unlink( "$TEMPDIR/$$.query", "$TEMPDIR/$$.tree" ); - - print( "

RIO server fatal error

\n" ); - print( "

[$mesg : $!]

\n" ); - print( "

This type of error should not happen

\n" ); - print( "

\n" ); - print( "We may have logged it automatically, but we would appreciate it if you would also notify us at\n" ); - print( "$CONTACT\n" ); - print( "

\n" ); - print( "

\n" ); - - - &print_footer(); - die; - -} ## nph_fatal_error - - - -# Last modified: 02/19/02 -sub nph_user_error { - - my $mesg = $_[ 0 ]; - - &queue::RemoveFromQueue( "rioqueue", $remote_addr, $TEMPDIR, $$ ); - - unlink( "$TEMPDIR/$$.query", "$TEMPDIR/$$.tree" ); - - print( "

user error

\n" ); - print( "

\n" ); - print( "$mesg\n" ); - print( "

\n" ); - print( "

\n" ); - - - &print_footer(); - - die "nph-riowebserver handled: $mesg"; - -} ## nph_user_error - - - - diff --git a/forester/archive/perl/p7extract.pl b/forester/archive/perl/p7extract.pl deleted file mode 100755 index fe6a69e..0000000 --- a/forester/archive/perl/p7extract.pl +++ /dev/null @@ -1,116 +0,0 @@ -#! /usr/bin/perl - -# Usage: p7extract.pl -# -# Converts hmmsearch output to GLF or GDF. GLF is the default. -# Order is sorted by bit score -# -# Options: -# -C : extract Pfam coverage statistics (NAR paper) -# -d : extract domains in GDF format -# -t : report only hits better than evalue of -# -s : include scores in output -# -e : include evalues in output -# -l : include negative log evalues in output for easy sorting -# -# Note: p7extract.pl -sel gives the extended GLF format expected by -# the PROFMARK benchmark scripts - -require "getopts.pl"; - -$ethresh = 0; - -&Getopts('Cdt:sel'); -if ($opt_C) { $coverage_mode = 1; $gdfmode = 1;} -if ($opt_d) { $gdfmode = 1; } -if ($opt_t) { $ethresh = $opt_t; } -if ($opt_s) { $do_scores = 1; } -if ($opt_e) { $do_eval = 1; } -if ($opt_l) { $do_log = 1; } - -$status = 1; # -C only: assume we will fail, 'til proven otherwise - -while (<>) -{ - if (/^Query HMM:\s+(\S+)/) {$hmmname = $1;} - if (/^Scores for complete sequences/) {$indom = 0; $inseq = 1;} - if (/^Parsed for domains/) {$indom = 1; $inseq = 0;} - if (/^Histogram of all scores/) {$indom = 0; $inseq = 0;} - if (/^Total sequences searched/) {$status = 0;} # looks like we've seen output - - if ( $inseq && - (($id, $sc, $ev, $nd) = /(\S+).+\s(\S+)\s+(\S+)\s+(\d+)\s*$/)) - { - if (($ethresh == 0 || $ev < $ethresh) && $show_key{$id} == 0) - { - if (! $gdfmode) { - $show_key{$id} = 1; # remember this name - $show_sc{$id} = $sc; - $show_ev{$id} = $ev; - } - $numseqs++; - } - } - - if ($gdfmode && $indom && - (($id, $sqfrom, $sqto, $sc, $ev) = - /(\S+)\s+\S+\s+(\d+)\s+(\d+).+\s(\S+)\s+(\S+)\s*$/)) - { - if (($ethresh == 0 || $ev < $ethresh) && $show_key{$id} == 0) - { - $key = "$id/$sqfrom-$sqto"; - $show_key{$key} = 1; - $show_id{$key} = $id; - $show_sqfrom{$key} = $sqfrom; - $show_sqto{$key} = $sqto; - $show_sc{$key} = $sc; - $show_ev{$key} = $ev; - - $numdomains++; - - $domsize = $sqto - $sqfrom + 1; - if ($domsize < 0) { $domsize *= -1; } - $numresidues += $domsize; - } - } - -} - -if ($coverage_mode) -{ - if ($status == 0) { - printf "%-20s %6d %6d %6d\n", $hmmname, $numseqs, $numdomains, $numresidues; - exit 0; - } else { - printf "%-20s [FAILED]\n", $hmmname; - exit 1; - } - -} - - - -foreach $key (sort byscore keys(%show_key)) -{ - if ($gdfmode) - { - printf("%-24s\t%6d\t%6d\t%15s", - $key, $show_sqfrom{$key}, $show_sqto{$key}, $show_id{$key}) - } else { - printf("%-24s", $key); - } - # Optional extensions to GDF/GLF - if ($do_scores) { printf("\t%8s", $show_sc{$key}); } - if ($do_eval) { printf("\t%12s", $show_ev{$key}); } - if ($do_log) { printf("\t%12.1f", -log($show_ev{$key})); } - print "\n"; -} - -sub byscore { - $show_sc{$b} <=> $show_sc{$a}; -} - -sub byevalue { - $show_ev{$a} <=> $show_ev{$b}; -} - diff --git a/forester/archive/perl/pfam2pwd.pl b/forester/archive/perl/pfam2pwd.pl deleted file mode 100755 index 73ac495..0000000 --- a/forester/archive/perl/pfam2pwd.pl +++ /dev/null @@ -1,743 +0,0 @@ -#!/usr/bin/perl -W - -# pfam2pwd.pl -# ----------- -# Copyright (C) 1999-2002 Washington University School of Medicine -# and Howard Hughes Medical Institute -# All rights reserved -# -# Author: Christian M. Zmasek -# zmasek@genetics.wustl.edu -# http://www.genetics.wustl.edu/eddy/people/zmasek/ -# -# Created: 05/17/01 -# -# Last modified 02/20/03 -# -# -# See RIO_INSTALL on how to use this program. -# ------------------------------------------ -# - -use strict; - -use FindBin; -use lib $FindBin::Bin; -use rio_module; - -my $VERSION = "3.002"; - - - -# ============================================================================= -# ============================================================================= -# -# THESE VARIABLES NEED TO BE SET BY THE USER -# ------------------------------------------ -# - - -# Pfam alignments to calculate pairwise distances from: -# ----------------------------------------------------- -my $MY_PFAM_FULL_DIRECTORY = "/path/to/Pfam/Full/"; # must end with "/" - - - -# This file lists all the alignments for which to calculate pairwise distances -# from. If left empty, ALL the alignments in $MY_PFAM_FULL_DIRECTORY -# will be used: -# ---------------------------------------------------------------------------- -my $ALGNS_TO_USE_LIST_FILE = ""; - - - -# This is _VERY IMPORTANT_. It determines the species whose sequences -# are being used (sequences from species not listed in $MY_SPECIES_NAMES_FILE -# are ignored). Normally, one would use the same list as RIO uses -# ($SPECIES_NAMES_FILE in "rio_module.pm") -- currently "tree_of_life_bin_1-6.nhx". -# -# For certain large families (such as protein kinases), one might use a -# species file which contains less species in order to be able to finish -# the calculations in reasonable time. -# For example, to exclude most mammals, use: -# my $MY_SPECIES_NAMES_FILE = $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-6_species_list_NO_RAT_MONKEYS_APES_SHEEP_GOAT_HAMSTER" -# (to only use sequences from SWISS-PROT add this line: -# $TREMBL_ACDEOS_FILE = $PATH_TO_FORESTER."data/NO_TREMBL";) -# ---------------------------------------------------------------------------- -my $MY_SPECIES_NAMES_FILE = $SPECIES_NAMES_FILE; - - - -# This is were the output goes (must end with "/") -# ------------------------------------------------ -my $MY_RIO_PWD_DIRECTORY = "/path/to/pfam2pwd_out/pwd/"; -my $MY_RIO_BSP_DIRECTORY = "/path/to/pfam2pwd_out/bsp/"; -my $MY_RIO_NBD_DIRECTORY = "/path/to/pfam2pwd_out/nbd/"; -my $MY_RIO_ALN_DIRECTORY = "/path/to/pfam2pwd_out/aln/"; -my $MY_RIO_HMM_DIRECTORY = "/path/to/pfam2pwd_out/hmm/"; - - - -# A directory to create temporary files in: -# ----------------------------------------- -my $MY_TEMP_DIR = "/tmp/"; # must end with "/" - - - -# Alignments in which the number of sequences after pruning (determined -# by "$MY_SPECIES_NAMES_FILE") is lower than this, are ignored -# (no calculation of pwds): -# ------------------------------------------------------------------ -my $MIN_SEQS = 5; - - - -# Alignments in which the number of sequences after pruning (determined -# by "$MY_SPECIES_NAMES_FILE") is greater than this, are ignored -# (no calculation of pwds): -# ------------------------------------------------------------------ -my $MAX_SEQS = 700; - - - -# Seed for the random number generator for bootstrapping (must be 4n+1): -# --------------------------------------------------------------------- -my $MY_SEED = 85; - - - -# This is used to choose the model to be used for the (ML) -# distance calculation: -# IMPORTANT: "$MY_MATRIX_FOR_PWD" in "rio_module.pm" needs to -# have the same value, when the pwds calculated are going to -# be used for RIO! -# 0 = JTT -# 2 = BLOSUM 62 -# 3 = mtREV24 -# 5 = VT -# 6 = WAG -# PAM otherwise -# -------------------------------------------------------- -my $MY_MATRIX = 2; - - - -# -# End of variables which need to be set by the user. -# -# ============================================================================= -# ============================================================================= - - - - - - - - -my $too_small = 0; -my $too_large = 0; -my $i = 0; -my $seqs = 0; -my $filename = ""; -my $tmp_dir = ""; -my $current_dir = ""; -my $return_line = ""; -my @filenames = (); -my @too_small_names = (); -my @too_large_names = (); -my %Species_names_hash = (); -my %AC_OS = (); # AC -> species name -my %AC_DE = (); # AC -> description -my %ALGNS_TO_USE = (); # name of alignment -> "" -my $use_algns_to_use_list = 0; -my $LOGFILE = "00_pfam2pwd_LOGFILE"; - $HMMBUILD = $HMMBUILD." --amino"; - - -&createTempdir(); - - -&startLogfile(); - - -opendir( DIR, $MY_PFAM_FULL_DIRECTORY ) || die "\n\n$0: Cannot open directory $MY_PFAM_FULL_DIRECTORY: $!\n\n"; -$i = 0; -while( defined( $filename = readdir( DIR ) ) ) { - if ( $filename =~ /^\.\.?$/ ) { - next; - } - $filenames[ $i ] = $filename; - $i++; -} -close( DIR ); - - -&readSpeciesNamesFile( $MY_SPECIES_NAMES_FILE ); - -&readTrEMBL_ACDEOS_FILE(); - -if ( defined( $ALGNS_TO_USE_LIST_FILE ) && $ALGNS_TO_USE_LIST_FILE =~ /\w/ ) { - $use_algns_to_use_list = 1; - &readListFile(); -} - - -$current_dir = `pwd`; -$current_dir =~ s/\s//; -chdir ( $tmp_dir ) -|| die "\n\n$0: Unexpected error: Could not chdir to <<$tmp_dir>>: $!"; - -$i = 0; - -FOREACH_ALIGN: foreach $filename ( @filenames ) { - - # If the corresponding pwd, positions, and aln files seem to already exists, do next one. - if ( ( -e $MY_RIO_PWD_DIRECTORY.$filename.$SUFFIX_PWD ) - && ( -e $MY_RIO_BSP_DIRECTORY.$filename.$SUFFIX_BOOT_STRP_POS ) - && ( -e $MY_RIO_NBD_DIRECTORY.$filename.$SUFFIX_PWD_NOT_BOOTS ) - && ( -e $MY_RIO_ALN_DIRECTORY.$filename.$ALIGN_FILE_SUFFIX ) - && ( -e $MY_RIO_HMM_DIRECTORY.$filename.$SUFFIX_HMM ) ) { - next FOREACH_ALIGN; - } - - if ( $use_algns_to_use_list == 1 && !exists( $ALGNS_TO_USE{ $filename } ) ) { - next FOREACH_ALIGN; - } - - - $seqs = &removeSeqsFromPfamAlign( $MY_PFAM_FULL_DIRECTORY.$filename, - "REM_SEQ_OUTFILE", - 1 ); - if ( $seqs < $MIN_SEQS ) { - unlink( "REM_SEQ_OUTFILE" ); - $too_small_names[ $too_small++ ] = $filename; - next FOREACH_ALIGN; - } - elsif ( $seqs > $MAX_SEQS ) { - unlink( "REM_SEQ_OUTFILE" ); - $too_large_names [ $too_large++ ] = $filename; - next FOREACH_ALIGN; - } - - - print "\n\n\n"; - print " ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"; - print " $i: $filename ($seqs seqs)\n"; - print " ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"; - print "\n"; - - # If one of the two file exists from a previous (interrupted) run. - unlink( $MY_RIO_PWD_DIRECTORY.$filename.$SUFFIX_PWD ); - unlink( $MY_RIO_BSP_DIRECTORY.$filename.$SUFFIX_BOOT_STRP_POS ); - unlink( $MY_RIO_NBD_DIRECTORY.$filename.$SUFFIX_PWD_NOT_BOOTS ); - unlink( $MY_RIO_ALN_DIRECTORY.$filename.$ALIGN_FILE_SUFFIX ); - unlink( $MY_RIO_HMM_DIRECTORY.$filename.$SUFFIX_HMM ); - - - &executeHmmbuild( "REM_SEQ_OUTFILE", - $MY_RIO_ALN_DIRECTORY.$filename.$ALIGN_FILE_SUFFIX, - "hmm" ); - - if ( unlink( "hmm" ) != 1 ) { - die "\n\n$0: Unexpected error: Could not delete <>: $!"; - } - - if ( unlink( "REM_SEQ_OUTFILE" ) != 1 ) { - die "\n\n$0: Unexpected error: Could not delete <>: $!"; - } - - executeHmmbuildHand( $MY_RIO_ALN_DIRECTORY.$filename.$ALIGN_FILE_SUFFIX, - $MY_RIO_HMM_DIRECTORY.$filename.$SUFFIX_HMM ); - - system( $HMMCALIBRATE, $MY_RIO_HMM_DIRECTORY.$filename.$SUFFIX_HMM ) - && die "\n\n$0: Could not execute \"$HMMCALIBRATE $MY_RIO_HMM_DIRECTORY.$filename.$SUFFIX_HMM\": $!"; - - &pfam2phylipMatchOnly( $MY_RIO_ALN_DIRECTORY.$filename.$ALIGN_FILE_SUFFIX, "infile" ); - - &executePuzzle( "infile", $MY_MATRIX ); - - system( "mv", "infile.dist", $MY_RIO_NBD_DIRECTORY.$filename.$SUFFIX_PWD_NOT_BOOTS ) - && die "\n\n$0: Unexpected error: $!"; - - &executeBootstrap( "infile", - $BOOTSTRAPS, - "BOOTSTRAPPED_ALGN", - $MY_RIO_BSP_DIRECTORY.$filename.$SUFFIX_BOOT_STRP_POS, - $MY_SEED ); - - if ( unlink( "infile" ) != 1 ) { - die "\n\n$0: Unexpected error: Could not delete <>: $!"; - } - - - &executePuzzleBootstrapped( "BOOTSTRAPPED_ALGN", $MY_MATRIX ); - - ##if ( unlink( "outfile" ) != 1 ) { - ## die "\n\n$0: Unexpected error: Could not delete <>: $!"; - ##} - - - system( "mv", "BOOTSTRAPPED_ALGN".".dist", $MY_RIO_PWD_DIRECTORY.$filename.$SUFFIX_PWD ) - && die "\n\n$0: Unexpected error: $!\n\n"; - - if ( unlink( "BOOTSTRAPPED_ALGN" ) != 1 ) { - die "\n\n$0: Unexpected error: Could not delete <>: $!"; - } - - $i++; - -} ## End of FOREACH_ALIGN loop. - - -chdir( $current_dir ) -|| die "\n\n$0: Unexpected error: Could not chdir to <<$current_dir>>: $!"; - -rmdir( $tmp_dir ); - -&finishLogfile(); - -print "\n\n\n"; -print( "pfam2pwd.pl: Done.\n" ); -print( "Successfully calculated $i pairwise distance files.\n" ); -print( "Too large alignments (>$MAX_SEQS): $too_large\n" ); -print( "Too small alignments (<$MIN_SEQS): $too_small\n" ); -print( "See the logfile \"$MY_RIO_PWD_DIRECTORY".$LOGFILE."\"\n" ); -print "\n\n\n"; - -exit( 0 ); - - - - - - -# Methods -# ------- - - - -# Three arguments: -# 1. Stockholm alignment -# 2. Outalignment -# 3. Outhmm -# Returns the options used. -# Last modified: 06/26/01 -sub executeHmmbuild { - - my $full = $_[ 0 ]; - my $outalignment = $_[ 1 ]; - my $outhmm = $_[ 2 ]; - my $options = ""; - - unless ( ( -s $full ) && ( -f $full ) && ( -T $full ) ) { - die "\n\n$0: \"$full\" does not exist, is empty, or is not a plain textfile.\n\n"; - } - - $options = getHmmbuildOptionsFromPfam( $full ); - - $options =~ s/-f//; - $options =~ s/-g//; - $options =~ s/-s//; - $options =~ s/-F//; - $options =~ s/-A//; - $options =~ s/-o\s+\S+//; - $options =~ s/(\s|^)[^-]\S+/ /g; - - if ( $options =~ /--prior/ ) { - my $basename = basename( $full ); - $basename .= ".PRIOR"; - $options =~ s/--prior/--prior $PRIOR_FILE_DIR$basename/; - } - - # Remove for versions of HMMER lower than 2.2. - if ( $options =~ /--informat\s+\S+/ ) { - $options =~ s/--informat\s+\S+/-/; - } - - system( "$HMMBUILD $options -o $outalignment $outhmm $full" ) - && die "\n\n$0: Could not execute \"$HMMBUILD $options -o $outalignment $outhmm $full\".\n\n"; - - return $options; - -} ## executeHmmbuild. - - -# Two arguments: -# 1. Stockholm alignment -# 2. Outhmm -# Returns the options used. -# Last modified: 06/26/01 -sub executeHmmbuildHand { - - my $full = $_[ 0 ]; - my $outhmm = $_[ 1 ]; - my $options = ""; - - unless ( ( -s $full ) && ( -f $full ) && ( -T $full ) ) { - die "\n\n$0: \"$full\" does not exist, is empty, or is not a plain textfile.\n\n"; - } - - $options = getHmmbuildOptionsFromPfam( $full ); - - $options =~ s/-f//; - $options =~ s/-g//; - $options =~ s/-s//; - $options =~ s/-F//; - $options =~ s/-A//; - $options =~ s/-o\s+\S+//; - $options =~ s/(\s|^)[^-]\S+/ /g; - - if ( $options =~ /--prior/ ) { - my $basename = basename( $full ); - $basename .= ".PRIOR"; - $options =~ s/--prior/--prior $PRIOR_FILE_DIR$basename/; - } - - # Remove for versions of HMMER lower than 2.2. - if ( $options =~ /--informat\s+\S+/ ) { - $options =~ s/--informat\s+\S+/-/; - } - - system( "$HMMBUILD --hand $options $outhmm $full" ) - && die "\n\n$0: Could not execute \"$HMMBUILD -- hand $options $outhmm $full\".\n\n"; - - return $options; - -} ## executeHmmbuildHand. - - - -# One argument: -# Pfam align name. -# Last modified: 02/26/01 -sub getHmmbuildOptionsFromPfam { - - my $infile = $_[ 0 ]; - my $return_line = ""; - my $result = ""; - - unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { - die "\n\n$0: \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n"; - } - - open( GHO, $infile ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!"; - while ( $return_line = ) { - if ( $return_line =~ /^\s*#.*hmmbuild\s+(.+)\s*$/ ) { - $result = $1; - close( GHO ); - return $result; - } - } - close( GHO ); - return $result; - -} ## getHmmbuildOptionsFromPfam - - - -# Similar to the method with the same name in "rio.pl". -# Removes sequences from a Pfam flat file. -# Adds species to TrEMBL seqs. -# It can remove all sequences not from species listed in a species names file. -# It can remove all sequences which do not have a SWISS-PROT name (XXXX_XXXXX) -# Three arguments: -# 1. Pfam flat file name -# 2. outfile name -# 3. 1 to remove TrEMBL seqs with "(FRAGMENT)" in their DE line. -# Returns the number of sequences in the resulting alignment. -# If a query name is given, it returns -1 if query is not found in alignment, -# -10 if the name is not unique. -# Last modified: 05/24/02 -sub removeSeqsFromPfamAlign { - my $infile = $_[ 0 ]; - my $outfile = $_[ 1 ]; - my $remove_frags = $_[ 2 ]; - my $return_line = ""; - my $saw_sequence_line = 0; - my $number_of_seqs = 0; - my $DE = ""; - my $OS = ""; - my $AC = ""; - my $i = 0; - my $length = 0; - my $seq_name = ""; - my $seq = ""; - - - open( OUT_RNSP, ">$outfile" ) || die "\n\n$0: Unexpected error: Cannot create file \"$outfile\": $!"; - open( IN_RNSP, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!"; - while ( $return_line = ) { - - if ( $saw_sequence_line == 1 - && !&containsPfamNamedSequence( $return_line ) - && !&isPfamCommentLine( $return_line ) ) { - # This is just for counting purposes. - $saw_sequence_line = 2; - } - if ( &isPfamSequenceLine( $return_line ) ) { - if ( $saw_sequence_line == 0 ) { - $saw_sequence_line = 1; - } - $return_line =~ /^\s*(\S+)\s+(\S+)/; - $seq_name = $1; - $seq = $2; - if ( !&startsWithSWISS_PROTname( $return_line ) ) { - $seq_name =~ /^(\S+)\//; - $AC = $1; - unless( exists( $AC_OS{ $AC } ) ) { - #ACs not present in "ACDEOS" file. - next; - } - $OS = $AC_OS{ $AC }; - if ( !$OS || $OS eq "" ) { - die "\n\n$0: Unexpected error: species for \"$AC\" not found.\n\n"; - } - unless( exists( $Species_names_hash{ $OS } ) ) { - next; - } - if ( $remove_frags == 1 ) { - $DE = $AC_DE{ $AC }; - if ( $DE && $DE =~ /$FRAGMENT$/ ) { - next; - } - } - $seq_name =~ s/\//_$OS\//; - } - else { - if ( $return_line =~ /_([A-Z0-9]{1,5})\// ) { - unless( exists( $Species_names_hash{ $1 } ) ) { - next; - } - } - # remove everything whose species cannot be determined. - else { - next; - } - } - $length = length( $seq_name ); - for ( $i = 0; $i <= ( $LENGTH_OF_NAME - $length - 1 ); $i++ ) { - $seq_name .= " "; - } - $return_line = $seq_name.$seq."\n"; - } - - if ( !&isPfamCommentLine( $return_line ) ) { - print OUT_RNSP $return_line; - } - - if ( $saw_sequence_line == 1 ) { - $number_of_seqs++; - } - } ## while ( $return_line = ) - close( IN_RNSP ); - close( OUT_RNSP ); - - return $number_of_seqs; - -} ## removeSeqsFromPfamAlign - - - - - - - -# Reads in (SWISS-PROT) species names from a file. -# Names must be separated by newlines. -# Lines beginning with "#" are ignored. -# A possible "=" and everything after is ignored. -# One argument: species-names-file name -# Last modified: 04/24/01 -sub readSpeciesNamesFile { - my $infile = $_[ 0 ]; - my $return_line = ""; - my $species = ""; - - unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { - die "\n\n$0: Error: \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n"; - } - - open( IN_RSNF, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!\n\n"; - while ( $return_line = ) { - if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)/ ) { - $species = $1; - $species =~ s/=.+//; - $Species_names_hash{ $species } = ""; - } - } - close( IN_RSNF ); - - return; -} ## readSpeciesNamesFile - - - -# Last modified: 02/21/03 -sub readTrEMBL_ACDEOS_FILE { - - my $return_line = ""; - - unless ( ( -s $TREMBL_ACDEOS_FILE ) && ( -f $TREMBL_ACDEOS_FILE ) && ( -T $TREMBL_ACDEOS_FILE ) ) { - die "\n\n$0: Error: \"$TREMBL_ACDEOS_FILE\" does not exist, is empty, or is not a plain textfile.\n\n"; - } - # Fill up (huge) hashs. - open( HH, "$TREMBL_ACDEOS_FILE" ) || die "\n\n$0: Unexpected error: Cannot open file <<$TREMBL_ACDEOS_FILE>>: $!\n\n"; - while ( $return_line = ) { - - if ( $return_line =~ /(\S+);([^;]*);(\S+)/ ) { - $AC_OS{ $1 } = $3; - $AC_DE{ $1 } = $2; - } - } - close( HH ); -} ## readTrEMBL_ACDEOS_FILE - - -# Last modified: 02/21/03 -sub readListFile { - - my $return_line = ""; - - unless ( ( -s $ALGNS_TO_USE_LIST_FILE ) && ( -f $ALGNS_TO_USE_LIST_FILE ) && ( -T $ALGNS_TO_USE_LIST_FILE ) ) { - die "\n\n$0: Error: \"$ALGNS_TO_USE_LIST_FILE\" does not exist, is empty, or is not a plain textfile.\n\n"; - } - # Fill up hash. - open( LF, "$ALGNS_TO_USE_LIST_FILE" ) || die "\n\n$0: Unexpected error: Cannot open file <<$ALGNS_TO_USE_LIST_FILE>>: $!\n\n"; - while ( $return_line = ) { - if ( $return_line =~ /^\s*(\S+)\s*$/ ) { # just a list - $ALGNS_TO_USE{ $1 } = ""; - } - elsif ( $return_line =~ /^\s*\S+\s+\S+\s+(\S+)/ ) { # "changes" list from Pfam - $ALGNS_TO_USE{ $1 } = ""; - } - - } - close( LF ); - -} ## readListFile - - - -# Five arguments: -# 1. Name of inputfile -# 2. Bootstraps -# 2. Name of output alignment file -# 3. Name of output positions file -# 4. Seed for random number generator -# -# Last modified: 06/23/01 -sub executeBootstrap { - my $infile = $_[ 0 ]; - my $bootstraps = $_[ 1 ]; - my $outalign = $_[ 2 ]; - my $positions = $_[ 3 ]; - my $seed = $_[ 4 ]; - - system( "$BOOTSTRAP_CZ_PL 0 $bootstraps $infile $outalign $positions $seed" ) - && die "\n\n$0: executeBootstrap:\nCould not execute \"$BOOTSTRAP_CZ_PL 0 $bootstraps $infile $outalign $positions $seed\".\n\n"; - -} ## executeBootstrap - - - - -# Last modified: 05/22/02 -sub createTempdir { - - my $ii = 0; - my $time = time; - - $tmp_dir = $MY_TEMP_DIR.$time.$ii; - - while ( -e $tmp_dir ) { - $ii++; - $tmp_dir = $MY_TEMP_DIR.$time.$ii; - } - - mkdir( $tmp_dir, 0777 ) - || die "\n\n$0: Unexpected error: Could not create <<$tmp_dir>>: $!\n\n"; - - unless ( ( -e $tmp_dir ) && ( -d $tmp_dir ) ) { - die "\n\n$0: Unexpected error: failed to create <<$tmp_dir>>.\n\n"; - } - -} ## createTempdir - - - -# Last modified: 05/17/01 -sub startLogfile { - if ( -e $MY_RIO_PWD_DIRECTORY.$LOGFILE ) { - print "\npfam2pwd.pl:\n"; - print "logfile $MY_RIO_PWD_DIRECTORY"."$LOGFILE already exists\n"; - print "rename it or place it in another directory\n"; - exit( -1 ); - } - - open( L, ">$MY_RIO_PWD_DIRECTORY".$LOGFILE ) - || die "\n\n$0: startLogfile: Cannot create logfile: $!\n\n"; - print L "Min seqs : $MIN_SEQS\n"; - print L "Max seqs : $MAX_SEQS\n"; - print L "Seed : $MY_SEED\n"; - print L "TrEMBL ACDEOS file : $TREMBL_ACDEOS_FILE\n"; - print L "Species names file : $MY_SPECIES_NAMES_FILE\n"; - print L "Pfam directory : $MY_PFAM_FULL_DIRECTORY\n"; - print L "PWD outputdirectory: $MY_RIO_PWD_DIRECTORY\n"; - print L "BSP outputdirectory: $MY_RIO_BSP_DIRECTORY\n"; - print L "NBD outputdirectory: $MY_RIO_NBD_DIRECTORY\n"; - print L "ALN outputdirectory: $MY_RIO_ALN_DIRECTORY\n"; - print L "HMM outputdirectory: $MY_RIO_HMM_DIRECTORY\n"; - print L "Start date : ".`date`; - if ( $MY_MATRIX == 0 ) { - print L "Matrix : JTT\n"; - } - elsif ( $MY_MATRIX == 2 ) { - print L "Matrix : BLOSUM 62\n"; - } - elsif ( $MY_MATRIX == 3 ) { - print L "Matrix : mtREV24\n"; - } - elsif ( $MY_MATRIX == 5 ) { - print L "Matrix : VT\n"; - } - elsif ( $MY_MATRIX == 6 ) { - print L "Matrix : WAG\n"; - } - elsif ( $MY_MATRIX == 7 ) { - print L "Matrix : auto\n"; - } - else { - print L "Matrix : PAM\n"; - } -} ## startLogfile - - - -# Last modified: 05/17/01 -sub finishLogfile { - my $j = 0; - print L "\n\n"; - print L "Successfully calculated $i pairwise distance files.\n"; - print L "Too large alignments (>$MAX_SEQS): $too_large\n"; - print L "Too small alignments (<$MIN_SEQS): $too_small\n"; - print L "Finish date : ".`date`."\n\n"; - - print L "List of the $too_large alignments which were ignored because they\n"; - print L "contained too many sequences (>$MAX_SEQS) after pruning:\n"; - for ( $j = 0; $j < $too_large; ++$j ) { - print L "$too_large_names[ $j ]\n"; - } - print L "\n\n"; - print L "List of the $too_small alignments which were ignored because they\n"; - print L "contained not enough sequences (<$MIN_SEQS) after pruning:\n"; - for ( $j = 0; $j < $too_small; ++$j ) { - print L "$too_small_names[ $j ]\n"; - } - print L "\n"; - close( L ); -} ## finishLogfile - - - - diff --git a/forester/archive/perl/pfam2slx.pl b/forester/archive/perl/pfam2slx.pl deleted file mode 100755 index 1d591f5..0000000 --- a/forester/archive/perl/pfam2slx.pl +++ /dev/null @@ -1,94 +0,0 @@ -#! /usr/bin/perl - -# Unpack a pfam flatfile, containing many alignments, -# into separate SELEX-format alignment files. -# -# Assumes that ID is the first line in a record, -# that SQ is the last line before the alignment starts, -# and that there is one aligned sequence per line. -# - - -################################################################ -# PFAMSERVER - The Washington University/St. Louis Pfam web server -# Copyright (C) 1995-1999 Washington University School of Medicine -# Copyright (C) 1995-1999 Sanger Centre/Genome Research Ltd. -# Copyright (C) 1998-1999 Karolinska Institutet Center for Genomics Research -# All Rights Reserved -# -# This source code is distributed under the terms of the -# GNU General Public License. See the files COPYRIGHT and LICENSE -# for details. -# -################################################################ - -$cpl = 50; # 50 sequence characters per line -$/ = "\n//"; # paragraph mode on // separators - -while (<>) { - $in_alignment = 0; - $nseq = 0; - @lines = split(/^/); - while ($line = shift(@lines)) { - if ($in_alignment) { - if ($line =~ /^\#/) { next; } - elsif ($line =~ /^(\S+)\s+(\S+)/) { - $name[$nseq] = $1; - $aseq[$nseq] = $2; - $nseq++; - } - } - elsif ($line =~ /^\#=GF ID (\S+)\s*$/) { - $root = $1; - print "working on $root\n"; - if (-e "$root") { - system ("mv $root $root.orig"); - print "$root exists -- moved to $root.orig\n"; - } - open(SELEX,">$root") || die; - print SELEX "#=ID $root\n"; - } - elsif ($line =~ /^\#=GF AC (.+)$/) { print SELEX "#=AC $1\n"; } - elsif ($line =~ /^\#=GF DE (.+)$/) { print SELEX "#=DE $1\n"; } - - elsif ($line =~ /^\#=GF GA (\S+)\s+(\S+)/) - { print SELEX "#=GA $1 $2\n"; } - - elsif ($line =~ /^\#=GF TC (\S+) (\S+)/) - { print SELEX "#=TC $1 $2\n"; } - - elsif ($line =~ /^\#=GF NC (\S+) (\S+)/) - { print SELEX "#=NC $1 $2\n"; } - - elsif ($line =~ /^\#=GF SQ \d+/) { - print SELEX "# $line"; - $in_alignment = 1; - } - elsif ($line =~ /^\/\//) { - last; - } - else { - print SELEX "# $line"; - } - } - - # figure out maximum name length - $maxnamelen = 0; - for ($idx = 0; $idx < $nseq; $idx++) { - if (length($name[$idx]) > $maxnamelen) { - $maxnamelen = length($name[$idx]); - } - } - # break the alignment across - # multiple lines - $alen = length($aseq[0]); - for ($pos = 0; $pos < $alen; $pos += $cpl) { - for ($idx = 0; $idx < $nseq; $idx++) { - printf(SELEX "%-${maxnamelen}s %s\n", - $name[$idx], substr($aseq[$idx], $pos, $cpl)); - } - print SELEX "\n"; - } - close SELEX; -} - diff --git a/forester/archive/perl/queue.pm b/forester/archive/perl/queue.pm deleted file mode 100755 index a7374fe..0000000 --- a/forester/archive/perl/queue.pm +++ /dev/null @@ -1,150 +0,0 @@ -package queue; - -# Process queueing -# SRE, Wed Sep 2 14:37:14 1998 -# CVS $Id: queue.pm,v 1.1.1.1 2005/03/22 08:35:51 cmzmasek Exp $ -# Master copy: see src/queue (CVS controlled, separate from pfamserver) -# -# Written for Pfam web server; suited for queuing any set of commands. -# -# API: -# -# $mypid = $$; -# $delay_in_seconds = 2; -# -# $nqueued = &queue::CheckQueue("pfamqueue", "username", "/tmp"); -# print ("There are $nqueued jobs ahead of you in line\n"); -# &queue::WaitInQueue("pfamqueue", "username", "/tmp", $mypid, $delay_in_seconds); -# print ("Our turn! Working...\n"); -# (do stuff) -# &queue::RemoveFromQueue("pfamqueue", "username", "/tmp", $mypid); -# -# queuedir is a directory where the script has write permissions; -# typically a tmp directory of some sort. -# - - -################################################################ -# PFAMSERVER - The Washington University/St. Louis Pfam web server -# Copyright (C) 1995-1999 Washington University School of Medicine -# Copyright (C) 1995-1999 Sanger Centre/Genome Research Ltd. -# Copyright (C) 1998-1999 Karolinska Institutet Center for Genomics Research -# All Rights Reserved -# -# This source code is distributed under the terms of the -# GNU General Public License. See the files COPYRIGHT and LICENSE -# for details. -# -################################################################ -# RCS $Id: queue.pm,v 1.1.1.1 2005/03/22 08:35:51 cmzmasek Exp $ - - -# WaitInQueue() - add a process id to a queue, wait for turn -# -# Arguments: queue - name of queue (prefix of queue stamp) -# username - name of user (middle part of queue stamp) -# queuedir - directory to keep queue stamps in -# mypid - our process id -# delay - number of seconds between checking queue status -# -# Note: When it checks the queue, if a stamp is present that -# doesn't seem to correspond to a running process (ps -a), -# it deletes the stamp. This protects against crashed processes -# freezing all subsequent jobs. -# -# example: &WaitInQueue("pfamqueue", "/tmp", $mypid, 2); -# -# Returns 1 on success, 0 on failure. -# -# NOTE: You may have to set the ps command in WaitInQueue. -# It must return all running processes. -# -sub WaitInQueue -{ - local($queue, $username, $queuedir, $mypid, $delay) = @_; - local(@newqueue, @queuelist, %mark); - local(*STAMP, *QUEUEDIR); - local(%is_running); - local(@output, $line, $pid, $waiting); - - # get list of other guys who are working - opendir(QUEUEDIR, $queuedir); - @queuelist = grep(/$queue\.\S*\.\d+/, readdir(QUEUEDIR)); - closedir(QUEUEDIR); - # make stamp for our pid - if ($username eq "") { $username = "unknown"; } - open(STAMP, ">$queuedir/$queue.$username.$mypid") || return 0; - close(STAMP); - # wait for our turn - while (1) - { - if ($#queuelist == -1) { last; } # nobody ahead of us; our turn! - sleep($delay); - # get list of running processes - %is_running = 0; - @output = split(/^/, `ps -ax`); - foreach $line (@output) { - $line =~ /\s*(\d+)/; - $is_running{$1} = 1; - } - # verify that the guys we're waiting for - # are still running, and haven't crashed. - # if they have, reap their stamps, and their - # tmp files. - foreach $waiting (@queuelist) { - ($name, $pid) = ($waiting =~ /$queue\.(\S*)\.(\d+)/); - if (! $is_running{$pid}) { unlink "$queuedir/$queue.$name.$pid"; } - } - - # get new list of queued jobs ahead of us. - # ignore guys who came in after we grabbed our initial queue list; - # they're waiting for *us*. The crazed greps are the Perl-y - # way of computing an intersection between two arrays. - # - opendir(QUEUEDIR, $queuedir); - @newqueue = grep(/$queue\.\S*\.\d+/, readdir(QUEUEDIR)); - closedir(QUEUEDIR); - %mark = 0; - grep($mark{$_}++,@queuelist); - @queuelist = grep($mark{$_},@newqueue); - } - - 1; # time to run! -} - - -# CheckQueue() - return total number of processes working, other than us -# and the total that this particular username is running. -# -# Arguments: queue, username, queuedir -# -sub CheckQueue -{ - local($queue, $username, $queuedir) = @_; - local(*QUEUEDIR, @allqueue, $nall, $nuser); - - opendir(QUEUEDIR, $queuedir); - @allqueue = grep(/$queue\.\S*\.\d+/, readdir(QUEUEDIR)); - closedir(QUEUEDIR); - - if ($username eq "") {$username = "unknown"; } - $nall = $nuser = 0; - foreach $waiting (@allqueue) { - ($name, $pid) = ($waiting =~ /$queue\.(\S*)\.(\d+)/); - $nall++; - if ($name eq $username) { $nuser++; } - } - return ($nall, $nuser); -} - - -# RemoveFromQueue() - remove a pid from a queue -# -sub RemoveFromQueue -{ - local($queue, $username, $queuedir, $pid) = @_; - if ($username eq "") {$username = "unknown"; } - unlink "$queuedir/$queue.$username.$pid"; -} - -1; diff --git a/forester/archive/perl/rio.pl b/forester/archive/perl/rio.pl deleted file mode 100755 index 7594587..0000000 --- a/forester/archive/perl/rio.pl +++ /dev/null @@ -1,3391 +0,0 @@ -#!/usr/bin/perl -W - -# rio.pl -# ------ -# -# Copyright (C) 2000-2002 Washington University School of Medicine -# and Howard Hughes Medical Institute -# All rights reserved -# -# Created: 11/25/00 -# Author: Christian M. Zmasek -# zmasek@genetics.wustl.edu -# http://www.genetics.wustl.edu/eddy/people/zmasek/ -# -# Last modified 09/06/03 -# - -# -# -# Available at: http://www.genetics.wustl.edu/eddy/forester/ -# RIO webserver: http://www.rio.wustl.edu/ -# -# Reference: -# Zmasek C.M. and Eddy S.R. (2002) -# RIO: Analyzing proteomes by automated phylogenomics using -# resampled inference of orthologs. -# BMC Bioinformatics 3:14 -# http://www.biomedcentral.com/1471-2105/3/14/ -# -# It is highly recommended that you read this paper before -# installing and/or using RIO. (Included in the RIO -# distribution as PDF: "RIO.pdf".) -# -# -# Before rio.pl can be used, some variables in rio_module.pm need to be set, -# as described in RIO_INSTALL. -# -# Usage: rio.pl -# ----- -# -# -# Examples: -# -------- -# % RIO1.1/perl/rio.pl 1 A=aconitase Q=RIO1.1/LEU2_HAEIN N=QUERY_HAEIN O=out1 p I C E -# -# % RIO1.1/perl/rio.pl 2 A=aconitase N=LEU2_LACLA/5-449 O=out2 p I C E -# -# % RIO1.1/perl/rio.pl 3 A=/path/to/my/pfam/Full/aconitase H=aconitase Q=RIO1.1/LEU2_HAEIN N=QUERY_HAEIN O=out3 p I C E -# -# % RIO1.1/perl/rio.pl 4 A=/path/to/my/pfam/Full/aconitase N=LEU2_LACLA/5-449 O=out4 p I C E -# -# % RIO1.1/perl/rio.pl 3 A=/path/to/my/pfam/Full/aconitase b=/path/to/my/pfam/Seed/aconitase Q=RIO1.1/LEU2_HAEIN N=QUERY_HAEIN O=out5 p I C E -# -# -# Modes: -# ------ -# -# 1: RIO analysis based on precalculated pairwise distances -# alignment does not contain query sequence -# -# 2: RIO analysis based on precalculated pairwise distances -# alignment does contain query sequence -# -# 3: RIO analysis based on Pfam alignments, -# alignment does not contain query sequence -# -# 4: RIO analysis based on Pfam alignments, -# alignment does contain query sequence -# -# -# -# Tagged arguments: -# ----------------- -# -# No "G=", "H=", "F=", "T=", "a=", "b=", "s", "f" in modes 1 and 2. -# -# -# A= Pfam alignment name (mandatory). This specifies the alignment -# against which the RIO analysis is to be performed. -# In modes 1 and 2: Pfam model (alignment) name -# (e.g. "A=aconitase"). -# In modes 3 and 4: Pfam alignment path/name -# (e.g. "A=/path/to/your/pfam/Full/aconitase"). -# -# Q= Path/name of file containing the query sequence -# (in FASTA format or raw sequence) (mandatory in modes 1 and 3). -# -# N= Query name (mandatory). This must include the SWISS-PROT code -# for the species of the query after a "_" (e.g. "N=QUERY_HAEIN"). -# If the query sequence is already in the alignment (modes 2 and 4) -# the complete name needs to be specified -- including "/xxx-xxx". -# -# O= Output file path/name (mandatory). -# -# T= Model for pairwaise distance calculation: -# J=JTT, B=BLOSUM 62, M=mtREV24, V=VT, W=WAG, P=PAM. -# BLOSUM 62 is default. -# (Not in modes 1 and 2; these modes use $MATRIX_FOR_PWD instead.) -# -# In modes 1 and 3, a HMM is needed to align the query sequence to -# the alignment and either one of the following options must be -# employed: -# H= HMM name: This uses hmmfetch to retrieve a HMM from -# $PFAM_HMM_DB. -# F= HMM file: This directly reads the HMM from a file. -# -# S= Species tree file path/name (in NHX format) (optional). -# If not specified, $SPECIES_TREE_FILE_DEFAULT is used. -# -# G= Species names file (optional). Only sequences associated with -# species found in this file are used. -# In the species names file, individual species names must be -# separated by newlines and lines starting with "#" are ignored. -# While only sequences associated with species found in the species -# tree ("S=") are used for the actual RIO analysis, this allows to -# remove sequences prior to tree calculation (which is the most -# time consuming step). -# -# P= Sort priority (default is 12): -# 0 : Ortholog -# 1 : Ortholog, Super ortholog -# 2 : Super ortholog, Ortholog -# 3 : Ortholog, Distance -# 4 : Distance, Ortholog -# 5 : Ortholog, Super ortholog, Distance -# 6 : Ortholog, Distance, Super ortholog -# 7 : Super ortholog, Ortholog, Distance -# 8 : Super ortholog, Distance, Ortholog -# 9 : Distance, Ortholog, Super ortholog -# 10 : Distance, Super ortholog, Ortholog -# 11 : Ortholog, Subtree neighbor, Distance -# 12 : Ortholog, Subtree neighbor, Super ortholog, Distance (default) -# 13 : Ortholog, Super ortholog, Subtree neighbor, Distance -# 14 : Subtree neighbor, Ortholog, Super ortholog, Distance -# 15 : Subtree neighbor, Distance, Ortholog, Super ortholog -# 16 : Ortholog, Distance, Subtree neighbor, Super ortholog -# 17 : Ortholog, Subtree neighbor, Distance, Super ortholog -# -# a= Bootstraps for tree construction (not in modes 1 and 2). -# Default is 100. -# -# L= Threshold for orthologies for output. Default is 0. -# v= Threshold for ultra-paralogies for output. Default is 50. -# -# U= Threshold for orthologies for distance calculation. Default is 60. -# -# X= In case of more than one putative orthologs: -# number of sd the distance query - LCA has to differ -# from the mean to generate a warning. Default is 2. -# -# Y= In case of no putative orthologs: -# number of sd the distance query - root has to differ -# from mean to generate a warning. Default is 2. -# -# Z= In case of one putative ortholog: -# threshold for factor between the two distances to their -# LCA (larger/smaller) to generate a warning. Default is 2. -# -# B= Threshold for subtree-neighborings. Default is 0. -# -# b= Build HMM from seed alignment with "hmmbuild -s" (optional). -# This is to prevent from finding multiple domains per sequence -# (i.e. prevents "cutting" the query sequence). Give path/name to -# Seed with this. -# -# j= Name for temporary directory (optional). -# -# y= Seed for random number generator. Default is 41. -# -# I Create and save a rooted, with duplication vs speciation, -# and orthology information annotated gene tree. -# If precalculated distances are used (modes 1 and 2): this gene -# tree is a NJ tree calculated based on the non-bootstrap resampled -# (original) pairwise distances. -# If precalculated distances are not used (modes 3 and 4): this gene -# is a consenus tree with ML branch length values and is also -# annotated with bootstrap values for each node. -# -# Options for output: -# p Output ultra-paralogs. -# D Description from SWISS-PROT and TrEMBL. -# C Complete description from SWISS-PROT and TrEMBL. -# E 118 character output instead of 78 character output. -# -# K Keep intermediate files (they will go into the same directory -# as the output file, their names are the same as of the output -# file, with various suffixes added). -# -# s Ignore non SWISS-PROT sequences (i.e. sequences from TrEMBL) -# in the Pfam alignment. -# -# f Try to ignore TrEMBL "fragments" (sequences with "fragment" in -# their description). -# -# + Parallel, use machines listed in file $NODE_LIST. -# -# x RIO used as web server -- HTML output. -# -# -# -# -# History -# ------- -# 09/06/03: Removal of minor bug. Only create consenus tree with ML branch length -# values if "I" option used (in modes 3 or 4) -- the problem/bug was that -# this tree was always created whether "I" was used or not. -# - - -use strict; - -use FindBin; -use lib $FindBin::Bin; -use Net::Ping; -use rio_module; - -use File::Basename; - - -my $VERSION = "5.010"; - -my $E_VALUE_THRESHOLD = 0.01; # For HMMSEARCH. -my $SORT_DEFAULT = 12; -my $THRESHOLD_ORTHOLOGS_DEFAULT = 0; -my $THRESHOLD_SN_DEFAULT = 0; -my $THRESHOLD_ORTHOLOGS_DEFAULT_DC = 60; -my $T_ULTRA_PARALOGS_DEFAULT = 50; -my $WARN_NO_ORTHOS_DEFAULT = 2; -my $WARN_MORE_THAN_ONE_ORTHO_DEFAULT = 2; -my $WARN_ONE_ORTHO_DEFAULT = 2; -my $MIN_NUMBER_OF_SEQS_IN_ALN = 4; -my $BOOSTRAPS_FOR_MAKETREE_DEFAULT = 100; -my $SEED_FOR_MAKETREE_DEFAULT = 41; -my $MATRIX_DEFAULT = 2; # 2=BLOSUM62 - -my $DO_RIO_TEMP_OUTFILE = "DoRIO_OUTFILE"; -my $TEMP_HMM_FILE = "HMMFILE"; - -my $DEFAULT_OPTIONS_FOR_MAKETREE = "XR"; - - -# I/O files, names: -my $alignment = ""; -my $hmm_file = ""; -my $hmm_name = ""; -my $seqX_file = ""; -my $species_tree_file = ""; -my $outfile = ""; -my $outfile_annot_nhx_tree = ""; -my $query_name = ""; -my $multiple_trees_file = ""; -my $distance_matrix_file = ""; -my $maketree_out_tree_file = ""; -my $seed_aln_for_hmmbuild = ""; -my $temp_dir = ""; -my $bsp_file = ""; -my $pwd_file = ""; -my $nbd_file = ""; -my $output_dir = ""; -my $species_names_file = " "; # Must be " ". -my $options_for_makeTree = ""; - - -# multiple choice options: -my $mode = 0; -my $sort = $SORT_DEFAULT; -my $matrix_n = $MATRIX_DEFAULT; # 0=JTT 1=PAM 2=BLOSUM62 3=mtREV24 5=VT 6=WAG - - - - -# yes/no options: -my $description = 0; -my $complete_description = 0; -my $long_output = 0; -my $keep = 0; -my $non_sp = 1; # 0 to remove non SP seqs. -my $safe_nhx = 0; -my $no_frags = 0; -my $output_ultraparalogs = 0; -my $parallel = 0; -my $output_HTML = 0; - - -# numerical options: -my $warn_no_orthos = $WARN_NO_ORTHOS_DEFAULT; -my $warn_more_than_one_ortho = $WARN_MORE_THAN_ONE_ORTHO_DEFAULT; -my $warn_one_ortho = $WARN_ONE_ORTHO_DEFAULT; -my $boostraps_for_makeTree = $BOOSTRAPS_FOR_MAKETREE_DEFAULT; -my $seed_for_makeTree = $SEED_FOR_MAKETREE_DEFAULT; -my $t_orthologs = $THRESHOLD_ORTHOLOGS_DEFAULT; -my $t_sn = $THRESHOLD_SN_DEFAULT; -my $t_orthologs_dc = $THRESHOLD_ORTHOLOGS_DEFAULT_DC; -my $t_ultra_paralogs = $T_ULTRA_PARALOGS_DEFAULT; - - -# internal variables: -my $print_header_for_orthologies = 0; -my $print_header_for_s_paralogs = 0; -my $length_of_alignment = 0; -my $length_of_orig_alignment = 0; -my $time = 0; -my $ii = 0; -my $j = 0; -my $jj = 0; -my $number_of_seqs_in_aln = 0; -my $f = 0; -my $saw_distance_values = 0; -my $saw_ultra_paralogs = 0; -my $bootstraps = 0; -my $ext_nodes_in_trees_analyzed = 0; -my $time_total = 0; -my $time_tree_calc = 0; -my $time_tree_calcT = 0; -my $time_rio = 0; -my $time_rioT = 0; -my $time_dqopuzzle = 0; -my $time_dqopuzzleT = 0; -my $time_addingdists = 0; -my $time_addingdistsT = 0; -my $processors = 0; -my $block_size = 0; -my $larger_blocks = 0; -my $printed_ultra_paralogs = 0; - -my $dorio_outfile = ""; -my $options_for_DoRIO = ""; -my $ortho_name = ""; -my $orthos = 0; -my $s_orthos = 0; -my $subtree_neighbors = 0; -my $dist = 0; -my $s_para_name = ""; -my $s_paras = 0; -my $sort_priority = ""; -my $return_line = ""; -my $matrix = ""; -my $command_line = ""; -my $command_line_for_hmmbuild = ""; -my $current_dir = ""; -my @complete_names = (); -my @temp_array = (); -my %Species_names_hash = (); -my %AC_DE = (); # AC => DE from "ACDEOS" TrEMBL file. -my %SP_AC_DE = (); # ID => DE from "ACIDOS" SWISS-PROT file. -my %names_in_pwd_file = (); -my @nodelist = (); - -my $start_date = `date`; - - - - -# This analyzes the options: -# -------------------------- - -$time_total = time; - -if ( @ARGV < 4 ) { - &printHelp(); -} - -$command_line = "$0 "; -for ( $j = 0; $j < @ARGV; ++$j ) { - $command_line .= "$ARGV[ $j ] "; -} - -&analyzeCommandLine( @ARGV ); - -if ( $species_tree_file eq "" ) { - $species_tree_file = $SPECIES_TREE_FILE_DEFAULT; -} - -&CheckArguments; - -$options_for_makeTree = $DEFAULT_OPTIONS_FOR_MAKETREE; -$options_for_makeTree .= "S".$seed_for_makeTree; - - -if ( $mode == 1 || $mode == 2 ) { - - if ( $mode == 1 ) { - $hmm_file = $RIO_HMM_DIRECTORY.$alignment.$SUFFIX_HMM; - $bsp_file = $RIO_BSP_DIRECTORY.$alignment.$SUFFIX_BOOT_STRP_POS; - &userErrorCheckForTextFileExistence( $hmm_file ); - &userErrorCheckForTextFileExistence( $bsp_file ); - } - - $pwd_file = $RIO_PWD_DIRECTORY.$alignment.$SUFFIX_PWD; - $nbd_file = $RIO_NBD_DIRECTORY.$alignment.$SUFFIX_PWD_NOT_BOOTS; - $alignment = $RIO_ALN_DIRECTORY.$alignment.$ALIGN_FILE_SUFFIX; - &userErrorCheckForTextFileExistence( $pwd_file ); - &userErrorCheckForTextFileExistence( $nbd_file ); - &userErrorCheckForTextFileExistence( $alignment ); - $no_frags = 0; - $non_sp = 1; - - $options_for_makeTree .= "F"; -} -elsif ( $mode == 3 || $mode == 4 ) { - if ( $safe_nhx == 1 ) { - $options_for_makeTree .= "U"; - } - else { - $options_for_makeTree .= "#"; - } - $options_for_makeTree .= "D"; # To calc. and keep pairwise distances. - $options_for_makeTree .= "B".$boostraps_for_makeTree; - -} - -if ( $output_HTML == 1 ) { - $| = 1; - $complete_description = 1; - $long_output = 1; - -} - -if ( $mode == 1 || $mode == 3 || $mode == 4 ) { - - if ( $mode == 1 ) { - $matrix_n = $MATRIX_FOR_PWD; - } - - if ( $matrix_n == 0 ) { - $options_for_makeTree .= "J"; - $matrix = "JTT (Jones et al. 1992)"; - } - elsif ( $matrix_n == 1 ) { # PAM is makeTree's default. - $matrix = "PAM (Dayhoff et al. 1978)"; - } - elsif ( $matrix_n == 2 ) { - $options_for_makeTree .= "L"; - $matrix = "BLOSUM 62 (Henikoff-Henikoff 92)"; - } - elsif ( $matrix_n == 3 ) { - $options_for_makeTree .= "M"; - $matrix = "mtREV24 (Adachi-Hasegawa 1996)"; - } - elsif ( $matrix_n == 5 ) { - $options_for_makeTree .= "T"; - $matrix = "VT (Mueller-Vingron 2000)"; - } - elsif ( $matrix_n == 6 ) { - $options_for_makeTree .= "W"; - $matrix = "WAG (Whelan-Goldman 2000)"; - } - else { - &dieWithUnexpectedError( "Failed sanity check" ); - } -} - - -# This creates the temp directory: -# -------------------------------- - -$ii = 0; - -$time = time; - -if ( $temp_dir eq "" ) { - $temp_dir = $TEMP_DIR_DEFAULT.$time.$ii; -} -else { - $temp_dir = $temp_dir.$ii; -} - -while ( -e $temp_dir ) { - $ii++; - $temp_dir = $TEMP_DIR_DEFAULT.$time.$ii; -} - -mkdir( $temp_dir, 0700 ) || &dieWithUnexpectedError( "Could not create \"$temp_dir\"" ); - -unless ( ( -e $temp_dir ) && ( -d $temp_dir ) ) { - &dieWithUnexpectedError( "\"$temp_dir\" does not exist, or is not a directory" ); -} - - - -# The analysis starts here: -# ------------------------- - -$dorio_outfile = $temp_dir."/".$DO_RIO_TEMP_OUTFILE; - -$output_dir = dirname( $outfile ); - -unless ( ( -e $output_dir ) && ( -d $output_dir ) ) { - &userError( "Outfile directory (\"$output_dir\") does not exist,\n or is not a directory." ); -} - -if ( $mode == 1 || $mode == 3 ) { - $query_name = substr( $query_name, 0, $LENGTH_OF_NAME - 10 ); -} - - - - - -if ( $mode == 1 || $mode == 3 ) { - - # Prepares the query file: - # ------------------------ - $query_name = &seqFile2CleanedUpFastaFile( $seqX_file, - "$temp_dir/QUERY_SEQ", - $query_name ); - if ( $query_name eq "" ) { - &userError( "Query file \"$seqX_file\") does not appear to contain a valid name\n and/or \"-N\" option has not been used." ); - } - - if ( $mode == 3 ) { - # Prepares the HMM: - # ----------------- - if ( $hmm_file eq "" ) { - $hmm_file = $temp_dir."/".$TEMP_HMM_FILE; - if ( $hmm_name ne "" ) { - &executeHmmfetch( $PFAM_HMM_DB, $hmm_name, $hmm_file ); - } - elsif ( $seed_aln_for_hmmbuild ne "" ) { - $command_line_for_hmmbuild = &executeHmmbuild( $seed_aln_for_hmmbuild, $hmm_file ); - } - } - - } -} - - - - -# This might remove non SWISS PROT seqs, TreMBL fragments, -# and seqs from species not in $species_names_file from the alignment: -# -------------------------------------------------------------------- -if ( $mode == 3 || $mode == 4 ) { - #if ( $do_not_removeSeqsFromPfamAlign != 1 ) { - - if ( $mode == 3 ) { - &removeSeqsFromPfamAlign( $alignment, - $temp_dir."/ALIGN2", - " ", - $species_names_file, - $non_sp, - $no_frags ); - } - else { - &removeSeqsFromPfamAlign( $alignment, - $temp_dir."/ALIGN2", - $query_name, - $species_names_file, - $non_sp, - $no_frags ); - } - -} - - - -# If necessary, this aligns the query to the pfam alignment -# using hmmsearch, p7extract.pl, multifetch.pl, and hmmalign -# from the HMMER package: -# ---------------------------------------------------------- -if ( $mode == 1 || $mode == 3 ) { - if ( $mode == 1 ) { - - $f = &alignWithHmmalign( $alignment, - $temp_dir."/QUERY_SEQ", - $hmm_file, - $temp_dir."/HMMALIGNOUT", - 1 ); # --mapali - - - } - else { - - $f = &alignWithHmmalign( $temp_dir."/ALIGN2", - $temp_dir."/QUERY_SEQ", - $hmm_file, - $temp_dir."/HMMALIGNOUT", - 0 ); # --withali - - } - if ( $f != 1 ) { - if ( $alignment =~ /.+\/(.+)/ ) { - $alignment = $1; - } - if ( $alignment =~ /(.+)\..+/ ) { - $alignment = $1; - } - &cleanUpTempDir(); - if ( $output_HTML == 1 ) { - &exitWithWarning( "query sequence does not contain sufficient similarity to the \"$alignment\" domain", 1 ); - } - else { - &exitWithWarning( "Query sequence does not contain sufficient similarity to the \"$alignment\" domain" ); - } - } - - - # In case query contains more than one of the same domain: - - @complete_names = &getCompleteName( $temp_dir."/HMMALIGNOUT", $query_name ); - - if ( @complete_names < 1 ) { - &dieWithUnexpectedError( "Could not find \"$query_name in $temp_dir"."/HMMALIGNOUT\"" ); - } -} -elsif ( $mode == 2 || $mode == 4 ) { - # Here, this is just for checking: - if ( $mode == 2 ) { - @complete_names = &getCompleteName( $alignment, $query_name ); - } - elsif ( $mode == 4 ) { - @complete_names = &getCompleteName( $temp_dir."/ALIGN2", $query_name ); - } - if ( @complete_names < 1 ) { - &dieWithUnexpectedError( "Could not find \"$query_name in $temp_dir"."/HMMALIGNOUT\"" ); - } - @complete_names = (); - $complete_names[ 0 ] = $query_name; -} - -if ( $parallel == 1 ) { - &readInNodesList(); - &pingNodes(); - $processors = scalar( @nodelist ); - if ( $processors < 2 ) { - $parallel = 0; - } - if ( $processors > $BOOTSTRAPS ) { - $processors = $BOOTSTRAPS; - } - else { - $block_size = int $BOOTSTRAPS / $processors; - $larger_blocks = $BOOTSTRAPS - ( $block_size * $processors ); # number of blocks which have a size of - # block_size + 1 - - } -} - - -# This opens the output file: -# --------------------------- -if ( $output_HTML != 1 ) { - open( OUT, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" ); -} - -# This starts printing to the output file: -# ---------------------------------------- -&printHeader(); - - - -# This loop goes through the different domains of the query -# which aligned to the alignment (in modes 2 and 4 this can -# obviously be only one): -# ----------------------------------------------------------- -for ( $jj = 0; $jj < @complete_names; ++$jj ) { - - if ( $mode == 1 ) { - # Moves the query to the last line(s) of the alignment. - # Removes other querie domains $complete_names[i]-- for which i != $jj - # -------------------------------------------------------------------- - - &moveToLast( $complete_names[ $jj ], - $temp_dir."/HMMALIGNOUT", - $temp_dir."/MOVETOLASTOUT", - \@complete_names ); - - } - - if ( $mode == 1 || $mode == 3 ) { - if ( $mode == 1 ) { - @temp_array = &pfam2phylipMatchOnly( $temp_dir."/MOVETOLASTOUT", - $temp_dir."/ALIGN2_PHYLIP_MO", - 0 ); - } - else { - @temp_array = &pfam2phylipMatchOnly( $temp_dir."/HMMALIGNOUT", - $temp_dir."/ALIGN2", - 1 ); - } - $length_of_alignment = $temp_array[ 0 ]; - $length_of_orig_alignment = $temp_array[ 1 ]; - $number_of_seqs_in_aln = $temp_array[ 2 ]; - } - elsif ( $mode == 2 || $mode == 4 ) { - - $query_name = $complete_names[ 0 ]; - - if ( $mode == 4 ) { - if ( !&startsWithSWISS_PROTname( $query_name ) ) { - # Query is not a SWISS-PROT sequence. - $query_name = &getCompleteNameForTrEMBLquerySeq( $temp_dir."/ALIGN2", - $query_name ); - } - - $number_of_seqs_in_aln = &countSeqsInPfamAlign( $temp_dir."/ALIGN2" ); - } - else { - if ( !&startsWithSWISS_PROTname( $query_name ) ) { - # Query is not a SWISS-PROT sequence. - $query_name = &getCompleteNameForTrEMBLquerySeq( $alignment, - $query_name ); - } - $number_of_seqs_in_aln = &countSeqsInPfamAlign( $alignment ); - } - - - - } - - if ( $number_of_seqs_in_aln < $MIN_NUMBER_OF_SEQS_IN_ALN ) { - &cleanUpTempDir(); - if ( $output_HTML == 1 ) { - &exitWithWarning( "Removal of sequences resulted in an alignment with less than $MIN_NUMBER_OF_SEQS_IN_ALN sequences ($number_of_seqs_in_aln)", 1 ); - } - else { - &exitWithWarning( "Removal of sequences resulted in an alignment with less than $MIN_NUMBER_OF_SEQS_IN_ALN sequences ($number_of_seqs_in_aln)" ); - } - } - - - if ( $mode == 1 ) { - - unlink( $temp_dir."/ALIGN2_BOOTSTRAPPED" ); - - if ( $parallel == 1 ) { - &executeBootstrap_cz( $BOOTSTRAPS, - $bsp_file, - $temp_dir."/ALIGN2_PHYLIP_MO", - $temp_dir."/ALIGN2_BOOTSTRAPPED", - $processors ); - - } - else { - - &executeBootstrap_cz( $BOOTSTRAPS, - $bsp_file, - $temp_dir."/ALIGN2_PHYLIP_MO", - $temp_dir."/ALIGN2_BOOTSTRAPPED" ); - - } - - - $current_dir = `pwd`; - $current_dir =~ s/\s//; - - chdir ( $temp_dir ) || &dieWithUnexpectedError( "Could not chdir to \"$temp_dir\"" ); - - - if ( $parallel == 1 ) { - - my $number = 0; - my $all_finished = 0; - - system( $RIO_SLAVE_DRIVER, - $block_size, - $larger_blocks, - $temp_dir."/ALIGN2_BOOTSTRAPPED", - $matrix_n, - $complete_names[ $jj ], - $pwd_file, - $temp_dir, - $seed_for_makeTree, - @nodelist ) - && &dieWithUnexpectedError( "Could not execute \"$RIO_SLAVE_DRIVER\"" ); - - while ( $all_finished != 1 ) { - for ( $number = 0; $number < $processors; $number++ ) { - unless ( -e "FINISHED_$number" ) { - $number = -1; - } - } - $all_finished = 1; - } - - sleep( 1 ); - - system( "mv", - "MAKETREEOUT".$MULTIPLE_TREES_FILE_SUFFIX."0", - "MAKETREEOUT".$MULTIPLE_TREES_FILE_SUFFIX ) - && &dieWithUnexpectedError( "$!" ); - - for ( $number = 1; $number < $processors; $number++ ) { - system( "cat MAKETREEOUT$MULTIPLE_TREES_FILE_SUFFIX$number >> MAKETREEOUT$MULTIPLE_TREES_FILE_SUFFIX" ) - && &dieWithUnexpectedError( "$!" ); - if ( unlink( "MAKETREEOUT$MULTIPLE_TREES_FILE_SUFFIX$number" ) != 1 ) { - &dieWithUnexpectedError( "Could not delete \"MAKETREEOUT$MULTIPLE_TREES_FILE_SUFFIX$number" ); - } - } - - # Sanity check: Counts ";" in "MAKETREEOUT$MULTIPLE_TREES_FILE_SUFFIX". - if ( `grep -c ';' MAKETREEOUT$MULTIPLE_TREES_FILE_SUFFIX` != $BOOTSTRAPS ) { - &dieWithUnexpectedError( "\"MAKETREEOUT$MULTIPLE_TREES_FILE_SUFFIX\" does not contain $BOOTSTRAPS \";\"" ); - } - - for ( $number = 0; $number < $processors; $number++ ) { - if ( unlink( "FINISHED_$number" ) != 1 ) { - &dieWithUnexpectedError( "Could not delete \"FINISHED_$number\"" ); - } - } - - &executeConsense( "MAKETREEOUT".$MULTIPLE_TREES_FILE_SUFFIX ); - unlink( "outfile", "intree" ); - - system( "mv", "outtree", "MAKETREEOUT.nhx" ) - && &dieWithUnexpectedError( "$!" ); - - - } - else { - $time_dqopuzzle = time; #time - &executePuzzleDQObootstrapped( "ALIGN2_BOOTSTRAPPED", $matrix_n ); - $time_dqopuzzle = time - $time_dqopuzzle; #time - $time_dqopuzzleT += $time_dqopuzzle; #time - - system( "mv", "ALIGN2_BOOTSTRAPPED.dist", "DISTs_TO_QUERY" ) - && &dieWithUnexpectedError( "$!" ); - } - - - &executePuzzleDQO( "ALIGN2_PHYLIP_MO", $matrix_n ); - - unlink( "ALIGN2_PHYLIP_MO" ); - - system( "mv", "ALIGN2_PHYLIP_MO.dist", "DIST_TO_QUERY" ) - && &dieWithUnexpectedError( "$!" ); - - if ( $parallel != 1 ) { - $time_addingdists = time; - &addDistsToQueryToPWDfile( $pwd_file, - $temp_dir."/DISTs_TO_QUERY", - $temp_dir."/PWD_INC_QUERY", - $complete_names[ $jj ] ); - - - $time_addingdists = time - $time_addingdists; - $time_addingdistsT += $time_addingdists; - } - &addDistsToQueryToPWDfile( $nbd_file, - $temp_dir."/DIST_TO_QUERY", - $temp_dir."/NBD_INC_QUERY", - $complete_names[ $jj ] ); - - } - - if ( $mode == 2 ) { - $current_dir = `pwd`; - $current_dir =~ s/\s//; - chdir ( $temp_dir ) - || &dieWithUnexpectedError( "Could not chdir to \"$temp_dir\"" ); - - } - - - if ( $parallel != 1 ) { - unlink( $temp_dir."/MAKETREEOUT".$TREE_FILE_SUFFIX ); - } - - $time_tree_calc = time; - - # This calculates the trees - # ------------------------- - - if ( $mode == 1 || $mode == 2 ) { - - if ( $mode == 1 ) { - - &executeNeighbor( $temp_dir."/NBD_INC_QUERY", - 0, - 0, - 0, - 1 ); - - unlink( "outfile" ); - system( "mv", "outtree", "NBD_NJ_TREE" ) - && &dieWithUnexpectedError( "$!" ); - if ( $parallel != 1 ) { - &executeMakeTree( $options_for_makeTree, - $temp_dir."/PWD_INC_QUERY", - $temp_dir."/MAKETREEOUT".$TREE_FILE_SUFFIX, - $temp_dir."/maketree_tempdir" ); - } - - } - else { - &executeNeighbor( $nbd_file, - 0, - 0, - 0, - 1 ); - - unlink( "outfile" ); - system( "mv", "outtree", "NBD_NJ_TREE" ) - && &dieWithUnexpectedError( "$!" ); - - &executeMakeTree( $options_for_makeTree, - $pwd_file, - $temp_dir."/MAKETREEOUT".$TREE_FILE_SUFFIX, - $temp_dir."/maketree_tempdir" ); - - } - - chdir( $current_dir ) - || &dieWithUnexpectedError( "Could not chdir to \"$current_dir\"" ); - - - } - elsif ( $mode == 3 || $mode == 4 ) { - &executeMakeTree( $options_for_makeTree, - $temp_dir."/ALIGN2", - $temp_dir."/MAKETREEOUT".$TREE_FILE_SUFFIX, - $temp_dir."/maketree_tempdir" ); - - unlink( $temp_dir."/MAKETREEOUT".$ALIGN_FILE_SUFFIX ); - } - - - $time_tree_calc = time - $time_tree_calc; - $time_tree_calcT += $time_tree_calc; - - if ( $keep == 1 ) { - - system( "cp", $temp_dir."/MAKETREEOUT".$TREE_FILE_SUFFIX, $outfile.$TREE_FILE_SUFFIX ); - system( "cp", $temp_dir."/MAKETREEOUT".$LOG_FILE_SUFFIX, $outfile.$LOG_FILE_SUFFIX ); - system( "cp", $temp_dir."/MAKETREEOUT".$MULTIPLE_TREES_FILE_SUFFIX, $outfile.$MULTIPLE_TREES_FILE_SUFFIX ); - if ( $mode == 1 || $mode == 2 ) { - system( "cp", $temp_dir."/NBD_NJ_TREE", $outfile."-NJ".$TREE_FILE_SUFFIX ); - } - - } - - unlink( $temp_dir."/ALIGN2" ); - - $multiple_trees_file = $temp_dir."/MAKETREEOUT".$MULTIPLE_TREES_FILE_SUFFIX; - $maketree_out_tree_file = $temp_dir."/MAKETREEOUT".$TREE_FILE_SUFFIX; - $distance_matrix_file = $temp_dir."/MAKETREEOUT".$SUFFIX_PWD_NOT_BOOTS; - - - if ( $mode == 1 || $mode == 3 ) { - $query_name = $complete_names[ $jj ]; - } - - $options_for_DoRIO = ""; - - # This will result in saving of the annotated consenus tree: - # ---------------------------------------------------------- - if ( $safe_nhx == 1 ) { - my $number = $jj + 1; - if ( @complete_names > 1 ) { - $outfile_annot_nhx_tree = $outfile.$ADDITION_FOR_RIO_ANNOT_TREE."-".$number.$TREE_FILE_SUFFIX; - } - else { - $outfile_annot_nhx_tree = $outfile.$ADDITION_FOR_RIO_ANNOT_TREE.$TREE_FILE_SUFFIX; - } - } - - - - if ( $sort > 2 ) { - if ( $mode == 3 || $mode == 4 ) { - $options_for_DoRIO .= " D=".$distance_matrix_file; - } - elsif ( $mode == 1 ) { - $options_for_DoRIO .= " d=".$temp_dir."/DIST_TO_QUERY"; - } - elsif ( $mode == 2 ) { - $options_for_DoRIO .= " D=".$nbd_file; - } - } - $options_for_DoRIO .= " M=".$multiple_trees_file; - $options_for_DoRIO .= " 'N=".$query_name."'"; - $options_for_DoRIO .= " S=".$species_tree_file; - $options_for_DoRIO .= " O=".$dorio_outfile; - $options_for_DoRIO .= " P=".$sort; - $options_for_DoRIO .= " L=".$t_orthologs; - $options_for_DoRIO .= " B=".$t_sn; - $options_for_DoRIO .= " U=".$t_orthologs_dc; - $options_for_DoRIO .= " X=".$warn_more_than_one_ortho; - $options_for_DoRIO .= " Y=".$warn_no_orthos; - $options_for_DoRIO .= " Z=".$warn_one_ortho; - - if ( $mode == 1 || $mode == 2 ) { - $options_for_DoRIO .= " T=".$temp_dir."/NBD_NJ_TREE"; - $options_for_DoRIO .= " t=".$maketree_out_tree_file; - } - elsif ( $mode == 3 || $mode == 4 ) { - if ( $safe_nhx == 1 ) { # Added 09/04/03. - $options_for_DoRIO .= " T=".$maketree_out_tree_file; - } - } - - if ( $safe_nhx == 1 ) { - $options_for_DoRIO .= " I"; - } - if ( $output_ultraparalogs == 1 ) { - $options_for_DoRIO .= " p"; - $options_for_DoRIO .= " v=".$t_ultra_paralogs; - } - - $time_rio = time; - - &executeDoRIO( $options_for_DoRIO ); - - $time_rio = time - $time_rio; - $time_rioT += $time_rio; - - unless ( ( -s $dorio_outfile ) && ( -f $dorio_outfile ) && ( -T $dorio_outfile ) ) { - close( OUT ); - unlink( $outfile ); - &dieWithUnexpectedError( "failure during execution of RIO (no output generated)" ); - } - - if ( $safe_nhx == 1 ) { - system( "mv", - $temp_dir."/".$DO_RIO_TEMP_OUTFILE.$ADDITION_FOR_RIO_ANNOT_TREE.$TREE_FILE_SUFFIX, - $outfile_annot_nhx_tree ) - && &dieWithUnexpectedError( "$!" ); - } - - - open( IN, "$dorio_outfile" ) - || &dieWithUnexpectedError( "Cannot open file \"$dorio_outfile\"" ); - - $saw_distance_values = 0; - $saw_ultra_paralogs = 0; - $printed_ultra_paralogs = 0; - $print_header_for_orthologies = 1; - $print_header_for_s_paralogs = 1; - - - - - # This generates the report - # ------------------------- - - W: while ( $return_line = ) { - - if ( $return_line =~ /distance values:/i ) { - $saw_distance_values = 1; - &printTitleForDistanceValues(); - } - elsif ( $return_line =~ /ultra paralogs/i ) { - $saw_ultra_paralogs = 1; - } - elsif ( $return_line =~ /^mean bootstrap/i ) { - &printMeanBootstraps(); - } - elsif ( $return_line =~ /sort priority\s*:\s*(.+)/i ) { - $sort_priority = $1; - } - elsif ( $return_line =~ /ext nodes\s*:\s*(.+)/i ) { - $ext_nodes_in_trees_analyzed = $1 - 1; # One seq is query. - } - elsif ( $return_line =~ /bootstraps\s*:\s*(\S+)/i ) { - if ( $jj == @complete_names - 1 ) { - $bootstraps = $1; - if ( $output_HTML == 1 ) { - $| = 1; - } - &printOptions(); - last W; - } - } - elsif ( $saw_distance_values != 1 - && $saw_ultra_paralogs != 1 - && $return_line =~ /(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s*(\S*)/ ) { - $ortho_name = $1; - $orthos = $2; - $subtree_neighbors = $3; - $s_orthos = $4; - $dist = $5; - - if ( $print_header_for_orthologies == 1 ) { - &printHeaderForOrthologies(); - $print_header_for_orthologies = 0; - } - &printOrthologies(); - } - elsif ( $saw_distance_values != 1 - && $saw_ultra_paralogs != 1 - && $return_line =~ /^\s*-\s*$/ ) { - $ortho_name = "-"; - $orthos = 0; - $s_orthos = 0; - $dist = 0; - if ( $print_header_for_orthologies == 1 ) { - &printHeaderForOrthologies(); - $print_header_for_orthologies = 0; - } - &printOrthologies(); - } - elsif ( $output_ultraparalogs == 1 - && $saw_ultra_paralogs == 1 - && $return_line =~ /(\S+)\s+(\S+)\s+(\S+)/ ) { - $s_para_name = $1; - $s_paras = $2; - $dist = $3; - if ( $print_header_for_s_paralogs == 1 ) { - &printHeaderForSparalogs(); - $print_header_for_s_paralogs = 0; - } - &printUltraParlogs(); - $printed_ultra_paralogs = 1; - } - elsif ( $output_ultraparalogs == 1 - && $saw_ultra_paralogs == 1 - && $return_line =~ /^\s*-\s*$/ ) { - &printNoUltraParalogs(); - } - elsif ( $return_line =~ /Bootstraps/ ) { - $saw_distance_values = 0; - } - elsif ( $saw_distance_values == 1 && $saw_ultra_paralogs != 1 ) { - &printDistanceValues(); - } - - } - close( IN ); - -} # End of for loop going through possible - # multiple matches to the same alignment/model. - -if ( $output_HTML != 1 ) { - close( OUT ); -} - -&cleanUpTempDir(); - -if ( $output_HTML != 1 ) { - print( "\n\nrio.pl successfully terminated.\nOutput written to: $outfile\n\n" ); -} - -exit( 0 ); - - - - - - - - - -# =========================================================== -# Methods -# ----------------------------------------------------------- - - - - -# ----------------------------------------------------------- -# Parallization related -# ----------------------------------------------------------- - - - -# Last modified: 02/02/02 -sub readInNodesList { - - &testForTextFilePresence( $NODE_LIST ); - - open( NIN, "$NODE_LIST" ) || &dieWithUnexpectedError( "Cannot open file \"$NODE_LIST\"" ); - - while ( ) { - if ( $_ =~ /(\S+)/ ) { - push( @nodelist, $1 ); - } - } - close( NIN ); - return; -} - - - -# Last modified: 02/02/02 -sub pingNodes { - my @temp_node_list = (); - my $p = Net::Ping->new( "tcp", 2 ); # or "udp" - my $n = ""; - - foreach $n ( @nodelist ) { - if ( defined( $p->ping( $n ) ) ) { - push( @temp_node_list, $n ); - } - } - @nodelist = (); - @nodelist = @temp_node_list; - return; - -} - - - - -# ----------------------------------------------------------- -# Output related -# ----------------------------------------------------------- - - -# Last modified: 03/07/01 -sub printHeader { - - if ( $output_HTML != 1 ) { - print OUT "RIO - Resampled Inference of Orthologs\n"; - print OUT "Version: $VERSION\n"; - print OUT "------------------------------------------------------------------------------\n"; - - print OUT "Pfam alignment file : $alignment\n"; - if ( $mode == 3 ) { - print OUT "Pfam alignment description : ".&getDescriptionFromPfam( $alignment )."\n"; - } - if ( $mode == 1 || $mode == 2 ) { - print OUT "Bootstrapped pairwise distances file : $pwd_file\n"; - print OUT "Not bootstrapped pairwise distances file: $nbd_file\n"; - print OUT "Bootstrap positions file : $bsp_file\n"; - } - if ( $mode == 1 || $mode == 3 ) { - if ( $seed_aln_for_hmmbuild ne "" ) { - print OUT "HMM : built based on $seed_aln_for_hmmbuild\n"; - } - elsif ( $hmm_name ne "" ) { - print OUT "HMM : $hmm_name\n"; - } - else { - print OUT "HMM : $hmm_file\n"; - } - print OUT "Query file : $seqX_file\n"; - } - print OUT "==============================================================================\n\n"; - } - -} ## printHeader - - - - -# Last modified: 03/07/01 -sub printHeaderForOrthologies { - - if ( $output_HTML != 1 ) { - if ( $jj > 0 ) { - print OUT "\n\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n"; - } - - print OUT "Query : $query_name\n\n"; - - if ( @complete_names > 1 ) { - my $size = @complete_names; - my $number = $jj + 1; - print OUT "More than one region of the query were aligned to the profile HMM.\n"; - print OUT "This is for domain #$number out of $size.\n\n"; - } - - print OUT "Number (in %) of observed orthologies (o), \"subtree-neighborings\" (n),\n"; - print OUT "and super-orthologies (s) to query in bootstrapped trees, evolutionary\n"; - print OUT "distance to query (as average number of aa replacements per residue):\n\n"; - if ( $long_output != 1 ) { - print OUT "Sequence Description o[%] n[%] s[%] distance\n"; - print OUT "-------- ----------- ---- ---- ---- --------\n"; - } - else { - print OUT "Sequence Description o[%] n[%] s[%] distance\n"; - print OUT "-------- ----------- ---- ---- ---- --------\n"; - } - } - else { - if ( $jj > 0 ) { - print "\n"; - print "

\n"; - print "

\n"; - } - - if ( @complete_names > 1 ) { - my $size = @complete_names; - my $number = $jj + 1; - print "

More than one region of the query were aligned to the profile HMM. \n"; - print "This is for domain #$number out of $size.

\n"; - } - print "

Query : $query_name

\n"; - print "

Orthologies, subtree-neighborings, super-orthologies

\n"; - - print "

Number (in %) of observed orthologies (o), \"subtree-neighborings\" (n), \n"; - print "and super-orthologies (s) to query in bootstrapped trees, evolutionary \n"; - print "distance to query (as average number of aa replacements per residue):

\n"; - if ( $ortho_name ne "-" ) { - print "\n"; - - print "\n"; - } - } - -} ## printHeaderForOrthologies - - - -# Last modified: 10/15/01 -sub printHeaderForSparalogs { - - if ( $output_HTML != 1 ) { - print OUT "\nUltra-paralogs\n"; - print OUT "--------------\n"; - print OUT "Number (in %) of observed ultra-paralogies (up) to query\n"; - print OUT "in bootstrapped trees, evolutionary distance to query (as average number\n"; - print OUT "of aa replacements per residue):\n\n"; - if ( $long_output != 1 ) { - print OUT "Sequence Description up[%] distance\n"; - print OUT "-------- ----------- ----- --------\n"; - } - else { - print OUT "Sequence Description up[%] distance\n"; - print OUT "-------- ----------- ----- --------\n"; - } - } - else { - print "

Ultra-paralogs

\n"; - print "

Number (in %) of observed ultra-paralogies (up) to query \n"; - print "in bootstrapped trees, evolutionary distance to query (as average number \n"; - print "of aa replacements per residue):

\n"; - print "

Sequence

Description

o[%]

n[%]

s[%]

distance

\n"; - print "\n"; - - } - -} ## printHeaderForSparalogs - - - -# Last modified: 03/07/01 -sub printOrthologies { - my @cut = (); - my $i = 0; - my $descp = ""; - $orthos = &roundToInt( $orthos ); - $s_orthos = &roundToInt( $s_orthos ); - - if ( $sort > 10 ) { - $subtree_neighbors = &roundToInt( $subtree_neighbors ); - } - - if ( ( $description == 1 || $complete_description == 1 ) - && $ortho_name ne "-" ) { - - if ( $non_sp != 1 ) { - if ( &startsWithSWISS_PROTname( $ortho_name ) ) { - $descp = &getDescriptionFromSWISSPROT_ACDEOSfile( $SWISSPROT_ACDEOS_FILE, $ortho_name ); - } - else { - $descp = "-"; - } - } - else { - if ( &startsWithSWISS_PROTname( $ortho_name ) ) { - $descp = &getDescriptionFromSWISSPROT_ACDEOSfile( $SWISSPROT_ACDEOS_FILE, $ortho_name ); - } - else { - $descp = &getDescriptionFromTrEMBL_ACDEOSfile( $TREMBL_ACDEOS_FILE, $ortho_name ); - } - } - - if ( $output_HTML != 1 ) { - if ( $long_output == 1 ) { - @cut = &cutDescription( $descp, 73 ); - } - else { - @cut = &cutDescription( $descp, 33 ); - } - $descp = $cut[ 0 ]; - } - } - if ( $descp eq "" ) { - $descp = "-"; - } - - if ( $output_HTML != 1 ) { - - if ( $ortho_name eq "-" ) { - print OUT "\nNO ORTHOLOGS in alignment with the current thresholds for output\n"; - } - elsif ( $dist ne "-" ) { - if ( $long_output == 1 ) { - print OUT sprintf "%-24.24s%-74.74s%5s%5s%5s%10.6f", $ortho_name,$descp,$orthos,$subtree_neighbors,$s_orthos,$dist; - } - else { - print OUT sprintf "%-24.24s%-34.34s%5s%5s%5s%10.6f", $ortho_name,$descp,$orthos,$subtree_neighbors,$s_orthos,$dist; - } - } - else { - if ( $long_output == 1 ) { - print OUT sprintf "%-24.24s%-74.74s%5s%5s%5s%10.10s", $ortho_name,$descp,$orthos,$subtree_neighbors,$s_orthos,$dist; - } - else { - print OUT sprintf "%-24.24s%-34.34s%5s%5s%5s%10.10s", $ortho_name,$descp,$orthos,$subtree_neighbors,$s_orthos,$dist; - } - } - if ( $complete_description == 1 ) { - for ( $i = 1; $i < @cut; ++$i ) { - print OUT "\n"; - if ( $long_output == 1 ) { - print OUT sprintf " %-74.74s", $cut[ $i ]; - } - else { - print OUT sprintf " %-34.34s", $cut[ $i ]; - } - } - } - print OUT "\n"; - } - else { - if ( $ortho_name eq "-" ) { - print "

NO ORTHOLOGS in alignment with the current thresholds for output

\n"; - } - else { - $ortho_name = &replaceNameWithLinkToExpasy( $ortho_name ); - print "\n"; - } - } - -} ## printOrthologies - - - -sub replaceNameWithLinkToExpasy { - my $name = $_[ 0 ]; - - if ( $name =~ /(.+)_(.+)\/(.+)/ ) { - my $desc = $1; - my $spec = $2; - my $numbers = $3; - if ( length( $desc ) <= 4 ) { - $name = "".$desc."_".$spec."\/".$numbers; - } - else { - $name = "".$desc."_".$spec."\/".$numbers; - } - } - - return $name; - -} ## replaceNameWithLinkToExpasy - - - - -# Last modified: 10/15/01 -sub printUltraParlogs { - my @cut = (); - my $i = 0; - my $descp = ""; - $s_paras = &roundToInt( $s_paras ); - - if ( ( $description == 1 || $complete_description == 1 ) - && $s_para_name ne "-" ) { - - if ( $non_sp != 1 ) { - if ( &startsWithSWISS_PROTname( $s_para_name ) ) { - $descp = &getDescriptionFromSWISSPROT_ACDEOSfile( $SWISSPROT_ACDEOS_FILE, $s_para_name ); - } - else { - $descp = "-"; - } - } - else { - if ( &startsWithSWISS_PROTname( $s_para_name ) ) { - $descp = &getDescriptionFromSWISSPROT_ACDEOSfile( $SWISSPROT_ACDEOS_FILE, $s_para_name ); - } - else { - $descp = &getDescriptionFromTrEMBL_ACDEOSfile( $TREMBL_ACDEOS_FILE, $s_para_name ); - } - } - - if ( $output_HTML != 1 ) { - if ( $long_output == 1 ) { - @cut = &cutDescription( $descp, 73 ); - } - else { - @cut = &cutDescription( $descp, 33 ); - } - $descp = $cut[ 0 ]; - } - } - if ( $descp eq "" ) { - $descp = "-"; - } - - if ( $output_HTML != 1 ) { - - if ( $dist ne "-" ) { - if ( $long_output == 1 ) { - print OUT sprintf "%-24.24s%-74.74s%5s%10.6f", $s_para_name,$descp,$s_paras,$dist; - } - else { - print OUT sprintf "%-24.24s%-34.34s%5s%10.6f", $s_para_name,$descp,$s_paras,$dist; - } - } - else { - if ( $long_output == 1 ) { - print OUT sprintf "%-24.24s%-74.74s%5s%10.10s", $s_para_name,$descp,$s_paras,$dist; - } - else { - print OUT sprintf "%-24.24s%-34.34s%5s%10.10s", $s_para_name,$descp,$s_paras,$dist; - } - } - if ( $complete_description == 1 ) { - for ( $i = 1; $i < @cut; ++$i ) { - print OUT "\n"; - if ( $long_output == 1 ) { - print OUT sprintf " %-74.74s", $cut[ $i ]; - } - else { - print OUT sprintf " %-34.34s", $cut[ $i ]; - } - } - } - print OUT "\n"; - - } - else { - $s_para_name = &replaceNameWithLinkToExpasy( $s_para_name ); - print "\n"; - } - -} ## printUltraParlogs - - - -sub printNoUltraParalogs { - if ( $output_HTML != 1 ) { - print OUT "\nUltra-paralogs\n"; - print OUT "--------------\n"; - print OUT "\nNO ULTRA-PARALOGS in alignment with the current threshold of $t_ultra_paralogs%\n"; - } - else { - print "

Ultra-paralogs

\n"; - print "

NO ULTRA-PARALOGS in alignment with the current threshold of $t_ultra_paralogs%

\n"; - } -} ## printNoUltraParalogs - - - -# Called by method "printOrthologies". -# Last modified: 02/27/01 -sub cutDescription { - my $line = $_[ 0 ]; - my $size = $_[ 1 ]; - my @cut = (); - my $i = 0; - - while ( ( length( $line ) ) > $size ) { - $cut[ $i++ ] = substr( $line, 0, $size ); - $line = substr( $line, $size ); - } - $cut[ $i++ ] = $line; - return @cut; -} ## cutDescription - - - - -# Last modified: 02/27/01 -sub printTitleForDistanceValues { - if ( $output_HTML != 1 ) { - if ( $mode == 1 || $mode == 2 ) { - print OUT "\n\nDistance values (based on NJ tree of original alignment)\n"; - print OUT "--------------------------------------------------------\n"; - } - elsif ( $mode == 3 || $mode == 4 ) { - print OUT "\n\nDistance values (based on ML branch length values on consensus tree)\n"; - print OUT "--------------------------------------------------------------------\n"; - } - } - else { - print "

Distance values (based on NJ tree of original alignment)

\n"; - } - -} ## printTitleForDistanceValues - - - - -# Last modified: 02/27/01 -sub printDistanceValues { - if ( $output_HTML != 1 ) { - print OUT "$return_line"; - } - else { - chomp( $return_line ); - if ( $return_line =~ /WARNING/ ) { - $return_line =~ s/\+\/-/ ± /; - $return_line =~ s/\*/ × /; - print "

$return_line

\n"; - } - elsif ( $return_line =~ /lca\s+is/i ) { - print "

$return_line

\n"; - } - elsif ( $return_line =~ /orthologous/i ) { - print "

$return_line

\n"; - } - elsif ( $return_line =~ /distance\s+of\s+query/i ) { - print "

Sequence	Description	up[%]	distance
$ortho_name	$descp	$orthos	$subtree_neighbors	$s_orthos	$dist
$s_para_name	$descp	$s_paras	$dist

\n"; - } - if ( $return_line =~ /(.+)=(.+)/ ) { - print "\n"; - } - if ( $return_line =~ /sum\s+/i || $return_line =~ /distance\s+of\s+ortholog\s+to\s+LCA/i ) { - print "

= $2

\n"; - } - } -} ## printDistanceValues - - - - -# Last modified: 02/27/01 -sub printMeanBootstraps { - if ( $output_HTML != 1 ) { - print OUT "\n\n$return_line"; - } - else { - chomp( $return_line ); - $return_line =~ s/\+\/-/ ± /; - print "\n"; - print "

$return_line

\n"; - } -} ## printMeanBootstraps - - - - -# Last modified: 02/12/02 -sub printOptions { - - if ( $output_HTML != 1 ) { - print OUT "\n\n\n==============================================================================\n"; - if ( $number_of_seqs_in_aln >= $MIN_NUMBER_OF_SEQS_IN_ALN ) { - print OUT "RIO options\n"; - print OUT "-----------\n"; - print OUT "Mode : "; - if ( $mode == 1 ) { - print OUT "precalc. pwd files with alignment not containing query (1)\n"; - } - elsif ( $mode == 2 ) { - print OUT "precalc. pwd files with alignment containing query (2)\n"; - } - elsif ( $mode == 3 ) { - print OUT "alignment not containing query (3)\n"; - } - elsif ( $mode == 4 ) { - print OUT "alignment containing query (4)\n"; - } - print OUT "Bootstraps : $bootstraps\n"; - print OUT "Species tree : $species_tree_file\n"; - if ( $safe_nhx == 1 ) { - if ( $mode == 3 || $mode == 4 ) { - if ( @complete_names > 1 ) { - $outfile_annot_nhx_tree =~ s/-\d+\.nhx/-X.nhx/; - print OUT "Saved annotated consensus trees (ML branch lengths) : $outfile_annot_nhx_tree\n"; - } - else { - print OUT "Saved annotated consensus tree (ML branch lengths) : $outfile_annot_nhx_tree\n"; - } - } - elsif ( $mode == 1 || $mode == 2 ) { - if ( @complete_names > 1 ) { - $outfile_annot_nhx_tree =~ s/-\d+\.nhx/-X.nhx/; - print OUT "Saved annotated NJ trees (based on original alignment) : $outfile_annot_nhx_tree\n"; - } - else { - print OUT "Saved annotated NJ tree (based on original alignment) : $outfile_annot_nhx_tree\n"; - } - } - } - print OUT "Threshold for output for orthologies (L=) : $t_orthologs\n"; - print OUT "Threshold for output for subtree-neighborings (B=) : $t_sn\n"; - print OUT "Threshold for distance calc for orthologies (U=) : $t_orthologs_dc\n"; - - print OUT "When to generate warnings:\n"; - print OUT "More than one ortholog: diff. in standard deviations (X=): $warn_more_than_one_ortho\n"; - print OUT "No orthologs : diff. in standard deviations (Y=): $warn_no_orthos\n"; - print OUT "One ortholog : factor (Z=): $warn_one_ortho\n"; - if ( $output_ultraparalogs == 1 ) { - print OUT "Output ultra-paralogs (p)\n"; - print OUT "Threshold for ultra-paralogies (v=) : $t_ultra_paralogs\n"; - } - print OUT "Sort priority: $sort_priority\n"; - } - - print OUT "\nOptions for the calculation of the phylgenetic trees\n"; - print OUT "----------------------------------------------------\n"; - if ( $mode == 1 ) { - print OUT "Model for pairwise distance calculations : $matrix\n"; - } - elsif ( $mode == 3 || $mode == 4 ) { - print OUT "Model for pairwise dist and ML branch length calc. : $matrix\n"; - } - if ( $mode == 1 || $mode == 3 || $mode == 4 ) { - print OUT "Columns in alignment used for tree calc : $length_of_alignment\n"; - print OUT "Columns in original alignment : $length_of_orig_alignment\n"; - } - print OUT "Sequences in alignment used for trees (incl query) : $number_of_seqs_in_aln\n"; - - if ( $mode == 3 || $mode == 4 ) { - print OUT "Removed non-SWISS-PROT sequences : "; - if ( $non_sp == 1 ) { - print OUT "no\n"; - } - else { - print OUT "yes\n"; - } - if ( $non_sp == 1 ) { - print OUT "Removed \"TrEMBL fragments\" : "; - if ( $no_frags == 1 ) { - print OUT "yes\n"; - } - else { - print OUT "no\n"; - } - } - } - if ( $mode == 1 || $mode == 2 ) { - print OUT "Prgrm to calc. branch lengths for distance values : PHYLIP NEIGHBOR (NJ)\n"; - } - elsif ( $mode == 3 || $mode == 4 ) { - print OUT "Prgrm to calc branch lengths for distance values : TREE-PUZZLE\n"; - } - if ( $seed_aln_for_hmmbuild ne "" ) { - print OUT "HMM was built with hmmbuild using options : $command_line_for_hmmbuild\n"; - } - if ( ( $mode == 3 || $mode == 4 ) && $species_names_file =~ /\S/ ) { - print OUT "File listing species used for tree calculation (G=): $species_names_file\n"; - } - print OUT "Seed for random number generator : $seed_for_makeTree\n"; - print OUT "Options for makeTree : $options_for_makeTree\n"; - - $time_total = time - $time_total; - - print OUT "\nTime and date\n"; - print OUT "-------------\n"; - if ( $mode == 1 ) { - print OUT "Time requirement dqo puzzle : $time_dqopuzzleT s\n"; - } - - print OUT "Time requirement for tree calculation: $time_tree_calcT s\n"; - print OUT "Time requirement for SDI and RIO : $time_rioT s\n"; - print OUT "Total time requirement : $time_total s\n"; - print OUT "Date started : $start_date"; - print OUT ( "Date finished : ".`date` ); - - print OUT "\nCommand line\n"; - print OUT "------------\n"; - print OUT "$command_line\n"; - if ( $parallel == 1 ) { - print OUT "\nProcessors used: @nodelist\n"; - } - } - else { - if ( $printed_ultra_paralogs == 1 ) { - print "\n"; - } - if ( $species_tree_file =~ /.+\/(.+)/ ) { - $species_tree_file = $1; - } - print "

Options

\n"; - print "\n"; - print "\n"; - print "\n"; - print "\n"; - print "\n"; - print "\n"; - print "\n"; - print "\n"; - print "\n"; - print "\n"; - if ( $output_ultraparalogs == 1 ) { - print "\n"; - print "\n"; - } - print "\n"; - print "\n"; - print "\n"; - print "\n"; - print "\n"; - print "\n"; - print "

Bootstraps:	$bootstraps
Species tree:	$species_tree_file
Threshold for output for orthologies:	$t_orthologs
Threshold for output for subtree-neighborings:	$t_sn
Threshold for distance calc for orthologies:	$t_orthologs_dc
When to generate warnings
More than one ortholog [diff in standard deviations]:	$warn_more_than_one_ortho
No orthologs [diff in standard deviations]:	$warn_no_orthos
One ortholog [factor]:	$warn_one_ortho
Output ultra-paralogs
Threshold for ultra-paralogies:	$t_ultra_paralogs
Sort priority:	$sort_priority
Model for pairwise distance calculations:	$matrix
Columns in alignment used for tree calc:	$length_of_alignment
Columns in original alignment:	$length_of_orig_alignment
Sequences in alignment used for trees (incl query):	$number_of_seqs_in_aln
Seed for random number generator:	$seed_for_makeTree

\n"; - - $time_total = time - $time_total; - - print "

\n"; - print "\n"; - print "\n"; - print "\n"; - print ( "\n" ); - if ( $parallel == 1 ) { - print "\n"; - } - print "

Time requirement:	$time_total s
Date started:	$start_date
Date finished:	".`date`."
Number of processors used:	".scalar( @nodelist )."

\n"; - } - -} ## printOptions - - - - - - - - - - -# ----------------------------------------------------------- -# Execution of other programs -# ----------------------------------------------------------- - - - - - -# Two arguments: -# 1. seed -# 2. outfile -# Returns the options used. -# Last modified: 05/11/01 -sub executeHmmbuild { - - my $seed = $_[ 0 ]; - my $outfile = $_[ 1 ]; - my $options = ""; - - &testForTextFilePresence( $seed ); - - $options = getHmmbuildOptionsFromPfam( $seed ); - - $options =~ s/-f//; - $options =~ s/-g//; - $options =~ s/-s//; - $options =~ s/-F//; - $options =~ s/-A//; - $options =~ s/-o\s+\S+//; - $options =~ s/(\s|^)[^-]\S+/ /g; - - if ( $options =~ /--prior/ ) { - my $basename = basename( $seed ); - $basename .= ".PRIOR"; - $options =~ s/--prior/--prior $PRIOR_FILE_DIR$basename/; - } - - # Remove for versions of HMMER lower than 2.2. - if ( $options =~ /--informat\s+\S+/ ) { - $options =~ s/--informat\s+\S+/--informat SELEX/; - } - else { - $options = "--informat SELEX ".$options; - } - - system( "$HMMBUILD $options $outfile $seed" ) - && &dieWithUnexpectedError( "Could not execute \"$HMMBUILD $options $outfile $seed\"" ); - return $options; - -} ## executeHmmbuild. - - - - -# One argument: -# Pfam align name. -# Last modified: 02/26/01 -sub getHmmbuildOptionsFromPfam { - - my $infile = $_[ 0 ]; - my $return_line = ""; - my $result = ""; - - &testForTextFilePresence( $infile ); - - open( GHO, $infile ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" ); - while ( $return_line = ) { - if ( $return_line =~ /^\s*#.*hmmbuild\s+(.+)\s*$/ ) { - $result = $1; - close( GHO ); - return $result; - } - } - close( GHO ); - return $result; - -} ## getHmmbuildOptionsFromPfam - - - - -# Purpose. Aligns a FASTA file to a Pfam alignment using an HMM profile. -# Five arguemnts: -# 1. Pfam flat file name -# 2. Name of FASTA file to append -# 3. HMM profile file name -# 4. outputfile name -# 5. 1 use --mapali, --withali otherwise (in hmmalign) -# Returns 1 if successful, -1 if no alignment was made because -# E value of HMMSEARCH output was larger than $E_VALUE_THRESHOLD. -# Last modified: 07/11/01 -sub alignWithHmmalign { - my $alignment = $_[ 0 ]; - my $query = $_[ 1 ]; - my $hmm = $_[ 2 ]; - my $outfile = $_[ 3 ]; - my $use_mapali = $_[ 4 ]; - my $E = 2000; - my $ali = "--withali"; - - if ( $use_mapali == 1 ) { - $ali = "--mapali"; - } - - &testForTextFilePresence( $alignment ); - &testForTextFilePresence( $query ); - &testForTextFilePresence( $hmm ); - - system( "$HMMSEARCH $hmm $query > $temp_dir/HMMSEARCHOUT" ) - && &dieWithUnexpectedError( "Could not execute \"$HMMSEARCH $hmm $query > $temp_dir/HMMSEARCHOUT\"" ); - - - - $E = &getEvalue( "$temp_dir/HMMSEARCHOUT" ); - if ( $E == 2000 ) { - &dieWithUnexpectedError( "No E-value found in \"$temp_dir/HMMSEARCHOUT\"" ); - } - elsif ( $E > $E_VALUE_THRESHOLD ) { - unlink( "$temp_dir/HMMSEARCHOUT" ); - return ( -1 ); - } - - system( "$P7EXTRACT -d $temp_dir/HMMSEARCHOUT > $temp_dir/GDF" ) - && &dieWithUnexpectedError( "Could not execute \"$P7EXTRACT -d $temp_dir/HMMSEARCHOUT > $temp_dir/GDF\"" ); - - - system( "$MULTIFETCH -d -g $query $temp_dir/GDF > $temp_dir/MULTIFETCHOUT" ) - && &dieWithUnexpectedError( "Could not execute \"$MULTIFETCH -d -g $query $temp_dir/GDF > $temp_dir/MULTIFETCHOUT\"" ); - - # Checks if score was too low to have made a reasonable alignment. - unless ( -s "$temp_dir/MULTIFETCHOUT" ) { - unlink( "$temp_dir/HMMSEARCHOUT", "$temp_dir/GDF", "$temp_dir/MULTIFETCHOUT" ); - return ( -1 ); - } - - system( "$HMMALIGN -o $outfile $ali $alignment $hmm $temp_dir/MULTIFETCHOUT >/dev/null 2>&1" ) - && &dieWithUnexpectedError( "Could not execute \"$HMMALIGN -o $outfile $ali $alignment $hmm $temp_dir/MULTIFETCHOUT\"" ); - - if ( unlink( "$temp_dir/HMMSEARCHOUT", "$temp_dir/GDF","$temp_dir/MULTIFETCHOUT" ) != 3 ) { - &dieWithUnexpectedError( "Could not delete (a) file(s)" ); - } - - return 1; -} ## alignWithHmmalign - - - - -# Gets the E value for complete sequences (score includes all domains) -# from a HMMSEARCH output file. -# One argument: the HMMSEARCH output file name -# Returns the E value, 2000 if no E value found -# Last modified: 07/11/01 -sub getEvalue { - - my $infile = $_[ 0 ]; - my $return_line = ""; - my $flag = 0; - my $E = 2000; - - &testForTextFilePresence( $infile ); - - open( E, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" ); - while ( $return_line = ) { - - # "Sequence Description Score E-value N" - if ( $return_line =~ /Sequence.+Description.+Score.+E.+value.+N/ ) { - $flag = 1; - } - # "QUERY_HUMAN 657.4 1.3e-198 1" - elsif ( $flag == 1 && $return_line =~ /\s+(\S+)\s+\d+\s*$/ ) { - $E = $1; - close( E ); - return $E; - } - - } - close( E ); - return $E; - -} ## getEvalue - - - -# Four/Five arguments: -# 1. Number of bootstraps -# 2. bsp (bootstrap positions) file -# 3. Infile (alignment) -# 4. Outfile (bootstrapped according to bsp file) -# 5. Number of processors -# Last modified: 01/30/02 -sub executeBootstrap_cz { - my $boots = $_[ 0 ]; - my $bsp_file = $_[ 1 ]; - my $infile = $_[ 2 ]; - my $outfile = $_[ 3 ]; - my $processors = $_[ 4 ]; - - if ( defined( $processors ) && ( $processors > 1 ) ) { - system( "$BOOTSTRAP_CZ $boots $infile $bsp_file $outfile $processors" ) - && &dieWithUnexpectedError( "Could not execute \"$BOOTSTRAP_CZ $boots $infile $bsp_file $outfile $processors\"" ); - - } - else { - system( "$BOOTSTRAP_CZ $boots $infile $bsp_file $outfile" ) - && &dieWithUnexpectedError( "Could not execute \"$BOOTSTRAP_CZ $boots $infile $bsp_file $outfile\"" ); - } - -} ## executeBootstrap_cz - - - - - -# One argument: -# options for DoRIO.main. -# Last modified: 02/26/01 -sub executeDoRIO { - - my $options = $_[ 0 ]; - - system( "$DORIO $options >/dev/null 2>&1" ) - && &dieWithUnexpectedError( "Could not execute \"$DORIO $options\"" ); - - return; - -} ## executeDoRIO - - - - - - - - - - - -# ----------------------------------------------------------- -# These deal with the alignment -# ----------------------------------------------------------- - - - - -# Counts sequences from a Pfam flat file or -# in a PHYLIP interleaved aligment. -# One arguments: Pfam flat file name. -# Returns the number of sequences. -# Last modified: 07/10/01 -sub countSeqsInPfamAlign { - my $infile = $_[ 0 ]; - my $return_line = ""; - my $saw_sequence_line = 0; - my $number_of_seqs = 0; - - &testForTextFilePresence( $infile ); - - open( C, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" ); - while ( $return_line = ) { - - if ( $saw_sequence_line == 1 - && !&containsPfamNamedSequence( $return_line ) - && !&isPfamCommentLine( $return_line ) ) { - last; - } - if ( &isPfamSequenceLine( $return_line ) - && $return_line !~ /^\s*\d+\s+\d+/ ) { - if ( $saw_sequence_line == 0 ) { - $saw_sequence_line = 1; - } - $number_of_seqs++; - } - } - close( C ); - return $number_of_seqs; - -} ## countSeqsInPfamAlign - - - - -# This gets the complete name(s) of a sequence from a Pfam alignment. -# I.e. it adds "/xxx-xxx". -# 2 arguments: -# 1. Infile (alignment) -# 2. Name of query -# Returns a String-array of all the complete names found. -# Last modified: 03/04/01 -sub getCompleteName { - - my $infile = $_[ 0 ]; - my $query_name = $_[ 1 ]; - my $return_line = ""; - my @complete_names = (); - my $complete_name = ""; - my $i = 0; - - &testForTextFilePresence( $infile ); - - $query_name =~ s/\/.*//; - - open( INGCN, $infile ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" ); - while ( $return_line = ) { - if ( $return_line =~ /^\s*$query_name(\S+)\s+.+/ ) { - $complete_name = $query_name.$1; - if ( $i > 0 && $complete_names[ 0 ] eq $complete_name ) { - # Now, we saw of all of them. - last; - } - $complete_names[ $i++ ] = $complete_name; - } - } - - close( INGCN ); - return @complete_names; -} ## getCompleteName - - - - -# Removes sequences from a Pfam flat file. -# It can remove all sequences not from species listed in a species names file. -# It can remove all sequences which do not have a SWISS-PROT name (XXXX_XXXXX) -# It can remove all sequences which are "TrEMBL" fragments. -# Six arguments: -# 1. Pfam flat file name -# 2. outfile name -# 3. Name of the query - not to be removed -# (use " " to not use this functionality) -# 4. species names file (will be ignored if " ") -# 5. 1 to NOT remove non-SWISS_PROT seqs. -# 6. 1 to remove TrEMBL seqs with "(FRAGMENT)" in their DE line. -# (Only used if non SWISS_PROT seqswill not be removed) -# Returns the number of sequences in the resulting alignment. -# If a query name is given, it returns -1 if query is not found in alignment, -# -10 if the name is not unique. -# Last modified: 05/11/01 -sub removeSeqsFromPfamAlign { - my $infile = $_[ 0 ]; - my $outfile = $_[ 1 ]; - my $query = $_[ 2 ]; - my $species_names_file = $_[ 3 ]; - my $keep_non_sp = $_[ 4 ]; - my $remove_frags = $_[ 5 ]; - my $return_line = ""; - my $name = ""; - my $seq = ""; - my $saw_sequence_line = 0; - my $number_of_seqs = 0; - my $saw_query = 0; - my $query_given = 0; - my $species_names_file_given = 0; - my $length_of_name = 0; - my %AC_OS = (); # AC -> species name (TrEMBL) - my %AC_DE = (); # AC -> description (TrEMBL) - my $AC = ""; - my $DE = ""; - my $OS = ""; - - &testForTextFilePresence( $infile ); - - if ( $query =~ /\S/ ) { - $query_given = 1; - } - if ( $species_names_file =~ /\S/ ) { - $species_names_file_given = 1; - &readSpeciesNamesFile( $species_names_file ); - } - - if ( $keep_non_sp == 1 - || ( $query_given == 1 && !&startsWithSWISS_PROTname( $query ) ) ) { - - &testForTextFilePresence( $TREMBL_ACDEOS_FILE ); - - # Fill up hash $AC_OS and $AC_DE. - open( HH, "$TREMBL_ACDEOS_FILE" ) || &dieWithUnexpectedError( "Cannot open file \"$TREMBL_ACDEOS_FILE\"" ); - while ( $return_line = ) { - if ( $return_line =~ /(\S+);([^;]*);(\S+)/ ) { - $AC_OS{ $1 } = $3; - if ( $remove_frags == 1 ) { - $AC_DE{ $1 } = $2; - } - } - } - close( HH ); - } - - open( OUT_RNSP, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" ); - open( IN_RNSP, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" ); - while ( $return_line = ) { - - if ( $saw_sequence_line == 1 - && !&containsPfamNamedSequence( $return_line ) - && !&isPfamCommentLine( $return_line ) ) { - # This is just for counting purposes. - $saw_sequence_line = 2; - } - if ( &isPfamSequenceLine( $return_line ) ) { - if ( $saw_sequence_line == 0 ) { - $saw_sequence_line = 1; - } - $return_line =~ /(\S+)\s+(\S+)/; - $name = $1; - $seq = $2; - if ( $query_given == 1 && $name eq $query ) { - $saw_query++; - } - if ( ( $query_given == 1 && $name ne $query ) - || $query_given != 1 ) { - if ( !&startsWithSWISS_PROTname( $name ) ) { - if ( $keep_non_sp != 1 ) { - next; - } - else { - $name =~ /(\S+)\//; - $AC = $1; - unless( exists( $AC_OS{ $AC } ) ) { - #ACs not present in "ACDEOS" file. - next; - } - $OS = $AC_OS{ $AC }; - if ( !$OS || $OS eq "" ) { - &dieWithUnexpectedError( "species for \"$AC\" not found" ); - } - if ( $species_names_file_given == 1 ) { - unless( exists( $Species_names_hash{ $OS } ) ) { - next; - } - } - if ( $remove_frags == 1 ) { - $DE = $AC_DE{ $AC }; - if ( $DE && $DE =~ /$FRAGMENT$/ ) { - next; - } - } - $name =~ s/\//_$OS\//; - } - } - else { - if ( $species_names_file_given == 1 ) { - if ( $name =~ /_([A-Z0-9]{1,5})/ ) { - unless( exists( $Species_names_hash{ $1 } ) ) { - next; - } - } - # remove everything whose species cannot be determined. - else { - next; - } - } - } - } - elsif ( $query_given == 1 && $name eq $query - && !&startsWithSWISS_PROTname( $query ) ) { - # Adding species to non SWISS-PROT query - $name =~ /(\S+)\//; - $AC = $1; - unless( exists( $AC_OS{ $AC } ) ) { - #ACs not present in "ACDEOS" file. - &userError( "Could not establish species of query.\n Check file \"$TREMBL_ACDEOS_FILE\"." ); - } - $OS = $AC_OS{ $AC }; - if ( !$OS || $OS eq "" ) { - &dieWithUnexpectedError( "species for \"$AC\" not found" ); - } - $name =~ s/\//_$OS\//; - } - - $length_of_name = length( $name ); - - if ( $length_of_name > ( $LENGTH_OF_NAME - 1 ) ) { - &userError( "Name \"$name\" is too long." ); - } - - for ( my $j = 0; $j <= ( $LENGTH_OF_NAME - $length_of_name - 1 ); ++$j ) { - $name .= " "; - } - - $return_line = $name.$seq."\n"; - } - - print OUT_RNSP $return_line; - if ( $saw_sequence_line == 1 ) { - $number_of_seqs++; - } - } - close( IN_RNSP ); - close( OUT_RNSP ); - if ( $query_given == 1 ) { - if ( $saw_query < 1 ) { - return -1; - } - elsif ( $saw_query > 1 ) { - return -10; - } - } - return $number_of_seqs; - -} ## removeSeqsFromPfamAlign - - - - -# One argument: -# 1. PWD file -# "Returns" a Hash of Strings (=keys) containing all the names found in PWD file -# Last modified: 05/29/01 -sub getNamesFromPWDFile { - my $infile = $_[ 0 ]; - my $return_line = ""; - my $i = 0; - my $saw_dist_line = 0; - - &testForTextFilePresence( $infile ); - - open( GN_IN, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" ); - - while ( $return_line = ) { - if ( $saw_dist_line == 1 && $return_line =~ /^\s*(\d+)\s*$/ ) { - if ( $1 != $i ) { - &dieWithUnexpectedError( "Failed sanity check" ); - } - last; - } - elsif ( $return_line =~ /^\s*(\S+)\s+\S+/ ) { - $names_in_pwd_file{ $1 } = 0; - $i++; - $saw_dist_line = 1; - } - } - close( GN_IN ); - return; -} ## getNamesFromPWDFile - - - - -# Moves sequences which start with query name (argument 1) -# to the last positions in pfam alignment sepecified by argument 2. -# Removes seqs present in argument 4, unless for query name. -# Four arguments: -# 1. Query name -# 2. Infile (alignment) -# 3. Outfile (=infile with query seq moved to the bottom) -# 4. Array of seq names to remove, unless for query name -# Last modified: 06/25/01 -sub moveToLast { - my $query = $_[ 0 ]; - my $infile = $_[ 1 ]; - my $outfile = $_[ 2 ]; - my @to_remove = @{ $_[ 3 ] }; # @{} tells Perl that this is a list. - my $return_line = ""; - my $query_line = ""; - my $n = ""; - - &testForTextFilePresence( $infile ); - - open( MTL_IN, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" ); - open( MTL_OUT, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" ); - - W: while ( $return_line = ) { - if ( &isPfamCommentLine( $return_line ) - && ( !isRFline( $return_line ) || $mode != 1 ) ) { - next W; - } - if ( @to_remove > 1 ) { - foreach $n ( @to_remove ) { - if ( $n ne $query && $return_line =~ /^\s*$n\s+/ ) { - next W; - } - } - } - if ( $return_line =~ /^\s*$query\s+/ ) { - $query_line = $return_line; - } - elsif ( $query_line ne "" - && ( $return_line !~ /\S+/ || isRFline( $return_line ) ) ) { - print MTL_OUT $query_line; - print MTL_OUT $return_line; - $query_line = ""; - } - else { - print MTL_OUT $return_line; - } - } - if ( $query_line ne "" ) { - print MTL_OUT $query_line; - } - - close( MTL_IN ); - close( MTL_OUT ); - - return; - -} ## moveToLast - - - - - - - - - - -# ----------------------------------------------------------- -# Others -# ----------------------------------------------------------- - - - - -# This gets the complete name of a TrEMBL sequence from a Pfam alignment. -# I.e. it adds the species between "_" and "/XXX-XXX". -# 2 arguments: -# 1. Infile (alignment) -# 2. Name of query -# Returns the complete name found. -# Last modified: 04/25/01 -sub getCompleteNameForTrEMBLquerySeq { - - my $infile = $_[ 0 ]; - my $query_name = $_[ 1 ]; - my $return_line = ""; - my $complete_name = ""; - my $before_slash = ""; - my $after_slash = ""; - - &testForTextFilePresence( $infile ); - - $query_name =~ /(.+)\/.+/; - $before_slash = $1; - - $query_name =~ /.+\/(.+)/; - $after_slash = $1; - - open( INGCN, $infile ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" ); - while ( $return_line = ) { - if ( $return_line =~ /^\s*($before_slash.+\/$after_slash)/ ) { - $complete_name = $1; - last; - } - } - close( INGCN ); - if ( $complete_name eq "" ) { - &userError( "Could not find \"$query_name\" in \"$alignment\"." ); - } - return $complete_name; -} ## getCompleteNameForTrEMBLquerySeq - - - - -# One argument: -# Pfam align name. -# Last modified: 02/26/01 -sub getDescriptionFromPfam { - - my $infile = $_[ 0 ]; - my $return_line = ""; - my $result = ""; - - &testForTextFilePresence( $infile ); - - open( INGDPF, $infile ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" ); - while ( $return_line = ) { - if ( $return_line =~ /^\s*#=DE\s+(.+)/ ) { - $result = $1; - close( INGDPF ); - return $result; - } - } - close( INGDPF ); - return $result; - -} ## getDescriptionFromPfam - - - -# Reads in (SWISS-PROT) species names from a file. -# Names must be separated by newlines. -# Lines beginning with "#" are ignored. -# A possible "=" and everything after is ignored. -# One argument: species-names-file name -# Last modified: 04/24/01 -sub readSpeciesNamesFile { - my $infile = $_[ 0 ]; - my $return_line = ""; - my $species = ""; - - &testForTextFilePresence( $infile ); - - open( IN_RSNF, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" ); - while ( $return_line = ) { - if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)/ ) { - $species = $1; - $species =~ s/=.+//; - $Species_names_hash{ $species } = ""; - } - } - close( IN_RSNF ); - - return; -} ## readSpeciesNamesFile - - - - -# This reads a raw sequence file or a FASTA sequence file -# and saves it as a "cleaned up" FASTA sequence file. -# If no > line is in the file, it creates one with new sequence name. -# If a > line is in the file, it modifes it: -# white space -> _, ";" ":" "," or "|" -> "~", deletes everything after ( or [; -# length is limited to 40 characters. -# Error if $new_seq_name is "" and no > line in the file. -# Two/three arguments: -# 1. infile name -# 2. outfile name -# 3. new sequence name for > line(will be ignored if "") -# If new sequence name is "": -# returns the contents of the ">" line after modification. -# If new sequence name is specified: -# return new sequence name. -# Last modified: 03/04/01 -sub seqFile2CleanedUpFastaFile { - my $infile = $_[ 0 ]; - my $outfile = $_[ 1 ]; - my $new_seq_name = $_[ 2 ]; - my $return_line = ""; - my $mod_desc = ""; - my $saw_desc_line = 0; - - &testForTextFilePresence( $infile ); - - open( IN_CUFF, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" ); - open( OUT_CUFF, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" ); - - while ( $return_line = ) { - if ( $return_line =~ /\w/ && $return_line !~ /^\s*#/ ) { - if ( $return_line =~ /^\s*>/ ) { - if ( $new_seq_name eq "" && $return_line !~ /_/ ) { - &userError( "Description line of query file appears not to\n contain any species information. Use \"N=\" option." ); - } - elsif ( $new_seq_name eq "" ) { - $return_line =~ s/^\s*>\s*(.*?)\s*/>$1/; # Removes spaces before and after >. - $return_line = substr( $return_line, 0, $LENGTH_OF_NAME - 1 ); - $return_line =~ s/[\(\[].*//; # Removes "(" or "[" and everything after. - $return_line =~ s/\s+$//; # Removes spaces at end. - $return_line =~ s/\s+/_/g; # Replaces all white spaces with "_". - $return_line =~ s/[;:,\|]/~/g; # Replaces all ";", ":", ",", or "|" with "~". - $return_line =~ />\s*(\S+)/; - $mod_desc = $1; - $return_line .= "\n"; - } - else { - $return_line = ">".$new_seq_name."\n"; - $mod_desc = $new_seq_name; - } - $saw_desc_line = 1; - } - else { - if ( $saw_desc_line != 1 ) { - if ( $new_seq_name ne "" ) { - print OUT_CUFF ( ">".$new_seq_name."\n" ); - $mod_desc = $new_seq_name; - } - else { - &userError( "Query file is not a FASTA file\n and option \"N=\" has not been used." ); - } - $saw_desc_line = 1; - } - $return_line =~ s/[^a-zA-Z\r\n\f]//g; # Removes non-letters from sequence. - } - - if ( $return_line =~ /\w/ ) { - print OUT_CUFF $return_line; - } - } - } - close( IN_CUFF ); - close( OUT_CUFF ); - - return $mod_desc; -} ## seqFile2CleanedUpFastaFile - - - - -# Purpose. Gets description for TrEMBL seqs, -# from a file which contains the AC, DE, and OS -# and which has to be generated from a TrEMBL flat file db -# using "extractTrembl.pl". -# The same file is used in method "addSpeciesToNonSPseqs". -# Two arguments: -# 1. "ACDEOS" file (AC, DE, OS from TrEMBL db) -# 2. AC ("_species/..." is removed) -# Format: AC;DE;OS\n -# Last modified: 02/14/02 -sub getDescriptionFromTrEMBL_ACDEOSfile { - my $ACDEOS = $_[ 0 ]; - my $AC = $_[ 1 ]; - my $DE = ""; - - # Fill up (huge) hash, if not already done. - unless ( %AC_DE ) { - &testForTextFilePresence( $ACDEOS ); - open( ACDEOS, "$ACDEOS" ) || &dieWithUnexpectedError( "Cannot open file \"$ACDEOS\"" ); - while ( $return_line = ) { - if ( $return_line =~ /(\S+);([^;]+);/ ) { - $AC_DE{ $1 } = $2; - } - } - close( ACDEOS ); - } - - $AC =~ s/_.+//; - - unless( exists( $AC_DE{ $AC } ) ) { - #AC not present in "ACDEOS" file. - return "-"; - } - - $DE = $AC_DE{ $AC }; - - if ( !$DE || $DE eq "" ) { - $DE = "-"; - } - - return $DE; - -} ## getDescriptionFromTrEMBL_ACDEOSfile - - - -# Purpose. Gets description for SP seqs, -# from a file which contains the AC, DE, and OS -# and which has to be generated from a sprot.dat flat file db -# using "extractSWISS-PROT.pl". -# Two arguments: -# 1. "ACDEOS" file (AC, DE, OS from SWISS-PROT db) -# 2. SWISS-PROT AC (XXXX_XXXX) -# Format: AC;DE;OS\n -# Last modified: 02/12/02 -sub getDescriptionFromSWISSPROT_ACDEOSfile { - my $SPACDEOS = $_[ 0 ]; - my $AC = $_[ 1 ]; - my $DE = ""; - - # Fill up (huge) hash, if not already done. - unless ( %SP_AC_DE ) { - &testForTextFilePresence( $SPACDEOS ); - open( ACDEOS, "$SPACDEOS" ) || &dieWithUnexpectedError( "Cannot open file \"$SPACDEOS\"" ); - while ( $return_line = ) { - if ( $return_line =~ /(\S+);([^;]+);/ ) { - $SP_AC_DE{ $1 } = $2; - } - } - close( ACDEOS ); - } - - $AC =~ s/\/.+//; - - unless( exists( $SP_AC_DE{ $AC } ) ) { - #AC not present in "ACDEOS" file. - return "-"; - } - - $DE = $SP_AC_DE{ $AC }; - - if ( !$DE || $DE eq "" ) { - $DE = "-"; - } - - return $DE; - -} ## getDescriptionFromSWISSPROT_ACDEOSfile - - - - - - - - - -# ----------------------------------------------------------- -# Helpers -# ----------------------------------------------------------- - - - -# One argument: -# Numeric value to be rounded to int. -# Last modified: 10/17/01 -sub roundToInt { - my $x = $_[ 0 ]; - unless ( $x eq "-" ) { - $x = int ( $x + 0.5 ); - } - return $x; -} ## roundToInt - - - -# Removes files. -# Last modified: 03/10/01 -sub cleanUpTempDir { - unlink( $temp_dir."/MAKETREEOUT".$TREE_FILE_SUFFIX, $temp_dir."/MAKETREEOUT".$LOG_FILE_SUFFIX, - $temp_dir."/MAKETREEOUT".$ALIGN_FILE_SUFFIX, $temp_dir."/MAKETREEOUT".$MULTIPLE_TREES_FILE_SUFFIX, - $temp_dir."/MAKETREEOUT".$SUFFIX_PWD_NOT_BOOTS, $temp_dir."/".$DO_RIO_TEMP_OUTFILE, - $temp_dir."/ALIGN1", $temp_dir."/ALIGN2", $temp_dir."/QUERY_SEQ", $temp_dir."/NBD_NJ_TREE", - $temp_dir."/ALIGN2_BOOTSTRAPPED", $temp_dir."/ALIGN2_PROCESSED", $temp_dir."/DIST_TO_QUERY", - $temp_dir."/DISTs_TO_QUERY", $temp_dir."/HMMALIGNOUT", $temp_dir."/NBD_INC_QUERY", $temp_dir."/PWD_INC_QUERY", - $temp_dir."/HMMFILE", $temp_dir."/MOVETOLASTOUT" ); - rmdir( $temp_dir ); -} ## cleanUpTempDir - - - - - - - - - - - - -# ----------------------------------------------------------- -# Command line and arguments, Errors -# ----------------------------------------------------------- - - - -# One argument: -# the command line. -# Last modified: 03/08/01 -sub analyzeCommandLine { - - my $args = ""; - my $arg = ""; - my $char = ""; - - - - $mode = shift( @_ ); - - if ( $mode != 1 && $mode != 2 && $mode != 3 && $mode != 4 ) { - &errorInCommandLine( "Mode can only be: 1, 2, 3, or 4." ); - } - - - foreach $args ( @_ ) { - - $args =~ s/\s//g; - - $char = substr( $args, 0, 1 ); - - - if ( length( $args ) > 1 ) { - $arg = substr( $args, 2 ); - } - - if ( $char =~ /A/ ) { - if ( $alignment ne "" ) { - &errorInCommandLine( "Entered same argument twice." ); - } - if ( $mode == 3 || $mode == 4 ) { - &userErrorCheckForTextFileExistence( $arg ); - } - $alignment = $arg; - } - elsif ( $char =~ /B/ ) { - if ( $t_sn != $THRESHOLD_SN_DEFAULT ) { - &errorInCommandLine( "Entered same argument twice." ); - } - $t_sn = $arg; - } - elsif ( $char =~ /C/ ) { - if ( $description == 1 || $complete_description == 1 ) { - &errorInCommandLine( "Entered same argument twice or conflicting arguments: \"D\" and \"C\"." ); - } - $complete_description = 1; - } - elsif ( $char =~ /D/ ) { - if ( $description == 1 || $complete_description == 1 ) { - &errorInCommandLine( "Entered same argument twice or conflicting arguments: \"D\" and \"C\"." ); - } - $description = 1; - } - elsif ( $char =~ /E/ ) { - if ( $long_output != 0 ) { - &errorInCommandLine( "Entered same argument twice." ); - } - $long_output = 1; - } - elsif ( $char =~ /F/ ) { - if ( $hmm_file ne "" || $hmm_name ne "" || $seed_aln_for_hmmbuild ne "") { - &errorInCommandLine( "Entered same argument twice or conflicting arguments: \"F=\", \"H=\" and \"b=\"." ); - } - if ( $mode == 1 || $mode == 2 ) { - &errorInCommandLine( "Can not use \"F=\" in modes 1 or 2." ); - } - &userErrorCheckForTextFileExistence( $arg ); - $hmm_file = $arg; - } - elsif ( $char =~ /G/ ) { - if ( $species_names_file ne " " ) { - &errorInCommandLine( "Entered same argument twice." ); - } - &userErrorCheckForTextFileExistence( $arg ); - $species_names_file = $arg; - } - elsif ( $char =~ /H/ ) { - if ( $hmm_name ne "" || $hmm_file ne "" || $seed_aln_for_hmmbuild ne "" ) { - &errorInCommandLine( "Entered same argument twice or conflicting arguments: \"F=\", \"H=\" and \"b=\"." ); - } - if ( $mode == 1 || $mode == 2 ) { - &errorInCommandLine( "Can not use \"H=\" in modes 1 or 2." ); - } - $hmm_name = $arg; - } - elsif ( $char =~ /I/ ) { - if ( $safe_nhx != 0 ) { - &errorInCommandLine( "Entered same argument twice." ); - } - $safe_nhx = 1; - } - elsif ( $char =~ /K/ ) { - if ( $keep != 0 ) { - &errorInCommandLine( "Entered same argument twice." ); - } - $keep = 1; - } - elsif ( $char =~ /L/ ) { - if ( $t_orthologs != $THRESHOLD_ORTHOLOGS_DEFAULT ) { - &errorInCommandLine( "Entered same argument twice." ); - } - $t_orthologs = $arg; - } - elsif ( $char =~ /N/ ) { - if ( $query_name ne "" ) { - &errorInCommandLine( "Entered same argument twice." ); - } - $query_name = $arg; - } - elsif ( $char =~ /O/ ) { - if ( $outfile ne "" ) { - &errorInCommandLine( "Entered same argument twice." ); - } - $outfile = $arg; - } - elsif ( $char =~ /P/ ) { - if ( $sort != $SORT_DEFAULT ) { - &errorInCommandLine( "Entered same argument twice." ); - } - $sort = $arg; - } - elsif ( $char =~ /Q/ ) { - if ( $seqX_file ne "" ) { - &errorInCommandLine( "Entered same argument twice." ); - } - &userErrorCheckForTextFileExistence( $arg ); - $seqX_file = $arg; - } - elsif ( $char =~ /S/ ) { - if ( $species_tree_file ne "" ) { - &errorInCommandLine( "Entered same argument twice." ); - } - &userErrorCheckForTextFileExistence( $arg ); - $species_tree_file = $arg; - } - elsif ( $char =~ /T/ ) { - if ( $mode == 1 || $mode == 2 ) { - &errorInCommandLine( "Matrix cannot be changed in modes 1 and 2 (is dictated by \"\$MATRIX_FOR_PWD\" for mode 1)." ); - } - if ( $arg eq "J" ) { - $matrix_n = 0; - } - elsif ( $arg eq "P" ) { - $matrix_n = 1; - } - elsif ( $arg eq "B" ) { - $matrix_n = 2; - } - elsif ( $arg eq "M" ) { - $matrix_n = 3; - } - elsif ( $arg eq "V" ) { - $matrix_n = 5; - } - elsif ( $arg eq "W" ) { - $matrix_n = 6; - } - else { - &errorInCommandLine( "Use T=J for JTT, P for PAM, B for BLOSUM62, M for mtREV24, V for VT, W for WAG." ); - } - } - elsif ( $char =~ /U/ ) { - if ( $t_orthologs_dc != $THRESHOLD_ORTHOLOGS_DEFAULT_DC ) { - &errorInCommandLine( "Entered same argument twice." ); - } - $t_orthologs_dc = $arg; - } - elsif ( $char =~ /X/ ) { - if ( $warn_more_than_one_ortho - != $WARN_MORE_THAN_ONE_ORTHO_DEFAULT ) { - &errorInCommandLine( "Entered same argument twice." ); - } - $warn_more_than_one_ortho = $arg; - } - elsif ( $char =~ /Y/ ) { - if ( $warn_no_orthos != $WARN_NO_ORTHOS_DEFAULT ) { - &errorInCommandLine( "Entered same argument twice." ); - } - $warn_no_orthos = $arg; - } - elsif ( $char =~ /Z/ ) { - if ( $warn_one_ortho != $WARN_ONE_ORTHO_DEFAULT ) { - &errorInCommandLine( "Entered same argument twice." ); - } - $warn_one_ortho = $arg; - } - elsif ( $char =~ /a/ ) { - if ( $boostraps_for_makeTree != $BOOSTRAPS_FOR_MAKETREE_DEFAULT ) { - &errorInCommandLine( "Entered same argument twice." ); - } - if ( $mode == 1 || $mode == 2 ) { - &errorInCommandLine( "Modes 1 and 2: Cannot change bootstrap value. Do not use \"a=\"." ); - } - $boostraps_for_makeTree = $arg; - if ( $boostraps_for_makeTree < 10 ) { - &errorInCommandLine( "Bootsraps cannot be smaller than 10." ); - } - } - elsif ( $char =~ /b/ ) { - if ( $hmm_name ne "" || $hmm_file ne "" || $seed_aln_for_hmmbuild ne "" ) { - &errorInCommandLine( "Entered same argument twice or conflicting arguments: \"F=\", \"H=\" and \"b=\"." ); - } - if ( $mode == 1 || $mode == 2 ) { - &errorInCommandLine( "Can not use \"b=\" in modes 1 or 2." ); - } - &userErrorCheckForTextFileExistence( $arg ); - $seed_aln_for_hmmbuild = $arg; - } - elsif ( $char =~ /f/ ) { - if ( $no_frags ne 0 ) { - &errorInCommandLine( "Entered same argument twice." ); - } - $no_frags = 1; - } - elsif ( $char =~ /j/ ) { - if ( $temp_dir ne "" ) { - &errorInCommandLine( "Entered same argument twice." ); - } - $temp_dir = $arg; - } - elsif ( $char =~ /p/ ) { - if ( $output_ultraparalogs != 0 ) { - &errorInCommandLine( "Entered same argument twice." ); - } - $output_ultraparalogs = 1; - } - elsif ( $char =~ /s/ ) { - if ( $non_sp != 1 ) { - &errorInCommandLine( "Entered same argument twice." ); - } - $non_sp = 0; - } - elsif ( $char =~ /v/ ) { - $t_ultra_paralogs = $arg; - } - elsif ( $char =~ /x/ ) { - if ( $output_HTML == 1 ) { - &errorInCommandLine( "Entered same argument twice." ); - } - $output_HTML = 1; - } - elsif ( $char =~ /y/ ) { - if ( $seed_for_makeTree != $SEED_FOR_MAKETREE_DEFAULT ) { - &errorInCommandLine( "Entered same argument twice." ); - } - $seed_for_makeTree = $arg; - } - elsif ( $char =~ /\+/ ) { - if ( $parallel != 0 ) { - &errorInCommandLine( "Entered same argument twice." ); - } - $parallel = 1; - } - else { - &errorInCommandLine( "Unknown option: \"$args\"." ); - } - } -} ## analyzeCommandLine - - - - -# Last modified: 03/08/01 -sub CheckArguments { - - if ( $outfile eq "" ) { - &errorInCommandLine( "Outfile not specified. Use \"O=\"." ); - } - if ( $alignment eq "" ) { - &errorInCommandLine( "Need to specify a Pfam alignment file. Use \"A=\"." ); - } - if ( -e $outfile ) { - &userError( "\"$outfile\" already exists." ); - } - - if ( $sort < 0 || $sort > 17 ) { - &errorInCommandLine( "Sort priority (\"P=\") must be between 0 and 15." ); - } - - if ( $parallel == 1 && $mode != 1 ) { - &errorInCommandLine( "Parallelization only implemented for mode 1." ); - } - - if ( $mode == 1 || $mode == 2 ) { - - if ( $species_names_file =~ /\S/ ) { - &errorInCommandLine( "Modes 1 and 2: Cannot use species names file. Do not use \"G=\"." ); - } - if ( $non_sp == 0 ) { - &errorInCommandLine( "Can not use \"s\" in modes 1 or 2." ); - } - if ( $no_frags == 1 ) { - &errorInCommandLine( "Can not use \"f\" in modes 1 or 2." ); - } - } - - if ( $mode == 1 || $mode == 3 ) { - if ( $seqX_file eq "" ) { - &errorInCommandLine( "Modes 1 and 3: Need to specify a query file. Use \"Q=\"." ); - } - } - - if ( $mode == 3 ) { - if ( $hmm_name eq "" && $hmm_file eq "" && $seed_aln_for_hmmbuild eq "" ) { - &errorInCommandLine( "Mode 3: Need to specify either a HMM name (\"H=\"), a HMM file (\"F=\") or build a HMM (\"b=\")." ); - } - } - - if ( $mode == 1 ) { - if ( $hmm_name ne "" || $hmm_file ne "" || $seed_aln_for_hmmbuild ne "" ) { - &errorInCommandLine( "Mode 1: Must not specify a HMM name (\"H=\"), a HMM file (\"F=\") or build a HMM (\"b=\")." ); - } - } - - if ( $mode == 2 || $mode == 4 ) { - if ( $seqX_file ne "" ) { - &errorInCommandLine( "Modes 2 and 4: Must not specify a query file. Do not use \"Q=\".\n" ); - } - if ( $query_name eq "" ) { - &errorInCommandLine( "Modes 2 and 4: Must specify a query name. Use \"N=\"." ); - } - if ( $hmm_name ne "" || $hmm_file ne "" || $seed_aln_for_hmmbuild ne "" ) { - &errorInCommandLine( "Modes 2 and 4: Cannot specify a HMM name (\"H=\"), a HMM file (\"F=\") or build a HMM (\"b=\")." ); - } - - } - - if ( $non_sp != 1 && $no_frags == 1 ) { - &errorInCommandLine( "\"Fragments\" are assumed to be only found in non SWISS-PROT seqs.\n Do not use \"f\" together with \"s\"." ); - } - - if ( $output_HTML == 1 ) { - if ( $mode != 1 ) { - &errorInCommandLine( "Output in HTML (for web server) only for mode 1." ); - } - } - - if ( $output_ultraparalogs == 0 && $t_ultra_paralogs != $T_ULTRA_PARALOGS_DEFAULT ) { - &errorInCommandLine( "Use \"p\" to output ultra paralogs (cannot use \"v=\" without \"p\")." ); - } - - if ( $non_sp == 1 && ( $mode == 3 || $mode == 4 ) ) { - unless ( ( -s $TREMBL_ACDEOS_FILE ) && ( -f $TREMBL_ACDEOS_FILE ) && ( -T $TREMBL_ACDEOS_FILE ) ) { - my $message = "AC, DE, and OS-file not found.\n"; - $message .= " If non SWISS-PROT sequences are not to be removed from the\n"; - $message .= " Pfam alignment (\"s\" option), variable \"\$TREMBL_ACDEOS_FILE\" needs\n"; - $message .= " to point to a file containing AC, DE, and OS from TrEMBL. Such a\n"; - $message .= " file can be generated with \"extractTrembl.pl\".\n"; - $message .= " Currently, \"TREMBL_ACDEOS_FILE\" points to:\n"; - $message .= " $TREMBL_ACDEOS_FILE"; - &userError( $message ); - } - } - - unless ( ( -s $species_tree_file ) && ( -f $species_tree_file ) && ( -T $species_tree_file ) ) { - my $message = "Species tree file not found.\n"; - $message .= " A valid species tree must be specified.\n"; - $message .= " Either, use \"S=\" option, or set variable\n"; - $message .= " \"\$SPECIES_TREE_FILE_DEFAULT\".\n"; - $message .= " Currently, this program looks for a species tree at:\n"; - $message .= " $species_tree_file"; - &userError( $message ); - } - - if ( $hmm_name ne "" ) { - unless ( ( -s $PFAM_HMM_DB ) && ( -f $PFAM_HMM_DB ) ) { - my $message = "HMMER model db file not found.\n"; - $message .= " If \"H=\" option is used, a valid HMMER model db needs\n"; - $message .= " to be specified with variable \"\$PFAM_HMM_DB\".\n"; - $message .= " Currently, \"\$PFAM_HMM_DB\" points to:\n"; - $message .= " $PFAM_HMM_DB"; - &userError( $message ); - } - } -} ## CheckArguments - - - -# Last modfied: 06/25/01 -sub userErrorCheckForTextFileExistence { - my $file = $_[ 0 ]; - unless ( ( -s $file ) && ( -f $file ) && ( -T $file ) ) { - &userError( "\"$file\" does not exist or is not a plain text file." ); - } -} ## checkForFileExistence - - - -# One argument: the error message. -# Last modified: 04/26/01 -sub errorInCommandLine { - - my $error = $_[ 0 ]; - - print " \n"; - print " rio.pl version: $VERSION\n"; - print " ------\n"; - print " \n"; - print " Error in command line:\n"; - if ( $error ne "" ) { - print " $error"; - } - print " \n\n"; - print " Type \"rio.pl\" (no arguments) for more information.\n"; - print " \n"; - exit( -1 ); -} ## errorInCommandLine - - - - -# One argument: the error message. -# Last modified: 04/26/01 -sub userError { - - my $error = $_[ 0 ]; - - print " \n"; - print " rio.pl version: $VERSION\n"; - print " ------\n"; - print " \n"; - print " Error:\n"; - if ( $error ne "" ) { - print " $error"; - } - print " \n\n"; - print " Type \"rio.pl\" (no arguments) for more information.\n"; - print " \n"; - &cleanUpTempDir(); - exit( -1 ); -} ## UserError - - - - - - -# Last modified: 04/26/01 -sub printHelp { - - print " \n"; - print " rio.pl version: $VERSION\n"; - print " ------\n\n"; - - print < - ----- - - - Examples: - --------- - - % RIO1.1/perl/rio.pl 1 A=aconitase Q=RIO1.1/LEU2_HAEIN N=QUERY_HAEIN O=out1 p I C E - - % RIO1.1/perl/rio.pl 2 A=aconitase N=LEU2_LACLA/5-449 O=out2 p I C E - - % RIO1.1/perl/rio.pl 3 A=/path/to/my/pfam/Full/aconitase H=aconitase Q=RIO1.1/LEU2_HAEIN N=QUERY_HAEIN O=out3 p I C E - - % RIO1.1/perl/rio.pl 4 A=/path/to/my/pfam/Full/aconitase N=LEU2_LACLA/5-449 O=out4 p I C E - - % RIO1.1/perl/rio.pl 3 A=/path/to/my/pfam/Full/aconitase b=/path/to/my/pfam/Seed/aconitase Q=RIO1.1/LEU2_HAEIN N=QUERY_HAEIN O=out5 p I C E - - - - Modes: - ------ - - 1: RIO analysis based on precalculated pairwise distances - alignment does not contain query sequence - - 2: RIO analysis based on precalculated pairwise distances - alignment does contain query sequence - - 3: RIO analysis based on Pfam alignments, - alignment does not contain query sequence - - 4: RIO analysis based on Pfam alignments, - alignment does contain query sequence - - - - Tagged arguments: - ----------------- - - No "G=", "H=", "F=", "T=", "a=", "b=", "s", "f" in modes 1 and 2. - - - A= Pfam alignment name (mandatory). This specifies the alignment - against which the RIO analysis is to be performed. - In modes 1 and 2: Pfam model (alignment) name - (e.g. "A=aconitase"). - In modes 3 and 4: Pfam alignment path/name - (e.g. "A=/path/to/your/pfam/Full/aconitase"). - - Q= Path/name of file containing the query sequence - (in FASTA format or raw sequence) (mandatory in modes 1 and 3). - - N= Query name (mandatory). This must include the SWISS-PROT code - for the species of the query after a "_" (e.g. "N=QUERY_HAEIN"). - If the query sequence is already in the alignment (modes 2 and 4) - the complete name needs to be specified -- including "/xxx-xxx". - - O= Output file path/name (mandatory). - - T= Model for pairwaise distance calculation: - J=JTT, B=BLOSUM 62, M=mtREV24, V=VT, W=WAG, P=PAM. - BLOSUM 62 is default. - (Not in modes 1 and 2; these modes use \$MATRIX_FOR_PWD instead.) - - In modes 1 and 3, a HMM is needed to align the query sequence to - the alignment and either one of the following options must be - employed: - H= HMM name: This uses hmmfetch to retrieve a HMM from - \$PFAM_HMM_DB. - F= HMM file: This directly reads the HMM from a file. - - S= Species tree file path/name (in NHX format) (optional). - If not specified, \$SPECIES_TREE_FILE_DEFAULT is used. - - G= Species names file (optional). Only sequences associated with - species found in this file are used. - In the species names file, individual species names must be - separated by newlines and lines starting with "#" are ignored. - While only sequences associated with species found in the species - tree ("S=") are used for the actual RIO analysis, this allows to - remove sequences prior to tree calculation (which is the most - time consuming step). - - P= Sort priority (default is 12): - 0 : Ortholog - 1 : Ortholog, Super ortholog - 2 : Super ortholog, Ortholog - 3 : Ortholog, Distance - 4 : Distance, Ortholog - 5 : Ortholog, Super ortholog, Distance - 6 : Ortholog, Distance, Super ortholog - 7 : Super ortholog, Ortholog, Distance - 8 : Super ortholog, Distance, Ortholog - 9 : Distance, Ortholog, Super ortholog - 10 : Distance, Super ortholog, Ortholog - 11 : Ortholog, Subtree neighbor, Distance - 12 : Ortholog, Subtree neighbor, Super ortholog, Distance (default) - 13 : Ortholog, Super ortholog, Subtree neighbor, Distance - 14 : Subtree neighbor, Ortholog, Super ortholog, Distance - 15 : Subtree neighbor, Distance, Ortholog, Super ortholog - 16 : Ortholog, Distance, Subtree neighbor, Super ortholog - 17 : Ortholog, Subtree neighbor, Distance, Super ortholog - - a= Bootstraps for tree construction (not in modes 1 and 2). - Default is 100. - - L= Threshold for orthologies for output. Default is 0. - v= Threshold for ultra-paralogies for output. Default is 50. - - U= Threshold for orthologies for distance calculation. Default is 60. - - X= In case of more than one putative orthologs: - number of sd the distance query - LCA has to differ - from the mean to generate a warning. Default is 2. - - Y= In case of no putative orthologs: - number of sd the distance query - root has to differ - from mean to generate a warning. Default is 2. - - Z= In case of one putative ortholog: - threshold for factor between the two distances to their - LCA (larger/smaller) to generate a warning. Default is 2. - - B= Threshold for subtree-neighborings. Default is 0. - - b= Build HMM from seed alignment with "hmmbuild -s" (optional). - This is to prevent from finding multiple domains per sequence - (i.e. prevents "cutting" the query sequence). Give path/name to - Seed with this. - - j= Name for temporary directory (optional). - - y= Seed for random number generator. Default is 41. - - I Create and save a rooted, with duplication vs speciation, - and orthology information annotated gene tree. - If precalculated distances are used (modes 1 and 2): this gene - tree is a NJ tree calculated based on the non-bootstrap resampled - (original) pairwise distances. - If precalculated distances are not used (modes 3 and 4): this gene - is a consenus tree with ML branch length values and is also - annotated with bootstrap values for each node. - - Options for output: - p Output ultra-paralogs. - D Description from SWISS-PROT and TrEMBL. - C Complete description from SWISS-PROT and TrEMBL. - E 118 character output instead of 78 character output. - - K Keep intermediate files (they will go into the same directory - as the output file, their names are the same as of the output - file, with various suffixes added). - - s Ignore non SWISS-PROT sequences (i.e. sequences from TrEMBL) - in the Pfam alignment. - - f Try to ignore TrEMBL "fragments" (sequences with "fragment" in - their description). - - + Parallel, use machines listed in file \$NODE_LIST. - - x RIO used as web server -- HTML output. - - -END - exit( 0 ); - -} ## printHelp - diff --git a/forester/archive/perl/rio_module.pm b/forester/archive/perl/rio_module.pm deleted file mode 100755 index 59b15e2..0000000 --- a/forester/archive/perl/rio_module.pm +++ /dev/null @@ -1,1108 +0,0 @@ -# Copyright (C) 2002-2003 Washington University School of Medicine -# and Howard Hughes Medical Institute -# All rights reserved -# -# Author: Christian M. Zmasek -# zmasek@genetics.wustl.edu -# http://www.genetics.wustl.edu/eddy/people/zmasek/ -# -# Last modified 03/13/03 - - -package rio_module2; -use strict; -require Exporter; - -our $VERSION = 3.20; - -our @ISA = qw( Exporter ); - -our @EXPORT = qw( executeConsense - executeMakeTree - executePuzzleDQO - executePuzzleDQObootstrapped - pfam2phylipMatchOnly - startsWithSWISS_PROTname - isPfamSequenceLine - isPfamCommentLine - containsPfamNamedSequence - isRFline - executeNeighbor - executeProtpars - setModelForPuzzle - setRateHeterogeneityOptionForPuzzle - setParameterEstimatesOptionForPuzzle - executePuzzleBootstrapped - executePuzzle - executeHmmfetch - addDistsToQueryToPWDfile - testForTextFilePresence - exitWithWarning - dieWithUnexpectedError - addSlashAtEndIfNotPresent - $LENGTH_OF_NAME - $MIN_NUMBER_OF_AA - $TREMBL_ACDEOS_FILE - $SWISSPROT_ACDEOS_FILE - $SPECIES_NAMES_FILE - $SPECIES_TREE_FILE_DEFAULT - $MULTIPLE_TREES_FILE_SUFFIX - $LOG_FILE_SUFFIX - $ALIGN_FILE_SUFFIX - $TREE_FILE_SUFFIX - $ADDITION_FOR_RIO_ANNOT_TREE - $SUFFIX_PWD - $SUFFIX_BOOT_STRP_POS - $SUFFIX_PWD_NOT_BOOTS - $SUFFIX_HMM - $MATRIX_FOR_PWD - $RIO_PWD_DIRECTORY - $RIO_BSP_DIRECTORY - $RIO_NBD_DIRECTORY - $RIO_ALN_DIRECTORY - $RIO_HMM_DIRECTORY - $PFAM_FULL_DIRECTORY - $PFAM_SEED_DIRECTORY - $PRIOR_FILE_DIR - $PFAM_HMM_DB - $SEQBOOT - $NEIGHBOR - $PROTPARS - $CONSENSE - $PUZZLE - $HMMALIGN - $HMMSEARCH - $HMMBUILD - $HMMFETCH - $SFE - $HMMCALIBRATE - $P7EXTRACT - $MULTIFETCH - $BOOTSTRAP_CZ - $BOOTSTRAP_CZ_PL - $TRANSFERSBRANCHLENGHTS - $MAKETREE - $RIO_PL - $DORIO - $PUZZLE_DQO - $BOOTSTRAPS - $PATH_TO_FORESTER - $JAVA - $NODE_LIST - $RIO_SLAVE_DRIVER - $RIO_SLAVE - $TEMP_DIR_DEFAULT - $EXPASY_SPROT_SEARCH_DE - $EXPASY_SPROT_SEARCH_AC - ); - - - - -# ============================================================================= -# ============================================================================= -# -# THESE VARIABLES ARE ENVIRONMENT DEPENDENT, AND NEED TO BE SET ACCORDINGLY -# BY THE USER -# ------------------------------------------------------------------------- -# - - - -# RIO itself: -# ----------- -our $PATH_TO_FORESTER = "/nfs/dm3/homedir1/czmasek/RIO1.24/"; - - -# Java virtual machine: -# --------------------- -our $JAVA = "/usr/local/java/jdk/bin/java"; - - - -# Where all the temporary files can be created: -# --------------------------------------------- -our $TEMP_DIR_DEFAULT = "/tmp/"; - - - -# Pfam data: -# ---------- -our $PFAM_FULL_DIRECTORY = "/path/to/Pfam/Full/"; -our $PFAM_SEED_DIRECTORY = "/path/to/Pfam/Seed/"; -our $PFAM_HMM_DB = "/path/to/Pfam/Pfam_ls"; # Need to run "hmmindex" on this - # to produce .ssi file. - # Then, for example - # "setenv HMMERDB /home/rio/pfam-6.6/" - - -$PATH_TO_FORESTER = &addSlashAtEndIfNotPresent( $PATH_TO_FORESTER ); - - -# Description lines and species from SWISS-PROT and TrEMBL: -# --------------------------------------------------------- -our $TREMBL_ACDEOS_FILE = $PATH_TO_FORESTER."data/trembl22_ACDEOS_1-6"; - -our $SWISSPROT_ACDEOS_FILE = $PATH_TO_FORESTER."data/sp40_ACDEOS_1-6"; - - - -# Names of species which can be analyzed and analyzed -# against (must also be in tree $SPECIES_TREE_FILE_DEFAULT). -# By using a list with less species, RIO analyses become faster -# but lose phylogenetic resolution. -# For many purposes, list "tree_of_life_bin_1-6_species_list" -# in "data/species/" might be sufficient: -# -------------------------------------------------------------- -our $SPECIES_NAMES_FILE = $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-6_species_list"; - - - -# A default species tree in NHX format. -# For many purposes, tree "tree_of_life_bin_1-6.nhx" -# in "data/species/" might be fine: -# -------------------------------------------------- -our $SPECIES_TREE_FILE_DEFAULT = $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-6.nhx"; - - - -# Data for using precalculated distances: -# --------------------------------------- -our $MATRIX_FOR_PWD = 2; # The matrix which has been used for the pwd in $RIO_PWD_DIRECTORY. - # 0=JTT, 1=PAM, 2=BLOSUM 62, 3=mtREV24, 5=VT, 6=WAG. - -our $RIO_PWD_DIRECTORY = $PATH_TO_FORESTER."example_data/"; # all must end with "/" -our $RIO_BSP_DIRECTORY = $PATH_TO_FORESTER."example_data/"; -our $RIO_NBD_DIRECTORY = $PATH_TO_FORESTER."example_data/"; -our $RIO_ALN_DIRECTORY = $PATH_TO_FORESTER."example_data/"; -our $RIO_HMM_DIRECTORY = $PATH_TO_FORESTER."example_data/"; - - - -# -# End of variables which need to be set by the user. -# -# ============================================================================= -# ============================================================================= - - - - - -$TEMP_DIR_DEFAULT = &addSlashAtEndIfNotPresent( $TEMP_DIR_DEFAULT ); -$PFAM_FULL_DIRECTORY = &addSlashAtEndIfNotPresent( $PFAM_FULL_DIRECTORY ); -$PFAM_SEED_DIRECTORY = &addSlashAtEndIfNotPresent( $PFAM_SEED_DIRECTORY ); - - - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# These variables should normally not be changed: -# - -our $PRIOR_FILE_DIR = $PATH_TO_FORESTER."data/priors_for_hmmbuild/"; - # Directory containing dirichlet prior - # files needed for certain aligments - # by hmmbuild (e.g. Collagen). - - -# PHYLIP: -our $SEQBOOT = $PATH_TO_FORESTER."phylip_mod/exe/seqboot"; -our $NEIGHBOR = $PATH_TO_FORESTER."phylip_mod/exe/neighbor"; -our $PROTPARS = $PATH_TO_FORESTER."phylip_mod/exe/protpars"; -our $CONSENSE = $PATH_TO_FORESTER."phylip_mod/exe/consense"; - -# TREE-PUZZLE: -our $PUZZLE = $PATH_TO_FORESTER."puzzle_mod/src/puzzle"; -our $PUZZLE_DQO = $PATH_TO_FORESTER."puzzle_dqo/src/puzzle"; - -# HMMER: -our $HMMALIGN = $PATH_TO_FORESTER."hmmer/binaries/hmmalign"; -our $HMMSEARCH = $PATH_TO_FORESTER."hmmer/binaries/hmmsearch"; -our $HMMBUILD = $PATH_TO_FORESTER."hmmer/binaries/hmmbuild"; -our $HMMFETCH = $PATH_TO_FORESTER."hmmer/binaries/hmmfetch"; -our $SFE = $PATH_TO_FORESTER."hmmer/binaries/sfetch"; -our $HMMCALIBRATE = $PATH_TO_FORESTER."hmmer/binaries/hmmcalibrate"; - -our $P7EXTRACT = $PATH_TO_FORESTER."perl/p7extract.pl"; -our $MULTIFETCH = $PATH_TO_FORESTER."perl/multifetch.pl"; - - -# RIO/FORESTER: -our $BOOTSTRAP_CZ = $PATH_TO_FORESTER."C/bootstrap_cz"; -our $BOOTSTRAP_CZ_PL = $PATH_TO_FORESTER."perl/bootstrap_cz.pl"; -our $TRANSFERSBRANCHLENGHTS = $JAVA." -cp $PATH_TO_FORESTER"."java forester.tools.transfersBranchLenghts"; -our $MAKETREE = $PATH_TO_FORESTER."perl/makeTree.pl"; -our $RIO_PL = $PATH_TO_FORESTER."perl/rio.pl"; -our $DORIO = $JAVA." -cp $PATH_TO_FORESTER"."java forester.tools.DoRIO"; -# parallel RIO: -our $RIO_SLAVE_DRIVER = $PATH_TO_FORESTER."perl/rio_slave_driver.pl"; -our $RIO_SLAVE = $PATH_TO_FORESTER."perl/rio_slave.pl"; -our $NODE_LIST = $PATH_TO_FORESTER."data/node_list.dat"; - -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - -our $BOOTSTRAPS = 100; -our $MIN_NUMBER_OF_AA = 20; # After removal of gaps, if less, gaps are not removed. -our $LENGTH_OF_NAME = 26; - - - - -our $MULTIPLE_TREES_FILE_SUFFIX = ".mlt"; -our $LOG_FILE_SUFFIX = ".log"; -our $ALIGN_FILE_SUFFIX = ".aln"; -our $TREE_FILE_SUFFIX = ".nhx"; -our $ADDITION_FOR_RIO_ANNOT_TREE = ".rio"; -our $SUFFIX_PWD = ".pwd"; -our $SUFFIX_BOOT_STRP_POS = ".bsp"; -our $SUFFIX_PWD_NOT_BOOTS = ".nbd"; -our $SUFFIX_HMM = ".hmm"; - -our $EXPASY_SPROT_SEARCH_DE = "http://www.expasy.org/cgi-bin/sprot-search-de?"; -our $EXPASY_SPROT_SEARCH_AC = "http://www.expasy.org/cgi-bin/sprot-search-ac?"; - - - -# One argument: input multiple trees file -# Last modified: 07/05/01 -sub executeConsense { - my $in = $_[ 0 ]; - - &testForTextFilePresence( "$in" ); - - system( "$CONSENSE >/dev/null 2>&1 << ! -$in -Y -!" ) - && &dieWithUnexpectedError( "Could not execute \"$CONSENSE \"" ); - - return; -} - - - -# Four arguments: -# 1. options ("-" is not necessary) -# 2. alignment or pwd file -# 3. outfile -# 4. temp dir -# Last modified: 07/05/01 -sub executeMakeTree { - - my $opts = $_[ 0 ]; - my $B = $_[ 1 ]; - my $C = $_[ 2 ]; - my $D = $_[ 3 ]; - - &testForTextFilePresence( $B ); - - $opts = "-".$opts; - - system( "$MAKETREE $opts $B $C $D" ) - && &dieWithUnexpectedError( "Could not execute \"$MAKETREE $opts $B $C $D\"" ); - -} ## executeMakeTree - - - - -# Two arguments: -# 1. Name of inputfile -# 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; -# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise -sub executePuzzleDQO { - my $in = $_[ 0 ]; - my $matrix_option = $_[ 1 ]; - my $mat = ""; - - &testForTextFilePresence( $in ); - - $mat = setModelForPuzzle( $matrix_option ); - - system( "$PUZZLE_DQO $in >/dev/null 2>&1 << !$mat -y -!" ) - && &dieWithUnexpectedError( "Could not execute \"$PUZZLE_DQO\"" ); - - return; - -} ## executePuzzleDQO - - - - -# Two arguments: -# 1. Name of inputfile -# 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; -# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise -# Last modified: 01/28/02 -sub executePuzzleDQObootstrapped { - my $in = $_[ 0 ]; - my $matrix_option = $_[ 1 ]; - - - my $l = 0; - my $slen = 0; - my $counter = 0; - my $mat = ""; - my $a = ""; - my @a = (); - - &testForTextFilePresence( $in ); - - open( GRP, "<$in" ) || &dieWithUnexpectedError( "Cannot open file \"$in\"" ); - while( ) { - if ( $_ =~ /^\s*\d+\s+\d+\s*$/ ) { - $counter++; - } - } - close( GRP ); - - $l = `cat $in | wc -l`; - $slen = $l / $counter; - - system( "split -$slen $in $in.splt." ) - && &dieWithUnexpectedError( "Could not execute \"split -$slen $in $in.splt.\"" ); - - @a = <$in.splt.*>; - - $mat = setModelForPuzzle( $matrix_option ); - - foreach $a ( @a ) { - - system( "$PUZZLE_DQO $a >/dev/null 2>&1 << !$mat -y -!" ) - && &dieWithUnexpectedError( "Could not execute \"$PUZZLE_DQO $a\"" ); - - system( "cat $a.dist >> $in.dist" ) - && &dieWithUnexpectedError( "Could not execute \"cat outdist >> $in.dist\"" ); - - unlink( $a, $a.".dist" ); - } - - return; - -} ## executePuzzleDQObootstrapped - - - -# Transfers a Pfam (SELEX) alignment to a -# PHYLIP sequential style alignment. -# It only writes "match columns" as indicated by the -# "# RF" line ('x' means match). -# -# Three arguments: -# 1. infile name -# 2. outfile name -# 3. 1 to NOT ensure that match states contain only 'A'-'Z' or '-' -# -# Returns the number of match states (=length of output alignment), -# the length of the input alignment, -# the number of seqs in the input alignment -# -# Last modified: 07/07/01 -# -sub pfam2phylipMatchOnly { - - my $infile = $_[ 0 ]; - my $outfile = $_[ 1 ]; - my $ne = $_[ 2 ]; - my @seq_name = (); - my @seq_array = (); - my $return_line = ""; - my $seq = ""; - my $x = 0; - my $y = 0; - my $i = 0; - my $x_offset = 0; - my $max_x = 0; - my $rf_y = 0; - my $number_colum = 0; - my $not_ensure = 0; - my $saw_rf_line = 0; - - if ( $ne && $ne == 1 ) { - $not_ensure = 1; - } - - &testForTextFilePresence( $infile ); - - open( INPP, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" ); - - # This reads in the first block. It reads in the seq names. - while ( 1 ) { - if ( &isPfamSequenceLine( $return_line ) ) { - $return_line =~ /^(\S+)\s+(\S+)/; - $seq_name[ $y ] = substr( $1, 0, $LENGTH_OF_NAME ); - $seq = $2; - for ( $x = 0; $x < length( $seq ); $x++ ) { - $seq_array[ $x ][ $y ] = substr( $seq, $x, 1 ); - } - $y++; - } - elsif ( &isRFline( $return_line ) ) { - $saw_rf_line = 1; - $return_line =~ /\s+(\S+)\s*$/; - $seq = $1; - $x_offset = length( $seq ); - $rf_y = $y; - for ( $x = 0; $x < $x_offset; $x++ ) { - $seq_array[ $x ][ $rf_y ] = substr( $seq, $x, 1 ); - } - last; - } - - $return_line = ; - - if ( !$return_line ) { - &dieWithUnexpectedError( "Alignment not in expected format (no RF line)" ); - } - } - - if ( $saw_rf_line != 1 ) { - &dieWithUnexpectedError( "Alignment not in expected format (no RF line)" ); - } - - $y = 0; - $max_x = 0; - - # This reads all blocks after the 1st one. - while ( $return_line = ) { - if ( &isPfamSequenceLine( $return_line ) ) { - $return_line =~ /^\S+\s+(\S+)/; - $seq = $1; - for ( $x = 0; $x < length( $seq ); $x++ ) { - $seq_array[ $x + $x_offset ][ $y % $rf_y ] = substr( $seq, $x, 1 ); - } - $y++; - } - elsif ( &isRFline( $return_line ) ) { - if ( $y != $rf_y ) { - &dieWithUnexpectedError( "Alignment not in expected format" ); - } - - $return_line =~ /\s+(\S+)\s*$/; - $seq = $1; - $max_x = length( $seq ); - - for ( $x = 0; $x < length( $seq ); $x++ ) { - $seq_array[ $x + $x_offset ][ $rf_y ] = substr( $seq, $x, 1 ); - } - - $y = 0; - $x_offset = $x_offset + $max_x; - $max_x = 0; - } - } - - close( INPP ); - - # Counts the match states, and hence the number of aa in the alignment: - for ( $x = 0; $x < $x_offset; $x++ ) { - if ( !$seq_array[ $x ][ $rf_y ] ) { - &dieWithUnexpectedError( "Alignment not in expected format" ); - } - if ( $seq_array[ $x ][ $rf_y ] eq 'x' ) { - $number_colum++; - } - } - - # Writes the file: - - open( OUTPP, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" ); - print OUTPP "$rf_y $number_colum\n"; - for ( $y = 0; $y < $rf_y; $y++ ) { - print OUTPP "$seq_name[ $y ]"; - for ( $i = 0; $i < ( $LENGTH_OF_NAME - length( $seq_name[ $y ] ) ); $i++ ) { - print OUTPP " "; - } - for ( $x = 0; $x < $x_offset; $x++ ) { - if ( $seq_array[ $x ][ $rf_y ] eq 'x' ) { - if ( !$seq_array[ $x ][ $y ] ) { - &dieWithUnexpectedError( "Alignment not in expected format" ); - } - if ( $not_ensure != 1 && $seq_array[ $x ][ $y ] !~ /[A-Z]|-/ ) { - &dieWithUnexpectedError( "Alignment not in expected format (match states must only contain 'A'-'Z' or '-')" ); - } - print OUTPP "$seq_array[ $x ][ $y ]"; - } - } - print OUTPP "\n"; - } - close( OUTPP ); - - return $number_colum, $x_offset, $rf_y; - -} ## pfam2phylipMatchOnly - - - -# Returns whether the argument (a String) -# starts with a SWISS-PROT name (SEQN_SPECI). -# Last modified: 06/21/01 -sub startsWithSWISS_PROTname { - return ( $_[ 0 ] =~ /^[A-Z0-9]{1,4}_[A-Z0-9]{1,5}/ ); -} - - - -# Returns whether the argument starts with XXX.. XXXXX.. and the first -# character is not a "#". -# Last modified: 06/21/01 -sub isPfamSequenceLine { - return( !&isPfamCommentLine( $_[ 0 ] ) - && &containsPfamNamedSequence( $_[ 0 ] ) ); -} - - - -# Returns whether the argument does start with a "#". -# Last modified: 06/21/01 -sub isPfamCommentLine { - return ( $_[ 0 ] =~ /^#/ ); -} - - - -# Returns whether the argument starts with XXX XXXXX. -# Last modified: 06/21/01 -sub containsPfamNamedSequence { - return ( $_[ 0 ] =~ /^\S+\s+\S+/ ); -} - - -# Returns whether the argument starts with XXX XXXXX. -# Last modified: 06/21/01 -sub isRFline { - return ( $_[ 0 ] =~ /^#.*RF/ ); -} - - - - -# Five arguments: -# 1. pairwise distance file -# 2. number of bootstraps -# 3. randomize_input_order: 0: do not randomize input order; >=1 jumble -# 4. seed for random number generator -# 5. lower-triangular data matrix? 1: yes; no, otherwise -# Last modified: 06/08/01 -sub executeNeighbor { - my $inpwd = $_[ 0 ]; - my $bs = $_[ 1 ]; - my $rand = $_[ 2 ]; - my $s = $_[ 3 ]; - my $l = $_[ 4 ]; - my $jumble = ""; - my $multi = ""; - my $lower = ""; - - - &testForTextFilePresence( $inpwd ); - - if ( $rand >= 1 ) { - $jumble = " -J -$s"; - } - - if ( $bs >= 2 ) { - $multi = " -M -$bs -$s"; - } - if ( $l == 1 ) { - $lower = " -L"; - } - - - system( "$NEIGHBOR >/dev/null 2>&1 << ! -$inpwd$jumble$multi$lower -2 -3 -Y -!" ) - && &dieWithUnexpectedError( "Could not execute \"$NEIGHBOR $inpwd$jumble$multi$lower\"" ); - # 3: Do NOT print out tree - - - return; - -} ## executeNeighbor - - - -# Four arguments: -# 1. name of alignment file (in correct format!) -# 2. number of bootstraps -# 3. jumbles: 0: do not jumble; >=1 number of jumbles -# 4. seed for random number generator -# Last modified: 03/13/04 -sub executeProtpars { - my $alin = $_[ 0 ]; - my $bs = $_[ 1 ]; - my $rand = $_[ 2 ]; - my $s = $_[ 3 ]; - my $jumble = ""; - my $multi = ""; - - - &testForTextFilePresence( $alin ); - - if ( $bs >= 2 && $rand < 1 ) { - $rand = 1; - } - - if ( $rand >= 1 ) { - $jumble = " -J -$s -$rand"; - } - - if ( $bs >= 2 ) { - $multi = " -M -D -$bs"; - } - - - - system( "$PROTPARS 2>&1 << ! -$alin$jumble$multi -I -3 -Y -!" ) - && &dieWithUnexpectedError( "Could not execute \"$PROTPARS $alin$jumble$multi\"" ); - # 3: Do NOT print out tree - # I: Interleaved - - return; - -} ## executeProtpars - - - -# "Model of substitution" order for DQO TREE-PUZZLE 5.0: -# Auto -# m -> Dayhoff (Dayhoff et al. 1978) -# m -> JTT (Jones et al. 1992) -# m -> mtREV24 (Adachi-Hasegawa 1996) -# m -> BLOSUM62 (Henikoff-Henikoff 92) -# m -> VT (Mueller-Vingron 2000) -# m -> WAG (Whelan-Goldman 2000) -# m -> Auto -# One argument: -# matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; -# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise -# Last modified: 07/07/01 -sub setModelForPuzzle { - my $matrix_option = $_[ 0 ]; - my $matr = ""; - - if ( $matrix_option == 0 ) { # JTT - $matr = " -m -m"; - } - elsif ( $matrix_option == 2 ) { # BLOSUM 62 - $matr = " -m -m -m -m"; - } - elsif ( $matrix_option == 3 ) { # mtREV24 - $matr = " -m -m -m"; - } - elsif ( $matrix_option == 5 ) { # VT - $matr = " -m -m -m -m -m"; - } - elsif ( $matrix_option == 6 ) { # WAG - $matr = " -m -m -m -m -m -m"; - } - elsif ( $matrix_option == 7 ) { # auto - $matr = ""; - } - else { # PAM - $matr = " -m" - } - - return $matr; - -} ## setModelForPuzzle - -# One argument: -# Model of rate heterogeneity: -# 1 for "8 Gamma distributed rates" -# 2 for "Two rates (1 invariable + 1 variable)" -# 3 for "Mixed (1 invariable + 8 Gamma rates)" -# otherwise: Uniform rate -# Last modified: 09/08/03 -sub setRateHeterogeneityOptionForPuzzle { - my $rate_heterogeneity_option = $_[ 0 ]; - my $opt = ""; - - if ( $rate_heterogeneity_option == 1 ) { - $opt = " -w"; - } - elsif ( $rate_heterogeneity_option == 2 ) { - $opt = " -w -w"; - } - elsif ( $rate_heterogeneity_option == 3 ) { - $opt = " -w -w -w"; - } - else { - $opt = ""; - } - - return $opt; -} ## setRateHeterogeneityOptionForPuzzle - - -# One argument: -# Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise -# Last modified: 09/08/03 -sub setParameterEstimatesOptionForPuzzle { - my $parameter_estimates_option = $_[ 0 ]; - my $opt = ""; - - if ( $parameter_estimates_option == 1 ) { - $opt = " -e"; - } - else { - $opt = ""; - } - - return $opt; -} ## setParameterEstimatesOptionForPuzzle - - - -# Two/three/four arguments: -# 1. Name of inputfile -# 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; -# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise -# 3. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise -# 4. Model of rate heterogeneity: -# 1 for "8 Gamma distributed rates" -# 2 for "Two rates (1 invariable + 1 variable)" -# 3 for "Mixed (1 invariable + 8 Gamma rates)" -# otherwise: Uniform rate -# Last modified: 09/08/03 (added 3rd and 4th parameter) -sub executePuzzleBootstrapped { - my $in = $_[ 0 ]; - my $matrix_option = $_[ 1 ]; - my $parameter_estimates_option = $_[ 2 ]; - my $rate_heterogeneity_option = $_[ 3 ]; - - my $l = 0; - my $slen = 0; - my $counter = 0; - my $mat = ""; - my $est = ""; - my $rate = ""; - my $a = ""; - my @a = (); - - &testForTextFilePresence( $in ); - - open( GRP, "<$in" ) || die "\n\n$0: Unexpected error: Cannot open file <<$in>>: $!"; - while( ) { - if ( $_ =~ /^\s*\d+\s+\d+\s*$/ ) { - $counter++; - } - } - close( GRP ); - - $l = `cat $in | wc -l`; - $slen = $l / $counter; - - system( "split -$slen $in $in.splt." ) - && die "\n\n$0: executePuzzleDQObootstrapped: Could not execute \"split -$slen $in $in.splt.\": $!"; - - @a = <$in.splt.*>; - - $mat = setModelForPuzzle( $matrix_option ); - if ( $parameter_estimates_option ) { - $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option ); - } - if ( $rate_heterogeneity_option ) { - $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option ); - } - - foreach $a ( @a ) { - print "-".$a."\n"; - system( "$PUZZLE $a << ! -k -k$mat$est$rate -y -!" ) - && die "$0: Could not execute \"$PUZZLE $a\""; - - system( "cat $a.dist >> $in.dist" ) - && die "$0: Could not execute \"cat outdist >> $in.dist\""; - - unlink( $a, $a.".dist", $a.".tree" ); - } - - return; - -} ## executePuzzleBootstrapped - - - - - -# Two/three/four arguments: -# 1. Name of inputfile -# 2. Matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; -# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise -# 3. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise -# 4. Model of rate heterogeneity: -# 1 for "8 Gamma distributed rates" -# 2 for "Two rates (1 invariable + 1 variable)" -# 3 for "Mixed (1 invariable + 8 Gamma rates)" -# otherwise: Uniform rate -# Last modified: 09/08/03 (added 3rd and 4th parameter) -sub executePuzzle { - my $in = $_[ 0 ]; - my $matrix_option = $_[ 1 ]; - my $parameter_estimates_option = $_[ 2 ]; - my $rate_heterogeneity_option = $_[ 3 ]; - my $mat = ""; - my $est = ""; - my $rate = ""; - - &testForTextFilePresence( $in ); - - $mat = &setModelForPuzzle( $matrix_option ); - if ( $parameter_estimates_option ) { - $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option ); - } - if ( $rate_heterogeneity_option ) { - $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option ); - } - - - system( "$PUZZLE $in << ! -k -k$mat$est$rate -y -!" ) - && die "$0: Could not execute \"$PUZZLE\""; - - return; - -} ## executePuzzle - - - - -# Preparation of the pwd file -sub addDistsToQueryToPWDfile { - my $pwd_file = $_[ 0 ]; - my $disttoquery_file = $_[ 1 ]; - my $outfile = $_[ 2 ]; - my $name_of_query = $_[ 3 ]; - my $name_of_query_ = ""; - my $return_line_pwd = ""; - my $return_line_dq = ""; - my $num_of_sqs = 0; - my $block = 0; - my $name_from_pwd = "X"; - my $name_from_dq = "Y"; - my @dists_to_query = (); - my $i = 0; - - &testForTextFilePresence( $pwd_file ); - &testForTextFilePresence( $disttoquery_file ); - - $name_of_query_ = $name_of_query; - for ( my $j = 0; $j <= ( $LENGTH_OF_NAME - length( $name_of_query ) - 1 ); ++$j ) { - $name_of_query_ .= " "; - } - - open( OUT_AD, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" ); - open( IN_PWD, "$pwd_file" ) || &dieWithUnexpectedError( "Cannot open file \"$pwd_file\"" ); - open( IN_DQ, "$disttoquery_file" ) || &dieWithUnexpectedError( "Cannot open file \"$disttoquery_file\"" ); - - W: while ( $return_line_pwd = ) { - - - if ( $return_line_pwd =~ /^\s*(\d+)\s*$/ ) { - $num_of_sqs = $1; - $num_of_sqs++; - if ( $block > 0 ) { - print OUT_AD "$name_of_query_ "; - for ( my $j = 0; $j < $i; ++$j ) { - print OUT_AD "$dists_to_query[ $j ] "; - } - print OUT_AD "0.0\n"; - } - print OUT_AD " $num_of_sqs\n"; - $block++; - @dists_to_query = (); - $i = 0; - } - - if ( $block == 1 - && $return_line_pwd =~ /^\s*(\S+)\s+\S+/ ) { - $name_from_pwd = $1; - - if ( !defined( $return_line_dq = ) ) { - &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" ); - } - - if ( $return_line_dq !~ /\S/ ) { - if ( !defined( $return_line_dq = ) ) { - &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" ); - } - } - $return_line_dq =~ /^\s*(\S+)\s+(\S+)/; - $name_from_dq = $1; - $dists_to_query[ $i++ ] = $2; - - - if ( $name_from_pwd ne $name_from_dq ) { - &dieWithUnexpectedError( "Order of sequence names in \"$pwd_file\" and \"$disttoquery_file\" is not the same" ); - } - print OUT_AD $return_line_pwd; - - } - elsif ( $block > 1 - && $return_line_pwd =~ /^\s*(\S+)\s+\S+/ ) { - $name_from_pwd = $1; - if ( !defined( $return_line_dq = ) ) { - &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" ); - } - if ( $return_line_dq !~ /\S/ ) { - if ( !defined( $return_line_dq = ) ) { - &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" ); - } - } - $return_line_dq =~ /^\s*\S+\s+(\S+)/; - $dists_to_query[ $i++ ] = $1; - print OUT_AD $return_line_pwd; - } - } - print OUT_AD "$name_of_query_ "; - for ( my $j = 0; $j < $i; ++$j ) { - print OUT_AD "$dists_to_query[ $j ] "; - } - print OUT_AD "0.0\n"; - - close( OUT_AD ); - close( IN_PWD ); - close( IN_DQ ); - return $block; - -} ## addDistsToQueryToPWDfile - - - - -# Three arguments: -# 1. HMMER model db -# 2. name of HMM -# 3. outputfile name -# Last modified: 02/27/01 -sub executeHmmfetch { - - my $db = $_[ 0 ]; - my $name = $_[ 1 ]; - my $outfile = $_[ 2 ]; - - system( "$HMMFETCH $db $name > $outfile" ) - && &dieWithUnexpectedError( "Could not execute \"$HMMFETCH $db $name > $outfile\"" ); - return; - -} ## executeHmmfetch - - - -# Checks wether a file is present, not empty and a plain textfile. -# One argument: name of file. -# Last modified: 07/07/01 -sub testForTextFilePresence { - my $file = $_[ 0 ]; - unless ( ( -s $file ) && ( -f $file ) && ( -T $file ) ) { - dieWithUnexpectedError( "File \"$file\" does not exist, is empty, or is not a plain textfile" ); - } -} ## testForTextFilePresence - - -# Last modified: 02/21/03 -sub addSlashAtEndIfNotPresent { - my $filename = $_[ 0 ]; - $filename =~ s/\s+//g; - unless ( $filename =~ /\/$/ ) { - $filename = $filename."/"; - } - return $filename; -} ## addSlashAtEndIfNotPresent - - - -# Last modified: 02/15/02 -sub exitWithWarning { - - my $text = $_[ 0 ]; - if ( defined( $_[ 1 ] ) && $_[ 1 ] == 1 ) { - print( "

user error

\n" ); - print( "

\n" ); - print( "$text\n" ); - print( "

\n" ); - print( "

\n" ); - } - else { - print( "\n\n$text\n\n" ); - } - - exit( 0 ); - -} ## exit_with_warning - - - -# Last modified: 02/15/02 -sub dieWithUnexpectedError { - - my $text = $_[ 0 ]; - - die( "\n\n$0:\nUnexpected error (should not have happened):\n$text\n$!\n\n" ); - -} ## dieWithUnexpectedError - - - -1; diff --git a/forester/archive/perl/rio_slave.pl b/forester/archive/perl/rio_slave.pl deleted file mode 100755 index 94a0e56..0000000 --- a/forester/archive/perl/rio_slave.pl +++ /dev/null @@ -1,160 +0,0 @@ -#!/usr/bin/perl -W - -# rio_slave.pl -# ------------ -# -# Copyright (C) 2002 Washington University School of Medicine -# and Howard Hughes Medical Institute -# All rights reserved -# -# Created: 01/18/02 -# Author: Christian M. Zmasek -# zmasek@genetics.wustl.edu -# http://www.genetics.wustl.edu/eddy/people/zmasek/ -# -# Last modified: 02/20/02 - - -# Arguments: - -# 0: first block in multiple alignment to process -# 1: last block in multiple alignment to process -# 2: name of resampled alignment, inc. query -# 3: matrix number -# 4: name of query -# 5: PWD file -# 6: seed for random number generator for neighbor -# 7: node number -# 8: temp dir - - -use strict; - -use FindBin; -use lib $FindBin::Bin; -use rio_module; - -if ( @ARGV != 9 ) { - &dieWithUnexpectedError( "argument count is off" ); -} - -my $start = $ARGV[ 0 ]; -my $end = $ARGV[ 1 ]; -my $align = $ARGV[ 2 ]; -my $matrix_n = $ARGV[ 3 ]; -my $name = $ARGV[ 4 ]; -my $pwd_file = $ARGV[ 5 ]; -my $seed = $ARGV[ 6 ]; -my $number = $ARGV[ 7 ]; -my $temp_dir = $ARGV[ 8 ]; - -my $b = 0; -my $outfile = ""; -my $mytemp_dir = $temp_dir."/dir_".$number; - -mkdir( $mytemp_dir, 0700 ) -|| &dieWithUnexpectedError( "Could not create \"$mytemp_dir\"" ); - -unless ( ( -e $mytemp_dir ) && ( -d $mytemp_dir ) ) { - &dieWithUnexpectedError( "\"$mytemp_dir\" does not exist, or is not a directory" ); -} - - -&executePuzzleDQObootstrapped( $align, $matrix_n ); - -system( "mv", $align.".dist", $mytemp_dir."/DISTs_TO_QUERY" ) -&& &dieWithUnexpectedError( "could not mv" ); - -unlink( $align ); - -sleep( 2 ); - -÷PWDfile( $pwd_file, - $mytemp_dir."/DIVIDED", - $start, - $end ); - -&addDistsToQueryToPWDfile( $mytemp_dir."/DIVIDED", - $mytemp_dir."/DISTs_TO_QUERY", - $mytemp_dir."/PWD_INC_QUERY", - $name ); - -unlink( $mytemp_dir."/DIVIDED" ); - -$b = $end - $start + 1; - -chdir ( $mytemp_dir ) -|| &dieWithUnexpectedError( "Could not chdir to \"$mytemp_dir\"" ); - -&executeNeighbor( $mytemp_dir."/PWD_INC_QUERY", - $b, - 1, # randomize input order - $seed, - 1 ); # lower-triangular data matrix - - -unlink( "outfile", $mytemp_dir."/PWD_INC_QUERY", $mytemp_dir."/DISTs_TO_QUERY" ); - -system( "mv", "outtree", "../MAKETREEOUT".$MULTIPLE_TREES_FILE_SUFFIX.$number ) -&& &dieWithUnexpectedError( "could not mv" ); - -sleep( 1 ); - -chdir( ".." ) -|| &dieWithUnexpectedError( "Could not chdir to \"..\"" ); - -rmdir( $mytemp_dir ) || &dieWithUnexpectedError( "Could not delete \"$mytemp_dir\"" ); - -$outfile = "FINISHED_$number"; - -open( OUT, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" ); -close( OUT ); - -exit( 0 ); - - - - -sub dividePWDfile { - my $pwd_file = $_[ 0 ]; - my $outfile = $_[ 1 ]; - my $start = $_[ 2 ]; # e.g. 0 - my $end = $_[ 3 ]; # e.g. 9 - - my $c = 0; - my $write = 0; - my $return_line = ""; - - &testForTextFilePresence( $pwd_file ); - - open( IN_PWD, "$pwd_file" ) || &dieWithUnexpectedError( "Cannot open file \"$pwd_file\"" ); - open( OUT_PWD, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" ); - - while ( $return_line = ) { - if ( $return_line =~ /^\s*(\d+)\s*$/ ) { - if ( $c >= $start && $c <= $end ) { - $write = 1; - } - elsif ( $c > $end ) { - last; - } - $c++; - } - if ( $write == 1 ) { - print OUT_PWD $return_line; - } - } - - close( IN_PWD ); - close( OUT_PWD ); - - return; - -} ## dividePWDfile - - - - - - - diff --git a/forester/archive/perl/rio_slave_driver.pl b/forester/archive/perl/rio_slave_driver.pl deleted file mode 100755 index 3e82e4e..0000000 --- a/forester/archive/perl/rio_slave_driver.pl +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/perl -W - -# rio_slave_driver.pl -# ------------------- -# -# Copyright (C) 2002 Washington University School of Medicine -# and Howard Hughes Medical Institute -# All rights reserved -# -# Created: 01/18/02 -# Author: Christian M. Zmasek -# zmasek@genetics.wustl.edu -# http://www.genetics.wustl.edu/eddy/people/zmasek/ -# -# Last modified: 02/20/02 - - -# 0: block size -# 1: number of blocks which have a size of block size + 1 -# 2: name of resampled alignment, inc. query -# 3: matrix number -# 4: name of query -# 5: PWD file -# 6: temp dir -# 7: seed for random number generator for neighbor -# 8...: list of node names - - - - -use strict; - -use FindBin; -use lib $FindBin::Bin; -use rio_module; - -if ( @ARGV < 9 ) { - &dieWithUnexpectedError( "argumnet count off" ); -} - - -my $block_size = shift( @ARGV ); -my $larger_blocks = shift( @ARGV ); -my $align = shift( @ARGV ); -my $matrix_n = shift( @ARGV ); -my $name = shift( @ARGV ); -my $pwd_file = shift( @ARGV ); -my $temp_dir = shift( @ARGV ); -my $seed = shift( @ARGV ); -my @nodelist = @ARGV; -my $start = 0; -my $end = 0; -my $x = 0; -my $node = ""; - - -$start = 0; - -if ( $larger_blocks > 0 ) { - $end = $block_size; -} -else { - $end = $block_size - 1; -} - -for ( $x = 0; $x < scalar( @nodelist ); $x++ ) { - my $child_pid; - $node = $nodelist[ $x ]; - - if ( !defined( $child_pid = fork() ) ) { - &dieWithUnexpectedError( "cannot fork" ); - } - elsif ( $child_pid ) { - # I'm the parent, forking off $nodelist number of children - } - else { - exec( "ssh", - $node, - "/usr/bin/perl", - $RIO_SLAVE, - $start, - $end, - $align.$x, - $matrix_n, - $name, - $pwd_file, - $seed, - $x, - $temp_dir ) - || &dieWithUnexpectedError( "could not \"exec ssh $node /usr/bin/perl $RIO_SLAVE\"" ); - - } - $larger_blocks--; - if ( $larger_blocks > 0 ) { - $start += ( $block_size + 1 ); - $end += ( $block_size + 1 ); - } - elsif ( $larger_blocks == 0 ) { - $start += ( $block_size + 1 ); - $end += $block_size; - } - else { - $start += $block_size; - $end += $block_size; - } -} - -exit( 0 ); diff --git a/forester/archive/perl/xt.pl b/forester/archive/perl/xt.pl deleted file mode 100755 index 4fc5da3..0000000 --- a/forester/archive/perl/xt.pl +++ /dev/null @@ -1,640 +0,0 @@ -#!/usr/bin/perl -W - -# xt.pl -# ----- -# -# Copyright (C) 2003 Christian M. Zmasek -# All rights reserved -# -# Author: Christian M. Zmasek -# zmasek@genetics.wustl.edu -# http://www.genetics.wustl.edu/eddy/people/zmasek/ -# -# Version: 1.010 -# Last modified 03/25/03 -# -# -# -# Calculates trees based on Pfam alignments or precalculated distances using -# makeTree.pl. - - -use strict; -use FindBin; -use lib $FindBin::Bin; -use rio_module; - - -# To use _your_ species list make $MY_SPECIES_NAMES_FILE point to it. -# To use _your_ TrEMBL ACDEOS make $MY_TREMBL_ACDEOS_FILE point to it. - -my $MY_SPECIES_NAMES_FILE = $SPECIES_NAMES_FILE; # $SPECIES_NAMES_FILE is inherited - # from rio_module.pm - -my $MY_TREMBL_ACDEOS_FILE = $TREMBL_ACDEOS_FILE; # $TREMBL_ACDEOS_FILE is inherited - # from rio_module.pm - -my $MY_TEMP_DIR = $TEMP_DIR_DEFAULT; # $TEMP_DIR_DEFAULT is inherited - # from rio_module.pm - -my $LOGFILE = "00_xt_logfile"; -my $PWD_SUFFIX = ".pwd"; -my $ALN_SUFFIX = ".aln"; - -my $use_precalc_pwd = 0; # 0: input is Pfam aligments ($input_dir must point to "/Pfam/Full/"). - # 1: input is precalculated pairwise distancs ($input_dir must point to ""). -my $use_precalc_pwd_and_aln = 0;# 0: otherwise - # 1: input is precalculated pairwise distancs - # _and_ alns,$use_precalc_pwd = 1 ($input_dir must point to alns). -my $add_species = 0; # "I": 0: do nothing with species information. - # "S": 1: add species code to TrEMBL sequences and ignore sequences from - # species not in $MY_SPECIES_NAMES_FILE (only if input is Pfam aligments). -my $options = ""; # Options for makeTree.pl, see makeTree.pl. - # Do not use F [Pairwise distance (pwd) file as input (instead of alignment)] - # since this is determined with $USE_PRECALC_PWD -my $min_seqs = 0; # Minimal number of sequences (TREE-PUZZLE needs at least four seqs). - # Ignored if $USE_PRECALC_PWD = 1 -my $max_seqs = 0; # Maximal number of sequences. - # Ignored if $USE_PRECALC_PWD = 1 -my $input_dir = ""; -my $input_dir_aln = ""; # for .aln files -my $output_dir = ""; - -my $i = 0; -my $seqs = 0; -my $filename = ""; -my @filenames = (); -my %AC_OS = (); # AC -> species name -my %Species_names_hash = (); -my $too_small = 0; -my $too_large = 0; -my $already_present = 0; -my @too_small_names = (); -my @too_large_names = (); -my @already_present_names = (); - - -# Analyzes the options: -# --------------------- - -unless ( @ARGV == 3 || @ARGV == 4 || @ARGV == 6 ) { - &printUsage(); -} - -if ( @ARGV == 3 ) { - $use_precalc_pwd = 1; - $use_precalc_pwd_and_aln = 0; - $options = $ARGV[ 0 ]; - $input_dir = $ARGV[ 1 ]; - $output_dir = $ARGV[ 2 ]; - $add_species = 0; -} -elsif ( @ARGV == 4 ) { - $use_precalc_pwd = 1; - $use_precalc_pwd_and_aln = 1; - $options = $ARGV[ 0 ]; - $input_dir = $ARGV[ 1 ]; - $input_dir_aln = $ARGV[ 2 ]; - $output_dir = $ARGV[ 3 ]; - $add_species = 0; - $input_dir_aln = &addSlashAtEndIfNotPresent( $input_dir_aln ); -} -else { - $use_precalc_pwd = 0; - $use_precalc_pwd_and_aln = 0; - $add_species = $ARGV[ 0 ]; - $options = $ARGV[ 1 ]; - $min_seqs = $ARGV[ 2 ]; - $max_seqs = $ARGV[ 3 ]; - $input_dir = $ARGV[ 4 ]; - $output_dir = $ARGV[ 5 ]; - if ( $min_seqs < 4 ) { - $min_seqs = 4; - } - if ( $add_species eq "I" ) { - $add_species = 0; - } - elsif ( $add_species eq "S" ) { - $add_species = 1; - } - else { - print( "\nFirst must be either \"I\" [Ignore species] or\n\"S\" [add Species code to TrEMBL sequences and ignore sequences from species not in $MY_SPECIES_NAMES_FILE].\n\n" ); - &printUsage(); - } -} - - - -$input_dir = &addSlashAtEndIfNotPresent( $input_dir ); -$output_dir = &addSlashAtEndIfNotPresent( $output_dir ); -$MY_TEMP_DIR = &addSlashAtEndIfNotPresent( $MY_TEMP_DIR ); - - - - -# This adds a "-" before the options for makeTree: -# ------------------------------------------------ -unless ( $options =~ /^-/ ) { - $options = "-".$options; -} - - - -# If based on pwd, species are "fixed" and certain options for makeTree -# are not applicable and option "F" is mandatory: -# --------------------------------------------------------------------- -if ( $use_precalc_pwd == 1 ) { - $options =~ s/D//g; - $options =~ s/C//g; - $options =~ s/N//g; - unless ( $options =~ /F/ ) { - $options = $options."F"; - } -} -else { - $options =~ s/F//g; -} - -if ( $use_precalc_pwd_and_aln == 1 ) { - unless ( $options =~ /U/ ) { - $options = $options."U"; - } -} -if ( $use_precalc_pwd_and_aln == 0 && $use_precalc_pwd == 1 ) { - $options =~ s/U//g; -} - - - - -# If species are to be considered, speices names file and TrEMBL ACDEOS -# files need to be read in: -# --------------------------------------------------------------------- -if ( $add_species == 1 ) { - print "\nXT.PL: Reading species names file...\n"; - &readSpeciesNamesFile( $MY_SPECIES_NAMES_FILE ); - print "\nXT.PL: Reading TrEMBL ACDEOS file...\n"; - &readTrEMBL_ACDEOS_FILE( $MY_TREMBL_ACDEOS_FILE ); -} - - - -# This creates the temp file: -# -------------------------- - -my $time = time; -my $ii = 0; - -my $temp_file = $MY_TEMP_DIR."xt".$time.$ii; - -while ( -e $temp_file ) { - $ii++; - $temp_file = $MY_TEMP_DIR."xt".$time.$ii; -} - - - -&startLogfile(); - -opendir( DIR, $input_dir ) || error( "Cannot open directory \"$input_dir\": $!" ); - -$i = 0; - -while( defined( $filename = readdir( DIR ) ) ) { - if ( $filename =~ /^\.\.?$/ ) { - next; - } - if ( $use_precalc_pwd == 1 && $filename !~ /$PWD_SUFFIX$/ ) { - next - } - $filenames[ $i ] = $filename; - $i++; -} - -close( DIR ); - -$i = 0; - -FOREACH: foreach $filename ( @filenames ) { - - # If the corresponding tree seems to already exists, do next one. - my $fn = $filename; - if ( $use_precalc_pwd == 1 ) { - $fn =~ s/$PWD_SUFFIX$//; - } - if ( -e "$output_dir$fn.nhx" ) { - $already_present_names[ $already_present++ ] = $fn; - next FOREACH; - } - - if ( $use_precalc_pwd != 1 ) { - - if ( $add_species == 1 ) { - - # 1. Pfam flat file name - # 2. outfile name - # Returns the number of sequences in the resulting alignment. - $seqs = &removeSeqsFromPfamAlign( $input_dir.$filename, $temp_file ); - - } - else { - # Gets the number of seqs in the alignment. - open( F, "$input_dir"."$filename" ); - while( ) { - if ( $_ =~/^#.+SQ\s+(\d+)\s*$/ ) { - $seqs = $1; - last; - } - } - close( F ); - } - - if ( $seqs < $min_seqs ) { - $too_small_names[ $too_small++ ] = $filename; - next FOREACH; - } - if ( $seqs > $max_seqs ) { - $too_large_names [ $too_large++ ] = $filename; - next FOREACH; - } - } - - print "\n\n\n\n"; - print "XT.PL\n"; - if ( $use_precalc_pwd == 1 ) { - print "working on: $filename\n"; - } - else { - print "working on: $filename [$seqs seqs]\n"; - } - print "[tree calculation $i]\n"; - print "=====================================================================\n\n\n"; - - - unlink( "$output_dir$filename.aln", "$output_dir$filename.log" ); - - print( "XT.PL: executing:\n" ); - - my $inputfile = ""; - - if ( $add_species == 1 ) { - $inputfile = $temp_file; - } - else { - $inputfile = $input_dir.$filename; - } - - if ( $use_precalc_pwd == 1 ) { - $filename =~ s/$PWD_SUFFIX$//; - } - - if ( $use_precalc_pwd_and_aln == 1 ) { - $inputfile = $inputfile." ".$input_dir_aln.$filename.$ALN_SUFFIX; - } - - my $command = "$MAKETREE $options $inputfile $output_dir$filename.nhx"; - - print( "$command\n" ); - system( $command ) && &error( "Could not execute \"$command\"" ); - - if ( $add_species == 1 ) { - if ( unlink( $temp_file ) != 1 ) { - &error( "Unexpected: Could not delete \"$temp_file\"" ); - } - } - - $i++; - -} - -&finishLogfile(); - -print( "\n\n\nXT.PL: Done!\n" ); -print( "Wrote \"$LOGFILE\".\n\n" ); - -exit( 0 ); - - - - - - -sub error{ - - my $text = $_[ 0 ]; - - print( "\nxt.pl: ERROR:\n" ); - print( "$text\n\n" ); - - exit( -1 ); - -} ## dieWithUnexpectedError - -# Similar to the method with the same name in "rio.pl". -# Removes sequences from a Pfam flat file. -# Adds species to TrEMBL seqs. -# It can remove all sequences not from species listed in a species names file. -# Two arguments: -# 1. Pfam flat file name -# 2. outfile name -# Returns the number of sequences in the resulting alignment. -# Last modified: 02/22/03 -sub removeSeqsFromPfamAlign { - my $infile = $_[ 0 ]; - my $outfile = $_[ 1 ]; - my $return_line = ""; - my $saw_sequence_line = 0; - my $number_of_seqs = 0; - my $OS = ""; - my $AC = ""; - my $i = 0; - my $length = 0; - my $seq_name = ""; - my $seq = ""; - - - open( OUT_RNSP, ">$outfile" ) || die "\n\n$0: Unexpected error: Cannot create file \"$outfile\": $!"; - open( IN_RNSP, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!"; - while ( $return_line = ) { - - if ( $saw_sequence_line == 1 - && !&containsPfamNamedSequence( $return_line ) - && !&isPfamCommentLine( $return_line ) ) { - # This is just for counting purposes. - $saw_sequence_line = 2; - } - if ( &isPfamSequenceLine( $return_line ) ) { - if ( $saw_sequence_line == 0 ) { - $saw_sequence_line = 1; - } - $return_line =~ /^\s*(\S+)\s+(\S+)/; - $seq_name = $1; - $seq = $2; - if ( !&startsWithSWISS_PROTname( $return_line ) ) { - $seq_name =~ /^(\S+)\//; - $AC = $1; - unless( exists( $AC_OS{ $AC } ) ) { - #ACs not present in "ACDEOS" file. - next; - } - $OS = $AC_OS{ $AC }; - if ( !$OS || $OS eq "" ) { - die "\n\n$0: Unexpected error: species for \"$AC\" not found.\n\n"; - } - unless( exists( $Species_names_hash{ $OS } ) ) { - next; - } - $seq_name =~ s/\//_$OS\//; - } - else { - if ( $return_line =~ /_([A-Z0-9]{1,5})\// ) { - unless( exists( $Species_names_hash{ $1 } ) ) { - next; - } - } - # remove everything whose species cannot be determined. - else { - next; - } - } - $length = length( $seq_name ); - for ( $i = 0; $i <= ( $LENGTH_OF_NAME - $length - 1 ); $i++ ) { - $seq_name .= " "; - } - $return_line = $seq_name.$seq."\n"; - } - - if ( !&isPfamCommentLine( $return_line ) ) { - print OUT_RNSP $return_line; - } - - if ( $saw_sequence_line == 1 ) { - $number_of_seqs++; - } - } ## while ( $return_line = ) - close( IN_RNSP ); - close( OUT_RNSP ); - - return $number_of_seqs; - -} ## removeSeqsFromPfamAlign - - - - - - - -# Reads in (SWISS-PROT) species names from a file. -# Names must be separated by newlines. -# Lines beginning with "#" are ignored. -# A possible "=" and everything after is ignored. -# One argument: species-names-file name -# Last modified: 04/24/01 -sub readSpeciesNamesFile { - my $infile = $_[ 0 ]; - my $return_line = ""; - my $species = ""; - - unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { - die "\n\n$0: Error: \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n"; - } - - open( IN_RSNF, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!\n\n"; - while ( $return_line = ) { - if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)/ ) { - $species = $1; - $species =~ s/=.+//; - $Species_names_hash{ $species } = ""; - } - } - close( IN_RSNF ); - - return; -} ## readSpeciesNamesFile - - - -# Last modified: 05/18/01 -sub readTrEMBL_ACDEOS_FILE { - my $infile = $_[ 0 ]; - my $return_line = ""; - - unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { - &error( "\"$infile\" does not exist, is empty, or is not a plain textfile" ); - } - # Fill up (huge) hashs. - open( HH, "$infile" ) || &error( "Unexpected error: Cannot open file \"$infile\"" ); - while ( $return_line = ) { - - if ( $return_line =~ /(\S+);[^;]*;(\S+)/ ) { - $AC_OS{ $1 } = $2; - } - } - close( HH ); -} ## readTrEMBL_ACDEOS_FILE - - - -# Last modified: 05/17/01 -sub startLogfile { - if ( -e "$LOGFILE" ) { - &error( "logfile \"$LOGFILE\" already exists, rename it or place it in another directory" ); - } - - open( L, ">$LOGFILE" ) || &error( "Cannot create logfile: $!" ); - if ( $use_precalc_pwd != 1 ) { - print L "Trees are based directly on Pfam alignments\n"; - if ( $add_species == 1 ) { - print L "Add species code to TrEMBL sequences and ignore sequences\nfrom species not in $MY_SPECIES_NAMES_FILE\n"; - } - else { - print L "Do nothing with species information\n"; - } - } - else { - print L "Trees are based on precalculated pairwise distances\n"; - } - if ( $use_precalc_pwd_and_aln == 1 ) { - print L "and the matching alignments\n"; - } - print L "Options for makeTree: $options\n"; - if ( $use_precalc_pwd != 1 ) { - print L "Min seqs : $min_seqs\n"; - print L "Max seqs : $max_seqs\n"; - } - if ( $add_species == 1 ) { - print L "TrEMBL ACDEOS file : $MY_TREMBL_ACDEOS_FILE\n"; - print L "Species names file : $MY_SPECIES_NAMES_FILE\n"; - } - print L "Input directory : $input_dir\n"; - if ( $use_precalc_pwd_and_aln == 1 ) { - print L "Input directory aln : $input_dir_aln\n"; - } - print L "Output directory : $output_dir\n"; - print L "Start date : ".`date`; - -} ## startLogfile - - - -# Last modified: 05/17/01 -sub finishLogfile { - my $j = 0; - print L "\n\n"; - print L "Successfully calculated $i trees.\n"; - if ( $use_precalc_pwd != 1 ) { - print L "Too large alignments (>$max_seqs): $too_large\n"; - print L "Too small alignments (<$min_seqs): $too_small\n"; - } - print L "Alignments for which a tree appears to already exist: $already_present\n"; - print L "Finish date : ".`date`."\n\n"; - if ( $use_precalc_pwd != 1 ) { - print L "List of the $too_large alignments which were ignored because they\n"; - print L "contained too many sequences (>$max_seqs) [after pruning]:\n"; - for ( $j = 0; $j < $too_large; ++$j ) { - print L "$too_large_names[ $j ]\n"; - } - print L "\n\n"; - print L "List of the $too_small alignments which were ignored because they\n"; - print L "contained not enough sequences (<$min_seqs) [after pruning]:\n"; - for ( $j = 0; $j < $too_small; ++$j ) { - print L "$too_small_names[ $j ]\n"; - } - } - print L "\n\n"; - print L "List of the $already_present alignments which were ignored because\n"; - print L "a tree appears to already exist:\n"; - for ( $j = 0; $j < $already_present; ++$j ) { - print L "$already_present_names[ $j ]\n"; - } - print L "\n"; - close( L ); -} ## finishLogfile - - -sub printUsage { - print "\n"; - print " xt.pl\n"; - print " _____\n"; - print " \n"; - print " Copyright (C) 2003 Christian M. Zmasek\n"; - print " All rights reserved\n"; - print "\n"; - print " Author: Christian M. Zmasek\n"; - print " zmasek\@genetics.wustl.edu\n"; - print " http://www.genetics.wustl.edu/eddy/forester/\n"; - print "\n"; - print "\n"; - print " Purpose\n"; - print " -------\n"; - print "\n"; - print " Tree construction using makeTree.pl based on directories\n"; - print " of Pfam alignments or precalculated pairwise distances.\n"; - print "\n"; - print "\n"; - print " Usage\n"; - print " -----\n"; - print "\n"; - print " Input is Pfam aligments:\n"; - print " xt.pl \n"; - print " \n"; - print "\n"; - print " Input is precalculated pairwise distancs:\n"; - print " xt.pl \n"; - print "\n"; - print " Input is precalculated pairwise distancs and corresponding alignment files:\n"; - print " xt.pl \n"; - print " \n"; - print "\n"; - print "\n"; - print " Examples\n"; - print " --------\n"; - print "\n"; - print " \"xt.pl S NS21UTRB100DX 4 200 DB/PFAM/Full/ trees/\"\n"; - print "\n"; - print " \"xt.pl FLB100R /pfam2pwd_out/ trees/\"\n"; - print "\n"; - print " \"xt.pl FULB100R /pfam2pwd_out/ /pfam2pwd_out/ trees/\"\n"; - print "\n"; - print "\n"; - print " Options\n"; - print " -------\n"; - print "\n"; - print " I: ignore species information (use all sequences)\n"; - print " S: add species codes to TrEMBL sequences and ignore sequences\n"; - print " from species not in $MY_SPECIES_NAMES_FILE,\n"; - print " species codes are extracted from $MY_TREMBL_ACDEOS_FILE\n"; - print "\n"; - print "\n"; - print " Options for makeTree\n"; - print " --------------------\n"; - print "\n"; - print " N : Suggestion to remove columns in the alignment which contain gaps.\n"; - print " Gaps are not removed, if, after removal of gaps, the resulting\n"; - print " alignment would be shorter than $MIN_NUMBER_OF_AA aa (\$MIN_NUMBER_OF_AA).\n"; - print " Default is not to remove gaps.\n"; - print " Bx : Number of bootstrapps. B0: do not bootstrap. Default: 100 bootstrapps.\n"; - print " The number of bootstrapps should be divisible by 10.\n"; - print " U : Use TREE-PUZZLE to calculate ML branchlengths for consesus tree, in case of\n"; - print " bootstrapped analysis.\n"; - print " J : Use JTT matrix (Jones et al. 1992) in TREE-PUZZLE, default: PAM.\n"; - print " L : Use BLOSUM 62 matrix (Henikoff-Henikoff 92) in TREE-PUZZLE, default: PAM.\n"; - print " M : Use mtREV24 matrix (Adachi-Hasegawa 1996) in TREE-PUZZLE, default: PAM.\n"; - print " W : Use WAG matrix (Whelan-Goldman 2000) in TREE-PUZZLE, default: PAM.\n"; - print " T : Use VT matrix (Mueller-Vingron 2000) in TREE-PUZZLE, default: PAM.\n"; - print " P : Let TREE-PUZZLE choose which matrix to use, default: PAM.\n"; - print " R : Randomize input order in PHYLIP NEIGHBOR.\n"; - print " Sx : Seed for random number generator(s). Must be 4n+1. Default is 9.\n"; - print " X : To keep multiple tree file (=trees from bootstrap resampled alignments).\n"; - print " D : To keep (and create, in case of bootstrap analysis) pairwise distance\n"; - print " matrix file. This is created form the not resampled aligment.\n"; - print " C : Calculate pairwise distances only (no tree). Bootstrap is always 1.\n"; - print " No other files are generated.\n"; - print " F : Pairwise distance (pwd) file as input (instead of alignment).\n"; - print " No -D, -C, and -N options available in this case.\n"; - print " V : Verbose\n"; - print "\n"; - exit( -1 ); - -}