+++ /dev/null
-# Makefile for bootstrap_cz
-# Last modified 06/07/01
-
-# For GNU gcc
-CFLAGS = -O2 -Wall -pedantic
-
-# This one specifies the "cc" C compiler
-#CC = cc $(CFLAGS)
-#
-# To use "gcc" instead
-CC = gcc $(CFLAGS)
-
-
-bootstrap_cz: bootstrap_cz.c
- $(CC) -o bootstrap_cz bootstrap_cz.c
-
+++ /dev/null
-/*
-# bootstrap_cz
-# ------------
-# Copyright (C) 1999-2002 Washington University School of Medicine
-# and Howard Hughes Medical Institute
-# All rights reserved
-#
-# Author: Christian M. Zmasek
-# zmasek@genetics.wustl.edu
-# http://www.genetics.wustl.edu/eddy/people/zmasek/
-#
-# Created: 06/06/01
-#
-# Last modified: 01/27/02
-#
-# Purpose:
-# Bootstrap resamples an alignment in PHYLIP sequential format <bootstraps> times.
-# Bootstrapping is not done randomly but according to a BSP (bootstrap
-# positions) file.
-# The BSP file can be created with the Perl program "bootstrap_cz.pl"
-# in mode 0.
-# This prgram has the same functionality as "bootstrap_cz.pl" in mode 1.
-# Sequence names are normalized to LENGTH_OF_NAME characters.
-# The output alignment is in PHYLIP's sequential or interleaved format.
-# (These two are the same in this case, since all the seqs will be one
-# line in length (no returns in seq).)
-#
-# Usage: bootstrap_cz <bootstraps> <alignment infile> <BSP file> <outfile>
-# [number of processors]
-*/
-
-
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <ctype.h>
-#include <string.h>
-
-
-#define LENGTH_OF_NAME 26
-
-
-static char **names, /* This stores the sequence names */
- **sequences; /* This stores the sequences */
-static int number_of_seqs,
- number_of_colm;
-
-
-void readInAlignmnet( const char * );
-void bootstrapAccordingToBSPfile( int, const char *, const char * );
-void checkForMemAllocFailure( void * );
-int fileExists( const char *);
-void errorInCommandLine();
-
-
-
-
-
-/* Reads the seqs and seq-names from inalignment */
-/* into **sequences and **sequences. */
-/* Inalignment must be in PHYLIP sequential format. */
-/* Last modified: 06/25/01 */
-void readInAlignment( const char *inalignment ) {
-
- FILE *inalignment_fp = NULL;
- char *str = NULL;
- int max_length = 0;
- register char c = ' ';
- register int i = 0,
- ii = 0,
- z = 0,
- seq = 0;
-
- number_of_seqs = 0;
- number_of_colm = 0;
-
- inalignment_fp = fopen( inalignment, "r" );
- if ( inalignment_fp == NULL ) {
- printf( "\nbootstrap_cz: Error: Could not open alignment file for reading.\n" );
- exit( -1 );
- }
-
- if ( fscanf( inalignment_fp, "%d", &number_of_seqs ) != 1 ) {
- printf( "\nbootstrap_cz: Error: Could not read in number of seqs.\n" );
- exit( -1 );
- }
- if ( fscanf( inalignment_fp, "%d", &number_of_colm ) != 1 ) {
- printf( "\nbootstrap_cz: Error: Could not read in number of columns.\n" );
- exit( -1 );
- }
-
- names = malloc( number_of_seqs * sizeof( char *) );
- checkForMemAllocFailure( names );
- for ( i = 0; i < number_of_seqs; ++i ) {
- names[ i ] = malloc( LENGTH_OF_NAME * sizeof( char ) );
- checkForMemAllocFailure( names[ i ] );
- }
-
- sequences = malloc( number_of_seqs * sizeof( char * ) );
- checkForMemAllocFailure( sequences );
- for ( i = 0; i < number_of_seqs; ++i ) {
- sequences[ i ] = malloc( number_of_colm * sizeof( char ) );
- checkForMemAllocFailure( sequences[ i ] );
- }
-
- max_length = ( 30 * LENGTH_OF_NAME ) + number_of_colm;
-
- str = malloc( max_length * sizeof( char * ) );
- checkForMemAllocFailure( str );
-
- while ( fgets( str, max_length, inalignment_fp ) != NULL ) {
-
- if ( !isspace( str[ 0 ] ) != 0 ) {
-
- i = 0;
- while ( str[ i ] != ' ' ) {
- names[ seq ][ i ] = str[ i ];
- i++;
- }
-
- ii = i;
- while ( ii < LENGTH_OF_NAME ) {
- names[ seq ][ ii ] = ' ';
- ii++;
- }
-
- z = 0;
-
- while ( str[ i ] != '\n' && str[ i ] != '\r' && str[ i ] != '\0' ) {
- c = str[ i ];
- if ( c != ' ' ) {
- if ( isupper( c ) != 0 || c == '-' ) {
- sequences[ seq ][ z++ ] = c;
- }
- else {
- printf( "\nbootstrap_cz: Error: Sequence must be represented by uppercase letters A-Z and \"-\" only.\n" );
- exit( -1 );
- }
- }
- i++;
- if ( z > number_of_colm ) {
- printf( "\nbootstrap_cz: Error: line in \"%s\" contains more than %d columns.\n",
- inalignment, number_of_colm );
- exit( -1 );
- }
- }
- if ( z != number_of_colm ) {
- printf( "\nbootstrap_cz: Error: line in \"%s\" contains a incorrect number of columns.\n",
- inalignment );
- exit( -1 );
- }
-
- seq++;
-
- if ( seq > number_of_seqs ) {
- printf( "\nbootstrap_cz: Error: \"%s\" contains more than %d seqs.\n",
- inalignment, number_of_seqs );
- exit( -1 );
- }
- }
-
-
- } /* while ( fgets ) */
-
- if ( seq != number_of_seqs ) {
- printf( "\nbootstrap_cz: Error: \"%s\" contains a incorrect number of seqs.\n",
- inalignment );
- exit( -1 );
- }
-
- fclose( inalignment_fp );
-
- return;
-
-} /* readInAlignment */
-
-
-
-/* Rearrenges the aa in sequences according to */
-/* the bsp (bootstrap positions) file bsp_file. */
-/* Writes the results to outfile */
-/* Last modified: 06/07/01 */
-void bootstrapAccordingToBSPfile( int bootstraps,
- const char *bsp_file,
- const char *outfile ) {
-
- FILE *bsp_file_fp = NULL,
- *outfile_fp = NULL;
- int *positions = NULL,
- p = 0;
- register int boot = 0,
- seq = 0,
- i = 0;
-
- positions = malloc( number_of_colm * sizeof( int ) );
- checkForMemAllocFailure( positions );
-
-
- bsp_file_fp = fopen( bsp_file, "r" );
- if ( bsp_file_fp == NULL ) {
- printf( "\nbootstrap_cz: Error: could not open file \"%s\" for reading.\n",
- bsp_file );
- exit( -1 );
- }
-
- outfile_fp = fopen( outfile, "w" );
- if ( outfile_fp == NULL ) {
- printf( "\nbootstrap_cz: Error: could not open file \"%s\" for writing.\n",
- outfile );
- exit( -1 );
- }
-
- for ( boot = 0; boot < bootstraps; ++boot ) {
-
- for ( i = 0; i < number_of_colm; ++i ) {
- if ( fscanf( bsp_file_fp, "%d", &p ) != 1 ) {
- printf( "\nbootstrap_cz: Error: file \"%s\" does not correspond to alignment.\n",
- bsp_file );
- exit( -1 );
- }
- positions[ i ] = p;
- }
-
- fprintf( outfile_fp, " %d %d\n", number_of_seqs, number_of_colm );
- for ( seq = 0; seq < number_of_seqs; ++seq ) {
- for ( i = 0; i < LENGTH_OF_NAME; ++i ) {
- fprintf( outfile_fp, "%c", names[ seq ][ i ] );
- }
- for ( i = 0; i < number_of_colm; ++i ) {
- fprintf( outfile_fp, "%c", sequences[ seq ][ positions[ i ] ] );
- }
- fprintf( outfile_fp, "\n" );
- }
- }
-
- /* Now, the bsp file must not contain any more numbers */
- if ( fscanf( bsp_file_fp, "%d", &p ) == 1 ) {
- printf( "\nbootstrap_cz: Error: file \"%s\" does not correspond to alignment (too long).\n",
- bsp_file );
- printf( ">%d<\n", p );
- printf( "number of seqs=%d\n", number_of_seqs );
- exit( -1 );
- }
-
- fclose( bsp_file_fp );
- fclose( outfile_fp );
- free( positions );
- return;
-
-} /* bootstrapAccordingToBSPfile */
-
-
-
-/* Rearrenges the aa in sequences according to */
-/* the bsp (bootstrap positions) file bsp_file. */
-/* Writes the results to outfile */
-/* Last modified: 01/25/02 */
-void bootstrapAccordingToBSPfileP( int bootstraps,
- int processors,
- const char *bsp_file,
- const char *outfile ) {
-
- FILE *bsp_file_fp = NULL,
- *outfile_fp = NULL;
- int *positions = NULL,
- p = 0;
- char *outfile_ = NULL;
- register int boot = 0,
- seq = 0,
- i = 0,
- j = 0,
- z = 0,
- flag = 0;
- int block_size = 0,
- larger_blocks = 0;
-
- block_size = ( int ) bootstraps / processors;
- larger_blocks = bootstraps - ( block_size * processors ); /* number of blocks which have a size of
- block_size + 1 */
-
- positions = malloc( number_of_colm * sizeof( int ) );
- checkForMemAllocFailure( positions );
-
- outfile_ = malloc( ( strlen( outfile ) + 20 ) * sizeof( char ) );
- checkForMemAllocFailure( outfile_ );
-
- bsp_file_fp = fopen( bsp_file, "r" );
- if ( bsp_file_fp == NULL ) {
- printf( "\nbootstrap_cz: Error: could not open file \"%s\" for reading.\n",
- bsp_file );
- exit( -1 );
- }
-
- j = -1;
- flag = 1;
- z = 0;
-
- for ( boot = 0; boot < bootstraps; ++boot ) {
-
- for ( i = 0; i < number_of_colm; ++i ) {
- if ( fscanf( bsp_file_fp, "%d", &p ) != 1 ) {
- printf( "\nbootstrap_cz: Error: file \"%s\" does not correspond to alignment.\n",
- bsp_file );
- exit( -1 );
- }
- positions[ i ] = p;
- }
-
- j++;
-
- if ( larger_blocks > 0 ) {
- if ( j >= block_size + 1 ) {
- flag = 1;
- j = 0;
- larger_blocks--;
- }
- }
- else if ( j >= block_size ) {
- flag = 1;
- j = 0;
- }
-
- if ( flag == 1 ) {
- if ( boot > 0 ) {
- fclose( outfile_fp );
- }
- sprintf( outfile_, "%s%d", outfile, z++ );
- if ( fileExists( outfile_ ) == 1 ) {
- printf( "\nbootstrap_cz: Error: outfile \"%s\" already exists.\n",
- outfile_ );
- exit( -1 );
- }
- outfile_fp = fopen( outfile_, "w" );
- if ( outfile_fp == NULL ) {
- printf( "\nbootstrap_cz: Error: could not open file \"%s\" for writing.\n",
- outfile_ );
- exit( -1 );
- }
- flag = 0;
- }
-
- fprintf( outfile_fp, " %d %d\n", number_of_seqs, number_of_colm );
- for ( seq = 0; seq < number_of_seqs; ++seq ) {
- for ( i = 0; i < LENGTH_OF_NAME; ++i ) {
- fprintf( outfile_fp, "%c", names[ seq ][ i ] );
- }
- for ( i = 0; i < number_of_colm; ++i ) {
- fprintf( outfile_fp, "%c", sequences[ seq ][ positions[ i ] ] );
- }
- fprintf( outfile_fp, "\n" );
- }
- }
-
- /* Now, the bsp file must not contain any more numbers */
- if ( fscanf( bsp_file_fp, "%d", &p ) == 1 ) {
- printf( "\nbootstrap_cz: Error: file \"%s\" does not correspond to alignment (too long).\n",
- bsp_file );
- printf( ">%d<\n", p );
- printf( "number of seqs=%d\n", number_of_seqs );
- exit( -1 );
- }
-
- fclose( bsp_file_fp );
- fclose( outfile_fp );
-
- free( positions );
- free( outfile_ );
-
- return;
-
-} /* bootstrapAccordingToBSPfileP */
-
-
-
-
-/* Exits if *p is NULL. */
-/* Last modified: 06/06/01 */
-void checkForMemAllocFailure( void *p ) {
- if ( p == NULL ) {
- printf( "\nbootstrap_cz: Memory allocation failed.\n" );
- exit( -1 );
- }
- else {
- return;
- }
-} /* checkForMemAllocFailure */
-
-
-
-/* Returns 1 if filename can be opened. */
-/* Returns 0 otherwise. */
-/* Last modified: 06/07/01 */
-int fileExists( const char *filename ) {
- FILE *fp = NULL;
- if ( ( fp = fopen( filename, "r" ) ) != NULL ) {
- fclose( fp );
- return 1;
- }
- else {
- return 0;
- }
-} /* fileExists */
-
-
-
-void errorInCommandLine() {
- printf( "\n" );
- printf( " bootstrap_cz version 3.000\n" );
- printf( " ---------------------------\n\n" );
- printf( " Purpose:\n" );
- printf( " Bootstrap resamples an alignment in PHYLIP sequential format <bootstraps> times.\n" );
- printf( " Bootstrapping is not done randomly but according to a BSP (bootstrap\n" );
- printf( " positions) file.\n" );
- printf( " The BSP file can be created with the Perl program \"bootstrap_cz.pl\"\n" );
- printf( " in mode 0.\n" );
- printf( " This prgram has the same functionality as \"bootstrap_cz.pl\" in mode 1.\n" );
- printf( " Sequence names are normalized to LENGTH_OF_NAME characters.\n" );
- printf( " The output alignment is in PHYLIP's sequential or interleaved format.\n" );
- printf( " (These two are the same in this case, since all the seqs will be one\n" );
- printf( " line in length (no returns in seq).)\n\n" );
- printf( " Usage: bootstrap_cz <bootstraps> <alignment infile> <BSP file> <outfile>\n" );
- printf( " [number of processors]\n\n" );
-} /* errorInCommandLine */
-
-
-
-int main( int argc, char *argv[] ) {
-
- char *inalign = NULL,
- *bsp_file = NULL,
- *outfile = NULL;
- int bootstraps = 0,
- processors = 0;
-
-
- if ( argc != 5 && argc != 6 ) {
- errorInCommandLine();
- exit( -1 );
- }
-
- bootstraps = atoi( argv[ 1 ] );
- inalign = argv[ 2 ];
- bsp_file = argv[ 3 ];
- outfile = argv[ 4 ];
-
- if ( bootstraps < 1 ) {
- errorInCommandLine();
- exit( -1 );
- }
-
- if ( argc == 6 ) {
- processors = atoi( argv[ 5 ] );
- if ( processors < 1 ) {
- errorInCommandLine();
- exit( -1 );
- }
- if ( processors > bootstraps ) {
- processors = bootstraps;
- }
- }
-
- if ( argc == 5 && fileExists( outfile ) == 1 ) {
- printf( "\nbootstrap_cz: Error: outfile \"%s\" already exists.\n",
- outfile );
- exit( -1 );
- }
-
- readInAlignment( inalign );
-
- if ( argc == 5 ) {
- bootstrapAccordingToBSPfile( bootstraps,
- bsp_file,
- outfile );
- }
- else {
- bootstrapAccordingToBSPfileP( bootstraps,
- processors,
- bsp_file,
- outfile );
- }
-
- return 0;
-
-} /* main */
+++ /dev/null
-Overview of the Perl scripts in this directory
-----------------------------------------------
-
-This directory contains a collection of (mostly horrible)
-Perl scripts. Some of them are still maintained, such as
-phylo_pl.pl.
-
-Some of the scripts in this directory relay heavily
-on forester.pm.
-
-
-
-RIO pipeline:
-- rio.pl
-- makeTree.pl
-- p7extract.pl
-- multifetch.pl
-
-
-Running a parallelized RIO web server:
-- nph-riowebserver
-- rio_slave.pl
-- rio_slave_driver.pl
-- queue.pm
-
-
-To prepare data to be used by RIO:
-- bootstrap_cz.pl
-- pfam2slx.pl
-- extractSWISS-PROT.pl
-- extractTrembl.pl
-- pfam2pwd.pl
-
-
-To run multiple RIO analyses in an automated fashion:
-- Xrio.pl
-
-
-To analyze RIO results (of Xrio.pl runs):
-- bootstrapCounter.pl
-- bootstrapSelector.pl
-- diffFinder3.pl
-
-
-Counting of species in SWISS-PROT and TrEMBL:
-- countSpeciesSPTrEMBL.pl
-
-
+++ /dev/null
-#!/usr/bin/perl -w
-#
-# Xrio.pl
-# -------
-# Copyright (C) 1999-2001 Washington University School of Medicine
-# and Howard Hughes Medical Institute
-# All rights reserved
-#
-# Author: Christian M. Zmasek
-# zmasek@genetics.wustl.edu
-# http://www.genetics.wustl.edu/eddy/people/zmasek/
-#
-# Created: 03/01/01
-#
-# Last modified 06/22/01
-
-
-# Objective. Runs "rio.pl" for each Pfam assignment in "infile".
-#
-# Usage. rio.pl <infile> <species names file> <output directory> <outfile> <logfile>
-#
-# species names file: list of species to use for analysis.
-#
-# This version uses the CE number as identifier for output files.\n";
-#
-# Format for infile:
-#
-# >>3R5.2 CE19648 (CAMBRIDGE) TR:Q9XWB1 protein_id:CAA21778.1
-# //
-#
-# >>4R79.1 CE19649 Zinc-binding metalloprotease domain (CAMBRIDGE) protein_id:CAB63429.1
-# =Astacin Astacin (Peptidase family M12A) 296.3 3.8e-85 1
-# //
-#
-# >>4R79.2 CE19650 Ras family (CAMBRIDGE) TR:Q9XXA4 protein_id:CAA20282.1
-# =ras Ras family 208.8 8.1e-59 1
-# =FA_desaturase Fatty acid desaturase 4.5 1.5 1
-# =UPF0117 Domain of unknown function DUF36 3.1 3.5 1
-# =arf ADP-ribosylation factor family -46.0 1.5e-05 1
-# //
-#
-#
-#
-
-# Xrio.pl /nfs/wol2/people/zmasek/wormpep43_hmmpfam6.2/wormpep43_Hmmpfam_6.2 /nfs/wol2/people/zmasek/species_trees/tree_of_life_bin_1-4_species_list /nfs/wol2/people/zmasek/XrioTEST3 /nfs/wol2/people/zmasek/XrioTEST3/OUTFILE1 /nfs/wol2/people/zmasek/XrioTEST3/LOG1
-
-
-
-
-
-use strict;
-
-use FindBin;
-use lib $FindBin::Bin;
-use rio_module;
-
- $RIO_PL = "rio.pl";
-my $VERSION = "3.000";
-
-my $FASTA_DB = "/nfs/wol2/people/zmasek/DB/wormpep/wormpep43";
-my $QUERY_SPECIES = "CAEEL";
-my $SPECIES_TREE = $SPECIES_TREE_FILE_DEFAULT;
-
-my $RIOPL_OPTIONS = "T=B P=6 L=0 R=0 U=80 V=0 X=2 Y=2 Z=2 C E I";
-
-my $TEMP_DIR = "/tmp/Xriopl"; # Where all the temp files, etc will be created.
-
-my %Species_names_hash = ();
-
-my $infile = "";
-my $outfile = ""; # Huge file of all rio outputs.
-my $logfile = ""; # Lists all sequences which have been analyzed successfully.
-my $output_directory = "";
-my $species_names_file = "";
-
-
-my $return_line = "";
-my $ID = "";
-my $pfam_name = "";
-my $E_value = 0;
-my $score = 0;
-my $GA = 0;
-my $temp_dir = "";
-my $outname = "";
-my %outnames = ();
-my $seqs = 0;
-my $ii = 0;
-my $time = 0;
-my $successful = 0;
-my $query_not_aligned = 0;
-my $pwd_not_present = 0;
-my $already_done = 0;
-my $start_date = "";
-my $old_fh = "";
-my %AC_OS = (); # AC -> species name for TrEMBL seqs
-my %AC_DE = (); # AC -> description for TrEMBL seqs
-
-my $description_line = "";
-my $message1 = "";
-my $message2 = "";
-
-$start_date = `date`;
-
-if ( @ARGV != 5 ) {
- &errorInCommandLine();
- exit ( -1 );
-}
-
-$infile = $ARGV[ 0 ];
-$species_names_file = $ARGV[ 1 ];
-$output_directory = $ARGV[ 2 ];
-$outfile = $ARGV[ 3 ];
-$logfile = $ARGV[ 4 ];
-
-
-if ( -e $outfile ) {
- die "\n\n$0: <<$outfile>> already exists.\n\n";
-}
-unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
- die "\n\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n\n";
-}
-unless ( ( -s $species_names_file ) && ( -f $species_names_file ) && ( -T $species_names_file ) ) {
- die "\n\n$0: <<$species_names_file>> does not exist, is empty, or is not a plain textfile.\n\n";
-}
-unless ( ( -s $TREMBL_ACDEOS_FILE ) && ( -f $TREMBL_ACDEOS_FILE ) && ( -T $TREMBL_ACDEOS_FILE ) ) {
- die "\n\n$0: <<$TREMBL_ACDEOS_FILE>> does not exist, is empty, or is not a plain textfile.\n\n";
-}
-unless ( ( -e $output_directory ) && ( -d $output_directory ) ) {
- die "\n\n$0: <<$output_directory>> does not exist, or is not a directory.\n\n";
-}
-
-
-
-# Reads in the species file:
-# --------------------------
-&readSpeciesNamesFile( $species_names_file );
-
-
-
-# Reads in the file containing AC, DE and OS for TrEMBL seqs:
-# -----------------------------------------------------------
-open( HH, "$TREMBL_ACDEOS_FILE" ) || die "\n\n$0: Unexpected error: Cannot open file <<$TREMBL_ACDEOS_FILE>>: $!\n\n";
-while ( $return_line = <HH> ) {
- if ( $return_line =~ /(\S+);([^;]*);(\S+)/ ) {
- $AC_OS{ $1 } = $3;
- $AC_DE{ $1 } = $2;
- }
-}
-close( HH );
-
-
-
-# Reads in outnames in logfile, if present:
-# -----------------------------------------
-if ( ( -s $logfile ) ) {
- open( L, "$logfile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$logfile>>: $!\n\n";
- while ( $return_line = <L> ) {
- if ( $return_line =~ /\s*(\S+)/ ) {
- $outnames{ $1 } = 0;
- }
- }
- close( L );
-}
-
-
-
-# Creates the temp directory:
-# ---------------------------
-
-$ii = 0;
-
-$time = time;
-
-$temp_dir = $TEMP_DIR.$time.$ii;
-
-while ( -e $temp_dir ) {
- $ii++;
- $temp_dir = $TEMP_DIR.$time.$ii;
-}
-
-mkdir( $temp_dir, 0777 )
-|| die "\n\n$0:Unexpected error: Could not create <<$temp_dir>>: $!\n\n";
-
-unless ( ( -e $temp_dir ) && ( -d $temp_dir ) ) {
- die "\n\n$0:Unexpected error: <<$temp_dir>> does not exist, or is not a directory: $!\n\n";
-}
-
-
-
-$message1 = "# $0\n".
- "# Version : $VERSION\n".
- "# Date started : $start_date".
- "# Infile : $infile\n".
- "# Species names file : $species_names_file\n".
- "# Output directory : $output_directory\n".
- "# Outfile : $outfile\n".
- "# RIO PWD directory : $RIO_PWD_DIRECTORY\n".
- "# RIO BSP directory : $RIO_BSP_DIRECTORY\n".
- "# RIO NBD directory : $RIO_NBD_DIRECTORY\n".
- "# RIO ALN directory : $RIO_ALN_DIRECTORY\n".
- "# RIO HMM directory : $RIO_HMM_DIRECTORY\n".
- "# Fasta db : $FASTA_DB\n".
- "# Species of query : $QUERY_SPECIES\n".
- "# Species tree : $SPECIES_TREE\n".
- "# rio.pl options : $RIOPL_OPTIONS\n\n\n";
-
-open( IN, "$infile" ) || die "\n\n$0: Cannot open file <<$infile>>: $!\n\n";
-open( LOG, ">> $logfile" ) || die "\n\n$0: Cannot open file <<$logfile>>: $!\n\n";
-
-
-# Turns off buffering for LOG.
-$old_fh = select( LOG );
-$| = 1;
-select( $old_fh );
-
-
-$ID = "";
-
-W: while ( $return_line = <IN> ) {
-
- if ( $return_line =~ /^\s*>>.*(CE\d+)/ ) {
- $ID = $1;
- $return_line =~ /^\s*>>(.+)/;
- $description_line = $1;
- }
- elsif ( $return_line =~ /^\s*\/\// ) {
- $ID = "";
- }
- elsif ( $return_line =~ /^\s*=(\S+)\s+.+\s+(\S+)\s+(\S+)\s+\S+\s*$/
- && $ID ne "" ) {
-
- $pfam_name = $1;
- $score = $2;
- $E_value = $3;
-
- $outname = $ID.".".$pfam_name;
-
- # Checks if already done.
- if ( %outnames && exists( $outnames{ $outname } ) ) {
- $already_done++;
- next W;
- }
-
- &executeHmmfetch( $PFAM_HMM_DB, $pfam_name, $temp_dir."/HMMFILE" );
-
- $GA = &getGA1cutoff( $temp_dir."/HMMFILE" );
- unlink( $temp_dir."/HMMFILE" );
-
- if ( $GA == 2000 ) {
- die "\n\n$0: Unexpected error: No GA cutoff found for \"$pfam_name\".\n\n";
- }
- elsif ( $score < $GA ) {
- next W;
- }
-
- if ( -s $output_directory."/".$outname ) {
- unlink( $output_directory."/".$outname );
- }
-
-
- $message1 .= "\n\n".
- "# ############################################################################\n".
- "# Annotation: $description_line\n".
- "# HMM : $pfam_name\n".
- "# score : $score\n".
- "# E-value : $E_value\n";
-
-
-
- unless ( ( -s $RIO_PWD_DIRECTORY.$pfam_name.$SUFFIX_PWD ) ) {
- $pwd_not_present++;
- $message1 .= "# No PWD file for this family.\n".
- "# ############################################################################\n";
- next W;
- }
-
-
- unless ( ( -s $PFAM_SEED_DIRECTORY."/".$pfam_name ) && ( -f $PFAM_SEED_DIRECTORY."/".$pfam_name ) && ( -T $PFAM_SEED_DIRECTORY."/".$pfam_name ) ) {
- die "\n\n$0: Error: Pfam seed alignment <<$PFAM_SEED_DIRECTORY"."/"."$pfam_name>> not present.\n\n";
- }
-
-
- &getSequenceFromFastaFile( $FASTA_DB,
- $temp_dir."/QUERY",
- $ID );
-
- &performRIO( $pfam_name, # A=
- $temp_dir."/QUERY", # Q=
- $output_directory."/".$outname, # O=
- $ID."_".$QUERY_SPECIES, # N=
- $SPECIES_TREE, # S=
- $RIOPL_OPTIONS, # L=0 R=0 U=70 V=0 X=2 Y=2 Z=2 C E K I x
- $temp_dir."/riopltempdir" ); # j=
-
-
-
- if ( -s $output_directory."/".$outname ) {
- $successful++;
- }
- else {
- $message1 .= "# Query has not been aligned (E value too low).\n".
- "# ############################################################################\n";
- $query_not_aligned++;
- }
-
- if ( unlink( $temp_dir."/QUERY" ) != 1 ) {
- die "\n$0: Unexpected error: File(s) could not be deleted.\n";
- }
-
-
-
- if ( -s $output_directory."/".$outname ) {
- open( OUT_MESG_ONE, ">$temp_dir/_message1_" ) || die "\n\n$0: Cannot create file \"$temp_dir/_message1_\": $!\n\n";
- print OUT_MESG_ONE ( $message1 );
- close( OUT_MESG_ONE );
-
- $message1 = "";
-
- open( OUT_MESG_TWO, ">$temp_dir/_message2_" ) || die "\n\n$0: Cannot create file \"$temp_dir/_message2_\": $!\n\n";
- print OUT_MESG_TWO ( "# Successful calculations : $successful\n" );
- print OUT_MESG_TWO ( "# No calculation due to absence of PWD file: $pwd_not_present\n" );
- print OUT_MESG_TWO ( "# Calculation already performed : $already_done\n" );
- print OUT_MESG_TWO ( "# ############################################################################\n" );
- close( OUT_MESG_TWO );
-
- if ( -s $outfile ) {
- system( "cat $outfile $temp_dir/_message1_ $output_directory/$outname $temp_dir/_message2_ > $outfile"."___" )
- && die "\n\n$0: Could not execute \"cat $outfile $temp_dir/_message1_ $output_directory/$outname $temp_dir/_message2_ > $outfile"."___\": $!\n\n";
- system( "mv", $outfile."___", $outfile )
- && die "\n\n$0: Could not execute \"mv $outfile"."___ $outfile\": $!\n\n";
- }
- else {
- system( "cat $temp_dir/_message1_ $output_directory/$outname $temp_dir/_message2_ > $outfile" )
- && die "\n\n$0: Could not execute \"cat $temp_dir/_message1_ $output_directory/$outname $temp_dir/_message2_ > $outfile\": $!\n\n";
-
- }
-
- print LOG "$outname\n";
-
- unlink( "$temp_dir/_message1_", "$temp_dir/_message2_" );
-
- }
-
-
-
- } ## End of elsif ( $return_line =~ /^\s*=(\S+)\s+.+\s+(\S+)\s+(\S+)\s+\S+$/ && $ID ne "" )
-
-} ## End of while ( $return_line = <IN> )
-
-close( IN );
-close( LOG );
-
-
-open( OUT_MESG_TWO, ">$temp_dir/_message2_" ) || die "\n$0: Cannot create file \"$temp_dir/_message2_\": $!\n";
-print OUT_MESG_TWO ( "\n\n# Xrio.pl successfully terminated.\n" );
-print OUT_MESG_TWO ( "# Started : $start_date" );
-print OUT_MESG_TWO ( "# Terminated: ".`date`."\n" );
-print OUT_MESG_TWO ( "# Successful calculations : $successful\n" );
-print OUT_MESG_TWO ( "# No calculation due to absence of PWD file: $pwd_not_present\n" );
-print OUT_MESG_TWO ( "# Calculation already performed : $already_done\n\n" );
-close( OUT_MESG_TWO );
-
-if ( -s $outfile ) {
- if ( $message1 ne "" ) {
- open( OUT_MESG_ONE, ">$temp_dir/_message1_" ) || die "\n$0: Cannot create file \"$temp_dir/_message1_\": $!\n";
- print OUT_MESG_ONE ( $message1 );
- close( OUT_MESG_ONE );
- system( "cat $outfile $temp_dir/_message1_ $temp_dir/_message2_ > $outfile"."___" )
- && die "$0: Could not execute \"cat $outfile $temp_dir/_message1_ $temp_dir/_message2_ > $outfile"."___\": $!";
- }
- else {
- system( "cat $outfile $temp_dir/_message2_ > $outfile"."___" )
- && die "$0: Could not execute \"cat $outfile $temp_dir/_message2_ > $outfile"."___\": $!";
- }
- system( "mv", $outfile."___", $outfile )
- && die "$0: Could not execute \"mv $outfile"."___ $outfile\": $!";
-}
-else {
- open( OUT_MESG_ONE, ">$temp_dir/_message1_" ) || die "\n$0: Cannot create file \"$temp_dir/_message1_\": $!\n";
- print OUT_MESG_ONE ( $message1 );
- close( OUT_MESG_ONE );
- system( "cat $temp_dir/_message1_ $temp_dir/_message2_ > $outfile" )
- && die "$0: Could not execute \"cat $temp_dir/_message1_ $temp_dir/_message2_ > $outfile\": $!";
-}
-
-unlink( "$temp_dir/_message1_", "$temp_dir/_message2_" );
-
-rmdir( $temp_dir ) || die "\n$0: Unexpected failure (could not remove: $temp_dir): $!\n";
-
-print( "\n\nXrio.pl successfully terminated.\n" );
-print( "Successful calculations : $successful\n" );
-print( "No calculation due to absence of PWD file: $pwd_not_present\n" );
-print( "Calculation already performed : $already_done\n" );
-print( "Started : $start_date" );
-print( "Terminated: ".`date`."\n" );
-print( "\n" );
-
-exit( 0 );
-
-
-
-# Methods
-# -------
-
-
-
-# Gets the gathering cutoff per sequence from a HMM file.
-#
-# One argument: the HMM file name
-# Returns the gathering cutoff per sequence, 2000 upon failure
-# Last modified: 07/11/01
-sub getGA1cutoff {
-
- my $infile = $_[ 0 ];
- my $return_line = "";
- my $GA = 2000;
-
- &testForTextFilePresence( $infile );
-
- open( H, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!";
- while ( $return_line = <H> ) {
-
- if ( $return_line =~ /^GA\s+(\S+)/ ) {
- $GA = $1;
- close( H );
- return $GA;
- }
-
- }
- close( H );
- return $GA;
-
-} ## getGA1cutoff
-
-
-
-
-
-# 1. A= Name of Pfam family
-# 2. Q= Query file
-# 3. O= Output
-# 4. N= query Name
-# 5. S= Species tree file
-# 6. more options, such I K m
-# 7. j= Name for temporary directory
-sub performRIO {
- my $pfam_name = $_[ 0 ];
- my $query_file = $_[ 1 ];
- my $output_file = $_[ 2 ];
- my $name_for_query = $_[ 3 ];
- my $species_tree_file = $_[ 4 ];
- my $more_options = $_[ 5 ];
- my $tmp_file_rio = $_[ 6 ];
-
- my $options_for_rio = "";
-
- $options_for_rio .= ( " A=".$pfam_name );
- $options_for_rio .= ( " Q=".$query_file );
- $options_for_rio .= ( " O=".$output_file );
- $options_for_rio .= ( " N=".$name_for_query );
- $options_for_rio .= ( " S=".$species_tree_file );
- $options_for_rio .= ( " j=".$tmp_file_rio );
- $options_for_rio .= ( " ".$more_options );
-
- system( "$RIO_PL 1 $options_for_rio" )
- && die "$0: performRIO: Could not execute \"$RIO_PL 1 $options_for_rio\": $!\n";
-
-} ## performRIO
-
-
-
-# Reads in (SWISS-PROT) species names from a file.
-# Names must be separated by newlines.
-# Lines beginning with "#" are ignored.
-# A possible "=" and everything after is ignored.
-# One argument: species-names-file name
-# Last modified: 04/24/01
-sub readSpeciesNamesFile {
- my $infile = $_[ 0 ];
- my $return_line = "";
- my $species = "";
-
- unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
- &Error( "\"$infile\" does not exist,\n is empty, or is not a plain textfile." );
- }
-
- open( IN_RSNF, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!";
- while ( $return_line = <IN_RSNF> ) {
- if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)/ ) {
- $species = $1;
- $species =~ s/=.+//;
- $Species_names_hash{ $species } = "";
- }
- }
- close( IN_RSNF );
-
- return;
-} ## readSpeciesNamesFile
-
-
-
-# Searches the > line of a multiple seq file for a
-# query, saves the found entries.
-# Three arguments:
-# 1. multi Fasta file to search through
-# 2. outputfile name
-# 3. query
-# Last modified: 03/05/01
-sub getSequenceFromFastaFile {
-
- my $inputfile = $_[ 0 ];
- my $outputfile = $_[ 1 ];
- my $query = $_[ 2 ];
- my $hits = 0;
-
- open( IN_GSFF, "$inputfile" )
- || die "\n$0: getSequenceFromFastaFile: Cannot open file <<$inputfile>>: $!\n";
- open( OUT_GSFF, ">$outputfile" )
- || die "\n$0: getSequenceFromFastaFile: Cannot create file <<$outputfile>>: $!\n";
-
-
- while ( $return_line = <IN_GSFF> ) {
- if ( $return_line =~ /^\s*>.*$query\s+/ ) {
- $hits++;
- print $return_line;
- print OUT_GSFF $return_line;
- $return_line = <IN_GSFF>;
- while ( $return_line && $return_line =~ /^\s*[^>]/ ) {
- print OUT_GSFF $return_line;
- $return_line = <IN_GSFF>;
- }
- last; # In Wormpep there _are_ ambigous CE numbers.
- }
-
- }
-
- close( IN_GSFF );
- close( OUT_GSFF );
- if ( $hits < 1 ) {
- die "\n$0: getSequenceFromFastaFile: Unexpected error: <<$query>> not found.\n";
- }
- if ( $hits > 1 ) {
- die "\n$0: getSequenceFromFastaFile: Unexpected error: <<$query>> is ambigous.\n";
- }
-
-} ## getSequenceFromFastaFile
-
-
-
-
-# Last modified: 03/08/01
-sub errorInCommandLine {
-
- print "\n";
- print " Xrio.pl $VERSION\n";
- print " -------\n";
- print "\n";
- print " Christian Zmasek (zmasek\@genetics.wustl.edu)\n";
- print "\n";
- print " Purpose. Runs \"rio.pl\" for each Pfam assignment in \"infile\".\n";
- print "\n";
- print " Usage. rio.pl <infile> <species names file> <output directory> <outfile> <logfile>\n";
- print "\n";
- print " infile: has the following format (defined per example):\n";
- print " >>4R79.1 CE19649 Zinc-binding metalloprotease domain (CAMBRIDGE) protein_id:CAB63429.1\n";
- print " =Astacin Astacin (Peptidase family M12A) 296.3 3.8e-85 1\n";
- print " //\n";
- print "\n";
- print " >>4R79.2 CE19650 Ras family (CAMBRIDGE) TR:Q9XXA4 protein_id:CAA20282.1\n";
- print " =ras Ras family 208.8 8.1e-59 1\n";
- print " =FA_desaturase Fatty acid desaturase 4.5 1.5 1\n";
- print " =UPF0117 Domain of unknown function DUF36 3.1 3.5 1\n";
- print " =arf ADP-ribosylation factor family -46.0 1.5e-05 1\n";
- print " //\n";
- print "\n";
- print " species names file: list of species to use for analysis.\n";
- print "\n";
- print " This version uses the CE number as identifier for output files.\n";
- print "\n";
-
- exit( -1 );
-
-} ## errorInCommandLine
-
-
+++ /dev/null
-#!/usr/bin/perl -w
-#
-# bootstrapCounter.pl
-# -------------------
-#
-# Copyright (C) 2001 Washington University School of Medicine
-# and Howard Hughes Medical Institute
-# All rights reserved
-#
-# Author: Christian M. Zmasek
-# zmasek@genetics.wustl.edu
-# http://www.genetics.wustl.edu/eddy/people/zmasek/
-#
-# Created: 04/04/01
-#
-# Last modified 08/16/01
-#
-#
-# Objective. Determines the distribution of top orthology bootstrap values
-# of a Xrio.pl output file.
-#
-# Usage. "bootstrapCounter.pl <infile = Xrio.pl-output> <outfile>"
-#
-# Important. The result of this is meaningful ONLY if the thresholds
-# for output of the RIO analysis are set to zero (L=0 R=0).
-#
-# Format for infile:
-# ...
-#
-# # ############################################################################
-# # Annotation: B0511.6 CE17345 helicase (ST.LOUIS) TR:O61815 protein_id:AAC17654.1
-# # HMM : ABC_tran
-# # score : -59.6
-# # E-value : 1.1
-# # Query has not been aligned (score lower than gathering cutoff).
-# # ############################################################################
-#
-#
-# # ############################################################################
-# # Annotation: B0511.7 CE17346 (ST.LOUIS) TR:O61817 protein_id:AAC17655.1
-# # HMM : FHA
-# # score : 71.6
-# # E-value : 1.7e-17
-# RIO - Resampled Inference of Orthologs
-# Version: 1.000
-# ------------------------------------------------------------------------------
-# Alignment file: /tmp/Xriopl9846081980/Full-FHA
-# Alignment : FHA domain
-# HMM : FHA
-# Query file : /tmp/Xriopl9846081980/__queryfile__
-# ==============================================================================
-#
-# Query : CE17346.FHA_CAEEL/45-114
-#
-# Number (in %) of observed orthologies (o) and super orthologies (s) to query
-# in bootstrapped trees, evolutionary distance to query:
-#
-# Sequence Description # o[%] s[%] distance
-# -------- ----------- ---- ---- --------
-# YC67_MYCTU/308-372 - 20 14 1.577840
-# FRAH_ANASP/204-277 FRAH PROTEIN. 17 16 1.532670
-# ABA2_NICPL/557-633 ZEAXANTHIN EPOXIDASE PRECURSOR (EC 1.14.-.-). 14 11 1.885700
-# ABA2_LYCES/563-639 ZEAXANTHIN EPOXIDASE PRECURSOR (EC 1.14.-.-). 14 11 2.140000
-#
-#
-#
-# Distance values (based on ML branch length values on consensus tree)
-# --------------------------------------------------------------------
-# Given the thresholds for distance calculations:
-# No sequence is considered orthologous to query.
-#
-# ...
-
-
-
-use strict;
-
-my $VERSION = 0.200;
-
-my $infile = "";
-my $outfile = "";
-my $return_line = "";
-my $results = 0;
-my $o_bootstraps = 0;
-my $s_bootstraps = 0;
-my @o_bootstraps_array = ();
-my @s_bootstraps_array = ();
-my $total = 0;
-my $i = 0;
-
-
-if ( @ARGV != 2 ) {
- &errorInCommandLine();
- exit ( -1 );
-}
-
-$infile = $ARGV[ 0 ];
-$outfile = $ARGV[ 1 ];
-
-if ( -e $outfile ) {
- die "\n$0: <<$outfile>> already exists.\n";
-}
-unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
- die "\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n";
-}
-
-
-open( IN, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n";
-
-$results = 0;
-for ( $i = 0; $i <= 100; ++$i ) {
- $s_bootstraps_array[ $i ] = $o_bootstraps_array[ $i ] = 0;
-}
-
-while ( $return_line = <IN> ) {
-
- if ( $return_line =~ /^\s*--------\s+/ ) {
- $results = 1;
- }
- elsif ( $return_line =~ /^\s*Distance\s+values\s+/i ) {
- $results = 0;
- }
- elsif ( $results == 1 && $return_line =~ /^\s*!NO\s+ORTHOLOGS/ ) {
- $o_bootstraps_array[ 0 ]++;
- $s_bootstraps_array[ 0 ]++;
- $total++;
- $results = 0;
- }
- elsif ( $results == 1 && $return_line =~ /(\S+)\s+(\S+)\s+\S+\s*$/ ) {
- $o_bootstraps = $1;
- $s_bootstraps = $2;
- $results = 0;
- if ( $o_bootstraps > 100 || $s_bootstraps > 100
- || $o_bootstraps < 0 ) {
- print "o bootstraps: $o_bootstraps\n";
- print "s bootstraps: $s_bootstraps\n";
- die "\n\n$0: Error: Boostrap value(s) out of range.\n\n";
- }
-
- $total++;
- $o_bootstraps_array[ $o_bootstraps ]++;
- $s_bootstraps_array[ $s_bootstraps ]++;
-
- }
-}
-
-close( IN );
-
-
-open( OUT, ">$outfile" ) || die "\n$0: Cannot create file \"$outfile\": $!\n";
-
-print OUT "bootstrapCounter.pl version: $VERSION\n\n";
-print OUT "Distribution of top bootstrap values\n\n";
-print OUT "Input file : $infile\n";
-print OUT "Output file: $outfile\n";
-print OUT "Date : ".`date`."\n";
-print OUT "Total: $total\n\n";
-print OUT "top-orthology-bootstraps vs. count:\n";
-for ( $i = 0; $i < @o_bootstraps_array; ++$i ) {
- print OUT "$i $o_bootstraps_array[ $i ]\n";
-}
-print OUT "\n\ntop-super-orthology-bootstraps vs. count:\n";
-for ( $i = 0; $i < @s_bootstraps_array; ++$i ) {
- print OUT "$i $s_bootstraps_array[ $i ]\n";
-}
-close( OUT );
-
-print( "\nDone.\n\n" );
-
-exit( 0 );
-
-
-
-sub errorInCommandLine {
- print "\n";
- print " bootstrapCounter.pl version: $VERSION\n";
- print " Usage: \"bootstrapCounter.pl <infile = Xrio.pl-output> <outfile>\"\n";
- print " Important: The result of this is meaningful ONLY if the thresholds\n";
- print " for output of the RIO analysis are set to zero (L=0 R=0).\n";
- print "\n";
- exit( -1 );
-}
-
-
+++ /dev/null
-#!/usr/bin/perl -w
-#
-# bootstrapSelector.pl
-# --------------------
-#
-# Copyright (C) 2001 Washington University School of Medicine
-# and Howard Hughes Medical Institute
-# All rights reserved
-#
-# Author: Christian M. Zmasek
-# zmasek@genetics.wustl.edu
-# http://www.genetics.wustl.edu/eddy/people/zmasek/
-#
-# Created: 04/06/01
-#
-# Last modified 09/24/01
-#
-#
-# Objective. Selection of RIO analysis results with top ortholgy
-# bootstrap values greater or less than a threshold.
-#
-# Usage: "bootstrapSelector.pl <threshold options> <infile = Xrio.pl-output> <outfile>"
-# Options: "l" for "less or equal" ("grater or equal" is default)
-# "c" for "all hits must meet threshold in case of
-# multiple copies of the same domain in the query"
-# (default: "at least one")
-# Example: "bootstrapSelector.pl 95lc OUTFILE_At_1 At_1_out"
-#
-# Important. The result of this is meaningful ONLY if the thresholds
-# for output of the RIO analysis are set to zero (L=0 R=0).
-#
-#
-# Format for infile:
-#
-# ...
-#
-# # ############################################################################
-# # Annotation: B0511.6 CE17345 helicase (ST.LOUIS) TR:O61815 protein_id:AAC17654.1
-# # HMM : ABC_tran
-# # score : -59.6
-# # E-value : 1.1
-# # Query has not been aligned (score lower than gathering cutoff).
-# # ############################################################################
-#
-#
-# # ############################################################################
-# # Annotation: B0511.7 CE17346 (ST.LOUIS) TR:O61817 protein_id:AAC17655.1
-# # HMM : FHA
-# # score : 71.6
-# # E-value : 1.7e-17
-# RIO - Resampled Inference of Orthologs
-# Version: 1.000
-# ------------------------------------------------------------------------------
-# Alignment file: /tmp/Xriopl9846081980/Full-FHA
-# Alignment : FHA domain
-# HMM : FHA
-# Query file : /tmp/Xriopl9846081980/__queryfile__
-# ==============================================================================
-#
-# Query : CE17346.FHA_CAEEL/45-114
-#
-# Number (in %) of observed orthologies (o) and super orthologies (s) to query
-# in bootstrapped trees, evolutionary distance to query:
-#
-# Sequence Description # o[%] s[%] distance
-# -------- ----------- ---- ---- --------
-# YC67_MYCTU/308-372 - 20 14 1.577840
-# FRAH_ANASP/204-277 FRAH PROTEIN. 17 16 1.532670
-# ABA2_NICPL/557-633 ZEAXANTHIN EPOXIDASE PRECURSOR (EC 1.14.-.-). 14 11 1.885700
-# ABA2_LYCES/563-639 ZEAXANTHIN EPOXIDASE PRECURSOR (EC 1.14.-.-). 14 11 2.140000
-#
-#
-#
-# Distance values (based on ML branch length values on consensus tree)
-# --------------------------------------------------------------------
-# Given the thresholds for distance calculations:
-# No sequence is considered orthologous to query.
-#
-# ...
-
-
-
-use strict;
-
-my $VERSION = 1.000;
-my $threshold = 0;
-my $infile = "";
-my $outfile = "";
-my $summary_outfile = "";
-my $return_line = "";
-my $identifier = "";
-my $top1 = "";
-my $analysis_performed = 0;
-my $reading = 0;
-my $i = 0;
-my @lines = ();
-my $larger = 1;
-my $complete = 0;
-my $total = 0;
-
-if ( @ARGV != 3 ) {
- &errorInCommandLine();
- exit ( -1 );
-}
-
-$threshold = $ARGV[ 0 ];
-$infile = $ARGV[ 1 ];
-$outfile = $ARGV[ 2 ];
-$summary_outfile = $outfile.".short";
-
-if ( -e $outfile ) {
- die "\n$0: <<$outfile>> already exists.\n";
-}
-unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
- die "\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n";
-}
-
-
-if ( $threshold =~ /l/ ) {
- $larger = 0;
- $threshold =~ s/l//;
-}
-if ( $threshold =~ /c/ ) {
- $complete = 1;
- $threshold =~ s/c//;
-}
-
-open( IN, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n";
-
-open( OUT, ">$outfile" ) || die "\n$0: Cannot create file \"$outfile\": $!\n";
-open( OUT_SUMMARY, ">$summary_outfile" ) || die "\n$0: Cannot create file \"$summary_outfile\": $!\n";
-
-print OUT "bootstrapSelector.pl version: $VERSION\n\n";
-print OUT "Selection of RIO analysis results with top ortholgy\n";
-print OUT "bootstrap values greater or less than a threshold.\n\n";
-if ( $larger == 1 ) {
- print OUT "Threshold : Grater than or equal to $threshold\n";
-}
-else {
- print OUT "Threshold : Less than or equal to $threshold\n";
-}
-print OUT "In case of multiple copies of the same domain in the query:\n";
-if ( $complete == 1 ) {
- print OUT "All hits must meet threshold.\n";
-}
-else {
- print OUT "At least one hit must meet threshold.\n";
-}
-print OUT "Input file : $infile\n";
-print OUT "Output file : $outfile\n";
-print OUT "Output file short: $summary_outfile\n";
-print OUT "Date : ".`date`."\n\n\n";
-
-while ( $return_line = <IN> ) {
-
- if ( $return_line =~ /^\s*# Annotation:\s*(.+)/ ) {
- $identifier = $1;
- $identifier = substr( $identifier, 0, 60);
- $analysis_performed = 0;
- $reading = 1;
- $i = 0;
- @lines = ();
- }
-
- if ( $reading == 1 && $return_line =~ /^\s*RIO/ ) {
- $analysis_performed = 1;
- }
-
- if ( $reading == 1
- && $return_line =~ /^\s*# ####################################/ ) {
- if ( $analysis_performed == 1 ) {
- &analyze();
- }
- $reading = 0;
- }
-
- if ( $reading == 1 ) {
- $lines[ $i++ ] = $return_line;
- }
-}
-
-close( IN );
-
-print OUT "\n\nTotal: $total\n";
-
-close( OUT );
-close( OUT_SUMMARY );
-
-print "\nTotal: $total\n";
-print "Done.\n\n";
-
-exit( 0 );
-
-
-sub analyze {
- my $j = 0;
- my $results = 0;
- my $o_bootstraps = 0;
- $top1 = "";
-
- for ( $j = 0; $j < $i; $j++ ) {
-
- if ( $lines[ $j ] =~ /^\s*--------\s+/ ) {
- $results = 1;
- }
- elsif ( $lines[ $j ] =~ /^\s*Distance\s+values\s+/i ) {
- $results = 0;
- }
- elsif ( $results == 1
- && ( $lines[ $j ] =~ /\S+\s+\S+\s+\S+\s*$/
- || $lines[ $j ] =~ /^\s*!NO\s+ORTHOLOGS/ ) ) {
-
- if ( $lines[ $j ] =~ /^\s*!NO\s+ORTHOLOGS/ ) {
- $o_bootstraps = 0;
- }
- else {
- $lines[ $j ] =~ /(\S+)\s+\S+\s+\S+\s*$/;
- $o_bootstraps = $1;
- if ( $top1 eq "" ) {
- $top1 = $lines[ $j ];
- $top1 =~ s/\n//;
- $top1 =~ s/\s{2,}/ /g;
- }
- }
-
- $results = 0;
-
- if ( $o_bootstraps > 100 || $o_bootstraps < 0 ) {
- print "o bootstraps: $o_bootstraps\n";
- die "\n\n$0: Error: Boostrap value(s) out of range.\n\n";
- }
-
- if ( $larger == 1 ) {
- if ( $complete != 1 && $o_bootstraps >= $threshold ) {
- &writeout();
- $total++;
- return;
- }
- elsif ( $complete == 1 && $o_bootstraps < $threshold ) {
- return;
- }
- }
- else {
- if ( $complete != 1 && $o_bootstraps <= $threshold ) {
- &writeout();
- $total++;
- return;
- }
- elsif ( $complete == 1 && $o_bootstraps > $threshold ) {
- return;
- }
- }
- }
- }
- if ( $complete == 1 ) {
- &writeout();
- $total++;
- }
- return;
-}
-
-
-
-sub writeout {
- my $j = 0;
- print OUT "# ############################################################################\n";
- for ( $j = 0; $j < $i; ++$j ) {
- print OUT "$lines[ $j ]";
- }
- print OUT "# ############################################################################\n\n\n";
- print OUT_SUMMARY "$identifier [top 1: $top1]\n\n";
-}
-
-
-
-sub errorInCommandLine {
- print "\n";
- print " bootstrapCounter.pl version: $VERSION\n";
- print " Usage: \"bootstrapSelector.pl <threshold options> <infile = Xrio.pl-output> <outfile>\"\n";
- print " Options: \"l\" for \"less or equal\" (\"grater or equal\" is default)\n";
- print " \"c\" for \"all hits must meet threshold in case of\n";
- print " multiple copies of the same domain in the query\"\n";
- print " (default: \"at least one\")\n";
- print " Example:\n";
- print " \"bootstrapSelector.pl 95lc OUTFILE_At_1 At_1_out\"\n\n";
- print " Important: The result of this is meaningful ONLY if the thresholds\n";
- print " for output of the RIO analysis are set to zero (L=0 R=0).\n\n";
- exit( -1 );
-}
-
-
+++ /dev/null
-#!/usr/bin/perl -w
-
-# bootstrap_cz.pl
-# ---------------
-# Copyright (C) 1999-2003 Washington University School of Medicine
-# and Howard Hughes Medical Institute
-# All rights reserved
-#
-# Author: Christian M. Zmasek
-# zmasek@genetics.wustl.edu
-# http://www.genetics.wustl.edu/eddy/people/zmasek/
-#
-# Created: 05/17/01
-#
-# Last modified 08/26/03
-#
-# Purpose:
-# Bootstrap resamples an alignment in PHYLIP sequential format <bootstraps>
-# times.
-# Amino acid sequences must only be represented by uppercase letters (A-Z)
-# and '-'.
-# In mode 0 it saves the positions which it used to create the
-# bootstrapped alignment into <positions outfile>.
-# Mode 1 allows to recreate exactly the same boostrapped alignment
-# by reading in a <positions infile>.
-# Sequence names are normalized to $LENGTH_OF_NAME characters.
-# The output alignment is in PHYLIP's sequential or interleaved format.
-# (These two are the same in this case, since all the seqs will be one
-# line in length (no returns in seq).)
-#
-# Usage:
-# bootstrap_cz.pl <mode (0 or 1)> <bootstraps> <alignment infile>
-# <alignment outfile> <positions out- (mode 0) or in-file (mode 1)>
-# [random number seed (mode 0 only)]
-#
-
-use strict;
-use FindBin;
-use lib $FindBin::Bin;
-
-use rio_module;
-
-my $VERSION = "2.001";
-
-my $modus = -1; # 0 to create pos. file, 1 to use premade pos. file
-my $bootstraps = -1;
-my $infile = "";
-my $outalign_file = "";
-my $positions_file = "";
-my $seed = -1;
-
-
-$modus = $ARGV[ 0 ];
-$bootstraps = $ARGV[ 1 ];
-$infile = $ARGV[ 2 ];
-$outalign_file = $ARGV[ 3 ];
-$positions_file = $ARGV[ 4 ];
-$seed = $ARGV[ 5 ];
-
-if ( @ARGV != 5 && @ARGV != 6 ) {
- &printUsage();
- exit( -1 );
-}
-
-if ( $modus != 0 && $modus != 1 ) {
- &printUsage();
- exit( -1 );
-}
-
-if ( $modus == 0 && @ARGV != 6 ) {
- &printUsage();
- exit( -1 );
-}
-
-if ( $modus == 1 && @ARGV != 5 ) {
- &printUsage();
- exit( -1 );
-}
-
-if ( $bootstraps < 1 ) {
- &printUsage();
- exit( -1 );
-}
-
-if ( $seed && $seed < 0 ) {
- &printUsage();
- exit( -1 );
-}
-
-
-unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
- die "\n\nbootstrap_cz.pl: \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n";
-}
-if ( -e $outalign_file ) {
- die "\n\nbootstrap_cz.pl: \"$outalign_file\" already exists.\n\n";
-}
-
-if ( $modus == 0 ) {
- if ( -e $positions_file ) {
- die "\n\nbootstrap_cz.pl: \"$positions_file\" already exists.\n\n";
- }
-}
-else {
- unless ( ( -s $positions_file ) && ( -f $positions_file ) && ( -T $positions_file ) ) {
- die "\n\nbootstrap_cz.pl: \"$positions_file\" does not exist, is empty, or is not a plain textfile.\n\n";
- }
-}
-
-if ( $modus == 0 ) {
- &bootstrap( $modus, $bootstraps, $infile, $outalign_file, $positions_file, $seed );
-}
-else {
- &bootstrap( $modus, $bootstraps, $infile, $outalign_file, $positions_file );
-}
-
-
-exit( 0 );
-
-
-
-# Methods
-# -------
-
-
-# Five/six arguemnts:
-# 1. Mode: 0 to create pos. file, 1 to use premade pos. file
-# 2. bootstraps
-# 3. Alignment infile name
-# 4. Outfile name
-# 5. file name for positions file (created if mode is 0, read if mode is 1)
-# [6. If modus is 0: seed for random number generator]
-#
-# This method is very similar to method "pfam2phylip" "in makeTree.pl".
-#
-# Last modified: 05/17/01
-#
-sub bootstrap {
-
- my $modus = $_[ 0 ];
- my $bootstraps = $_[ 1 ];
- my $infile = $_[ 2 ];
- my $outalign_file = $_[ 3 ];
- my $positions_file = $_[ 4 ];
-
- my @seq_name = ();
- my @seq_array = ();
- my @random_numbers = ();
- my $return_line = "";
- my $seq = "";
- my $x = 0;
- my $y = 0;
- my $seq_no = 0;
- my $original_length = 0;
- my $max_x = 0;
- my $n = 0;
- my $i = 0;
- my $random = -1;
- my $length = 0;
- my $number_of_seqs = 0;
- my $number_of_colm = 0;
-
-
- # Checks the arguments
- # --------------------
-
- if ( $modus == 0 ) {
- if ( !$_[ 5 ] ) {
- die "\n\n$0: bootstrap: Failed to give a seed for random number generator.\n\n";
- }
- srand( $_[ 5 ] );
- }
- elsif( $modus == 1 ) {
- if ( $_[ 5 ] ) {
- die "\n\n$0: bootstrap: Must not give a seed for random number generator.\n\n";
- }
- unless ( ( -s $positions_file ) && ( -f $positions_file ) && ( -T $positions_file ) ) {
- die "\n\n$0: bootstrap: <<$positions_file>> does not exist, is empty, or is not a plain textfile.\n\n";
- }
- }
- else {
- die "\n\n$0: bootstrap: modus must be either 0 or 1.\n\n";
- }
-
- unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
- die "\n\n$0: bootstrap: <<$infile>> does not exist, is empty, or is not a plain textfile.\n\n";
- }
-
-
-
- # Reads in the alignment
- # ----------------------
-
- open( IN, "$infile" ) || die "\n$0: bootstrap: Cannot open file <<$infile>>: $!";
- while ( $return_line = <IN> ) {
-
- if ( $return_line =~ /^\s*(\d+)\s+(\d+)/ ) {
- $number_of_seqs = $1;
- $number_of_colm = $2;
- }
- elsif ( $return_line =~ /^(\S+)\s+(\S+)/ ) {
- $seq_name[ $seq_no ] = substr( $1, 0, $LENGTH_OF_NAME );
- $seq = $2;
- if ( $original_length == 0 ) {
- $original_length = length( $seq );
- }
- elsif ( $original_length != length( $seq ) ) {
- die "\n\n$0: Sequences do not have the same length.\n\n";
- }
- for ( $x = 0; $x < $original_length; $x++ ) {
- $seq_array[ $x ][ $seq_no ] = substr( $seq, $x, 1 );
- }
- $seq_no++;
- }
- }
- close( IN );
-
- if ( ( $number_of_seqs != $seq_no )
- || ( $number_of_colm != $original_length ) ) {
- die "\n\n$0: Number of sequences or number of columns are inconsisten with the values given in the alignment.\n\n";
- }
-
- # Adusts the length of the names to $LENGTH_OF_NAME
- # -------------------------------------------------
-
- for ( $y = 0; $y < $seq_no; $y++ ) {
- $length = length( $seq_name[ $y ] );
- for ( $i = 0; $i <= ( $LENGTH_OF_NAME - $length - 1 ); $i++ ) {
- $seq_name[ $y ] .= " ";
- }
- }
-
-
-
- # Bootstraps $bootstraps times and writes the outputfiles
- # -------------------------------------------------------
-
- open( OUT, ">$outalign_file" ) || die "\n\n$0: bootstrap: Cannot create file <<$outalign_file>>: $!";
- if ( $modus == 0 ) {
- open( OUT_P, ">$positions_file" ) || die "\n\n$0: bootstrap: Cannot create file <<$positions_file>>: $!";
- }
- else {
- open( IN_P, "$positions_file" ) || die "\n\n$0: bootstrap: Cannot open file <<$positions_file>>: $!";
- }
-
- for ( $n = 0; $n < $bootstraps; $n++ ) {
-
- if ( $modus == 0 ) {
- for ( $x = 0; $x < $original_length; $x++ ) {
- $random = int( rand( $original_length ) );
- print OUT_P "$random ";
- $random_numbers[ $x ] = $random;
- }
- print OUT_P "\n";
- }
- else {
- $return_line = <IN_P>;
- if ( !$return_line || $return_line !~ /\d/ ) {
- die "\n\n$0: bootstrap: <<$positions_file>> seems too short or otherwise unsuitable.\n\n";
- }
- $return_line =~ s/^\s+//;
- $return_line =~ s/\s+$//;
- @random_numbers = split( /\s+/, $return_line );
- if ( scalar( @random_numbers ) != $original_length ) {
- die "\n\n$0: bootstrap: <<$positions_file>> seems not to correspond to <<$infile>>.\n\n";
- }
- }
-
- print OUT " $seq_no $original_length\n";
-
- for ( $y = 0; $y < $seq_no; $y++ ) {
- print OUT "$seq_name[ $y ]";
-
- for ( $x = 0; $x < $original_length; $x++ ) {
- $random = $random_numbers[ $x ];
- if ( !$seq_array[ $random ][ $y ] || $seq_array[ $random ][ $y ] !~ /[A-Z]|-/ ) {
- die "\n\n$0: Sequence must be represented by uppercase letters A-Z and \"-\" only.\n\n";
- }
- print OUT $seq_array[ $random ][ $y ];
- }
- print OUT "\n";
- }
- }
-
- close( OUT );
-
- if ( $modus == 0 ) {
- print OUT_P "\n";
- close( OUT_P );
- }
- else {
- close( IN_P );
- }
-
- return;
-
-} ## bootstrap
-
-
-
-sub printUsage {
- print "\n";
- print " bootstrap_cz.pl $VERSION\n";
- print " ---------------\n";
- print "\n";
- print " Christian Zmasek (zmasek\@genetics.wustl.edu)\n";
- print "\n";
- print " Purpose:\n";
- print " Bootstrap resamples an alignment in PHYLIP sequential format\n";
- print " <bootstraps> times.\n";
- print " In mode 0 it saves the positions which it used to create the\n";
- print " bootstrapped alignment into <positions outfile>.\n";
- print " Mode 1 allows to recreate exactly the same boostrapped alignment\n";
- print " by reading in a <positions infile>.\n";
- print " Sequence names are normalized to $LENGTH_OF_NAME characters.\n";
- print " The output alignment is in PHYLIP's sequential or interleaved format.\n";
- print " (These two are the same in this case, since all the seqs will be one\n";
- print " line in length (no returns in seq).)\n";
- print "\n";
- print " Usage:\n";
- print " bootstrap_cz.pl <mode (0 or 1)> <bootstraps> <alignment infile>\n";
- print " <alignment outfile> <positions out (mode 0) or infile (mode 1)>\n";
- print " [random number seed (mode 0 only)]\n";
- print "\n";
-} ## printUsage
-
+++ /dev/null
-#!/usr/bin/perl -W
-
-# countSpeciesSPTrEMBL.pl
-# -----------------------
-#
-# Copyright (C) 2003 Christian M. Zmasek
-# All rights reserved
-#
-# Created: 02/27/03
-# Last modified: 02/27/03
-# Author: Christian M. Zmasek
-# zmasek@genetics.wustl.edu
-# http://www.genetics.wustl.edu/eddy/people/zmasek/
-#
-# Last modified 05/23/02
-
-# Purpose. Counts species in SWISS-PROT and TrEMBL.
-#
-# Usage. countSpeciesSPTrEMBL.pl <path/to/trembl.dat> <path/to/sprot.dat> <outfile>
-#
-
-
-use strict;
-
-
-my $VERSION = "1.000";
-my $infile_sp = "";
-my $infile_tr = "";
-my $outfile = "";
-
-my $return_line = "";
-my $read = 0;
-my $os = "";
-my %species_count = (); # full name -> count.
-
-
-if ( @ARGV != 3 ) {
- &errorInCommandLine();
-}
-
-$infile_tr = $ARGV[ 0 ];
-$infile_sp = $ARGV[ 1 ];
-$outfile = $ARGV[ 2 ];
-
-
-
-if ( -e $outfile ) {
- die "\n$0: <<$outfile>> already exists.\n\n";
-}
-unless ( ( -s $infile_tr ) && ( -f $infile_tr ) && ( -T $infile_tr ) ) {
- die "\n$0: <$infile_tr>> does not exist, is empty, or is not a plain textfile.\n\n";
-}
-unless ( ( -s $infile_sp ) && ( -f $infile_sp ) && ( -T $infile_sp ) ) {
- die "\n$0: <<$infile_sp>> does not exist, is empty, or is not a plain textfile.\n\n";
-}
-
-open( IN_TR, "$infile_tr" ) || die "\n$0: Cannot open file <<$infile_tr>>: $!\n";
-open( IN_SP, "$infile_sp" ) || die "\n$0: Cannot open file <<$infile_sp>>: $!\n";
-open( OUT, ">$outfile" ) || die "\n$0: Cannot create file <<$outfile>>: $!\n";
-
-
-$read = 0;
-
-while ( $return_line = <IN_TR> ) {
- if ( $return_line =~ /^AC\s+(\S+);/ ) {
- $read = 1;
- }
- elsif ( $return_line =~ /^OS\s+(.+)\.\s*$/ && $read == 1 ) {
- $os = $1;
- $os =~ s/\(.+\)//g;
- $os =~ s/^\s+//;
- $os =~ s/\s+$//;
- $os =~ s/\.$//;
- if ( exists( $species_count{ $os } ) ) {
- $species_count{ $os } = $species_count{ $os } + 1;
- }
- else {
- $species_count{ $os } = 1;
- }
- print "$os\n";
- }
- elsif ( $return_line =~ /^\/\// && $read == 1 ) {
- $read = 0;
- $os = "";
- }
-}
-
-close( IN_TR );
-
-$read = 0;
-$os = "";
-$return_line = "";
-
-while ( $return_line = <IN_SP> ) {
- if ( $return_line =~ /^ID\s+(\S+)/ ) {
- $read = 1;
- }
- elsif ( $return_line =~ /^OS\s+(.+)\s*$/ && $read == 1 ) {
- $os = $1;
- $os =~ s/\(.+//g;
- $os =~ s/^\s+//;
- $os =~ s/\s+$//;
- $os =~ s/\.$//;
- $read = 0;
- if ( exists( $species_count{ $os } ) ) {
- $species_count{ $os } = $species_count{ $os } + 1;
- }
- else {
- $species_count{ $os } = 1;
- }
- print "$os\n";
- }
- elsif ( $return_line =~ /^\/\// && $read == 1 ) {
- $read = 0;
- $os = "";
- }
-}
-
-close( IN_SP );
-
-
-foreach my $species ( sort { $species_count{ $b } <=> $species_count{ $a } } keys %species_count ) {
- print OUT "$species: $species_count{$species}\n";
-}
-
-
-print "\n\nDone!\n\n";
-
-close( OUT );
-
-exit( 0 );
-
-
-
-
-
-
-sub errorInCommandLine {
- print "\n";
- print " countSpeciesSPTrEMBL.pl $VERSION\n";
- print " -----------------------\n";
- print "\n";
- print " Christian Zmasek (zmasek\@genetics.wustl.edu)\n";
- print "\n";
- print " Purpose. Counts species in SWISS-PROT and TrEMBL.\n";
- print "\n";
- print " Usage. countSpeciesSPTrEMBL.pl <path/to/trembl.dat> <path/to/sprot.dat> <outfile>\n";
- print "\n";
- exit( -1 );
-}
+++ /dev/null
-#!/usr/bin/perl -W
-
-# extractSWISS-PROT.pl
-# --------------------
-#
-# Copyright (C) 2001 Washington University School of Medicine
-# and Howard Hughes Medical Institute
-# All rights reserved
-#
-# Created: 09/25/01
-# Author: Christian M. Zmasek
-# zmasek@genetics.wustl.edu
-# http://www.genetics.wustl.edu/eddy/people/zmasek/
-#
-# Last modified 05/23/02
-
-# Purpose. Extracts ID, DE, and species from a "sprot.dat" file.
-# The output is used by "rio.pl".
-# If a species list (format: SWISS-PROT-code=full name) is supplied:
-# only sequences from species found in this list are written to
-# outfile (recommended).
-#
-# Usage. extractSWISS-PROT.pl <infile> <outfile> [species list]
-
-# Remark. Need to re-run this if species in species tree or species list
-# are added/changed or if a new version of Pfam is used!!
-
-
-use strict;
-
-
-my $VERSION = "1.001";
-my $infile = "";
-my $outfile = "";
-my $speciesfile = "";
-my $return_line = "";
-my $read = 0;
-my $ac = "";
-my $de = "";
-my $os = "";
-my %Species_names = (); # SWISS-PROT name -> "".
-my $i = 0;
-
-if ( @ARGV != 2 && @ARGV != 3 ) {
- &errorInCommandLine();
-}
-
-$infile = $ARGV[ 0 ];
-$outfile = $ARGV[ 1 ];
-
-if ( @ARGV == 3 ) {
- $speciesfile = $ARGV[ 2 ];
- unless ( ( -s $speciesfile ) && ( -f $speciesfile ) && ( -T $speciesfile ) ) {
- die "\n$0: <<$speciesfile>> does not exist, is empty, or is not a plain textfile.\n\n";
- }
- &readSpeciesNamesFile( $speciesfile );
-}
-
-if ( -e $outfile ) {
- die "\n$0: <<$outfile>> already exists.\n\n";
-}
-unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
- die "\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n\n";
-}
-
-open( IN, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n";
-open( OUT, ">$outfile" ) || die "\n$0: Cannot create file <<$outfile>>: $!\n";
-
-print OUT "# extractTrembl.pl version: $VERSION\n";
-print OUT "# trembl.dat file: $infile\n";
-print OUT "# output file : $outfile\n";
-print OUT "# species file : $speciesfile\n";
-print OUT "# date : ".`date`."\n\n";
-
-$read = 0;
-
-while ( $return_line = <IN> ) {
- if ( $return_line =~ /^ID\s+(\S+)/ ) {
- $ac = $1;
- $read = 1;
- if ( $ac =~ /[A-Z0-9]+_([A-Z0-9]+)/ ) {
- $os = $1;
- }
- else {
- die "\n$0: Unexpected format: $ac.\n\n";
- }
- if ( $speciesfile ne "" ) {
- unless ( exists( $Species_names{ $os } ) ) {
- $read = 0;
- $ac = "";
- $de = "";
- $os = "";
- next;
- }
- }
- }
- elsif ( $return_line =~ /^DE\s+(.+)/ && $read == 1 ) {
- if ( $de ne "" ) {
- $de .= " ".$1;
- }
- else {
- $de = $1;
- }
- }
- elsif ( $return_line =~ /^\/\// && $read == 1 ) {
- $read = 0;
- print OUT "$ac;$de;$os\n";
- $ac = "";
- $de = "";
- $os = "";
- $i++;
- }
-}
-
-close( IN );
-
-print OUT "\n # $i entries.\n";
-
-close( OUT );
-
-exit( 0 );
-
-
-
-# Reads in species file.
-# Format: SWISS-PROT=full name (e.g. "BACSU=Bacillus subtilis")
-# Lines beginning with "#" are ignored.
-# One argument: species file-name
-# Last modified: 04/24/01
-sub readSpeciesNamesFile {
- my $infile = $_[ 0 ];
- my $return_line = "";
- my $sp = "";
- my $full = "";
-
- unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
- die "\n$0: readSpeciesNamesFile: <<$infile>> does not exist, is empty, or is not a plain textfile.\n";
- }
-
- open( IN_RSNF, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n";
- while ( $return_line = <IN_RSNF> ) {
- if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)=(.+)/ ) {
- $sp = $1;
- $full = $2;
- $full =~ s/^\s+//;
- $full =~ s/\s+$//;
- $Species_names{ $sp } = "";
- }
- }
- close( IN_RSNF );
-
- return;
-}
-
-
-
-sub errorInCommandLine {
- print "\n";
- print " extractSWISS-PROT.pl $VERSION\n";
- print " --------------------\n";
- print "\n";
- print " Christian Zmasek (zmasek\@genetics.wustl.edu)\n";
- print "\n";
- print " Purpose. Extracts ID, DE, and species from a \"sprot.dat\" file.\n";
- print " The resulting output is used by \"rio.pl\".\n";
- print " If a species list (format: SWISS-PROT-code=full name) is supplied:\n";
- print " only sequences from species found in this list are written to\n";
- print " outfile (recommended).\n";
- print "\n";
- print " Usage. extractSWISS-PROT.pl <infile> <outfile> [species list]\n";
- print "\n";
- print " Remark. Need to re-run this if species in species tree or species list\n";
- print " are added/changed or if a new version of Pfam is used!!\n";
- print "\n\n";
- exit( -1 );
-}
+++ /dev/null
-#!/usr/bin/perl -W
-
-# extractSpecies.pl
-# ----------------
-#
-# Copyright (C) 2003 Christian M. Zmasek
-# All rights reserved
-#
-# Created: 09/03/03
-# Author: Christian M. Zmasek
-# zmasek@genetics.wustl.edu
-# http://www.genetics.wustl.edu/eddy/people/zmasek/
-#
-# Last modified 03/12/04 (Added gg)
-
-# Purpose. Adds species information to a file describing a phylogenetic
-# tree in the following format (by way of example):
-# "((ceINX_CE33055:0.02883,cbINX_CB09748:0.02934):0.36899[&&NHX:B=100],..."
-# ce stands for "CAEEL". The hash %SPECIES needs to be set accordingly.
-#
-
-
-use strict;
-
-
-my %SPECIES = (
- "dm" => "DROME",
- "ag" => "ANOGA",
- "ce" => "CAEEL",
- "cb" => "CAEBR",
- "ci" => "CIOIN",
- "fr" => "FUGRU",
- "gg" => "CHICK",
- "rn" => "RAT",
- "mm" => "MOUSE",
- "hs" => "HUMAN"
- );
-
-
-my $infile = "";
-my $outfile = "";
-my $intree = "";
-my $return_line = "";
-
-if ( @ARGV != 1 && @ARGV != 2 ) {
- &errorInCommandLine();
-}
-
-$infile = $ARGV[ 0 ];
-
-if ( @ARGV == 1 ) {
- $outfile = $infile;
- $outfile =~ s/\.nhx$//;
- $outfile .= "_species.nhx";
-}
-
-if ( @ARGV == 2 ) {
- $outfile = $ARGV[ 1 ];
-}
-
-
-
-
-if ( -e $outfile ) {
- die "\n$0: <<$outfile>> already exists.\n\n";
-}
-unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
- die "\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n\n";
-}
-
-open( IN, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n";
-open( OUT, ">$outfile" ) || die "\n$0: Cannot create file <<$outfile>>: $!\n";
-
-while ( $return_line = <IN> ) {
- $return_line =~ s/\s+//g;
- $return_line =~ s/\+/_/g;
-
- $intree .= $return_line;
-
-}
-
-close( IN );
-
-while ( ( my $short, my $long ) = each ( %SPECIES ) ) {
-
- while ( $intree =~ /[(),]($short[^\[]+?)[(),]/ ) {
-
- my $name_and_length = $1;
-
- print "$name_and_length -> $name_and_length\[\&\&NHX:S=$long\]\n";
-
- $intree =~ s/$name_and_length/$name_and_length\[&&NHX:S=$long\]/;
-
- }
-
-}
-
-print OUT $intree;
-
-close( OUT );
-
-print "\n\nDone!\n\n";
-
-exit( 0 );
-
-
-
-sub errorInCommandLine {
- print "\n";
- print "extractSpecies.pl infile [outfile]";
- print "\n\n";
- exit( -1 );
-}
+++ /dev/null
-#!/usr/bin/perl -W
-
-# extractTrembl.pl
-# ----------------
-#
-# Copyright (C) 2001 Washington University School of Medicine
-# and Howard Hughes Medical Institute
-# All rights reserved
-#
-# Created: 04/24/01
-# Author: Christian M. Zmasek
-# zmasek@genetics.wustl.edu
-# http://www.genetics.wustl.edu/eddy/people/zmasek/
-#
-# Last modified 05/23/02
-
-# Purpose. Extracts AC, DE, and OS from a "trembl.dat" file.
-# The output is used by "rio.pl".
-# If a species list (format: SWISS-PROT-code=full name) is supplied:
-# only sequences from species found in this list are written to
-# outfile and their full species names replaced with their SWISS-PROT
-# code (recommended).
-#
-# Usage. extractTrembl.pl <infile> <outfile> [species list]
-
-# Remark. Need to re-run this if species in species tree or species list
-# are added/changed or if a new version of Pfam is used!!
-
-# Some "heuristic" is required for Synechococcus, Synechocystis, Anabaena:
-# see below.
-
-use strict;
-
-
-my $VERSION = "1.001";
-my $infile = "";
-my $outfile = "";
-my $speciesfile = "";
-my $return_line = "";
-my $read = 0;
-my $ac = "";
-my $de = "";
-my $os = "";
-my %Species_names = (); # full name -> SWISS-PROT name.
-my $i = 0;
-
-if ( @ARGV != 2 && @ARGV != 3 ) {
- &errorInCommandLine();
-}
-
-$infile = $ARGV[ 0 ];
-$outfile = $ARGV[ 1 ];
-
-if ( @ARGV == 3 ) {
- $speciesfile = $ARGV[ 2 ];
- unless ( ( -s $speciesfile ) && ( -f $speciesfile ) && ( -T $speciesfile ) ) {
- die "\n$0: <<$speciesfile>> does not exist, is empty, or is not a plain textfile.\n\n";
- }
- &readSpeciesNamesFile( $speciesfile );
-}
-
-if ( -e $outfile ) {
- die "\n$0: <<$outfile>> already exists.\n\n";
-}
-unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
- die "\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n\n";
-}
-
-open( IN, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n";
-open( OUT, ">$outfile" ) || die "\n$0: Cannot create file <<$outfile>>: $!\n";
-
-print OUT "# extractTrembl.pl version: $VERSION\n";
-print OUT "# trembl.dat file: $infile\n";
-print OUT "# output file : $outfile\n";
-print OUT "# species file : $speciesfile\n";
-print OUT "# date : ".`date`."\n\n";
-
-$read = 0;
-
-while ( $return_line = <IN> ) {
- if ( $return_line =~ /^AC\s+(\S+);/ ) {
- $ac = $1;
- $read = 1;
- }
- elsif ( $return_line =~ /^DE\s+(.+)/ && $read == 1 ) {
- if ( $de ne "" ) {
- $de .= " ".$1;
- }
- else {
- $de = $1;
- }
- }
-
- elsif ( $return_line =~ /^OS\s+(.+)\.\s*$/ && $read == 1 ) {
- $os = $1;
- if ( $speciesfile ne ""
- && ( $os =~ /Synechococcus/
- || $os =~ /Synechocystis/
- || $os =~ /Anabaena/ ) ) {
- if ( $os =~ /PCC\s*(\d+)/ ) {
- $os = "PCC ".$1;
- }
- else {
- $read = 0;
- $ac = "";
- $de = "";
- $os = "";
- next;
- }
- }
- else {
- $os =~ s/\(.+\)//g;
- }
- $os =~ s/^\s+//;
- $os =~ s/\s+$//;
- if ( $speciesfile ne "" ) {
- unless ( exists( $Species_names{ $os } ) ) {
- $read = 0;
- $ac = "";
- $de = "";
- $os = "";
- next;
- }
- $os = $Species_names{ $os };
- }
- }
- elsif ( $return_line =~ /^\/\// && $read == 1 ) {
- $read = 0;
- print OUT "$ac;$de;$os\n";
- $ac = "";
- $de = "";
- $os = "";
- $i++;
- }
-}
-
-close( IN );
-
-print OUT "\n # $i entries.\n";
-
-close( OUT );
-
-exit( 0 );
-
-
-
-# Reads in species file.
-# Format: SWISS-PROT=full name (e.g. "BACSU=Bacillus subtilis")
-# Lines beginning with "#" are ignored.
-# One argument: species file-name
-# Last modified: 04/24/01
-sub readSpeciesNamesFile {
- my $infile = $_[ 0 ];
- my $return_line = "";
- my $sp = "";
- my $full = "";
-
- unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
- die "\n$0: readSpeciesNamesFile: <<$infile>> does not exist, is empty, or is not a plain textfile.\n";
- }
-
- open( IN_RSNF, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n";
- while ( $return_line = <IN_RSNF> ) {
- if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)=(.+)/ ) {
- $sp = $1;
- $full = $2;
- $full =~ s/^\s+//;
- $full =~ s/\s+$//;
- $Species_names{ $full } = $sp;
- }
- }
- close( IN_RSNF );
-
- return;
-}
-
-
-
-sub errorInCommandLine {
- print "\n";
- print " extractTrembl.pl $VERSION\n";
- print " ----------------\n";
- print "\n";
- print " Christian Zmasek (zmasek\@genetics.wustl.edu)\n";
- print "\n";
- print " Purpose. Extracts AC, DE, and OS from a \"trembl.dat\" file.\n";
- print " The resulting output is used by \"rio.pl\".\n";
- print " If a species list (format: SWISS-PROT-code=full name) is supplied:\n";
- print " only sequences from species found in this list are written to\n";
- print " outfile and their full species names replaced with their SWISS-PROT\n";
- print " code (recommended).\n";
- print "\n";
- print " Usage. extractTrembl.pl <infile> <outfile> [species list]\n";
- print "\n";
- print " Remark. Need to re-run this if species in species tree or species list\n";
- print " are added/changed or if a new version of Pfam is used!!\n";
- print "\n\n";
- exit( -1 );
-}
+++ /dev/null
-#!/usr/bin/perl -w
-
-my $in = $ARGV[ 0 ];
-my $out = $ARGV[ 1 ];
-
-if ( -e $out ) {
- print "File $out already exists.\n";
- exit( -1 );
-}
-
-if ( !-e $in ) {
- print "File $in does not exist.\n";
- exit( -1 );
-}
-
-open( IN, $in ) ;
-open ( OUT, ">>$out" ) || die ( "Could not open file $out for writing!\n" );
-
-while ( my $line = <IN> ) {
- my $newline = &proc_line( $line );
- if ( length( $newline ) > 0 ) {
- print OUT $newline;
- }
-}
-
-
-close( OUT ) or die( "can't close $out: $!" );
-close( IN ) or die( "can't close $in: $!" );
-
-sub proc_line {
- my $line = shift;
-
-
- if ( $line =~ /^#/ ) {
- return "";
- }
- if ( $line =~ /^Predicted coding sequence\(s\):/ ) {
- return "";
- }
- elsif ( $line =~ /^>.*_aa\s*$/ ) {
- return "";
- }
- elsif ( $line =~ /^>/ ) {
- return $line;
- }
- elsif ( $line !~ /[a-z]/ ) {
- return "";
- }
- else {
- return $line;;
- }
-}
+++ /dev/null
-#!/usr/bin/perl -W
-
-# $Id: gs_aa_extract.pl,v 1.2 2008/03/09 00:11:50 cmzmasek Exp $
-
-# This extracts the AA sequences from GENSCAN output files
-# Copyright (C) 2008-2009 Christian M. Zmasek
-# All rights reserved
-# Created 2007-07-28 in Winterthur, Switzerland by CMZ
-
-# Usage: gs_aa_extract.pl <genscan-output infile> <outfile>
-
-use strict;
-
-if ( scalar( @ARGV ) != 2 ) {
- print "\ngs_aa_extract.pl <genscan-output infile> <outfile>\n\n";
- exit( -1 );
-}
-
-my $infile = $ARGV[ 0 ];
-my $outfile = $ARGV[ 1 ];
-
-if ( -e $outfile) {
- die "\n$0: \"$outfile\" already exists.\n\n";
-}
-unless( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
- die "\n$0: cannot read from \"$infile\".\n\n";
-}
-
-open( IN, "$infile" ) || die "\n$0: Cannot open file \"$infile\": $!\n";
-open( OUT, ">$outfile" ) || die "\n$0: Cannot create file \"$outfile\": $!\n";
-
-my $line = "";
-my $desc = "";
-
-while ( $line = <IN> ) {
- if ( $line =~ /^>/ ) {
- $desc = $line;
- }
- elsif ( $line =~ /^[A-Z]+$/ ) {
- if ( length( $desc ) > 0 ) {
- print OUT $desc;
- $desc = "";
- }
- print OUT $line;
- }
-}
-
-close( OUT );
-
-print( "\nOK\n" );
-
-exit( 0 );
-
+++ /dev/null
-#!/usr/bin/perl -W
-
-# makeTree.pl
-# -----------
-# Copyright (C) 1999-2003 Washington University School of Medicine
-# and Howard Hughes Medical Institute
-# All rights reserved
-#
-# Author: Christian M. Zmasek
-# zmasek@genetics.wustl.edu
-# http://www.genetics.wustl.edu/eddy/people/zmasek/
-#
-# Last modified 04/06/04
-#
-#
-# Requirements makeTree is part of the RIO/FORESTER suite of programs.
-# ------------ Many of its global variables are set via rio_module.pm.
-#
-#
-# Note. Use xt.pl (for Pfam alignments) or mt.pl (for other alignments)
-# to run makeTree.pl on whole directories of alignments files.
-#
-#
-#
-# Usage
-# -----
-#
-# Tree calculation based on a Pfam/Clustal W alignment
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#
-# makeTree.pl [-options] <input alignment in SELEX (Pfam), PHYLIP
-# sequential format, or Clustal W output> <outputfile>
-# [path/name for temporary directory to be created]
-#
-# Example:
-# "% makeTree.pl -UTB1000S41NDXV /DB/PFAM/Full/IL5 IL5_tree"
-#
-#
-# Tree calculation based on precalculated pairwise distances
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# Consensus tree will have no branch length values.
-# Precalculated pairwise distances are the output of "pfam2pwd.pl",
-# number of bootstraps needs to match the one used for the pwds.
-#
-# makeTree.pl <-options, includes "F"> <pwdfile: boostrapped pairwise
-# distances> <outputfile> [path/name for temporary directory
-# to be created]
-#
-# Example:
-# "% makeTree.pl -FB100S21XV /pfam2pwd_out/IL5.pwd IL5_tree"
-#
-#
-# Tree calculation based on precalculated pairwise distances
-# and matching alignment
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# Consensus tree will have branch length values.
-# Precalculated pairwise distances and the matching (processed)
-# alignment are the output of "pfam2pwd.pl", number of bootstraps
-# needs to match the one used for the pwds, matrix needs to match
-# the one used for the pwds.
-#
-# makeTree.pl <-options, includes "UF"> <pwdfile: boostrapped pairwise
-# distances> <alnfile: corresponding alignment>
-# <outputfile> [path/name for temporary directory to be created]
-#
-# Example:
-# "% makeTree.pl -UFLB100S21XV /pfam2pwd_out/IL5.pwd /pfam2pwd_out/IL5.aln IL5_tree"
-#
-#
-# Options
-# -------
-#
-# N : Suggestion to remove columns in the alignment which contain gaps.
-# Gaps are not removed, if, after removal of gaps, the resulting alignment would
-# be shorter than $MIN_NUMBER_OF_AA. Default is not to remove gaps.
-# Bx : Number of bootstrapps. B0: do not bootstrap. Default is 100 bootstrapps.
-# The number of bootstrapps should be divisible by 10.
-# U : Use TREE-PUZZLE to calculate ML branchlengths for consesus tree, in case of
-# bootstrapped analysis.
-# J : Use JTT matrix (Jones et al. 1992) in TREE-PUZZLE, default: PAM.
-# L : Use BLOSUM 62 matrix (Henikoff-Henikoff 92) in TREE-PUZZLE, default: PAM.
-# M : Use mtREV24 matrix (Adachi-Hasegawa 1996) inTREE-PUZZLE, default: PAM.
-# W : Use WAG matrix (Whelan-Goldman 2000) in TREE-PUZZLE, default: PAM.
-# T : Use VT matrix (Mueller-Vingron 2000) in TREE-PUZZLE, default: PAM.
-# P : Let TREE-PUZZLE choose which matrix to use, default: PAM.
-# E : Exact parameter estimates in TREE-PUZZLE, default: Approximate.
-# Model of rate heterogeneity in TREE-PUZZLE (default: Uniform rate)
-# g : 8 Gamma distributed rates
-# t : Two rates (1 invariable + 1 variable)
-# m : Mixed (1 invariable + 8 Gamma rates)
-# R : Randomize input order in PHYLIP NEIGHBOR.
-# A : Use PHYLIP PROTPARS instead of NEIGHBOR (and no pairwise distance calculation).
-# jx : Number of jumbles when using PHYLIP PROTPARS (random seed set with Sx).
-# Sx : Seed for random number generator(s). Must be 4n+1. Default is 9.
-# X : To keep multiple tree file (=trees from bootstrap resampled alignments).
-# D : To keep (and create in case of bootstrap analysis) pairwise distance matrix file.
-# This is created form the not resampled (original) alignment.
-# C : Calculate pairwise distances only (no tree). Bootstrap is always 1.
-# No other files are generated.
-# F : Pairwise distance (pwd) file as input (instead of alignment).
-# No -D, -C, and -N options available in this case.
-# V : Verbose.
-# # : Only for rio.pl: Do not calculate consensus tree ("I" option in rio.pl).
-#
-#
-#
-# History:
-# -------
-#
-# 09/06/03: Added "#" option (to be used only for rio.pl).
-# 03/24/04: Do not replace "?" with "-" in method pfam2phylip.
-
-
-use strict;
-
-use FindBin;
-use lib $FindBin::Bin;
-use rio_module2;
-
-my $VERSION = "4.210";
-
-my $TEMP_DIR_DEFAULT = "/tmp/maketree"; # Where all the infiles, outfiles, etc will be created.
-
-my $remove_gaps = 0; # 0: do not remove gaps; 1: remove gaps
-my $bootstraps = 100; # 0,1: do not bootstrap. Default: 100
-my $puzzle_consensus_tree = 0; # 0: no; 1: yes. No is default.
-my $matrix = 1; # 0 = JTT
- # 1 = PAM - default
- # 2 = BLOSUM 62
- # 3 = mtREV24
- # 5 = VT
- # 6 = WAG
- # 7 = auto
-my $rate_heterogeneity = 0; # 0 = Uniform rate (default)
- # 1 = 8 Gamma distributed rates
- # 2 = Two rates (1 invariable + 1 variable)
- # 3 = Mixed (1 invariable + 8 Gamma rates)
-my $randomize_input_order = 0; # 0: do not randomize input order; 1 jumble
-my $seed = 9; # Seed for random number generators. Default: 9
-my $keep_multiple_trees = 0; # 0: delete multiple tree file
- # 1: do not delete multiple tree file
-my $keep_distance_matrix = 0; # 1: (create and) keep; 0: do not (create and) keep
-my $verbose = 0; # 0: no; 1: yes
-my $pairwise_dist_only = 0; # 0: no; 1: yes
-my $start_with_pwd = 0; # 0: no; 1: yes
-my $start_with_pwd_and_aln = 0; # 0: no; 1: yes
-my $no_consenus_tree = 0; # 0: no; 1: yes
-my $exact_parameter_est = 0; # 0: no; 1: yes
-my $use_protpars = 0; # 0: no; 1: yes
-my $protpars_jumbles = 0;
-
-my %seqnames = (); # number => seqname
-my %numbers = (); # seqname => number
-my $options = "";
-my $infile = "";
-my $pwdfile = "";
-my $outfile = "";
-my $outfilenhx = "";
-my $logfile = "";
-my $alignfile = "";
-my $multitreefile = "";
-my $distancefile = "";
-my $log = "";
-my $number_of_aa = 0;
-my $orig_length = 0;
-my $ii = 0;
-my $temp_dir = "";
-my $current_dir = "";
-my @out = ();
-my $number_of_seqs = 0;
-
-
-
-unless ( @ARGV == 2 || @ARGV == 3 || @ARGV == 4 || @ARGV == 5 ) {
- &printUsage();
- exit ( -1 );
-}
-
-
-
-# Analyzes the options:
-# ---------------------
-
-if ( $ARGV[ 0 ] =~ /^-.+/ ) {
-
- unless ( @ARGV > 2 ) {
- &printUsage();
- exit ( -1 );
- }
- $options = $ARGV[ 0 ];
-
- if ( $options =~ /F/ && $options !~ /U/ ) {
- if ( @ARGV != 3 && @ARGV != 4 ) {
- &printUsage();
- exit ( -1 );
-
- }
- $start_with_pwd = 1;
- $infile = "";
- $pwdfile = $ARGV[ 1 ];
-
- $outfile = $ARGV[ 2 ];
- if ( @ARGV == 4 ) {
- $temp_dir = $ARGV[ 3 ];
- }
-
- }
- elsif ( $options =~ /F/ && $options =~ /U/ ) {
- if ( @ARGV != 4 && @ARGV != 5 ) {
- &printUsage();
- exit ( -1 );
- }
- $start_with_pwd = 1;
- $start_with_pwd_and_aln = 1;
- $pwdfile = $ARGV[ 1 ];
- $infile = $ARGV[ 2 ];
- $outfile = $ARGV[ 3 ];
- if ( @ARGV == 5 ) {
- $temp_dir = $ARGV[ 4 ];
- }
-
- }
- else {
- if ( @ARGV != 3 && @ARGV != 4 ) {
- &printUsage();
- exit ( -1 );
- }
- $infile = $ARGV[ 1 ];
- $outfile = $ARGV[ 2 ];
- if ( @ARGV == 4 ) {
- $temp_dir = $ARGV[ 3 ];
- }
- }
-
- if ( $options =~ /N/ && $start_with_pwd != 1 ) {
- $remove_gaps = 1; # do remove gaps
- }
- if ( $options =~ /B(\d+)/ ) {
- $bootstraps = $1;
- if ( $bootstraps <= 1 ) {
- $bootstraps = 0;
- }
- elsif ( $bootstraps <= 9 ) {
- $bootstraps = 0;
- print "\n\nMAKETREE: WARNING: Bootstrap number must be devisable by 10,\nno bootstrapping.\n\n";
- }
- elsif ( $bootstraps % 10 != 0 ) {
- $bootstraps = $bootstraps - $bootstraps % 10; # to ensure $bootstraps % 10 == 0
- print "\n\nMAKETREE: WARNING: Bootstrap number must be devisable by 10,\nhas been set to $bootstraps.\n\n";
- }
- }
- if ( $options =~ /A/ ) {
- $use_protpars = 1 # PROTPARS
- }
- if ( $options =~ /j(\d+)/ ) {
- $protpars_jumbles = $1;
- if ( $protpars_jumbles < 0 ) {
- $protpars_jumbles = 0;
- }
- }
- if ( $options =~ /J/ ) {
- $matrix = 0; # JTT
- }
- if ( $options =~ /L/ ) {
- $matrix = 2; # Blossum
- }
- if ( $options =~ /M/ ) {
- $matrix = 3; # mtREV24
- }
- if ( $options =~ /T/ ) {
- $matrix = 5; # VT
- }
- if ( $options =~ /W/ ) {
- $matrix = 6; # WAG
- }
- if ( $options =~ /P/ ) {
- $matrix = 7; # auto
- }
- if ( $options =~ /R/ ) {
- $randomize_input_order = 1;
- }
- if ( $options =~ /S(\d+)/ ) {
- $seed = $1;
- }
- if ( $options =~ /U/ ) {
- $puzzle_consensus_tree = 1;
- }
- if ( $options =~ /X/ ) {
- $keep_multiple_trees = 1;
- }
- if ( $options =~ /D/ && $start_with_pwd != 1 ) {
- $keep_distance_matrix = 1;
- }
- if ( $options =~ /V/ ) {
- $verbose = 1;
- }
- if ( $options =~ /C/ && $start_with_pwd != 1 ) {
- $pairwise_dist_only = 1;
- }
- if ( $options =~ /E/ ) {
- $exact_parameter_est = 1;
- }
- if ( $options =~ /g/ ) {
- $rate_heterogeneity = 1;
- }
- if ( $options =~ /t/ ) {
- $rate_heterogeneity = 2;
- }
- if ( $options =~ /m/ ) {
- $rate_heterogeneity = 3;
- }
- if ( $options =~ /#/ ) {
- $no_consenus_tree = 1;
- }
- if ( $protpars_jumbles > 0 && $use_protpars != 1 ) {
- &printUsage();
- exit ( -1 );
- }
- if ( $use_protpars == 1 ) {
- if ( $randomize_input_order >= 1
- || $start_with_pwd == 1
- || $keep_distance_matrix == 1
- || $pairwise_dist_only == 1 ) {
- &printUsage();
- exit ( -1 );
- }
- if ( $bootstraps > 1 && $protpars_jumbles < 1 ) {
- $protpars_jumbles = 1;
- }
- }
-
-}
-
-else {
- unless ( @ARGV == 2 || @ARGV == 3 ) {
- &printUsage();
- exit ( -1 );
- }
- $infile = $ARGV[ 0 ];
- $outfile = $ARGV[ 1 ];
- if ( @ARGV == 3 ) {
- $temp_dir = $ARGV[ 2 ];
- }
-}
-
-
-
-
-$current_dir = `pwd`;
-$current_dir =~ s/\s//;
-
-if ( $outfile !~ /^\// ) {
- # outfile is not absolute path.
- $outfile = $current_dir."/".$outfile;
-}
-
-
-
-if ( $pairwise_dist_only == 1 ) {
- $bootstraps = 0;
- $keep_multiple_trees = 0;
- $puzzle_consensus_tree = 0;
- $randomize_input_order = 0;
- $start_with_pwd = 0;
- $keep_distance_matrix = 1;
-}
-
-if ( $bootstraps < 2 ) {
- $no_consenus_tree = 0;
-}
-
-# TREE-PUZZLE sets the option in this way:
-# If two rates or mixed, exact parameter estimates are used.
-if ( $rate_heterogeneity == 2
-|| $rate_heterogeneity == 3 ) {
- $exact_parameter_est = 1
-}
-
-$logfile = $outfile.$LOG_FILE_SUFFIX;
-$alignfile = $outfile.$ALIGN_FILE_SUFFIX;
-$multitreefile = $outfile.$MULTIPLE_TREES_FILE_SUFFIX;
-$distancefile = $outfile.$SUFFIX_PWD_NOT_BOOTS;
-
-if ( $outfile =~ /\.nhx$/i ) {
- $outfilenhx = $outfile;
- $logfile =~ s/\.nhx//i;
- $alignfile =~ s/\.nhx//i;
- $outfile =~ s/\.nhx//i;
- $multitreefile =~ s/\.nhx//i;
- $distancefile =~ s/\.nhx//i;
-}
-else {
- $outfilenhx = $outfile.".nhx";
-}
-
-if ( -e $outfilenhx ) {
- die "\n\nmakeTree: \"$outfilenhx\" already exists.\n\n";
-}
-if ( $infile ne "" ) {
- unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
- die "\n\nmakeTree: Input alignment file \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n";
- }
-}
-if ( $start_with_pwd == 1 ) {
- unless ( ( -s $pwdfile ) && ( -f $pwdfile ) && ( -T $pwdfile ) ) {
- die "\n\nmakeTree: Pairwise distance file \"$pwdfile\" does not exist, is empty, or is not a plain textfile.\n\n";
- }
-}
-
-
-
-# Prints out the options:
-# -----------------------
-
-
-$log = "\n$0 logfile:\n";
-$log = $log."Version: $VERSION\n\n";
-
-
-if ( $start_with_pwd == 1 ) {
- $log = $log."Input pairwise distance file (bootstrapped): $pwdfile\n";
-}
-if ( $infile ne "" ) {
- $log = $log."Input alignment : $infile\n";
-}
-
-if ( $no_consenus_tree != 1 ) {
- $log = $log."Output tree file : $outfilenhx\n";
-}
-
-if ( $keep_multiple_trees == 1 && $bootstraps >= 2 ) {
- $log = $log."Output multiple trees file : $multitreefile\n";
-}
-if ( $keep_distance_matrix ) {
- $log = $log."Output pairwise distance file : $distancefile\n";
-}
-
-$log = $log."Bootstraps : $bootstraps\n";
-
-if ( $start_with_pwd != 1 && $use_protpars != 1 ) {
- $log = $log."Prgrm to calculate pairwise dist. : TREE-PUZZLE\n";
-}
-
-if ( $use_protpars == 1 ) {
- $log = $log."Program to calculate tree : PHYLIP PROTPARS\n";
- $log = $log."Number of jumbles in PROTPARS : $protpars_jumbles\n";
-}
-else {
- $log = $log."Program to calculate tree : PHYLIP NEIGHBOR (NJ)\n";
-}
-
-if ( $puzzle_consensus_tree == 1 ) {
- $log = $log."Prgrm to calculate ML branch lenghts: TREE-PUZZLE\n";
-}
-if ( $puzzle_consensus_tree == 1 || $start_with_pwd != 1 ) {
- $log = $log."Model : ";
- if ( $matrix == 0 ) {
- $log = $log."JTT (Jones et al. 1992)\n";
- }
- elsif ( $matrix == 2 ) {
- $log = $log."BLOSUM 62 (Henikoff-Henikoff 92)\n";
- }
- elsif ( $matrix == 3 ) {
- $log = $log."mtREV24 (Adachi-Hasegawa 1996)\n";
- }
- elsif ( $matrix == 5 ) {
- $log = $log."VT (Mueller-Vingron 2000)\n";
- }
- elsif ( $matrix == 6 ) {
- $log = $log."WAG (Whelan-Goldman 2000)\n";
- }
- elsif ( $matrix == 7 ) {
- $log = $log."auto\n";
- }
- else {
- $log = $log."PAM (Dayhoff et al. 1978)\n";
- }
-}
-$log = $log."Model of rate heterogeneity : ";
-if ( $rate_heterogeneity == 1 ) {
- $log = $log."8 Gamma distributed rates\n";
-}
-elsif ( $rate_heterogeneity == 2 ) {
- $log = $log."Two rates (1 invariable + 1 variable)\n";
-}
-elsif ( $rate_heterogeneity == 3 ) {
- $log = $log."Mixed (1 invariable + 8 Gamma rates)\n";
-}
-else {
- $log = $log."Uniform rate\n";
-}
-if ( $randomize_input_order >= 1 ) {
- $log = $log."Randomize input order in NEIGHBOR : yes\n";
-}
-$log = $log."Seed for random number generators : $seed\n";
-if ( $exact_parameter_est == 1 ) {
- $log = $log."Exact parameter estimates in TREE-PUZZLE\n";
-}
-
-$log = $log."Start time/date : ".`date`;
-
-
-
-
-# That's where the mischief starts....
-# ------------------------------------
-
-$ii = 0;
-
-my $time_st = time;
-
-if ( $temp_dir eq "" ) {
- $temp_dir = $TEMP_DIR_DEFAULT;
-}
-
-$temp_dir = $temp_dir.$time_st.$ii;
-
-while ( -e $temp_dir ) {
- $ii++;
- $temp_dir = $temp_dir.$time_st.$ii;
-}
-
-mkdir( $temp_dir, 0700 )
-|| die "\n\n$0: Unexpected error: Could not create <<$temp_dir>>: $!\n\n";
-
-unless ( ( -e $temp_dir ) && ( -d $temp_dir ) ) {
- die "\n\n$0: Unexpected error: <<$temp_dir>> does not exist, or is not a directory.\n\n";
-}
-
-
-if ( $start_with_pwd != 1 ) {
- system( "cp", $infile, $temp_dir."/INFILE" );
- unless ( chmod ( 0600, $temp_dir."/INFILE" ) ) {
- warn "\n\n$0: Could not chmod. $!\n\n";
- }
- $infile = "INFILE";
-}
-
-
-chdir ( $temp_dir )
-|| die "\n\n$0: Unexpected error: Could not chdir to <<$temp_dir>>: $!\n\n";
-
-
-if ( $start_with_pwd != 1 ) {
-
- @out = &DoPfam2phylip( $infile, $alignfile, $remove_gaps );
- $number_of_aa = $out[ 0 ];
- $orig_length = $out[ 1 ];
- $number_of_seqs = $out[ 2 ];
-
- system( "cp", $alignfile, "infile" );
-
- if ( $use_protpars != 1 ) {
- # Calculating the pairwise distances (saved in file "infile"): "puzzle"
-
- system( "cp", $alignfile, "align" );
-
- if ( $bootstraps > 1 ) {
-
- &executeSeqboot( $seed, $bootstraps );
-
- if ( $keep_distance_matrix ) {
- system( "mv", "outfile", "outfile___" );
- system( "cp", "align", "infile" );
- &executePuzzle( "infile",
- $matrix,
- $exact_parameter_est,
- $rate_heterogeneity );
- system( "mv", "infile.dist", $distancefile );
- system( "mv", "outfile___", "outfile" );
- }
- unlink( "infile" ); # Necessary, since "infile" is puzzle's default input.
- system( "mv", "outfile", "IN" );
-
- &executePuzzleBootstrapped( "IN",
- $matrix,
- $exact_parameter_est,
- $rate_heterogeneity );
-
- $pwdfile = "IN.dist";
-
- }
- else {
-
- &executePuzzle( "infile",
- $matrix,
- $exact_parameter_est,
- $rate_heterogeneity );
-
- if ( $keep_distance_matrix ) {
- system( "cp outdist $distancefile" );
- }
- $pwdfile = "infile.dist";
- }
-
- unlink( "infile.tree" );
-
- if ( $pairwise_dist_only == 1 ) {
- unlink( "infile", "align", "INFILE", "outdist", $alignfile );
- chdir( $current_dir )
- || die "\n\n$0: Unexpected error: Could not chdir to <<$current_dir>>: $!\n\n";
-
- rmdir( $temp_dir )
- || die "\n\n$0: Unexpected error: Could not remove <<$temp_dir>>: $!\n\n";
-
- print "\n\n$0 finished.\n\n";
- print "Output pairwise distance file written as: $distancefile\n\n";
- print "\n\nmakeTree successfully terminated.\n\n";
- exit( 0 );
- }
-
- } ## if ( $use_protpars != 1 )
-
-} ## if ( $start_with_pwd != 1 )
-
-
-# Calculating the tree (saved in file "infile"):
-
-if ( $use_protpars != 1 ) {
- unlink( "infile" );
- &executeNeighbor( $pwdfile, $bootstraps, $randomize_input_order, $seed, 1 );
-}
-else {
- if ( $bootstraps > 1 ) {
- &executeSeqboot( $seed, $bootstraps );
- unlink( "infile" );
- system( "mv", "outfile", "infile" );
- }
- &executeProtpars( "infile", $bootstraps, $protpars_jumbles, $seed );
-}
-
-unlink( "outfile" );
-
-if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) {
-
- system( "cp", "outtree", $multitreefile );
-}
-
-
-system( "mv", "outtree", "intree" );
-
-if ( $bootstraps > 1 ) {
- if ( $no_consenus_tree != 1 ) {
-
- # Consense:
- &executeConsense( "intree" );
-
- if ( $puzzle_consensus_tree == 1 ) {
-
- system( "cp", "outtree", "treefile_consense" );
- system( "mv", "outtree", "intree" );
-
- # Puzzle for ML branch lenghts:
- # The alignment is read from infile by default.
- # The tree is read from intree by default.
-
- if ( $start_with_pwd_and_aln == 1 ) {
- &pfam2phylipMatchOnly( $infile,
- "infile",
- 1 );
- }
- elsif ( $use_protpars != 1 ) {
- system( "mv", "align", "infile" ); # align = original alignment in phylip interleaved.
- }
-
- &executePuzzleToCalculateBranchLenghts( $matrix,
- $exact_parameter_est,
- $rate_heterogeneity );
-
- unlink( "outfile", "outdist" );
- system( "mv", "outtree", "outree_puzzle" );
-
- # Transfer
- &executeTransfersBranchLenghts( "outree_puzzle", "treefile_consense", $outfilenhx );
-
- }
- else {
- unlink( "outfile", "align" );
- system( "mv", "outtree", $outfilenhx );
- }
- }
- else {
- unlink( "outfile", "align" );
-
- }
-}
-else {
- unlink( "align", "infile.dist" );
- if ( $start_with_pwd != 1 ) {
- system( "mv intree $outfilenhx" );
- }
-
-}
-
-
-unlink( "treefile_consense", "outtree", "outree_puzzle",
- "infile", "intree", "align", "INFILE", "IN", "IN.dist", "outdist" );
-
-
-$log = $log."Finish time/date : ".`date`;
-
-if ( $start_with_pwd != 1 ) {
- $log = $log."Removed gaps : ";
- if ( $remove_gaps == 1 ) {
- $log = $log."yes\n";
- }
- else {
- $log = $log."no\n";
- }
- $log = $log."Columns in alignment used : $number_of_aa\n";
- $log = $log."Columns in original alignment : $orig_length\n";
- $log = $log."Number of sequences in alignment : $number_of_seqs\n";
-}
-
-
-open( OUT, ">$logfile" ) || die "\n$0: Cannot create file <<$logfile>>: $!\n";
-print OUT $log;
-close( OUT );
-
-
-chdir( $current_dir )
-|| die "\n\n$0:Unexpected error: Could not chdir to <<$current_dir>>: $!\n\n";
-
-
-rmdir( $temp_dir )
-|| die "\n\n$0:Unexpected error: Could not remove <<$temp_dir>>: $!\n\n";
-
-if ( $verbose == 1 ) {
- print "\n\n$0 finished.\n";
- if ( $no_consenus_tree != 1 ) {
- print "Output tree written as : $outfilenhx\n";
- }
- print "Log written as : $logfile\n";
- if ( $start_with_pwd != 1 ) {
- print "Alignment written as : $alignfile\n";
- }
- if ( $keep_multiple_trees == 1 && $bootstraps >= 2 ) {
- print "Multiple trees written as : $multitreefile\n";
- }
- if ( $keep_distance_matrix ) {
- print "Distance matrix written as: $distancefile\n";
- }
-}
-
-
-exit( 0 );
-
-
-
-
-
-# Methods:
-# --------
-
-
-
-
-# Executes pfam2phylip.
-# If resulting alignment is too short due to the removal
-# of gaps, is does not remove gaps.
-# Three arguments:
-# 1. infile
-# 2. outfile
-# 3. remove gaps: 1 to remove gaps; 0: do not remove gaps
-# Last modified: 06/04/01
-sub DoPfam2phylip {
- my $in = $_[ 0 ];
- my $out = $_[ 1 ];
- my $option = $_[ 2 ];
- my $aa = 0;
- my @output = ();
-
- if ( $option == 1 ) {
- @output = &pfam2phylip( $in, $out, 1 );
- $aa = $output[ 0 ];
- if ( $aa < 0 ) {
- die "\n\n$0: DoPfam2phylip: Unexpected error.\n\n";
- }
- if ( $aa < $MIN_NUMBER_OF_AA ) {
- unlink( $out );
- $option = 0;
- $remove_gaps = 0;
- }
- }
- if ( $option == 0 ) { # Must be another "if" (no elsif of else)!
- @output = &pfam2phylip( $in, $out, 2 );
- # 2 is to substitute non-letters with "-" in the sequence.
- $aa = $output[ 0 ];
- if ( $aa <= 0 ) {
- die "\n\n$0: DoPfam2phylip: Unexpected error.\n\n";
- }
- }
- return @output;
-}
-
-
-
-# Two arguments:
-# 1. seed for random number generator
-# 2. number of bootstraps
-# Reads in "infile" by default.
-sub executeSeqboot {
-
- my $s = $_[ 0 ];
- my $bs = $_[ 1 ];
- my $verb = "";
-
- &testForTextFilePresence( $infile );
-
- if ( $verbose != 1 ) {
- $verb = "
-2";
- }
-
-
- system( "$SEQBOOT << !
-r
-$bs$verb
-Y
-$s
-!" )
- && die "$0: Could not execute \"$SEQBOOT\"";
-
- return;
-
-}
-
-
-
-
-# One/two/three argument(s):
-# Reads in tree from "intree" by default. (Presence of "intree" automatically
-# switches into "User defined trees" mode.)
-# 1. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
-# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise
-# 2. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise
-# 3. Model of rate heterogeneity:
-# 1 for "8 Gamma distributed rates"
-# 2 for "Two rates (1 invariable + 1 variable)"
-# 3 for "Mixed (1 invariable + 8 Gamma rates)"
-# otherwise: Uniform rate
-# Last modified: 09/08/03 (added 2nd and 3rd parameter)
-sub executePuzzleToCalculateBranchLenghts {
- my $matrix_option = $_[ 0 ];
- my $parameter_estimates_option = $_[ 1 ];
- my $rate_heterogeneity_option = $_[ 2 ];
- my $i = 0;
- my $mat = "";
- my $est = "";
- my $rate = "";
-
- unless ( ( -s "infile" ) && ( -f "infile" ) && ( -T "infile" ) ) {
- die "\n$0: executePuzzleToCalculateBranchLenghts: <<infile>> does not exist, is empty, or is not a plain textfile.\n";
- }
- unless ( ( -s "intree" ) && ( -f "intree" ) && ( -T "intree" ) ) {
- die "\n$0: executePuzzleToCalculateBranchLenghts: <<intree>> does not exist, is empty, or is not a plain textfile.\n";
- }
-
- $mat = setModelForPuzzle( $matrix_option );
- if ( $parameter_estimates_option ) {
- $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option );
- }
- if ( $rate_heterogeneity_option ) {
- $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option );
- }
-
- system( "$PUZZLE << !
-$mat$est$rate
-x
-y
-!" )
- && die "$0: Could not execute \"$PUZZLE\"";
-
- return;
-
-}
-
-
-
-
-
-
-
-# Three/four arguments:
-# 1. Name of file containing tree with correct branch lengths
-# 2. Name of file containing tree with correct bootstraps
-# 3. Outputfilename
-# 4. R to reroot both trees in the same manner (use for FITCH,
-# since this changes to rooting.
-sub executeTransfersBranchLenghts {
- my $tree_with_bl = $_[ 0 ];
- my $tree_with_bs = $_[ 1 ];
- my $out = $_[ 2 ];
- my $reroot = $_[ 3 ];
- my $R = "";
-
- if ( $reroot && $reroot eq "R" ) {
- $R = "R";
- }
-
- &testForTextFilePresence( $tree_with_bl );
- &testForTextFilePresence( $tree_with_bs );
-
- system( "$TRANSFERSBRANCHLENGHTS $tree_with_bl $tree_with_bs $out $R" )
- && die "$0: Could not execute \"$TRANSFERSBRANCHLENGHTS $tree_with_bl $tree_with_bs $out $R\"";
-
-
- return;
-}
-
-
-
-# Called by method DoPfam2phylip.
-# This reads a multiple sequence alignment file in Pfam format,
-# Phylip's sequential format, or ClustalW (".aln")output and saves them
-# in Phylip's sequential or interleaved format.
-# (Those two are the same in this case, since all the seqs will be
-# one line in length (no returns)).
-# It returns (1st) the number of aa (columns) in the resulting
-# alignment and the (2nd) number of aa (columns) in the original
-# alignment.
-#
-# Reads a file containing a sequence alignment in the following format
-# (as used in Pfam):
-# #comments <- empty lines and lines begining with # (not mandatory)
-# name1 kal
-# name2 kal
-# <- at least one empty line between blocks
-# name1 kale
-# name2 k.le
-#
-# Saves it in the "sequential" format of phylip:
-# number of OTUs length of aa seqs
-# name1 kalkale
-# name2 kalk-le
-#
-# Three arguments:
-# 1. infile name
-# 2. outfile name
-# 3. 1 : Removes colums with a gap (non-letter character)
-# 2 : Substitutes non-letter characters (except "?") in the sequence with "-".
-#
-# Last modified: 03/24/04
-# Changes:
-# 03/24/04: Do not replace "?" with "-"
-#
-sub pfam2phylip {
-
- my $infile = $_[ 0 ];
- my $outfile = $_[ 1 ];
- my $options = $_[ 2 ]; # 1: remove gaps; 2: non-letters (except "?") -> "-"
- my $return_line = "";
- my $x = 0;
- my $y = 0;
- my $x_offset = 0;
- my $original_length = 0;
- my @seq_name = ();
- my @seq_array = ();
- my $seq = "";
- my $max_x = 0;
- my $max_y = 0;
- my $m = 0;
- my $n = 0;
- my $i = 0;
- my $move = 0;
- my $saw_a_sequence_line = 0;
-
- if ( -e $outfile ) {
- die "\n$0: pfam2phylip: <<$outfile>> already exists.\n";
- }
-
- &testForTextFilePresence( $infile );
-
- open( INPP, "$infile" ) || die "\n$0: pfam2phylip: Cannot open file <<$infile>>: $!\n";
-
- until ( $return_line !~ /^\s*\S+\s+\S+/ && $saw_a_sequence_line == 1 ) {
- if ( $return_line =~ /^\s*\S+\s+\S+/
- && $return_line !~ /^\s*#/
- && $return_line !~ /^\s*\d+\s+\d+/
- && $return_line !~ /^\s*CLUSTAL/ ) {
- $saw_a_sequence_line = 1;
- $return_line =~ /^\s*(\S+)\s+(\S+)/;
- $seq_name[ $y ] = $1;
- $seq = $2;
- $seq_name[ $y ] = substr( $seq_name[ $y ], 0, $LENGTH_OF_NAME );
-
- for ( $x = 0; $x <= length( $seq ) - 1; $x++ ) {
- $seq_array[ $x ][ $y ] = substr( $seq, $x, 1 );
- }
- if ( $x_offset < length( $seq ) ) {
- $x_offset = length( $seq );
- }
- $y++;
- }
- $return_line = <INPP>;
- if ( !$return_line ) {
- last;
- }
- }
-
- $max_y = $y;
- $y = 0;
- $max_x = 0;
-
- while ( $return_line = <INPP> ) {
- if ( $return_line =~ /^\s*(\S+)\s+(\S+)/
- && $return_line !~ /^\s*#/
- && $return_line !~ /^\s*\d+\s+\d+/ ) {
- $return_line =~ /^\s*\S+\s+(\S+)/;
- $seq = $1;
- for ( $x = 0; $x <= length( $seq ) - 1; $x++ ) {
- $seq_array[ $x + $x_offset ][ $y % $max_y ] = substr( $seq, $x, 1 );
- }
- if ( $max_x < length( $seq ) ) {
- $max_x = length( $seq );
- }
- $y++;
- if ( ( $y % $max_y ) == 0 ) {
- $y = 0;
- $x_offset = $x_offset + $max_x;
- $max_x = 0;
- }
- }
- }
- $original_length = $x_offset;
- close( INPP );
-
-
- # Removes "gap-columns" (gaps = everything except a-z characters):
- if ( $options == 1 ) {
- $move = 0;
-
- COLUMN: for ( $x = 0; $x <= $x_offset - 1; $x++ ) { # goes through all aa positions (columns)
-
- for ( $y = 0; $y <= $max_y - 1; $y++ ) { # goes through all aas in a particular position
-
- unless ( $seq_array[ $x ][ $y ] && $seq_array[ $x ][ $y ] =~ /[a-z]/i ) {
- $move++;
- next COLUMN;
- }
- }
-
- # If this point is reached, column must be OK = no gaps.
- if ( $move >= 1 ) {
- for ( $m = 0; $m <= $max_y; $m++ ) {
- for ( $n = $x; $n <= $x_offset; $n++ ) {
- $seq_array[ $n - $move ][ $m ] = $seq_array[ $n ][ $m ];
- }
- }
- $x_offset = $x_offset - $move;
- $x = $x - $move;
- $move = 0;
- }
- }
- if ( $move >= 1 ) {
- for ( $m = 0; $m <= $max_y; $m++ ) {
- for ( $n = $x; $n <= $x_offset; $n++ ) {
- $seq_array[ $n - $move ][ $m ] = $seq_array[ $n ][ $m ];
- }
- }
- $x_offset = $x_offset - $move;
- $x = $x - $move;
- $move = 0;
- }
- }
-
-
- # Writes the file:
-
- open( OUTPP, ">$outfile" ) || die "\n$0: pfam2phylip: Cannot create file <<$outfile>>: $!\n";
- print OUTPP "$max_y $x_offset\n";
- for ( $y = 0; $y < $max_y; $y++ ) {
- print OUTPP "$seq_name[ $y ]";
- for ( $i = 0; $i <= ( $LENGTH_OF_NAME - length( $seq_name[ $y ] ) - 1 ); $i++ ) {
- print OUTPP " ";
- }
- for ( $x = 0; $x <= $x_offset - 1; $x++ ) {
- if ( $options == 2 ) {
- if ( $seq_array[ $x ][ $y ] ) {
- $seq_array[ $x ][ $y ] =~s /[^a-zA-Z\?]/-/;
- }
- else {
- $seq_array[ $x ][ $y ] = "-";
- }
- }
- print OUTPP "$seq_array[ $x ][ $y ]";
- }
- print OUTPP "\n";
- }
- close( OUTPP );
-
- return $x_offset, $original_length, $max_y;
-
-} ## pfam2phylip
-
-
-
-
-sub printUsage {
-
- print "\n";
- print " makeTree.pl version $VERSION\n";
- print " -----------\n";
-
- print <<END;
-
- Copyright (C) 1999-2003 Washington University School of Medicine
- and Howard Hughes Medical Institute
- All rights reserved
-
- Author: Christian M. Zmasek
- zmasek\@genetics.wustl.edu
- http://www.genetics.wustl.edu/eddy/people/zmasek/
-
- Last modified 09/06/03
-
-
- Requirements makeTree is part of the RIO/FORESTER suite of programs.
- ------------ Many of its global variables are set via rio_module.pm.
-
-
-
- Note. Use xt.pl (for Pfam alignments) or mt.pl (for other alignments)
- to run makeTree.pl on whole directories of alignments files.
-
-
-
- Usage
- -----
-
- Tree calculation based on a Pfam/Clustal W alignment
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
- makeTree.pl [-options] <input alignment in SELEX (Pfam), PHYLIP
- sequential format, or Clustal W output> <outputfile>
- [path/name for temporary directory to be created]
-
- Example:
- "% makeTree.pl -UTB1000S41NDXV /DB/PFAM/Full/IL5 IL5_tree"
-
-
- Tree calculation based on precalculated pairwise distances
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Consensus tree will have no branch length values.
- Precalculated pairwise distances are the output of "pfam2pwd.pl",
- number of bootstraps needs to match the one used for the pwds.
-
- makeTree.pl <-options, includes "F"> <pwdfile: boostrapped pairwise
- distances> <outputfile> [path/name for temporary directory
- to be created]
-
- Example:
- "% makeTree.pl -FB100S21XV /pfam2pwd_out/IL5.pwd IL5_tree"
-
-
- Tree calculation based on precalculated pairwise distances
- and matching alignment
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Consensus tree will have branch length values.
- Precalculated pairwise distances and the matching (processed)
- alignment are the output of "pfam2pwd.pl", number of bootstraps
- needs to match the one used for the pwds, matrix needs to match
- the one used for the pwds.
-
- makeTree.pl <-options, includes "UF"> <pwdfile: boostrapped pairwise
- distances> <alnfile: corresponding alignment>
- <outputfile> [path/name for temporary directory to be created]
-
- Example:
- "% makeTree.pl -UFLB100S21XV /pfam2pwd_out/IL5.pwd /pfam2pwd_out/IL5.aln IL5_tree"
-
-
- Options
- -------
-
- N : Suggestion to remove columns in the alignment which contain gaps.
- Gaps are not removed, if, after removal of gaps, the resulting alignment would
- be shorter than $MIN_NUMBER_OF_AA. Default is not to remove gaps.
- Bx : Number of bootstrapps. B0: do not bootstrap. Default is 100 bootstrapps.
- The number of bootstrapps should be divisible by 10.
- U : Use TREE-PUZZLE to calculate ML branchlengths for consesus tree, in case of
- bootstrapped analysis.
- J : Use JTT matrix (Jones et al. 1992) in TREE-PUZZLE, default: PAM.
- L : Use BLOSUM 62 matrix (Henikoff-Henikoff 92) in TREE-PUZZLE, default: PAM.
- M : Use mtREV24 matrix (Adachi-Hasegawa 1996) inTREE-PUZZLE, default: PAM.
- W : Use WAG matrix (Whelan-Goldman 2000) in TREE-PUZZLE, default: PAM.
- T : Use VT matrix (Mueller-Vingron 2000) in TREE-PUZZLE, default: PAM.
- P : Let TREE-PUZZLE choose which matrix to use, default: PAM.
- E : Exact parameter estimates in TREE-PUZZLE, default: Approximate.
- Model of rate heterogeneity in TREE-PUZZLE (default: Uniform rate)
- g : 8 Gamma distributed rates
- t : Two rates (1 invariable + 1 variable)
- m : Mixed (1 invariable + 8 Gamma rates)
- R : Randomize input order in PHYLIP NEIGHBOR.
- A : Use PHYLIP PROTPARS instead of NEIGHBOR (and no pairwise distance calculation).
- jx : Number of jumbles when using PHYLIP PROTPARS (random seed set with Sx).
- Sx : Seed for random number generator(s). Must be 4n+1. Default is 9.
- X : To keep multiple tree file (=trees from bootstrap resampled alignments).
- D : To keep (and create in case of bootstrap analysis) pairwise distance matrix file.
- This is created form the not resampled (original) alignment.
- C : Calculate pairwise distances only (no tree). Bootstrap is always 1.
- No other files are generated.
- F : Pairwise distance (pwd) file as input (instead of alignment).
- No -D, -C, and -N options available in this case.
- V : Verbose.
- # : Only for rio.pl: Do not calculate consensus tree ("I" option in rio.pl).
-
-
-END
-
-} ## printUsage
+++ /dev/null
-#!/usr/bin/perl -W
-
-# mt.pl
-# -----
-#
-# Copyright (C) 2003 Christian M. Zmasek
-# All rights reserved
-#
-# Author: Christian M. Zmasek
-# zmasek@genetics.wustl.edu
-# http://www.genetics.wustl.edu/eddy/people/zmasek/
-#
-# Version: 1.000
-# Created on: 09/05/03
-# Last modified: 09/05/03
-#
-#
-#
-# Calculates trees based on all alignments/files in a given directory using
-# makeTree.pl.
-#
-#
-
-use strict;
-use FindBin;
-use lib $FindBin::Bin;
-use rio_module2;
-
-
-my $PREPROCESSING_COMMAND = "";
-my $PERFORM_PREPROCESSING = 0;
-
-my $POSTPROCESSING_COMMAND = "/nfs/dm3/homedir1/czmasek/RIO1.24/perl/extractSpecies.pl";
-my $PERFORM_POSTPROCESSING = 1;
-
-
-my $MY_TEMP_DIR = $TEMP_DIR_DEFAULT; # $TEMP_DIR_DEFAULT is inherited
- # from rio_module.pm
-
-
-
-
-my $options = ""; # Options for makeTree.pl, see makeTree.pl.
-
-
-my $suffix = "";
-my $use_suffixes = 0;
-my $input_dir = "";
-my $output_dir = "";
-
-my $i = 0;
-my $filename = "";
-my @filenames = ();
-
-
-
-
-
-
-# Analyzes the options:
-# ---------------------
-
-unless ( @ARGV == 3 || @ARGV == 4 ) {
- &printUsage();
-}
-
-$options = $ARGV[ 0 ];
-$input_dir = $ARGV[ 1 ];
-$output_dir = $ARGV[ 2 ];
-
-if ( @ARGV == 3 ) {
- $use_suffixes = 0;
-}
-elsif ( @ARGV == 4 ) {
- $use_suffixes = 1;
- $suffix = $ARGV[ 3 ];
-}
-
-
-$input_dir = &addSlashAtEndIfNotPresent( $input_dir );
-$output_dir = &addSlashAtEndIfNotPresent( $output_dir );
-$MY_TEMP_DIR = &addSlashAtEndIfNotPresent( $MY_TEMP_DIR );
-
-
-
-
-# This adds a "-" before the options for makeTree:
-# ------------------------------------------------
-unless ( $options =~ /^-/ ) {
- $options = "-".$options;
-}
-
-
-
-
-
-# This creates the temp file:
-# --------------------------
-
-my $time = time;
-my $ii = 0;
-
-my $temp_file = $MY_TEMP_DIR."mt".$time.$ii;
-
-while ( -e $temp_file ) {
- $ii++;
- $temp_file = $MY_TEMP_DIR."mt".$time.$ii;
-}
-
-
-
-opendir( DIR, $input_dir ) || error( "Cannot open directory \"$input_dir\": $!" );
-
-$i = 0;
-
-while( defined( $filename = readdir( DIR ) ) ) {
- if ( $filename =~ /^\.\.?$/ ) {
- next;
- }
- if ( $use_suffixes == 1 && $filename !~ /$suffix$/ ) {
- next;
- }
-
- $filenames[ $i ] = $filename;
- $i++;
-}
-
-close( DIR );
-
-$i = 0;
-
-FOREACH: foreach $filename ( @filenames ) {
-
- # If the corresponding tree seems to already exists, do next one.
- if ( -e "$output_dir$filename.nhx" ) {
- next FOREACH;
- }
-
- print "\n\n\n\n";
- print "MT.PL\n";
- print "working on: $filename\n";
-
- print "[tree calculation $i]\n";
- print "=====================================================================\n\n\n";
-
-
- unlink( "$output_dir$filename.aln",
- "$output_dir$filename.log",
- "$output_dir$filename.nbd" );
-
- print( "MT.PL: executing:\n" );
-
- my $inputfile = $input_dir.$filename;
-
- my $outputfilename = "";
-
- if ( $use_suffixes == 1 ) {
- $outputfilename = $output_dir . $filename;
- $outputfilename =~ s/$suffix$//;
- $outputfilename =~ s/\.$//;
- $outputfilename .= ".nhx";
- }
- else {
- $outputfilename = $output_dir . $filename . ".nhx";
- }
-
-
-
- if ( $PERFORM_PREPROCESSING == 1 ) {
- my $pre_command = "$PREPROCESSING_COMMAND";
-
- print( "$pre_command\n" );
- system( $pre_command ) && &error( "Could not execute \"$pre_command\"" );
- }
-
- $MAKETREE = "/nfs/dm3/homedir1/czmasek/RIO1.24/perl/makeTree2.pl"; # <<<<<<<<<<<<<<<<<<<<<<<-------------------~~~~~~~~~~~~~~~~~~~~~~~
-
- my $command = "$MAKETREE $options $inputfile $outputfilename";
-
- print( "$command\n" );
- system( $command ) && &error( "Could not execute \"$command\"" );
-
-
-
- if ( $PERFORM_POSTPROCESSING == 1 ) {
- my $post_command = "$POSTPROCESSING_COMMAND $outputfilename";
-
- print( "$post_command\n" );
- system( $post_command ) && &error( "Could not execute \"$post_command\"" );
- }
-
-
-
- $i++;
-
-}
-
-
-
-print( "\n\n\nMT.PL: Done!\n" );
-
-exit( 0 );
-
-
-
-
-
-
-sub error{
-
- my $text = $_[ 0 ];
-
- print( "\nxt.pl: ERROR:\n" );
- print( "$text\n\n" );
-
- exit( -1 );
-
-}
-
-
-
-
-sub printUsage {
- print "\n";
- print " mt.pl\n";
- print " _____\n";
- print " \n";
- print " Copyright (C) 2003 Christian M. Zmasek\n";
- print " All rights reserved\n";
- print "\n";
- print " Author: Christian M. Zmasek\n";
- print " zmasek\@genetics.wustl.edu\n";
- print " http://www.genetics.wustl.edu/eddy/forester/\n";
- print "\n";
- print "\n";
- print " Purpose\n";
- print " -------\n";
- print "\n";
- print " Tree construction using makeTree.pl on all alignments/files\n";
- print " in a given directory.\n";
- print "\n";
- print "\n";
- print " Usage\n";
- print " -----\n";
- print "\n";
- print " mt.pl <options for makeTree.pl> <input directory: aligments> <output\n";
- print " directory> [suffix for alignments to be used in input directory]\n";
- print "\n";
- print " If a suffix is given, it will be removed for the output files.\n";
- print "\n";
- print "\n";
- print " Example\n";
- print " -------\n";
- print "\n";
- print " \"mt.pl NS21UTRB100DX alignments/ trees/ .aln\"\n";
- print "\n";
- print "\n";
- print "\n";
- exit( -1 );
-
-}
+++ /dev/null
-#!/usr/bin/perl
-
-# multifetch.pl [options] <list of seqs>
-#
-# Fetch all the seqs on the list. The list is a file with one line
-# per sequence; the first field is the key.
-#
-# Options:
-# -d : domain fetch - list is in GDF format
-# -n <extra 5'> : include this many extra residues upstream (-d only)
-# -c <extra 3'> : include this many extra residues downstream (-d only)
-# -f : fetch in FASTA instead of native format
-# -g <file> : use getseq from <file>, not fetch from main databases.
-# This always gives FASTA output.
-# -D <database> : specify a source database, same usage as getseq:
-# -Dsw SwissProt
-# -Dpir PIR
-# -Dem EMBL
-# -Dgb GenBank
-# -Dwp WormPep
-# -Dowl OWL
-
-
-use FindBin;
-use lib $FindBin::Bin;
-use rio_module;
-require "getopts.pl";
-
-
-&Getopts('c:n:dfg:D:');
-if ($opt_c) { $extra_c = $opt_c; }
-if ($opt_n) { $extra_n = $opt_n; }
-if ($opt_d) { $domains = 1; }
-if ($opt_f) { $fmtarg = "-Ffasta";} else {$fmtarg = ""; }
-if ($opt_g) { $filearg = "-d$opt_g ";} else {$filearg = ""; }
-if ($opt_D) { $dbarg = "-D$opt_D "; } else {$dbarg = ""; }
-
-
-while (<>) {
- if ($domains) {
- if (($name, $from, $to, $source) = /^\s*(\S+)\s+(\d+)\s+(\d+)\s+(\S+)/){
- if ($from < $to) {
- $from -= $opt_n;
- $to += $opt_c;
- }
- else {
- $from += $opt_n;
- $to -= $opt_c;
- }
-
- system("$SFE $filearg $dbarg $fmtarg -r \"$name\" -f $from -t $to \"$source\"")
- && die "\n\n$0: Unexpected error: Could not execute \"$SFE $filearg $dbarg $fmtarg -r \"$name\" -f $from -t $to \"$source\"\": $!";
- }
- } else {
- if (/^\s*(\S+)/) {
- $key = $1;
-
- system("$SFE $filearg $dbarg $fmtarg \"$key\"")
- && die "\n\n$0: Unexpected error: Could not execute \"$SFE $filearg $dbarg $fmtarg \"$key\"\": $!";
- }
- }
-}
-
-# 01/30/02
-# CZ
-# Added usage of rio_module.pm, $SFE for sfetch.
-
-# Thu Apr 10 18:27:40 1997
-# - added -D option
-# - simplified from six different getseq calls to two
-
+++ /dev/null
-#! /usr/bin/perl -W
-
-# nph-riowebserver.pl
-# -------------------
-#
-# Copyright (C) 2002 Washington University School of Medicine
-# and Howard Hughes Medical Institute
-# All rights reserved
-#
-# Created: 02/18/02
-# Author: Christian M. Zmasek
-# zmasek@genetics.wustl.edu
-# http://www.genetics.wustl.edu/eddy/people/zmasek/
-#
-# Last modified: 02/20/02
-
-
-use strict;
-use CGI;
-use queue;
-
-
-my $RIOPL = "/home/rio/forester/perl/rio4P.pl";
-my $JAVA = "/home/rio/j2sdk1.4.0/bin/java";
-my $TEST_NHX = $JAVA." -cp /home/rio/forester/java forester.tools.testNHX";
-my $TEMPDIR = "/home/rio/rws_temp";
-my $SPECIESTREE = "/home/rio/forester/data/species/tree_of_life_bin_1-4.nhx";
-my $SPECIESLIST = "/home/rio/forester/data/species/tree_of_life_bin_1-4_species_list";
-my $hmm_search_url_A = "http://pfam.wustl.edu/cgi-bin/nph-hmmsearch?protseq=";
-my $hmm_search_url_B = "&search_mode=merge&cutoff_strategy=ga";
-
-my $RIO_ALN_DIRECTORY = "/data/rio/ALNs/";
-my $RIO_NBD_DIRECTORY = "/data/rio/NBDs/";
-my $ALIGN_FILE_SUFFIX = ".aln";
-my $ALIGN_NBD_FILE = ".nbd";
-my $DIR_FOR_TREES = "/var/www/html/trees/"; # Directory for NHX files to be read by ATV applet
-my $URL_FOR_TREES = "http://forester.wustl.edu/trees/"; # URL base for NHX files to be read by ATV applet
-my $CODE_BASE_FOR_ATV_APPLET = "http://forester.wustl.edu/applets/"; # URL for ATV applet (jar file)
-my $TARGET_FILES_IN_DIR_FOR_TREES = 100; # If the number of nhx files in $DIR_FOR_TREES is lager then $MAX_FILES_IN_DIR_FOR_TREES
-my $MAX_FILES_IN_DIR_FOR_TREES = 120; # the oldest files will be deleted until the number is down to $TARGET_FILES_IN_DIR_FOR_TREES.
-my $O_THRESHOLD_DEFAULT = 0;
-my $SN_THRESHOLD_DEFAULT = 0;
-my $U_THRESHOLD_DEFAULT = 50;
-my $SEED_FOR_RANDOM_DEFAULT = 41;
-my $SORT_DEFAULT = 12;
-my $MIN_SIZE = 5; # Minimal size (in chars) for input files
-my $MAX_SIZE = 10000; # Maximal size (in chars) for input files
-my $MAX_LINES = 1000; # Maximal lines for input files
-my $RIO_OPTIONS = "U=60 Y=2 X=2 Z=2 I C E x +";
-my $CONTACT = "zmasek\@genetics.wustl.edu";
-my $VERSION = "0.3";
-
-
-my $o_threshold = 0;
-my $sn_threshold = 0;
-my $u_threshold = 0;
-my $seed_for_random = 0;
-my $sort = 0;
-my $size_d = 0;
-my $size_c = 0;
-my $entry_time = 0;
-my $njobs = 0;
-my $njobs_thisuser = 0;
-my $user_defined_tree = 0;
-
-
-
-my $query = "";
-my $query_seq = "";
-my $query_seq_file = "";
-my $tree_file = "";
-my $pfam_domain = "";
-my $species = "";
-my $output_tree = "";
-my $output_up = "";
-my $remote_addr = "";
-my $oneline = "";
-my $aln = "";
-my $speciestree = "";
-my $output = "";
-my $query_sequence = ""; # To be submitted to hmmsearch website, if necessary.
-my $link_to_hmmsearch = "";
-
-my @lines = ();
-my %Species_names_hash = ();
-
-
-$| = 1;
-
-$query = new CGI;
-
-
-$query_seq = $query->param( 'query_seq' );
-$query_seq_file = $query->upload( 'query_seq_file' );
-$pfam_domain = $query->param( 'pfam_domain' );
-$species = $query->param( 'species' );
-$o_threshold = $query->param( 'o_threshold' );
-$sn_threshold = $query->param( 'sn_threshold' );
-$u_threshold = $query->param( 'u_threshold' );
-$seed_for_random = $query->param( 'seed_for_random' );
-$output_up = $query->param( 'output_up' );
-$sort = $query->param( 'sort_priority' );
-$tree_file = $query->upload( 'tree_file' );
-
-$remote_addr = $ENV{ REMOTE_ADDR };
-
-
-# NPH header
-# ----------
-print $query->header( -status=>"200 OK",
- -server=>"$ENV{ SERVER_SOFTWARE }",
- -nph=>1 );
-
-
-
-
-# Prints the first HTML
-# ---------------------
-print "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/1999/REC-html401-19991224/loose.dtd\">\n";
-print "<HTML>\n";
-print "<HEAD>\n";
-print "<TITLE>[ RIO SERVER | phylogenomic analysis of a protein sequence ]</TITLE>\n";
-print "<META HTTP-EQUIV = \"Content-Type\" CONTENT = \"text/html; charset=iso-8859-1\" >\n";
-print "<LINK REL = \"stylesheet\"\n";
-print " TYPE = \"text/css\"\n";
-print " HREF = \"http://forester.wustl.edu/style_rio_server2.css\">\n";
-
-&print_ATV_JavaScript();
-
-print "</HEAD>\n";
-print "<BODY>\n";
-
-&print_navbar();
-
-
-
-# Reads in, cleans up and checks
-# ------------------------------
-
-if ( ( !defined( $query_seq_file ) && !defined( $query_seq ) )
-|| ( $query_seq_file !~ /\w+/ && $query_seq !~ /\w+/ ) ) {
- &nph_user_error( "need to specify a sequence file or submit a sequence directly" );
-}
-
-if ( $query_seq_file =~ /\w+/ && $query_seq =~ /\w+/ ) {
- &nph_user_error( "cannot specify a sequence file and submit a sequence directly" );
-}
-
-
-if ( $query_seq_file =~ /\w+/ ) {
- # Reading in from file
- &readInFile( $query_seq_file );
-}
-else {
- # "cut and paste"
- @lines = split( /^/, $query_seq );
-}
-
-
-if ( $lines[ 0 ] =~ /^\s*>/ ) { # FASTA
- shift( @lines );
-}
-
-
-foreach $oneline ( @lines ) {
- $size_d += length( $oneline );
- if ( $size_d > $MAX_SIZE ) {
- &nph_user_error( "query sequence is too long (>$MAX_SIZE)" );
- }
- $oneline =~ s/[^A-Za-z]//g;
- $size_c += length( $oneline );
-}
-if ( $size_c < $MIN_SIZE ) {
- &nph_user_error( "query sequence is too short (<$MIN_SIZE)" );
-}
-
-
-# Writes a temp file for the query sequence
-open( PROT, ">$TEMPDIR/$$.query" ) || &nph_fatal_error( "failed to open temp query file" );
-foreach $oneline ( @lines ) {
- print PROT $oneline;
- $query_sequence .= $oneline;
-}
-close( PROT );
-
-if ( !defined( $species ) || $species !~ /\w+/ || length( $species ) < 2 ) {
- &nph_user_error( "need to specify the species of the query sequence" );
-}
-
-$link_to_hmmsearch = "<A HREF=\"$hmm_search_url_A".$query_sequence."$hmm_search_url_B\" TARGET=\"_blank\"> >> click here to perform hmmsearch on query sequence << </A>";
-
-if ( !defined( $pfam_domain ) || $pfam_domain !~ /\w+/ || length( $pfam_domain ) < 1 ) {
- &nph_user_error( "need to specify a name for a pfam domain of the query sequence<BR>$link_to_hmmsearch" );
-}
-
-if ( length( $species ) > 5 ) {
- &nph_user_error( "invalid species name" );
-}
-$species =~ s/[^A-Za-z0-9]//g;
-if ( length( $species ) < 2 ) {
- &nph_user_error( "invalid species name" );
-}
-
-if ( length( $pfam_domain ) > 40 ) {
- &nph_user_error( "invalid pfam domain name<BR>$link_to_hmmsearch" );
-}
-$pfam_domain =~ s/[\s,;\.><\|\\\/\(\)!@\#\$%&\*\^=]//g;
-if ( length( $pfam_domain ) < 1 ) {
- &nph_user_error( "invalid pfam domain name<BR>$link_to_hmmsearch" );
-}
-
-if ( defined( $tree_file ) && $tree_file =~ /\w+/ ) {
- $user_defined_tree = 1;
-}
-
-$species =~ tr/a-z/A-Z/;
-
-if ( $user_defined_tree != 1 ) {
- &checkForPresenceOfSpecies( $species );
-}
-
-$aln = $RIO_ALN_DIRECTORY.$pfam_domain.$ALIGN_FILE_SUFFIX;
-
-if ( &checkForTextFilePresence( $aln ) != 1 ) {
- &nph_user_error( "no pairwise distances precalculated for pfam domain \"$pfam_domain\"<BR>$link_to_hmmsearch" );
-}
-
-
-if ( checkForNumberBetween0and100( $o_threshold ) != 1 ) {
- $o_threshold = $O_THRESHOLD_DEFAULT;
-}
-if ( checkForNumberBetween0and100( $sn_threshold ) != 1 ) {
- $sn_threshold = $SN_THRESHOLD_DEFAULT;
-}
-if ( checkForNumberBetween0and100( $u_threshold ) != 1 ) {
- $u_threshold = $U_THRESHOLD_DEFAULT;
-}
-if ( !defined( $seed_for_random ) || $seed_for_random !~ /\d/
-|| $seed_for_random =~ /\D/ || $seed_for_random > 10000 || $seed_for_random < 0 ) {
- $seed_for_random = $SEED_FOR_RANDOM_DEFAULT;
-}
-if ( !defined( $sort )
-|| $sort > 16 || $sort < 12 ) {
- $sort = $SORT_DEFAULT;
-}
-
-if ( defined( $output_up ) && $output_up eq "yes" ) {
- $RIO_OPTIONS .= " p";
-}
-else {
- $u_threshold = -1;
-}
-
-
-
-
-
-
-
-
-# User defined species tree is dealt with here
-# --------------------------------------------
-
-if ( $user_defined_tree == 1 ) {
- &readInFile( $tree_file );
- $size_d = 0;
- $size_c = 0;
- foreach $oneline ( @lines ) {
- $size_d += length( $oneline );
- if ( $size_d > $MAX_SIZE ) {
- &nph_user_error( "user defined species tree file is too long (>$MAX_SIZE)" );
- }
- $oneline =~ s/;\|,<>\s//g;
- $oneline =~ tr/a-z/A-Z/;
-
- $size_c += length( $oneline );
- }
- if ( $size_c < $MIN_SIZE ) {
- &nph_user_error( "user defined species tree file is too short (<$MIN_SIZE)" );
- }
-
- open( TREE, ">$TEMPDIR/$$.tree" ) || &nph_fatal_error( "failed to open temp species tree file" );
- foreach $oneline ( @lines ) {
- print TREE $oneline;
- }
- close( TREE );
-
- $speciestree = "$TEMPDIR/$$.tree";
- system( "$TEST_NHX $speciestree" )
- && nph_user_error( "user defined species tree is not in proper NHX format (or is unrooted, or contains multifurcations) $!" );
-
-}
-else {
- $speciestree = $SPECIESTREE;
-}
-
-
-
-# Join the queue, using queue.pm API
-# ----------------------------------
-
-$entry_time = time;
-
-( $njobs, $njobs_thisuser ) = &queue::CheckQueue( "rioqueue", $remote_addr, $TEMPDIR );
-if ( $njobs > 5 ) {
- &nph_user_error("The server is currently swamped, with $njobs searches in the queue.<BR>Please come back later! Sorry.");
-}
-if ( $njobs_thisuser > 0 ) {
- &nph_user_error( "We already have $njobs_thisuser searches in the queue from
- your IP address ($remote_addr). Please wait until some or all of them
- finish.<BR>If you think you got this message in error, wait a minute or so and
- resubmit your job. You probably hit your browser's stop button after you
- started a search - but that doesn't stop our compute cluster, it just breaks
- your connection to us. You won't be able to start a new search until the
- cluster's done." );
-}
-if ( $njobs > 0 ) {
- print_waiting_message( $njobs );
-}
-&queue::WaitInQueue( "rioqueue", $remote_addr, $TEMPDIR, $$, 10 ); # wait with ten-second granularity.
-
-
-
-
-# Prints "waiting" header
-# -----------------------
-
-my $number_of_seqs = &getNumberOfSeqsFromNBDfile( $RIO_NBD_DIRECTORY.$pfam_domain.$ALIGN_NBD_FILE );
-my $estimated_time = &estimateTime( $number_of_seqs );
-
-print( "<H4 class = \"messages\"> RIO: Starting search. Estimated time: $estimated_time seconds per domain (assuming all rio nodes are running). Please wait...</H4>\n" );
-
-
-
-
-# Runs RIO
-# --------
-
-&run_rio( $pfam_domain, # domain
- "$TEMPDIR/$$.query", # query file name
- "$TEMPDIR/$$.outfile", # output file name
- "QUERY_$species", # name for query
- $speciestree, # species tree
- $RIO_OPTIONS, # more options
- "$TEMPDIR/$$", # temp file
- $o_threshold,
- $sn_threshold,
- $u_threshold,
- $seed_for_random,
- $sort );
-
-
-
-# Done
-# ----
-
-&showATVlinks();
-
-
-
-# Cleanup
-unlink( "$TEMPDIR/$$.query", "$TEMPDIR/$$.tree" );
-
-$output .= "<P>   </P>\n";
-
-# Leaves the queue in an orderly fashion.
-&queue::RemoveFromQueue( "rioqueue", $remote_addr, $TEMPDIR, $$ );
-
-print( $output );
-
-&print_footer();
-
-&removeFiles( $DIR_FOR_TREES, $TARGET_FILES_IN_DIR_FOR_TREES, $MAX_FILES_IN_DIR_FOR_TREES );
-
-exit( 0 );
-
-
-
-
-
-
-
-
-# Methods
-# -------
-
-
-
-# Last modified: 02/19/02
-sub run_rio {
-
- my $pfam_name = $_[ 0 ];
- my $query_file = $_[ 1 ];
- my $output_file = $_[ 2 ];
- my $name_for_query = $_[ 3 ];
- my $species_tree_file = $_[ 4 ];
- my $more_options = $_[ 5 ];
- my $tmp_file_rio = $_[ 6 ];
- my $t_o = $_[ 7 ];
- my $t_sn = $_[ 8 ];
- my $t_u = $_[ 9 ];
- my $seed = $_[ 10 ];
- my $sort = $_[ 11 ];
-
- my $start_time = time;
-
- my $options_for_rio = "";
-
- $options_for_rio .= ( " A=".$pfam_name );
- $options_for_rio .= ( " Q=".$query_file );
- $options_for_rio .= ( " O=".$output_file );
- $options_for_rio .= ( " N=".$name_for_query );
- $options_for_rio .= ( " S=".$species_tree_file );
- $options_for_rio .= ( " j=".$tmp_file_rio );
- $options_for_rio .= ( " L=".$t_o );
- $options_for_rio .= ( " B=".$t_sn );
- if ( $t_u != -1 ) {
- $options_for_rio .= ( " v=".$t_u );
- }
- $options_for_rio .= ( " y=".$seed );
- $options_for_rio .= ( " P=".$sort );
- $options_for_rio .= ( " ".$more_options );
-
- $output = `$RIOPL 1 $options_for_rio`;
-
- if ( $? != 0 ) {
- nph_rio_error();
- }
- my $finish_time = time;
- my $wait_time = $finish_time - $entry_time;
- my $cpu_time = $finish_time - $start_time;
-
-
-
- # Logs the results.
- my $date = `date`;
- chop( $date );
- open ( LOGFILE, ">>$TEMPDIR/log") || &nph_fatal_error( "could not open log file" );
- flock( LOGFILE, 2 );
- print LOGFILE "$date queue: $njobs wait: $wait_time true_cpu: $cpu_time pid: $$ addr: $ENV{'REMOTE_ADDR'} host: $ENV{'REMOTE_HOST'} pfam: $pfam_name\n";
- flock( LOGFILE, 8 );
- close ( LOGFILE );
-
- return;
-
-} ## run_rio
-
-
-
-
-# Reads a file into "@lines"
-# Last modified: 02/19/02
-sub readInFile {
- my $file = $_[ 0 ];
- my $l = 0;
- my $s = 0;
- @lines = ();
-
- $file =~ s/;\|,<>&\s//g;
-
- while( <$file> ) {
- $s += length( $_ );
- if ( $s > $MAX_SIZE ) {
- &nph_user_error( "query sequence is too long (>$MAX_SIZE)" );
- }
- $l++;
- if ( $l > $MAX_LINES ) {
- &nph_user_error( "file has too many lines (>$MAX_LINES)" );
- }
-
- push( @lines, $_ );
-
- }
-
-} ## readInFile
-
-
-
-
-# Reads in (SWISS-PROT) species names from a file.
-# Names must be separated by newlines.
-# Lines beginning with "#" are ignored.
-# A possible "=" and everything after is ignored.
-# One argument: species-names-file name
-# Last modified: 02/19/02
-sub readSpeciesNamesFile {
- my $infile = $_[ 0 ];
- my $return_line = "";
- my $species = "";
-
- open( IN_RSNF, "$infile" ) || &nph_fatal_error( "could not open species list" );
- while ( $return_line = <IN_RSNF> ) {
- if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)/ ) {
- $species = $1;
- $species =~ s/=.+//;
- $Species_names_hash{ $species } = "";
- }
- }
- close( IN_RSNF );
-
- return;
-} ## readSpeciesNamesFile
-
-
-
-# Last modified: 02/19/02
-sub checkForNumberBetween0and100 {
-
- my $x = $_[ 0 ];
-
- if ( !defined( $x ) || $x !~ /\d/ || $x =~ /\D/ || $x > 100 || $x < 0 ) {
- return 0;
- }
- else {
- return 1;
- }
-
-} ## checkForNumberBetween0and100
-
-
-
-# Last modified: 02/19/02
-sub getNumberOfSeqsFromNBDfile {
- my $infile = $_[ 0 ];
- my $return_line = "";
- my $number_of_seqs = 0;
-
- open( C, "$infile" ) || &nph_fatal_error( "could not open NBD file" );
- while ( $return_line = <C> ) {
- if ( $return_line =~ /^\s*(\d+)\s*$/ ) {
- $number_of_seqs = $1;
- last;
- }
- }
- close( C );
- return $number_of_seqs;
-
-} ## getNumberOfSeqsFromNBDfile
-
-
-
-# Last modified: 02/19/02
-sub print_waiting_message {
-
- my $njobs = $_[ 0 ];
-
- print( "<H4 class = \"messages\">\n" );
- print( "RIO: There are $njobs searches queued ahead of you on the RIO server. Please wait...\n" );
- print( "</H4>\n" );
-
- return;
-
-} ## print_waiting_message
-
-
-
-# Last modified: 02/19/02
-sub checkForPresenceOfSpecies {
-
- my $species = $_[ 0 ];
-
- &readSpeciesNamesFile( $SPECIESLIST );
- unless( exists( $Species_names_hash{ $species } ) ) {
- &nph_user_error( "species \"$species\" not present in currently used species tree" );
- }
-
- return;
-} ## checkForPresenceOfSepecies
-
-
-
-# Last modified: 02/19/02
-sub checkForTextFilePresence {
-
- my $file = $_[ 0 ];
-
- if ( ( -s $file ) && ( -f $file ) && ( -T $file ) ) {
- return 1;
- }
- else {
- return 0;
- }
-
-} ## checkForTextFilePresence
-
-
-
-
-
-# Last modified: 02/19/02
-sub print_footer {
-
- &print_navbar();
- &print_contact();
- print( "</BODY>\n" );
- print( "</HTML>\n" );
-
- return;
-
-} ## print_footer
-
-
-
-# Last modified: 02/19/02
-sub print_navbar {
-
- print( "<HR NOSHADE COLOR=\"#FF3300\">\n" );
- print( "<P class = \"nomargins\">\n" );
- print( "<B>RIO $VERSION</B> \n" );
- print( "<A HREF=\"http://www.rio.wustl.edu/\">phylogenomic analysis of a protein sequence</A> | " );
- print( "<A HREF=\"http://www.rio.wustl.edu/help.html\" TARGET=\"_blank\">help</A> | " );
- print( "<A HREF=\"http://www.genetics.wustl.edu/eddy/forester/\" TARGET=\"_blank\">forester/rio home</A> | " );
- print( "<A HREF=\"http://pfam.wustl.edu/\" TARGET=\"_blank\">pfam</A>\n" );
- print( "</P class = \"nomargins\">\n" );
- print( "<HR NOSHADE COLOR=\"#FF3300\">\n" );
-
- return;
-
-} ## print_navbar
-
-
-
-# Last modified: 02/19/02
-sub print_contact {
-
- print( "<P>comments, questions, flames? email <A HREF = \"mailto:$CONTACT\">$CONTACT</A></P>\n" );
-
- return;
-
-} ## print_contact
-
-
-
-# Last modified: 02/19/02
-sub showATVlinks {
-
- my $domain_no = 0;
-
- if ( -s "$TEMPDIR/$$.outfile.rio.nhx" ) {
- $domain_no = 1;
- system( "mv", "$TEMPDIR/$$.outfile.rio.nhx", $DIR_FOR_TREES )
- && &nph_fatal_error( "could not mv $TEMPDIR/$$.outfile.rio.nhx" );
- }
- elsif ( -s "$TEMPDIR/$$.outfile.rio-1.nhx" ) {
- $domain_no = 1;
- while ( -s "$TEMPDIR/$$.outfile.rio-$domain_no.nhx" ) {
- system( "mv", "$TEMPDIR/$$.outfile.rio-$domain_no.nhx", $DIR_FOR_TREES )
- && &nph_fatal_error( "could not mv $TEMPDIR/$$.outfile.rio-$domain_no.nhx.nhx" );
- $domain_no++;
- }
-
- }
-
-
- if ( $domain_no == 1 ) {
- $output .= "<P> </P>\n";
- $output .= "<TABLE BORDER=\"0\" CELLPADDING=\"1\"\n";
- $output .= "<TR><TD><FORM>\n";
- $output .= "<INPUT TYPE=BUTTON VALUE=\"view tree\" onClick=\"openWin( '$URL_FOR_TREES$$.outfile.rio.nhx' )\">\n";
- $output .= "</FORM></TD><TD>\n";
- $output .= "<A HREF=\"$URL_FOR_TREES$$.outfile.rio.nhx\" TARGET=\"_blank\">download NHX file describing this tree</A></TD></TR>\n";
- $output .= "</TABLE>\n";
- }
- elsif ( $domain_no > 1 ) {
- $output .= "<P> </P>\n";
- $output .= "<TABLE BORDER=\"0\" CELLPADDING=\"1\"\n";
- for ( my $x = 1; $x < $domain_no; $x++ ) {
- $output .= "<TR><TD><FORM>\n";
- $output .= "<INPUT TYPE=BUTTON VALUE=\"view tree for domain #$x\" onClick=\"openWin( '$URL_FOR_TREES$$.outfile.rio-$x.nhx' )\">\n";
- $output .= "</FORM></TD><TD>\n";
- $output .= "<A HREF=\"$URL_FOR_TREES$$.outfile.rio-$x.nhx\" TARGET=\"_blank\">download NHX file for domain #$x</A></TD></TR>\n";
- }
- $output .= "</TABLE>\n";
- }
-
- return;
-
-} ## showATVlinks
-
-
-# Removes output tree (NHX) files if more than $_[ 2 ] in $_[ 0 ]
-# Removes until $_[ 1 ] are left
-# Last modified: 02/19/02
-sub removeFiles {
-
- my $dir = $_[ 0 ];
- my $target = $_[ 1 ];
- my $max = $_[ 2 ];
-
- my $files = &countFilesInDir( $dir );
-
- if ( $files > $max ) {
-
- my $diff = $files - $target;
-
- for ( my $i = 0; $i < $diff; $i++ ) {
- &removeOldestFile( $dir );
- }
- }
-
- return;
-} ## removeFiles
-
-
-
-# Last modified: 02/19/02
-sub countFilesInDir {
-
- my $dir = $_[ 0 ];
- my $file = "";
- my $c = 0;
-
- opendir( DIR, $dir ) || &nph_fatal_error( "could not open dir $dir" );
- while( defined ( $file = readdir( DIR ) ) ) {
- unless ( $file =~ /\d/ ) {
- next;
- }
- $c++;
- }
- closedir( DIR );
-
- return( $c );
-
-} ## countFilesInDir
-
-
-
-# Last modified: 02/19/02
-sub removeOldestFile {
- my $dir = $_[ 0 ];
- my $file = "";
- my $oldest = "";
- my $smallest_time = 0;
- my $time = 0;
- my $first = 1;
-
- opendir( DIR, $dir ) || &nph_fatal_error( "could not open dir $dir" );
- while( defined ( $file = readdir( DIR ) ) ) {
- unless ( $file =~ /\d/ ) {
- next;
- }
- $file =~ /(\d+)/;
- $time = $1;
- if ( $first == 1 ) {
- $first = 0;
- $smallest_time = $time;
- $oldest = $file
- }
- elsif ( $time < $smallest_time ) {
- $smallest_time = $time;
- $oldest = $file;
- }
- }
- closedir( DIR );
-
- unlink( $dir.$oldest ) || &nph_fatal_error( "could not delete $oldest" );
-
- return;
-
-} ## removeOldestFile
-
-
-
-# Last modified: 02/19/02
-sub print_ATV_JavaScript {
-
-print <<END;
-
-<SCRIPT language="JavaScript">
-<!-- hide
-function openWin( u ) {
- atv_window = open("", "atv_window", "width=300,height=150,status=no,toolbar=no,menubar=no,resizable=yes");
-
- // open document for further output
- atv_window.document.open();
-
- // create document
- atv_window.document.writeln( "<HTML><HEAD><TITLE>ATV</TITLE></HEAD>" );
- atv_window.document.writeln( "<BODY TEXT =\\"#FF3300\\" BGCOLOR =\\"#000000\\">" );
- atv_window.document.writeln( "<FONT FACE = \\"VERDANA, HELVETICA, ARIAL\\">" );
- atv_window.document.writeln( "<CENTER><B>" );
- atv_window.document.writeln( "Please do not close this window<BR>as long as you want to use ATV" );
- atv_window.document.writeln( "<APPLET CODEBASE = \\"$CODE_BASE_FOR_ATV_APPLET\\" ARCHIVE = \\"ATVapplet.jar\\"" );
- atv_window.document.writeln( " CODE = \\"forester.atv_awt.ATVapplet.class\\"" );
- atv_window.document.writeln( " WIDTH = 200 HEIGHT = 50>" );
- atv_window.document.writeln( "<PARAM NAME = url_of_tree_to_load" );
- atv_window.document.writeln( " VALUE = " + u + ">" );
- atv_window.document.writeln( "</APPLET>" );
- atv_window.document.writeln( "</BODY></HTML>" );
-
-
- // close the document - (not the window! flushes buffer)
- atv_window.document.close();
-}
-// -->
-</SCRIPT>
-
-END
-
- return;
-
-} ## print_ATV_JavaScript
-
-
-
-# Last modified: 02/19/02
-sub estimateTime {
- my $number_of_seqs = $_[ 0 ];
- my $estimated_time = 0;
- if ( $number_of_seqs <= 50 ) {
- $estimated_time = 15;
- }
- elsif ( $number_of_seqs <= 100 ) {
- $estimated_time = 20;
- }
- elsif ( $number_of_seqs <= 150 ) {
- $estimated_time = 30;
- }
- elsif ( $number_of_seqs <= 200 ) {
- $estimated_time = 35;
- }
- elsif ( $number_of_seqs <= 250 ) {
- $estimated_time = 40;
- }
- elsif ( $number_of_seqs <= 300 ) {
- $estimated_time = 60;
- }
- elsif ( $number_of_seqs <= 400 ) {
- $estimated_time = 100;
- }
- elsif ( $number_of_seqs <= 500 ) {
- $estimated_time = 160;
- }
- elsif ( $number_of_seqs <= 600 ) {
- $estimated_time = 390;
- }
- elsif ( $number_of_seqs <= 700 ) {
- $estimated_time = 530;
- }
- elsif ( $number_of_seqs <= 800 ) {
- $estimated_time = 750;
- }
- elsif ( $number_of_seqs <= 900 ) {
- $estimated_time = 850;
- }
- else {
- $estimated_time = $number_of_seqs;
- }
- return $estimated_time;
-} ## estimateTime
-
-
-
-# Last modified: 02/19/02
-sub nph_rio_error {
-
- my $mesg = $_[ 0 ];
-
- &queue::RemoveFromQueue( "rioqueue", $remote_addr, $TEMPDIR, $$ );
-
- unlink( "$TEMPDIR/$$.query", "$TEMPDIR/$$.tree" );
-
-
-
- if ( $user_defined_tree == 1 ) {
- print( "<H4 class=\"error\">RIO error</H4>\n" );
- print( "<P><B>[the RIO analysis appearently died]</B></P>\n" );
- print( "<P><B>the most likely source of this error is an invalid user defined species tree</B></P>\n" );
- }
- else {
- print( "<H4 class=\"error\">RIO server fatal error</H4>\n" );
- print( "<P>[the RIO analysis appearently died for unknown reasons]</P>\n" );
- print( "<P><B>This type of error should not happen</B></P>\n" );
- print( "<P>\n" );
- print( "We may have logged it automatically, but we would appreciate it if you would also notify us at\n" );
- print( "<A HREF = \"mailto:$CONTACT\">$CONTACT</A>\n" );
- print( "</P>\n" );
- }
- print( "<P>  </P>\n" );
-
- &print_footer();
- system( "rm -r $TEMPDIR/$$"."0" );
- die;
-
-} ## nph_fatal_error
-
-
-
-# Last modified: 02/19/02
-sub nph_fatal_error {
-
- my $mesg = $_[ 0 ];
-
- &queue::RemoveFromQueue( "rioqueue", $remote_addr, $TEMPDIR, $$ );
-
- unlink( "$TEMPDIR/$$.query", "$TEMPDIR/$$.tree" );
-
- print( "<H4 class=\"error\">RIO server fatal error</H4>\n" );
- print( "<P>[$mesg : $!]</P>\n" );
- print( "<P><B>This type of error should not happen</B></P>\n" );
- print( "<P>\n" );
- print( "We may have logged it automatically, but we would appreciate it if you would also notify us at\n" );
- print( "<A HREF = \"mailto:$CONTACT\">$CONTACT</A>\n" );
- print( "</P>\n" );
- print( "<P>  </P>\n" );
-
-
- &print_footer();
- die;
-
-} ## nph_fatal_error
-
-
-
-# Last modified: 02/19/02
-sub nph_user_error {
-
- my $mesg = $_[ 0 ];
-
- &queue::RemoveFromQueue( "rioqueue", $remote_addr, $TEMPDIR, $$ );
-
- unlink( "$TEMPDIR/$$.query", "$TEMPDIR/$$.tree" );
-
- print( "<H4 class=\"error\">user error</H4>\n" );
- print( "<P>\n" );
- print( "<B>$mesg</B>\n" );
- print( "</P>\n" );
- print( "<P>  </P>\n" );
-
-
- &print_footer();
-
- die "nph-riowebserver handled: $mesg";
-
-} ## nph_user_error
-
-
-
-
+++ /dev/null
-#! /usr/bin/perl
-
-# Usage: p7extract.pl <hmmsearch output file>
-#
-# Converts hmmsearch output to GLF or GDF. GLF is the default.
-# Order is sorted by bit score
-#
-# Options:
-# -C : extract Pfam coverage statistics (NAR paper)
-# -d : extract domains in GDF format
-# -t <x> : report only hits better than evalue of <x>
-# -s : include scores in output
-# -e : include evalues in output
-# -l : include negative log evalues in output for easy sorting
-#
-# Note: p7extract.pl -sel gives the extended GLF format expected by
-# the PROFMARK benchmark scripts
-
-require "getopts.pl";
-
-$ethresh = 0;
-
-&Getopts('Cdt:sel');
-if ($opt_C) { $coverage_mode = 1; $gdfmode = 1;}
-if ($opt_d) { $gdfmode = 1; }
-if ($opt_t) { $ethresh = $opt_t; }
-if ($opt_s) { $do_scores = 1; }
-if ($opt_e) { $do_eval = 1; }
-if ($opt_l) { $do_log = 1; }
-
-$status = 1; # -C only: assume we will fail, 'til proven otherwise
-
-while (<>)
-{
- if (/^Query HMM:\s+(\S+)/) {$hmmname = $1;}
- if (/^Scores for complete sequences/) {$indom = 0; $inseq = 1;}
- if (/^Parsed for domains/) {$indom = 1; $inseq = 0;}
- if (/^Histogram of all scores/) {$indom = 0; $inseq = 0;}
- if (/^Total sequences searched/) {$status = 0;} # looks like we've seen output
-
- if ( $inseq &&
- (($id, $sc, $ev, $nd) = /(\S+).+\s(\S+)\s+(\S+)\s+(\d+)\s*$/))
- {
- if (($ethresh == 0 || $ev < $ethresh) && $show_key{$id} == 0)
- {
- if (! $gdfmode) {
- $show_key{$id} = 1; # remember this name
- $show_sc{$id} = $sc;
- $show_ev{$id} = $ev;
- }
- $numseqs++;
- }
- }
-
- if ($gdfmode && $indom &&
- (($id, $sqfrom, $sqto, $sc, $ev) =
- /(\S+)\s+\S+\s+(\d+)\s+(\d+).+\s(\S+)\s+(\S+)\s*$/))
- {
- if (($ethresh == 0 || $ev < $ethresh) && $show_key{$id} == 0)
- {
- $key = "$id/$sqfrom-$sqto";
- $show_key{$key} = 1;
- $show_id{$key} = $id;
- $show_sqfrom{$key} = $sqfrom;
- $show_sqto{$key} = $sqto;
- $show_sc{$key} = $sc;
- $show_ev{$key} = $ev;
-
- $numdomains++;
-
- $domsize = $sqto - $sqfrom + 1;
- if ($domsize < 0) { $domsize *= -1; }
- $numresidues += $domsize;
- }
- }
-
-}
-
-if ($coverage_mode)
-{
- if ($status == 0) {
- printf "%-20s %6d %6d %6d\n", $hmmname, $numseqs, $numdomains, $numresidues;
- exit 0;
- } else {
- printf "%-20s [FAILED]\n", $hmmname;
- exit 1;
- }
-
-}
-
-
-
-foreach $key (sort byscore keys(%show_key))
-{
- if ($gdfmode)
- {
- printf("%-24s\t%6d\t%6d\t%15s",
- $key, $show_sqfrom{$key}, $show_sqto{$key}, $show_id{$key})
- } else {
- printf("%-24s", $key);
- }
- # Optional extensions to GDF/GLF
- if ($do_scores) { printf("\t%8s", $show_sc{$key}); }
- if ($do_eval) { printf("\t%12s", $show_ev{$key}); }
- if ($do_log) { printf("\t%12.1f", -log($show_ev{$key})); }
- print "\n";
-}
-
-sub byscore {
- $show_sc{$b} <=> $show_sc{$a};
-}
-
-sub byevalue {
- $show_ev{$a} <=> $show_ev{$b};
-}
-
+++ /dev/null
-#!/usr/bin/perl -W
-
-# pfam2pwd.pl
-# -----------
-# Copyright (C) 1999-2002 Washington University School of Medicine
-# and Howard Hughes Medical Institute
-# All rights reserved
-#
-# Author: Christian M. Zmasek
-# zmasek@genetics.wustl.edu
-# http://www.genetics.wustl.edu/eddy/people/zmasek/
-#
-# Created: 05/17/01
-#
-# Last modified 02/20/03
-#
-#
-# See RIO_INSTALL on how to use this program.
-# ------------------------------------------
-#
-
-use strict;
-
-use FindBin;
-use lib $FindBin::Bin;
-use rio_module;
-
-my $VERSION = "3.002";
-
-
-
-# =============================================================================
-# =============================================================================
-#
-# THESE VARIABLES NEED TO BE SET BY THE USER
-# ------------------------------------------
-#
-
-
-# Pfam alignments to calculate pairwise distances from:
-# -----------------------------------------------------
-my $MY_PFAM_FULL_DIRECTORY = "/path/to/Pfam/Full/"; # must end with "/"
-
-
-
-# This file lists all the alignments for which to calculate pairwise distances
-# from. If left empty, ALL the alignments in $MY_PFAM_FULL_DIRECTORY
-# will be used:
-# ----------------------------------------------------------------------------
-my $ALGNS_TO_USE_LIST_FILE = "";
-
-
-
-# This is _VERY IMPORTANT_. It determines the species whose sequences
-# are being used (sequences from species not listed in $MY_SPECIES_NAMES_FILE
-# are ignored). Normally, one would use the same list as RIO uses
-# ($SPECIES_NAMES_FILE in "rio_module.pm") -- currently "tree_of_life_bin_1-6.nhx".
-#
-# For certain large families (such as protein kinases), one might use a
-# species file which contains less species in order to be able to finish
-# the calculations in reasonable time.
-# For example, to exclude most mammals, use:
-# my $MY_SPECIES_NAMES_FILE = $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-6_species_list_NO_RAT_MONKEYS_APES_SHEEP_GOAT_HAMSTER"
-# (to only use sequences from SWISS-PROT add this line:
-# $TREMBL_ACDEOS_FILE = $PATH_TO_FORESTER."data/NO_TREMBL";)
-# ----------------------------------------------------------------------------
-my $MY_SPECIES_NAMES_FILE = $SPECIES_NAMES_FILE;
-
-
-
-# This is were the output goes (must end with "/")
-# ------------------------------------------------
-my $MY_RIO_PWD_DIRECTORY = "/path/to/pfam2pwd_out/pwd/";
-my $MY_RIO_BSP_DIRECTORY = "/path/to/pfam2pwd_out/bsp/";
-my $MY_RIO_NBD_DIRECTORY = "/path/to/pfam2pwd_out/nbd/";
-my $MY_RIO_ALN_DIRECTORY = "/path/to/pfam2pwd_out/aln/";
-my $MY_RIO_HMM_DIRECTORY = "/path/to/pfam2pwd_out/hmm/";
-
-
-
-# A directory to create temporary files in:
-# -----------------------------------------
-my $MY_TEMP_DIR = "/tmp/"; # must end with "/"
-
-
-
-# Alignments in which the number of sequences after pruning (determined
-# by "$MY_SPECIES_NAMES_FILE") is lower than this, are ignored
-# (no calculation of pwds):
-# ------------------------------------------------------------------
-my $MIN_SEQS = 5;
-
-
-
-# Alignments in which the number of sequences after pruning (determined
-# by "$MY_SPECIES_NAMES_FILE") is greater than this, are ignored
-# (no calculation of pwds):
-# ------------------------------------------------------------------
-my $MAX_SEQS = 700;
-
-
-
-# Seed for the random number generator for bootstrapping (must be 4n+1):
-# ---------------------------------------------------------------------
-my $MY_SEED = 85;
-
-
-
-# This is used to choose the model to be used for the (ML)
-# distance calculation:
-# IMPORTANT: "$MY_MATRIX_FOR_PWD" in "rio_module.pm" needs to
-# have the same value, when the pwds calculated are going to
-# be used for RIO!
-# 0 = JTT
-# 2 = BLOSUM 62
-# 3 = mtREV24
-# 5 = VT
-# 6 = WAG
-# PAM otherwise
-# --------------------------------------------------------
-my $MY_MATRIX = 2;
-
-
-
-#
-# End of variables which need to be set by the user.
-#
-# =============================================================================
-# =============================================================================
-
-
-
-
-
-
-
-
-my $too_small = 0;
-my $too_large = 0;
-my $i = 0;
-my $seqs = 0;
-my $filename = "";
-my $tmp_dir = "";
-my $current_dir = "";
-my $return_line = "";
-my @filenames = ();
-my @too_small_names = ();
-my @too_large_names = ();
-my %Species_names_hash = ();
-my %AC_OS = (); # AC -> species name
-my %AC_DE = (); # AC -> description
-my %ALGNS_TO_USE = (); # name of alignment -> ""
-my $use_algns_to_use_list = 0;
-my $LOGFILE = "00_pfam2pwd_LOGFILE";
- $HMMBUILD = $HMMBUILD." --amino";
-
-
-&createTempdir();
-
-
-&startLogfile();
-
-
-opendir( DIR, $MY_PFAM_FULL_DIRECTORY ) || die "\n\n$0: Cannot open directory $MY_PFAM_FULL_DIRECTORY: $!\n\n";
-$i = 0;
-while( defined( $filename = readdir( DIR ) ) ) {
- if ( $filename =~ /^\.\.?$/ ) {
- next;
- }
- $filenames[ $i ] = $filename;
- $i++;
-}
-close( DIR );
-
-
-&readSpeciesNamesFile( $MY_SPECIES_NAMES_FILE );
-
-&readTrEMBL_ACDEOS_FILE();
-
-if ( defined( $ALGNS_TO_USE_LIST_FILE ) && $ALGNS_TO_USE_LIST_FILE =~ /\w/ ) {
- $use_algns_to_use_list = 1;
- &readListFile();
-}
-
-
-$current_dir = `pwd`;
-$current_dir =~ s/\s//;
-chdir ( $tmp_dir )
-|| die "\n\n$0: Unexpected error: Could not chdir to <<$tmp_dir>>: $!";
-
-$i = 0;
-
-FOREACH_ALIGN: foreach $filename ( @filenames ) {
-
- # If the corresponding pwd, positions, and aln files seem to already exists, do next one.
- if ( ( -e $MY_RIO_PWD_DIRECTORY.$filename.$SUFFIX_PWD )
- && ( -e $MY_RIO_BSP_DIRECTORY.$filename.$SUFFIX_BOOT_STRP_POS )
- && ( -e $MY_RIO_NBD_DIRECTORY.$filename.$SUFFIX_PWD_NOT_BOOTS )
- && ( -e $MY_RIO_ALN_DIRECTORY.$filename.$ALIGN_FILE_SUFFIX )
- && ( -e $MY_RIO_HMM_DIRECTORY.$filename.$SUFFIX_HMM ) ) {
- next FOREACH_ALIGN;
- }
-
- if ( $use_algns_to_use_list == 1 && !exists( $ALGNS_TO_USE{ $filename } ) ) {
- next FOREACH_ALIGN;
- }
-
-
- $seqs = &removeSeqsFromPfamAlign( $MY_PFAM_FULL_DIRECTORY.$filename,
- "REM_SEQ_OUTFILE",
- 1 );
- if ( $seqs < $MIN_SEQS ) {
- unlink( "REM_SEQ_OUTFILE" );
- $too_small_names[ $too_small++ ] = $filename;
- next FOREACH_ALIGN;
- }
- elsif ( $seqs > $MAX_SEQS ) {
- unlink( "REM_SEQ_OUTFILE" );
- $too_large_names [ $too_large++ ] = $filename;
- next FOREACH_ALIGN;
- }
-
-
- print "\n\n\n";
- print " ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
- print " $i: $filename ($seqs seqs)\n";
- print " ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
- print "\n";
-
- # If one of the two file exists from a previous (interrupted) run.
- unlink( $MY_RIO_PWD_DIRECTORY.$filename.$SUFFIX_PWD );
- unlink( $MY_RIO_BSP_DIRECTORY.$filename.$SUFFIX_BOOT_STRP_POS );
- unlink( $MY_RIO_NBD_DIRECTORY.$filename.$SUFFIX_PWD_NOT_BOOTS );
- unlink( $MY_RIO_ALN_DIRECTORY.$filename.$ALIGN_FILE_SUFFIX );
- unlink( $MY_RIO_HMM_DIRECTORY.$filename.$SUFFIX_HMM );
-
-
- &executeHmmbuild( "REM_SEQ_OUTFILE",
- $MY_RIO_ALN_DIRECTORY.$filename.$ALIGN_FILE_SUFFIX,
- "hmm" );
-
- if ( unlink( "hmm" ) != 1 ) {
- die "\n\n$0: Unexpected error: Could not delete <<hmm>>: $!";
- }
-
- if ( unlink( "REM_SEQ_OUTFILE" ) != 1 ) {
- die "\n\n$0: Unexpected error: Could not delete <<REM_SEQ_OUTFILE>>: $!";
- }
-
- executeHmmbuildHand( $MY_RIO_ALN_DIRECTORY.$filename.$ALIGN_FILE_SUFFIX,
- $MY_RIO_HMM_DIRECTORY.$filename.$SUFFIX_HMM );
-
- system( $HMMCALIBRATE, $MY_RIO_HMM_DIRECTORY.$filename.$SUFFIX_HMM )
- && die "\n\n$0: Could not execute \"$HMMCALIBRATE $MY_RIO_HMM_DIRECTORY.$filename.$SUFFIX_HMM\": $!";
-
- &pfam2phylipMatchOnly( $MY_RIO_ALN_DIRECTORY.$filename.$ALIGN_FILE_SUFFIX, "infile" );
-
- &executePuzzle( "infile", $MY_MATRIX );
-
- system( "mv", "infile.dist", $MY_RIO_NBD_DIRECTORY.$filename.$SUFFIX_PWD_NOT_BOOTS )
- && die "\n\n$0: Unexpected error: $!";
-
- &executeBootstrap( "infile",
- $BOOTSTRAPS,
- "BOOTSTRAPPED_ALGN",
- $MY_RIO_BSP_DIRECTORY.$filename.$SUFFIX_BOOT_STRP_POS,
- $MY_SEED );
-
- if ( unlink( "infile" ) != 1 ) {
- die "\n\n$0: Unexpected error: Could not delete <<infile>>: $!";
- }
-
-
- &executePuzzleBootstrapped( "BOOTSTRAPPED_ALGN", $MY_MATRIX );
-
- ##if ( unlink( "outfile" ) != 1 ) {
- ## die "\n\n$0: Unexpected error: Could not delete <<outfile>>: $!";
- ##}
-
-
- system( "mv", "BOOTSTRAPPED_ALGN".".dist", $MY_RIO_PWD_DIRECTORY.$filename.$SUFFIX_PWD )
- && die "\n\n$0: Unexpected error: $!\n\n";
-
- if ( unlink( "BOOTSTRAPPED_ALGN" ) != 1 ) {
- die "\n\n$0: Unexpected error: Could not delete <<BOOTSTRAPPED_ALGN>>: $!";
- }
-
- $i++;
-
-} ## End of FOREACH_ALIGN loop.
-
-
-chdir( $current_dir )
-|| die "\n\n$0: Unexpected error: Could not chdir to <<$current_dir>>: $!";
-
-rmdir( $tmp_dir );
-
-&finishLogfile();
-
-print "\n\n\n";
-print( "pfam2pwd.pl: Done.\n" );
-print( "Successfully calculated $i pairwise distance files.\n" );
-print( "Too large alignments (>$MAX_SEQS): $too_large\n" );
-print( "Too small alignments (<$MIN_SEQS): $too_small\n" );
-print( "See the logfile \"$MY_RIO_PWD_DIRECTORY".$LOGFILE."\"\n" );
-print "\n\n\n";
-
-exit( 0 );
-
-
-
-
-
-
-# Methods
-# -------
-
-
-
-# Three arguments:
-# 1. Stockholm alignment
-# 2. Outalignment
-# 3. Outhmm
-# Returns the options used.
-# Last modified: 06/26/01
-sub executeHmmbuild {
-
- my $full = $_[ 0 ];
- my $outalignment = $_[ 1 ];
- my $outhmm = $_[ 2 ];
- my $options = "";
-
- unless ( ( -s $full ) && ( -f $full ) && ( -T $full ) ) {
- die "\n\n$0: \"$full\" does not exist, is empty, or is not a plain textfile.\n\n";
- }
-
- $options = getHmmbuildOptionsFromPfam( $full );
-
- $options =~ s/-f//;
- $options =~ s/-g//;
- $options =~ s/-s//;
- $options =~ s/-F//;
- $options =~ s/-A//;
- $options =~ s/-o\s+\S+//;
- $options =~ s/(\s|^)[^-]\S+/ /g;
-
- if ( $options =~ /--prior/ ) {
- my $basename = basename( $full );
- $basename .= ".PRIOR";
- $options =~ s/--prior/--prior $PRIOR_FILE_DIR$basename/;
- }
-
- # Remove for versions of HMMER lower than 2.2.
- if ( $options =~ /--informat\s+\S+/ ) {
- $options =~ s/--informat\s+\S+/-/;
- }
-
- system( "$HMMBUILD $options -o $outalignment $outhmm $full" )
- && die "\n\n$0: Could not execute \"$HMMBUILD $options -o $outalignment $outhmm $full\".\n\n";
-
- return $options;
-
-} ## executeHmmbuild.
-
-
-# Two arguments:
-# 1. Stockholm alignment
-# 2. Outhmm
-# Returns the options used.
-# Last modified: 06/26/01
-sub executeHmmbuildHand {
-
- my $full = $_[ 0 ];
- my $outhmm = $_[ 1 ];
- my $options = "";
-
- unless ( ( -s $full ) && ( -f $full ) && ( -T $full ) ) {
- die "\n\n$0: \"$full\" does not exist, is empty, or is not a plain textfile.\n\n";
- }
-
- $options = getHmmbuildOptionsFromPfam( $full );
-
- $options =~ s/-f//;
- $options =~ s/-g//;
- $options =~ s/-s//;
- $options =~ s/-F//;
- $options =~ s/-A//;
- $options =~ s/-o\s+\S+//;
- $options =~ s/(\s|^)[^-]\S+/ /g;
-
- if ( $options =~ /--prior/ ) {
- my $basename = basename( $full );
- $basename .= ".PRIOR";
- $options =~ s/--prior/--prior $PRIOR_FILE_DIR$basename/;
- }
-
- # Remove for versions of HMMER lower than 2.2.
- if ( $options =~ /--informat\s+\S+/ ) {
- $options =~ s/--informat\s+\S+/-/;
- }
-
- system( "$HMMBUILD --hand $options $outhmm $full" )
- && die "\n\n$0: Could not execute \"$HMMBUILD -- hand $options $outhmm $full\".\n\n";
-
- return $options;
-
-} ## executeHmmbuildHand.
-
-
-
-# One argument:
-# Pfam align name.
-# Last modified: 02/26/01
-sub getHmmbuildOptionsFromPfam {
-
- my $infile = $_[ 0 ];
- my $return_line = "";
- my $result = "";
-
- unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
- die "\n\n$0: \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n";
- }
-
- open( GHO, $infile ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!";
- while ( $return_line = <GHO> ) {
- if ( $return_line =~ /^\s*#.*hmmbuild\s+(.+)\s*$/ ) {
- $result = $1;
- close( GHO );
- return $result;
- }
- }
- close( GHO );
- return $result;
-
-} ## getHmmbuildOptionsFromPfam
-
-
-
-# Similar to the method with the same name in "rio.pl".
-# Removes sequences from a Pfam flat file.
-# Adds species to TrEMBL seqs.
-# It can remove all sequences not from species listed in a species names file.
-# It can remove all sequences which do not have a SWISS-PROT name (XXXX_XXXXX)
-# Three arguments:
-# 1. Pfam flat file name
-# 2. outfile name
-# 3. 1 to remove TrEMBL seqs with "(FRAGMENT)" in their DE line.
-# Returns the number of sequences in the resulting alignment.
-# If a query name is given, it returns -1 if query is not found in alignment,
-# -10 if the name is not unique.
-# Last modified: 05/24/02
-sub removeSeqsFromPfamAlign {
- my $infile = $_[ 0 ];
- my $outfile = $_[ 1 ];
- my $remove_frags = $_[ 2 ];
- my $return_line = "";
- my $saw_sequence_line = 0;
- my $number_of_seqs = 0;
- my $DE = "";
- my $OS = "";
- my $AC = "";
- my $i = 0;
- my $length = 0;
- my $seq_name = "";
- my $seq = "";
-
-
- open( OUT_RNSP, ">$outfile" ) || die "\n\n$0: Unexpected error: Cannot create file \"$outfile\": $!";
- open( IN_RNSP, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!";
- while ( $return_line = <IN_RNSP> ) {
-
- if ( $saw_sequence_line == 1
- && !&containsPfamNamedSequence( $return_line )
- && !&isPfamCommentLine( $return_line ) ) {
- # This is just for counting purposes.
- $saw_sequence_line = 2;
- }
- if ( &isPfamSequenceLine( $return_line ) ) {
- if ( $saw_sequence_line == 0 ) {
- $saw_sequence_line = 1;
- }
- $return_line =~ /^\s*(\S+)\s+(\S+)/;
- $seq_name = $1;
- $seq = $2;
- if ( !&startsWithSWISS_PROTname( $return_line ) ) {
- $seq_name =~ /^(\S+)\//;
- $AC = $1;
- unless( exists( $AC_OS{ $AC } ) ) {
- #ACs not present in "ACDEOS" file.
- next;
- }
- $OS = $AC_OS{ $AC };
- if ( !$OS || $OS eq "" ) {
- die "\n\n$0: Unexpected error: species for \"$AC\" not found.\n\n";
- }
- unless( exists( $Species_names_hash{ $OS } ) ) {
- next;
- }
- if ( $remove_frags == 1 ) {
- $DE = $AC_DE{ $AC };
- if ( $DE && $DE =~ /\(FRAGMENT\)/ ) {
- next;
- }
- }
- $seq_name =~ s/\//_$OS\//;
- }
- else {
- if ( $return_line =~ /_([A-Z0-9]{1,5})\// ) {
- unless( exists( $Species_names_hash{ $1 } ) ) {
- next;
- }
- }
- # remove everything whose species cannot be determined.
- else {
- next;
- }
- }
- $length = length( $seq_name );
- for ( $i = 0; $i <= ( $LENGTH_OF_NAME - $length - 1 ); $i++ ) {
- $seq_name .= " ";
- }
- $return_line = $seq_name.$seq."\n";
- }
-
- if ( !&isPfamCommentLine( $return_line ) ) {
- print OUT_RNSP $return_line;
- }
-
- if ( $saw_sequence_line == 1 ) {
- $number_of_seqs++;
- }
- } ## while ( $return_line = <IN_RNSP> )
- close( IN_RNSP );
- close( OUT_RNSP );
-
- return $number_of_seqs;
-
-} ## removeSeqsFromPfamAlign
-
-
-
-
-
-
-
-# Reads in (SWISS-PROT) species names from a file.
-# Names must be separated by newlines.
-# Lines beginning with "#" are ignored.
-# A possible "=" and everything after is ignored.
-# One argument: species-names-file name
-# Last modified: 04/24/01
-sub readSpeciesNamesFile {
- my $infile = $_[ 0 ];
- my $return_line = "";
- my $species = "";
-
- unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
- die "\n\n$0: Error: \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n";
- }
-
- open( IN_RSNF, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!\n\n";
- while ( $return_line = <IN_RSNF> ) {
- if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)/ ) {
- $species = $1;
- $species =~ s/=.+//;
- $Species_names_hash{ $species } = "";
- }
- }
- close( IN_RSNF );
-
- return;
-} ## readSpeciesNamesFile
-
-
-
-# Last modified: 02/21/03
-sub readTrEMBL_ACDEOS_FILE {
-
- my $return_line = "";
-
- unless ( ( -s $TREMBL_ACDEOS_FILE ) && ( -f $TREMBL_ACDEOS_FILE ) && ( -T $TREMBL_ACDEOS_FILE ) ) {
- die "\n\n$0: Error: \"$TREMBL_ACDEOS_FILE\" does not exist, is empty, or is not a plain textfile.\n\n";
- }
- # Fill up (huge) hashs.
- open( HH, "$TREMBL_ACDEOS_FILE" ) || die "\n\n$0: Unexpected error: Cannot open file <<$TREMBL_ACDEOS_FILE>>: $!\n\n";
- while ( $return_line = <HH> ) {
-
- if ( $return_line =~ /(\S+);([^;]*);(\S+)/ ) {
- $AC_OS{ $1 } = $3;
- $AC_DE{ $1 } = $2;
- }
- }
- close( HH );
-} ## readTrEMBL_ACDEOS_FILE
-
-
-# Last modified: 02/21/03
-sub readListFile {
-
- my $return_line = "";
-
- unless ( ( -s $ALGNS_TO_USE_LIST_FILE ) && ( -f $ALGNS_TO_USE_LIST_FILE ) && ( -T $ALGNS_TO_USE_LIST_FILE ) ) {
- die "\n\n$0: Error: \"$ALGNS_TO_USE_LIST_FILE\" does not exist, is empty, or is not a plain textfile.\n\n";
- }
- # Fill up hash.
- open( LF, "$ALGNS_TO_USE_LIST_FILE" ) || die "\n\n$0: Unexpected error: Cannot open file <<$ALGNS_TO_USE_LIST_FILE>>: $!\n\n";
- while ( $return_line = <LF> ) {
- if ( $return_line =~ /^\s*(\S+)\s*$/ ) { # just a list
- $ALGNS_TO_USE{ $1 } = "";
- }
- elsif ( $return_line =~ /^\s*\S+\s+\S+\s+(\S+)/ ) { # "changes" list from Pfam
- $ALGNS_TO_USE{ $1 } = "";
- }
-
- }
- close( LF );
-
-} ## readListFile
-
-
-
-# Five arguments:
-# 1. Name of inputfile
-# 2. Bootstraps
-# 2. Name of output alignment file
-# 3. Name of output positions file
-# 4. Seed for random number generator
-#
-# Last modified: 06/23/01
-sub executeBootstrap {
- my $infile = $_[ 0 ];
- my $bootstraps = $_[ 1 ];
- my $outalign = $_[ 2 ];
- my $positions = $_[ 3 ];
- my $seed = $_[ 4 ];
-
- system( "$BOOTSTRAP_CZ_PL 0 $bootstraps $infile $outalign $positions $seed" )
- && die "\n\n$0: executeBootstrap:\nCould not execute \"$BOOTSTRAP_CZ_PL 0 $bootstraps $infile $outalign $positions $seed\".\n\n";
-
-} ## executeBootstrap
-
-
-
-
-# Last modified: 05/22/02
-sub createTempdir {
-
- my $ii = 0;
- my $time = time;
-
- $tmp_dir = $MY_TEMP_DIR.$time.$ii;
-
- while ( -e $tmp_dir ) {
- $ii++;
- $tmp_dir = $MY_TEMP_DIR.$time.$ii;
- }
-
- mkdir( $tmp_dir, 0777 )
- || die "\n\n$0: Unexpected error: Could not create <<$tmp_dir>>: $!\n\n";
-
- unless ( ( -e $tmp_dir ) && ( -d $tmp_dir ) ) {
- die "\n\n$0: Unexpected error: failed to create <<$tmp_dir>>.\n\n";
- }
-
-} ## createTempdir
-
-
-
-# Last modified: 05/17/01
-sub startLogfile {
- if ( -e $MY_RIO_PWD_DIRECTORY.$LOGFILE ) {
- print "\npfam2pwd.pl:\n";
- print "logfile $MY_RIO_PWD_DIRECTORY"."$LOGFILE already exists\n";
- print "rename it or place it in another directory\n";
- exit( -1 );
- }
-
- open( L, ">$MY_RIO_PWD_DIRECTORY".$LOGFILE )
- || die "\n\n$0: startLogfile: Cannot create logfile: $!\n\n";
- print L "Min seqs : $MIN_SEQS\n";
- print L "Max seqs : $MAX_SEQS\n";
- print L "Seed : $MY_SEED\n";
- print L "TrEMBL ACDEOS file : $TREMBL_ACDEOS_FILE\n";
- print L "Species names file : $MY_SPECIES_NAMES_FILE\n";
- print L "Pfam directory : $MY_PFAM_FULL_DIRECTORY\n";
- print L "PWD outputdirectory: $MY_RIO_PWD_DIRECTORY\n";
- print L "BSP outputdirectory: $MY_RIO_BSP_DIRECTORY\n";
- print L "NBD outputdirectory: $MY_RIO_NBD_DIRECTORY\n";
- print L "ALN outputdirectory: $MY_RIO_ALN_DIRECTORY\n";
- print L "HMM outputdirectory: $MY_RIO_HMM_DIRECTORY\n";
- print L "Start date : ".`date`;
- if ( $MY_MATRIX == 0 ) {
- print L "Matrix : JTT\n";
- }
- elsif ( $MY_MATRIX == 2 ) {
- print L "Matrix : BLOSUM 62\n";
- }
- elsif ( $MY_MATRIX == 3 ) {
- print L "Matrix : mtREV24\n";
- }
- elsif ( $MY_MATRIX == 5 ) {
- print L "Matrix : VT\n";
- }
- elsif ( $MY_MATRIX == 6 ) {
- print L "Matrix : WAG\n";
- }
- elsif ( $MY_MATRIX == 7 ) {
- print L "Matrix : auto\n";
- }
- else {
- print L "Matrix : PAM\n";
- }
-} ## startLogfile
-
-
-
-# Last modified: 05/17/01
-sub finishLogfile {
- my $j = 0;
- print L "\n\n";
- print L "Successfully calculated $i pairwise distance files.\n";
- print L "Too large alignments (>$MAX_SEQS): $too_large\n";
- print L "Too small alignments (<$MIN_SEQS): $too_small\n";
- print L "Finish date : ".`date`."\n\n";
-
- print L "List of the $too_large alignments which were ignored because they\n";
- print L "contained too many sequences (>$MAX_SEQS) after pruning:\n";
- for ( $j = 0; $j < $too_large; ++$j ) {
- print L "$too_large_names[ $j ]\n";
- }
- print L "\n\n";
- print L "List of the $too_small alignments which were ignored because they\n";
- print L "contained not enough sequences (<$MIN_SEQS) after pruning:\n";
- for ( $j = 0; $j < $too_small; ++$j ) {
- print L "$too_small_names[ $j ]\n";
- }
- print L "\n";
- close( L );
-} ## finishLogfile
-
-
-
-
+++ /dev/null
-#! /usr/bin/perl
-
-# Unpack a pfam flatfile, containing many alignments,
-# into separate SELEX-format alignment files.
-#
-# Assumes that ID is the first line in a record,
-# that SQ is the last line before the alignment starts,
-# and that there is one aligned sequence per line.
-#
-
-
-################################################################
-# PFAMSERVER - The Washington University/St. Louis Pfam web server
-# Copyright (C) 1995-1999 Washington University School of Medicine
-# Copyright (C) 1995-1999 Sanger Centre/Genome Research Ltd.
-# Copyright (C) 1998-1999 Karolinska Institutet Center for Genomics Research
-# All Rights Reserved
-#
-# This source code is distributed under the terms of the
-# GNU General Public License. See the files COPYRIGHT and LICENSE
-# for details.
-#
-################################################################
-
-$cpl = 50; # 50 sequence characters per line
-$/ = "\n//"; # paragraph mode on // separators
-
-while (<>) {
- $in_alignment = 0;
- $nseq = 0;
- @lines = split(/^/);
- while ($line = shift(@lines)) {
- if ($in_alignment) {
- if ($line =~ /^\#/) { next; }
- elsif ($line =~ /^(\S+)\s+(\S+)/) {
- $name[$nseq] = $1;
- $aseq[$nseq] = $2;
- $nseq++;
- }
- }
- elsif ($line =~ /^\#=GF ID (\S+)\s*$/) {
- $root = $1;
- print "working on $root\n";
- if (-e "$root") {
- system ("mv $root $root.orig");
- print "$root exists -- moved to $root.orig\n";
- }
- open(SELEX,">$root") || die;
- print SELEX "#=ID $root\n";
- }
- elsif ($line =~ /^\#=GF AC (.+)$/) { print SELEX "#=AC $1\n"; }
- elsif ($line =~ /^\#=GF DE (.+)$/) { print SELEX "#=DE $1\n"; }
-
- elsif ($line =~ /^\#=GF GA (\S+)\s+(\S+)/)
- { print SELEX "#=GA $1 $2\n"; }
-
- elsif ($line =~ /^\#=GF TC (\S+) (\S+)/)
- { print SELEX "#=TC $1 $2\n"; }
-
- elsif ($line =~ /^\#=GF NC (\S+) (\S+)/)
- { print SELEX "#=NC $1 $2\n"; }
-
- elsif ($line =~ /^\#=GF SQ \d+/) {
- print SELEX "# $line";
- $in_alignment = 1;
- }
- elsif ($line =~ /^\/\//) {
- last;
- }
- else {
- print SELEX "# $line";
- }
- }
-
- # figure out maximum name length
- $maxnamelen = 0;
- for ($idx = 0; $idx < $nseq; $idx++) {
- if (length($name[$idx]) > $maxnamelen) {
- $maxnamelen = length($name[$idx]);
- }
- }
- # break the alignment across
- # multiple lines
- $alen = length($aseq[0]);
- for ($pos = 0; $pos < $alen; $pos += $cpl) {
- for ($idx = 0; $idx < $nseq; $idx++) {
- printf(SELEX "%-${maxnamelen}s %s\n",
- $name[$idx], substr($aseq[$idx], $pos, $cpl));
- }
- print SELEX "\n";
- }
- close SELEX;
-}
-
+++ /dev/null
-package queue;
-
-# Process queueing
-# SRE, Wed Sep 2 14:37:14 1998
-# CVS $Id: queue.pm,v 1.1.1.1 2005/03/22 08:35:51 cmzmasek Exp $
-# Master copy: see src/queue (CVS controlled, separate from pfamserver)
-#
-# Written for Pfam web server; suited for queuing any set of commands.
-#
-# API:
-#
-# $mypid = $$;
-# $delay_in_seconds = 2;
-#
-# $nqueued = &queue::CheckQueue("pfamqueue", "username", "/tmp");
-# print ("There are $nqueued jobs ahead of you in line\n");
-# &queue::WaitInQueue("pfamqueue", "username", "/tmp", $mypid, $delay_in_seconds);
-# print ("Our turn! Working...\n");
-# (do stuff)
-# &queue::RemoveFromQueue("pfamqueue", "username", "/tmp", $mypid);
-#
-# queuedir is a directory where the script has write permissions;
-# typically a tmp directory of some sort.
-#
-
-
-################################################################
-# PFAMSERVER - The Washington University/St. Louis Pfam web server
-# Copyright (C) 1995-1999 Washington University School of Medicine
-# Copyright (C) 1995-1999 Sanger Centre/Genome Research Ltd.
-# Copyright (C) 1998-1999 Karolinska Institutet Center for Genomics Research
-# All Rights Reserved
-#
-# This source code is distributed under the terms of the
-# GNU General Public License. See the files COPYRIGHT and LICENSE
-# for details.
-#
-################################################################
-# RCS $Id: queue.pm,v 1.1.1.1 2005/03/22 08:35:51 cmzmasek Exp $
-
-
-# WaitInQueue() - add a process id to a queue, wait for turn
-#
-# Arguments: queue - name of queue (prefix of queue stamp)
-# username - name of user (middle part of queue stamp)
-# queuedir - directory to keep queue stamps in
-# mypid - our process id
-# delay - number of seconds between checking queue status
-#
-# Note: When it checks the queue, if a stamp is present that
-# doesn't seem to correspond to a running process (ps -a),
-# it deletes the stamp. This protects against crashed processes
-# freezing all subsequent jobs.
-#
-# example: &WaitInQueue("pfamqueue", "/tmp", $mypid, 2);
-#
-# Returns 1 on success, 0 on failure.
-#
-# NOTE: You may have to set the ps command in WaitInQueue.
-# It must return all running processes.
-#
-sub WaitInQueue
-{
- local($queue, $username, $queuedir, $mypid, $delay) = @_;
- local(@newqueue, @queuelist, %mark);
- local(*STAMP, *QUEUEDIR);
- local(%is_running);
- local(@output, $line, $pid, $waiting);
-
- # get list of other guys who are working
- opendir(QUEUEDIR, $queuedir);
- @queuelist = grep(/$queue\.\S*\.\d+/, readdir(QUEUEDIR));
- closedir(QUEUEDIR);
- # make stamp for our pid
- if ($username eq "") { $username = "unknown"; }
- open(STAMP, ">$queuedir/$queue.$username.$mypid") || return 0;
- close(STAMP);
- # wait for our turn
- while (1)
- {
- if ($#queuelist == -1) { last; } # nobody ahead of us; our turn!
- sleep($delay);
- # get list of running processes
- %is_running = 0;
- @output = split(/^/, `ps -ax`);
- foreach $line (@output) {
- $line =~ /\s*(\d+)/;
- $is_running{$1} = 1;
- }
- # verify that the guys we're waiting for
- # are still running, and haven't crashed.
- # if they have, reap their stamps, and their
- # tmp files.
- foreach $waiting (@queuelist) {
- ($name, $pid) = ($waiting =~ /$queue\.(\S*)\.(\d+)/);
- if (! $is_running{$pid}) { unlink "$queuedir/$queue.$name.$pid"; }
- }
-
- # get new list of queued jobs ahead of us.
- # ignore guys who came in after we grabbed our initial queue list;
- # they're waiting for *us*. The crazed greps are the Perl-y
- # way of computing an intersection between two arrays.
- #
- opendir(QUEUEDIR, $queuedir);
- @newqueue = grep(/$queue\.\S*\.\d+/, readdir(QUEUEDIR));
- closedir(QUEUEDIR);
- %mark = 0;
- grep($mark{$_}++,@queuelist);
- @queuelist = grep($mark{$_},@newqueue);
- }
-
- 1; # time to run!
-}
-
-
-# CheckQueue() - return total number of processes working, other than us
-# and the total that this particular username is running.
-#
-# Arguments: queue, username, queuedir
-#
-sub CheckQueue
-{
- local($queue, $username, $queuedir) = @_;
- local(*QUEUEDIR, @allqueue, $nall, $nuser);
-
- opendir(QUEUEDIR, $queuedir);
- @allqueue = grep(/$queue\.\S*\.\d+/, readdir(QUEUEDIR));
- closedir(QUEUEDIR);
-
- if ($username eq "") {$username = "unknown"; }
- $nall = $nuser = 0;
- foreach $waiting (@allqueue) {
- ($name, $pid) = ($waiting =~ /$queue\.(\S*)\.(\d+)/);
- $nall++;
- if ($name eq $username) { $nuser++; }
- }
- return ($nall, $nuser);
-}
-
-
-# RemoveFromQueue() - remove a pid from a queue
-#
-sub RemoveFromQueue
-{
- local($queue, $username, $queuedir, $pid) = @_;
- if ($username eq "") {$username = "unknown"; }
- unlink "$queuedir/$queue.$username.$pid";
-}
-
-1;
+++ /dev/null
-#!/usr/bin/perl -W
-
-# rio.pl
-# ------
-#
-# Copyright (C) 2000-2002 Washington University School of Medicine
-# and Howard Hughes Medical Institute
-# All rights reserved
-#
-# Created: 11/25/00
-# Author: Christian M. Zmasek
-# zmasek@genetics.wustl.edu
-# http://www.genetics.wustl.edu/eddy/people/zmasek/
-#
-# Last modified 09/06/03
-#
-
-#
-#
-# Available at: http://www.genetics.wustl.edu/eddy/forester/
-# RIO webserver: http://www.rio.wustl.edu/
-#
-# Reference:
-# Zmasek C.M. and Eddy S.R. (2002)
-# RIO: Analyzing proteomes by automated phylogenomics using
-# resampled inference of orthologs.
-# BMC Bioinformatics 3:14
-# http://www.biomedcentral.com/1471-2105/3/14/
-#
-# It is highly recommended that you read this paper before
-# installing and/or using RIO. (Included in the RIO
-# distribution as PDF: "RIO.pdf".)
-#
-#
-# Before rio.pl can be used, some variables in rio_module.pm need to be set,
-# as described in RIO_INSTALL.
-#
-# Usage: rio.pl <Mode: 1, 2, 3, or 4> <tagged arguments, single letter arguments>
-# -----
-#
-#
-# Examples:
-# --------
-# % RIO1.1/perl/rio.pl 1 A=aconitase Q=RIO1.1/LEU2_HAEIN N=QUERY_HAEIN O=out1 p I C E
-#
-# % RIO1.1/perl/rio.pl 2 A=aconitase N=LEU2_LACLA/5-449 O=out2 p I C E
-#
-# % RIO1.1/perl/rio.pl 3 A=/path/to/my/pfam/Full/aconitase H=aconitase Q=RIO1.1/LEU2_HAEIN N=QUERY_HAEIN O=out3 p I C E
-#
-# % RIO1.1/perl/rio.pl 4 A=/path/to/my/pfam/Full/aconitase N=LEU2_LACLA/5-449 O=out4 p I C E
-#
-# % RIO1.1/perl/rio.pl 3 A=/path/to/my/pfam/Full/aconitase b=/path/to/my/pfam/Seed/aconitase Q=RIO1.1/LEU2_HAEIN N=QUERY_HAEIN O=out5 p I C E
-#
-#
-# Modes:
-# ------
-#
-# 1: RIO analysis based on precalculated pairwise distances
-# alignment does not contain query sequence
-#
-# 2: RIO analysis based on precalculated pairwise distances
-# alignment does contain query sequence
-#
-# 3: RIO analysis based on Pfam alignments,
-# alignment does not contain query sequence
-#
-# 4: RIO analysis based on Pfam alignments,
-# alignment does contain query sequence
-#
-#
-#
-# Tagged arguments:
-# -----------------
-#
-# No "G=", "H=", "F=", "T=", "a=", "b=", "s", "f" in modes 1 and 2.
-#
-#
-# A=<String> Pfam alignment name (mandatory). This specifies the alignment
-# against which the RIO analysis is to be performed.
-# In modes 1 and 2: Pfam model (alignment) name
-# (e.g. "A=aconitase").
-# In modes 3 and 4: Pfam alignment path/name
-# (e.g. "A=/path/to/your/pfam/Full/aconitase").
-#
-# Q=<String> Path/name of file containing the query sequence
-# (in FASTA format or raw sequence) (mandatory in modes 1 and 3).
-#
-# N=<String> Query name (mandatory). This must include the SWISS-PROT code
-# for the species of the query after a "_" (e.g. "N=QUERY_HAEIN").
-# If the query sequence is already in the alignment (modes 2 and 4)
-# the complete name needs to be specified -- including "/xxx-xxx".
-#
-# O=<String> Output file path/name (mandatory).
-#
-# T=<char> Model for pairwaise distance calculation:
-# J=JTT, B=BLOSUM 62, M=mtREV24, V=VT, W=WAG, P=PAM.
-# BLOSUM 62 is default.
-# (Not in modes 1 and 2; these modes use $MATRIX_FOR_PWD instead.)
-#
-# In modes 1 and 3, a HMM is needed to align the query sequence to
-# the alignment and either one of the following options must be
-# employed:
-# H=<String> HMM name: This uses hmmfetch to retrieve a HMM from
-# $PFAM_HMM_DB.
-# F=<String> HMM file: This directly reads the HMM from a file.
-#
-# S=<String> Species tree file path/name (in NHX format) (optional).
-# If not specified, $SPECIES_TREE_FILE_DEFAULT is used.
-#
-# G=<String> Species names file (optional). Only sequences associated with
-# species found in this file are used.
-# In the species names file, individual species names must be
-# separated by newlines and lines starting with "#" are ignored.
-# While only sequences associated with species found in the species
-# tree ("S=") are used for the actual RIO analysis, this allows to
-# remove sequences prior to tree calculation (which is the most
-# time consuming step).
-#
-# P=<int> Sort priority (default is 12):
-# 0 : Ortholog
-# 1 : Ortholog, Super ortholog
-# 2 : Super ortholog, Ortholog
-# 3 : Ortholog, Distance
-# 4 : Distance, Ortholog
-# 5 : Ortholog, Super ortholog, Distance
-# 6 : Ortholog, Distance, Super ortholog
-# 7 : Super ortholog, Ortholog, Distance
-# 8 : Super ortholog, Distance, Ortholog
-# 9 : Distance, Ortholog, Super ortholog
-# 10 : Distance, Super ortholog, Ortholog
-# 11 : Ortholog, Subtree neighbor, Distance
-# 12 : Ortholog, Subtree neighbor, Super ortholog, Distance (default)
-# 13 : Ortholog, Super ortholog, Subtree neighbor, Distance
-# 14 : Subtree neighbor, Ortholog, Super ortholog, Distance
-# 15 : Subtree neighbor, Distance, Ortholog, Super ortholog
-# 16 : Ortholog, Distance, Subtree neighbor, Super ortholog
-# 17 : Ortholog, Subtree neighbor, Distance, Super ortholog
-#
-# a=<int> Bootstraps for tree construction (not in modes 1 and 2).
-# Default is 100.
-#
-# L=<int> Threshold for orthologies for output. Default is 0.
-# v=<int> Threshold for ultra-paralogies for output. Default is 50.
-#
-# U=<int> Threshold for orthologies for distance calculation. Default is 60.
-#
-# X=<int> In case of more than one putative orthologs:
-# number of sd the distance query - LCA has to differ
-# from the mean to generate a warning. Default is 2.
-#
-# Y=<int> In case of no putative orthologs:
-# number of sd the distance query - root has to differ
-# from mean to generate a warning. Default is 2.
-#
-# Z=<double> In case of one putative ortholog:
-# threshold for factor between the two distances to their
-# LCA (larger/smaller) to generate a warning. Default is 2.
-#
-# B=<int> Threshold for subtree-neighborings. Default is 0.
-#
-# b=<String> Build HMM from seed alignment with "hmmbuild -s" (optional).
-# This is to prevent from finding multiple domains per sequence
-# (i.e. prevents "cutting" the query sequence). Give path/name to
-# Seed with this.
-#
-# j=<String> Name for temporary directory (optional).
-#
-# y=<int> Seed for random number generator. Default is 41.
-#
-# I Create and save a rooted, with duplication vs speciation,
-# and orthology information annotated gene tree.
-# If precalculated distances are used (modes 1 and 2): this gene
-# tree is a NJ tree calculated based on the non-bootstrap resampled
-# (original) pairwise distances.
-# If precalculated distances are not used (modes 3 and 4): this gene
-# is a consenus tree with ML branch length values and is also
-# annotated with bootstrap values for each node.
-#
-# Options for output:
-# p Output ultra-paralogs.
-# D Description from SWISS-PROT and TrEMBL.
-# C Complete description from SWISS-PROT and TrEMBL.
-# E 118 character output instead of 78 character output.
-#
-# K Keep intermediate files (they will go into the same directory
-# as the output file, their names are the same as of the output
-# file, with various suffixes added).
-#
-# s Ignore non SWISS-PROT sequences (i.e. sequences from TrEMBL)
-# in the Pfam alignment.
-#
-# f Try to ignore TrEMBL "fragments" (sequences with "fragment" in
-# their description).
-#
-# + Parallel, use machines listed in file $NODE_LIST.
-#
-# x RIO used as web server -- HTML output.
-#
-#
-#
-#
-# History
-# -------
-# 09/06/03: Removal of minor bug. Only create consenus tree with ML branch length
-# values if "I" option used (in modes 3 or 4) -- the problem/bug was that
-# this tree was always created whether "I" was used or not.
-#
-
-
-use strict;
-
-use FindBin;
-use lib $FindBin::Bin;
-use Net::Ping;
-use rio_module;
-
-use File::Basename;
-
-
-my $VERSION = "5.010";
-
-my $E_VALUE_THRESHOLD = 0.01; # For HMMSEARCH.
-my $SORT_DEFAULT = 12;
-my $THRESHOLD_ORTHOLOGS_DEFAULT = 0;
-my $THRESHOLD_SN_DEFAULT = 0;
-my $THRESHOLD_ORTHOLOGS_DEFAULT_DC = 60;
-my $T_ULTRA_PARALOGS_DEFAULT = 50;
-my $WARN_NO_ORTHOS_DEFAULT = 2;
-my $WARN_MORE_THAN_ONE_ORTHO_DEFAULT = 2;
-my $WARN_ONE_ORTHO_DEFAULT = 2;
-my $MIN_NUMBER_OF_SEQS_IN_ALN = 4;
-my $BOOSTRAPS_FOR_MAKETREE_DEFAULT = 100;
-my $SEED_FOR_MAKETREE_DEFAULT = 41;
-my $MATRIX_DEFAULT = 2; # 2=BLOSUM62
-
-my $DO_RIO_TEMP_OUTFILE = "DoRIO_OUTFILE";
-my $TEMP_HMM_FILE = "HMMFILE";
-
-my $DEFAULT_OPTIONS_FOR_MAKETREE = "XR";
-
-
-# I/O files, names:
-my $alignment = "";
-my $hmm_file = "";
-my $hmm_name = "";
-my $seqX_file = "";
-my $species_tree_file = "";
-my $outfile = "";
-my $outfile_annot_nhx_tree = "";
-my $query_name = "";
-my $multiple_trees_file = "";
-my $distance_matrix_file = "";
-my $maketree_out_tree_file = "";
-my $seed_aln_for_hmmbuild = "";
-my $temp_dir = "";
-my $bsp_file = "";
-my $pwd_file = "";
-my $nbd_file = "";
-my $output_dir = "";
-my $species_names_file = " "; # Must be " ".
-my $options_for_makeTree = "";
-
-
-# multiple choice options:
-my $mode = 0;
-my $sort = $SORT_DEFAULT;
-my $matrix_n = $MATRIX_DEFAULT; # 0=JTT 1=PAM 2=BLOSUM62 3=mtREV24 5=VT 6=WAG
-
-
-
-
-# yes/no options:
-my $description = 0;
-my $complete_description = 0;
-my $long_output = 0;
-my $keep = 0;
-my $non_sp = 1; # 0 to remove non SP seqs.
-my $safe_nhx = 0;
-my $no_frags = 0;
-my $output_ultraparalogs = 0;
-my $parallel = 0;
-my $output_HTML = 0;
-
-
-# numerical options:
-my $warn_no_orthos = $WARN_NO_ORTHOS_DEFAULT;
-my $warn_more_than_one_ortho = $WARN_MORE_THAN_ONE_ORTHO_DEFAULT;
-my $warn_one_ortho = $WARN_ONE_ORTHO_DEFAULT;
-my $boostraps_for_makeTree = $BOOSTRAPS_FOR_MAKETREE_DEFAULT;
-my $seed_for_makeTree = $SEED_FOR_MAKETREE_DEFAULT;
-my $t_orthologs = $THRESHOLD_ORTHOLOGS_DEFAULT;
-my $t_sn = $THRESHOLD_SN_DEFAULT;
-my $t_orthologs_dc = $THRESHOLD_ORTHOLOGS_DEFAULT_DC;
-my $t_ultra_paralogs = $T_ULTRA_PARALOGS_DEFAULT;
-
-
-# internal variables:
-my $print_header_for_orthologies = 0;
-my $print_header_for_s_paralogs = 0;
-my $length_of_alignment = 0;
-my $length_of_orig_alignment = 0;
-my $time = 0;
-my $ii = 0;
-my $j = 0;
-my $jj = 0;
-my $number_of_seqs_in_aln = 0;
-my $f = 0;
-my $saw_distance_values = 0;
-my $saw_ultra_paralogs = 0;
-my $bootstraps = 0;
-my $ext_nodes_in_trees_analyzed = 0;
-my $time_total = 0;
-my $time_tree_calc = 0;
-my $time_tree_calcT = 0;
-my $time_rio = 0;
-my $time_rioT = 0;
-my $time_dqopuzzle = 0;
-my $time_dqopuzzleT = 0;
-my $time_addingdists = 0;
-my $time_addingdistsT = 0;
-my $processors = 0;
-my $block_size = 0;
-my $larger_blocks = 0;
-my $printed_ultra_paralogs = 0;
-
-my $dorio_outfile = "";
-my $options_for_DoRIO = "";
-my $ortho_name = "";
-my $orthos = 0;
-my $s_orthos = 0;
-my $subtree_neighbors = 0;
-my $dist = 0;
-my $s_para_name = "";
-my $s_paras = 0;
-my $sort_priority = "";
-my $return_line = "";
-my $matrix = "";
-my $command_line = "";
-my $command_line_for_hmmbuild = "";
-my $current_dir = "";
-my @complete_names = ();
-my @temp_array = ();
-my %Species_names_hash = ();
-my %AC_DE = (); # AC => DE from "ACDEOS" TrEMBL file.
-my %SP_AC_DE = (); # ID => DE from "ACIDOS" SWISS-PROT file.
-my %names_in_pwd_file = ();
-my @nodelist = ();
-
-my $start_date = `date`;
-
-
-
-
-# This analyzes the options:
-# --------------------------
-
-$time_total = time;
-
-if ( @ARGV < 4 ) {
- &printHelp();
-}
-
-$command_line = "$0 ";
-for ( $j = 0; $j < @ARGV; ++$j ) {
- $command_line .= "$ARGV[ $j ] ";
-}
-
-&analyzeCommandLine( @ARGV );
-
-if ( $species_tree_file eq "" ) {
- $species_tree_file = $SPECIES_TREE_FILE_DEFAULT;
-}
-
-&CheckArguments;
-
-$options_for_makeTree = $DEFAULT_OPTIONS_FOR_MAKETREE;
-$options_for_makeTree .= "S".$seed_for_makeTree;
-
-
-if ( $mode == 1 || $mode == 2 ) {
-
- if ( $mode == 1 ) {
- $hmm_file = $RIO_HMM_DIRECTORY.$alignment.$SUFFIX_HMM;
- $bsp_file = $RIO_BSP_DIRECTORY.$alignment.$SUFFIX_BOOT_STRP_POS;
- &userErrorCheckForTextFileExistence( $hmm_file );
- &userErrorCheckForTextFileExistence( $bsp_file );
- }
-
- $pwd_file = $RIO_PWD_DIRECTORY.$alignment.$SUFFIX_PWD;
- $nbd_file = $RIO_NBD_DIRECTORY.$alignment.$SUFFIX_PWD_NOT_BOOTS;
- $alignment = $RIO_ALN_DIRECTORY.$alignment.$ALIGN_FILE_SUFFIX;
- &userErrorCheckForTextFileExistence( $pwd_file );
- &userErrorCheckForTextFileExistence( $nbd_file );
- &userErrorCheckForTextFileExistence( $alignment );
- $no_frags = 0;
- $non_sp = 1;
-
- $options_for_makeTree .= "F";
-}
-elsif ( $mode == 3 || $mode == 4 ) {
- if ( $safe_nhx == 1 ) {
- $options_for_makeTree .= "U";
- }
- else {
- $options_for_makeTree .= "#";
- }
- $options_for_makeTree .= "D"; # To calc. and keep pairwise distances.
- $options_for_makeTree .= "B".$boostraps_for_makeTree;
-
-}
-
-if ( $output_HTML == 1 ) {
- $| = 1;
- $complete_description = 1;
- $long_output = 1;
-
-}
-
-if ( $mode == 1 || $mode == 3 || $mode == 4 ) {
-
- if ( $mode == 1 ) {
- $matrix_n = $MATRIX_FOR_PWD;
- }
-
- if ( $matrix_n == 0 ) {
- $options_for_makeTree .= "J";
- $matrix = "JTT (Jones et al. 1992)";
- }
- elsif ( $matrix_n == 1 ) { # PAM is makeTree's default.
- $matrix = "PAM (Dayhoff et al. 1978)";
- }
- elsif ( $matrix_n == 2 ) {
- $options_for_makeTree .= "L";
- $matrix = "BLOSUM 62 (Henikoff-Henikoff 92)";
- }
- elsif ( $matrix_n == 3 ) {
- $options_for_makeTree .= "M";
- $matrix = "mtREV24 (Adachi-Hasegawa 1996)";
- }
- elsif ( $matrix_n == 5 ) {
- $options_for_makeTree .= "T";
- $matrix = "VT (Mueller-Vingron 2000)";
- }
- elsif ( $matrix_n == 6 ) {
- $options_for_makeTree .= "W";
- $matrix = "WAG (Whelan-Goldman 2000)";
- }
- else {
- &dieWithUnexpectedError( "Failed sanity check" );
- }
-}
-
-
-# This creates the temp directory:
-# --------------------------------
-
-$ii = 0;
-
-$time = time;
-
-if ( $temp_dir eq "" ) {
- $temp_dir = $TEMP_DIR_DEFAULT.$time.$ii;
-}
-else {
- $temp_dir = $temp_dir.$ii;
-}
-
-while ( -e $temp_dir ) {
- $ii++;
- $temp_dir = $TEMP_DIR_DEFAULT.$time.$ii;
-}
-
-mkdir( $temp_dir, 0700 ) || &dieWithUnexpectedError( "Could not create \"$temp_dir\"" );
-
-unless ( ( -e $temp_dir ) && ( -d $temp_dir ) ) {
- &dieWithUnexpectedError( "\"$temp_dir\" does not exist, or is not a directory" );
-}
-
-
-
-# The analysis starts here:
-# -------------------------
-
-$dorio_outfile = $temp_dir."/".$DO_RIO_TEMP_OUTFILE;
-
-$output_dir = dirname( $outfile );
-
-unless ( ( -e $output_dir ) && ( -d $output_dir ) ) {
- &userError( "Outfile directory (\"$output_dir\") does not exist,\n or is not a directory." );
-}
-
-if ( $mode == 1 || $mode == 3 ) {
- $query_name = substr( $query_name, 0, $LENGTH_OF_NAME - 10 );
-}
-
-
-
-
-
-if ( $mode == 1 || $mode == 3 ) {
-
- # Prepares the query file:
- # ------------------------
- $query_name = &seqFile2CleanedUpFastaFile( $seqX_file,
- "$temp_dir/QUERY_SEQ",
- $query_name );
- if ( $query_name eq "" ) {
- &userError( "Query file \"$seqX_file\") does not appear to contain a valid name\n and/or \"-N\" option has not been used." );
- }
-
- if ( $mode == 3 ) {
- # Prepares the HMM:
- # -----------------
- if ( $hmm_file eq "" ) {
- $hmm_file = $temp_dir."/".$TEMP_HMM_FILE;
- if ( $hmm_name ne "" ) {
- &executeHmmfetch( $PFAM_HMM_DB, $hmm_name, $hmm_file );
- }
- elsif ( $seed_aln_for_hmmbuild ne "" ) {
- $command_line_for_hmmbuild = &executeHmmbuild( $seed_aln_for_hmmbuild, $hmm_file );
- }
- }
-
- }
-}
-
-
-
-
-# This might remove non SWISS PROT seqs, TreMBL fragments,
-# and seqs from species not in $species_names_file from the alignment:
-# --------------------------------------------------------------------
-if ( $mode == 3 || $mode == 4 ) {
- #if ( $do_not_removeSeqsFromPfamAlign != 1 ) {
-
- if ( $mode == 3 ) {
- &removeSeqsFromPfamAlign( $alignment,
- $temp_dir."/ALIGN2",
- " ",
- $species_names_file,
- $non_sp,
- $no_frags );
- }
- else {
- &removeSeqsFromPfamAlign( $alignment,
- $temp_dir."/ALIGN2",
- $query_name,
- $species_names_file,
- $non_sp,
- $no_frags );
- }
-
-}
-
-
-
-# If necessary, this aligns the query to the pfam alignment
-# using hmmsearch, p7extract.pl, multifetch.pl, and hmmalign
-# from the HMMER package:
-# ----------------------------------------------------------
-if ( $mode == 1 || $mode == 3 ) {
- if ( $mode == 1 ) {
-
- $f = &alignWithHmmalign( $alignment,
- $temp_dir."/QUERY_SEQ",
- $hmm_file,
- $temp_dir."/HMMALIGNOUT",
- 1 ); # --mapali
-
-
- }
- else {
-
- $f = &alignWithHmmalign( $temp_dir."/ALIGN2",
- $temp_dir."/QUERY_SEQ",
- $hmm_file,
- $temp_dir."/HMMALIGNOUT",
- 0 ); # --withali
-
- }
- if ( $f != 1 ) {
- if ( $alignment =~ /.+\/(.+)/ ) {
- $alignment = $1;
- }
- if ( $alignment =~ /(.+)\..+/ ) {
- $alignment = $1;
- }
- &cleanUpTempDir();
- if ( $output_HTML == 1 ) {
- &exitWithWarning( "query sequence does not contain sufficient similarity to the \"$alignment\" domain", 1 );
- }
- else {
- &exitWithWarning( "Query sequence does not contain sufficient similarity to the \"$alignment\" domain" );
- }
- }
-
-
- # In case query contains more than one of the same domain:
-
- @complete_names = &getCompleteName( $temp_dir."/HMMALIGNOUT", $query_name );
-
- if ( @complete_names < 1 ) {
- &dieWithUnexpectedError( "Could not find \"$query_name in $temp_dir"."/HMMALIGNOUT\"" );
- }
-}
-elsif ( $mode == 2 || $mode == 4 ) {
- # Here, this is just for checking:
- if ( $mode == 2 ) {
- @complete_names = &getCompleteName( $alignment, $query_name );
- }
- elsif ( $mode == 4 ) {
- @complete_names = &getCompleteName( $temp_dir."/ALIGN2", $query_name );
- }
- if ( @complete_names < 1 ) {
- &dieWithUnexpectedError( "Could not find \"$query_name in $temp_dir"."/HMMALIGNOUT\"" );
- }
- @complete_names = ();
- $complete_names[ 0 ] = $query_name;
-}
-
-if ( $parallel == 1 ) {
- &readInNodesList();
- &pingNodes();
- $processors = scalar( @nodelist );
- if ( $processors < 2 ) {
- $parallel = 0;
- }
- if ( $processors > $BOOTSTRAPS ) {
- $processors = $BOOTSTRAPS;
- }
- else {
- $block_size = int $BOOTSTRAPS / $processors;
- $larger_blocks = $BOOTSTRAPS - ( $block_size * $processors ); # number of blocks which have a size of
- # block_size + 1
-
- }
-}
-
-
-# This opens the output file:
-# ---------------------------
-if ( $output_HTML != 1 ) {
- open( OUT, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" );
-}
-
-# This starts printing to the output file:
-# ----------------------------------------
-&printHeader();
-
-
-
-# This loop goes through the different domains of the query
-# which aligned to the alignment (in modes 2 and 4 this can
-# obviously be only one):
-# -----------------------------------------------------------
-for ( $jj = 0; $jj < @complete_names; ++$jj ) {
-
- if ( $mode == 1 ) {
- # Moves the query to the last line(s) of the alignment.
- # Removes other querie domains $complete_names[i]-- for which i != $jj
- # --------------------------------------------------------------------
-
- &moveToLast( $complete_names[ $jj ],
- $temp_dir."/HMMALIGNOUT",
- $temp_dir."/MOVETOLASTOUT",
- \@complete_names );
-
- }
-
- if ( $mode == 1 || $mode == 3 ) {
- if ( $mode == 1 ) {
- @temp_array = &pfam2phylipMatchOnly( $temp_dir."/MOVETOLASTOUT",
- $temp_dir."/ALIGN2_PHYLIP_MO",
- 0 );
- }
- else {
- @temp_array = &pfam2phylipMatchOnly( $temp_dir."/HMMALIGNOUT",
- $temp_dir."/ALIGN2",
- 1 );
- }
- $length_of_alignment = $temp_array[ 0 ];
- $length_of_orig_alignment = $temp_array[ 1 ];
- $number_of_seqs_in_aln = $temp_array[ 2 ];
- }
- elsif ( $mode == 2 || $mode == 4 ) {
-
- $query_name = $complete_names[ 0 ];
-
- if ( $mode == 4 ) {
- if ( !&startsWithSWISS_PROTname( $query_name ) ) {
- # Query is not a SWISS-PROT sequence.
- $query_name = &getCompleteNameForTrEMBLquerySeq( $temp_dir."/ALIGN2",
- $query_name );
- }
-
- $number_of_seqs_in_aln = &countSeqsInPfamAlign( $temp_dir."/ALIGN2" );
- }
- else {
- if ( !&startsWithSWISS_PROTname( $query_name ) ) {
- # Query is not a SWISS-PROT sequence.
- $query_name = &getCompleteNameForTrEMBLquerySeq( $alignment,
- $query_name );
- }
- $number_of_seqs_in_aln = &countSeqsInPfamAlign( $alignment );
- }
-
-
-
- }
-
- if ( $number_of_seqs_in_aln < $MIN_NUMBER_OF_SEQS_IN_ALN ) {
- &cleanUpTempDir();
- if ( $output_HTML == 1 ) {
- &exitWithWarning( "Removal of sequences resulted in an alignment with less than $MIN_NUMBER_OF_SEQS_IN_ALN sequences ($number_of_seqs_in_aln)", 1 );
- }
- else {
- &exitWithWarning( "Removal of sequences resulted in an alignment with less than $MIN_NUMBER_OF_SEQS_IN_ALN sequences ($number_of_seqs_in_aln)" );
- }
- }
-
-
- if ( $mode == 1 ) {
-
- unlink( $temp_dir."/ALIGN2_BOOTSTRAPPED" );
-
- if ( $parallel == 1 ) {
- &executeBootstrap_cz( $BOOTSTRAPS,
- $bsp_file,
- $temp_dir."/ALIGN2_PHYLIP_MO",
- $temp_dir."/ALIGN2_BOOTSTRAPPED",
- $processors );
-
- }
- else {
-
- &executeBootstrap_cz( $BOOTSTRAPS,
- $bsp_file,
- $temp_dir."/ALIGN2_PHYLIP_MO",
- $temp_dir."/ALIGN2_BOOTSTRAPPED" );
-
- }
-
-
- $current_dir = `pwd`;
- $current_dir =~ s/\s//;
-
- chdir ( $temp_dir ) || &dieWithUnexpectedError( "Could not chdir to \"$temp_dir\"" );
-
-
- if ( $parallel == 1 ) {
-
- my $number = 0;
- my $all_finished = 0;
-
- system( $RIO_SLAVE_DRIVER,
- $block_size,
- $larger_blocks,
- $temp_dir."/ALIGN2_BOOTSTRAPPED",
- $matrix_n,
- $complete_names[ $jj ],
- $pwd_file,
- $temp_dir,
- $seed_for_makeTree,
- @nodelist )
- && &dieWithUnexpectedError( "Could not execute \"$RIO_SLAVE_DRIVER\"" );
-
- while ( $all_finished != 1 ) {
- for ( $number = 0; $number < $processors; $number++ ) {
- unless ( -e "FINISHED_$number" ) {
- $number = -1;
- }
- }
- $all_finished = 1;
- }
-
- sleep( 1 );
-
- system( "mv",
- "MAKETREEOUT".$MULTIPLE_TREES_FILE_SUFFIX."0",
- "MAKETREEOUT".$MULTIPLE_TREES_FILE_SUFFIX )
- && &dieWithUnexpectedError( "$!" );
-
- for ( $number = 1; $number < $processors; $number++ ) {
- system( "cat MAKETREEOUT$MULTIPLE_TREES_FILE_SUFFIX$number >> MAKETREEOUT$MULTIPLE_TREES_FILE_SUFFIX" )
- && &dieWithUnexpectedError( "$!" );
- if ( unlink( "MAKETREEOUT$MULTIPLE_TREES_FILE_SUFFIX$number" ) != 1 ) {
- &dieWithUnexpectedError( "Could not delete \"MAKETREEOUT$MULTIPLE_TREES_FILE_SUFFIX$number" );
- }
- }
-
- # Sanity check: Counts ";" in "MAKETREEOUT$MULTIPLE_TREES_FILE_SUFFIX".
- if ( `grep -c ';' MAKETREEOUT$MULTIPLE_TREES_FILE_SUFFIX` != $BOOTSTRAPS ) {
- &dieWithUnexpectedError( "\"MAKETREEOUT$MULTIPLE_TREES_FILE_SUFFIX\" does not contain $BOOTSTRAPS \";\"" );
- }
-
- for ( $number = 0; $number < $processors; $number++ ) {
- if ( unlink( "FINISHED_$number" ) != 1 ) {
- &dieWithUnexpectedError( "Could not delete \"FINISHED_$number\"" );
- }
- }
-
- &executeConsense( "MAKETREEOUT".$MULTIPLE_TREES_FILE_SUFFIX );
- unlink( "outfile", "intree" );
-
- system( "mv", "outtree", "MAKETREEOUT.nhx" )
- && &dieWithUnexpectedError( "$!" );
-
-
- }
- else {
- $time_dqopuzzle = time; #time
- &executePuzzleDQObootstrapped( "ALIGN2_BOOTSTRAPPED", $matrix_n );
- $time_dqopuzzle = time - $time_dqopuzzle; #time
- $time_dqopuzzleT += $time_dqopuzzle; #time
-
- system( "mv", "ALIGN2_BOOTSTRAPPED.dist", "DISTs_TO_QUERY" )
- && &dieWithUnexpectedError( "$!" );
- }
-
-
- &executePuzzleDQO( "ALIGN2_PHYLIP_MO", $matrix_n );
-
- unlink( "ALIGN2_PHYLIP_MO" );
-
- system( "mv", "ALIGN2_PHYLIP_MO.dist", "DIST_TO_QUERY" )
- && &dieWithUnexpectedError( "$!" );
-
- if ( $parallel != 1 ) {
- $time_addingdists = time;
- &addDistsToQueryToPWDfile( $pwd_file,
- $temp_dir."/DISTs_TO_QUERY",
- $temp_dir."/PWD_INC_QUERY",
- $complete_names[ $jj ] );
-
-
- $time_addingdists = time - $time_addingdists;
- $time_addingdistsT += $time_addingdists;
- }
- &addDistsToQueryToPWDfile( $nbd_file,
- $temp_dir."/DIST_TO_QUERY",
- $temp_dir."/NBD_INC_QUERY",
- $complete_names[ $jj ] );
-
- }
-
- if ( $mode == 2 ) {
- $current_dir = `pwd`;
- $current_dir =~ s/\s//;
- chdir ( $temp_dir )
- || &dieWithUnexpectedError( "Could not chdir to \"$temp_dir\"" );
-
- }
-
-
- if ( $parallel != 1 ) {
- unlink( $temp_dir."/MAKETREEOUT".$TREE_FILE_SUFFIX );
- }
-
- $time_tree_calc = time;
-
- # This calculates the trees
- # -------------------------
-
- if ( $mode == 1 || $mode == 2 ) {
-
- if ( $mode == 1 ) {
-
- &executeNeighbor( $temp_dir."/NBD_INC_QUERY",
- 0,
- 0,
- 0,
- 1 );
-
- unlink( "outfile" );
- system( "mv", "outtree", "NBD_NJ_TREE" )
- && &dieWithUnexpectedError( "$!" );
- if ( $parallel != 1 ) {
- &executeMakeTree( $options_for_makeTree,
- $temp_dir."/PWD_INC_QUERY",
- $temp_dir."/MAKETREEOUT".$TREE_FILE_SUFFIX,
- $temp_dir."/maketree_tempdir" );
- }
-
- }
- else {
- &executeNeighbor( $nbd_file,
- 0,
- 0,
- 0,
- 1 );
-
- unlink( "outfile" );
- system( "mv", "outtree", "NBD_NJ_TREE" )
- && &dieWithUnexpectedError( "$!" );
-
- &executeMakeTree( $options_for_makeTree,
- $pwd_file,
- $temp_dir."/MAKETREEOUT".$TREE_FILE_SUFFIX,
- $temp_dir."/maketree_tempdir" );
-
- }
-
- chdir( $current_dir )
- || &dieWithUnexpectedError( "Could not chdir to \"$current_dir\"" );
-
-
- }
- elsif ( $mode == 3 || $mode == 4 ) {
- &executeMakeTree( $options_for_makeTree,
- $temp_dir."/ALIGN2",
- $temp_dir."/MAKETREEOUT".$TREE_FILE_SUFFIX,
- $temp_dir."/maketree_tempdir" );
-
- unlink( $temp_dir."/MAKETREEOUT".$ALIGN_FILE_SUFFIX );
- }
-
-
- $time_tree_calc = time - $time_tree_calc;
- $time_tree_calcT += $time_tree_calc;
-
- if ( $keep == 1 ) {
-
- system( "cp", $temp_dir."/MAKETREEOUT".$TREE_FILE_SUFFIX, $outfile.$TREE_FILE_SUFFIX );
- system( "cp", $temp_dir."/MAKETREEOUT".$LOG_FILE_SUFFIX, $outfile.$LOG_FILE_SUFFIX );
- system( "cp", $temp_dir."/MAKETREEOUT".$MULTIPLE_TREES_FILE_SUFFIX, $outfile.$MULTIPLE_TREES_FILE_SUFFIX );
- if ( $mode == 1 || $mode == 2 ) {
- system( "cp", $temp_dir."/NBD_NJ_TREE", $outfile."-NJ".$TREE_FILE_SUFFIX );
- }
-
- }
-
- unlink( $temp_dir."/ALIGN2" );
-
- $multiple_trees_file = $temp_dir."/MAKETREEOUT".$MULTIPLE_TREES_FILE_SUFFIX;
- $maketree_out_tree_file = $temp_dir."/MAKETREEOUT".$TREE_FILE_SUFFIX;
- $distance_matrix_file = $temp_dir."/MAKETREEOUT".$SUFFIX_PWD_NOT_BOOTS;
-
-
- if ( $mode == 1 || $mode == 3 ) {
- $query_name = $complete_names[ $jj ];
- }
-
- $options_for_DoRIO = "";
-
- # This will result in saving of the annotated consenus tree:
- # ----------------------------------------------------------
- if ( $safe_nhx == 1 ) {
- my $number = $jj + 1;
- if ( @complete_names > 1 ) {
- $outfile_annot_nhx_tree = $outfile.$ADDITION_FOR_RIO_ANNOT_TREE."-".$number.$TREE_FILE_SUFFIX;
- }
- else {
- $outfile_annot_nhx_tree = $outfile.$ADDITION_FOR_RIO_ANNOT_TREE.$TREE_FILE_SUFFIX;
- }
- }
-
-
-
- if ( $sort > 2 ) {
- if ( $mode == 3 || $mode == 4 ) {
- $options_for_DoRIO .= " D=".$distance_matrix_file;
- }
- elsif ( $mode == 1 ) {
- $options_for_DoRIO .= " d=".$temp_dir."/DIST_TO_QUERY";
- }
- elsif ( $mode == 2 ) {
- $options_for_DoRIO .= " D=".$nbd_file;
- }
- }
- $options_for_DoRIO .= " M=".$multiple_trees_file;
- $options_for_DoRIO .= " 'N=".$query_name."'";
- $options_for_DoRIO .= " S=".$species_tree_file;
- $options_for_DoRIO .= " O=".$dorio_outfile;
- $options_for_DoRIO .= " P=".$sort;
- $options_for_DoRIO .= " L=".$t_orthologs;
- $options_for_DoRIO .= " B=".$t_sn;
- $options_for_DoRIO .= " U=".$t_orthologs_dc;
- $options_for_DoRIO .= " X=".$warn_more_than_one_ortho;
- $options_for_DoRIO .= " Y=".$warn_no_orthos;
- $options_for_DoRIO .= " Z=".$warn_one_ortho;
-
- if ( $mode == 1 || $mode == 2 ) {
- $options_for_DoRIO .= " T=".$temp_dir."/NBD_NJ_TREE";
- $options_for_DoRIO .= " t=".$maketree_out_tree_file;
- }
- elsif ( $mode == 3 || $mode == 4 ) {
- if ( $safe_nhx == 1 ) { # Added 09/04/03.
- $options_for_DoRIO .= " T=".$maketree_out_tree_file;
- }
- }
-
- if ( $safe_nhx == 1 ) {
- $options_for_DoRIO .= " I";
- }
- if ( $output_ultraparalogs == 1 ) {
- $options_for_DoRIO .= " p";
- $options_for_DoRIO .= " v=".$t_ultra_paralogs;
- }
-
- $time_rio = time;
-
- &executeDoRIO( $options_for_DoRIO );
-
- $time_rio = time - $time_rio;
- $time_rioT += $time_rio;
-
- unless ( ( -s $dorio_outfile ) && ( -f $dorio_outfile ) && ( -T $dorio_outfile ) ) {
- close( OUT );
- unlink( $outfile );
- &dieWithUnexpectedError( "failure during execution of RIO (no output generated)" );
- }
-
- if ( $safe_nhx == 1 ) {
- system( "mv",
- $temp_dir."/".$DO_RIO_TEMP_OUTFILE.$ADDITION_FOR_RIO_ANNOT_TREE.$TREE_FILE_SUFFIX,
- $outfile_annot_nhx_tree )
- && &dieWithUnexpectedError( "$!" );
- }
-
-
- open( IN, "$dorio_outfile" )
- || &dieWithUnexpectedError( "Cannot open file \"$dorio_outfile\"" );
-
- $saw_distance_values = 0;
- $saw_ultra_paralogs = 0;
- $printed_ultra_paralogs = 0;
- $print_header_for_orthologies = 1;
- $print_header_for_s_paralogs = 1;
-
-
-
-
- # This generates the report
- # -------------------------
-
- W: while ( $return_line = <IN> ) {
-
- if ( $return_line =~ /distance values:/i ) {
- $saw_distance_values = 1;
- &printTitleForDistanceValues();
- }
- elsif ( $return_line =~ /ultra paralogs/i ) {
- $saw_ultra_paralogs = 1;
- }
- elsif ( $return_line =~ /^mean bootstrap/i ) {
- &printMeanBootstraps();
- }
- elsif ( $return_line =~ /sort priority\s*:\s*(.+)/i ) {
- $sort_priority = $1;
- }
- elsif ( $return_line =~ /ext nodes\s*:\s*(.+)/i ) {
- $ext_nodes_in_trees_analyzed = $1 - 1; # One seq is query.
- }
- elsif ( $return_line =~ /bootstraps\s*:\s*(\S+)/i ) {
- if ( $jj == @complete_names - 1 ) {
- $bootstraps = $1;
- if ( $output_HTML == 1 ) {
- $| = 1;
- }
- &printOptions();
- last W;
- }
- }
- elsif ( $saw_distance_values != 1
- && $saw_ultra_paralogs != 1
- && $return_line =~ /(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s*(\S*)/ ) {
- $ortho_name = $1;
- $orthos = $2;
- $subtree_neighbors = $3;
- $s_orthos = $4;
- $dist = $5;
-
- if ( $print_header_for_orthologies == 1 ) {
- &printHeaderForOrthologies();
- $print_header_for_orthologies = 0;
- }
- &printOrthologies();
- }
- elsif ( $saw_distance_values != 1
- && $saw_ultra_paralogs != 1
- && $return_line =~ /^\s*-\s*$/ ) {
- $ortho_name = "-";
- $orthos = 0;
- $s_orthos = 0;
- $dist = 0;
- if ( $print_header_for_orthologies == 1 ) {
- &printHeaderForOrthologies();
- $print_header_for_orthologies = 0;
- }
- &printOrthologies();
- }
- elsif ( $output_ultraparalogs == 1
- && $saw_ultra_paralogs == 1
- && $return_line =~ /(\S+)\s+(\S+)\s+(\S+)/ ) {
- $s_para_name = $1;
- $s_paras = $2;
- $dist = $3;
- if ( $print_header_for_s_paralogs == 1 ) {
- &printHeaderForSparalogs();
- $print_header_for_s_paralogs = 0;
- }
- &printUltraParlogs();
- $printed_ultra_paralogs = 1;
- }
- elsif ( $output_ultraparalogs == 1
- && $saw_ultra_paralogs == 1
- && $return_line =~ /^\s*-\s*$/ ) {
- &printNoUltraParalogs();
- }
- elsif ( $return_line =~ /Bootstraps/ ) {
- $saw_distance_values = 0;
- }
- elsif ( $saw_distance_values == 1 && $saw_ultra_paralogs != 1 ) {
- &printDistanceValues();
- }
-
- }
- close( IN );
-
-} # End of for loop going through possible
- # multiple matches to the same alignment/model.
-
-if ( $output_HTML != 1 ) {
- close( OUT );
-}
-
-&cleanUpTempDir();
-
-if ( $output_HTML != 1 ) {
- print( "\n\nrio.pl successfully terminated.\nOutput written to: $outfile\n\n" );
-}
-
-exit( 0 );
-
-
-
-
-
-
-
-
-
-# ===========================================================
-# Methods
-# -----------------------------------------------------------
-
-
-
-
-# -----------------------------------------------------------
-# Parallization related
-# -----------------------------------------------------------
-
-
-
-# Last modified: 02/02/02
-sub readInNodesList {
-
- &testForTextFilePresence( $NODE_LIST );
-
- open( NIN, "$NODE_LIST" ) || &dieWithUnexpectedError( "Cannot open file \"$NODE_LIST\"" );
-
- while ( <NIN> ) {
- if ( $_ =~ /(\S+)/ ) {
- push( @nodelist, $1 );
- }
- }
- close( NIN );
- return;
-}
-
-
-
-# Last modified: 02/02/02
-sub pingNodes {
- my @temp_node_list = ();
- my $p = Net::Ping->new( "tcp", 2 ); # or "udp"
- my $n = "";
-
- foreach $n ( @nodelist ) {
- if ( defined( $p->ping( $n ) ) ) {
- push( @temp_node_list, $n );
- }
- }
- @nodelist = ();
- @nodelist = @temp_node_list;
- return;
-
-}
-
-
-
-
-# -----------------------------------------------------------
-# Output related
-# -----------------------------------------------------------
-
-
-# Last modified: 03/07/01
-sub printHeader {
-
- if ( $output_HTML != 1 ) {
- print OUT "RIO - Resampled Inference of Orthologs\n";
- print OUT "Version: $VERSION\n";
- print OUT "------------------------------------------------------------------------------\n";
-
- print OUT "Pfam alignment file : $alignment\n";
- if ( $mode == 3 ) {
- print OUT "Pfam alignment description : ".&getDescriptionFromPfam( $alignment )."\n";
- }
- if ( $mode == 1 || $mode == 2 ) {
- print OUT "Bootstrapped pairwise distances file : $pwd_file\n";
- print OUT "Not bootstrapped pairwise distances file: $nbd_file\n";
- print OUT "Bootstrap positions file : $bsp_file\n";
- }
- if ( $mode == 1 || $mode == 3 ) {
- if ( $seed_aln_for_hmmbuild ne "" ) {
- print OUT "HMM : built based on $seed_aln_for_hmmbuild\n";
- }
- elsif ( $hmm_name ne "" ) {
- print OUT "HMM : $hmm_name\n";
- }
- else {
- print OUT "HMM : $hmm_file\n";
- }
- print OUT "Query file : $seqX_file\n";
- }
- print OUT "==============================================================================\n\n";
- }
-
-} ## printHeader
-
-
-
-
-# Last modified: 03/07/01
-sub printHeaderForOrthologies {
-
- if ( $output_HTML != 1 ) {
- if ( $jj > 0 ) {
- print OUT "\n\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n";
- }
-
- print OUT "Query : $query_name\n\n";
-
- if ( @complete_names > 1 ) {
- my $size = @complete_names;
- my $number = $jj + 1;
- print OUT "More than one region of the query were aligned to the profile HMM.\n";
- print OUT "This is for domain #$number out of $size.\n\n";
- }
-
- print OUT "Number (in %) of observed orthologies (o), \"subtree-neighborings\" (n),\n";
- print OUT "and super-orthologies (s) to query in bootstrapped trees, evolutionary\n";
- print OUT "distance to query (as average number of aa replacements per residue):\n\n";
- if ( $long_output != 1 ) {
- print OUT "Sequence Description o[%] n[%] s[%] distance\n";
- print OUT "-------- ----------- ---- ---- ---- --------\n";
- }
- else {
- print OUT "Sequence Description o[%] n[%] s[%] distance\n";
- print OUT "-------- ----------- ---- ---- ---- --------\n";
- }
- }
- else {
- if ( $jj > 0 ) {
- print "</TABLE>\n";
- print "<P>   </P>\n";
- print "<HR NOSHADE COLOR=\"#CCCCCC\">\n";
- print "<P>   </P>\n";
- }
-
- if ( @complete_names > 1 ) {
- my $size = @complete_names;
- my $number = $jj + 1;
- print "<P>More than one region of the query were aligned to the profile HMM. \n";
- print "This is for domain #$number out of $size.</P>\n";
- }
- print "<P>Query : $query_name</P>\n";
- print "<H4 class = \"title\">Orthologies, subtree-neighborings, super-orthologies</H4>\n";
-
- print "<P>Number (in %) of observed orthologies (o), \"subtree-neighborings\" (n), \n";
- print "and super-orthologies (s) to query in bootstrapped trees, evolutionary \n";
- print "distance to query (as average number of aa replacements per residue):</P>\n";
- if ( $ortho_name ne "-" ) {
- print "<TABLE BORDER=\"0\" CELLPADDING=\"1\">\n";
-
- print "<TR VALIGN=\"TOP\"> <TD NOWRAP> <B>Sequence</B> </TD><TD NOWRAP> <B>Description</B> </TD><TD NOWRAP ALIGN=\"RIGHT\"> <B>o[%]</B> </TD><TD NOWRAP ALIGN=\"RIGHT\"> <B>n[%]</B> </TD><TD NOWRAP ALIGN=\"RIGHT\"> <B>s[%]</B> </TD><TD NOWRAP>   <B>distance</B> </TD> </TR>\n";
- }
- }
-
-} ## printHeaderForOrthologies
-
-
-
-# Last modified: 10/15/01
-sub printHeaderForSparalogs {
-
- if ( $output_HTML != 1 ) {
- print OUT "\nUltra-paralogs\n";
- print OUT "--------------\n";
- print OUT "Number (in %) of observed ultra-paralogies (up) to query\n";
- print OUT "in bootstrapped trees, evolutionary distance to query (as average number\n";
- print OUT "of aa replacements per residue):\n\n";
- if ( $long_output != 1 ) {
- print OUT "Sequence Description up[%] distance\n";
- print OUT "-------- ----------- ----- --------\n";
- }
- else {
- print OUT "Sequence Description up[%] distance\n";
- print OUT "-------- ----------- ----- --------\n";
- }
- }
- else {
- print "<H4 class = \"title\">Ultra-paralogs</H4>\n";
- print "<P>Number (in %) of observed ultra-paralogies (up) to query \n";
- print "in bootstrapped trees, evolutionary distance to query (as average number \n";
- print "of aa replacements per residue):</P>\n";
- print "<TABLE BORDER=\"0\" CELLPADDING=\"1\">\n";
- print "<TR VALIGN=\"TOP\"> <TD NOWRAP> <B>Sequence</B> </TD><TD NOWRAP> <B>Description</B> </TD><TD NOWRAP ALIGN=\"RIGHT\"> <B>up[%]</B> </TD><TD NOWRAP>   <B>distance</B> </TD> </TR>\n";
-
- }
-
-} ## printHeaderForSparalogs
-
-
-
-# Last modified: 03/07/01
-sub printOrthologies {
- my @cut = ();
- my $i = 0;
- my $descp = "";
- $orthos = &roundToInt( $orthos );
- $s_orthos = &roundToInt( $s_orthos );
-
- if ( $sort > 10 ) {
- $subtree_neighbors = &roundToInt( $subtree_neighbors );
- }
-
- if ( ( $description == 1 || $complete_description == 1 )
- && $ortho_name ne "-" ) {
-
- if ( $non_sp != 1 ) {
- if ( &startsWithSWISS_PROTname( $ortho_name ) ) {
- $descp = &getDescriptionFromSWISSPROT_ACDEOSfile( $SWISSPROT_ACDEOS_FILE, $ortho_name );
- }
- else {
- $descp = "-";
- }
- }
- else {
- if ( &startsWithSWISS_PROTname( $ortho_name ) ) {
- $descp = &getDescriptionFromSWISSPROT_ACDEOSfile( $SWISSPROT_ACDEOS_FILE, $ortho_name );
- }
- else {
- $descp = &getDescriptionFromTrEMBL_ACDEOSfile( $TREMBL_ACDEOS_FILE, $ortho_name );
- }
- }
-
- if ( $output_HTML != 1 ) {
- if ( $long_output == 1 ) {
- @cut = &cutDescription( $descp, 73 );
- }
- else {
- @cut = &cutDescription( $descp, 33 );
- }
- $descp = $cut[ 0 ];
- }
- }
- if ( $descp eq "" ) {
- $descp = "-";
- }
-
- if ( $output_HTML != 1 ) {
-
- if ( $ortho_name eq "-" ) {
- print OUT "\nNO ORTHOLOGS in alignment with the current thresholds for output\n";
- }
- elsif ( $dist ne "-" ) {
- if ( $long_output == 1 ) {
- print OUT sprintf "%-24.24s%-74.74s%5s%5s%5s%10.6f", $ortho_name,$descp,$orthos,$subtree_neighbors,$s_orthos,$dist;
- }
- else {
- print OUT sprintf "%-24.24s%-34.34s%5s%5s%5s%10.6f", $ortho_name,$descp,$orthos,$subtree_neighbors,$s_orthos,$dist;
- }
- }
- else {
- if ( $long_output == 1 ) {
- print OUT sprintf "%-24.24s%-74.74s%5s%5s%5s%10.10s", $ortho_name,$descp,$orthos,$subtree_neighbors,$s_orthos,$dist;
- }
- else {
- print OUT sprintf "%-24.24s%-34.34s%5s%5s%5s%10.10s", $ortho_name,$descp,$orthos,$subtree_neighbors,$s_orthos,$dist;
- }
- }
- if ( $complete_description == 1 ) {
- for ( $i = 1; $i < @cut; ++$i ) {
- print OUT "\n";
- if ( $long_output == 1 ) {
- print OUT sprintf " %-74.74s", $cut[ $i ];
- }
- else {
- print OUT sprintf " %-34.34s", $cut[ $i ];
- }
- }
- }
- print OUT "\n";
- }
- else {
- if ( $ortho_name eq "-" ) {
- print "<H4 class = \"warnings\">NO ORTHOLOGS in alignment with the current thresholds for output</H4>\n";
- }
- else {
- $ortho_name = &replaceNameWithLinkToExpasy( $ortho_name );
- print "<TR VALIGN=\"TOP\"> <TD NOWRAP> $ortho_name </TD><TD> $descp </TD><TD NOWRAP ALIGN=\"RIGHT\"> $orthos </TD><TD NOWRAP ALIGN=\"RIGHT\"> $subtree_neighbors </TD><TD NOWRAP ALIGN=\"RIGHT\"> $s_orthos </TD><TD NOWRAP>   $dist </TD> </TR>\n";
- }
- }
-
-} ## printOrthologies
-
-
-
-sub replaceNameWithLinkToExpasy {
- my $name = $_[ 0 ];
-
- if ( $name =~ /(.+)_(.+)\/(.+)/ ) {
- my $desc = $1;
- my $spec = $2;
- my $numbers = $3;
- if ( length( $desc ) <= 4 ) {
- $name = "<A HREF=\"".$EXPASY_SPROT_SEARCH_DE.$desc."_".$spec."\" TARGET=\"_blank\">".$desc."_".$spec."</A>\/".$numbers;
- }
- else {
- $name = "<A HREF=\"".$EXPASY_SPROT_SEARCH_AC.$desc."\" TARGET=\"_blank\">".$desc."</A>_".$spec."\/".$numbers;
- }
- }
-
- return $name;
-
-} ## replaceNameWithLinkToExpasy
-
-
-
-
-# Last modified: 10/15/01
-sub printUltraParlogs {
- my @cut = ();
- my $i = 0;
- my $descp = "";
- $s_paras = &roundToInt( $s_paras );
-
- if ( ( $description == 1 || $complete_description == 1 )
- && $s_para_name ne "-" ) {
-
- if ( $non_sp != 1 ) {
- if ( &startsWithSWISS_PROTname( $s_para_name ) ) {
- $descp = &getDescriptionFromSWISSPROT_ACDEOSfile( $SWISSPROT_ACDEOS_FILE, $s_para_name );
- }
- else {
- $descp = "-";
- }
- }
- else {
- if ( &startsWithSWISS_PROTname( $s_para_name ) ) {
- $descp = &getDescriptionFromSWISSPROT_ACDEOSfile( $SWISSPROT_ACDEOS_FILE, $s_para_name );
- }
- else {
- $descp = &getDescriptionFromTrEMBL_ACDEOSfile( $TREMBL_ACDEOS_FILE, $s_para_name );
- }
- }
-
- if ( $output_HTML != 1 ) {
- if ( $long_output == 1 ) {
- @cut = &cutDescription( $descp, 73 );
- }
- else {
- @cut = &cutDescription( $descp, 33 );
- }
- $descp = $cut[ 0 ];
- }
- }
- if ( $descp eq "" ) {
- $descp = "-";
- }
-
- if ( $output_HTML != 1 ) {
-
- if ( $dist ne "-" ) {
- if ( $long_output == 1 ) {
- print OUT sprintf "%-24.24s%-74.74s%5s%10.6f", $s_para_name,$descp,$s_paras,$dist;
- }
- else {
- print OUT sprintf "%-24.24s%-34.34s%5s%10.6f", $s_para_name,$descp,$s_paras,$dist;
- }
- }
- else {
- if ( $long_output == 1 ) {
- print OUT sprintf "%-24.24s%-74.74s%5s%10.10s", $s_para_name,$descp,$s_paras,$dist;
- }
- else {
- print OUT sprintf "%-24.24s%-34.34s%5s%10.10s", $s_para_name,$descp,$s_paras,$dist;
- }
- }
- if ( $complete_description == 1 ) {
- for ( $i = 1; $i < @cut; ++$i ) {
- print OUT "\n";
- if ( $long_output == 1 ) {
- print OUT sprintf " %-74.74s", $cut[ $i ];
- }
- else {
- print OUT sprintf " %-34.34s", $cut[ $i ];
- }
- }
- }
- print OUT "\n";
-
- }
- else {
- $s_para_name = &replaceNameWithLinkToExpasy( $s_para_name );
- print "<TR VALIGN=\"TOP\"> <TD NOWRAP> $s_para_name </TD><TD> $descp </TD><TD NOWRAP ALIGN=\"RIGHT\"> $s_paras </TD><TD NOWRAP>   $dist </TD> </TR>\n";
- }
-
-} ## printUltraParlogs
-
-
-
-sub printNoUltraParalogs {
- if ( $output_HTML != 1 ) {
- print OUT "\nUltra-paralogs\n";
- print OUT "--------------\n";
- print OUT "\nNO ULTRA-PARALOGS in alignment with the current threshold of $t_ultra_paralogs%\n";
- }
- else {
- print "<H4 class = \"title\">Ultra-paralogs</H4>\n";
- print "<H4 class = \"warnings\">NO ULTRA-PARALOGS in alignment with the current threshold of $t_ultra_paralogs%</H4>\n";
- }
-} ## printNoUltraParalogs
-
-
-
-# Called by method "printOrthologies".
-# Last modified: 02/27/01
-sub cutDescription {
- my $line = $_[ 0 ];
- my $size = $_[ 1 ];
- my @cut = ();
- my $i = 0;
-
- while ( ( length( $line ) ) > $size ) {
- $cut[ $i++ ] = substr( $line, 0, $size );
- $line = substr( $line, $size );
- }
- $cut[ $i++ ] = $line;
- return @cut;
-} ## cutDescription
-
-
-
-
-# Last modified: 02/27/01
-sub printTitleForDistanceValues {
- if ( $output_HTML != 1 ) {
- if ( $mode == 1 || $mode == 2 ) {
- print OUT "\n\nDistance values (based on NJ tree of original alignment)\n";
- print OUT "--------------------------------------------------------\n";
- }
- elsif ( $mode == 3 || $mode == 4 ) {
- print OUT "\n\nDistance values (based on ML branch length values on consensus tree)\n";
- print OUT "--------------------------------------------------------------------\n";
- }
- }
- else {
- print "<H4 class = \"title\">Distance values (based on NJ tree of original alignment)</H4>\n";
- }
-
-} ## printTitleForDistanceValues
-
-
-
-
-# Last modified: 02/27/01
-sub printDistanceValues {
- if ( $output_HTML != 1 ) {
- print OUT "$return_line";
- }
- else {
- chomp( $return_line );
- if ( $return_line =~ /WARNING/ ) {
- $return_line =~ s/\+\/-/ ± /;
- $return_line =~ s/\*/ × /;
- print "<H4 class = \"warnings\">$return_line</H4>\n";
- }
- elsif ( $return_line =~ /lca\s+is/i ) {
- print "<P class = \"nomargins\">$return_line</P>\n";
- }
- elsif ( $return_line =~ /orthologous/i ) {
- print "<P class = \"nomargins\">$return_line</P>\n";
- }
- elsif ( $return_line =~ /distance\s+of\s+query/i ) {
- print "<TABLE BORDER=\"0\" CELLPADDING=\"1\">\n";
- }
- if ( $return_line =~ /(.+)=(.+)/ ) {
- print "<TR VALIGN=\"TOP\"><TD>$1</TD><TD> = $2</TD></TR>\n";
- }
- if ( $return_line =~ /sum\s+/i || $return_line =~ /distance\s+of\s+ortholog\s+to\s+LCA/i ) {
- print "</TABLE>\n";
- }
- }
-} ## printDistanceValues
-
-
-
-
-# Last modified: 02/27/01
-sub printMeanBootstraps {
- if ( $output_HTML != 1 ) {
- print OUT "\n\n$return_line";
- }
- else {
- chomp( $return_line );
- $return_line =~ s/\+\/-/ ± /;
- print "</TABLE>\n";
- print "<P>$return_line</P>\n";
- }
-} ## printMeanBootstraps
-
-
-
-
-# Last modified: 02/12/02
-sub printOptions {
-
- if ( $output_HTML != 1 ) {
- print OUT "\n\n\n==============================================================================\n";
- if ( $number_of_seqs_in_aln >= $MIN_NUMBER_OF_SEQS_IN_ALN ) {
- print OUT "RIO options\n";
- print OUT "-----------\n";
- print OUT "Mode : ";
- if ( $mode == 1 ) {
- print OUT "precalc. pwd files with alignment not containing query (1)\n";
- }
- elsif ( $mode == 2 ) {
- print OUT "precalc. pwd files with alignment containing query (2)\n";
- }
- elsif ( $mode == 3 ) {
- print OUT "alignment not containing query (3)\n";
- }
- elsif ( $mode == 4 ) {
- print OUT "alignment containing query (4)\n";
- }
- print OUT "Bootstraps : $bootstraps\n";
- print OUT "Species tree : $species_tree_file\n";
- if ( $safe_nhx == 1 ) {
- if ( $mode == 3 || $mode == 4 ) {
- if ( @complete_names > 1 ) {
- $outfile_annot_nhx_tree =~ s/-\d+\.nhx/-X.nhx/;
- print OUT "Saved annotated consensus trees (ML branch lengths) : $outfile_annot_nhx_tree\n";
- }
- else {
- print OUT "Saved annotated consensus tree (ML branch lengths) : $outfile_annot_nhx_tree\n";
- }
- }
- elsif ( $mode == 1 || $mode == 2 ) {
- if ( @complete_names > 1 ) {
- $outfile_annot_nhx_tree =~ s/-\d+\.nhx/-X.nhx/;
- print OUT "Saved annotated NJ trees (based on original alignment) : $outfile_annot_nhx_tree\n";
- }
- else {
- print OUT "Saved annotated NJ tree (based on original alignment) : $outfile_annot_nhx_tree\n";
- }
- }
- }
- print OUT "Threshold for output for orthologies (L=) : $t_orthologs\n";
- print OUT "Threshold for output for subtree-neighborings (B=) : $t_sn\n";
- print OUT "Threshold for distance calc for orthologies (U=) : $t_orthologs_dc\n";
-
- print OUT "When to generate warnings:\n";
- print OUT "More than one ortholog: diff. in standard deviations (X=): $warn_more_than_one_ortho\n";
- print OUT "No orthologs : diff. in standard deviations (Y=): $warn_no_orthos\n";
- print OUT "One ortholog : factor (Z=): $warn_one_ortho\n";
- if ( $output_ultraparalogs == 1 ) {
- print OUT "Output ultra-paralogs (p)\n";
- print OUT "Threshold for ultra-paralogies (v=) : $t_ultra_paralogs\n";
- }
- print OUT "Sort priority: $sort_priority\n";
- }
-
- print OUT "\nOptions for the calculation of the phylgenetic trees\n";
- print OUT "----------------------------------------------------\n";
- if ( $mode == 1 ) {
- print OUT "Model for pairwise distance calculations : $matrix\n";
- }
- elsif ( $mode == 3 || $mode == 4 ) {
- print OUT "Model for pairwise dist and ML branch length calc. : $matrix\n";
- }
- if ( $mode == 1 || $mode == 3 || $mode == 4 ) {
- print OUT "Columns in alignment used for tree calc : $length_of_alignment\n";
- print OUT "Columns in original alignment : $length_of_orig_alignment\n";
- }
- print OUT "Sequences in alignment used for trees (incl query) : $number_of_seqs_in_aln\n";
-
- if ( $mode == 3 || $mode == 4 ) {
- print OUT "Removed non-SWISS-PROT sequences : ";
- if ( $non_sp == 1 ) {
- print OUT "no\n";
- }
- else {
- print OUT "yes\n";
- }
- if ( $non_sp == 1 ) {
- print OUT "Removed \"TrEMBL fragments\" : ";
- if ( $no_frags == 1 ) {
- print OUT "yes\n";
- }
- else {
- print OUT "no\n";
- }
- }
- }
- if ( $mode == 1 || $mode == 2 ) {
- print OUT "Prgrm to calc. branch lengths for distance values : PHYLIP NEIGHBOR (NJ)\n";
- }
- elsif ( $mode == 3 || $mode == 4 ) {
- print OUT "Prgrm to calc branch lengths for distance values : TREE-PUZZLE\n";
- }
- if ( $seed_aln_for_hmmbuild ne "" ) {
- print OUT "HMM was built with hmmbuild using options : $command_line_for_hmmbuild\n";
- }
- if ( ( $mode == 3 || $mode == 4 ) && $species_names_file =~ /\S/ ) {
- print OUT "File listing species used for tree calculation (G=): $species_names_file\n";
- }
- print OUT "Seed for random number generator : $seed_for_makeTree\n";
- print OUT "Options for makeTree : $options_for_makeTree\n";
-
- $time_total = time - $time_total;
-
- print OUT "\nTime and date\n";
- print OUT "-------------\n";
- if ( $mode == 1 ) {
- print OUT "Time requirement dqo puzzle : $time_dqopuzzleT s\n";
- }
-
- print OUT "Time requirement for tree calculation: $time_tree_calcT s\n";
- print OUT "Time requirement for SDI and RIO : $time_rioT s\n";
- print OUT "Total time requirement : $time_total s\n";
- print OUT "Date started : $start_date";
- print OUT ( "Date finished : ".`date` );
-
- print OUT "\nCommand line\n";
- print OUT "------------\n";
- print OUT "$command_line\n";
- if ( $parallel == 1 ) {
- print OUT "\nProcessors used: @nodelist\n";
- }
- }
- else {
- if ( $printed_ultra_paralogs == 1 ) {
- print "</TABLE>\n";
- }
- if ( $species_tree_file =~ /.+\/(.+)/ ) {
- $species_tree_file = $1;
- }
- print "<H4 class = \"title\">Options</H4>\n";
- print "<TABLE BORDER=\"0\" CELLPADDING=\"1\">\n";
- print "<TR VALIGN=\"TOP\"><TD> Bootstraps: </TD><TD> $bootstraps </TD></TR>\n";
- print "<TR VALIGN=\"TOP\"><TD> Species tree: </TD><TD> $species_tree_file </TD></TR>\n";
- print "<TR VALIGN=\"TOP\"><TD> Threshold for output for orthologies: </TD><TD> $t_orthologs </TD></TR>\n";
- print "<TR VALIGN=\"TOP\"><TD> Threshold for output for subtree-neighborings: </TD><TD> $t_sn </TD></TR>\n";
- print "<TR VALIGN=\"TOP\"><TD> Threshold for distance calc for orthologies: </TD><TD> $t_orthologs_dc </TD></TR>\n";
- print "<TR VALIGN=\"TOP\"><TD> When to generate warnings </TD><TD> </TD></TR>\n";
- print "<TR VALIGN=\"TOP\"><TD> More than one ortholog [diff in standard deviations]: </TD><TD> $warn_more_than_one_ortho </TD></TR>\n";
- print "<TR VALIGN=\"TOP\"><TD> No orthologs [diff in standard deviations]: </TD><TD> $warn_no_orthos </TD></TR>\n";
- print "<TR VALIGN=\"TOP\"><TD> One ortholog [factor]: </TD><TD> $warn_one_ortho </TD></TR>\n";
- if ( $output_ultraparalogs == 1 ) {
- print "<TR VALIGN=\"TOP\"><TD> Output ultra-paralogs </TD><TD> </TD></TR>\n";
- print "<TR VALIGN=\"TOP\"><TD> Threshold for ultra-paralogies: </TD><TD> $t_ultra_paralogs </TD></TR>\n";
- }
- print "<TR VALIGN=\"TOP\"><TD> Sort priority: </TD><TD> $sort_priority </TD></TR>\n";
- print "<TR VALIGN=\"TOP\"><TD> Model for pairwise distance calculations:</TD><TD> $matrix </TD></TR>\n";
- print "<TR VALIGN=\"TOP\"><TD> Columns in alignment used for tree calc: </TD><TD> $length_of_alignment </TD></TR>\n";
- print "<TR VALIGN=\"TOP\"><TD> Columns in original alignment: </TD><TD> $length_of_orig_alignment </TD></TR>\n";
- print "<TR VALIGN=\"TOP\"><TD> Sequences in alignment used for trees (incl query): </TD><TD> $number_of_seqs_in_aln </TD></TR>\n";
- print "<TR VALIGN=\"TOP\"><TD> Seed for random number generator: </TD><TD> $seed_for_makeTree </TD></TR>\n";
- print "</TABLE>\n";
-
- $time_total = time - $time_total;
-
- print "<P>   </P>\n";
- print "<TABLE BORDER=\"0\" CELLPADDING=\"1\">\n";
- print "<TR VALIGN=\"TOP\"><TD> Time requirement: </TD><TD> $time_total s</TD></TR>\n";
- print "<TR VALIGN=\"TOP\"><TD> Date started: </TD><TD> $start_date </TD></TR>\n";
- print ( "<TR VALIGN=\"TOP\"><TD> Date finished: </TD><TD> ".`date`." </TD></TR>\n" );
- if ( $parallel == 1 ) {
- print "<TR VALIGN=\"TOP\"><TD> Number of processors used: </TD><TD> ".scalar( @nodelist )." </TD></TR>\n";
- }
- print "</TABLE>\n";
- }
-
-} ## printOptions
-
-
-
-
-
-
-
-
-
-
-# -----------------------------------------------------------
-# Execution of other programs
-# -----------------------------------------------------------
-
-
-
-
-
-# Two arguments:
-# 1. seed
-# 2. outfile
-# Returns the options used.
-# Last modified: 05/11/01
-sub executeHmmbuild {
-
- my $seed = $_[ 0 ];
- my $outfile = $_[ 1 ];
- my $options = "";
-
- &testForTextFilePresence( $seed );
-
- $options = getHmmbuildOptionsFromPfam( $seed );
-
- $options =~ s/-f//;
- $options =~ s/-g//;
- $options =~ s/-s//;
- $options =~ s/-F//;
- $options =~ s/-A//;
- $options =~ s/-o\s+\S+//;
- $options =~ s/(\s|^)[^-]\S+/ /g;
-
- if ( $options =~ /--prior/ ) {
- my $basename = basename( $seed );
- $basename .= ".PRIOR";
- $options =~ s/--prior/--prior $PRIOR_FILE_DIR$basename/;
- }
-
- # Remove for versions of HMMER lower than 2.2.
- if ( $options =~ /--informat\s+\S+/ ) {
- $options =~ s/--informat\s+\S+/--informat SELEX/;
- }
- else {
- $options = "--informat SELEX ".$options;
- }
-
- system( "$HMMBUILD $options $outfile $seed" )
- && &dieWithUnexpectedError( "Could not execute \"$HMMBUILD $options $outfile $seed\"" );
- return $options;
-
-} ## executeHmmbuild.
-
-
-
-
-# One argument:
-# Pfam align name.
-# Last modified: 02/26/01
-sub getHmmbuildOptionsFromPfam {
-
- my $infile = $_[ 0 ];
- my $return_line = "";
- my $result = "";
-
- &testForTextFilePresence( $infile );
-
- open( GHO, $infile ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
- while ( $return_line = <GHO> ) {
- if ( $return_line =~ /^\s*#.*hmmbuild\s+(.+)\s*$/ ) {
- $result = $1;
- close( GHO );
- return $result;
- }
- }
- close( GHO );
- return $result;
-
-} ## getHmmbuildOptionsFromPfam
-
-
-
-
-# Purpose. Aligns a FASTA file to a Pfam alignment using an HMM profile.
-# Five arguemnts:
-# 1. Pfam flat file name
-# 2. Name of FASTA file to append
-# 3. HMM profile file name
-# 4. outputfile name
-# 5. 1 use --mapali, --withali otherwise (in hmmalign)
-# Returns 1 if successful, -1 if no alignment was made because
-# E value of HMMSEARCH output was larger than $E_VALUE_THRESHOLD.
-# Last modified: 07/11/01
-sub alignWithHmmalign {
- my $alignment = $_[ 0 ];
- my $query = $_[ 1 ];
- my $hmm = $_[ 2 ];
- my $outfile = $_[ 3 ];
- my $use_mapali = $_[ 4 ];
- my $E = 2000;
- my $ali = "--withali";
-
- if ( $use_mapali == 1 ) {
- $ali = "--mapali";
- }
-
- &testForTextFilePresence( $alignment );
- &testForTextFilePresence( $query );
- &testForTextFilePresence( $hmm );
-
- system( "$HMMSEARCH $hmm $query > $temp_dir/HMMSEARCHOUT" )
- && &dieWithUnexpectedError( "Could not execute \"$HMMSEARCH $hmm $query > $temp_dir/HMMSEARCHOUT\"" );
-
-
-
- $E = &getEvalue( "$temp_dir/HMMSEARCHOUT" );
- if ( $E == 2000 ) {
- &dieWithUnexpectedError( "No E-value found in \"$temp_dir/HMMSEARCHOUT\"" );
- }
- elsif ( $E > $E_VALUE_THRESHOLD ) {
- unlink( "$temp_dir/HMMSEARCHOUT" );
- return ( -1 );
- }
-
- system( "$P7EXTRACT -d $temp_dir/HMMSEARCHOUT > $temp_dir/GDF" )
- && &dieWithUnexpectedError( "Could not execute \"$P7EXTRACT -d $temp_dir/HMMSEARCHOUT > $temp_dir/GDF\"" );
-
-
- system( "$MULTIFETCH -d -g $query $temp_dir/GDF > $temp_dir/MULTIFETCHOUT" )
- && &dieWithUnexpectedError( "Could not execute \"$MULTIFETCH -d -g $query $temp_dir/GDF > $temp_dir/MULTIFETCHOUT\"" );
-
- # Checks if score was too low to have made a reasonable alignment.
- unless ( -s "$temp_dir/MULTIFETCHOUT" ) {
- unlink( "$temp_dir/HMMSEARCHOUT", "$temp_dir/GDF", "$temp_dir/MULTIFETCHOUT" );
- return ( -1 );
- }
-
- system( "$HMMALIGN -o $outfile $ali $alignment $hmm $temp_dir/MULTIFETCHOUT >/dev/null 2>&1" )
- && &dieWithUnexpectedError( "Could not execute \"$HMMALIGN -o $outfile $ali $alignment $hmm $temp_dir/MULTIFETCHOUT\"" );
-
- if ( unlink( "$temp_dir/HMMSEARCHOUT", "$temp_dir/GDF","$temp_dir/MULTIFETCHOUT" ) != 3 ) {
- &dieWithUnexpectedError( "Could not delete (a) file(s)" );
- }
-
- return 1;
-} ## alignWithHmmalign
-
-
-
-
-# Gets the E value for complete sequences (score includes all domains)
-# from a HMMSEARCH output file.
-# One argument: the HMMSEARCH output file name
-# Returns the E value, 2000 if no E value found
-# Last modified: 07/11/01
-sub getEvalue {
-
- my $infile = $_[ 0 ];
- my $return_line = "";
- my $flag = 0;
- my $E = 2000;
-
- &testForTextFilePresence( $infile );
-
- open( E, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
- while ( $return_line = <E> ) {
-
- # "Sequence Description Score E-value N"
- if ( $return_line =~ /Sequence.+Description.+Score.+E.+value.+N/ ) {
- $flag = 1;
- }
- # "QUERY_HUMAN 657.4 1.3e-198 1"
- elsif ( $flag == 1 && $return_line =~ /\s+(\S+)\s+\d+\s*$/ ) {
- $E = $1;
- close( E );
- return $E;
- }
-
- }
- close( E );
- return $E;
-
-} ## getEvalue
-
-
-
-# Four/Five arguments:
-# 1. Number of bootstraps
-# 2. bsp (bootstrap positions) file
-# 3. Infile (alignment)
-# 4. Outfile (bootstrapped according to bsp file)
-# 5. Number of processors
-# Last modified: 01/30/02
-sub executeBootstrap_cz {
- my $boots = $_[ 0 ];
- my $bsp_file = $_[ 1 ];
- my $infile = $_[ 2 ];
- my $outfile = $_[ 3 ];
- my $processors = $_[ 4 ];
-
- if ( defined( $processors ) && ( $processors > 1 ) ) {
- system( "$BOOTSTRAP_CZ $boots $infile $bsp_file $outfile $processors" )
- && &dieWithUnexpectedError( "Could not execute \"$BOOTSTRAP_CZ $boots $infile $bsp_file $outfile $processors\"" );
-
- }
- else {
- system( "$BOOTSTRAP_CZ $boots $infile $bsp_file $outfile" )
- && &dieWithUnexpectedError( "Could not execute \"$BOOTSTRAP_CZ $boots $infile $bsp_file $outfile\"" );
- }
-
-} ## executeBootstrap_cz
-
-
-
-
-
-# One argument:
-# options for DoRIO.main.
-# Last modified: 02/26/01
-sub executeDoRIO {
-
- my $options = $_[ 0 ];
-
- system( "$DORIO $options >/dev/null 2>&1" )
- && &dieWithUnexpectedError( "Could not execute \"$DORIO $options\"" );
-
- return;
-
-} ## executeDoRIO
-
-
-
-
-
-
-
-
-
-
-
-# -----------------------------------------------------------
-# These deal with the alignment
-# -----------------------------------------------------------
-
-
-
-
-# Counts sequences from a Pfam flat file or
-# in a PHYLIP interleaved aligment.
-# One arguments: Pfam flat file name.
-# Returns the number of sequences.
-# Last modified: 07/10/01
-sub countSeqsInPfamAlign {
- my $infile = $_[ 0 ];
- my $return_line = "";
- my $saw_sequence_line = 0;
- my $number_of_seqs = 0;
-
- &testForTextFilePresence( $infile );
-
- open( C, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
- while ( $return_line = <C> ) {
-
- if ( $saw_sequence_line == 1
- && !&containsPfamNamedSequence( $return_line )
- && !&isPfamCommentLine( $return_line ) ) {
- last;
- }
- if ( &isPfamSequenceLine( $return_line )
- && $return_line !~ /^\s*\d+\s+\d+/ ) {
- if ( $saw_sequence_line == 0 ) {
- $saw_sequence_line = 1;
- }
- $number_of_seqs++;
- }
- }
- close( C );
- return $number_of_seqs;
-
-} ## countSeqsInPfamAlign
-
-
-
-
-# This gets the complete name(s) of a sequence from a Pfam alignment.
-# I.e. it adds "/xxx-xxx".
-# 2 arguments:
-# 1. Infile (alignment)
-# 2. Name of query
-# Returns a String-array of all the complete names found.
-# Last modified: 03/04/01
-sub getCompleteName {
-
- my $infile = $_[ 0 ];
- my $query_name = $_[ 1 ];
- my $return_line = "";
- my @complete_names = ();
- my $complete_name = "";
- my $i = 0;
-
- &testForTextFilePresence( $infile );
-
- $query_name =~ s/\/.*//;
-
- open( INGCN, $infile ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
- while ( $return_line = <INGCN> ) {
- if ( $return_line =~ /^\s*$query_name(\S+)\s+.+/ ) {
- $complete_name = $query_name.$1;
- if ( $i > 0 && $complete_names[ 0 ] eq $complete_name ) {
- # Now, we saw of all of them.
- last;
- }
- $complete_names[ $i++ ] = $complete_name;
- }
- }
-
- close( INGCN );
- return @complete_names;
-} ## getCompleteName
-
-
-
-
-# Removes sequences from a Pfam flat file.
-# It can remove all sequences not from species listed in a species names file.
-# It can remove all sequences which do not have a SWISS-PROT name (XXXX_XXXXX)
-# It can remove all sequences which are "TrEMBL" fragments.
-# Six arguments:
-# 1. Pfam flat file name
-# 2. outfile name
-# 3. Name of the query - not to be removed
-# (use " " to not use this functionality)
-# 4. species names file (will be ignored if " ")
-# 5. 1 to NOT remove non-SWISS_PROT seqs.
-# 6. 1 to remove TrEMBL seqs with "(FRAGMENT)" in their DE line.
-# (Only used if non SWISS_PROT seqswill not be removed)
-# Returns the number of sequences in the resulting alignment.
-# If a query name is given, it returns -1 if query is not found in alignment,
-# -10 if the name is not unique.
-# Last modified: 05/11/01
-sub removeSeqsFromPfamAlign {
- my $infile = $_[ 0 ];
- my $outfile = $_[ 1 ];
- my $query = $_[ 2 ];
- my $species_names_file = $_[ 3 ];
- my $keep_non_sp = $_[ 4 ];
- my $remove_frags = $_[ 5 ];
- my $return_line = "";
- my $name = "";
- my $seq = "";
- my $saw_sequence_line = 0;
- my $number_of_seqs = 0;
- my $saw_query = 0;
- my $query_given = 0;
- my $species_names_file_given = 0;
- my $length_of_name = 0;
- my %AC_OS = (); # AC -> species name (TrEMBL)
- my %AC_DE = (); # AC -> description (TrEMBL)
- my $AC = "";
- my $DE = "";
- my $OS = "";
-
- &testForTextFilePresence( $infile );
-
- if ( $query =~ /\S/ ) {
- $query_given = 1;
- }
- if ( $species_names_file =~ /\S/ ) {
- $species_names_file_given = 1;
- &readSpeciesNamesFile( $species_names_file );
- }
-
- if ( $keep_non_sp == 1
- || ( $query_given == 1 && !&startsWithSWISS_PROTname( $query ) ) ) {
-
- &testForTextFilePresence( $TREMBL_ACDEOS_FILE );
-
- # Fill up hash $AC_OS and $AC_DE.
- open( HH, "$TREMBL_ACDEOS_FILE" ) || &dieWithUnexpectedError( "Cannot open file \"$TREMBL_ACDEOS_FILE\"" );
- while ( $return_line = <HH> ) {
- if ( $return_line =~ /(\S+);([^;]*);(\S+)/ ) {
- $AC_OS{ $1 } = $3;
- if ( $remove_frags == 1 ) {
- $AC_DE{ $1 } = $2;
- }
- }
- }
- close( HH );
- }
-
- open( OUT_RNSP, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" );
- open( IN_RNSP, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
- while ( $return_line = <IN_RNSP> ) {
-
- if ( $saw_sequence_line == 1
- && !&containsPfamNamedSequence( $return_line )
- && !&isPfamCommentLine( $return_line ) ) {
- # This is just for counting purposes.
- $saw_sequence_line = 2;
- }
- if ( &isPfamSequenceLine( $return_line ) ) {
- if ( $saw_sequence_line == 0 ) {
- $saw_sequence_line = 1;
- }
- $return_line =~ /(\S+)\s+(\S+)/;
- $name = $1;
- $seq = $2;
- if ( $query_given == 1 && $name eq $query ) {
- $saw_query++;
- }
- if ( ( $query_given == 1 && $name ne $query )
- || $query_given != 1 ) {
- if ( !&startsWithSWISS_PROTname( $name ) ) {
- if ( $keep_non_sp != 1 ) {
- next;
- }
- else {
- $name =~ /(\S+)\//;
- $AC = $1;
- unless( exists( $AC_OS{ $AC } ) ) {
- #ACs not present in "ACDEOS" file.
- next;
- }
- $OS = $AC_OS{ $AC };
- if ( !$OS || $OS eq "" ) {
- &dieWithUnexpectedError( "species for \"$AC\" not found" );
- }
- if ( $species_names_file_given == 1 ) {
- unless( exists( $Species_names_hash{ $OS } ) ) {
- next;
- }
- }
- if ( $remove_frags == 1 ) {
- $DE = $AC_DE{ $AC };
- if ( $DE && $DE =~ /\(FRAGMENT\)/ ) {
- next;
- }
- }
- $name =~ s/\//_$OS\//;
- }
- }
- else {
- if ( $species_names_file_given == 1 ) {
- if ( $name =~ /_([A-Z0-9]{1,5})/ ) {
- unless( exists( $Species_names_hash{ $1 } ) ) {
- next;
- }
- }
- # remove everything whose species cannot be determined.
- else {
- next;
- }
- }
- }
- }
- elsif ( $query_given == 1 && $name eq $query
- && !&startsWithSWISS_PROTname( $query ) ) {
- # Adding species to non SWISS-PROT query
- $name =~ /(\S+)\//;
- $AC = $1;
- unless( exists( $AC_OS{ $AC } ) ) {
- #ACs not present in "ACDEOS" file.
- &userError( "Could not establish species of query.\n Check file \"$TREMBL_ACDEOS_FILE\"." );
- }
- $OS = $AC_OS{ $AC };
- if ( !$OS || $OS eq "" ) {
- &dieWithUnexpectedError( "species for \"$AC\" not found" );
- }
- $name =~ s/\//_$OS\//;
- }
-
- $length_of_name = length( $name );
-
- if ( $length_of_name > ( $LENGTH_OF_NAME - 1 ) ) {
- &userError( "Name \"$name\" is too long." );
- }
-
- for ( my $j = 0; $j <= ( $LENGTH_OF_NAME - $length_of_name - 1 ); ++$j ) {
- $name .= " ";
- }
-
- $return_line = $name.$seq."\n";
- }
-
- print OUT_RNSP $return_line;
- if ( $saw_sequence_line == 1 ) {
- $number_of_seqs++;
- }
- }
- close( IN_RNSP );
- close( OUT_RNSP );
- if ( $query_given == 1 ) {
- if ( $saw_query < 1 ) {
- return -1;
- }
- elsif ( $saw_query > 1 ) {
- return -10;
- }
- }
- return $number_of_seqs;
-
-} ## removeSeqsFromPfamAlign
-
-
-
-
-# One argument:
-# 1. PWD file
-# "Returns" a Hash of Strings (=keys) containing all the names found in PWD file
-# Last modified: 05/29/01
-sub getNamesFromPWDFile {
- my $infile = $_[ 0 ];
- my $return_line = "";
- my $i = 0;
- my $saw_dist_line = 0;
-
- &testForTextFilePresence( $infile );
-
- open( GN_IN, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
-
- while ( $return_line = <GN_IN> ) {
- if ( $saw_dist_line == 1 && $return_line =~ /^\s*(\d+)\s*$/ ) {
- if ( $1 != $i ) {
- &dieWithUnexpectedError( "Failed sanity check" );
- }
- last;
- }
- elsif ( $return_line =~ /^\s*(\S+)\s+\S+/ ) {
- $names_in_pwd_file{ $1 } = 0;
- $i++;
- $saw_dist_line = 1;
- }
- }
- close( GN_IN );
- return;
-} ## getNamesFromPWDFile
-
-
-
-
-# Moves sequences which start with query name (argument 1)
-# to the last positions in pfam alignment sepecified by argument 2.
-# Removes seqs present in argument 4, unless for query name.
-# Four arguments:
-# 1. Query name
-# 2. Infile (alignment)
-# 3. Outfile (=infile with query seq moved to the bottom)
-# 4. Array of seq names to remove, unless for query name
-# Last modified: 06/25/01
-sub moveToLast {
- my $query = $_[ 0 ];
- my $infile = $_[ 1 ];
- my $outfile = $_[ 2 ];
- my @to_remove = @{ $_[ 3 ] }; # @{} tells Perl that this is a list.
- my $return_line = "";
- my $query_line = "";
- my $n = "";
-
- &testForTextFilePresence( $infile );
-
- open( MTL_IN, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
- open( MTL_OUT, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" );
-
- W: while ( $return_line = <MTL_IN> ) {
- if ( &isPfamCommentLine( $return_line )
- && ( !isRFline( $return_line ) || $mode != 1 ) ) {
- next W;
- }
- if ( @to_remove > 1 ) {
- foreach $n ( @to_remove ) {
- if ( $n ne $query && $return_line =~ /^\s*$n\s+/ ) {
- next W;
- }
- }
- }
- if ( $return_line =~ /^\s*$query\s+/ ) {
- $query_line = $return_line;
- }
- elsif ( $query_line ne ""
- && ( $return_line !~ /\S+/ || isRFline( $return_line ) ) ) {
- print MTL_OUT $query_line;
- print MTL_OUT $return_line;
- $query_line = "";
- }
- else {
- print MTL_OUT $return_line;
- }
- }
- if ( $query_line ne "" ) {
- print MTL_OUT $query_line;
- }
-
- close( MTL_IN );
- close( MTL_OUT );
-
- return;
-
-} ## moveToLast
-
-
-
-
-
-
-
-
-
-
-# -----------------------------------------------------------
-# Others
-# -----------------------------------------------------------
-
-
-
-
-# This gets the complete name of a TrEMBL sequence from a Pfam alignment.
-# I.e. it adds the species between "_" and "/XXX-XXX".
-# 2 arguments:
-# 1. Infile (alignment)
-# 2. Name of query
-# Returns the complete name found.
-# Last modified: 04/25/01
-sub getCompleteNameForTrEMBLquerySeq {
-
- my $infile = $_[ 0 ];
- my $query_name = $_[ 1 ];
- my $return_line = "";
- my $complete_name = "";
- my $before_slash = "";
- my $after_slash = "";
-
- &testForTextFilePresence( $infile );
-
- $query_name =~ /(.+)\/.+/;
- $before_slash = $1;
-
- $query_name =~ /.+\/(.+)/;
- $after_slash = $1;
-
- open( INGCN, $infile ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
- while ( $return_line = <INGCN> ) {
- if ( $return_line =~ /^\s*($before_slash.+\/$after_slash)/ ) {
- $complete_name = $1;
- last;
- }
- }
- close( INGCN );
- if ( $complete_name eq "" ) {
- &userError( "Could not find \"$query_name\" in \"$alignment\"." );
- }
- return $complete_name;
-} ## getCompleteNameForTrEMBLquerySeq
-
-
-
-
-# One argument:
-# Pfam align name.
-# Last modified: 02/26/01
-sub getDescriptionFromPfam {
-
- my $infile = $_[ 0 ];
- my $return_line = "";
- my $result = "";
-
- &testForTextFilePresence( $infile );
-
- open( INGDPF, $infile ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
- while ( $return_line = <INGDPF> ) {
- if ( $return_line =~ /^\s*#=DE\s+(.+)/ ) {
- $result = $1;
- close( INGDPF );
- return $result;
- }
- }
- close( INGDPF );
- return $result;
-
-} ## getDescriptionFromPfam
-
-
-
-# Reads in (SWISS-PROT) species names from a file.
-# Names must be separated by newlines.
-# Lines beginning with "#" are ignored.
-# A possible "=" and everything after is ignored.
-# One argument: species-names-file name
-# Last modified: 04/24/01
-sub readSpeciesNamesFile {
- my $infile = $_[ 0 ];
- my $return_line = "";
- my $species = "";
-
- &testForTextFilePresence( $infile );
-
- open( IN_RSNF, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
- while ( $return_line = <IN_RSNF> ) {
- if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)/ ) {
- $species = $1;
- $species =~ s/=.+//;
- $Species_names_hash{ $species } = "";
- }
- }
- close( IN_RSNF );
-
- return;
-} ## readSpeciesNamesFile
-
-
-
-
-# This reads a raw sequence file or a FASTA sequence file
-# and saves it as a "cleaned up" FASTA sequence file.
-# If no > line is in the file, it creates one with new sequence name.
-# If a > line is in the file, it modifes it:
-# white space -> _, ";" ":" "," or "|" -> "~", deletes everything after ( or [;
-# length is limited to 40 characters.
-# Error if $new_seq_name is "" and no > line in the file.
-# Two/three arguments:
-# 1. infile name
-# 2. outfile name
-# 3. new sequence name for > line(will be ignored if "")
-# If new sequence name is "":
-# returns the contents of the ">" line after modification.
-# If new sequence name is specified:
-# return new sequence name.
-# Last modified: 03/04/01
-sub seqFile2CleanedUpFastaFile {
- my $infile = $_[ 0 ];
- my $outfile = $_[ 1 ];
- my $new_seq_name = $_[ 2 ];
- my $return_line = "";
- my $mod_desc = "";
- my $saw_desc_line = 0;
-
- &testForTextFilePresence( $infile );
-
- open( IN_CUFF, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
- open( OUT_CUFF, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" );
-
- while ( $return_line = <IN_CUFF> ) {
- if ( $return_line =~ /\w/ && $return_line !~ /^\s*#/ ) {
- if ( $return_line =~ /^\s*>/ ) {
- if ( $new_seq_name eq "" && $return_line !~ /_/ ) {
- &userError( "Description line of query file appears not to\n contain any species information. Use \"N=\" option." );
- }
- elsif ( $new_seq_name eq "" ) {
- $return_line =~ s/^\s*>\s*(.*?)\s*/>$1/; # Removes spaces before and after >.
- $return_line = substr( $return_line, 0, $LENGTH_OF_NAME - 1 );
- $return_line =~ s/[\(\[].*//; # Removes "(" or "[" and everything after.
- $return_line =~ s/\s+$//; # Removes spaces at end.
- $return_line =~ s/\s+/_/g; # Replaces all white spaces with "_".
- $return_line =~ s/[;:,\|]/~/g; # Replaces all ";", ":", ",", or "|" with "~".
- $return_line =~ />\s*(\S+)/;
- $mod_desc = $1;
- $return_line .= "\n";
- }
- else {
- $return_line = ">".$new_seq_name."\n";
- $mod_desc = $new_seq_name;
- }
- $saw_desc_line = 1;
- }
- else {
- if ( $saw_desc_line != 1 ) {
- if ( $new_seq_name ne "" ) {
- print OUT_CUFF ( ">".$new_seq_name."\n" );
- $mod_desc = $new_seq_name;
- }
- else {
- &userError( "Query file is not a FASTA file\n and option \"N=\" has not been used." );
- }
- $saw_desc_line = 1;
- }
- $return_line =~ s/[^a-zA-Z\r\n\f]//g; # Removes non-letters from sequence.
- }
-
- if ( $return_line =~ /\w/ ) {
- print OUT_CUFF $return_line;
- }
- }
- }
- close( IN_CUFF );
- close( OUT_CUFF );
-
- return $mod_desc;
-} ## seqFile2CleanedUpFastaFile
-
-
-
-
-# Purpose. Gets description for TrEMBL seqs,
-# from a file which contains the AC, DE, and OS
-# and which has to be generated from a TrEMBL flat file db
-# using "extractTrembl.pl".
-# The same file is used in method "addSpeciesToNonSPseqs".
-# Two arguments:
-# 1. "ACDEOS" file (AC, DE, OS from TrEMBL db)
-# 2. AC ("_species/..." is removed)
-# Format: AC;DE;OS\n
-# Last modified: 02/14/02
-sub getDescriptionFromTrEMBL_ACDEOSfile {
- my $ACDEOS = $_[ 0 ];
- my $AC = $_[ 1 ];
- my $DE = "";
-
- # Fill up (huge) hash, if not already done.
- unless ( %AC_DE ) {
- &testForTextFilePresence( $ACDEOS );
- open( ACDEOS, "$ACDEOS" ) || &dieWithUnexpectedError( "Cannot open file \"$ACDEOS\"" );
- while ( $return_line = <ACDEOS> ) {
- if ( $return_line =~ /(\S+);([^;]+);/ ) {
- $AC_DE{ $1 } = $2;
- }
- }
- close( ACDEOS );
- }
-
- $AC =~ s/_.+//;
-
- unless( exists( $AC_DE{ $AC } ) ) {
- #AC not present in "ACDEOS" file.
- return "-";
- }
-
- $DE = $AC_DE{ $AC };
-
- if ( !$DE || $DE eq "" ) {
- $DE = "-";
- }
-
- return $DE;
-
-} ## getDescriptionFromTrEMBL_ACDEOSfile
-
-
-
-# Purpose. Gets description for SP seqs,
-# from a file which contains the AC, DE, and OS
-# and which has to be generated from a sprot.dat flat file db
-# using "extractSWISS-PROT.pl".
-# Two arguments:
-# 1. "ACDEOS" file (AC, DE, OS from SWISS-PROT db)
-# 2. SWISS-PROT AC (XXXX_XXXX)
-# Format: AC;DE;OS\n
-# Last modified: 02/12/02
-sub getDescriptionFromSWISSPROT_ACDEOSfile {
- my $SPACDEOS = $_[ 0 ];
- my $AC = $_[ 1 ];
- my $DE = "";
-
- # Fill up (huge) hash, if not already done.
- unless ( %SP_AC_DE ) {
- &testForTextFilePresence( $SPACDEOS );
- open( ACDEOS, "$SPACDEOS" ) || &dieWithUnexpectedError( "Cannot open file \"$SPACDEOS\"" );
- while ( $return_line = <ACDEOS> ) {
- if ( $return_line =~ /(\S+);([^;]+);/ ) {
- $SP_AC_DE{ $1 } = $2;
- }
- }
- close( ACDEOS );
- }
-
- $AC =~ s/\/.+//;
-
- unless( exists( $SP_AC_DE{ $AC } ) ) {
- #AC not present in "ACDEOS" file.
- return "-";
- }
-
- $DE = $SP_AC_DE{ $AC };
-
- if ( !$DE || $DE eq "" ) {
- $DE = "-";
- }
-
- return $DE;
-
-} ## getDescriptionFromSWISSPROT_ACDEOSfile
-
-
-
-
-
-
-
-
-
-# -----------------------------------------------------------
-# Helpers
-# -----------------------------------------------------------
-
-
-
-# One argument:
-# Numeric value to be rounded to int.
-# Last modified: 10/17/01
-sub roundToInt {
- my $x = $_[ 0 ];
- unless ( $x eq "-" ) {
- $x = int ( $x + 0.5 );
- }
- return $x;
-} ## roundToInt
-
-
-
-# Removes files.
-# Last modified: 03/10/01
-sub cleanUpTempDir {
- unlink( $temp_dir."/MAKETREEOUT".$TREE_FILE_SUFFIX, $temp_dir."/MAKETREEOUT".$LOG_FILE_SUFFIX,
- $temp_dir."/MAKETREEOUT".$ALIGN_FILE_SUFFIX, $temp_dir."/MAKETREEOUT".$MULTIPLE_TREES_FILE_SUFFIX,
- $temp_dir."/MAKETREEOUT".$SUFFIX_PWD_NOT_BOOTS, $temp_dir."/".$DO_RIO_TEMP_OUTFILE,
- $temp_dir."/ALIGN1", $temp_dir."/ALIGN2", $temp_dir."/QUERY_SEQ", $temp_dir."/NBD_NJ_TREE",
- $temp_dir."/ALIGN2_BOOTSTRAPPED", $temp_dir."/ALIGN2_PROCESSED", $temp_dir."/DIST_TO_QUERY",
- $temp_dir."/DISTs_TO_QUERY", $temp_dir."/HMMALIGNOUT", $temp_dir."/NBD_INC_QUERY", $temp_dir."/PWD_INC_QUERY",
- $temp_dir."/HMMFILE", $temp_dir."/MOVETOLASTOUT" );
- rmdir( $temp_dir );
-} ## cleanUpTempDir
-
-
-
-
-
-
-
-
-
-
-
-
-# -----------------------------------------------------------
-# Command line and arguments, Errors
-# -----------------------------------------------------------
-
-
-
-# One argument:
-# the command line.
-# Last modified: 03/08/01
-sub analyzeCommandLine {
-
- my $args = "";
- my $arg = "";
- my $char = "";
-
-
-
- $mode = shift( @_ );
-
- if ( $mode != 1 && $mode != 2 && $mode != 3 && $mode != 4 ) {
- &errorInCommandLine( "Mode can only be: 1, 2, 3, or 4." );
- }
-
-
- foreach $args ( @_ ) {
-
- $args =~ s/\s//g;
-
- $char = substr( $args, 0, 1 );
-
-
- if ( length( $args ) > 1 ) {
- $arg = substr( $args, 2 );
- }
-
- if ( $char =~ /A/ ) {
- if ( $alignment ne "" ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- if ( $mode == 3 || $mode == 4 ) {
- &userErrorCheckForTextFileExistence( $arg );
- }
- $alignment = $arg;
- }
- elsif ( $char =~ /B/ ) {
- if ( $t_sn != $THRESHOLD_SN_DEFAULT ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- $t_sn = $arg;
- }
- elsif ( $char =~ /C/ ) {
- if ( $description == 1 || $complete_description == 1 ) {
- &errorInCommandLine( "Entered same argument twice or conflicting arguments: \"D\" and \"C\"." );
- }
- $complete_description = 1;
- }
- elsif ( $char =~ /D/ ) {
- if ( $description == 1 || $complete_description == 1 ) {
- &errorInCommandLine( "Entered same argument twice or conflicting arguments: \"D\" and \"C\"." );
- }
- $description = 1;
- }
- elsif ( $char =~ /E/ ) {
- if ( $long_output != 0 ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- $long_output = 1;
- }
- elsif ( $char =~ /F/ ) {
- if ( $hmm_file ne "" || $hmm_name ne "" || $seed_aln_for_hmmbuild ne "") {
- &errorInCommandLine( "Entered same argument twice or conflicting arguments: \"F=\", \"H=\" and \"b=\"." );
- }
- if ( $mode == 1 || $mode == 2 ) {
- &errorInCommandLine( "Can not use \"F=\" in modes 1 or 2." );
- }
- &userErrorCheckForTextFileExistence( $arg );
- $hmm_file = $arg;
- }
- elsif ( $char =~ /G/ ) {
- if ( $species_names_file ne " " ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- &userErrorCheckForTextFileExistence( $arg );
- $species_names_file = $arg;
- }
- elsif ( $char =~ /H/ ) {
- if ( $hmm_name ne "" || $hmm_file ne "" || $seed_aln_for_hmmbuild ne "" ) {
- &errorInCommandLine( "Entered same argument twice or conflicting arguments: \"F=\", \"H=\" and \"b=\"." );
- }
- if ( $mode == 1 || $mode == 2 ) {
- &errorInCommandLine( "Can not use \"H=\" in modes 1 or 2." );
- }
- $hmm_name = $arg;
- }
- elsif ( $char =~ /I/ ) {
- if ( $safe_nhx != 0 ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- $safe_nhx = 1;
- }
- elsif ( $char =~ /K/ ) {
- if ( $keep != 0 ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- $keep = 1;
- }
- elsif ( $char =~ /L/ ) {
- if ( $t_orthologs != $THRESHOLD_ORTHOLOGS_DEFAULT ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- $t_orthologs = $arg;
- }
- elsif ( $char =~ /N/ ) {
- if ( $query_name ne "" ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- $query_name = $arg;
- }
- elsif ( $char =~ /O/ ) {
- if ( $outfile ne "" ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- $outfile = $arg;
- }
- elsif ( $char =~ /P/ ) {
- if ( $sort != $SORT_DEFAULT ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- $sort = $arg;
- }
- elsif ( $char =~ /Q/ ) {
- if ( $seqX_file ne "" ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- &userErrorCheckForTextFileExistence( $arg );
- $seqX_file = $arg;
- }
- elsif ( $char =~ /S/ ) {
- if ( $species_tree_file ne "" ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- &userErrorCheckForTextFileExistence( $arg );
- $species_tree_file = $arg;
- }
- elsif ( $char =~ /T/ ) {
- if ( $mode == 1 || $mode == 2 ) {
- &errorInCommandLine( "Matrix cannot be changed in modes 1 and 2 (is dictated by \"\$MATRIX_FOR_PWD\" for mode 1)." );
- }
- if ( $arg eq "J" ) {
- $matrix_n = 0;
- }
- elsif ( $arg eq "P" ) {
- $matrix_n = 1;
- }
- elsif ( $arg eq "B" ) {
- $matrix_n = 2;
- }
- elsif ( $arg eq "M" ) {
- $matrix_n = 3;
- }
- elsif ( $arg eq "V" ) {
- $matrix_n = 5;
- }
- elsif ( $arg eq "W" ) {
- $matrix_n = 6;
- }
- else {
- &errorInCommandLine( "Use T=J for JTT, P for PAM, B for BLOSUM62, M for mtREV24, V for VT, W for WAG." );
- }
- }
- elsif ( $char =~ /U/ ) {
- if ( $t_orthologs_dc != $THRESHOLD_ORTHOLOGS_DEFAULT_DC ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- $t_orthologs_dc = $arg;
- }
- elsif ( $char =~ /X/ ) {
- if ( $warn_more_than_one_ortho
- != $WARN_MORE_THAN_ONE_ORTHO_DEFAULT ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- $warn_more_than_one_ortho = $arg;
- }
- elsif ( $char =~ /Y/ ) {
- if ( $warn_no_orthos != $WARN_NO_ORTHOS_DEFAULT ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- $warn_no_orthos = $arg;
- }
- elsif ( $char =~ /Z/ ) {
- if ( $warn_one_ortho != $WARN_ONE_ORTHO_DEFAULT ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- $warn_one_ortho = $arg;
- }
- elsif ( $char =~ /a/ ) {
- if ( $boostraps_for_makeTree != $BOOSTRAPS_FOR_MAKETREE_DEFAULT ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- if ( $mode == 1 || $mode == 2 ) {
- &errorInCommandLine( "Modes 1 and 2: Cannot change bootstrap value. Do not use \"a=\"." );
- }
- $boostraps_for_makeTree = $arg;
- if ( $boostraps_for_makeTree < 10 ) {
- &errorInCommandLine( "Bootsraps cannot be smaller than 10." );
- }
- }
- elsif ( $char =~ /b/ ) {
- if ( $hmm_name ne "" || $hmm_file ne "" || $seed_aln_for_hmmbuild ne "" ) {
- &errorInCommandLine( "Entered same argument twice or conflicting arguments: \"F=\", \"H=\" and \"b=\"." );
- }
- if ( $mode == 1 || $mode == 2 ) {
- &errorInCommandLine( "Can not use \"b=\" in modes 1 or 2." );
- }
- &userErrorCheckForTextFileExistence( $arg );
- $seed_aln_for_hmmbuild = $arg;
- }
- elsif ( $char =~ /f/ ) {
- if ( $no_frags ne 0 ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- $no_frags = 1;
- }
- elsif ( $char =~ /j/ ) {
- if ( $temp_dir ne "" ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- $temp_dir = $arg;
- }
- elsif ( $char =~ /p/ ) {
- if ( $output_ultraparalogs != 0 ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- $output_ultraparalogs = 1;
- }
- elsif ( $char =~ /s/ ) {
- if ( $non_sp != 1 ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- $non_sp = 0;
- }
- elsif ( $char =~ /v/ ) {
- $t_ultra_paralogs = $arg;
- }
- elsif ( $char =~ /x/ ) {
- if ( $output_HTML == 1 ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- $output_HTML = 1;
- }
- elsif ( $char =~ /y/ ) {
- if ( $seed_for_makeTree != $SEED_FOR_MAKETREE_DEFAULT ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- $seed_for_makeTree = $arg;
- }
- elsif ( $char =~ /\+/ ) {
- if ( $parallel != 0 ) {
- &errorInCommandLine( "Entered same argument twice." );
- }
- $parallel = 1;
- }
- else {
- &errorInCommandLine( "Unknown option: \"$args\"." );
- }
- }
-} ## analyzeCommandLine
-
-
-
-
-# Last modified: 03/08/01
-sub CheckArguments {
-
- if ( $outfile eq "" ) {
- &errorInCommandLine( "Outfile not specified. Use \"O=\"." );
- }
- if ( $alignment eq "" ) {
- &errorInCommandLine( "Need to specify a Pfam alignment file. Use \"A=\"." );
- }
- if ( -e $outfile ) {
- &userError( "\"$outfile\" already exists." );
- }
-
- if ( $sort < 0 || $sort > 17 ) {
- &errorInCommandLine( "Sort priority (\"P=\") must be between 0 and 15." );
- }
-
- if ( $parallel == 1 && $mode != 1 ) {
- &errorInCommandLine( "Parallelization only implemented for mode 1." );
- }
-
- if ( $mode == 1 || $mode == 2 ) {
-
- if ( $species_names_file =~ /\S/ ) {
- &errorInCommandLine( "Modes 1 and 2: Cannot use species names file. Do not use \"G=\"." );
- }
- if ( $non_sp == 0 ) {
- &errorInCommandLine( "Can not use \"s\" in modes 1 or 2." );
- }
- if ( $no_frags == 1 ) {
- &errorInCommandLine( "Can not use \"f\" in modes 1 or 2." );
- }
- }
-
- if ( $mode == 1 || $mode == 3 ) {
- if ( $seqX_file eq "" ) {
- &errorInCommandLine( "Modes 1 and 3: Need to specify a query file. Use \"Q=\"." );
- }
- }
-
- if ( $mode == 3 ) {
- if ( $hmm_name eq "" && $hmm_file eq "" && $seed_aln_for_hmmbuild eq "" ) {
- &errorInCommandLine( "Mode 3: Need to specify either a HMM name (\"H=\"), a HMM file (\"F=\") or build a HMM (\"b=\")." );
- }
- }
-
- if ( $mode == 1 ) {
- if ( $hmm_name ne "" || $hmm_file ne "" || $seed_aln_for_hmmbuild ne "" ) {
- &errorInCommandLine( "Mode 1: Must not specify a HMM name (\"H=\"), a HMM file (\"F=\") or build a HMM (\"b=\")." );
- }
- }
-
- if ( $mode == 2 || $mode == 4 ) {
- if ( $seqX_file ne "" ) {
- &errorInCommandLine( "Modes 2 and 4: Must not specify a query file. Do not use \"Q=\".\n" );
- }
- if ( $query_name eq "" ) {
- &errorInCommandLine( "Modes 2 and 4: Must specify a query name. Use \"N=\"." );
- }
- if ( $hmm_name ne "" || $hmm_file ne "" || $seed_aln_for_hmmbuild ne "" ) {
- &errorInCommandLine( "Modes 2 and 4: Cannot specify a HMM name (\"H=\"), a HMM file (\"F=\") or build a HMM (\"b=\")." );
- }
-
- }
-
- if ( $non_sp != 1 && $no_frags == 1 ) {
- &errorInCommandLine( "\"Fragments\" are assumed to be only found in non SWISS-PROT seqs.\n Do not use \"f\" together with \"s\"." );
- }
-
- if ( $output_HTML == 1 ) {
- if ( $mode != 1 ) {
- &errorInCommandLine( "Output in HTML (for web server) only for mode 1." );
- }
- }
-
- if ( $output_ultraparalogs == 0 && $t_ultra_paralogs != $T_ULTRA_PARALOGS_DEFAULT ) {
- &errorInCommandLine( "Use \"p\" to output ultra paralogs (cannot use \"v=\" without \"p\")." );
- }
-
- if ( $non_sp == 1 && ( $mode == 3 || $mode == 4 ) ) {
- unless ( ( -s $TREMBL_ACDEOS_FILE ) && ( -f $TREMBL_ACDEOS_FILE ) && ( -T $TREMBL_ACDEOS_FILE ) ) {
- my $message = "AC, DE, and OS-file not found.\n";
- $message .= " If non SWISS-PROT sequences are not to be removed from the\n";
- $message .= " Pfam alignment (\"s\" option), variable \"\$TREMBL_ACDEOS_FILE\" needs\n";
- $message .= " to point to a file containing AC, DE, and OS from TrEMBL. Such a\n";
- $message .= " file can be generated with \"extractTrembl.pl\".\n";
- $message .= " Currently, \"TREMBL_ACDEOS_FILE\" points to:\n";
- $message .= " $TREMBL_ACDEOS_FILE";
- &userError( $message );
- }
- }
-
- unless ( ( -s $species_tree_file ) && ( -f $species_tree_file ) && ( -T $species_tree_file ) ) {
- my $message = "Species tree file not found.\n";
- $message .= " A valid species tree must be specified.\n";
- $message .= " Either, use \"S=\" option, or set variable\n";
- $message .= " \"\$SPECIES_TREE_FILE_DEFAULT\".\n";
- $message .= " Currently, this program looks for a species tree at:\n";
- $message .= " $species_tree_file";
- &userError( $message );
- }
-
- if ( $hmm_name ne "" ) {
- unless ( ( -s $PFAM_HMM_DB ) && ( -f $PFAM_HMM_DB ) ) {
- my $message = "HMMER model db file not found.\n";
- $message .= " If \"H=\" option is used, a valid HMMER model db needs\n";
- $message .= " to be specified with variable \"\$PFAM_HMM_DB\".\n";
- $message .= " Currently, \"\$PFAM_HMM_DB\" points to:\n";
- $message .= " $PFAM_HMM_DB";
- &userError( $message );
- }
- }
-} ## CheckArguments
-
-
-
-# Last modfied: 06/25/01
-sub userErrorCheckForTextFileExistence {
- my $file = $_[ 0 ];
- unless ( ( -s $file ) && ( -f $file ) && ( -T $file ) ) {
- &userError( "\"$file\" does not exist or is not a plain text file." );
- }
-} ## checkForFileExistence
-
-
-
-# One argument: the error message.
-# Last modified: 04/26/01
-sub errorInCommandLine {
-
- my $error = $_[ 0 ];
-
- print " \n";
- print " rio.pl version: $VERSION\n";
- print " ------\n";
- print " \n";
- print " Error in command line:\n";
- if ( $error ne "" ) {
- print " $error";
- }
- print " \n\n";
- print " Type \"rio.pl\" (no arguments) for more information.\n";
- print " \n";
- exit( -1 );
-} ## errorInCommandLine
-
-
-
-
-# One argument: the error message.
-# Last modified: 04/26/01
-sub userError {
-
- my $error = $_[ 0 ];
-
- print " \n";
- print " rio.pl version: $VERSION\n";
- print " ------\n";
- print " \n";
- print " Error:\n";
- if ( $error ne "" ) {
- print " $error";
- }
- print " \n\n";
- print " Type \"rio.pl\" (no arguments) for more information.\n";
- print " \n";
- &cleanUpTempDir();
- exit( -1 );
-} ## UserError
-
-
-
-
-
-
-# Last modified: 04/26/01
-sub printHelp {
-
- print " \n";
- print " rio.pl version: $VERSION\n";
- print " ------\n\n";
-
- print <<END;
- Copyright (C) 2000-2002 Washington University School of Medicine
- and Howard Hughes Medical Institute
- All rights reserved
-
- Created: 11/25/00
- Author: Christian M. Zmasek
- zmasek\@genetics.wustl.edu
- http://www.genetics.wustl.edu/eddy/people/zmasek/
-
- Last modified 05/26/02
-
- Available at : http://www.genetics.wustl.edu/eddy/forester/
- RIO webserver: http://www.rio.wustl.edu/
-
- Reference:
- Zmasek C.M. and Eddy S.R. (2002)
- RIO: Analyzing proteomes by automated phylogenomics using
- resampled inference of orthologs.
- BMC Bioinformatics 3:14
- http://www.biomedcentral.com/1471-2105/3/14/
-
- It is highly recommended that you read this paper before
- installing and/or using RIO. (Included in the RIO
- distribution as PDF: "RIO.pdf".)
-
-
- Before rio.pl can be used, some variables in rio_module.pm need to be set,
- as described in RIO_INSTALL.
-
-
-
- Usage: rio.pl <Mode: 1, 2, 3, or 4> <tagged arguments, single letter arguments>
- -----
-
-
- Examples:
- ---------
-
- % RIO1.1/perl/rio.pl 1 A=aconitase Q=RIO1.1/LEU2_HAEIN N=QUERY_HAEIN O=out1 p I C E
-
- % RIO1.1/perl/rio.pl 2 A=aconitase N=LEU2_LACLA/5-449 O=out2 p I C E
-
- % RIO1.1/perl/rio.pl 3 A=/path/to/my/pfam/Full/aconitase H=aconitase Q=RIO1.1/LEU2_HAEIN N=QUERY_HAEIN O=out3 p I C E
-
- % RIO1.1/perl/rio.pl 4 A=/path/to/my/pfam/Full/aconitase N=LEU2_LACLA/5-449 O=out4 p I C E
-
- % RIO1.1/perl/rio.pl 3 A=/path/to/my/pfam/Full/aconitase b=/path/to/my/pfam/Seed/aconitase Q=RIO1.1/LEU2_HAEIN N=QUERY_HAEIN O=out5 p I C E
-
-
-
- Modes:
- ------
-
- 1: RIO analysis based on precalculated pairwise distances
- alignment does not contain query sequence
-
- 2: RIO analysis based on precalculated pairwise distances
- alignment does contain query sequence
-
- 3: RIO analysis based on Pfam alignments,
- alignment does not contain query sequence
-
- 4: RIO analysis based on Pfam alignments,
- alignment does contain query sequence
-
-
-
- Tagged arguments:
- -----------------
-
- No "G=", "H=", "F=", "T=", "a=", "b=", "s", "f" in modes 1 and 2.
-
-
- A=<String> Pfam alignment name (mandatory). This specifies the alignment
- against which the RIO analysis is to be performed.
- In modes 1 and 2: Pfam model (alignment) name
- (e.g. "A=aconitase").
- In modes 3 and 4: Pfam alignment path/name
- (e.g. "A=/path/to/your/pfam/Full/aconitase").
-
- Q=<String> Path/name of file containing the query sequence
- (in FASTA format or raw sequence) (mandatory in modes 1 and 3).
-
- N=<String> Query name (mandatory). This must include the SWISS-PROT code
- for the species of the query after a "_" (e.g. "N=QUERY_HAEIN").
- If the query sequence is already in the alignment (modes 2 and 4)
- the complete name needs to be specified -- including "/xxx-xxx".
-
- O=<String> Output file path/name (mandatory).
-
- T=<char> Model for pairwaise distance calculation:
- J=JTT, B=BLOSUM 62, M=mtREV24, V=VT, W=WAG, P=PAM.
- BLOSUM 62 is default.
- (Not in modes 1 and 2; these modes use \$MATRIX_FOR_PWD instead.)
-
- In modes 1 and 3, a HMM is needed to align the query sequence to
- the alignment and either one of the following options must be
- employed:
- H=<String> HMM name: This uses hmmfetch to retrieve a HMM from
- \$PFAM_HMM_DB.
- F=<String> HMM file: This directly reads the HMM from a file.
-
- S=<String> Species tree file path/name (in NHX format) (optional).
- If not specified, \$SPECIES_TREE_FILE_DEFAULT is used.
-
- G=<String> Species names file (optional). Only sequences associated with
- species found in this file are used.
- In the species names file, individual species names must be
- separated by newlines and lines starting with "#" are ignored.
- While only sequences associated with species found in the species
- tree ("S=") are used for the actual RIO analysis, this allows to
- remove sequences prior to tree calculation (which is the most
- time consuming step).
-
- P=<int> Sort priority (default is 12):
- 0 : Ortholog
- 1 : Ortholog, Super ortholog
- 2 : Super ortholog, Ortholog
- 3 : Ortholog, Distance
- 4 : Distance, Ortholog
- 5 : Ortholog, Super ortholog, Distance
- 6 : Ortholog, Distance, Super ortholog
- 7 : Super ortholog, Ortholog, Distance
- 8 : Super ortholog, Distance, Ortholog
- 9 : Distance, Ortholog, Super ortholog
- 10 : Distance, Super ortholog, Ortholog
- 11 : Ortholog, Subtree neighbor, Distance
- 12 : Ortholog, Subtree neighbor, Super ortholog, Distance (default)
- 13 : Ortholog, Super ortholog, Subtree neighbor, Distance
- 14 : Subtree neighbor, Ortholog, Super ortholog, Distance
- 15 : Subtree neighbor, Distance, Ortholog, Super ortholog
- 16 : Ortholog, Distance, Subtree neighbor, Super ortholog
- 17 : Ortholog, Subtree neighbor, Distance, Super ortholog
-
- a=<int> Bootstraps for tree construction (not in modes 1 and 2).
- Default is 100.
-
- L=<int> Threshold for orthologies for output. Default is 0.
- v=<int> Threshold for ultra-paralogies for output. Default is 50.
-
- U=<int> Threshold for orthologies for distance calculation. Default is 60.
-
- X=<int> In case of more than one putative orthologs:
- number of sd the distance query - LCA has to differ
- from the mean to generate a warning. Default is 2.
-
- Y=<int> In case of no putative orthologs:
- number of sd the distance query - root has to differ
- from mean to generate a warning. Default is 2.
-
- Z=<double> In case of one putative ortholog:
- threshold for factor between the two distances to their
- LCA (larger/smaller) to generate a warning. Default is 2.
-
- B=<int> Threshold for subtree-neighborings. Default is 0.
-
- b=<String> Build HMM from seed alignment with "hmmbuild -s" (optional).
- This is to prevent from finding multiple domains per sequence
- (i.e. prevents "cutting" the query sequence). Give path/name to
- Seed with this.
-
- j=<String> Name for temporary directory (optional).
-
- y=<int> Seed for random number generator. Default is 41.
-
- I Create and save a rooted, with duplication vs speciation,
- and orthology information annotated gene tree.
- If precalculated distances are used (modes 1 and 2): this gene
- tree is a NJ tree calculated based on the non-bootstrap resampled
- (original) pairwise distances.
- If precalculated distances are not used (modes 3 and 4): this gene
- is a consenus tree with ML branch length values and is also
- annotated with bootstrap values for each node.
-
- Options for output:
- p Output ultra-paralogs.
- D Description from SWISS-PROT and TrEMBL.
- C Complete description from SWISS-PROT and TrEMBL.
- E 118 character output instead of 78 character output.
-
- K Keep intermediate files (they will go into the same directory
- as the output file, their names are the same as of the output
- file, with various suffixes added).
-
- s Ignore non SWISS-PROT sequences (i.e. sequences from TrEMBL)
- in the Pfam alignment.
-
- f Try to ignore TrEMBL "fragments" (sequences with "fragment" in
- their description).
-
- + Parallel, use machines listed in file \$NODE_LIST.
-
- x RIO used as web server -- HTML output.
-
-
-END
- exit( 0 );
-
-} ## printHelp
-
+++ /dev/null
-# Copyright (C) 2002-2003 Washington University School of Medicine
-# and Howard Hughes Medical Institute
-# All rights reserved
-#
-# Author: Christian M. Zmasek
-# zmasek@genetics.wustl.edu
-# http://www.genetics.wustl.edu/eddy/people/zmasek/
-#
-# Last modified 03/13/03
-
-
-package rio_module2;
-use strict;
-require Exporter;
-
-our $VERSION = 3.20;
-
-our @ISA = qw( Exporter );
-
-our @EXPORT = qw( executeConsense
- executeMakeTree
- executePuzzleDQO
- executePuzzleDQObootstrapped
- pfam2phylipMatchOnly
- startsWithSWISS_PROTname
- isPfamSequenceLine
- isPfamCommentLine
- containsPfamNamedSequence
- isRFline
- executeNeighbor
- executeProtpars
- setModelForPuzzle
- setRateHeterogeneityOptionForPuzzle
- setParameterEstimatesOptionForPuzzle
- executePuzzleBootstrapped
- executePuzzle
- executeHmmfetch
- addDistsToQueryToPWDfile
- testForTextFilePresence
- exitWithWarning
- dieWithUnexpectedError
- addSlashAtEndIfNotPresent
- $LENGTH_OF_NAME
- $MIN_NUMBER_OF_AA
- $TREMBL_ACDEOS_FILE
- $SWISSPROT_ACDEOS_FILE
- $SPECIES_NAMES_FILE
- $SPECIES_TREE_FILE_DEFAULT
- $MULTIPLE_TREES_FILE_SUFFIX
- $LOG_FILE_SUFFIX
- $ALIGN_FILE_SUFFIX
- $TREE_FILE_SUFFIX
- $ADDITION_FOR_RIO_ANNOT_TREE
- $SUFFIX_PWD
- $SUFFIX_BOOT_STRP_POS
- $SUFFIX_PWD_NOT_BOOTS
- $SUFFIX_HMM
- $MATRIX_FOR_PWD
- $RIO_PWD_DIRECTORY
- $RIO_BSP_DIRECTORY
- $RIO_NBD_DIRECTORY
- $RIO_ALN_DIRECTORY
- $RIO_HMM_DIRECTORY
- $PFAM_FULL_DIRECTORY
- $PFAM_SEED_DIRECTORY
- $PRIOR_FILE_DIR
- $PFAM_HMM_DB
- $SEQBOOT
- $NEIGHBOR
- $PROTPARS
- $CONSENSE
- $PUZZLE
- $HMMALIGN
- $HMMSEARCH
- $HMMBUILD
- $HMMFETCH
- $SFE
- $HMMCALIBRATE
- $P7EXTRACT
- $MULTIFETCH
- $BOOTSTRAP_CZ
- $BOOTSTRAP_CZ_PL
- $TRANSFERSBRANCHLENGHTS
- $MAKETREE
- $RIO_PL
- $DORIO
- $PUZZLE_DQO
- $BOOTSTRAPS
- $PATH_TO_FORESTER
- $JAVA
- $NODE_LIST
- $RIO_SLAVE_DRIVER
- $RIO_SLAVE
- $TEMP_DIR_DEFAULT
- $EXPASY_SPROT_SEARCH_DE
- $EXPASY_SPROT_SEARCH_AC
- );
-
-
-
-
-# =============================================================================
-# =============================================================================
-#
-# THESE VARIABLES ARE ENVIRONMENT DEPENDENT, AND NEED TO BE SET ACCORDINGLY
-# BY THE USER
-# -------------------------------------------------------------------------
-#
-
-
-
-# RIO itself:
-# -----------
-our $PATH_TO_FORESTER = "/nfs/dm3/homedir1/czmasek/RIO1.24/";
-
-
-# Java virtual machine:
-# ---------------------
-our $JAVA = "/usr/local/java/jdk/bin/java";
-
-
-
-# Where all the temporary files can be created:
-# ---------------------------------------------
-our $TEMP_DIR_DEFAULT = "/tmp/";
-
-
-
-# Pfam data:
-# ----------
-our $PFAM_FULL_DIRECTORY = "/path/to/Pfam/Full/";
-our $PFAM_SEED_DIRECTORY = "/path/to/Pfam/Seed/";
-our $PFAM_HMM_DB = "/path/to/Pfam/Pfam_ls"; # Need to run "hmmindex" on this
- # to produce .ssi file.
- # Then, for example
- # "setenv HMMERDB /home/rio/pfam-6.6/"
-
-
-$PATH_TO_FORESTER = &addSlashAtEndIfNotPresent( $PATH_TO_FORESTER );
-
-
-# Description lines and species from SWISS-PROT and TrEMBL:
-# ---------------------------------------------------------
-our $TREMBL_ACDEOS_FILE = $PATH_TO_FORESTER."data/trembl22_ACDEOS_1-6";
-
-our $SWISSPROT_ACDEOS_FILE = $PATH_TO_FORESTER."data/sp40_ACDEOS_1-6";
-
-
-
-# Names of species which can be analyzed and analyzed
-# against (must also be in tree $SPECIES_TREE_FILE_DEFAULT).
-# By using a list with less species, RIO analyses become faster
-# but lose phylogenetic resolution.
-# For many purposes, list "tree_of_life_bin_1-6_species_list"
-# in "data/species/" might be sufficient:
-# --------------------------------------------------------------
-our $SPECIES_NAMES_FILE = $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-6_species_list";
-
-
-
-# A default species tree in NHX format.
-# For many purposes, tree "tree_of_life_bin_1-6.nhx"
-# in "data/species/" might be fine:
-# --------------------------------------------------
-our $SPECIES_TREE_FILE_DEFAULT = $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-6.nhx";
-
-
-
-# Data for using precalculated distances:
-# ---------------------------------------
-our $MATRIX_FOR_PWD = 2; # The matrix which has been used for the pwd in $RIO_PWD_DIRECTORY.
- # 0=JTT, 1=PAM, 2=BLOSUM 62, 3=mtREV24, 5=VT, 6=WAG.
-
-our $RIO_PWD_DIRECTORY = $PATH_TO_FORESTER."example_data/"; # all must end with "/"
-our $RIO_BSP_DIRECTORY = $PATH_TO_FORESTER."example_data/";
-our $RIO_NBD_DIRECTORY = $PATH_TO_FORESTER."example_data/";
-our $RIO_ALN_DIRECTORY = $PATH_TO_FORESTER."example_data/";
-our $RIO_HMM_DIRECTORY = $PATH_TO_FORESTER."example_data/";
-
-
-
-#
-# End of variables which need to be set by the user.
-#
-# =============================================================================
-# =============================================================================
-
-
-
-
-
-$TEMP_DIR_DEFAULT = &addSlashAtEndIfNotPresent( $TEMP_DIR_DEFAULT );
-$PFAM_FULL_DIRECTORY = &addSlashAtEndIfNotPresent( $PFAM_FULL_DIRECTORY );
-$PFAM_SEED_DIRECTORY = &addSlashAtEndIfNotPresent( $PFAM_SEED_DIRECTORY );
-
-
-
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# These variables should normally not be changed:
-#
-
-our $PRIOR_FILE_DIR = $PATH_TO_FORESTER."data/priors_for_hmmbuild/";
- # Directory containing dirichlet prior
- # files needed for certain aligments
- # by hmmbuild (e.g. Collagen).
-
-
-# PHYLIP:
-our $SEQBOOT = $PATH_TO_FORESTER."phylip_mod/exe/seqboot";
-our $NEIGHBOR = $PATH_TO_FORESTER."phylip_mod/exe/neighbor";
-our $PROTPARS = $PATH_TO_FORESTER."phylip_mod/exe/protpars";
-our $CONSENSE = $PATH_TO_FORESTER."phylip_mod/exe/consense";
-
-# TREE-PUZZLE:
-our $PUZZLE = $PATH_TO_FORESTER."puzzle_mod/src/puzzle";
-our $PUZZLE_DQO = $PATH_TO_FORESTER."puzzle_dqo/src/puzzle";
-
-# HMMER:
-our $HMMALIGN = $PATH_TO_FORESTER."hmmer/binaries/hmmalign";
-our $HMMSEARCH = $PATH_TO_FORESTER."hmmer/binaries/hmmsearch";
-our $HMMBUILD = $PATH_TO_FORESTER."hmmer/binaries/hmmbuild";
-our $HMMFETCH = $PATH_TO_FORESTER."hmmer/binaries/hmmfetch";
-our $SFE = $PATH_TO_FORESTER."hmmer/binaries/sfetch";
-our $HMMCALIBRATE = $PATH_TO_FORESTER."hmmer/binaries/hmmcalibrate";
-
-our $P7EXTRACT = $PATH_TO_FORESTER."perl/p7extract.pl";
-our $MULTIFETCH = $PATH_TO_FORESTER."perl/multifetch.pl";
-
-
-# RIO/FORESTER:
-our $BOOTSTRAP_CZ = $PATH_TO_FORESTER."C/bootstrap_cz";
-our $BOOTSTRAP_CZ_PL = $PATH_TO_FORESTER."perl/bootstrap_cz.pl";
-our $TRANSFERSBRANCHLENGHTS = $JAVA." -cp $PATH_TO_FORESTER"."java forester.tools.transfersBranchLenghts";
-our $MAKETREE = $PATH_TO_FORESTER."perl/makeTree.pl";
-our $RIO_PL = $PATH_TO_FORESTER."perl/rio.pl";
-our $DORIO = $JAVA." -cp $PATH_TO_FORESTER"."java forester.tools.DoRIO";
-# parallel RIO:
-our $RIO_SLAVE_DRIVER = $PATH_TO_FORESTER."perl/rio_slave_driver.pl";
-our $RIO_SLAVE = $PATH_TO_FORESTER."perl/rio_slave.pl";
-our $NODE_LIST = $PATH_TO_FORESTER."data/node_list.dat";
-
-#
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-
-our $BOOTSTRAPS = 100;
-our $MIN_NUMBER_OF_AA = 20; # After removal of gaps, if less, gaps are not removed.
-our $LENGTH_OF_NAME = 26;
-
-
-
-
-our $MULTIPLE_TREES_FILE_SUFFIX = ".mlt";
-our $LOG_FILE_SUFFIX = ".log";
-our $ALIGN_FILE_SUFFIX = ".aln";
-our $TREE_FILE_SUFFIX = ".nhx";
-our $ADDITION_FOR_RIO_ANNOT_TREE = ".rio";
-our $SUFFIX_PWD = ".pwd";
-our $SUFFIX_BOOT_STRP_POS = ".bsp";
-our $SUFFIX_PWD_NOT_BOOTS = ".nbd";
-our $SUFFIX_HMM = ".hmm";
-
-our $EXPASY_SPROT_SEARCH_DE = "http://www.expasy.org/cgi-bin/sprot-search-de?";
-our $EXPASY_SPROT_SEARCH_AC = "http://www.expasy.org/cgi-bin/sprot-search-ac?";
-
-
-
-# One argument: input multiple trees file
-# Last modified: 07/05/01
-sub executeConsense {
- my $in = $_[ 0 ];
-
- &testForTextFilePresence( "$in" );
-
- system( "$CONSENSE >/dev/null 2>&1 << !
-$in
-Y
-!" )
- && &dieWithUnexpectedError( "Could not execute \"$CONSENSE \"" );
-
- return;
-}
-
-
-
-# Four arguments:
-# 1. options ("-" is not necessary)
-# 2. alignment or pwd file
-# 3. outfile
-# 4. temp dir
-# Last modified: 07/05/01
-sub executeMakeTree {
-
- my $opts = $_[ 0 ];
- my $B = $_[ 1 ];
- my $C = $_[ 2 ];
- my $D = $_[ 3 ];
-
- &testForTextFilePresence( $B );
-
- $opts = "-".$opts;
-
- system( "$MAKETREE $opts $B $C $D" )
- && &dieWithUnexpectedError( "Could not execute \"$MAKETREE $opts $B $C $D\"" );
-
-} ## executeMakeTree
-
-
-
-
-# Two arguments:
-# 1. Name of inputfile
-# 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
-# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise
-sub executePuzzleDQO {
- my $in = $_[ 0 ];
- my $matrix_option = $_[ 1 ];
- my $mat = "";
-
- &testForTextFilePresence( $in );
-
- $mat = setModelForPuzzle( $matrix_option );
-
- system( "$PUZZLE_DQO $in >/dev/null 2>&1 << !$mat
-y
-!" )
- && &dieWithUnexpectedError( "Could not execute \"$PUZZLE_DQO\"" );
-
- return;
-
-} ## executePuzzleDQO
-
-
-
-
-# Two arguments:
-# 1. Name of inputfile
-# 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
-# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise
-# Last modified: 01/28/02
-sub executePuzzleDQObootstrapped {
- my $in = $_[ 0 ];
- my $matrix_option = $_[ 1 ];
-
-
- my $l = 0;
- my $slen = 0;
- my $counter = 0;
- my $mat = "";
- my $a = "";
- my @a = ();
-
- &testForTextFilePresence( $in );
-
- open( GRP, "<$in" ) || &dieWithUnexpectedError( "Cannot open file \"$in\"" );
- while( <GRP> ) {
- if ( $_ =~ /^\s*\d+\s+\d+\s*$/ ) {
- $counter++;
- }
- }
- close( GRP );
-
- $l = `cat $in | wc -l`;
- $slen = $l / $counter;
-
- system( "split -$slen $in $in.splt." )
- && &dieWithUnexpectedError( "Could not execute \"split -$slen $in $in.splt.\"" );
-
- @a = <$in.splt.*>;
-
- $mat = setModelForPuzzle( $matrix_option );
-
- foreach $a ( @a ) {
-
- system( "$PUZZLE_DQO $a >/dev/null 2>&1 << !$mat
-y
-!" )
- && &dieWithUnexpectedError( "Could not execute \"$PUZZLE_DQO $a\"" );
-
- system( "cat $a.dist >> $in.dist" )
- && &dieWithUnexpectedError( "Could not execute \"cat outdist >> $in.dist\"" );
-
- unlink( $a, $a.".dist" );
- }
-
- return;
-
-} ## executePuzzleDQObootstrapped
-
-
-
-# Transfers a Pfam (SELEX) alignment to a
-# PHYLIP sequential style alignment.
-# It only writes "match columns" as indicated by the
-# "# RF" line ('x' means match).
-#
-# Three arguments:
-# 1. infile name
-# 2. outfile name
-# 3. 1 to NOT ensure that match states contain only 'A'-'Z' or '-'
-#
-# Returns the number of match states (=length of output alignment),
-# the length of the input alignment,
-# the number of seqs in the input alignment
-#
-# Last modified: 07/07/01
-#
-sub pfam2phylipMatchOnly {
-
- my $infile = $_[ 0 ];
- my $outfile = $_[ 1 ];
- my $ne = $_[ 2 ];
- my @seq_name = ();
- my @seq_array = ();
- my $return_line = "";
- my $seq = "";
- my $x = 0;
- my $y = 0;
- my $i = 0;
- my $x_offset = 0;
- my $max_x = 0;
- my $rf_y = 0;
- my $number_colum = 0;
- my $not_ensure = 0;
- my $saw_rf_line = 0;
-
- if ( $ne && $ne == 1 ) {
- $not_ensure = 1;
- }
-
- &testForTextFilePresence( $infile );
-
- open( INPP, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
-
- # This reads in the first block. It reads in the seq names.
- while ( 1 ) {
- if ( &isPfamSequenceLine( $return_line ) ) {
- $return_line =~ /^(\S+)\s+(\S+)/;
- $seq_name[ $y ] = substr( $1, 0, $LENGTH_OF_NAME );
- $seq = $2;
- for ( $x = 0; $x < length( $seq ); $x++ ) {
- $seq_array[ $x ][ $y ] = substr( $seq, $x, 1 );
- }
- $y++;
- }
- elsif ( &isRFline( $return_line ) ) {
- $saw_rf_line = 1;
- $return_line =~ /\s+(\S+)\s*$/;
- $seq = $1;
- $x_offset = length( $seq );
- $rf_y = $y;
- for ( $x = 0; $x < $x_offset; $x++ ) {
- $seq_array[ $x ][ $rf_y ] = substr( $seq, $x, 1 );
- }
- last;
- }
-
- $return_line = <INPP>;
-
- if ( !$return_line ) {
- &dieWithUnexpectedError( "Alignment not in expected format (no RF line)" );
- }
- }
-
- if ( $saw_rf_line != 1 ) {
- &dieWithUnexpectedError( "Alignment not in expected format (no RF line)" );
- }
-
- $y = 0;
- $max_x = 0;
-
- # This reads all blocks after the 1st one.
- while ( $return_line = <INPP> ) {
- if ( &isPfamSequenceLine( $return_line ) ) {
- $return_line =~ /^\S+\s+(\S+)/;
- $seq = $1;
- for ( $x = 0; $x < length( $seq ); $x++ ) {
- $seq_array[ $x + $x_offset ][ $y % $rf_y ] = substr( $seq, $x, 1 );
- }
- $y++;
- }
- elsif ( &isRFline( $return_line ) ) {
- if ( $y != $rf_y ) {
- &dieWithUnexpectedError( "Alignment not in expected format" );
- }
-
- $return_line =~ /\s+(\S+)\s*$/;
- $seq = $1;
- $max_x = length( $seq );
-
- for ( $x = 0; $x < length( $seq ); $x++ ) {
- $seq_array[ $x + $x_offset ][ $rf_y ] = substr( $seq, $x, 1 );
- }
-
- $y = 0;
- $x_offset = $x_offset + $max_x;
- $max_x = 0;
- }
- }
-
- close( INPP );
-
- # Counts the match states, and hence the number of aa in the alignment:
- for ( $x = 0; $x < $x_offset; $x++ ) {
- if ( !$seq_array[ $x ][ $rf_y ] ) {
- &dieWithUnexpectedError( "Alignment not in expected format" );
- }
- if ( $seq_array[ $x ][ $rf_y ] eq 'x' ) {
- $number_colum++;
- }
- }
-
- # Writes the file:
-
- open( OUTPP, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" );
- print OUTPP "$rf_y $number_colum\n";
- for ( $y = 0; $y < $rf_y; $y++ ) {
- print OUTPP "$seq_name[ $y ]";
- for ( $i = 0; $i < ( $LENGTH_OF_NAME - length( $seq_name[ $y ] ) ); $i++ ) {
- print OUTPP " ";
- }
- for ( $x = 0; $x < $x_offset; $x++ ) {
- if ( $seq_array[ $x ][ $rf_y ] eq 'x' ) {
- if ( !$seq_array[ $x ][ $y ] ) {
- &dieWithUnexpectedError( "Alignment not in expected format" );
- }
- if ( $not_ensure != 1 && $seq_array[ $x ][ $y ] !~ /[A-Z]|-/ ) {
- &dieWithUnexpectedError( "Alignment not in expected format (match states must only contain 'A'-'Z' or '-')" );
- }
- print OUTPP "$seq_array[ $x ][ $y ]";
- }
- }
- print OUTPP "\n";
- }
- close( OUTPP );
-
- return $number_colum, $x_offset, $rf_y;
-
-} ## pfam2phylipMatchOnly
-
-
-
-# Returns whether the argument (a String)
-# starts with a SWISS-PROT name (SEQN_SPECI).
-# Last modified: 06/21/01
-sub startsWithSWISS_PROTname {
- return ( $_[ 0 ] =~ /^[A-Z0-9]{1,4}_[A-Z0-9]{1,5}/ );
-}
-
-
-
-# Returns whether the argument starts with XXX.. XXXXX.. and the first
-# character is not a "#".
-# Last modified: 06/21/01
-sub isPfamSequenceLine {
- return( !&isPfamCommentLine( $_[ 0 ] )
- && &containsPfamNamedSequence( $_[ 0 ] ) );
-}
-
-
-
-# Returns whether the argument does start with a "#".
-# Last modified: 06/21/01
-sub isPfamCommentLine {
- return ( $_[ 0 ] =~ /^#/ );
-}
-
-
-
-# Returns whether the argument starts with XXX XXXXX.
-# Last modified: 06/21/01
-sub containsPfamNamedSequence {
- return ( $_[ 0 ] =~ /^\S+\s+\S+/ );
-}
-
-
-# Returns whether the argument starts with XXX XXXXX.
-# Last modified: 06/21/01
-sub isRFline {
- return ( $_[ 0 ] =~ /^#.*RF/ );
-}
-
-
-
-
-# Five arguments:
-# 1. pairwise distance file
-# 2. number of bootstraps
-# 3. randomize_input_order: 0: do not randomize input order; >=1 jumble
-# 4. seed for random number generator
-# 5. lower-triangular data matrix? 1: yes; no, otherwise
-# Last modified: 06/08/01
-sub executeNeighbor {
- my $inpwd = $_[ 0 ];
- my $bs = $_[ 1 ];
- my $rand = $_[ 2 ];
- my $s = $_[ 3 ];
- my $l = $_[ 4 ];
- my $jumble = "";
- my $multi = "";
- my $lower = "";
-
-
- &testForTextFilePresence( $inpwd );
-
- if ( $rand >= 1 ) {
- $jumble = "
-J
-$s";
- }
-
- if ( $bs >= 2 ) {
- $multi = "
-M
-$bs
-$s";
- }
- if ( $l == 1 ) {
- $lower = "
-L";
- }
-
-
- system( "$NEIGHBOR >/dev/null 2>&1 << !
-$inpwd$jumble$multi$lower
-2
-3
-Y
-!" )
- && &dieWithUnexpectedError( "Could not execute \"$NEIGHBOR $inpwd$jumble$multi$lower\"" );
- # 3: Do NOT print out tree
-
-
- return;
-
-} ## executeNeighbor
-
-
-
-# Four arguments:
-# 1. name of alignment file (in correct format!)
-# 2. number of bootstraps
-# 3. jumbles: 0: do not jumble; >=1 number of jumbles
-# 4. seed for random number generator
-# Last modified: 03/13/04
-sub executeProtpars {
- my $alin = $_[ 0 ];
- my $bs = $_[ 1 ];
- my $rand = $_[ 2 ];
- my $s = $_[ 3 ];
- my $jumble = "";
- my $multi = "";
-
-
- &testForTextFilePresence( $alin );
-
- if ( $bs >= 2 && $rand < 1 ) {
- $rand = 1;
- }
-
- if ( $rand >= 1 ) {
- $jumble = "
-J
-$s
-$rand";
- }
-
- if ( $bs >= 2 ) {
- $multi = "
-M
-D
-$bs";
- }
-
-
-
- system( "$PROTPARS 2>&1 << !
-$alin$jumble$multi
-I
-3
-Y
-!" )
- && &dieWithUnexpectedError( "Could not execute \"$PROTPARS $alin$jumble$multi\"" );
- # 3: Do NOT print out tree
- # I: Interleaved
-
- return;
-
-} ## executeProtpars
-
-
-
-# "Model of substitution" order for DQO TREE-PUZZLE 5.0:
-# Auto
-# m -> Dayhoff (Dayhoff et al. 1978)
-# m -> JTT (Jones et al. 1992)
-# m -> mtREV24 (Adachi-Hasegawa 1996)
-# m -> BLOSUM62 (Henikoff-Henikoff 92)
-# m -> VT (Mueller-Vingron 2000)
-# m -> WAG (Whelan-Goldman 2000)
-# m -> Auto
-# One argument:
-# matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
-# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise
-# Last modified: 07/07/01
-sub setModelForPuzzle {
- my $matrix_option = $_[ 0 ];
- my $matr = "";
-
- if ( $matrix_option == 0 ) { # JTT
- $matr = "
-m
-m";
- }
- elsif ( $matrix_option == 2 ) { # BLOSUM 62
- $matr = "
-m
-m
-m
-m";
- }
- elsif ( $matrix_option == 3 ) { # mtREV24
- $matr = "
-m
-m
-m";
- }
- elsif ( $matrix_option == 5 ) { # VT
- $matr = "
-m
-m
-m
-m
-m";
- }
- elsif ( $matrix_option == 6 ) { # WAG
- $matr = "
-m
-m
-m
-m
-m
-m";
- }
- elsif ( $matrix_option == 7 ) { # auto
- $matr = "";
- }
- else { # PAM
- $matr = "
-m"
- }
-
- return $matr;
-
-} ## setModelForPuzzle
-
-# One argument:
-# Model of rate heterogeneity:
-# 1 for "8 Gamma distributed rates"
-# 2 for "Two rates (1 invariable + 1 variable)"
-# 3 for "Mixed (1 invariable + 8 Gamma rates)"
-# otherwise: Uniform rate
-# Last modified: 09/08/03
-sub setRateHeterogeneityOptionForPuzzle {
- my $rate_heterogeneity_option = $_[ 0 ];
- my $opt = "";
-
- if ( $rate_heterogeneity_option == 1 ) {
- $opt = "
-w";
- }
- elsif ( $rate_heterogeneity_option == 2 ) {
- $opt = "
-w
-w";
- }
- elsif ( $rate_heterogeneity_option == 3 ) {
- $opt = "
-w
-w
-w";
- }
- else {
- $opt = "";
- }
-
- return $opt;
-} ## setRateHeterogeneityOptionForPuzzle
-
-
-# One argument:
-# Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise
-# Last modified: 09/08/03
-sub setParameterEstimatesOptionForPuzzle {
- my $parameter_estimates_option = $_[ 0 ];
- my $opt = "";
-
- if ( $parameter_estimates_option == 1 ) {
- $opt = "
-e";
- }
- else {
- $opt = "";
- }
-
- return $opt;
-} ## setParameterEstimatesOptionForPuzzle
-
-
-
-# Two/three/four arguments:
-# 1. Name of inputfile
-# 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
-# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise
-# 3. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise
-# 4. Model of rate heterogeneity:
-# 1 for "8 Gamma distributed rates"
-# 2 for "Two rates (1 invariable + 1 variable)"
-# 3 for "Mixed (1 invariable + 8 Gamma rates)"
-# otherwise: Uniform rate
-# Last modified: 09/08/03 (added 3rd and 4th parameter)
-sub executePuzzleBootstrapped {
- my $in = $_[ 0 ];
- my $matrix_option = $_[ 1 ];
- my $parameter_estimates_option = $_[ 2 ];
- my $rate_heterogeneity_option = $_[ 3 ];
-
- my $l = 0;
- my $slen = 0;
- my $counter = 0;
- my $mat = "";
- my $est = "";
- my $rate = "";
- my $a = "";
- my @a = ();
-
- &testForTextFilePresence( $in );
-
- open( GRP, "<$in" ) || die "\n\n$0: Unexpected error: Cannot open file <<$in>>: $!";
- while( <GRP> ) {
- if ( $_ =~ /^\s*\d+\s+\d+\s*$/ ) {
- $counter++;
- }
- }
- close( GRP );
-
- $l = `cat $in | wc -l`;
- $slen = $l / $counter;
-
- system( "split -$slen $in $in.splt." )
- && die "\n\n$0: executePuzzleDQObootstrapped: Could not execute \"split -$slen $in $in.splt.\": $!";
-
- @a = <$in.splt.*>;
-
- $mat = setModelForPuzzle( $matrix_option );
- if ( $parameter_estimates_option ) {
- $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option );
- }
- if ( $rate_heterogeneity_option ) {
- $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option );
- }
-
- foreach $a ( @a ) {
- print "-".$a."\n";
- system( "$PUZZLE $a << !
-k
-k$mat$est$rate
-y
-!" )
- && die "$0: Could not execute \"$PUZZLE $a\"";
-
- system( "cat $a.dist >> $in.dist" )
- && die "$0: Could not execute \"cat outdist >> $in.dist\"";
-
- unlink( $a, $a.".dist", $a.".tree" );
- }
-
- return;
-
-} ## executePuzzleBootstrapped
-
-
-
-
-
-# Two/three/four arguments:
-# 1. Name of inputfile
-# 2. Matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
-# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise
-# 3. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise
-# 4. Model of rate heterogeneity:
-# 1 for "8 Gamma distributed rates"
-# 2 for "Two rates (1 invariable + 1 variable)"
-# 3 for "Mixed (1 invariable + 8 Gamma rates)"
-# otherwise: Uniform rate
-# Last modified: 09/08/03 (added 3rd and 4th parameter)
-sub executePuzzle {
- my $in = $_[ 0 ];
- my $matrix_option = $_[ 1 ];
- my $parameter_estimates_option = $_[ 2 ];
- my $rate_heterogeneity_option = $_[ 3 ];
- my $mat = "";
- my $est = "";
- my $rate = "";
-
- &testForTextFilePresence( $in );
-
- $mat = &setModelForPuzzle( $matrix_option );
- if ( $parameter_estimates_option ) {
- $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option );
- }
- if ( $rate_heterogeneity_option ) {
- $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option );
- }
-
-
- system( "$PUZZLE $in << !
-k
-k$mat$est$rate
-y
-!" )
- && die "$0: Could not execute \"$PUZZLE\"";
-
- return;
-
-} ## executePuzzle
-
-
-
-
-# Preparation of the pwd file
-sub addDistsToQueryToPWDfile {
- my $pwd_file = $_[ 0 ];
- my $disttoquery_file = $_[ 1 ];
- my $outfile = $_[ 2 ];
- my $name_of_query = $_[ 3 ];
- my $name_of_query_ = "";
- my $return_line_pwd = "";
- my $return_line_dq = "";
- my $num_of_sqs = 0;
- my $block = 0;
- my $name_from_pwd = "X";
- my $name_from_dq = "Y";
- my @dists_to_query = ();
- my $i = 0;
-
- &testForTextFilePresence( $pwd_file );
- &testForTextFilePresence( $disttoquery_file );
-
- $name_of_query_ = $name_of_query;
- for ( my $j = 0; $j <= ( $LENGTH_OF_NAME - length( $name_of_query ) - 1 ); ++$j ) {
- $name_of_query_ .= " ";
- }
-
- open( OUT_AD, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" );
- open( IN_PWD, "$pwd_file" ) || &dieWithUnexpectedError( "Cannot open file \"$pwd_file\"" );
- open( IN_DQ, "$disttoquery_file" ) || &dieWithUnexpectedError( "Cannot open file \"$disttoquery_file\"" );
-
- W: while ( $return_line_pwd = <IN_PWD> ) {
-
-
- if ( $return_line_pwd =~ /^\s*(\d+)\s*$/ ) {
- $num_of_sqs = $1;
- $num_of_sqs++;
- if ( $block > 0 ) {
- print OUT_AD "$name_of_query_ ";
- for ( my $j = 0; $j < $i; ++$j ) {
- print OUT_AD "$dists_to_query[ $j ] ";
- }
- print OUT_AD "0.0\n";
- }
- print OUT_AD " $num_of_sqs\n";
- $block++;
- @dists_to_query = ();
- $i = 0;
- }
-
- if ( $block == 1
- && $return_line_pwd =~ /^\s*(\S+)\s+\S+/ ) {
- $name_from_pwd = $1;
-
- if ( !defined( $return_line_dq = <IN_DQ> ) ) {
- &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" );
- }
-
- if ( $return_line_dq !~ /\S/ ) {
- if ( !defined( $return_line_dq = <IN_DQ> ) ) {
- &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" );
- }
- }
- $return_line_dq =~ /^\s*(\S+)\s+(\S+)/;
- $name_from_dq = $1;
- $dists_to_query[ $i++ ] = $2;
-
-
- if ( $name_from_pwd ne $name_from_dq ) {
- &dieWithUnexpectedError( "Order of sequence names in \"$pwd_file\" and \"$disttoquery_file\" is not the same" );
- }
- print OUT_AD $return_line_pwd;
-
- }
- elsif ( $block > 1
- && $return_line_pwd =~ /^\s*(\S+)\s+\S+/ ) {
- $name_from_pwd = $1;
- if ( !defined( $return_line_dq = <IN_DQ> ) ) {
- &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" );
- }
- if ( $return_line_dq !~ /\S/ ) {
- if ( !defined( $return_line_dq = <IN_DQ>) ) {
- &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" );
- }
- }
- $return_line_dq =~ /^\s*\S+\s+(\S+)/;
- $dists_to_query[ $i++ ] = $1;
- print OUT_AD $return_line_pwd;
- }
- }
- print OUT_AD "$name_of_query_ ";
- for ( my $j = 0; $j < $i; ++$j ) {
- print OUT_AD "$dists_to_query[ $j ] ";
- }
- print OUT_AD "0.0\n";
-
- close( OUT_AD );
- close( IN_PWD );
- close( IN_DQ );
- return $block;
-
-} ## addDistsToQueryToPWDfile
-
-
-
-
-# Three arguments:
-# 1. HMMER model db
-# 2. name of HMM
-# 3. outputfile name
-# Last modified: 02/27/01
-sub executeHmmfetch {
-
- my $db = $_[ 0 ];
- my $name = $_[ 1 ];
- my $outfile = $_[ 2 ];
-
- system( "$HMMFETCH $db $name > $outfile" )
- && &dieWithUnexpectedError( "Could not execute \"$HMMFETCH $db $name > $outfile\"" );
- return;
-
-} ## executeHmmfetch
-
-
-
-# Checks wether a file is present, not empty and a plain textfile.
-# One argument: name of file.
-# Last modified: 07/07/01
-sub testForTextFilePresence {
- my $file = $_[ 0 ];
- unless ( ( -s $file ) && ( -f $file ) && ( -T $file ) ) {
- dieWithUnexpectedError( "File \"$file\" does not exist, is empty, or is not a plain textfile" );
- }
-} ## testForTextFilePresence
-
-
-# Last modified: 02/21/03
-sub addSlashAtEndIfNotPresent {
- my $filename = $_[ 0 ];
- $filename =~ s/\s+//g;
- unless ( $filename =~ /\/$/ ) {
- $filename = $filename."/";
- }
- return $filename;
-} ## addSlashAtEndIfNotPresent
-
-
-
-# Last modified: 02/15/02
-sub exitWithWarning {
-
- my $text = $_[ 0 ];
- if ( defined( $_[ 1 ] ) && $_[ 1 ] == 1 ) {
- print( "<H4 class=\"error\">user error</H4>\n" );
- print( "<P>\n" );
- print( "<B>$text</B>\n" );
- print( "</P>\n" );
- print( "<P>  </P>\n" );
- }
- else {
- print( "\n\n$text\n\n" );
- }
-
- exit( 0 );
-
-} ## exit_with_warning
-
-
-
-# Last modified: 02/15/02
-sub dieWithUnexpectedError {
-
- my $text = $_[ 0 ];
-
- die( "\n\n$0:\nUnexpected error (should not have happened):\n$text\n$!\n\n" );
-
-} ## dieWithUnexpectedError
-
-
-
-1;
+++ /dev/null
-#!/usr/bin/perl -W
-
-# rio_slave.pl
-# ------------
-#
-# Copyright (C) 2002 Washington University School of Medicine
-# and Howard Hughes Medical Institute
-# All rights reserved
-#
-# Created: 01/18/02
-# Author: Christian M. Zmasek
-# zmasek@genetics.wustl.edu
-# http://www.genetics.wustl.edu/eddy/people/zmasek/
-#
-# Last modified: 02/20/02
-
-
-# Arguments:
-
-# 0: first block in multiple alignment to process
-# 1: last block in multiple alignment to process
-# 2: name of resampled alignment, inc. query
-# 3: matrix number
-# 4: name of query
-# 5: PWD file
-# 6: seed for random number generator for neighbor
-# 7: node number
-# 8: temp dir
-
-
-use strict;
-
-use FindBin;
-use lib $FindBin::Bin;
-use rio_module;
-
-if ( @ARGV != 9 ) {
- &dieWithUnexpectedError( "argument count is off" );
-}
-
-my $start = $ARGV[ 0 ];
-my $end = $ARGV[ 1 ];
-my $align = $ARGV[ 2 ];
-my $matrix_n = $ARGV[ 3 ];
-my $name = $ARGV[ 4 ];
-my $pwd_file = $ARGV[ 5 ];
-my $seed = $ARGV[ 6 ];
-my $number = $ARGV[ 7 ];
-my $temp_dir = $ARGV[ 8 ];
-
-my $b = 0;
-my $outfile = "";
-my $mytemp_dir = $temp_dir."/dir_".$number;
-
-mkdir( $mytemp_dir, 0700 )
-|| &dieWithUnexpectedError( "Could not create \"$mytemp_dir\"" );
-
-unless ( ( -e $mytemp_dir ) && ( -d $mytemp_dir ) ) {
- &dieWithUnexpectedError( "\"$mytemp_dir\" does not exist, or is not a directory" );
-}
-
-
-&executePuzzleDQObootstrapped( $align, $matrix_n );
-
-system( "mv", $align.".dist", $mytemp_dir."/DISTs_TO_QUERY" )
-&& &dieWithUnexpectedError( "could not mv" );
-
-unlink( $align );
-
-sleep( 2 );
-
-÷PWDfile( $pwd_file,
- $mytemp_dir."/DIVIDED",
- $start,
- $end );
-
-&addDistsToQueryToPWDfile( $mytemp_dir."/DIVIDED",
- $mytemp_dir."/DISTs_TO_QUERY",
- $mytemp_dir."/PWD_INC_QUERY",
- $name );
-
-unlink( $mytemp_dir."/DIVIDED" );
-
-$b = $end - $start + 1;
-
-chdir ( $mytemp_dir )
-|| &dieWithUnexpectedError( "Could not chdir to \"$mytemp_dir\"" );
-
-&executeNeighbor( $mytemp_dir."/PWD_INC_QUERY",
- $b,
- 1, # randomize input order
- $seed,
- 1 ); # lower-triangular data matrix
-
-
-unlink( "outfile", $mytemp_dir."/PWD_INC_QUERY", $mytemp_dir."/DISTs_TO_QUERY" );
-
-system( "mv", "outtree", "../MAKETREEOUT".$MULTIPLE_TREES_FILE_SUFFIX.$number )
-&& &dieWithUnexpectedError( "could not mv" );
-
-sleep( 1 );
-
-chdir( ".." )
-|| &dieWithUnexpectedError( "Could not chdir to \"..\"" );
-
-rmdir( $mytemp_dir ) || &dieWithUnexpectedError( "Could not delete \"$mytemp_dir\"" );
-
-$outfile = "FINISHED_$number";
-
-open( OUT, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" );
-close( OUT );
-
-exit( 0 );
-
-
-
-
-sub dividePWDfile {
- my $pwd_file = $_[ 0 ];
- my $outfile = $_[ 1 ];
- my $start = $_[ 2 ]; # e.g. 0
- my $end = $_[ 3 ]; # e.g. 9
-
- my $c = 0;
- my $write = 0;
- my $return_line = "";
-
- &testForTextFilePresence( $pwd_file );
-
- open( IN_PWD, "$pwd_file" ) || &dieWithUnexpectedError( "Cannot open file \"$pwd_file\"" );
- open( OUT_PWD, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" );
-
- while ( $return_line = <IN_PWD> ) {
- if ( $return_line =~ /^\s*(\d+)\s*$/ ) {
- if ( $c >= $start && $c <= $end ) {
- $write = 1;
- }
- elsif ( $c > $end ) {
- last;
- }
- $c++;
- }
- if ( $write == 1 ) {
- print OUT_PWD $return_line;
- }
- }
-
- close( IN_PWD );
- close( OUT_PWD );
-
- return;
-
-} ## dividePWDfile
-
-
-
-
-
-
-
+++ /dev/null
-#!/usr/bin/perl -W
-
-# rio_slave_driver.pl
-# -------------------
-#
-# Copyright (C) 2002 Washington University School of Medicine
-# and Howard Hughes Medical Institute
-# All rights reserved
-#
-# Created: 01/18/02
-# Author: Christian M. Zmasek
-# zmasek@genetics.wustl.edu
-# http://www.genetics.wustl.edu/eddy/people/zmasek/
-#
-# Last modified: 02/20/02
-
-
-# 0: block size
-# 1: number of blocks which have a size of block size + 1
-# 2: name of resampled alignment, inc. query
-# 3: matrix number
-# 4: name of query
-# 5: PWD file
-# 6: temp dir
-# 7: seed for random number generator for neighbor
-# 8...: list of node names
-
-
-
-
-use strict;
-
-use FindBin;
-use lib $FindBin::Bin;
-use rio_module;
-
-if ( @ARGV < 9 ) {
- &dieWithUnexpectedError( "argumnet count off" );
-}
-
-
-my $block_size = shift( @ARGV );
-my $larger_blocks = shift( @ARGV );
-my $align = shift( @ARGV );
-my $matrix_n = shift( @ARGV );
-my $name = shift( @ARGV );
-my $pwd_file = shift( @ARGV );
-my $temp_dir = shift( @ARGV );
-my $seed = shift( @ARGV );
-my @nodelist = @ARGV;
-my $start = 0;
-my $end = 0;
-my $x = 0;
-my $node = "";
-
-
-$start = 0;
-
-if ( $larger_blocks > 0 ) {
- $end = $block_size;
-}
-else {
- $end = $block_size - 1;
-}
-
-for ( $x = 0; $x < scalar( @nodelist ); $x++ ) {
- my $child_pid;
- $node = $nodelist[ $x ];
-
- if ( !defined( $child_pid = fork() ) ) {
- &dieWithUnexpectedError( "cannot fork" );
- }
- elsif ( $child_pid ) {
- # I'm the parent, forking off $nodelist number of children
- }
- else {
- exec( "ssh",
- $node,
- "/usr/bin/perl",
- $RIO_SLAVE,
- $start,
- $end,
- $align.$x,
- $matrix_n,
- $name,
- $pwd_file,
- $seed,
- $x,
- $temp_dir )
- || &dieWithUnexpectedError( "could not \"exec ssh $node /usr/bin/perl $RIO_SLAVE\"" );
-
- }
- $larger_blocks--;
- if ( $larger_blocks > 0 ) {
- $start += ( $block_size + 1 );
- $end += ( $block_size + 1 );
- }
- elsif ( $larger_blocks == 0 ) {
- $start += ( $block_size + 1 );
- $end += $block_size;
- }
- else {
- $start += $block_size;
- $end += $block_size;
- }
-}
-
-exit( 0 );
+++ /dev/null
-#!/usr/bin/perl -W
-
-# xt.pl
-# -----
-#
-# Copyright (C) 2003 Christian M. Zmasek
-# All rights reserved
-#
-# Author: Christian M. Zmasek
-# zmasek@genetics.wustl.edu
-# http://www.genetics.wustl.edu/eddy/people/zmasek/
-#
-# Version: 1.010
-# Last modified 03/25/03
-#
-#
-#
-# Calculates trees based on Pfam alignments or precalculated distances using
-# makeTree.pl.
-
-
-use strict;
-use FindBin;
-use lib $FindBin::Bin;
-use rio_module;
-
-
-# To use _your_ species list make $MY_SPECIES_NAMES_FILE point to it.
-# To use _your_ TrEMBL ACDEOS make $MY_TREMBL_ACDEOS_FILE point to it.
-
-my $MY_SPECIES_NAMES_FILE = $SPECIES_NAMES_FILE; # $SPECIES_NAMES_FILE is inherited
- # from rio_module.pm
-
-my $MY_TREMBL_ACDEOS_FILE = $TREMBL_ACDEOS_FILE; # $TREMBL_ACDEOS_FILE is inherited
- # from rio_module.pm
-
-my $MY_TEMP_DIR = $TEMP_DIR_DEFAULT; # $TEMP_DIR_DEFAULT is inherited
- # from rio_module.pm
-
-my $LOGFILE = "00_xt_logfile";
-my $PWD_SUFFIX = ".pwd";
-my $ALN_SUFFIX = ".aln";
-
-my $use_precalc_pwd = 0; # 0: input is Pfam aligments ($input_dir must point to "/Pfam/Full/").
- # 1: input is precalculated pairwise distancs ($input_dir must point to "").
-my $use_precalc_pwd_and_aln = 0;# 0: otherwise
- # 1: input is precalculated pairwise distancs
- # _and_ alns,$use_precalc_pwd = 1 ($input_dir must point to alns).
-my $add_species = 0; # "I": 0: do nothing with species information.
- # "S": 1: add species code to TrEMBL sequences and ignore sequences from
- # species not in $MY_SPECIES_NAMES_FILE (only if input is Pfam aligments).
-my $options = ""; # Options for makeTree.pl, see makeTree.pl.
- # Do not use F [Pairwise distance (pwd) file as input (instead of alignment)]
- # since this is determined with $USE_PRECALC_PWD
-my $min_seqs = 0; # Minimal number of sequences (TREE-PUZZLE needs at least four seqs).
- # Ignored if $USE_PRECALC_PWD = 1
-my $max_seqs = 0; # Maximal number of sequences.
- # Ignored if $USE_PRECALC_PWD = 1
-my $input_dir = "";
-my $input_dir_aln = ""; # for .aln files
-my $output_dir = "";
-
-my $i = 0;
-my $seqs = 0;
-my $filename = "";
-my @filenames = ();
-my %AC_OS = (); # AC -> species name
-my %Species_names_hash = ();
-my $too_small = 0;
-my $too_large = 0;
-my $already_present = 0;
-my @too_small_names = ();
-my @too_large_names = ();
-my @already_present_names = ();
-
-
-# Analyzes the options:
-# ---------------------
-
-unless ( @ARGV == 3 || @ARGV == 4 || @ARGV == 6 ) {
- &printUsage();
-}
-
-if ( @ARGV == 3 ) {
- $use_precalc_pwd = 1;
- $use_precalc_pwd_and_aln = 0;
- $options = $ARGV[ 0 ];
- $input_dir = $ARGV[ 1 ];
- $output_dir = $ARGV[ 2 ];
- $add_species = 0;
-}
-elsif ( @ARGV == 4 ) {
- $use_precalc_pwd = 1;
- $use_precalc_pwd_and_aln = 1;
- $options = $ARGV[ 0 ];
- $input_dir = $ARGV[ 1 ];
- $input_dir_aln = $ARGV[ 2 ];
- $output_dir = $ARGV[ 3 ];
- $add_species = 0;
- $input_dir_aln = &addSlashAtEndIfNotPresent( $input_dir_aln );
-}
-else {
- $use_precalc_pwd = 0;
- $use_precalc_pwd_and_aln = 0;
- $add_species = $ARGV[ 0 ];
- $options = $ARGV[ 1 ];
- $min_seqs = $ARGV[ 2 ];
- $max_seqs = $ARGV[ 3 ];
- $input_dir = $ARGV[ 4 ];
- $output_dir = $ARGV[ 5 ];
- if ( $min_seqs < 4 ) {
- $min_seqs = 4;
- }
- if ( $add_species eq "I" ) {
- $add_species = 0;
- }
- elsif ( $add_species eq "S" ) {
- $add_species = 1;
- }
- else {
- print( "\nFirst must be either \"I\" [Ignore species] or\n\"S\" [add Species code to TrEMBL sequences and ignore sequences from species not in $MY_SPECIES_NAMES_FILE].\n\n" );
- &printUsage();
- }
-}
-
-
-
-$input_dir = &addSlashAtEndIfNotPresent( $input_dir );
-$output_dir = &addSlashAtEndIfNotPresent( $output_dir );
-$MY_TEMP_DIR = &addSlashAtEndIfNotPresent( $MY_TEMP_DIR );
-
-
-
-
-# This adds a "-" before the options for makeTree:
-# ------------------------------------------------
-unless ( $options =~ /^-/ ) {
- $options = "-".$options;
-}
-
-
-
-# If based on pwd, species are "fixed" and certain options for makeTree
-# are not applicable and option "F" is mandatory:
-# ---------------------------------------------------------------------
-if ( $use_precalc_pwd == 1 ) {
- $options =~ s/D//g;
- $options =~ s/C//g;
- $options =~ s/N//g;
- unless ( $options =~ /F/ ) {
- $options = $options."F";
- }
-}
-else {
- $options =~ s/F//g;
-}
-
-if ( $use_precalc_pwd_and_aln == 1 ) {
- unless ( $options =~ /U/ ) {
- $options = $options."U";
- }
-}
-if ( $use_precalc_pwd_and_aln == 0 && $use_precalc_pwd == 1 ) {
- $options =~ s/U//g;
-}
-
-
-
-
-# If species are to be considered, speices names file and TrEMBL ACDEOS
-# files need to be read in:
-# ---------------------------------------------------------------------
-if ( $add_species == 1 ) {
- print "\nXT.PL: Reading species names file...\n";
- &readSpeciesNamesFile( $MY_SPECIES_NAMES_FILE );
- print "\nXT.PL: Reading TrEMBL ACDEOS file...\n";
- &readTrEMBL_ACDEOS_FILE( $MY_TREMBL_ACDEOS_FILE );
-}
-
-
-
-# This creates the temp file:
-# --------------------------
-
-my $time = time;
-my $ii = 0;
-
-my $temp_file = $MY_TEMP_DIR."xt".$time.$ii;
-
-while ( -e $temp_file ) {
- $ii++;
- $temp_file = $MY_TEMP_DIR."xt".$time.$ii;
-}
-
-
-
-&startLogfile();
-
-opendir( DIR, $input_dir ) || error( "Cannot open directory \"$input_dir\": $!" );
-
-$i = 0;
-
-while( defined( $filename = readdir( DIR ) ) ) {
- if ( $filename =~ /^\.\.?$/ ) {
- next;
- }
- if ( $use_precalc_pwd == 1 && $filename !~ /$PWD_SUFFIX$/ ) {
- next
- }
- $filenames[ $i ] = $filename;
- $i++;
-}
-
-close( DIR );
-
-$i = 0;
-
-FOREACH: foreach $filename ( @filenames ) {
-
- # If the corresponding tree seems to already exists, do next one.
- my $fn = $filename;
- if ( $use_precalc_pwd == 1 ) {
- $fn =~ s/$PWD_SUFFIX$//;
- }
- if ( -e "$output_dir$fn.nhx" ) {
- $already_present_names[ $already_present++ ] = $fn;
- next FOREACH;
- }
-
- if ( $use_precalc_pwd != 1 ) {
-
- if ( $add_species == 1 ) {
-
- # 1. Pfam flat file name
- # 2. outfile name
- # Returns the number of sequences in the resulting alignment.
- $seqs = &removeSeqsFromPfamAlign( $input_dir.$filename, $temp_file );
-
- }
- else {
- # Gets the number of seqs in the alignment.
- open( F, "$input_dir"."$filename" );
- while( <F> ) {
- if ( $_ =~/^#.+SQ\s+(\d+)\s*$/ ) {
- $seqs = $1;
- last;
- }
- }
- close( F );
- }
-
- if ( $seqs < $min_seqs ) {
- $too_small_names[ $too_small++ ] = $filename;
- next FOREACH;
- }
- if ( $seqs > $max_seqs ) {
- $too_large_names [ $too_large++ ] = $filename;
- next FOREACH;
- }
- }
-
- print "\n\n\n\n";
- print "XT.PL\n";
- if ( $use_precalc_pwd == 1 ) {
- print "working on: $filename\n";
- }
- else {
- print "working on: $filename [$seqs seqs]\n";
- }
- print "[tree calculation $i]\n";
- print "=====================================================================\n\n\n";
-
-
- unlink( "$output_dir$filename.aln", "$output_dir$filename.log" );
-
- print( "XT.PL: executing:\n" );
-
- my $inputfile = "";
-
- if ( $add_species == 1 ) {
- $inputfile = $temp_file;
- }
- else {
- $inputfile = $input_dir.$filename;
- }
-
- if ( $use_precalc_pwd == 1 ) {
- $filename =~ s/$PWD_SUFFIX$//;
- }
-
- if ( $use_precalc_pwd_and_aln == 1 ) {
- $inputfile = $inputfile." ".$input_dir_aln.$filename.$ALN_SUFFIX;
- }
-
- my $command = "$MAKETREE $options $inputfile $output_dir$filename.nhx";
-
- print( "$command\n" );
- system( $command ) && &error( "Could not execute \"$command\"" );
-
- if ( $add_species == 1 ) {
- if ( unlink( $temp_file ) != 1 ) {
- &error( "Unexpected: Could not delete \"$temp_file\"" );
- }
- }
-
- $i++;
-
-}
-
-&finishLogfile();
-
-print( "\n\n\nXT.PL: Done!\n" );
-print( "Wrote \"$LOGFILE\".\n\n" );
-
-exit( 0 );
-
-
-
-
-
-
-sub error{
-
- my $text = $_[ 0 ];
-
- print( "\nxt.pl: ERROR:\n" );
- print( "$text\n\n" );
-
- exit( -1 );
-
-} ## dieWithUnexpectedError
-
-# Similar to the method with the same name in "rio.pl".
-# Removes sequences from a Pfam flat file.
-# Adds species to TrEMBL seqs.
-# It can remove all sequences not from species listed in a species names file.
-# Two arguments:
-# 1. Pfam flat file name
-# 2. outfile name
-# Returns the number of sequences in the resulting alignment.
-# Last modified: 02/22/03
-sub removeSeqsFromPfamAlign {
- my $infile = $_[ 0 ];
- my $outfile = $_[ 1 ];
- my $return_line = "";
- my $saw_sequence_line = 0;
- my $number_of_seqs = 0;
- my $OS = "";
- my $AC = "";
- my $i = 0;
- my $length = 0;
- my $seq_name = "";
- my $seq = "";
-
-
- open( OUT_RNSP, ">$outfile" ) || die "\n\n$0: Unexpected error: Cannot create file \"$outfile\": $!";
- open( IN_RNSP, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!";
- while ( $return_line = <IN_RNSP> ) {
-
- if ( $saw_sequence_line == 1
- && !&containsPfamNamedSequence( $return_line )
- && !&isPfamCommentLine( $return_line ) ) {
- # This is just for counting purposes.
- $saw_sequence_line = 2;
- }
- if ( &isPfamSequenceLine( $return_line ) ) {
- if ( $saw_sequence_line == 0 ) {
- $saw_sequence_line = 1;
- }
- $return_line =~ /^\s*(\S+)\s+(\S+)/;
- $seq_name = $1;
- $seq = $2;
- if ( !&startsWithSWISS_PROTname( $return_line ) ) {
- $seq_name =~ /^(\S+)\//;
- $AC = $1;
- unless( exists( $AC_OS{ $AC } ) ) {
- #ACs not present in "ACDEOS" file.
- next;
- }
- $OS = $AC_OS{ $AC };
- if ( !$OS || $OS eq "" ) {
- die "\n\n$0: Unexpected error: species for \"$AC\" not found.\n\n";
- }
- unless( exists( $Species_names_hash{ $OS } ) ) {
- next;
- }
- $seq_name =~ s/\//_$OS\//;
- }
- else {
- if ( $return_line =~ /_([A-Z0-9]{1,5})\// ) {
- unless( exists( $Species_names_hash{ $1 } ) ) {
- next;
- }
- }
- # remove everything whose species cannot be determined.
- else {
- next;
- }
- }
- $length = length( $seq_name );
- for ( $i = 0; $i <= ( $LENGTH_OF_NAME - $length - 1 ); $i++ ) {
- $seq_name .= " ";
- }
- $return_line = $seq_name.$seq."\n";
- }
-
- if ( !&isPfamCommentLine( $return_line ) ) {
- print OUT_RNSP $return_line;
- }
-
- if ( $saw_sequence_line == 1 ) {
- $number_of_seqs++;
- }
- } ## while ( $return_line = <IN_RNSP> )
- close( IN_RNSP );
- close( OUT_RNSP );
-
- return $number_of_seqs;
-
-} ## removeSeqsFromPfamAlign
-
-
-
-
-
-
-
-# Reads in (SWISS-PROT) species names from a file.
-# Names must be separated by newlines.
-# Lines beginning with "#" are ignored.
-# A possible "=" and everything after is ignored.
-# One argument: species-names-file name
-# Last modified: 04/24/01
-sub readSpeciesNamesFile {
- my $infile = $_[ 0 ];
- my $return_line = "";
- my $species = "";
-
- unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
- die "\n\n$0: Error: \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n";
- }
-
- open( IN_RSNF, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!\n\n";
- while ( $return_line = <IN_RSNF> ) {
- if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)/ ) {
- $species = $1;
- $species =~ s/=.+//;
- $Species_names_hash{ $species } = "";
- }
- }
- close( IN_RSNF );
-
- return;
-} ## readSpeciesNamesFile
-
-
-
-# Last modified: 05/18/01
-sub readTrEMBL_ACDEOS_FILE {
- my $infile = $_[ 0 ];
- my $return_line = "";
-
- unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
- &error( "\"$infile\" does not exist, is empty, or is not a plain textfile" );
- }
- # Fill up (huge) hashs.
- open( HH, "$infile" ) || &error( "Unexpected error: Cannot open file \"$infile\"" );
- while ( $return_line = <HH> ) {
-
- if ( $return_line =~ /(\S+);[^;]*;(\S+)/ ) {
- $AC_OS{ $1 } = $2;
- }
- }
- close( HH );
-} ## readTrEMBL_ACDEOS_FILE
-
-
-
-# Last modified: 05/17/01
-sub startLogfile {
- if ( -e "$LOGFILE" ) {
- &error( "logfile \"$LOGFILE\" already exists, rename it or place it in another directory" );
- }
-
- open( L, ">$LOGFILE" ) || &error( "Cannot create logfile: $!" );
- if ( $use_precalc_pwd != 1 ) {
- print L "Trees are based directly on Pfam alignments\n";
- if ( $add_species == 1 ) {
- print L "Add species code to TrEMBL sequences and ignore sequences\nfrom species not in $MY_SPECIES_NAMES_FILE\n";
- }
- else {
- print L "Do nothing with species information\n";
- }
- }
- else {
- print L "Trees are based on precalculated pairwise distances\n";
- }
- if ( $use_precalc_pwd_and_aln == 1 ) {
- print L "and the matching alignments\n";
- }
- print L "Options for makeTree: $options\n";
- if ( $use_precalc_pwd != 1 ) {
- print L "Min seqs : $min_seqs\n";
- print L "Max seqs : $max_seqs\n";
- }
- if ( $add_species == 1 ) {
- print L "TrEMBL ACDEOS file : $MY_TREMBL_ACDEOS_FILE\n";
- print L "Species names file : $MY_SPECIES_NAMES_FILE\n";
- }
- print L "Input directory : $input_dir\n";
- if ( $use_precalc_pwd_and_aln == 1 ) {
- print L "Input directory aln : $input_dir_aln\n";
- }
- print L "Output directory : $output_dir\n";
- print L "Start date : ".`date`;
-
-} ## startLogfile
-
-
-
-# Last modified: 05/17/01
-sub finishLogfile {
- my $j = 0;
- print L "\n\n";
- print L "Successfully calculated $i trees.\n";
- if ( $use_precalc_pwd != 1 ) {
- print L "Too large alignments (>$max_seqs): $too_large\n";
- print L "Too small alignments (<$min_seqs): $too_small\n";
- }
- print L "Alignments for which a tree appears to already exist: $already_present\n";
- print L "Finish date : ".`date`."\n\n";
- if ( $use_precalc_pwd != 1 ) {
- print L "List of the $too_large alignments which were ignored because they\n";
- print L "contained too many sequences (>$max_seqs) [after pruning]:\n";
- for ( $j = 0; $j < $too_large; ++$j ) {
- print L "$too_large_names[ $j ]\n";
- }
- print L "\n\n";
- print L "List of the $too_small alignments which were ignored because they\n";
- print L "contained not enough sequences (<$min_seqs) [after pruning]:\n";
- for ( $j = 0; $j < $too_small; ++$j ) {
- print L "$too_small_names[ $j ]\n";
- }
- }
- print L "\n\n";
- print L "List of the $already_present alignments which were ignored because\n";
- print L "a tree appears to already exist:\n";
- for ( $j = 0; $j < $already_present; ++$j ) {
- print L "$already_present_names[ $j ]\n";
- }
- print L "\n";
- close( L );
-} ## finishLogfile
-
-
-sub printUsage {
- print "\n";
- print " xt.pl\n";
- print " _____\n";
- print " \n";
- print " Copyright (C) 2003 Christian M. Zmasek\n";
- print " All rights reserved\n";
- print "\n";
- print " Author: Christian M. Zmasek\n";
- print " zmasek\@genetics.wustl.edu\n";
- print " http://www.genetics.wustl.edu/eddy/forester/\n";
- print "\n";
- print "\n";
- print " Purpose\n";
- print " -------\n";
- print "\n";
- print " Tree construction using makeTree.pl based on directories\n";
- print " of Pfam alignments or precalculated pairwise distances.\n";
- print "\n";
- print "\n";
- print " Usage\n";
- print " -----\n";
- print "\n";
- print " Input is Pfam aligments:\n";
- print " xt.pl <I or S> <options for makeTree.pl> <minimal number of seqs>\n";
- print " <maximal number of seqs> <input directory: Pfam aligments> <output directory>\n";
- print "\n";
- print " Input is precalculated pairwise distancs:\n";
- print " xt.pl <options for makeTree.pl> <input directory: pairwise distances \"$PWD_SUFFIX\"> <output directory>\n";
- print "\n";
- print " Input is precalculated pairwise distancs and corresponding alignment files:\n";
- print " xt.pl <options for makeTree.pl> <input directory: pairwise distances \"$PWD_SUFFIX\">\n";
- print " <input directory: corresponding alignment files \"$ALN_SUFFIX\"> <output directory>\n";
- print "\n";
- print "\n";
- print " Examples\n";
- print " --------\n";
- print "\n";
- print " \"xt.pl S NS21UTRB100DX 4 200 DB/PFAM/Full/ trees/\"\n";
- print "\n";
- print " \"xt.pl FLB100R /pfam2pwd_out/ trees/\"\n";
- print "\n";
- print " \"xt.pl FULB100R /pfam2pwd_out/ /pfam2pwd_out/ trees/\"\n";
- print "\n";
- print "\n";
- print " Options\n";
- print " -------\n";
- print "\n";
- print " I: ignore species information (use all sequences)\n";
- print " S: add species codes to TrEMBL sequences and ignore sequences\n";
- print " from species not in $MY_SPECIES_NAMES_FILE,\n";
- print " species codes are extracted from $MY_TREMBL_ACDEOS_FILE\n";
- print "\n";
- print "\n";
- print " Options for makeTree\n";
- print " --------------------\n";
- print "\n";
- print " N : Suggestion to remove columns in the alignment which contain gaps.\n";
- print " Gaps are not removed, if, after removal of gaps, the resulting\n";
- print " alignment would be shorter than $MIN_NUMBER_OF_AA aa (\$MIN_NUMBER_OF_AA).\n";
- print " Default is not to remove gaps.\n";
- print " Bx : Number of bootstrapps. B0: do not bootstrap. Default: 100 bootstrapps.\n";
- print " The number of bootstrapps should be divisible by 10.\n";
- print " U : Use TREE-PUZZLE to calculate ML branchlengths for consesus tree, in case of\n";
- print " bootstrapped analysis.\n";
- print " J : Use JTT matrix (Jones et al. 1992) in TREE-PUZZLE, default: PAM.\n";
- print " L : Use BLOSUM 62 matrix (Henikoff-Henikoff 92) in TREE-PUZZLE, default: PAM.\n";
- print " M : Use mtREV24 matrix (Adachi-Hasegawa 1996) in TREE-PUZZLE, default: PAM.\n";
- print " W : Use WAG matrix (Whelan-Goldman 2000) in TREE-PUZZLE, default: PAM.\n";
- print " T : Use VT matrix (Mueller-Vingron 2000) in TREE-PUZZLE, default: PAM.\n";
- print " P : Let TREE-PUZZLE choose which matrix to use, default: PAM.\n";
- print " R : Randomize input order in PHYLIP NEIGHBOR.\n";
- print " Sx : Seed for random number generator(s). Must be 4n+1. Default is 9.\n";
- print " X : To keep multiple tree file (=trees from bootstrap resampled alignments).\n";
- print " D : To keep (and create, in case of bootstrap analysis) pairwise distance\n";
- print " matrix file. This is created form the not resampled aligment.\n";
- print " C : Calculate pairwise distances only (no tree). Bootstrap is always 1.\n";
- print " No other files are generated.\n";
- print " F : Pairwise distance (pwd) file as input (instead of alignment).\n";
- print " No -D, -C, and -N options available in this case.\n";
- print " V : Verbose\n";
- print "\n";
- exit( -1 );
-
-}